| rfc8670xml2.original.xml | rfc8670.xml | |||
|---|---|---|---|---|
| <?xml version="1.0" encoding="US-ASCII"?> | <?xml version='1.0' encoding='utf-8'?> | |||
| <!DOCTYPE rfc SYSTEM "rfc2629.dtd"> | <!DOCTYPE rfc SYSTEM "rfc2629-xhtml.ent"> | |||
| <?rfc toc="yes"?> | <rfc xmlns:xi="http://www.w3.org/2001/XInclude" number="8670" | |||
| <?rfc tocompact="yes"?> | category="info" consensus="true" submissionType="IETF" | |||
| <?rfc tocdepth="3"?> | docName="draft-ietf-spring-segment-routing-msdc-11" ipr="trust200902" obsol | |||
| <?rfc tocindent="yes"?> | etes="" updates="" xml:lang="en" tocInclude="true" symRefs="true" sortRefs="true | |||
| <?rfc symrefs="yes"?> | " version="3"> | |||
| <?rfc sortrefs="yes"?> | ||||
| <?rfc comments="yes"?> | ||||
| <?rfc inline="yes"?> | ||||
| <?rfc compact="yes"?> | ||||
| <?rfc subcompact="no"?> | ||||
| <rfc category="info" docName="draft-ietf-spring-segment-routing-msdc-11" | ||||
| ipr="trust200902"> | ||||
| <front> | ||||
| <title abbrev="BGP-Prefix SID in large-scale DCs">BGP-Prefix Segment in | ||||
| large-scale data centers</title> | ||||
| <author fullname="Clarence Filsfils" initials="C." role="editor" | <front> | |||
| surname="Filsfils"> | <title abbrev="BGP Prefix-SID in Large-Scale DCs">BGP Prefix Segment in | |||
| Large-Scale Data Centers</title> | ||||
| <seriesInfo name="RFC" value="8670"/> | ||||
| <author fullname="Clarence Filsfils" initials="C." role="editor" surname="Fi | ||||
| lsfils"> | ||||
| <organization>Cisco Systems, Inc.</organization> | <organization>Cisco Systems, Inc.</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street/> | <street/> | |||
| <city>Brussels</city> | <city>Brussels</city> | |||
| <region/> | <region/> | |||
| <code/> | <code/> | |||
| <country>Belgium</country> | ||||
| <country>BE</country> | ||||
| </postal> | </postal> | |||
| <email>cfilsfil@cisco.com</email> | <email>cfilsfil@cisco.com</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author fullname="Stefano Previdi" initials="S." surname="Previdi"> | <author fullname="Stefano Previdi" initials="S." surname="Previdi"> | |||
| <organization>Cisco Systems, Inc.</organization> | <organization>Cisco Systems, Inc.</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street/> | <street/> | |||
| <city/> | <city/> | |||
| <code/> | <code/> | |||
| <country>Italy</country> | <country>Italy</country> | |||
| </postal> | </postal> | |||
| <email>stefano@previdi.net</email> | <email>stefano@previdi.net</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author fullname="Gaurav Dawra" initials="G." surname="Dawra"> | <author fullname="Gaurav Dawra" initials="G." surname="Dawra"> | |||
| <organization>LinkedIn</organization> | <organization>LinkedIn</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street/> | <street/> | |||
| <city/> | <city/> | |||
| <code/> | <code/> | |||
| <country>United States of America</country> | ||||
| <country>USA</country> | ||||
| </postal> | </postal> | |||
| <email>gdawra.ietf@gmail.com</email> | <email>gdawra.ietf@gmail.com</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author fullname="Ebben Aries" initials="E." surname="Aries"> | <author fullname="Ebben Aries" initials="E." surname="Aries"> | |||
| <organization>Juniper Networks</organization> | <organization>Arrcus, Inc.</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street>1133 Innovation Way</street> | <street>2077 Gateway Place, Suite #400</street> | |||
| <city>San Jose</city> | ||||
| <city>Sunnyvale</city> | <code>CA 95119</code> | |||
| <country>United States of America</country> | ||||
| <code>CA 94089</code> | ||||
| <country>US</country> | ||||
| </postal> | </postal> | |||
| <email>exa@arrcus.com</email> | ||||
| <email>exa@juniper.net</email> | ||||
| </address> | </address> | |||
| </author> | </author> | |||
| <author fullname="Petr Lapukhov" initials="P." surname="Lapukhov"> | <author fullname="Petr Lapukhov" initials="P." surname="Lapukhov"> | |||
| <organization>Facebook</organization> | <organization>Facebook</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street/> | <street/> | |||
| <city/> | <city/> | |||
| <code/> | <code/> | |||
| <country>United States of America</country> | ||||
| <country>US</country> | ||||
| </postal> | </postal> | |||
| <email>petr@fb.com</email> | <email>petr@fb.com</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <date month="December" year="2019"/> | ||||
| <date year="2018"/> | ||||
| <workgroup>Network Working Group</workgroup> | <workgroup>Network Working Group</workgroup> | |||
| <keyword>example</keyword> | ||||
| <abstract> | <abstract> | |||
| <t>This document describes the motivation and benefits for applying | <t>This document describes the motivation for, and benefits of, applying | |||
| segment routing in BGP-based large-scale data-centers. It describes the | Segment Routing (SR) in BGP-based large-scale data centers. It describes t | |||
| design to deploy segment routing in those data-centers, for both the | he | |||
| MPLS and IPv6 dataplanes.</t> | design to deploy SR in those data centers for both the | |||
| MPLS and IPv6 data planes.</t> | ||||
| </abstract> | </abstract> | |||
| </front> | </front> | |||
| <middle> | <middle> | |||
| <section anchor="INTRO" title="Introduction"> | <section anchor="INTRO" numbered="true" toc="default"> | |||
| <t>Segment Routing (SR), as described in <xref | <name>Introduction</name> | |||
| target="I-D.ietf-spring-segment-routing"/> leverages the source routing | <t>Segment Routing (SR), as described in <xref target="RFC8402" format="de | |||
| fault"/>, leverages the source-routing | ||||
| paradigm. A node steers a packet through an ordered list of | paradigm. A node steers a packet through an ordered list of | |||
| instructions, called segments. A segment can represent any instruction, | instructions called "segments". A segment can represent any instruction, | |||
| topological or service-based. A segment can have a local semantic to an | topological or service based. A segment can have a local semantic to an | |||
| SR node or global within an SR domain. SR allows to enforce a flow | SR node or a global semantic within an SR domain. SR allows the enforcemen | |||
| through any topological path while maintaining per-flow state only at | t of a flow | |||
| the ingress node to the SR domain. Segment Routing can be applied to the | through any topological path while maintaining per-flow state only from | |||
| MPLS and IPv6 data-planes.</t> | the ingress node to the SR domain. SR can be applied to the | |||
| MPLS and IPv6 data planes.</t> | ||||
| <t>The use-cases described in this document should be considered in the | <t>The use cases described in this document should be considered in the | |||
| context of the BGP-based large-scale data-center (DC) design described | context of the BGP-based large-scale data-center (DC) design described | |||
| in <xref target="RFC7938"/>. This document extends it by applying SR | in <xref target="RFC7938" format="default"/>. This document extends it by | |||
| both with IPv6 and MPLS dataplane.</t> | applying SR | |||
| both with IPv6 and MPLS data planes.</t> | ||||
| </section> | </section> | |||
| <section anchor="LARGESCALEDC" numbered="true" toc="default"> | ||||
| <section anchor="LARGESCALEDC" | <name>Large-Scale Data-Center Network Design Summary</name> | |||
| title="Large Scale Data Center Network Design Summary"> | <t>This section provides a brief summary of the Informational RFC | |||
| <t>This section provides a brief summary of the informational document | <xref target="RFC7938" format="default"/>, which outlines a practical netw | |||
| <xref target="RFC7938"/> that outlines a practical network design | ork design | |||
| suitable for data-centers of various scales:<list style="symbols"> | suitable for data centers of various scales:</t> | |||
| <t>Data-center networks have highly symmetric topologies with | <ul spacing="normal"> | |||
| multiple parallel paths between two server attachment points. The | <li>Data-center networks have highly symmetric topologies with | |||
| multiple parallel paths between two server-attachment points. The | ||||
| well-known Clos topology is most popular among the operators (as | well-known Clos topology is most popular among the operators (as | |||
| described in <xref target="RFC7938"/>). In a Clos topology, the | described in <xref target="RFC7938" format="default"/>). In a Clos topology, the | |||
| minimum number of parallel paths between two elements is determined | minimum number of parallel paths between two elements is determined | |||
| by the "width" of the "Tier-1" stage. See <xref target="FIGLARGE"/> | by the "width" of the "Tier-1" stage. See <xref target="FIGLARGE" form | |||
| below for an illustration of the concept.</t> | at="default"/> | |||
| for an illustration of the concept.</li> | ||||
| <t>Large-scale data-centers commonly use a routing protocol, such as | <li>Large-scale data centers commonly use a routing protocol, such as | |||
| BGP-4 <xref target="RFC4271"/> in order to provide endpoint | BGP-4 <xref target="RFC4271" format="default"/>, in order to provide e | |||
| connectivity. Recovery after a network failure is therefore driven | ndpoint | |||
| connectivity. Therefore, recovery after a network failure is driven | ||||
| either by local knowledge of directly available backup paths or by | either by local knowledge of directly available backup paths or by | |||
| distributed signaling between the network devices.</t> | distributed signaling between the network devices.</li> | |||
| <li>Within data-center networks, traffic is load shared using the | ||||
| <t>Within data-center networks, traffic is load-shared using the | ||||
| Equal Cost Multipath (ECMP) mechanism. With ECMP, every network | Equal Cost Multipath (ECMP) mechanism. With ECMP, every network | |||
| device implements a pseudo-random decision, mapping packets to one | device implements a pseudorandom decision, mapping packets to one | |||
| of the parallel paths by means of a hash function calculated over | of the parallel paths by means of a hash function calculated over | |||
| certain parts of the packet, typically a combination of various | certain parts of the packet, typically a combination of various | |||
| packet header fields.</t> | packet header fields.</li> | |||
| </list></t> | </ul> | |||
| <t>The following is a schematic of a five-stage Clos topology with four | ||||
| <t>The following is a schematic of a five-stage Clos topology, with four | devices in the "Tier-1" stage. Notice that the number of paths between Nod | |||
| devices in the "Tier-1" stage. Notice that number of paths between Node1 | e1 | |||
| and Node12 equals to four: the paths have to cross all of Tier-1 | and Node12 equals four; the paths have to cross all of the Tier-1 | |||
| devices. At the same time, the number of paths between Node1 and Node2 | devices. At the same time, the number of paths between Node1 and Node2 | |||
| equals two, and the paths only cross Tier-2 devices. Other topologies | equals two, and the paths only cross Tier-2 devices. Other topologies | |||
| are possible, but for simplicity only the topologies that have a single | are possible, but for simplicity, only the topologies that have a single | |||
| path from Tier-1 to Tier-3 are considered below. The rest could be | path from Tier-1 to Tier-3 are considered below. The rest could be | |||
| treated similarly, with a few modifications to the logic.</t> | treated similarly, with a few modifications to the logic.</t> | |||
| <section anchor="REFDESIGN" numbered="true" toc="default"> | ||||
| <section anchor="REFDESIGN" title="Reference design"> | <name>Reference Design</name> | |||
| <figure anchor="FIGLARGE" title="5-stage Clos topology"> | <figure anchor="FIGLARGE"> | |||
| <artwork> Tier-1 | <name>5-Stage Clos Topology</name> | |||
| <artwork name="" type="" align="left" alt=""><![CDATA[ | ||||
| Tier-1 | ||||
| +-----+ | +-----+ | |||
| |NODE | | |NODE | | |||
| +->| 5 |--+ | +->| 5 |--+ | |||
| | +-----+ | | | +-----+ | | |||
| Tier-2 | | Tier-2 | Tier-2 | | Tier-2 | |||
| +-----+ | +-----+ | +-----+ | +-----+ | +-----+ | +-----+ | |||
| +------------>|NODE |--+->|NODE |--+--|NODE |-------------+ | +------------>|NODE |--+->|NODE |--+--|NODE |-------------+ | |||
| | +-----| 3 |--+ | 6 | +--| 9 |-----+ | | | +-----| 3 |--+ | 6 | +--| 9 |-----+ | | |||
| | | +-----+ +-----+ +-----+ | | | | | +-----+ +-----+ +-----+ | | | |||
| | | | | | | | | | | |||
| | | +-----+ +-----+ +-----+ | | | | | +-----+ +-----+ +-----+ | | | |||
| | +-----+---->|NODE |--+ |NODE | +--|NODE |-----+-----+ | | | +-----+---->|NODE |--+ |NODE | +--|NODE |-----+-----+ | | |||
| | | | +---| 4 |--+->| 7 |--+--| 10 |---+ | | | | | | | +---| 4 |--+->| 7 |--+--| 10 |---+ | | | | |||
| | | | | +-----+ | +-----+ | +-----+ | | | | | | | | | +-----+ | +-----+ | +-----+ | | | | | |||
| | | | | | | | | | | | | | | | | | | | | | | |||
| +-----+ +-----+ | +-----+ | +-----+ +-----+ | +-----+ +-----+ | +-----+ | +-----+ +-----+ | |||
| |NODE | |NODE | Tier-3 +->|NODE |--+ Tier-3 |NODE | |NODE | | |NODE | |NODE | Tier-3 +->|NODE |--+ Tier-3 |NODE | |NODE | | |||
| | 1 | | 2 | | 8 | | 11 | | 12 | | | 1 | | 2 | | 8 | | 11 | | 12 | | |||
| +-----+ +-----+ +-----+ +-----+ +-----+ | +-----+ +-----+ +-----+ +-----+ +-----+ | |||
| | | | | | | | | | | | | | | | | | | |||
| A O B O <- Servers -> Z O O O | A O B O <- Servers -> Z O O O]]></artwork> | |||
| </artwork> | ||||
| </figure> | </figure> | |||
| <t>In the reference topology illustrated in <xref target="FIGLARGE" form | ||||
| at="default"/>, | ||||
| it is assumed:</t> | ||||
| <ul spacing="normal"> | ||||
| <li> | ||||
| <t>Each node is its own autonomous system (AS) (Node X has AS X). 4- | ||||
| byte AS numbers | ||||
| are recommended (<xref target="RFC6793" format="default"/>).</t> | ||||
| <ul spacing="normal"> | ||||
| <li>For simple and efficient route propagation filtering, | ||||
| Node5, Node6, Node7, and Node8 use the same AS; Node3 and Node4 | ||||
| use the same AS; and Node9 and Node10 use the same AS.</li> | ||||
| <t>In the reference topology illustrated in <xref target="FIGLARGE"/>, | <li>In the case in which 2-byte autonomous system numbers are used | |||
| It is assumed:<list style="symbols"> | ||||
| <t>Each node is its own AS (Node X has AS X). 4-byte AS numbers | ||||
| are recommended (<xref target="RFC6793"/>).<list> | ||||
| <t>For simple and efficient route propagation filtering, | ||||
| Node5, Node6, Node7 and Node8 use the same AS, Node3 and Node4 | ||||
| use the same AS, Node9 and Node10 use the same AS.</t> | ||||
| <t>In case of 2-byte autonomous system numbers are used and | ||||
| for efficient usage of the scarce 2-byte Private Use AS pool, | for efficient usage of the scarce 2-byte Private Use AS pool, | |||
| different Tier-3 nodes might use the same AS.</t> | different Tier-3 nodes might use the same AS.</li> | |||
| <li>Without loss of generality, these details will be | ||||
| <t>Without loss of generality, these details will be | simplified in this document. It is to be assumed that each node | |||
| simplified in this document and assume that each node has its | has its | |||
| own AS.</t> | own AS.</li> | |||
| </list></t> | </ul> | |||
| </li> | ||||
| <t>Each node peers with its neighbors with a BGP session. If not | ||||
| specified, eBGP is assumed. In a specific use-case, iBGP will be | ||||
| used but this will be called out explicitly in that case.</t> | ||||
| <li>Each node peers with its neighbors with a BGP session. If not | ||||
| specified, external BGP (EBGP) is assumed. In a specific use case, | ||||
| internal BGP (IBGP) will be used, but this will be called out | ||||
| explicitly in that case.</li> | ||||
| <li> | ||||
| <t>Each node originates the IPv4 address of its loopback interface | <t>Each node originates the IPv4 address of its loopback interface | |||
| into BGP and announces it to its neighbors. <list> | into BGP and announces it to its neighbors. </t> | |||
| <t>The loopback of Node X is 192.0.2.x/32.</t> | <ul spacing="normal"> | |||
| </list></t> | <li>The loopback of Node X is 192.0.2.x/32.</li> | |||
| </list></t> | </ul> | |||
| </li> | ||||
| <t>In this document, the Tier-1, Tier-2 and Tier-3 nodes are referred | </ul> | |||
| to respectively as Spine, Leaf and ToR (top of rack) nodes. When a ToR | <t>In this document, the Tier-1, Tier-2, and Tier-3 nodes are referred | |||
| to as "Spine", "Leaf", and "ToR" (top of rack) nodes, respectively. Whe | ||||
| n a ToR | ||||
| node acts as a gateway to the "outside world", it is referred to as a | node acts as a gateway to the "outside world", it is referred to as a | |||
| border node.</t> | "border node".</t> | |||
| </section> | </section> | |||
| </section> | </section> | |||
| <section anchor="OPENPROBS" numbered="true" toc="default"> | ||||
| <section anchor="OPENPROBS" | <name>Some Open Problems in Large Data-Center Networks</name> | |||
| title="Some open problems in large data-center networks"> | <t>The data-center-network design summarized above provides means for | |||
| <t>The data-center network design summarized above provides means for | ||||
| moving traffic between hosts with reasonable efficiency. There are few | moving traffic between hosts with reasonable efficiency. There are few | |||
| open performance and reliability problems that arise in such design: | open performance and reliability problems that arise in such a design: | |||
| <list style="symbols"> | </t> | |||
| <t>ECMP routing is most commonly realized per-flow. This means that | <ul spacing="normal"> | |||
| <li>ECMP routing is most commonly realized per flow. This means that | ||||
| large, long-lived "elephant" flows may affect performance of | large, long-lived "elephant" flows may affect performance of | |||
| smaller, short-lived “mouse” flows and reduce efficiency | smaller, short-lived "mouse" flows and may reduce efficiency | |||
| of per-flow load-sharing. In other words, per-flow ECMP does not | of per-flow load sharing. In other words, per-flow ECMP does not | |||
| perform efficiently when flow lifetime distribution is heavy-tailed. | perform efficiently when flow-lifetime distribution is heavy tailed. | |||
| Furthermore, due to hash-function inefficiencies it is possible to | Furthermore, due to hash-function inefficiencies, it is possible to | |||
| have frequent flow collisions, where more flows get placed on one | have frequent flow collisions where more flows get placed on one | |||
| path over the others.</t> | path over the others.</li> | |||
| <li>Shortest-path routing with ECMP implements an oblivious routing | ||||
| <t>Shortest-path routing with ECMP implements an oblivious routing | model that is not aware of the network imbalances. If the network | |||
| model, which is not aware of the network imbalances. If the network | symmetry is broken, for example, due to link failures, utilization | |||
| symmetry is broken, for example due to link failures, utilization | ||||
| hotspots may appear. For example, if a link fails between Tier-1 and | hotspots may appear. For example, if a link fails between Tier-1 and | |||
| Tier-2 devices (e.g. Node5 and Node9), Tier-3 devices Node1 and | Tier-2 devices (e.g., Node5 and Node9), Tier-3 devices Node1 and | |||
| Node2 will not be aware of that, since there are other paths | Node2 will not be aware of that since there are other paths | |||
| available from perspective of Node3. They will continue sending | available from the perspective of Node3. They will continue sending | |||
| roughly equal traffic to Node3 and Node4 as if the failure didn't | roughly equal traffic to Node3 and Node4 as if the failure didn't | |||
| exist which may cause a traffic hotspot.</t> | exist, which may cause a traffic hotspot.</li> | |||
| <li>Isolating faults in the network with multiple parallel paths and | ||||
| <t>Isolating faults in the network with multiple parallel paths and | ECMP-based routing is nontrivial due to lack of determinism. | |||
| ECMP-based routing is non-trivial due to lack of determinism. | ||||
| Specifically, the connections from HostA to HostB may take a | Specifically, the connections from HostA to HostB may take a | |||
| different path every time a new connection is formed, thus making | different path every time a new connection is formed, thus making | |||
| consistent reproduction of a failure much more difficult. This | consistent reproduction of a failure much more difficult. This | |||
| complexity scales linearly with the number of parallel paths in the | complexity scales linearly with the number of parallel paths in the | |||
| network, and stems from the random nature of path selection by the | network and stems from the random nature of path selection by the | |||
| network devices.</t> | network devices.</li> | |||
| </list></t> | </ul> | |||
| <t>First, it will be explained how to apply SR in the DC, for MPLS and | ||||
| IPv6 data-planes.</t> | ||||
| </section> | </section> | |||
| <section anchor="APPLYSR" numbered="true" toc="default"> | ||||
| <section anchor="APPLYSR" | <name>Applying Segment Routing in the DC with MPLS Data Plane</name> | |||
| title="Applying Segment Routing in the DC with MPLS dataplane"> | <section anchor="BGPREFIXSEGMENT" numbered="true" toc="default"> | |||
| <section anchor="BGPREFIXSEGMENT" | <name>BGP Prefix Segment (BGP Prefix-SID)</name> | |||
| title="BGP Prefix Segment (BGP-Prefix-SID)"> | ||||
| <t>A BGP Prefix Segment is a segment associated with a BGP prefix. A | <t>A BGP Prefix Segment is a segment associated with a BGP prefix. A | |||
| BGP Prefix Segment is a network-wide instruction to forward the packet | BGP Prefix Segment is a network-wide instruction to forward the packet | |||
| along the ECMP-aware best path to the related prefix.</t> | along the ECMP-aware best path to the related prefix.</t> | |||
| <t>The BGP Prefix Segment is defined as the BGP Prefix-SID Attribute | ||||
| <t>The BGP Prefix Segment is defined as the BGP-Prefix-SID Attribute | in <xref target="RFC8669" format="default"/>, which contains an | |||
| in <xref target="I-D.ietf-idr-bgp-prefix-sid"/> which contains an | index. Throughout this document, the BGP Prefix Segment Attribute is | |||
| index. Throughout this document the BGP Prefix Segment Attribute is | referred to as the "BGP Prefix-SID" and the encoded index as the | |||
| referred as the BGP-Prefix-SID and the encoded index as the | label index.</t> | |||
| label-index.</t> | ||||
| <t>In this document, the network design decision has been made to | <t>In this document, the network design decision has been made to | |||
| assume that all the nodes are allocated the same SRGB (Segment Routing | assume that all the nodes are allocated the same SRGB (Segment Routing | |||
| Global Block), e.g. [16000, 23999]. This provides operational | Global Block), e.g., [16000, 23999]. This provides operational | |||
| simplification as explained in <xref target="SINGLESRGB"/>, but this | simplification as explained in <xref target="SINGLESRGB" format="default | |||
| "/>, but this | ||||
| is not a requirement.</t> | is not a requirement.</t> | |||
| <t>For illustration purposes, when considering an MPLS data plane, it | ||||
| <t>For illustration purpose, when considering an MPLS data-plane, it | is assumed that the label index allocated to prefix 192.0.2.x/32 is X. | |||
| is assumed that the label-index allocated to prefix 192.0.2.x/32 is X. | ||||
| As a result, a local label (16000+x) is allocated for prefix | As a result, a local label (16000+x) is allocated for prefix | |||
| 192.0.2.x/32 by each node throughout the DC fabric.</t> | 192.0.2.x/32 by each node throughout the DC fabric.</t> | |||
| <t>When the IPv6 data plane is considered, it is assumed that Node X is | ||||
| <t>When IPv6 data-plane is considered, it is assumed that Node X is | ||||
| allocated IPv6 address (segment) 2001:DB8::X.</t> | allocated IPv6 address (segment) 2001:DB8::X.</t> | |||
| </section> | </section> | |||
| <section anchor="eBGP8277" numbered="true" toc="default"> | ||||
| <section anchor="eBGP8277" title="eBGP Labeled Unicast (RFC8277)"> | <name>EBGP Labeled Unicast (RFC 8277)</name> | |||
| <t>Referring to <xref target="FIGLARGE"/> and <xref | <t>Referring to <xref target="FIGLARGE" format="default"/> and | |||
| target="RFC7938"/>, the following design modifications are | <xref target="RFC7938" format="default"/>, the following design modificat | |||
| introduced:<list style="symbols"> | ions are | |||
| <t>Each node peers with its neighbors via a eBGP session with | introduced:</t> | |||
| extensions defined in <xref target="RFC8277"/> (named "eBGP8277" | <ul spacing="normal"> | |||
| throughout this document) and with the BGP-Prefix-SID attribute | <li>Each node peers with its neighbors via an EBGP session with | |||
| extension as defined in <xref | extensions defined in <xref target="RFC8277" format="default"/> (nam | |||
| target="I-D.ietf-idr-bgp-prefix-sid"/>.</t> | ed "EBGP8277" | |||
| throughout this document) and with the BGP Prefix-SID attribute | ||||
| <t>The forwarding plane at Tier-2 and Tier-1 is MPLS.</t> | extension as defined in <xref target="RFC8669" format="default"/>.</ | |||
| li> | ||||
| <t>The forwarding plane at Tier-3 is either IP2MPLS (if the host | <li>The forwarding plane at Tier-2 and Tier-1 is MPLS.</li> | |||
| sends IP traffic) or MPLS2MPLS (if the host sends MPLS- | <li>The forwarding plane at Tier-3 is either IP2MPLS (if the host | |||
| encapsulated traffic).</t> | sends IP traffic) or MPLS2MPLS (if the host sends MPLS-encapsulated | |||
| </list></t> | traffic).</li> | |||
| </ul> | ||||
| <t><xref target="FIGSMALL"/> zooms into a path from server A to server | <t><xref target="FIGSMALL" format="default"/> zooms into a path from Ser | |||
| Z within the topology of <xref target="FIGLARGE"/>.</t> | verA to ServerZ within the topology of <xref target="FIGLARGE" format="default"/ | |||
| >.</t> | ||||
| <figure anchor="FIGSMALL" | <figure anchor="FIGSMALL"> | |||
| title="Path from A to Z via nodes 1, 4, 7, 10 and 11"> | <name>Path from A to Z via Nodes 1, 4, 7, 10, and 11</name> | |||
| <artwork> +-----+ +-----+ +-----+ | <artwork name="" type="" align="left" alt=""><![CDATA[ | |||
| +---------->|NODE | |NODE | |NODE | | +-----+ +-----+ +-----+ | |||
| | | 4 |--+->| 7 |--+--| 10 |---+ | +---------->|NODE | |NODE | |NODE | | |||
| | | 4 |--+->| 7 |--+--| 10 |---+ | ||||
| | +-----+ +-----+ +-----+ | | | +-----+ +-----+ +-----+ | | |||
| | | | | | | |||
| +-----+ +-----+ | +-----+ +-----+ | |||
| |NODE | |NODE | | |NODE | |NODE | | |||
| | 1 | | 11 | | | 1 | | 11 | | |||
| +-----+ +-----+ | +-----+ +-----+ | |||
| | | | | | | |||
| A <- Servers -> Z | A <- Servers -> Z]]></artwork> | |||
| </artwork> | ||||
| </figure> | </figure> | |||
| <t>Referring to Figures <xref target="FIGLARGE" | ||||
| <t>Referring to <xref target="FIGLARGE"/> and <xref | format="counter"/> and <xref target="FIGSMALL" format="counter"/>, and as | |||
| target="FIGSMALL"/> and assuming the IP address with the AS and | suming the IP address with the AS and | |||
| label-index allocation previously described, the following sections | label-index allocation previously described, the following sections | |||
| detail the control plane operation and the data plane states for the | detail the control-plane operation and the data-plane states for the | |||
| prefix 192.0.2.11/32 (loopback of Node11)</t> | prefix 192.0.2.11/32 (loopback of Node11).</t> | |||
| <section anchor="CONTROLPLANE" numbered="true" toc="default"> | ||||
| <section anchor="CONTROLPLANE" title="Control Plane"> | <name>Control Plane</name> | |||
| <t>Node11 originates 192.0.2.11/32 in BGP and allocates to it a | <t>Node11 originates 192.0.2.11/32 in BGP and allocates to it a | |||
| BGP-Prefix-SID with label-index: index11 <xref | BGP Prefix-SID with label-index: index11 <xref target="RFC8669" format | |||
| target="I-D.ietf-idr-bgp-prefix-sid"/>.</t> | ="default"/>.</t> | |||
| <t>Node11 sends the following EBGP8277 update to Node10:</t> | ||||
| <t>Node11 sends the following eBGP8277 update to Node10:<figure> | <ul empty="true"> | |||
| <artwork>. IP Prefix: 192.0.2.11/32 | ||||
| . Label: Implicit-Null | <li> | |||
| . Next-hop: Node11’s interface address on the link to Node10 | <dl> | |||
| . AS Path: {11} | ||||
| . BGP-Prefix-SID: Label-Index 11 | <dt>IP Prefix: | |||
| </artwork> | </dt> | |||
| </figure></t> | <dd>192.0.2.11/32 | |||
| </dd> | ||||
| <dt>Label: | ||||
| </dt> | ||||
| <dd>Implicit NULL | ||||
| </dd> | ||||
| <dt>Next hop: | ||||
| </dt> | ||||
| <dd>Node11's interface address on the link to Node10 | ||||
| </dd> | ||||
| <dt>AS Path: | ||||
| </dt> | ||||
| <dd>{11} | ||||
| </dd> | ||||
| <dt>BGP Prefix-SID: | ||||
| </dt> | ||||
| <dd>Label-Index 11 | ||||
| </dd> | ||||
| </dl> | ||||
| </li> | ||||
| </ul> | ||||
| <t>Node10 receives the above update. As it is SR capable, Node10 is | <t>Node10 receives the above update. As it is SR capable, Node10 is | |||
| able to interpret the BGP-Prefix-SID and hence understands that it | able to interpret the BGP Prefix-SID; therefore, it understands that it | |||
| should allocate the label from its own SRGB block, offset by the | should allocate the label from its own SRGB block, offset by the | |||
| Label-Index received in the BGP-Prefix-SID (16000+11 hence 16011) to | label index received in the BGP Prefix-SID (16000+11, hence, 16011) to | |||
| the NLRI instead of allocating a non-deterministic label out of a | the Network Layer Reachability Information (NLRI) instead of | |||
| dynamically allocated portion of the local label space. The | allocating a nondeterministic label out of a dynamically allocated | |||
| implicit-null label in the NLRI tells Node10 that it is the | portion of the local label space. The implicit NULL label in the | |||
| penultimate hop and must pop the top label on the stack before | NLRI tells Node10 that it is the penultimate hop and that it must pop | |||
| forwarding traffic for this prefix to Node11.</t> | the | |||
| top label on the stack before forwarding traffic for this prefix to | ||||
| Node11.</t> | ||||
| <t>Then, Node10 sends the following EBGP8277 update to Node7:</t> | ||||
| <t>Then, Node10 sends the following eBGP8277 update to Node7:<figure> | <ul empty="true"> | |||
| <artwork>. IP Prefix: 192.0.2.11/32 | ||||
| . Label: 16011 | <li> | |||
| . Next-hop: Node10’s interface address on the link to Node7 | <dl> | |||
| . AS Path: {10, 11} | ||||
| . BGP-Prefix-SID: Label-Index 11 | <dt>IP Prefix: | |||
| </artwork> | </dt> | |||
| </figure></t> | <dd>192.0.2.11/32 | |||
| </dd> | ||||
| <dt>Label: | ||||
| </dt> | ||||
| <dd>16011 | ||||
| </dd> | ||||
| <dt>Next hop: | ||||
| </dt> | ||||
| <dd>Node10's interface address on the link to Node7 | ||||
| </dd> | ||||
| <dt>AS Path: | ||||
| </dt> | ||||
| <dd>{10, 11} | ||||
| </dd> | ||||
| <dt>BGP Prefix-SID: | ||||
| </dt> | ||||
| <dd>Label-Index 11 | ||||
| </dd> | ||||
| </dl> | ||||
| </li> | ||||
| </ul> | ||||
| <t>Node7 receives the above update. As it is SR capable, Node7 is | <t>Node7 receives the above update. As it is SR capable, Node7 is | |||
| able to interpret the BGP-Prefix-SID and hence allocates the local | able to interpret the BGP Prefix-SID; therefore, it allocates the local | |||
| (incoming) label 16011 (16000 + 11) to the NLRI (instead of | (incoming) label 16011 (16000 + 11) to the NLRI (instead of | |||
| allocating a “dynamic” local label from its label | allocating a "dynamic" local label from its label | |||
| manager). Node7 uses the label in the received eBGP8277 NLRI as the | manager). Node7 uses the label in the received EBGP8277 NLRI as the | |||
| outgoing label (the index is only used to derive the local/incoming | outgoing label (the index is only used to derive the local/incoming | |||
| label).</t> | label).</t> | |||
| <t>Node7 sends the following EBGP8277 update to Node4:</t> | ||||
| <t>Node7 sends the following eBGP8277 update to Node4:<figure> | <ul empty="true"> | |||
| <artwork>. IP Prefix: 192.0.2.11/32 | ||||
| . Label: 16011 | <li> | |||
| . Next-hop: Node7’s interface address on the link to Node4 | <dl> | |||
| . AS Path: {7, 10, 11} | ||||
| . BGP-Prefix-SID: Label-Index 11 | <dt>IP Prefix: | |||
| </artwork> | </dt> | |||
| </figure></t> | <dd>192.0.2.11/32 | |||
| </dd> | ||||
| <dt>Label: | ||||
| </dt> | ||||
| <dd>16011 | ||||
| </dd> | ||||
| <dt>Next hop: | ||||
| </dt> | ||||
| <dd>Node7's interface address on the link to Node4 | ||||
| </dd> | ||||
| <dt>AS Path: | ||||
| </dt> | ||||
| <dd>{7, 10, 11} | ||||
| </dd> | ||||
| <dt>BGP Prefix-SID: | ||||
| </dt> | ||||
| <dd>Label-Index 11 | ||||
| </dd> | ||||
| </dl> | ||||
| </li> | ||||
| </ul> | ||||
| <t>Node4 receives the above update. As it is SR capable, Node4 is | <t>Node4 receives the above update. As it is SR capable, Node4 is | |||
| able to interpret the BGP-Prefix-SID and hence allocates the local | able to interpret the BGP Prefix-SID; therefore, it allocates the local | |||
| (incoming) label 16011 to the NLRI (instead of allocating a | (incoming) label 16011 to the NLRI (instead of allocating a | |||
| “dynamic” local label from its label manager). Node4 | "dynamic" local label from its label manager). Node4 | |||
| uses the label in the received eBGP8277 NLRI as outgoing label (the | uses the label in the received EBGP8277 NLRI as an outgoing label (the | |||
| index is only used to derive the local/incoming label).</t> | index is only used to derive the local/incoming label).</t> | |||
| <t>Node4 sends the following EBGP8277 update to Node1:</t> | ||||
| <t>Node4 sends the following eBGP8277 update to Node1:<figure> | <ul empty="true"> | |||
| <artwork>. IP Prefix: 192.0.2.11/32 | ||||
| . Label: 16011 | <li> | |||
| . Next-hop: Node4’s interface address on the link to Node1 | <dl> | |||
| . AS Path: {4, 7, 10, 11} | ||||
| . BGP-Prefix-SID: Label-Index 11 | <dt>IP Prefix: | |||
| </artwork> | </dt> | |||
| </figure></t> | <dd>192.0.2.11/32 | |||
| </dd> | ||||
| <dt>Label: | ||||
| </dt> | ||||
| <dd>16011 | ||||
| </dd> | ||||
| <dt>Next hop: | ||||
| </dt> | ||||
| <dd>Node4's interface address on the link to Node1 | ||||
| </dd> | ||||
| <dt>AS Path: | ||||
| </dt> | ||||
| <dd>{4, 7, 10, 11} | ||||
| </dd> | ||||
| <dt>BGP Prefix-SID: | ||||
| </dt> | ||||
| <dd>Label-Index 11 | ||||
| </dd> | ||||
| </dl> | ||||
| </li> | ||||
| </ul> | ||||
| <t>Node1 receives the above update. As it is SR capable, Node1 is | <t>Node1 receives the above update. As it is SR capable, Node1 is | |||
| able to interpret the BGP-Prefix-SID and hence allocates the local | able to interpret the BGP Prefix-SID; therefore, it allocates the local | |||
| (incoming) label 16011 to the NLRI (instead of allocating a | (incoming) label 16011 to the NLRI (instead of allocating a | |||
| “dynamic” local label from its label manager). Node1 | "dynamic" local label from its label manager). Node1 | |||
| uses the label in the received eBGP8277 NLRI as outgoing label (the | uses the label in the received EBGP8277 NLRI as an outgoing label (the | |||
| index is only used to derive the local/incoming label).</t> | index is only used to derive the local/incoming label).</t> | |||
| </section> | </section> | |||
| <section anchor="DATAPLANE" numbered="true" toc="default"> | ||||
| <section anchor="DATAPLANE" title="Data Plane"> | <name>Data Plane</name> | |||
| <t>Referring to <xref target="FIGLARGE"/>, and assuming all nodes | <t>Referring to <xref target="FIGLARGE" format="default"/>, and assumi | |||
| ng all nodes | ||||
| apply the same advertisement rules described above and all nodes | apply the same advertisement rules described above and all nodes | |||
| have the same SRGB (16000-23999), here are the IP/MPLS forwarding | have the same SRGB (16000-23999), here are the IP/MPLS forwarding | |||
| tables for prefix 192.0.2.11/32 at Node1, Node4, Node7 and | tables for prefix 192.0.2.11/32 at Node1, Node4, Node7, and | |||
| Node10.</t> | Node10.</t> | |||
| <figure align="left" anchor="NODE1FIB" | <table anchor="NODE1FIB"> | |||
| title="Node1 Forwarding Table"> | ||||
| <artwork align="center">-------------------------------------------- | ||||
| --- | ||||
| Incoming label | outgoing label | Outgoing | ||||
| or IP destination | | Interface | ||||
| 16011 | 16011 | ECMP{3, 4} | ||||
| 192.0.2.11/32 | 16011 | ECMP{3, 4} | ||||
| </figure> | ||||
| <figure anchor="NODE4FIB" suppress-title="false" | <name>Node1 Forwarding Table | |||
| title="Node4 Forwarding Table"> | </name> | |||
| <artwork align="center"> | ||||
| Incoming label | outgoing label | Outgoing | ||||
| or IP destination | | Interface | ||||
| 16011 | 16011 | ECMP{7, 8} | ||||
| 192.0.2.11/32 | 16011 | ECMP{7, 8} | ||||
| </figure> | ||||
| <figure anchor="NODE7FIB" suppress-title="false" | <tbody> | |||
| title="Node7 Forwarding Table"> | ||||
| <artwork align="center"> | ||||
| Incoming label | outgoing label | Outgoing | ||||
| or IP destination | | Interface | ||||
| 16011 | 16011 | 10 | ||||
| 192.0.2.11/32 | 16011 | 10 | ||||
| </figure> | ||||
| <figure anchor="NODE10FIB" suppress-title="true" | <tr> | |||
| title="Node10 Forwarding Table"> | <td align="center">Incoming Label or IP Destination | |||
| <artwork align="center"> | </td> | |||
| Incoming label | outgoing label | Outgoing | <td align="center">Outgoing Label | |||
| or IP destination | | Interface | </td> | |||
| 16011 | POP | 11 | <td align="center">Outgoing Interface | |||
| 192.0.2.11/32 | N/A | 11 | </td> | |||
| </figure> | </tr> | |||
| </section> | ||||
| <section anchor="VARIATIONS" title="Network Design Variation"> | <tr> | |||
| <td align="center">16011 | ||||
| </td> | ||||
| <td align="center">16011 | ||||
| </td> | ||||
| <td align="center">ECMP{3, 4} | ||||
| </td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">192.0.2.11/32 | ||||
| </td> | ||||
| <td align="center">16011 | ||||
| </td> | ||||
| <td align="center">ECMP{3, 4} | ||||
| </td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <table anchor="NODE4FIB"> | ||||
| <name>Node4 Forwarding Table | ||||
| </name> | ||||
| <tbody > | ||||
| <tr> | ||||
| <td align="center">Incoming Label or IP Destination | ||||
| </td> | ||||
| <td align="center">Outgoing Label | ||||
| </td> | ||||
| <td align="center">Outgoing Interface | ||||
| </td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">16011 | ||||
| </td> | ||||
| <td align="center">16011 | ||||
| </td> | ||||
| <td align="center">ECMP{7, 8} | ||||
| </td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">192.0.2.11/32 | ||||
| </td> | ||||
| <td align="center">16011 | ||||
| </td> | ||||
| <td align="center">ECMP{7, 8} | ||||
| </td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <table anchor="NODE7FIB"> | ||||
| <name>Node7 Forwarding Table | ||||
| </name> | ||||
| <tbody > | ||||
| <tr > | ||||
| <td align="center">Incoming Label or IP Destination | ||||
| </td> | ||||
| <td align="center">Outgoing Label | ||||
| </td> | ||||
| <td align="center">Outgoing Interface | ||||
| </td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">16011 | ||||
| </td> | ||||
| <td align="center">16011 | ||||
| </td> | ||||
| <td align="center">10 | ||||
| </td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">192.0.2.11/32 | ||||
| </td> | ||||
| <td align="center">16011 | ||||
| </td> | ||||
| <td align="center">10 | ||||
| </td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <table anchor="NODE10FIB"> | ||||
| <name>Node10 Forwarding Table | ||||
| </name> | ||||
| <tbody > | ||||
| <tr > | ||||
| <td align="center">Incoming Label or IP Destination | ||||
| </td> | ||||
| <td align="center">Outgoing Label | ||||
| </td> | ||||
| <td align="center">Outgoing Interface | ||||
| </td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">16011 | ||||
| </td> | ||||
| <td align="center">POP | ||||
| </td> | ||||
| <td align="center">11 | ||||
| </td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">192.0.2.11/32 | ||||
| </td> | ||||
| <td align="center">N/A | ||||
| </td> | ||||
| <td align="center">11 | ||||
| </td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| </section> | ||||
| <section anchor="VARIATIONS" numbered="true" toc="default"> | ||||
| <name>Network Design Variation</name> | ||||
| <t>A network design choice could consist of switching all the | <t>A network design choice could consist of switching all the | |||
| traffic through Tier-1 and Tier-2 as MPLS traffic. In this case, one | traffic through Tier-1 and Tier-2 as MPLS traffic. In this case, one | |||
| could filter away the IP entries at Node4, Node7 and Node10. This | could filter away the IP entries at Node4, Node7, and Node10. This | |||
| might be beneficial in order to optimize the forwarding table | might be beneficial in order to optimize the forwarding table | |||
| size.</t> | size.</t> | |||
| <t>A network design choice could consist in allowing the hosts to | <t>A network design choice could consist of allowing the hosts to | |||
| send MPLS-encapsulated traffic based on the Egress Peer Engineering | send MPLS-encapsulated traffic based on the Egress Peer Engineering | |||
| (EPE) use-case as defined in <xref | (EPE) use case as defined in <xref target="I-D.ietf-spring-segment-rou | |||
| target="I-D.ietf-spring-segment-routing-central-epe"/>. For example, | ting-central-epe" format="default"/>. For example, | |||
| applications at HostA would send their Z-destined traffic to Node1 | applications at HostA would send their Z-destined traffic to Node1 | |||
| with an MPLS label stack where the top label is 16011 and the next | with an MPLS label stack where the top label is 16011 and the next | |||
| label is an EPE peer segment (<xref | label is an EPE peer segment (<xref target="I-D.ietf-spring-segment-ro | |||
| target="I-D.ietf-spring-segment-routing-central-epe"/>) at Node11 | uting-central-epe" format="default"/>) at Node11 | |||
| directing the traffic to Z.</t> | directing the traffic to Z.</t> | |||
| </section> | </section> | |||
| <section anchor="FABRIC" numbered="true" toc="default"> | ||||
| <section anchor="FABRIC" | <name>Global BGP Prefix Segment through the Fabric</name> | |||
| title="Global BGP Prefix Segment through the fabric"> | ||||
| <t>When the previous design is deployed, the operator enjoys global | <t>When the previous design is deployed, the operator enjoys global | |||
| BGP-Prefix-SID and label allocation throughout the DC fabric.</t> | BGP Prefix-SID and label allocation throughout the DC fabric.</t> | |||
| <t>A few examples follow:</t> | ||||
| <t>A few examples follow:<list style="symbols"> | <ul spacing="normal"> | |||
| <t>Normal forwarding to Node11: a packet with top label 16011 | <li>Normal forwarding to Node11: A packet with top label 16011 | |||
| received by any node in the fabric will be forwarded along the | received by any node in the fabric will be forwarded along the | |||
| ECMP-aware BGP best-path towards Node11 and the label 16011 is | ECMP-aware BGP best path towards Node11, and the label 16011 is | |||
| penultimate-popped at Node10 (or at Node 9).</t> | penultimate popped at Node10 (or at Node9).</li> | |||
| <li>Traffic-engineered path to Node11: An application on a host | ||||
| <t>Traffic-engineered path to Node11: an application on a host | ||||
| behind Node1 might want to restrict its traffic to paths via the | behind Node1 might want to restrict its traffic to paths via the | |||
| Spine node Node5. The application achieves this by sending its | Spine node Node5. The application achieves this by sending its | |||
| packets with a label stack of {16005, 16011}. BGP Prefix SID | packets with a label stack of {16005, 16011}. BGP Prefix-SID | |||
| 16005 directs the packet up to Node5 along the path (Node1, | 16005 directs the packet up to Node5 along the path (Node1, | |||
| Node3, Node5). BGP-Prefix-SID 16011 then directs the packet down | Node3, Node5). BGP Prefix-SID 16011 then directs the packet down | |||
| to Node11 along the path (Node5, Node9, Node11).</t> | to Node11 along the path (Node5, Node9, Node11).</li> | |||
| </list></t> | </ul> | |||
| </section> | </section> | |||
| <section anchor="INCRDEP" numbered="true" toc="default"> | ||||
| <section anchor="INCRDEP" title="Incremental Deployments"> | <name>Incremental Deployments</name> | |||
| <t>The design previously described can be deployed incrementally. | <t>The design previously described can be deployed incrementally. | |||
| Let us assume that Node7 does not support the BGP-Prefix-SID and let | Let us assume that Node7 does not support the BGP Prefix-SID, and let | |||
| us show how the fabric connectivity is preserved.</t> | us show how the fabric connectivity is preserved.</t> | |||
| <t>From a signaling viewpoint, nothing would change; even though | ||||
| <t>From a signaling viewpoint, nothing would change: even though | Node7 does not support the BGP Prefix-SID, it does propagate the | |||
| Node7 does not support the BGP-Prefix-SID, it does propagate the | ||||
| attribute unmodified to its neighbors.</t> | attribute unmodified to its neighbors.</t> | |||
| <t>From a label-allocation viewpoint, the only difference is that | ||||
| <t>From a label allocation viewpoint, the only difference is that | ||||
| Node7 would allocate a dynamic (random) label to the prefix | Node7 would allocate a dynamic (random) label to the prefix | |||
| 192.0.2.11/32 (e.g. 123456) instead of the "hinted" label as | 192.0.2.11/32 (e.g., 12345) instead of the "hinted" label as | |||
| instructed by the BGP-Prefix-SID. The neighbors of Node7 adapt | instructed by the BGP Prefix-SID. The neighbors of Node7 adapt | |||
| automatically as they always use the label in the BGP8277 NLRI as | automatically as they always use the label in the BGP8277 NLRI as | |||
| outgoing label.</t> | an outgoing label.</t> | |||
| <t>Node4 does understand the BGP Prefix-SID; therefore, it allocates t | ||||
| <t>Node4 does understand the BGP-Prefix-SID and hence allocates the | he | |||
| indexed label in the SRGB (16011) for 192.0.2.11/32.</t> | indexed label in the SRGB (16011) for 192.0.2.11/32.</t> | |||
| <t>As a result, all the data-plane entries across the network would | <t>As a result, all the data-plane entries across the network would | |||
| be unchanged except the entries at Node7 and its neighbor Node4 as | be unchanged except the entries at Node7 and its neighbor Node4 as | |||
| shown in the figures below.</t> | shown in the tables below.</t> | |||
| <t>The key point is that the end-to-end Label Switched Path (LSP) is | <t>The key point is that the end-to-end Label Switched Path (LSP) is | |||
| preserved because the outgoing label is always derived from the | preserved because the outgoing label is always derived from the | |||
| received label within the BGP8277 NLRI. The index in the | received label within the BGP8277 NLRI. The index in the | |||
| BGP-Prefix-SID is only used as a hint on how to allocate the local | BGP Prefix-SID is only used as a hint on how to allocate the local | |||
| label (the incoming label) but never for the outgoing label.</t> | label (the incoming label) but never for the outgoing label.</t> | |||
| <figure anchor="NODE7FIBINC" title="Node7 Forwarding Table"> | <table anchor="NODE7FIBINC"> | |||
| <artwork align="center">------------------------------------------ | ||||
| Incoming label | outgoing | Outgoing | ||||
| or IP destination | label | Interface | ||||
| 12345 | 16011 | 10 | ||||
| </artwork> | ||||
| </figure> | ||||
| <figure anchor="NODE4FIBINC" title="Node4 Forwarding Table"> | <name>Node7 Forwarding Table | |||
| <artwork align="center">------------------------------------------ | </name> | |||
| Incoming label | outgoing | Outgoing | ||||
| or IP destination | label | Interface | ||||
| 16011 | 12345 | 7 | ||||
| </artwork> | ||||
| </figure> | ||||
| <t>The BGP-Prefix-SID can thus be deployed incrementally one node at | <tbody > | |||
| a time.</t> | ||||
| <t>When deployed together with a homogeneous SRGB (same SRGB across | <tr > | |||
| <td align="center">Incoming Label or IP Destination | ||||
| </td> | ||||
| <td align="center">Outgoing Label | ||||
| </td> | ||||
| <td align="center">Outgoing Interface | ||||
| </td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">12345 | ||||
| </td> | ||||
| <td align="center">16011 | ||||
| </td> | ||||
| <td align="center">10 | ||||
| </td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <table anchor="NODE4FIBINC"> | ||||
| <name>Node4 Forwarding Table | ||||
| </name> | ||||
| <tbody > | ||||
| <tr > | ||||
| <td align="center">Incoming Label or IP Destination | ||||
| </td> | ||||
| <td align="center">Outgoing Label | ||||
| </td> | ||||
| <td align="center">Outgoing Interface | ||||
| </td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">16011 | ||||
| </td> | ||||
| <td align="center">12345 | ||||
| </td> | ||||
| <td align="center">7 | ||||
| </td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <t>The BGP Prefix-SID can thus be deployed incrementally, i.e., one no | ||||
| de at | ||||
| a time.</t> | ||||
| <t>When deployed together with a homogeneous SRGB (the same SRGB acros | ||||
| s | ||||
| the fabric), the operator incrementally enjoys the global prefix | the fabric), the operator incrementally enjoys the global prefix | |||
| segment benefits as the deployment progresses through the | segment benefits as the deployment progresses through the | |||
| fabric.</t> | fabric.</t> | |||
| </section> | </section> | |||
| </section> | </section> | |||
| <section anchor="iBGP3107" numbered="true" toc="default"> | ||||
| <name>IBGP Labeled Unicast (RFC 8277)</name> | ||||
| <t>The same exact design as EBGP8277 is used with the following | ||||
| modifications:</t> | ||||
| <ul spacing="normal"> | ||||
| <li>All nodes use the same AS number.</li> | ||||
| <li>Each node peers with its neighbors via an internal BGP session | ||||
| (IBGP) with extensions defined in <xref target="RFC8277" format="def | ||||
| ault"/> (named | ||||
| "IBGP8277" throughout this document).</li> | ||||
| <li>Each node acts as a route reflector for each of its neighbors | ||||
| and with the next-hop-self option. Next-hop-self is a well-known | ||||
| operational feature that consists of rewriting the next hop of a | ||||
| BGP update prior to sending it to the neighbor. Usually, | ||||
| it's a common practice to apply next-hop-self behavior | ||||
| towards IBGP peers for EBGP-learned routes. In the case outlined | ||||
| in this section, it is proposed to use the next-hop-self mechanism | ||||
| also to IBGP-learned routes.</li></ul> | ||||
| <section anchor="iBGP3107" title="iBGP Labeled Unicast (RFC8277)"> | <figure anchor="IBGPFIG"> | |||
| <t>The same exact design as eBGP8277 is used with the following | <name>IBGP Sessions with Reflection and Next-Hop-Self</name> | |||
| modifications:<list> | <artwork name="" type="" align="left" alt=""><![CDATA[ | |||
| <t>All nodes use the same AS number.</t> | ||||
| <t>Each node peers with its neighbors via an internal BGP session | ||||
| (iBGP) with extensions defined in <xref target="RFC8277"/> (named | ||||
| "iBGP8277" throughout this document).</t> | ||||
| <t>Each node acts as a route-reflector for each of its neighbors | ||||
| and with the next-hop-self option. Next-hop-self is a well known | ||||
| operational feature which consists of rewriting the next-hop of a | ||||
| BGP update prior to send it to the neighbor. Usually, it’s a | ||||
| common practice to apply next-hop-self behavior towards iBGP peers | ||||
| for eBGP learned routes. In the case outlined in this section it | ||||
| is proposed to use the next-hop-self mechanism also to iBGP | ||||
| learned routes.</t> | ||||
| <t><figure anchor="IBGPFIG" | ||||
| title="iBGP Sessions with Reflection and Next-Hop-Self"> | ||||
| <artwork> | ||||
| Cluster-1 | Cluster-1 | |||
| +-----------+ | +-----------+ | |||
| | Tier-1 | | | Tier-1 | | |||
| | +-----+ | | | +-----+ | | |||
| | |NODE | | | | |NODE | | | |||
| | | 5 | | | | | 5 | | | |||
| Cluster-2 | +-----+ | Cluster-3 | Cluster-2 | +-----+ | Cluster-3 | |||
| +---------+ | | +---------+ | +---------+ | | +---------+ | |||
| | Tier-2 | | | | Tier-2 | | | Tier-2 | | | | Tier-2 | | |||
| | +-----+ | | +-----+ | | +-----+ | | | +-----+ | | +-----+ | | +-----+ | | |||
| skipping to change at line 622 ¶ | skipping to change at line 798 ¶ | |||
| | | 4 | | | | 7 | | | | 10 | | | | | 4 | | | | 7 | | | | 10 | | | |||
| | +-----+ | | +-----+ | | +-----+ | | | +-----+ | | +-----+ | | +-----+ | | |||
| +---------+ | | +---------+ | +---------+ | | +---------+ | |||
| | | | | | | |||
| | +-----+ | | | +-----+ | | |||
| | |NODE | | | | |NODE | | | |||
| Tier-3 | | 8 | | Tier-3 | Tier-3 | | 8 | | Tier-3 | |||
| +-----+ +-----+ | +-----+ | +-----+ +-----+ | +-----+ +-----+ | +-----+ | +-----+ +-----+ | |||
| |NODE | |NODE | +-----------+ |NODE | |NODE | | |NODE | |NODE | +-----------+ |NODE | |NODE | | |||
| | 1 | | 2 | | 11 | | 12 | | | 1 | | 2 | | 11 | | 12 | | |||
| +-----+ +-----+ +-----+ +-----+ | +-----+ +-----+ +-----+ +-----+]]></artwork> | |||
| </artwork> | </figure> | |||
| </figure></t> | <ul spacing="normal"> | |||
| <li> | ||||
| <t>For simple and efficient route propagation filtering and as | <t>For simple and efficient route propagation filtering and as | |||
| illustrated in <xref target="IBGPFIG"/>: <list> | illustrated in <xref target="IBGPFIG" format="default"/>: </t> | |||
| <t>Node5, Node6, Node7 and Node8 use the same Cluster ID | <ul spacing="normal"> | |||
| (Cluster-1)</t> | <li>Node5, Node6, Node7, and Node8 use the same Cluster ID | |||
| (Cluster-1).</li> | ||||
| <t>Node3 and Node4 use the same Cluster ID (Cluster-2)</t> | <li>Node3 and Node4 use the same Cluster ID (Cluster-2).</li> | |||
| <li>Node9 and Node10 use the same Cluster ID (Cluster-3).</li> | ||||
| <t>Node9 and Node10 use the same Cluster ID (Cluster-3)</t> | </ul> | |||
| </list></t> | </li> | |||
| <li>The control-plane behavior is mostly the same as described in | ||||
| <t>The control-plane behavior is mostly the same as described in | the previous section; the only difference is that the EBGP8277 | |||
| the previous section: the only difference is that the eBGP8277 | path propagation is simply replaced by an IBGP8277 path reflection | |||
| path propagation is simply replaced by an iBGP8277 path reflection | with next hop changed to self.</li> | |||
| with next-hop changed to self.</t> | <li>The data-plane tables are exactly the same.</li> | |||
| </ul> | ||||
| <t>The data-plane tables are exactly the same.</t> | ||||
| </list></t> | ||||
| </section> | </section> | |||
| </section> | </section> | |||
| <section anchor="IPV6" numbered="true" toc="default"> | ||||
| <section anchor="IPV6" | <name>Applying Segment Routing in the DC with IPv6 Data Plane</name> | |||
| title="Applying Segment Routing in the DC with IPv6 dataplane"> | <t>The design described in <xref target="RFC7938" format="default"/> is re | |||
| <t>The design described in <xref target="RFC7938"/> is reused with one | used with one | |||
| single modification. It is highlighted using the example of the | single modification. It is highlighted using the example of the | |||
| reachability to Node11 via spine node Node5.</t> | reachability to Node11 via Spine node Node5.</t> | |||
| <t>Node5 originates 2001:DB8::5/128 with the attached BGP Prefix-SID for | ||||
| <t>Node5 originates 2001:DB8::5/128 with the attached BGP-Prefix-SID for | IPv6 packets destined to segment 2001:DB8::5 (<xref target="RFC8402" forma | |||
| IPv6 packets destined to segment 2001:DB8::5 (<xref | t="default"/>).</t> | |||
| target="I-D.ietf-idr-bgp-prefix-sid"/>).</t> | <t>Node11 originates 2001:DB8::11/128 with the attached BGP Prefix-SID | |||
| advertising the support of the Segment Routing Header (SRH) for IPv6 packe | ||||
| <t>Node11 originates 2001:DB8::11/128 with the attached BGP-Prefix-SID | ts destined to segment | |||
| advertising the support of the SRH for IPv6 packets destined to segment | ||||
| 2001:DB8::11.</t> | 2001:DB8::11.</t> | |||
| <t>The control-plane and data-plane processing of all the other nodes in | <t>The control-plane and data-plane processing of all the other nodes in | |||
| the fabric is unchanged. Specifically, the routes to 2001:DB8::5 and | the fabric is unchanged. Specifically, the routes to 2001:DB8::5 and | |||
| 2001:DB8::11 are installed in the FIB along the eBGP best-path to Node5 | 2001:DB8::11 are installed in the FIB along the EBGP best path to Node5 | |||
| (spine node) and Node11 (ToR node) respectively.</t> | (Spine node) and Node11 (ToR node), respectively.</t> | |||
| <t>An application on HostA that needs to send traffic to HostZ via only | ||||
| <t>An application on HostA which needs to send traffic to HostZ via only | Node5 (Spine node) can do so by sending IPv6 packets with a Segment | |||
| Node5 (spine node) can do so by sending IPv6 packets with a Segment | Routing Header (SRH, <xref target="I-D.ietf-6man-segment-routing-header" f | |||
| Routing header (SRH, <xref | ormat="default"/>). The destination | |||
| target="I-D.ietf-6man-segment-routing-header"/>). The destination | ||||
| address and active segment is set to 2001:DB8::5. The next and last | address and active segment is set to 2001:DB8::5. The next and last | |||
| segment is set to 2001:DB8::11.</t> | segment is set to 2001:DB8::11.</t> | |||
| <t>The application must only use IPv6 addresses that have been | <t>The application must only use IPv6 addresses that have been | |||
| advertised as capable for SRv6 segment processing (e.g. for which the | advertised as capable for SRv6 segment processing (e.g., for which the | |||
| BGP prefix segment capability has been advertised). How applications | BGP Prefix Segment capability has been advertised). How applications | |||
| learn this (e.g.: centralized controller and orchestration) is outside | learn this (e.g., centralized controller and orchestration) is outside | |||
| the scope of this document.</t> | the scope of this document.</t> | |||
| </section> | </section> | |||
| <section anchor="COMMHOSTS" numbered="true" toc="default"> | ||||
| <section anchor="COMMHOSTS" | <name>Communicating Path Information to the Host</name> | |||
| title="Communicating path information to the host"> | ||||
| <t>There are two general methods for communicating path information to | <t>There are two general methods for communicating path information to | |||
| the end-hosts: "proactive" and "reactive", aka "push" and "pull" models. | the end-hosts: "proactive" and "reactive", aka "push" and "pull" models. | |||
| There are multiple ways to implement either of these methods. Here, it | There are multiple ways to implement either of these methods. Here, it | |||
| is noted that one way could be using a centralized controller: the | is noted that one way could be using a centralized controller: the | |||
| controller either tells the hosts of the prefix-to-path mappings | controller either tells the hosts of the prefix-to-path mappings | |||
| beforehand and updates them as needed (network event driven push), or | beforehand and updates them as needed (network event driven push) or | |||
| responds to the hosts making request for a path to specific destination | responds to the hosts making requests for a path to a specific destination | |||
| (host event driven pull). It is also possible to use a hybrid model, | (host event driven pull). It is also possible to use a hybrid model, | |||
| i.e., pushing some state from the controller in response to particular | i.e., pushing some state from the controller in response to particular | |||
| network events, while the host pulls other state on demand.</t> | network events, while the host pulls other state on demand.</t> | |||
| <t>Note also that when disseminating network-related data to the | ||||
| <t>It is also noted, that when disseminating network-related data to the | end-hosts, a trade-off is made to balance the amount of information | |||
| end-hosts a trade-off is made to balance the amount of information Vs. | vs. the level of visibility in the network state. This applies | |||
| the level of visibility in the network state. This applies both to push | to both push and pull models. In the extreme case, the host would request | |||
| and pull models. In the extreme case, the host would request path | path information on every flow and keep no local state at all. On the | |||
| information on every flow, and keep no local state at all. On the other | other end of the spectrum, information for every prefix in the network | |||
| end of the spectrum, information for every prefix in the network along | along with available paths could be pushed and continuously updated on | |||
| with available paths could be pushed and continuously updated on all | all hosts.</t> | |||
| hosts.</t> | ||||
| </section> | </section> | |||
| <section anchor="BENEFITS" numbered="true" toc="default"> | ||||
| <section anchor="BENEFITS" title="Additional Benefits"> | <name>Additional Benefits</name> | |||
| <section anchor="MPLSIMPLE" | <section anchor="MPLSIMPLE" numbered="true" toc="default"> | |||
| title="MPLS Dataplane with operational simplicity"> | <name>MPLS Data Plane with Operational Simplicity</name> | |||
| <t>As required by <xref target="RFC7938"/>, no new signaling protocol | <t>As required by <xref target="RFC7938" format="default"/>, no new sign | |||
| is introduced. The BGP-Prefix-SID is a lightweight extension to BGP | aling protocol | |||
| Labeled Unicast <xref target="RFC8277"/>. It applies either to eBGP or | is introduced. The BGP Prefix-SID is a lightweight extension to BGP | |||
| iBGP based designs.</t> | Labeled Unicast <xref target="RFC8277" format="default"/>. It applies ei | |||
| ther to EBGP- or | ||||
| IBGP-based designs.</t> | ||||
| <t>Specifically, LDP and RSVP-TE are not used. These protocols would | <t>Specifically, LDP and RSVP-TE are not used. These protocols would | |||
| drastically impact the operational complexity of the Data Center and | drastically impact the operational complexity of the data center and | |||
| would not scale. This is in line with the requirements expressed in | would not scale. This is in line with the requirements expressed in | |||
| <xref target="RFC7938"/>.</t> | <xref target="RFC7938" format="default"/>.</t> | |||
| <t>Provided the same SRGB is configured on all nodes, all nodes use | <t>Provided the same SRGB is configured on all nodes, all nodes use | |||
| the same MPLS label for a given IP prefix. This is simpler from an | the same MPLS label for a given IP prefix. This is simpler from an | |||
| operation standpoint, as discussed in <xref target="SINGLESRGB"/></t> | operation standpoint, as discussed in <xref target="SINGLESRGB" format="default"/>.</t> | |||
| </section> | </section> | |||
| <section anchor="MINFIB" numbered="true" toc="default"> | ||||
| <section anchor="MINFIB" title="Minimizing the FIB table"> | <name>Minimizing the FIB Table</name> | |||
| <t>The designer may decide to switch all the traffic at Tier-1 and | <t>The designer may decide to switch all the traffic at Tier-1 and | |||
| Tier-2's based on MPLS, hence drastically decreasing the IP table size | Tier-2 based on MPLS, thereby drastically decreasing the IP table size | |||
| at these nodes.</t> | at these nodes.</t> | |||
| <t>This is easily accomplished by encapsulating the traffic either | <t>This is easily accomplished by encapsulating the traffic either | |||
| directly at the host or the source ToR node by pushing the | directly at the host or at the source ToR node. The encapsulation is | |||
| BGP-Prefix-SID of the destination ToR for intra-DC traffic, or the | done by pushing the BGP Prefix-SID of the destination ToR for intra-DC | |||
| BGP-Prefix-SID for the the border node for inter-DC or | traffic, or by pushing the BGP Prefix-SID for the border node for | |||
| DC-to-outside-world traffic.</t> | inter-DC or DC-to-outside-world traffic.</t> | |||
| </section> | </section> | |||
| <section anchor="EPE" numbered="true" toc="default"> | ||||
| <section anchor="EPE" title="Egress Peer Engineering"> | <name>Egress Peer Engineering</name> | |||
| <t>It is straightforward to combine the design illustrated in this | <t>It is straightforward to combine the design illustrated in this | |||
| document with the Egress Peer Engineering (EPE) use-case described in | document with the Egress Peer Engineering (EPE) use case described in | |||
| <xref target="I-D.ietf-spring-segment-routing-central-epe"/>.</t> | <xref target="I-D.ietf-spring-segment-routing-central-epe" format="defau | |||
| lt"/>.</t> | ||||
| <t>In such case, the operator is able to engineer its outbound traffic | <t>In such a case, the operator is able to engineer its outbound traffic | |||
| on a per host-flow basis, without incurring any additional state at | on a per-host-flow basis, without incurring any additional state at | |||
| intermediate points in the DC fabric.</t> | intermediate points in the DC fabric.</t> | |||
| <t>For example, the controller only needs to inject a per-flow state | <t>For example, the controller only needs to inject a per-flow state | |||
| on the HostA to force it to send its traffic destined to a specific | on the HostA to force it to send its traffic destined to a specific | |||
| Internet destination D via a selected border node (say Node12 in <xref | Internet destination D via a selected border node (say Node12 in <xref t | |||
| target="FIGLARGE"/> instead of another border node, Node11) and a | arget="FIGLARGE" format="default"/> instead of another border node, Node11) and | |||
| a | ||||
| specific egress peer of Node12 (say peer AS 9999 of local PeerNode | specific egress peer of Node12 (say peer AS 9999 of local PeerNode | |||
| segment 9999 at Node12 instead of any other peer which provides a path | segment 9999 at Node12 instead of any other peer that provides a path | |||
| to the destination D). Any packet matching this state at host A would | to the destination D). Any packet matching this state at HostA would | |||
| be encapsulated with SR segment list (label stack) {16012, 9999}. | be encapsulated with SR segment list (label stack) {16012, 9999}. | |||
| 16012 would steer the flow through the DC fabric, leveraging any ECMP, | 16012 would steer the flow through the DC fabric, leveraging any ECMP, | |||
| along the best path to border node Node12. Once the flow gets to | along the best path to border node Node12. Once the flow gets to | |||
| border node Node12, the active segment is 9999 (because of PHP on the | border node Node12, the active segment is 9999 (because of Penultimate | |||
| upstream neighbor of Node12). This EPE PeerNode segment forces border | Hop Popping (PHP) on the upstream neighbor of Node12). This EPE | |||
| node Node12 to forward the packet to peer AS 9999, without any IP | PeerNode segment forces border node Node12 to forward the packet to | |||
| lookup at the border node. There is no per-flow state for this | peer AS 9999 without any IP lookup at the border node. There is no | |||
| engineered flow in the DC fabric. A benefit of segment routing is the | per-flow state for this engineered flow in the DC fabric. A benefit of | |||
| per-flow state is only required at the source.</t> | SR is that the per-flow state is only required at the | |||
| source.</t> | ||||
| <t>As well as allowing full traffic engineering control such a design | <t>As well as allowing full traffic-engineering control, such a design | |||
| also offers FIB table minimization benefits as the Internet-scale FIB | also offers FIB table-minimization benefits as the Internet-scale FIB | |||
| at border node Node12 is not required if all FIB lookups are avoided | at border node Node12 is not required if all FIB lookups are avoided | |||
| there by using EPE.</t> | there by using EPE.</t> | |||
| </section> | </section> | |||
| <section anchor="ANYCAST" numbered="true" toc="default"> | ||||
| <section anchor="ANYCAST" title="Anycast"> | <name>Anycast</name> | |||
| <t>The design presented in this document preserves the availability | <t>The design presented in this document preserves the availability | |||
| and load-balancing properties of the base design presented in <xref | and load-balancing properties of the base design presented in <xref targ | |||
| target="I-D.ietf-spring-segment-routing"/>.</t> | et="RFC8402" format="default"/>.</t> | |||
| <t>For example, one could assign an anycast loopback 192.0.2.20/32 and | <t>For example, one could assign an anycast loopback 192.0.2.20/32 and | |||
| associate segment index 20 to it on the border Node11 and Node12 (in | associate segment index 20 to it on the border nodes Node11 and Node12 (in | |||
| addition to their node-specific loopbacks). Doing so, the EPE | addition to their node-specific loopbacks). Doing so, the EPE | |||
| controller could express a default "go-to-the-Internet via any border | controller could express a default "go-to-the-Internet via any border | |||
| node" policy as segment list {16020}. Indeed, from any host in the DC | node" policy as segment list {16020}. Indeed, from any host in the DC | |||
| fabric or from any ToR node, 16020 steers the packet towards the | fabric or from any ToR node, 16020 steers the packet towards the | |||
| border Node11 or Node12 leveraging ECMP where available along the best | border nodes Node11 or Node12 leveraging ECMP where available along the best | |||
| paths to these nodes.</t> | paths to these nodes.</t> | |||
| </section> | </section> | |||
| </section> | </section> | |||
| <section anchor="SINGLESRGB" numbered="true" toc="default"> | ||||
| <section anchor="SINGLESRGB" title="Preferred SRGB Allocation"> | <name>Preferred SRGB Allocation</name> | |||
| <t>In the MPLS case, it is recommend to use same SRGBs at each node.</t> | <t>In the MPLS case, it is recommended to use the same SRGBs at each node. | |||
| </t> | ||||
| <t>Different SRGBs in each node likely increase the complexity of the | <t>Different SRGBs in each node likely increase the complexity of the | |||
| solution both from an operational viewpoint and from a controller | solution both from an operational viewpoint and from a controller | |||
| viewpoint.</t> | viewpoint.</t> | |||
| <t>From an operational viewpoint, it is much simpler to have the same | ||||
| <t>From an operation viewpoint, it is much simpler to have the same | ||||
| global label at every node for the same destination (the MPLS | global label at every node for the same destination (the MPLS | |||
| troubleshooting is then similar to the IPv6 troubleshooting where this | troubleshooting is then similar to the IPv6 troubleshooting where this | |||
| global property is a given).</t> | global property is a given).</t> | |||
| <t>From a controller viewpoint, this allows us to construct simple | <t>From a controller viewpoint, this allows us to construct simple | |||
| policies applicable across the fabric.</t> | policies applicable across the fabric.</t> | |||
| <t>Let us consider two applications, A and B, respectively connected to | ||||
| <t>Let us consider two applications A and B respectively connected to | Node1 and Node2 (ToR nodes). Application A has two flows, FA1 and FA2, des | |||
| Node1 and Node2 (ToR nodes). A has two flows FA1 and FA2 destined to Z. | tined to Z. | |||
| B has two flows FB1 and FB2 destined to Z. The controller wants FA1 and | B has two flows, FB1 and FB2, destined to Z. The controller wants FA1 and | |||
| FB1 to be load-shared across the fabric while FA2 and FB2 must be | FB1 to be load shared across the fabric while FA2 and FB2 must be | |||
| respectively steered via Node5 and Node8.</t> | respectively steered via Node5 and Node8.</t> | |||
| <t>Assuming a consistent unique SRGB across the fabric as described in | <t>Assuming a consistent unique SRGB across the fabric as described in | |||
| the document, the controller can simply do it by instructing A and B to | this document, the controller can simply do it by instructing A and B to | |||
| use {16011} respectively for FA1 and FB1 and by instructing A and B to | use {16011} respectively for FA1 and FB1 and by instructing A and B to | |||
| use {16005 16011} and {16008 16011} respectively for FA2 and FB2.</t> | use {16005 16011} and {16008 16011} respectively for FA2 and FB2.</t> | |||
| <t>Let us assume a design where the SRGB is different at every node and | <t>Let us assume a design where the SRGB is different at every node and | |||
| where the SRGB of each node is advertised using the Originator SRGB TLV | where the SRGB of each node is advertised using the Originator SRGB TLV | |||
| of the BGP-Prefix-SID as defined in <xref | of the BGP Prefix-SID as defined in <xref target="RFC8669" format="default | |||
| target="I-D.ietf-idr-bgp-prefix-sid"/>: SRGB of Node K starts at value | "/>: SRGB of Node K starts at value | |||
| K*1000 and the SRGB length is 1000 (e.g. Node1’s SRGB is [1000, | K*1000, and the SRGB length is 1000 (e.g., Node1's SRGB is [1000, | |||
| 1999], Node2’s SRGB is [2000, 2999], …).</t> | 1999], Node2's SRGB is [2000, 2999], ...).</t> | |||
| <t>In this case, not only the controller would need to collect and store | ||||
| all of these different SRGB’s (e.g., through the Originator SRGB | ||||
| TLV of the BGP-Prefix-SID), furthermore it would need to adapt the | ||||
| policy for each host. Indeed, the controller would instruct A to use | ||||
| {1011} for FA1 while it would have to instruct B to use {2011} for FB1 | ||||
| (while with the same SRGB, both policies are the same {16011}).</t> | ||||
| <t>In this case, the controller would need to collect and store all of | ||||
| these different SRGBs (e.g., through the Originator SRGB TLV of the | ||||
| BGP Prefix-SID); furthermore, it would also need to adapt the policy for | ||||
| each host. Indeed, the controller would instruct A to use {1011} for FA1 | ||||
| while it would have to instruct B to use {2011} for FB1 (while with the | ||||
| same SRGB, both policies are the same {16011}).</t> | ||||
| <t>Even worse, the controller would instruct A to use {1005, 5011} for | <t>Even worse, the controller would instruct A to use {1005, 5011} for | |||
| FA1 while it would instruct B to use {2011, 8011} for FB1 (while with | FA2 while it would instruct B to use {2008, 8011} for FB2 (while with | |||
| the same SRGB, the second segment is the same across both policies: | the same SRGB, the second segment is the same across both policies: | |||
| 16011). When combining segments to create a policy, one need to | 16011). When combining segments to create a policy, one needs to | |||
| carefully update the label of each segment. This is obviously more | carefully update the label of each segment. This is obviously more error | |||
| error-prone, more complex and more difficult to troubleshoot.</t> | prone, more complex, and more difficult to troubleshoot.</t> | |||
| </section> | </section> | |||
| <section anchor="IANA" numbered="true" toc="default"> | ||||
| <section anchor="IANA" title="IANA Considerations"> | <name>IANA Considerations</name> | |||
| <t>This document does not make any IANA request.</t> | <t>This document has no IANA actions.</t> | |||
| </section> | </section> | |||
| <section anchor="MANAGE" numbered="true" toc="default"> | ||||
| <section anchor="MANAGE" title="Manageability Considerations"> | <name>Manageability Considerations</name> | |||
| <t>The design and deployment guidelines described in this document are | <t>The design and deployment guidelines described in this document are | |||
| based on the network design described in <xref target="RFC7938"/>.</t> | based on the network design described in <xref target="RFC7938" format="de | |||
| fault"/>.</t> | ||||
| <t>The deployment model assumed in this document is based on a single | <t>The deployment model assumed in this document is based on a single | |||
| domain where the interconnected DCs are part of the same administrative | domain where the interconnected DCs are part of the same administrative | |||
| domain (which, of course, is split into different autonomous systems). | domain (which, of course, is split into different autonomous systems). | |||
| The operator has full control of the whole domain and the usual | The operator has full control of the whole domain, and the usual | |||
| operational and management mechanisms and procedures are used in order | operational and management mechanisms and procedures are used in order | |||
| to prevent any information related to internal prefixes and topology to | to prevent any information related to internal prefixes and topology from | |||
| be leaked outside the domain.</t> | being leaked outside the domain.</t> | |||
| <t>As recommended in <xref target="RFC8402" format="default"/>, | ||||
| <t>As recommended in <xref target="I-D.ietf-spring-segment-routing"/>, | ||||
| the same SRGB should be allocated in all nodes in order to facilitate | the same SRGB should be allocated in all nodes in order to facilitate | |||
| the design, deployment and operations of the domain.</t> | the design, deployment, and operations of the domain.</t> | |||
| <t>When EPE (<xref target="I-D.ietf-spring-segment-routing-central-epe" fo | ||||
| <t>When EPE (<xref | rmat="default"/>) is used (as | |||
| target="I-D.ietf-spring-segment-routing-central-epe"/>) is used (as | explained in <xref target="EPE" format="default"/>), the same operational | |||
| explained in <xref target="EPE"/>, the same operational model is | model is | |||
| assumed. EPE information is originated and propagated throughout the | assumed. EPE information is originated and propagated throughout the | |||
| domain towards an internal server and unless explicitly configured by | domain towards an internal server, and unless explicitly configured by | |||
| the operator, no EPE information is leaked outside the domain | the operator, no EPE information is leaked outside the domain | |||
| boundaries.</t> | boundaries.</t> | |||
| </section> | </section> | |||
| <section anchor="SEC" numbered="true" toc="default"> | ||||
| <section anchor="SEC" title="Security Considerations"> | <name>Security Considerations</name> | |||
| <t>This document proposes to apply Segment Routing to a well known | <t>This document proposes to apply SR to a well-known | |||
| scalability requirement expressed in <xref target="RFC7938"/> using the | scalability requirement expressed in <xref target="RFC7938" format="defaul | |||
| BGP-Prefix-SID as defined in <xref | t"/> using the | |||
| target="I-D.ietf-idr-bgp-prefix-sid"/>.</t> | BGP Prefix-SID as defined in <xref target="RFC8669" format="default"/>.</t | |||
| > | ||||
| <t>It has to be noted, as described in <xref target="MANAGE"/> that the | <t>It has to be noted, as described in <xref target="MANAGE" format="defau | |||
| design illustrated in <xref target="RFC7938"/> and in this document, | lt"/>, that the | |||
| design illustrated in <xref target="RFC7938" format="default"/> and in thi | ||||
| s document | ||||
| refer to a deployment model where all nodes are under the same | refer to a deployment model where all nodes are under the same | |||
| administration. In this context, it is assumed that the operator doesn't | administration. In this context, it is assumed that the operator doesn't | |||
| want to leak outside of the domain any information related to internal | want to leak outside of the domain any information related to internal | |||
| prefixes and topology. The internal information includes prefix-sid and | prefixes and topology. The internal information includes Prefix-SID and | |||
| EPE information. In order to prevent such leaking, the standard BGP | EPE information. In order to prevent such leaking, the standard BGP | |||
| mechanisms (filters) are applied on the boundary of the domain.</t> | mechanisms (filters) are applied on the boundary of the domain.</t> | |||
| <t>Therefore, the solution proposed in this document does not introduce | <t>Therefore, the solution proposed in this document does not introduce | |||
| any additional security concerns from what expressed in <xref | any additional security concerns beyond what is expressed in <xref target="R | |||
| target="RFC7938"/> and <xref target="I-D.ietf-idr-bgp-prefix-sid"/>. It | FC7938" format="default"/> and <xref target="RFC8669" format="default"/>. It | |||
| is assumed that the security and confidentiality of the prefix and | is assumed that the security and confidentiality of the prefix and | |||
| topology information is preserved by outbound filters at each peering | topology information is preserved by outbound filters at each peering | |||
| point of the domain as described in <xref target="MANAGE"/>.</t> | point of the domain as described in <xref target="MANAGE" format="default" | |||
| </section> | />.</t> | |||
| <section anchor="Acknowledgements" title="Acknowledgements"> | ||||
| <t>The authors would like to thank Benjamin Black, Arjun Sreekantiah, | ||||
| Keyur Patel, Acee Lindem and Anoop Ghanwani for their comments and | ||||
| review of this document.</t> | ||||
| </section> | </section> | |||
| </middle> | ||||
| <back> | ||||
| <displayreference | ||||
| target="I-D.ietf-spring-segment-routing-central-epe" | ||||
| to="SR-CENTRAL-EPE"/> | ||||
| <section anchor="Contributors" title="Contributors"> | <displayreference target="I-D.ietf-6man-segment-routing-header" | |||
| <figure> | to="IPv6-SRH"/> | |||
| <artwork>Gaya Nagarajan | ||||
| US | ||||
| Email: gaya@fb.com</artwork> | <references> | |||
| </figure> | <name>References</name> | |||
| <references> | ||||
| <name>Normative References</name> | ||||
| <figure> | <xi:include href="https://xml2rfc.tools.ietf.org/public/rfc/bibxml/refer | |||
| <artwork>Gaurav Dawra | ence.RFC.8277.xml"/> | |||
| Cisco Systems | <xi:include href="https://xml2rfc.tools.ietf.org/public/rfc/bibxml/refer | |||
| US | ence.RFC.4271.xml"/> | |||
| <xi:include href="https://xml2rfc.tools.ietf.org/public/rfc/bibxml/refer | ||||
| ence.RFC.7938.xml"/> | ||||
| <!--I-D.ietf-spring-segment-routing became RFC 8402 --> | ||||
| <xi:include href="https://xml2rfc.tools.ietf.org/public/rfc/bibxml/refer | ||||
| ence.RFC.8402.xml"/> | ||||
| Email: gdawra.ietf@gmail.com</artwork> | <!-- I-D.ietf-idr-bgp-prefix-sid-27: companion document--> | |||
| </figure> | <reference anchor='RFC8669' target='https://www.rfc-editor.org/info/rfc8669'> | |||
| <front> | ||||
| <title>Segment Routing Prefix Segment Identifier Extensions for BGP</title> | ||||
| <figure> | <author initials='S' surname='Previdi' fullname='Stefano Previdi'> | |||
| <artwork>Dmitry Afanasiev | <organization /> | |||
| Yandex | </author> | |||
| RU | ||||
| Email: fl0w@yandex-team.ru</artwork> | <author initials='C' surname='Filsfils' fullname='Clarence Filsfils'> | |||
| </figure> | <organization /> | |||
| </author> | ||||
| <figure> | <author initials='A' surname='Lindem' fullname='Acee Lindem' role="editor"> | |||
| <artwork>Tim Laberge | <organization /> | |||
| Cisco | </author> | |||
| US | ||||
| Email: tlaberge@cisco.com</artwork> | <author initials='A' surname='Sreekantiah' fullname='Arjun Sreekantiah'> | |||
| </figure> | <organization /> | |||
| </author> | ||||
| <figure> | <author initials='H' surname='Gredler' fullname='Hannes Gredler'> | |||
| <artwork>Edet Nkposong | <organization /> | |||
| Salesforce.com Inc. | </author> | |||
| US | ||||
| Email: enkposong@salesforce.com</artwork> | <date month='December' year='2019' /> | |||
| </figure> | ||||
| <figure> | </front> | |||
| <artwork>Mohan Nanduri | ||||
| Microsoft | ||||
| US | ||||
| Email: mnanduri@microsoft.com</artwork> | <seriesInfo name='RFC' value='8669' /> | |||
| </figure> | <seriesInfo name="DOI" value="10.17487/RFC8669"/> | |||
| </reference> | ||||
| <figure> | </references> | |||
| <artwork>James Uttaro | <references> | |||
| ATT | <name>Informative References</name> | |||
| US | ||||
| Email: ju1738@att.com</artwork> | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml3/reference.I-D.ietf | |||
| </figure> | -spring-segment-routing-central-epe.xml"/> | |||
| <figure> | <xi:include href="https://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC | |||
| <artwork>Saikat Ray | .6793.xml"/> | |||
| Unaffiliated | ||||
| US | ||||
| Email: raysaikat@gmail.com</artwork> | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml3/reference.I-D.ietf | |||
| </figure> | -6man-segment-routing-header.xml"/> | |||
| <figure> | <!-- I-D.ietf-6man-segment-routing-header: I-D exists --> | |||
| <artwork>Jon Mitchell | ||||
| Unaffiliated | ||||
| US | ||||
| Email: jrmitche@puck.nether.net</artwork> | </references> | |||
| </figure> | </references> | |||
| <section anchor="Acknowledgements" numbered="false" toc="default"> | ||||
| <name>Acknowledgements</name> | ||||
| <t>The authors would like to thank Benjamin Black, Arjun Sreekantiah, | ||||
| Keyur Patel, Acee Lindem, and Anoop Ghanwani for their comments and | ||||
| review of this document.</t> | ||||
| </section> | </section> | |||
| </middle> | <section anchor="Contributors" numbered="false" toc="default"> | |||
| <name>Contributors</name> | ||||
| <artwork name="" type="" align="left" alt=""><![CDATA[Gaya Nagarajan | ||||
| United States of America | ||||
| <back> | Email: gaya@fb.com]]></artwork> | |||
| <references title="Normative References"> | <artwork name="" type="" align="left" alt=""><![CDATA[Gaurav Dawra | |||
| <?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.211 | Cisco Systems | |||
| 9.xml"?> | United States of America | |||
| <?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.827 | Email: gdawra.ietf@gmail.com]]></artwork> | |||
| 7.xml"?> | <artwork name="" type="" align="left" alt=""><![CDATA[Dmitry Afanasiev | |||
| Yandex | ||||
| Russian Federation | ||||
| <?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.427 | Email: fl0w@yandex-team.ru]]></artwork> | |||
| 1.xml"?> | <artwork name="" type="" align="left" alt=""><![CDATA[Tim Laberge | |||
| Cisco | ||||
| United States of America | ||||
| <?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.793 | Email: tlaberge@cisco.com]]></artwork> | |||
| 8.xml"?> | <artwork name="" type="" align="left" alt=""><![CDATA[Edet Nkposong | |||
| Salesforce.com Inc. | ||||
| United States of America | ||||
| <?rfc include="reference.I-D.ietf-spring-segment-routing.xml"?> | Email: enkposong@salesforce.com]]></artwork> | |||
| <artwork name="" type="" align="left" alt=""><![CDATA[Mohan Nanduri | ||||
| Microsoft | ||||
| United States of America | ||||
| <?rfc include="reference.I-D.ietf-idr-bgp-prefix-sid.xml"?> | Email: mohan.nanduri@oracle.com]]></artwork> | |||
| <artwork name="" type="" align="left" alt=""><![CDATA[James Uttaro | ||||
| ATT | ||||
| United States of America | ||||
| <?rfc include="reference.I-D.ietf-spring-segment-routing-central-epe.xml"? | Email: ju1738@att.com]]></artwork> | |||
| > | <artwork name="" type="" align="left" alt=""><![CDATA[Saikat Ray | |||
| </references> | Unaffiliated | |||
| United States of America | ||||
| <references title="Informative References"> | Email: raysaikat@gmail.com]]></artwork> | |||
| <?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.679 | <artwork name="" type="" align="left" alt=""><![CDATA[Jon Mitchell | |||
| 3.xml"?> | Unaffiliated | |||
| United States of America | ||||
| <?rfc include="reference.I-D.ietf-6man-segment-routing-header.xml"?> | Email: jrmitche@puck.nether.net]]></artwork> | |||
| </references> | </section> | |||
| </back> | </back> | |||
| </rfc> | </rfc> | |||
| End of changes. 195 change blocks. | ||||
| 634 lines changed or deleted | 863 lines changed or added | |||
This html diff was produced by rfcdiff 1.45. The latest version is available from http://tools.ietf.org/tools/rfcdiff/ | ||||