| rfc9722xml2.original.xml | rfc9722.xml | |||
|---|---|---|---|---|
| <?xml version="1.0" encoding="US-ASCII"?> | <?xml version='1.0' encoding='UTF-8'?> | |||
| <!DOCTYPE rfc [ | <!DOCTYPE rfc [ | |||
| <!ENTITY nbsp " "> | <!ENTITY nbsp " "> | |||
| <!ENTITY zwsp "​"> | <!ENTITY zwsp "​"> | |||
| <!ENTITY nbhy "‑"> | <!ENTITY nbhy "‑"> | |||
| <!ENTITY wj "⁠"> | <!ENTITY wj "⁠"> | |||
| ]> | ]> | |||
| <!-- used by XSLT processors --> | <rfc xmlns:xi="http://www.w3.org/2001/XInclude" category="std" docName="draft-ie | |||
| <?xml-stylesheet type='text/xsl' href='http://xml.resource.org/authoring/rfc2629 | tf-bess-evpn-fast-df-recovery-12" number="9722" updates="8584" obsoletes="" cons | |||
| .xslt'?> | ensus="true" submissionType="IETF" ipr="trust200902" tocInclude="true" tocDepth= | |||
| <!-- For a complete list and description of processing instructions (PIs), | "4" symRefs="true" sortRefs="true" version="3" xml:lang="en"> | |||
| please see http://xml.resource.org/authoring/README.html. --> | ||||
| <?rfc strict="yes" ?> | ||||
| <!-- give errors regarding ID-nits and DTD validation --> | ||||
| <!-- control the table of contents (ToC) --> | ||||
| <?rfc toc="yes"?> | ||||
| <!-- generate a ToC --> | ||||
| <?rfc tocdepth="4"?> | ||||
| <!-- the number of levels of subsections in ToC. default: 3 --> | ||||
| <!-- control references --> | ||||
| <?rfc symrefs="yes"?> | ||||
| <!-- use symbolic reference tags, i.e., [RFC2119] instead of [1] --> | ||||
| <?rfc sortrefs="yes" ?> | ||||
| <!-- sort the reference entries alphabetically --> | ||||
| <!-- control vertical white space | ||||
| (using these PIs as follows is recommended by the RFC Editor) --> | ||||
| <?rfc compact="yes" ?> | ||||
| <!-- do not start each main section on a new page --> | ||||
| <?rfc subcompact="no" ?> | ||||
| <!-- keep one blank line between list items --> | ||||
| <!-- end of list of popular I-D processing instructions --> | ||||
| <rfc category="std" | ||||
| xmlns:xi="http://www.w3.org/2001/XInclude" | ||||
| docName="draft-ietf-bess-evpn-fast-df-recovery-12" | ||||
| updates="8584" | ||||
| consensus="true" | ||||
| submissionType="IETF" | ||||
| ipr="trust200902"> | ||||
| <!-- ***** FRONT MATTER ***** --> | ||||
| <front> | <front> | |||
| <!-- The abbreviated title is used in the page header - it is only necessary | <title abbrev="Fast Recovery for EVPN DF Election">Fast Recovery for EVPN Des | |||
| if the | ignated Forwarder Election</title> | |||
| full title is longer than 39 characters --> | <seriesInfo name="RFC" value="9722"/> | |||
| <title abbrev="Fast Recovery for EVPN DF-Election">Fast Recovery for EVPN Des | ||||
| ignated Forwarder Election</title> | ||||
| <!-- add 'role="editor"' below for the editors if appropriate --> | ||||
| <!-- Another author who claims to be an editor --> | ||||
| <author fullname="Patrice Brissette" initials="P." surname="Brissette"> | <author fullname="Patrice Brissette" initials="P." surname="Brissette"> | |||
| <organization>Cisco</organization> | <organization>Cisco</organization> | |||
| <address> | <address> | |||
| <email>pbrisset@cisco.com</email> | <email>pbrisset@cisco.com</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author fullname="Ali Sajassi" initials="A." surname="Sajassi"> | ||||
| <author fullname="Ali Sajassi" initials="A." surname="Sajassi"> | <organization>Cisco</organization> | |||
| <organization>Cisco</organization> | <address> | |||
| <address> | <email>sajassi@cisco.com</email> | |||
| <email>sajassi@cisco.com</email> | </address> | |||
| </address> | </author> | |||
| </author> | <author fullname="Luc André Burdet" initials="LA." surname="Burdet" role="ed | |||
| itor"> | ||||
| <author fullname="Luc Andre Burdet" initials="LA." surname="Burdet" role="edit | <organization>Cisco</organization> | |||
| or"> | <address> | |||
| <organization>Cisco</organization> | <email>lburdet@cisco.com</email> | |||
| <address> | </address> | |||
| <email>lburdet@cisco.com</email> | </author> | |||
| </address> | <author fullname="John Drake" initials="J." surname="Drake"> | |||
| </author> | <organization>Independent</organization> | |||
| <address> | ||||
| <author fullname="John Drake" initials="J." surname="Drake"> | <email>je_drake@yahoo.com</email> | |||
| <organization>Independent</organization> | </address> | |||
| <address> | </author> | |||
| <email>je_drake@yahoo.com</email> | <author fullname="Jorge Rabadan" initials="J." surname="Rabadan"> | |||
| </address> | <organization>Nokia</organization> | |||
| </author> | <address> | |||
| <email>jorge.rabadan@nokia.com</email> | ||||
| <author fullname="Jorge Rabadan" initials="J." surname="Rabadan"> | </address> | |||
| <organization>Nokia</organization> | </author> | |||
| <address> | <date year="2025" month="April"/> | |||
| <email>jorge.rabadan@nokia.com</email> | ||||
| </address> | ||||
| </author> | ||||
| <date year="2024" /> | ||||
| <!-- Meta-data Declarations --> | ||||
| <area>General</area> | ||||
| <workgroup>BESS Working Group</workgroup> | ||||
| <!-- WG name at the upper-left corner of the doc, | ||||
| IETF is fine for individual submissions. | ||||
| If this element is not present, the default is "Network Working Group", | ||||
| which is used by the RFC Editor as a nod to the history of the IETF. --> | ||||
| <keyword>EVPN</keyword> | ||||
| <keyword>Designated Forwarder</keyword> | ||||
| <keyword>Convergence</keyword> | ||||
| <keyword>Recovery</keyword> | ||||
| <abstract> | ||||
| <t>The Ethernet Virtual Private Network (EVPN) solution in RFC 7432 provide | ||||
| s | ||||
| Designated Forwarder (DF) election procedures for multihomed Ethernet Segme | ||||
| nts. These | ||||
| procedures have been enhanced further by applying the Highest | ||||
| Random Weight (HRW) algorithm for Designated Forwarder election | ||||
| to avoid unnecessary DF status changes upon a failure. | ||||
| This document improves these procedures by providing a fast Designated Forw | ||||
| arder | ||||
| election upon recovery of the failed link or node associated | ||||
| with the multihomed Ethernet Segment. | ||||
| This document updates RFC 8584 by optionally introducing delays between | ||||
| some of the events therein.</t> | ||||
| <t>The solution is independent of the number of EVPN Instances (EVIs) assoc | ||||
| iated with that Ethernet | ||||
| Segment and it is performed via simple signaling in BGP between the | ||||
| recovered node and each of the other nodes in the multihoming group.</t> | ||||
| </abstract> | ||||
| </front> | <area>RTG</area> | |||
| <workgroup>bess</workgroup> | ||||
| <middle> | <keyword>EVPN</keyword> | |||
| <section anchor="intro" title="Introduction"> | <keyword>Designated Forwarder</keyword> | |||
| <t>The Ethernet Virtual Private Network (EVPN) solution <xref target="RFC74 | <keyword>Convergence</keyword> | |||
| 32"/> is | <keyword>Recovery</keyword> | |||
| <abstract> | ||||
| <t>The Ethernet Virtual Private Network (EVPN) solution in RFC 7432 | ||||
| provides Designated Forwarder (DF) election procedures for multihomed | ||||
| Ethernet Segments. These procedures have been enhanced further by | ||||
| applying the Highest Random Weight (HRW) algorithm for DF election to | ||||
| avoid unnecessary DF status changes upon a failure. This document | ||||
| improves these procedures by providing a fast DF election upon recovery | ||||
| of the failed link or node associated with the multihomed Ethernet | ||||
| Segment. This document updates RFC 8584 by optionally introducing | ||||
| delays between some of the events therein.</t> | ||||
| <t>The solution is independent of the number of EVPN Instances (EVIs) | ||||
| associated with that Ethernet Segment, and it is performed via simple | ||||
| signaling in BGP between the recovered node and each of the other nodes | ||||
| in the multihoming group.</t> | ||||
| </abstract> | ||||
| </front> | ||||
| <middle> | ||||
| <section anchor="intro"> | ||||
| <name>Introduction</name> | ||||
| <t>The Ethernet Virtual Private Network (EVPN) solution <xref target="RFC7 | ||||
| 432"/> is | ||||
| widely used in data center (DC) applications for Network | widely used in data center (DC) applications for Network | |||
| Virtualization Overlay (NVO) and DC interconnect (DCI) services, and | Virtualization Overlay (NVO) and Data Center Interconnect (DCI) services an | |||
| in service provider (SP) applications for next generation virtual | d | |||
| in service provider (SP) applications for next-generation virtual | ||||
| private LAN services.</t> | private LAN services.</t> | |||
| <t><xref target="RFC7432"/> describes Designated Forwarder (DF) election p | ||||
| <t><xref target="RFC7432"/> describes Designated Forwarder (DF) election pr | rocedures for | |||
| ocedures for | ||||
| multihomed Ethernet Segments. These procedures are enhanced further in | multihomed Ethernet Segments. These procedures are enhanced further in | |||
| <xref target="RFC8584"/> by applying the Highest Random Weight algorithm fo r DF | <xref target="RFC8584"/> by applying the Highest Random Weight (HRW) algori thm for DF | |||
| election in order to avoid unnecessary DF status changes upon a link | election in order to avoid unnecessary DF status changes upon a link | |||
| or node failure associated with the multihomed Ethernet Segment.</t> | or node failure associated with the multihomed Ethernet Segment.</t> | |||
| <t>This document makes further improvements to the DF election procedures i n | <t>This document makes further improvements to the DF election procedures in | |||
| <xref target="RFC8584"/> by providing an option for a fast DF election upon | <xref target="RFC8584"/> by providing an option for a fast DF election upon | |||
| recovery of the failed link or node associated with the multihomed | recovery of the failed link or node associated with the multihomed | |||
| Ethernet Segment. This DF election is achieved independent of the number | Ethernet Segment. This DF election is achieved independent of the number | |||
| of EVPN Instances (EVIs) associated with that Ethernet Segment and it is pe rformed via | of EVPN Instances (EVIs) associated with that Ethernet Segment, and it is p erformed via | |||
| straightforward signaling in BGP between the recovered node and each of the other nodes | straightforward signaling in BGP between the recovered node and each of the other nodes | |||
| in the multihomed Ethernet Segment redundancy group.<br/> | in the multihomed Ethernet Segment redundancy group.</t> | |||
| This document updates the DF Election Finite State Machine (FSM) described | <t>This document updates the DF Election Finite State Machine (FSM) | |||
| in <relref target="RFC8584" section="2.1"/>, | described in <xref target="RFC8584" section="2.1"/> by optionally | |||
| by optionally introducing delays between some events, as further detailed i | introducing delays between some events, as further detailed in <xref | |||
| n <xref target="fsm_8584"/>. | target="fsm_8584"/>. The solution is based on a simple one-way signaling | |||
| The solution is based on a simple one-way signaling mechanism.</t> | mechanism.</t> | |||
| <section> | ||||
| <section title="Requirements Language"> | <name>Requirements Language</name> | |||
| <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", | ||||
| "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and | ||||
| "OPTIONAL" in this document are to be interpreted as described in | ||||
| BCP 14 <xref target="RFC2119"/> <xref target="RFC8174"/> when, and only when, | ||||
| they appear in all | ||||
| capitals, as shown here.</t> | ||||
| </section> | ||||
| <section anchor="terminology" title="Terminology"> | ||||
| <t> | <t> | |||
| <dl> | The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>", "<bcp14>REQU | |||
| <dt>PE:</dt><dd>Provider Edge device.</dd> | IRED</bcp14>", "<bcp14>SHALL</bcp14>", "<bcp14>SHALL | |||
| <dt>Designated Forwarder (DF):</dt><dd>A PE that is currently forward | NOT</bcp14>", "<bcp14>SHOULD</bcp14>", "<bcp14>SHOULD NOT</bcp14>", "<bcp14> | |||
| ing | RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>", | |||
| "<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document are to | ||||
| be interpreted as | ||||
| described in BCP 14 <xref target="RFC2119"/> <xref target="RFC8174"/> | ||||
| when, and only when, they appear in all capitals, as shown here. | ||||
| </t> | ||||
| </section> | ||||
| <section anchor="terminology"> | ||||
| <name>Terminology</name> | ||||
| <dl> | ||||
| <dt>PE:</dt> | ||||
| <dd>Provider Edge</dd> | ||||
| <dt>DF:</dt> | ||||
| <dd>Designated Forwarder. A PE that is currently forwarding | ||||
| (encapsulating/decapsulating) traffic for a given VLAN in and out of | (encapsulating/decapsulating) traffic for a given VLAN in and out of | |||
| a site.</dd> | a site.</dd> | |||
| <dt>NDF:</dt><dd>Non-Designated Forwarder, a PE that is currently blo | <dt>NDF:</dt> | |||
| cking traffic (see | <dd>Non-Designated Forwarder. A PE that is currently blocking traffic | |||
| (see | ||||
| DF above).</dd> | DF above).</dd> | |||
| <dt>EVI:</dt><dd>An EVPN instance spanning the Provider Edge (PE) dev | <dt>EVI:</dt> | |||
| ices | <dd>EVPN Instance. It spans the PE devices participating in that | |||
| participating in that EVPN.</dd> | EVPN.</dd> | |||
| <dt>HRW:</dt><dd>Highest Random Weight algorithm, <xref target="HRW98 | <dt>HRW:</dt> | |||
| "/> </dd> | <dd>Highest Random Weight algorithm <xref target="HRW98"/></dd> | |||
| <dt>Service carving:</dt><dd>DF Election is also referred to as "serv | <dt>Service carving:</dt> | |||
| ice carving" in <xref | <dd>This refers to DF election, as defined in <xref target="RFC7432"/> | |||
| target="RFC7432"/></dd> | .</dd> | |||
| <dt>SCT:</dt><dd>Service Carving Time, defined in this document, the | <dt>SCT:</dt> | |||
| time at | <dd>Service Carving Time. Defined in this document as the time at | |||
| which all nodes participating in an Ethernet Segment perform DF Elect ion.</dd> | which all nodes participating in an Ethernet Segment perform DF Elect ion.</dd> | |||
| </dl> | </dl> | |||
| </t> | </section> | |||
| </section> | <section anchor="challenges"> | |||
| <name>Challenges with Existing Mechanism</name> | ||||
| <section anchor="challenges" title="Challenges with Existing Mechanism"> | <t>In EVPN technology, multiple PE devices encapsulate | |||
| <t>In EVPN technology, multiple Provider Edge (PE) devices encapsulate | ||||
| and decapsulate data belonging to the same VLAN. Under certain condition s, this | and decapsulate data belonging to the same VLAN. Under certain condition s, this | |||
| may cause duplicated Ethernet packets and potential loops if there is a momentary | may cause duplicated Ethernet packets and potential loops if there is a momentary | |||
| overlap in forwarding roles between two or more PE devices, potentially also leading | overlap in forwarding roles between two or more PE devices, potentially also leading | |||
| to broadcast storms of frames forwarded back into the VLAN.</t> | to broadcast storms of frames forwarded back into the VLAN.</t> | |||
| <t>EVPN <xref target="RFC7432"/> currently specifies timer-based synchro nization among PE | <t>EVPN <xref target="RFC7432"/> currently specifies timer-based synchro nization among PE | |||
| devices within an Ethernet Segment redundancy group. This approach can l ead to duplications and potential | devices within an Ethernet Segment redundancy group. This approach can l ead to duplications and potential | |||
| loops due to multiple Designated Forwarders (DFs) if the timer interval | loops due to multiple DFs if the timer interval is too short | |||
| is too short, | or can lead to packet drops if the timer interval is too long.</t> | |||
| or to packet drops if the timer interval is too long.</t> | <t>Split-horizon filtering, as described in <xref target="RFC7432" secti | |||
| on="8.3"/>, | ||||
| <t>Split-horizon filtering, as described in <relref target="RFC7432" sec | ||||
| tion="8.3"/>, | ||||
| can prevent loops but does not address duplicates. | can prevent loops but does not address duplicates. | |||
| However, if there are overlapping Designated Forwarders of two | However, if there are overlapping DFs of two | |||
| different sites simultaneously for the same VLAN, the site identifier wi ll differ when the | different sites simultaneously for the same VLAN, the site identifier wi ll differ when the | |||
| packet re-enters the Ethernet Segment. Consequently, the split-horizon c heck will fail, | packet re-enters the Ethernet Segment. Consequently, the split-horizon c heck will fail, | |||
| resulting in layer-2 loops.</t> | resulting in Layer 2 loops.</t> | |||
| <t>The updated DF procedures outlined in <xref target="RFC8584"/> | <t>The updated DF procedures outlined in <xref target="RFC8584"/> | |||
| use the well-known | use the well-known | |||
| Highest Random Weight (HRW) algorithm to prevent the reshuffling of VLANs among | HRW algorithm to prevent the reshuffling of VLANs among | |||
| PE devices within the Ethernet Segment redundancy group during failure o r recovery events. This | PE devices within the Ethernet Segment redundancy group during failure o r recovery events. This | |||
| approach minimizes the impact on VLANs not assigned to the failed or rec overed ports | approach minimizes the impact on VLANs not assigned to the failed or rec overed ports | |||
| and eliminates the occurrence of loops or duplicates during such events. </t> | and eliminates the occurrence of loops or duplicates during such events. </t> | |||
| <t>However, upon PE insertion or a port being newly added to a multihome d Ethernet Segment, | <t>However, upon PE insertion or a port being newly added to a multihome d Ethernet Segment, | |||
| HRW cannot help either as a transfer of DF role to the new port must occ ur | the HRW cannot help either, as a transfer of the DF role to the new port must occur | |||
| while the old DF is still active.</t> | while the old DF is still active.</t> | |||
| <figure anchor="topology"> | ||||
| <figure anchor="topology" title="CE1 multihomed to PE1 and PE2."> | <name>CE1 Multihomed to PE1 and PE2</name> | |||
| <artwork><![CDATA[ | <artwork><![CDATA[ | |||
| +---------+ | +---------+ | |||
| +-------------+ | | | +-------------+ | | | |||
| | | | | | | | | | | |||
| / | PE1 |----| | +-------------+ | / | PE1 |----| | +-------------+ | |||
| / | | | MPLS/ | | |---CE3 | / | | | MPLS/ | | |---CE3 | |||
| / +-------------+ | VxLAN/ | | PE3 | | / +-------------+ | VxLAN/ | | PE3 | | |||
| CE1 - | Cloud | | | | CE1 - | Cloud | | | | |||
| \ +-------------+ | |---| | | \ +-------------+ | |---| | | |||
| \ | | | | +-------------+ | \ | | | | +-------------+ | |||
| \ | PE2 |----| | | \ | PE2 |----| | | |||
| | | | | | | | | | | |||
| +-------------+ | | | +-------------+ | | | |||
| +---------+ | +---------+]]></artwork> | |||
| ]]> | </figure> | |||
| </artwork></figure> | ||||
| <t>In <xref target="topology"/>, when PE2 is inserted in the Ethernet Se gment or its | <t>In <xref target="topology"/>, when PE2 is inserted in the Ethernet Se gment or its | |||
| CE1-facing interface recovered, PE1 will transfer | CE1-facing interface is recovered, PE1 will transfer | |||
| the DF role of some VLANs to PE2 to achieve load balancing. However, | the DF role of some VLANs to PE2 to achieve load-balancing. However, | |||
| because there is no handshake mechanism between PE1 and PE2, | because there is no handshake mechanism between PE1 and PE2, | |||
| overlapping of DF roles for a given VLAN is possible which leads to dupl | overlapping of DF roles for a given VLAN is possible, which leads to dup | |||
| ication of | lication of | |||
| traffic as well as layer-2 loops.</t> | traffic as well as Layer 2 loops.</t> | |||
| <t>Current EVPN specifications <xref target="RFC7432"/> and <xref target ="RFC8584"/> | <t>Current EVPN specifications <xref target="RFC7432"/> and <xref target ="RFC8584"/> | |||
| rely on a timer-based approach for transferring the DF role to the newly inserted device. | rely on a timer-based approach for transferring the DF role to the newly inserted device. | |||
| This can cause the following issues: | This can cause the following issues:</t> | |||
| <ul> | <ul> | |||
| <li>Loops/Duplicates if the timer value is too short</li> | <li>Loops and duplicates, if the timer value is too short</li> | |||
| <li>Prolonged Traffic Blackholing if the timer value is too long</li | <li>Prolonged traffic loss, if the timer value is too long</li> | |||
| > | ||||
| </ul> | </ul> | |||
| </t> | </section> | |||
| </section> | <section anchor="advantages"> | |||
| <name>Design Principles for a Solution</name> | ||||
| <section anchor="advantages" title="Design Principles for a Solution"> | ||||
| <t>The clock-synchronization solution for fast DF recovery presented in this document | <t>The clock-synchronization solution for fast DF recovery presented in this document | |||
| follows several design principles and offers | follows several design principles and offers | |||
| multiple advantages, namely: | multiple advantages, namely: | |||
| </t> | ||||
| <ul> | <ul> | |||
| <li>Complex handshake signaling mechanisms and state machines are | <li>Complex handshake signaling mechanisms and state machines are | |||
| avoided in favor of a simple uni-directional signaling approach.</li | avoided in favor of a simple unidirectional signaling approach.</li> | |||
| > | <li>The fast DF recovery solution maintains backwards compatibility (s | |||
| <li>The fast DF recovery solution maintains backwards compatibility (s | ee <xref target="ntpcompat"/>) by ensuring that PEs reject any unrecognized new | |||
| ee <xref | BGP EVPN Extended Community.</li> | |||
| target="ntpcompat"/>) by ensuring that PEs reject any unrecognized new | ||||
| BGP EVPN Extended Community.</li> | ||||
| <li>Existing DF Election algorithms remain supported.</li> | <li>Existing DF Election algorithms remain supported.</li> | |||
| <li>The fast DF recovery solution is independent of any BGP delays in propagation of Ethernet Segment | <li>The fast DF recovery solution is independent of any BGP delays in propagation of Ethernet Segment | |||
| routes (Route Type 4).</li> | routes (Route Type 4).</li> | |||
| <li>The fast DF recovery solution is agnostic of the actual time synch ronization mechanism | <li>The fast DF recovery solution is agnostic of the actual time synch ronization mechanism | |||
| used; however, an NTP-based representation of time is used for EVPN si gnaling.</li> | used; however, an NTP-based representation of time is used for EVPN si gnaling.</li> | |||
| </ul> | </ul> | |||
| </t> | ||||
| <t>The solution in this document relies on nodes in the topology, more s pecifically | <t>The solution in this document relies on nodes in the topology, more s pecifically | |||
| the peering nodes of each Ethernet Segment, to be clock-synchronized and advertise Time | the peering nodes of each Ethernet Segment, to be clock-synchronized and to advertise the Time | |||
| Synchronization capability. | Synchronization capability. | |||
| When this is not the case, or clocks are badly desynchronized, network c | When this is not the case, or when clocks are badly desynchronized, netw | |||
| onvergence and DF | ork convergence and DF | |||
| Election are no worse than <xref target="RFC7432"/> due to the timestamp | Election are no worse than that described in <xref target="RFC7432"/> due | |||
| range checking (<xref | to the timestamp range checking (<xref target="timestamp_verification"/>). | |||
| target="timestamp_verification"/>). | ||||
| </t> | </t> | |||
| </section> | </section> | |||
| </section> | ||||
| </section> | <section anchor="sync"> | |||
| <name>DF Election Synchronization Solution</name> | ||||
| <section anchor="sync" title="DF Election Synchronization Solution"> | ||||
| <t>The fast DF recovery solution relies on the concept of common clock ali gnment between partner PEs participating | <t>The fast DF recovery solution relies on the concept of common clock ali gnment between partner PEs participating | |||
| in a common Ethernet Segment, i.e., PE1 and PE2 in <xref target="topology" />. The main idea is to have all peering PEs of that | in a common Ethernet Segment, i.e., PE1 and PE2 in <xref target="topology" />. The main idea is to have all peering PEs of that | |||
| Ethernet Segment perform DF election and apply the result at the same prev | Ethernet Segment perform DF election and apply the result at the same prev | |||
| iously-announced time. </t> | iously announced time. </t> | |||
| <t>The DF Election procedure, as described in <xref target="RFC7432"/> and as optionally | <t>The DF Election procedure, as described in <xref target="RFC7432"/> and as optionally | |||
| signaled in <xref target="RFC8584"/>, is applied. | signaled in <xref target="RFC8584"/>, is applied. | |||
| All PEs attached to a given Ethernet Segment are clock-synchronized | All PEs attached to a given Ethernet Segment are clock-synchronized | |||
| using a networking protocol for clock synchronization (e.g., NTP, PTP). | using a networking protocol for clock synchronization (e.g., NTP, Precisio | |||
| Whenever possible, recovery activities for failed PEs SHOULD NOT be initia | n Time Protocol (PTP)). | |||
| ted until after the | Whenever possible, recovery activities for failed PEs <bcp14>SHOULD NOT</b | |||
| cp14> be initiated until after the | ||||
| underlying clock synchronization protocol has converged to benefit from th is document's fast DF recovery | underlying clock synchronization protocol has converged to benefit from th is document's fast DF recovery | |||
| procedures. | procedures. | |||
| When a new PE is inserted in an Ethernet Segment or a failed PE of the Eth ernet | When a new PE is inserted in an Ethernet Segment or when a failed PE of th e Ethernet | |||
| Segment recovers, that PE communicates to peering partners the current tim e plus the value of | Segment recovers, that PE communicates to peering partners the current tim e plus the value of | |||
| the timer for partner discovery from step 2 in <relref target="RFC7432" se ction="8.5"/>. | the timer for partner discovery from step 2 in <xref target="RFC7432" sect ion="8.5"/>. | |||
| This constitutes an "end time" or "absolute time" as seen from the local P E. | This constitutes an "end time" or "absolute time" as seen from the local P E. | |||
| That absolute time is called the "Service Carving Time" (SCT).</t> | That absolute time is called the Service Carving Time (SCT).</t> | |||
| <t>A new BGP EVPN Extended Community, the Service Carving Time, is adverti | ||||
| <t>A new BGP EVPN Extended Community, the Service Carving Time is advertis | sed along with | |||
| ed along with | the Ethernet Segment Route Type 4 (RT-4) and communicates the SCT to other | |||
| the Ethernet Segment Route Type 4 (RT-4) and communicates the Service Carv | ||||
| ing Time to other | ||||
| partners to ensure an orderly transfer of forwarding duties.</t> | partners to ensure an orderly transfer of forwarding duties.</t> | |||
| <t>Upon receipt of the new BGP EVPN Extended Community, partner PEs can de | ||||
| <t>Upon receipt of the new BGP EVPN Extended Community, partner PEs can de | termine the SCT | |||
| termine the service carving time | ||||
| of the newly inserted PE. To eliminate any potential for duplicate traffic or loops, the | of the newly inserted PE. To eliminate any potential for duplicate traffic or loops, the | |||
| concept of skew is introduced: a small time offset to ensure a controlled | concept of "skew" is introduced: a small time offset to ensure a controlle | |||
| and orderly | d and orderly | |||
| transition when multiple Provider Edge (PE) devices are involved. | transition when multiple PE devices are involved. | |||
| The previously inserted PE(s) must perform service carving first for DF to NDF transitions. | The previously inserted PE(s) must perform service carving first for DF to NDF transitions. | |||
| The receiving PEs subtract this skew (default = 10ms) from the Service Carving Time and apply DF | The receiving PEs subtract this skew (default = 10 ms) from the Service Carving Time and apply DF | |||
| to NDF transitions first. This is followed shortly by the NDF to DF transitions on both PEs, after the skew delay. | to NDF transitions first. This is followed shortly by the NDF to DF transitions on both PEs, after the skew delay. | |||
| On the recovering PE, all services are already in NDF state and no | On the recovering PE, all services are already in NDF state, and no | |||
| skew for DF to NDF transitions is required.<br/> | skew for DF to NDF transitions is required.</t> | |||
| This document proposes a default skew value of 10ms to allow completion of | <t>This document proposes a default skew value of 10 ms to allow completio | |||
| programming the DF | n of programming the DF | |||
| to NDF transitions, but implementations may make the skew larger (or confi gurable) taking | to NDF transitions, but implementations may make the skew larger (or confi gurable) taking | |||
| into consideration scale, hardware capabilities and clock accuracy.</t> | into consideration scale, hardware capabilities, and clock accuracy.</t> | |||
| <t>To summarize, all peering PEs perform service carving almost | ||||
| <t>To summarize, all peering PEs perform service carving almost simultaneo | simultaneously at the time announced by the newly added/recovered | |||
| usly at the time | PE. The newly inserted PE initiates the SCT and triggers service carving | |||
| announced by the newly added/recovered PE. The newly inserted PE initiates | immediately on its local timer expiry. The previously inserted PE(s) | |||
| the SCT, | receiving Ethernet Segment route (RT-4) with an SCT BGP extended | |||
| and triggers service carving immediately on its local timer expiry. The pr | community perform service carving shortly before the SCT for DF to NDF | |||
| eviously inserted PE(s) receiving Ethernet Segment route (RT-4) with an SCT BGP | transitions and at the SCT for NDF to DF transitions.</t> | |||
| extended community, | <section anchor="ntpencoding"> | |||
| perform service carving shortly before Service Carving Time for DF to NDF | <name>BGP Encoding</name> | |||
| transitions, and at | ||||
| Service Carving Time for NDF to DF transitions.</t> | ||||
| <section anchor="ntpencoding" title="BGP Encoding"> | ||||
| <t>A BGP extended community, with Type 0x06 and Sub-Type 0x0F, is define d to communicate the | <t>A BGP extended community, with Type 0x06 and Sub-Type 0x0F, is define d to communicate the | |||
| Service Carving Time for each Ethernet Segment: | SCT for each Ethernet Segment:</t> | |||
| <figure> | ||||
| <figure title="Service Carving Time"><artwork><![CDATA[ | <name>Service Carving Time</name> | |||
| <artwork><![CDATA[ | ||||
| 1 2 3 | 1 2 3 | |||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| | Type = 0x06 | Sub-Type(0x0F)| Timestamp Seconds ~ | | Type = 0x06 | Sub-Type(0x0F)| Timestamp Seconds ~ | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| ~ Timestamp Seconds | Timestamp Fractional Seconds | | ~ Timestamp Seconds | Timestamp Fraction | | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+]]></artwork> | |||
| ]]> | </figure> | |||
| </artwork></figure> | ||||
| </t> | ||||
| <t> | ||||
| The timestamp exchanged uses the NTP prime epoch of January 1, 1900 <xre | ||||
| f target="RFC5905"/> | ||||
| and an adapted form of the 64-bit NTP Timestamp Format.<br/> | ||||
| The 64-bit NTP Timestamp Format consists of a 32-bit part for Seconds an | ||||
| d a 32-bit | ||||
| part for Fraction, which are encoded in the Service Carving Time as foll | ||||
| ows: | ||||
| <ul> | ||||
| <li>Timestamp Seconds: 32-bit NTP seconds are encoded in this field.</li | ||||
| > | ||||
| <li>Timestamp Fractional Seconds: the high order 16 bits of the NTP 'Fra | ||||
| ction' field are encoded in this | ||||
| field.</li> | ||||
| </ul> | ||||
| </t> | ||||
| <t>When rebuilding a 64-bit NTP Timestamp Format using the values from a | ||||
| received SCT BGP extended community, the lower order 16 bits of the | ||||
| Fractional field are set to 0. The use of a 16-bit fractional seconds va | ||||
| lue yields adequate precision of 15 microseconds | ||||
| (2^-16 s).</t> | ||||
| <t>This document introduces a new flag called Time | ||||
| Synchronization indicated by "T" in the DF Election Capabilities registr | ||||
| y defined in <xref | ||||
| target="RFC8584"/> for use in DF Election Extended Community. | ||||
| <figure title="DF Election Extended Community"><artwork><![CDATA[ | <t>The timestamp exchanged uses the NTP prime epoch of 0 h 1 January | |||
| 1900 UTC <xref target="RFC5905"/> and an adapted form of the 64-bit NTP times | ||||
| tamp format.</t> | ||||
| <t>The 64-bit NTP timestamp format consists of a 32-bit unsigned seconds | ||||
| field and a 32-bit fraction field, which are encoded in the | ||||
| Service Carving Time as follows:</t> | ||||
| <dl spacing="normal" newline="false"> | ||||
| <dt>Timestamp Seconds:</dt><dd>32-bit NTP seconds are encoded in this | ||||
| field.</dd> | ||||
| <dt>Timestamp Fraction:</dt><dd>The high-order 16 bits of | ||||
| the NTP "Fraction" field are encoded in this field.</dd> | ||||
| </dl> | ||||
| <t>When rebuilding a 64-bit NTP timestamp format using the values from a | ||||
| received SCT BGP extended community, the lower-order 16 bits of the | ||||
| NTP "Fraction" field are set to 0. The use of a 16-bit fractional second | ||||
| s value yields adequate precision of 15 microseconds | ||||
| (2<sup>-16</sup> s).</t> | ||||
| <t>The format of the DF Election Extended Community that is used in this | ||||
| document is:</t> | ||||
| <figure> | ||||
| <name>DF Election Extended Community (RFC 8584)</name> | ||||
| <artwork><![CDATA[ | ||||
| 1 2 3 | 1 2 3 | |||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| | Type = 0x06 | Sub-Type(0x06)| RSV | DF Alg | Bitmap ~ | | Type = 0x06 | Sub-Type(0x06)| RSV | DF Alg | Bitmap ~ | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| ~ Bitmap | Reserved | | ~ Bitmap | Reserved | | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+]]></artwork> | |||
| </figure> | ||||
| Figure 4: DF Election Extended Community | <t>The Bitmap field (2 octets) encodes "capabilities" <xref target="RFC8 | |||
| ]]> | 584"/>, where this | |||
| </artwork></figure> | document introduces a new Time Synchronization capability indicated by " | |||
| T".</t> | ||||
| <figure title="DF Election Capabilities"><artwork><![CDATA[ | <figure> | |||
| <name>Bitmap Field in the DF Election Extended Community</name> | ||||
| <artwork><![CDATA[ | ||||
| 1 1 | 1 1 | |||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| | |A| |T| | | | |A| |T| | | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+]]></artwork> | |||
| </figure> | ||||
| Figure 5: DF Election Capabilities | ||||
| ]]> | ||||
| </artwork></figure> | ||||
| </t> | ||||
| <t> | ||||
| <ul> | ||||
| <li>Bit 3: Time Synchronization (corresponds to Bit 27 of the DF Electio | ||||
| n Extended | ||||
| Community). When set | ||||
| to 1, it indicates the desire to use Time Synchronization capability | ||||
| with the rest of the PEs in the Ethernet Segment.</li> | ||||
| </ul> | ||||
| </t> | ||||
| <dl spacing="normal" newline="false"> | ||||
| <dt>Bit 3:</dt><dd>Time Synchronization (corresponds to Bit 27 of | ||||
| the DF Election Extended Community). When set to 1, it indicates the | ||||
| desire to use the Time Synchronization capability with the rest of the | ||||
| PEs in the Ethernet Segment.</dd> | ||||
| </dl> | ||||
| <t> | <t> | |||
| This capability is utilized in conjunction with the agreed-upon DF Election Type. | This capability is utilized in conjunction with the agreed-upon DF Election Type. | |||
| For instance, if all the PE devices in the Ethernet Segment indicate the desire to use the | For instance, if all the PE devices in the Ethernet Segment indicate the desire to use the | |||
| Time Synchronization capability and request the DF Election Type to be H | Time Synchronization capability and request the DF Election Type to be t | |||
| ighest Random Weight (HRW), | he HRW, | |||
| then the HRW algorithm is used in conjunction with this capability. A PE | then the HRW algorithm is used in conjunction with this capability. A PE | |||
| which does not | that does not | |||
| support the procedures set out in this document, or receives a route fro | support the procedures set out in this document or that receives a route | |||
| m another PE in | from another PE in | |||
| which the capability is not set, MUST NOT delay Designated Forwarder ele | which the capability is not set <bcp14>MUST NOT</bcp14> delay DF electio | |||
| ction as this could | n as this could | |||
| lead to duplicate traffic in some instances (overlapping Designated Forw | lead to duplicate traffic in some instances (overlapping DFs).</t> | |||
| arders).</t> | ||||
| </section> | </section> | |||
| <section anchor="timestamp_verification"> | ||||
| <section anchor="timestamp_verification" title="Timestamp Verification"> | <name>Timestamp Verification</name> | |||
| <t>The NTP Era value is not exchanged and participating PEs may consider | <t>The NTP Era value is not exchanged, and participating PEs may conside | |||
| the timestamps to be in the same Era as their local value. | r the timestamps to be in the same Era as their local value. | |||
| A DF Election operation occurring exactly at the next Era transition wil | A DF Election operation occurring exactly at the next Era transition wil | |||
| l be sometime on | l be some time on | |||
| February 7, 2036. Implementors and operators may address credi | February 7, 2036. Implementors and operators may address credible | |||
| ble | cases of rollover ambiguity (adjacent Eras n and n+1) as well as the sec | |||
| cases of rollover ambiguity (adjacent Eras n and n+1), as well as the se | urity issue of unreasonably | |||
| curity issue of unreasonably | large or unreasonably small NTP timestamps in the following manner.</t> | |||
| large or unreasonably small NTP timestamps, in the following manner.</t> | <t>The procedures in this document address implicitly what occurs with r | |||
| eceiving an SCT value | ||||
| <t>The procedures in this document address implicitly what occurs with r | ||||
| eceiving a SCT value | ||||
| in the past. This would be a naturally occurring event with a large BGP propagation delay: | in the past. This would be a naturally occurring event with a large BGP propagation delay: | |||
| the receiving PE treats | the receiving PE treats | |||
| the DF Election at the peer as having occurred already and proceeds with | the DF Election at the peer as having already occurred and proceeds with | |||
| out starting any | out starting any | |||
| timer to further delay service carving, effectively falling back on <xre | timer to further delay service carving, effectively falling back on beha | |||
| f target="RFC7432"/> behavior. | vior as specified in <xref target="RFC7432"/>. | |||
| A PE which receives a SCT value smaller than its current time, MUST disc | A PE that receives an SCT value smaller than its current time <bcp14>MUS | |||
| ard the Service Carving Time and SHALL treat the DF Election at | T</bcp14> discard the Service Carving Time and <bcp14>SHALL</bcp14> treat the DF | |||
| Election at | ||||
| the peer as having occurred already.</t> | the peer as having occurred already.</t> | |||
| <t>The more problematic scenario is the PE in Era n+1 that receives an S | ||||
| <t>The more problematic scenario is the PE in Era n+1 which receives a S | CT advertised by | |||
| ervice Carving Time advertised by | ||||
| the PE still in Era n, with a very large SCT value. To address this Era rollover as well as | the PE still in Era n, with a very large SCT value. To address this Era rollover as well as | |||
| the large values attack vector, implementations MUST validate the receiv | the large values attack vector, implementations <bcp14>MUST</bcp14> vali | |||
| ed SCT against | date the received SCT against | |||
| an upper-bound.<br/> | an upper bound.</t> | |||
| It is left to implementations to decide what constitutes an "unreasonabl | <t>It is left to implementations to decide what constitutes an "unreason | |||
| y large" SCT value. | ably large" SCT value. | |||
| A recommended approach, however, is to compare the received offset to the local peering timer value. | A recommended approach, however, is to compare the received offset to the local peering timer value. | |||
| In practice, peering timer values are configured uniformly across Ethern | In practice, peering timer values are configured uniformly across Ethern | |||
| et-Segment peers and | et Segment peers and | |||
| may be treated as an upper-bound on the offset of received SCT values. | may be treated as an upper bound on the offset of received SCT values. | |||
| A PE which receives an SCT representing an offset larger than the local | A PE that receives an SCT representing an offset larger than the local p | |||
| peering timer MUST | eering timer <bcp14>MUST</bcp14> | |||
| discard the Service Carving Time and SHALL treat | discard the SCT and <bcp14>SHALL</bcp14> treat | |||
| the DF Election at the peer as having occurred already, as above.</t> | the DF Election at the peer as having already occurred, as above.</t> | |||
| </section> | </section> | |||
| <section anchor="fsm_8584"> | ||||
| <section anchor="fsm_8584" title="Updates to RFC8584"> | <name>Updates to RFC 8584</name> | |||
| <t>This document introduces an additional delay to the events and | <t>This document introduces an additional delay to the events and | |||
| transitions defined for the default DF election algorithm FSM in | transitions defined for the default DF election algorithm FSM in | |||
| <relref target="RFC8584" section="2.1"/> without changing the FSM state or event definitions | <xref target="RFC8584" section="2.1"/> without changing the FSM state or event definitions | |||
| themselves.</t> | themselves.</t> | |||
| <t>Upon receiving an RCVD_ES message, the peering PE's FSM transitions | ||||
| <t>Upon receiving a RCVD_ES message, the peering PE's Finite State Machi | from the DF_DONE state (indicating the DF election process was complete) | |||
| ne (FSM) transitions | to the DF_CALC state | |||
| from the DF_DONE (indicating the DF election process was complete) state | (indicating that a new DF calculation is needed). Due to the | |||
| to the DF_CALC | SCT included in the Ethernet Segment update, the completion of the DF_CA | |||
| (indicating that a new DF calculation is needed) state. Due to the Servi | LC state and the | |||
| ce Carving Time | ||||
| (SCT) included in the Ethernet-Segment update, the completion of the DF_ | ||||
| CALC state and the | ||||
| subsequent transition back to the DF_DONE state are delayed. This delay ensures proper | subsequent transition back to the DF_DONE state are delayed. This delay ensures proper | |||
| synchronization and prevents conflicts. Consequently, the accompanying forwarding updates to | synchronization and prevents conflicts. Consequently, the accompanying forwarding updates to | |||
| the Designated Forwarder (DF) and Non-Designated Forwarder (NDF) states | the DF and NDF states are also deferred.</t> | |||
| are also deferred.</t> | ||||
| <t>Item 9. in <relref target="RFC8584" section="2.1"/>, the list "Corres | ||||
| ponding actions when transitions | ||||
| are performed or states are entered/exited" is changed as follows:</t> | ||||
| <ol start="9"> | ||||
| <li>DF_CALC on CALCULATED: Mark the election result for the VLAN or | ||||
| VLAN Bundle. | ||||
| <ol type="9.%d"> | ||||
| <li>If an SCT timestamp is present during the RCVD_ES event of Action 11 | ||||
| , wait until the | ||||
| time indicated by the SCT minus skew before proceeding to step 9.3.</li> | ||||
| <li>If an SCT timestamp is present during the RCVD_ES event of Action 11 | ||||
| , wait until the | ||||
| time indicated by the SCT before proceeding to step 9.4.</li> | ||||
| <li>Assume the role of NDF for the local PE concerning the VLAN or VLAN | ||||
| Bundle, and transition to the DF_DONE state.</li> | ||||
| <li>Assume the role of DF for the local PE concerning the VLAN or VLAN B | ||||
| undle, and transition to the DF_DONE state.</li> | ||||
| </ol> | ||||
| </li> | ||||
| </ol> | ||||
| <t>This revised approach ensures proper timing and synchronization in th | <t>Item 9 in <xref target="RFC8584" section="2.1"/>, in the list "Corres | |||
| e DF election | ponding actions when transitions | |||
| process, avoiding conflicts and ensuring accurate forwarding updates.</t | are performed or states are entered/exited", is changed as follows:</t> | |||
| > | ||||
| </section> | ||||
| </section> | <blockquote> | |||
| <ol start="9" spacing="normal"> | ||||
| <li><t>DF_CALC on CALCULATED: Mark the election result for the VLAN | ||||
| or VLAN bundle.</t> | ||||
| <section anchor="example" title="Synchronization Scenarios"> | <ol type="9.%d" spacing="normal"> | |||
| <li>If no Service Carving Time is present during the RCVD_ES event o | ||||
| f Action 11, | ||||
| | proceed to step 9.4.</li> | |||
| <t>Consider <xref target="topology"/> as an example, where initially PE2 | <li>If a Service Carving Time is present during the RCVD_ES event of | |||
| has failed and PE1 has taken over. | Action 11, wait until the time indicated by the SCT minus skew befor | |||
| This scenario illustrates the problem with the DF-Election mechanism des | e proceeding to step | |||
| cribed in <relref target="RFC7432" section="8.5"/>, | 9.3.</li> | |||
| specifically in the context of the timer value configured for all PEs on | <li>Assume the role of NDF for the local PE concerning the VLAN or V | |||
| the Ethernet | LAN bundle. | |||
| Segment.</t> | Wait the remaining skew time before proceeding to step 9.4.</li> | |||
| <t>Procedure based on <relref target="RFC7432" section="8.5"/> with the | <li>Assume the election result's role (DF or NDF) for the local PE c | |||
| default 3-second timer in step 2: | oncerning the VLAN or | |||
| <ol> | VLAN bundle and transition to the DF_DONE state.</li> | |||
| <li>Initial state: PE1 is in a steady-state and PE2 is recovering.</li | </ol> | |||
| > | </li> | |||
| <li>Recovery: PE2 recovers at an absolute time of t=99.</li> | ||||
| <li>Advertisement: PE2 advertises RT-4, sent at t=100, to partner PE1. | ||||
| </li> | ||||
| <li>Timer Start: PE2 starts a 3-second timer to allow the reception of | ||||
| RT-4 from other PE | ||||
| nodes.</li> | ||||
| <li>Immediate carving: PE1 performs service carving immediately upon R | ||||
| T-4 reception, i.e., t=100 plus some BGP propagation delay.</li> | ||||
| <li>Delayed Carving: PE2 performs service carving at time t=103.</li> | ||||
| </ol> | </ol> | |||
| </t> | </blockquote> | |||
| <t><xref target="RFC7432"/> favors traffic drops over duplicate traffic. | ||||
| With the above procedure, traffic drops will occur as part of each PE | ||||
| recovery sequence | ||||
| since PE1 transitions some VLANs to Non-Designated Forwarder (NDF) immed | ||||
| iately upon RT-4 | ||||
| reception.<br/> | ||||
| The timer value (default = 3 seconds) directly affects the duration of t | ||||
| he packet | ||||
| drops. A shorter (or zero) timer may result in duplicate traffic or traf | ||||
| fic loops.</t> | ||||
| <t>Procedure based on the Service Carving Time (SCT) approach: | <t>This revised approach ensures proper timing and synchronization in th | |||
| <ol> | e DF election | |||
| <li>Initial state: PE1 is in a steady state, and PE2 is recovering.</l | process, avoiding conflicts and ensuring accurate forwarding updates.</t | |||
| i> | > | |||
| <li>Recovery: PE2 recovers at an absolute time of t=99.</li> | </section> | |||
| <li>Timer Start: PE2 starts at t=100 a 3-second timer to allow the rec | </section> | |||
| eption of RT-4 from other PE | <section anchor="example"> | |||
| <name>Synchronization Scenarios</name> | ||||
| <t>Consider <xref target="topology"/> as an example, where initially PE2 | ||||
| has failed and PE1 has taken over. This scenario illustrates the | ||||
| problem with the DF Election mechanism described in <xref | ||||
| target="RFC7432" section="8.5"/>, specifically in the context of the | ||||
| timer value configured for all PEs on the Ethernet Segment.</t> | ||||
| <t>The following procedure is based on <xref target="RFC7432" | ||||
| section="8.5"/> with the default 3-second timer in step 2. </t> | ||||
| <ol spacing="normal"> | ||||
| | <li>Initial state: PE1 is in a steady state and PE2 is recovering.</li> | |||
| <li>Recovery: PE2 recovers at an absolute time of t=99.</li> | ||||
| <li>Advertisement: PE2 advertises RT-4, sent at t=100, to its partner (P | ||||
| E1).</li> | ||||
| <li>Timer Start: PE2 starts a 3-second timer to allow the reception of | ||||
| RT-4 from other PE nodes.</li> | ||||
| <li>Immediate carving: PE1 performs service carving immediately upon | ||||
| RT-4 reception, i.e., t=100 plus some BGP propagation delay.</li> | ||||
| <li>Delayed Carving: PE2 performs service carving at time t=103.</li> | ||||
| </ol> | ||||
| <t><xref target="RFC7432"/> favors traffic drops over duplicate traffic. | ||||
| With the above procedure, traffic drops will occur as part of each PE | ||||
| recovery sequence since PE1 transitions some VLANs to | ||||
| an NDF immediately upon RT-4 reception. The timer value | ||||
| (default = 3 seconds) directly affects the duration of the packet | ||||
| drops. A shorter (or zero) timer may result in duplicate traffic or | ||||
| traffic loops.</t> | ||||
| <t>The following procedure is based on the SCT approach: | ||||
| </t> | ||||
| <ol spacing="normal"> | ||||
| <li>Initial state: PE1 is in a steady state, and PE2 is recovering.</li> | ||||
| <li>Recovery: PE2 recovers at an absolute time of t=99.</li> | ||||
| <li>Timer Start: PE2 starts at t=100 a 3-second timer to allow the recep | ||||
| tion of RT-4 from other PE | ||||
| nodes.</li> | nodes.</li> | |||
| <li>Advertisement: PE2 advertises RT-4, sent at t=100, with a target S | <li>Advertisement: PE2 advertises RT-4, sent at t=100, with a target SCT | |||
| CT value of t=103 to | value of t=103 to | |||
| partner PE1.</li> | its partner (PE1).</li> | |||
| <li>Service Carving Timer: PE1 starts the service carving timer, with | <li>Service Carving Timer: PE1 starts the service carving timer, with th | |||
| the remaining time | e remaining time | |||
| until t=103.</li> | until t=103.</li> | |||
| <li>Simultaneous Carving: Both PE1 and PE2 carve at an absolute time o | <li>Simultaneous Carving: Both PE1 and PE2 carve at an absolute time of | |||
| f t=103.</li> | t=103.</li> | |||
| </ol> | </ol> | |||
| </t> | <t> | |||
| To maintain the preference for minimal loss over duplicate traffic, PE1 | ||||
| <t> | <bcp14>SHOULD</bcp14> carve | |||
| To maintain the preference for minimal loss over duplicate traffic, PE1 | slightly before PE2 (with skew). The recovering PE2 performs both DF-to- | |||
| SHOULD carve | NDF and NDF-to-DF | |||
| slightly before PE2 (with skew). The recovering PE2 performs both DF to | ||||
| NDF and NDF to DF | ||||
| transitions per VLAN at the timer's expiry. The original PE1, which received the SCT, applies the following: | transitions per VLAN at the timer's expiry. The original PE1, which received the SCT, applies the following: | |||
| <ul> | </t> | |||
| <li>DF to NDF Transition(s): at t=SCT minus skew, where both PEs are | <ul spacing="normal"> | |||
| NDF for the skew duration.</li> | <li>DF-to-NDF Transition(s): at t=SCT minus skew, where both PEs are NDF | |||
| <li>NDF to DF Transition(s): at t=SCT.</li> | for the skew duration.</li> | |||
| </ul> | <li>NDF-to-DF Transition(s): at t=SCT.</li> | |||
| This split-behavior ensures a smooth DF role transition with minimal los | </ul> | |||
| s. | <t> | |||
| </t> | This split behavior ensures a smooth DF role transition with minimal los | |||
| s. | ||||
| <t>Using the SCT approach, the negative effect of the timer to allow the | </t> | |||
| reception of | <t>The SCT approach mitigates the negative effect of requiring a timer for | |||
| Ethernet Segment RT-4 from other PE nodes is mitigated. Furthermore, the | discovery of | |||
| BGP | Ethernet Segment (ES) RT-4 from other PE nodes. Furthermore, the BGP | |||
| transmission delay (from PE2 to PE1) of the ES RT-4 becomes a non-issue. The SCT approach shortens the | transmission delay (from PE2 to PE1) of the ES RT-4 becomes a non-issue. The SCT approach shortens the | |||
| 3-second timer window to the order of milliseconds.</t> | 3-second timer window to the order of milliseconds.</t> | |||
| <t>The peering timer is a configurable value where 3 seconds represents th | ||||
| <t>The peering timer is a configurable value where 3 seconds represents | e default. | |||
| the default. | ||||
| Configuring a timer value of 0, or so small as to expire during propagation of the BGP | Configuring a timer value of 0, or so small as to expire during propagation of the BGP | |||
| routes, is outside the scope of this document. | routes, is outside the scope of this document. | |||
| In reality, the use of the SCT approach presented in this document encourages the use of | In reality, the use of the SCT approach presented in this document encourages the use of | |||
| larger peering timer values to overcome any sort of BGP route propagation delays.</t> | larger peering timer values to overcome any sort of BGP route propagation delays.</t> | |||
| <section anchor="concurrent"> | ||||
| <section anchor="concurrent" title="Concurrent Recoveries"> | <name>Concurrent Recoveries</name> | |||
| <t>In the eventuality 2 or more PEs in a peering Ethernet Segment group | <t>In the eventuality that two or more PEs in a peering Ethernet Segment | |||
| are recovering | group are recovering | |||
| concurrently or roughly the same time, each will advertise a Service Car | concurrently or roughly at the same time, each will advertise an SCT. | |||
| ving Time. | ||||
| This SCT value would correspond to what each recovering PE considers the "end time" for DF | This SCT value would correspond to what each recovering PE considers the "end time" for DF | |||
| Election. A similar situation arises in sequentially recovering PEs, when a second PE | Election. A similar situation arises in sequentially recovering PEs, when a second PE | |||
| recovers approximately at the time of the first PE's advertised SCT expiry, and with its own | recovers approximately at the time of the first PE's advertised SCT expiry and with its own | |||
| new SCT-2 outside of the initial SCT window.</t> | new SCT-2 outside of the initial SCT window.</t> | |||
| <t>In the case of multiple concurrent DF elections, each initiated by one of the recovering | <t>In the case of multiple concurrent DF elections, each initiated by one of the recovering | |||
| PEs, the SCTs must be ordered chronologically. All PEs SHALL execute only a single DF | PEs, the SCTs must be ordered chronologically. All PEs <bcp14>SHALL</bcp14> execute only a single DF | |||
| Election at the service carving time corresponding to the largest (latest) received timestamp value. | Election at the service carving time corresponding to the largest (latest) received timestamp value. | |||
| This DF Election will lead peering PEs into a single co-ordinated DF Ele | This DF Election will lead peering PEs into a single coordinated DF Elec | |||
| ction update.</t> | tion update.</t> | |||
| <t>Example: | <t>Example: | |||
| </t> | ||||
| <ol> | <ol> | |||
| <li>Initial State: PE1 is in a steady state, with services elected at PE1.</li> | <li>Initial State: PE1 is in a steady state, with services elected at PE1.</li> | |||
| <li>Recovery of PE2: PE2 recovers at time t=100 and advertises RT-4 with a target SCT | <li>Recovery of PE2: PE2 recovers at time t=100 and advertises RT-4 with a target SCT | |||
| value of t=103 to its partners (PE1).</li> | value of t=103 to its partner (PE1).</li> | |||
| <li>Timer Initiation by PE2: PE2 starts a 3-second timer to allow the reception of RT-4 | <li>Timer Initiation by PE2: PE2 starts a 3-second timer to allow the reception of RT-4 | |||
| from other PE nodes.</li> | from other PE nodes.</li> | |||
| <li>Timer Initiation by PE1: PE1 starts the service carving timer, with the remaining time | <li>Timer Initiation by PE1: PE1 starts the service carving timer, with the remaining time | |||
| until t=103.</li> | until t=103.</li> | |||
| <li>Recovery of PE3: PE3 recovers at time t=102 and advertises RT-4 with a target SCT | <li>Recovery of PE3: PE3 recovers at time t=102 and advertises RT-4 with a target SCT | |||
| value of t=105 to its partners (PE1, PE2).</li> | value of t=105 to its partners (PE1, PE2).</li> | |||
| <li>Timer Initiation by PE3: PE3 starts a 3-second timer to allow the reception of RT-4 | <li>Timer Initiation by PE3: PE3 starts a 3-second timer to allow the reception of RT-4 | |||
| from other PE nodes.</li> | from other PE nodes.</li> | |||
| <li>Timer Update by PE2: PE2 cancels the running timer and starts the service carving | <li>Timer Update by PE2: PE2 cancels the running timer and starts the service carving | |||
| timer with the remaining time until t=105.</li> | timer with the remaining time until t=105.</li> | |||
| <li>Timer Update by PE1: PE1 updates its service carving timer, with the remaining time | <li>Timer Update by PE1: PE1 updates its service carving timer, with the remaining time | |||
| until t=105.</li> | until t=105.</li> | |||
| <li>Service Carving: PE1, PE2, and PE3 perform service carving at the absolute time of t=105.</li> | <li>Service Carving: PE1, PE2, and PE3 perform service carving at the absolute time of t=105.</li> | |||
| </ol> | </ol> | |||
| </t> | <t>In the eventuality that a PE in an Ethernet Segment group recovers du | |||
| ring the discovery window | ||||
| <t>In the eventuality a PE in an Ethernet Segment group recovers during | specified in <xref target="RFC7432" section="8.5"/> and does not support | |||
| the discovery window | or advertise the | |||
| specified in <relref target="RFC7432" section="8.5"/>, and does not supp | T-bit, all PEs in the current peering sequence <bcp14>SHALL</bcp14> imme | |||
| ort or advertise the | diately revert to the default | |||
| T-bit, then all PEs in the current peering sequence SHALL immediately re | behavior described in <xref target="RFC7432"/>.</t> | |||
| vert to the default | </section> | |||
| <xref target="RFC7432"/> behavior.</t> | </section> | |||
| <section anchor="ntpcompat"> | ||||
| </section> | <name>Backwards Compatibility</name> | |||
| </section> | <t>For the DF election procedures to achieve global convergence and unanim | |||
| ity within a | ||||
| <section anchor="ntpcompat" title="Backwards Compatibility"> | ||||
| <t>For the DF election procedures to achieve global convergence and un | ||||
| animity within a | ||||
| redundancy group, it is essential that all participating PEs agree on the DF election | redundancy group, it is essential that all participating PEs agree on the DF election | |||
| algorithm to be employed. However, it is possible that some PEs may continue to use the | algorithm to be employed. However, it is possible that some PEs may continue to use the | |||
| existing modulo-based DF election algorithm from <xref target="RFC7432 | existing modulo-based DF election algorithm from <xref target="RFC7432 | |||
| "/> and not utilize the new Service Carving Time | "/> and not utilize the new | |||
| (SCT) BGP extended community. PEs that operate using the baseline DF e | SCT BGP extended community. PEs that operate using the baseline DF ele | |||
| lection mechanism | ction mechanism | |||
| will simply discard the new SCT BGP extended community as unrecognized.</t> | will simply discard the new SCT BGP extended community as unrecognized.</t> | |||
| <t>A PE can indicate its willingness to support clock-synchronized carving | ||||
| <t>A PE can indicate its willingness to support clock-synchronized car | by signaling | |||
| ving by signaling | the new "T" DF Election Capability and including the new SCT BGP exten | |||
| the new 'T' DF Election Capability and including the new SCT BGP exten | ded community along | |||
| ded community along | ||||
| with the Ethernet Segment Route Type 4. If one or more PEs attached to the Ethernet | with the Ethernet Segment Route Type 4. If one or more PEs attached to the Ethernet | |||
| Segment do not signal T=1, then all PEs in the Ethernet Segment SHALL revert to the | Segment do not signal T=1, then all PEs in the Ethernet Segment <bcp14>SHALL</bcp14> revert to the | |||
| timer-based approach as specified in <xref target="RFC7432"/>. This reversion is particularly crucial in | timer-based approach as specified in <xref target="RFC7432"/>. This reversion is particularly crucial in | |||
| preventing VLAN shuffling when more than two PEs are involved.</t> | preventing VLAN shuffling when more than two PEs are involved.</t> | |||
| <t>In the event a new or extra RT-4 is received without the new "T" DF Ele | ||||
| <t>In the event a new or extra RT-4 is received without the new 'T' DF | ction | |||
| Election | ||||
| Capability in the midst of an ongoing DF Election sequence, all SCT-based delays are | Capability in the midst of an ongoing DF Election sequence, all SCT-based delays are | |||
| cancelled and the DF Election immediately applied as specified in <xre | canceled, and the DF Election is immediately applied as specified in < | |||
| f | xref target="RFC7432"/>, as if no SCT had been previously exchanged.</t> | |||
| target="RFC7432"/>, as if no SCT had been previously exchanged.</t> | </section> | |||
| <section anchor="security"> | ||||
| </section> | <name>Security Considerations</name> | |||
| <t>The mechanisms in this document use the EVPN control plane as defined | ||||
| <section anchor="security" title="Security Considerations"> | in <xref target="RFC7432"/>. Security considerations described in <xref | |||
| <t>The mechanisms in this document use the EVPN control plane as defined | target="RFC7432"/> are equally applicable.</t> | |||
| in | <t>For the new SCT Extended Community, attack vectors may be setting the | |||
| <xref target="RFC7432"/>. Security considerations described in | value to zero, to a value in the past, or to large times in the | |||
| <xref target="RFC7432"/> are equally applicable.</t> | future. Handling of this attack vector is addressed in <xref | |||
| target="timestamp_verification"/> alongside NTP Era rollover | ||||
| <t>For the new SCT Extended Community, attack vectors may be setting the | ambiguity.</t> | |||
| value to zero, to a | <t>This document uses MPLS- and IP-based tunnel technologies to support | |||
| value in the past or to large times in the future. Handling of this atta | data plane transport. Security considerations described in <xref | |||
| ck vector is | target="RFC7432"/> and <xref target="RFC8365"/> are equally | |||
| addressed in <xref target="timestamp_verification"/> alongside NTP Era r | applicable.</t> | |||
| ollover ambiguity.</t> | </section> | |||
| <section anchor="IANA"> | ||||
| <t>This document uses MPLS and IP-based tunnel technologies to support d | <name>IANA Considerations</name> | |||
| ata plane transport. | <t>IANA has made the following assignment in the "EVPN Extended | |||
| Security considerations described in <xref target="RFC7432"/> and in <xr | Community Sub-Types" registry set up by <xref target="RFC7153"/>. | |||
| ef target="RFC8365"/> are equally applicable.</t> | </t> | |||
| </section> | <table> | |||
| <name></name> | ||||
| <section anchor="IANA" title="IANA Considerations"> | <thead> | |||
| <tr> | ||||
| <t>IANA maintains the "EVPN Extended Community Sub-Types" registry set | <th>Sub-Type Value</th> | |||
| up by <xref target='RFC7153'/>, where the following assignment has been m | <th>Name</th> | |||
| ade: | <th>Reference</th> | |||
| <figure><artwork><![CDATA[ | </tr> | |||
| Sub-Type Value Name Reference | </thead> | |||
| -------------- ------------------------- ------------- | <tbody> | |||
| 0x0F Service Carving Time This document | <tr> | |||
| ]]></artwork></figure> | <td>0x0F</td> | |||
| </t> | <td>Service Carving Time</td> | |||
| <td>RFC 9722</td> | ||||
| <t>IANA maintains the "DF Election Capabilities" registry set up by | </tr> | |||
| <xref target="RFC8584"/>. IANA is requested to make the following assign | </tbody> | |||
| ment from | </table> | |||
| this registry: | ||||
| <figure><artwork><![CDATA[ | <t>IANA has made the following assignment in the "DF Election | |||
| Bit Name Reference | Capabilities" registry set up by <xref target="RFC8584"/>.</t> | |||
| ---- ---------------- ------------- | ||||
| 3 Time Synchronization This document | ||||
| ]]></artwork></figure> | ||||
| </t> | <table> | |||
| </section> | <name></name> | |||
| </middle> | <thead> | |||
| <tr> | ||||
| <th>Bit</th> | ||||
| <th>Name</th> | ||||
| <th>Reference</th> | ||||
| </tr> | ||||
| </thead> | ||||
| <tbody> | ||||
| <tr> | ||||
| <td>3</td> | ||||
| <td>Time Synchronization</td> | ||||
| <td>RFC 9722</td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <!-- *****BACK MATTER ***** --> | </section> | |||
| </middle> | ||||
| <back> | <back> | |||
| <!-- References split into informative and normative --> | <references> | |||
| <references title="Normative References"> | <name>References</name> | |||
| <references> | ||||
| <name>Normative References</name> | ||||
| <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.2119.xml"/> | <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.2119.xml"/> | |||
| <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.8174.xml"/> | <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.8174.xml"/> | |||
| <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.7153.xml"/> | <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.7153.xml"/> | |||
| <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.7432.xml"/> | <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.7432.xml"/> | |||
| <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.8365.xml"/> | <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.8365.xml"/> | |||
| <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.8584.xml"/> | <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.8584.xml"/> | |||
| <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.5905.xml"/> | <xi:include href="https://www.rfc-editor.org/refs/bibxml/reference.RFC.5905.xml"/> | |||
| </references> | </references> | |||
| <!-- References split into informative and normative --> | <references> | |||
| <references title="Informative References"> | <name>Informative References</name> | |||
| <reference anchor="HRW98" target="https://www.microsoft.com/en-us/resear | <reference anchor="HRW98" target="https://www.microsoft.com/en-us/resear | |||
| ch/wp-content/ | ch/wp-content/uploads/2017/02/HRW98.pdf"> | |||
| uploads/2017/02/HRW98.pdf"> | ||||
| <front> | <front> | |||
| <title>Using Name-Based Mappings to Increase Hit Rates</title> | <title>Using Name-Based Mappings to Increase Hit Rates</title> | |||
| <author initials="D" surname="Thaler"> | <author initials="D" surname="Thaler"> | |||
| <organization/> | <organization/> | |||
| </author> | </author> | |||
| <author initials="C" surname="Ravishankar"> | <author initials="C" surname="Ravishankar"> | |||
| <organization/> | <organization/> | |||
| </author> | </author> | |||
| <date year="1998"/> | <date month="February" year="1998"/> | |||
| </front> | </front> | |||
| <refcontent>IEEE/ACM Transactions on Networking, vol. 6, no. 1</refcontent> | ||||
| </reference> | </reference> | |||
| </references> | ||||
| </references> | </references> | |||
| <section anchor="contributors" title="Contributors"> | <section anchor="acknowledgements" numbered="false"> | |||
| <t>In addition to the authors listed on the front page, the following co-aut | <name>Acknowledgements</name> | |||
| hors | <t>Authors would like to acknowledge helpful comments and contributions | |||
| have also contributed substantially to this document:</t> | of <contact fullname="Satya Mohanty"/> and <contact fullname="Bharath | |||
| Vasudevan"/>. Also thank you to <contact fullname="Anoop Ghanwani"/> | ||||
| and <contact fullname="Gunter van de Velde"/> for their thorough review | ||||
| with valuable comments and corrections.</t> | ||||
| </section> | ||||
| <t>Gaurav Badoni<br/>Cisco</t> | <section anchor="contributors" numbered="false"> | |||
| <t>Email: gbadoni@cisco.com</t> | <name>Contributors</name> | |||
| <t>In addition to the authors listed on the front page, the following | ||||
| coauthors have also contributed substantially to this document:</t> | ||||
| <t>Dhananjaya Rao<br/>Cisco</t> | <contact fullname="Gaurav Badoni"> | |||
| <t>Email: dhrao@cisco.com</t> | <organization>Cisco</organization> | |||
| </section> | <address> | |||
| <email>gbadoni@cisco.com</email> | ||||
| </address> | ||||
| </contact> | ||||
| <contact fullname="Dhananjaya Rao"> | ||||
| <organization>Cisco</organization> | ||||
| <address> | ||||
| <email>dhrao@cisco.com</email> | ||||
| </address> | ||||
| </contact> | ||||
| <section anchor="acknowledgements" title="Acknowledgements"> | ||||
| <t>Authors would like to acknowledge helpful comments | ||||
| and contributions of Satya Mohanty and Bharath Vasudevan. | ||||
| Also thank you to Anoop Ghanwani and Gunter van de Velde for their thoro | ||||
| ugh review with valuable comments and | ||||
| corrections.</t> | ||||
| </section> | </section> | |||
| </back> | </back> | |||
| </rfc> | </rfc> | |||
| End of changes. 102 change blocks. | ||||
| 596 lines changed or deleted | 514 lines changed or added | |||
This html diff was produced by rfcdiff 1.48. | ||||