| rfc8684xml2.original.xml | rfc8684.xml | |||
|---|---|---|---|---|
| <?xml version="1.0" encoding="US-ASCII"?> | <?xml version='1.0' encoding='utf-8'?> | |||
| <!-- Convert to HTML and Text with xml2rfc: http://xml2rfc.ietf.org. --> | ||||
| <!DOCTYPE rfc SYSTEM "rfc2629.dtd" [ | <!DOCTYPE rfc SYSTEM "rfc2629-xhtml.ent"> | |||
| <!ENTITY RFC5533 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | <rfc xmlns:xi="http://www.w3.org/2001/XInclude" submissionType="IETF" | |||
| RFC.5533.xml"> | category="std" consensus="true" docName="draft-ietf-mptcp-rfc6824bis-18" nu | |||
| <!ENTITY RFC5062 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | mber="8684" ipr="trust200902" obsoletes="6824" updates="" xml:lang="en" tocInclu | |||
| RFC.5062.xml"> | de="true" symRefs="true" sortRefs="true" version="3"> | |||
| <!ENTITY RFC5061 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.5061.xml"> | <!-- xml2rfc v2v3 conversion 2.27.0 --> | |||
| <!ENTITY RFC4960 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.4960.xml"> | ||||
| <!ENTITY RFC4987 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.4987.xml"> | ||||
| <!ENTITY RFC6234 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.6234.xml"> | ||||
| <!ENTITY RFC4086 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.4086.xml"> | ||||
| <!ENTITY RFC5681 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.5681.xml"> | ||||
| <!ENTITY RFC2119 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.2119.xml"> | ||||
| <!ENTITY RFC2992 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.2992.xml"> | ||||
| <!ENTITY RFC2979 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.2979.xml"> | ||||
| <!ENTITY RFC2104 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.2104.xml"> | ||||
| <!ENTITY RFC2018 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.2018.xml"> | ||||
| <!ENTITY RFC1918 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.1918.xml"> | ||||
| <!ENTITY RFC0793 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.0793.xml"> | ||||
| <!ENTITY RFC7323 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.7323.xml"> | ||||
| <!ENTITY RFC1122 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.1122.xml"> | ||||
| <!ENTITY RFC3135 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.3135.xml"> | ||||
| <!ENTITY RFC3022 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.3022.xml"> | ||||
| <!ENTITY RFC6181 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.6181.xml"> | ||||
| <!ENTITY RFC6182 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.6182.xml"> | ||||
| <!ENTITY RFC6356 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.6356.xml"> | ||||
| <!ENTITY RFC6555 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.6555.xml"> | ||||
| <!ENTITY RFC8126 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.8126.xml"> | ||||
| <!ENTITY RFC6897 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.6897.xml"> | ||||
| <!ENTITY RFC6528 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.6528.xml"> | ||||
| <!ENTITY RFC5961 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.5961.xml"> | ||||
| <!ENTITY RFC7413 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.7413.xml"> | ||||
| <!ENTITY RFC7430 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.7430.xml"> | ||||
| <!ENTITY RFC8174 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.8174.xml"> | ||||
| <!ENTITY RFC8041 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference. | ||||
| RFC.8041.xml"> | ||||
| ]> | ||||
| <?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?> | ||||
| <?rfc strict="no" ?> | ||||
| <?rfc toc="yes"?> | ||||
| <?rfc tocdepth="4"?> | ||||
| <?rfc symrefs="yes"?> | ||||
| <?rfc sortrefs="yes" ?> | ||||
| <?rfc compact="yes" ?> | ||||
| <?rfc subcompact="no" ?> | ||||
| <?rfc rfcedstyle="yes"?> | ||||
| <rfc category="std" docName="draft-ietf-mptcp-rfc6824bis-18" ipr="trust200902" obsoletes="6824"> | ||||
| <front> | <front> | |||
| <title abbrev="Multipath TCP">TCP Extensions for Multipath Operation with Multiple Addresses</title> | <title abbrev="Multipath TCP">TCP Extensions for Multipath Operation with Multiple Addresses</title> | |||
| <seriesInfo name="RFC" value="8684"/> | ||||
| <author fullname="Alan Ford" initials="A." surname="Ford"> | <author fullname="Alan Ford" initials="A." surname="Ford"> | |||
| <organization>Pexip</organization> | <organization>Pexip</organization> | |||
| <address> | <address> | |||
| <!-- <postal> | ||||
| <street>Beech Court</street> | ||||
| <city>Hurst</city> | ||||
| <region>Berkshire</region> | ||||
| <code>RG10 0RQ</code> | ||||
| <country>UK</country> | ||||
| </postal> --> | ||||
| <email>alan.ford@gmail.com</email> | <email>alan.ford@gmail.com</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author fullname="Costin Raiciu" initials="C." surname="Raiciu"> | <author fullname="Costin Raiciu" initials="C." surname="Raiciu"> | |||
| <organization abbrev="U. Politechnica of Bucharest">University Politehnica of Bucharest</organization> | <organization abbrev="U. Politehnica of Bucharest">University Politehnica of Bucharest</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street>Splaiul Independentei 313</street> | <street>Splaiul Independentei 313</street> | |||
| <city>Bucharest</city> | <city>Bucharest</city> | |||
| <country>Romania</country> | <country>Romania</country> | |||
| </postal> | </postal> | |||
| <email>costin.raiciu@cs.pub.ro</email> | <email>costin.raiciu@cs.pub.ro</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author fullname="Mark Handley" initials="M." surname="Handley"> | <author fullname="Mark Handley" initials="M." surname="Handley"> | |||
| <organization abbrev="U. College London">University College London</organization> | <organization abbrev="U. College London">University College London</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street>Gower Street</street> | <street>Gower Street</street> | |||
| <city>London</city> | <city>London</city> | |||
| <code>WC1E 6BT</code> | <code>WC1E 6BT</code> | |||
| <country>UK</country> | <country>United Kingdom</country> | |||
| </postal> | </postal> | |||
| <email>m.handley@cs.ucl.ac.uk</email> | <email>m.handley@cs.ucl.ac.uk</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author fullname="Olivier Bonaventure" initials="O." surname="Bonaventure"> | <author fullname="Olivier Bonaventure" initials="O." surname="Bonaventure"> | |||
| <organization abbrev="U. catholique de Louvain">Université catholiq | <organization abbrev="U. catholique de Louvain" ascii="Universite catholique | |||
| ue de Louvain</organization> | de Louvain">Université catholique de Louvain</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street>Pl. Ste Barbe, 2</street> | <street>Pl. Ste Barbe, 2</street> | |||
| <code>1348</code> | <code>1348</code> | |||
| <city>Louvain-la-Neuve</city> | <city>Louvain-la-Neuve</city> | |||
| <country>Belgium</country> | <country>Belgium</country> | |||
| </postal> | </postal> | |||
| <email>olivier.bonaventure@uclouvain.be</email> | <email>olivier.bonaventure@uclouvain.be</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| skipping to change at line 101 ¶ | skipping to change at line 55 ¶ | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street>Pl. Ste Barbe, 2</street> | <street>Pl. Ste Barbe, 2</street> | |||
| <code>1348</code> | <code>1348</code> | |||
| <city>Louvain-la-Neuve</city> | <city>Louvain-la-Neuve</city> | |||
| <country>Belgium</country> | <country>Belgium</country> | |||
| </postal> | </postal> | |||
| <email>olivier.bonaventure@uclouvain.be</email> | <email>olivier.bonaventure@uclouvain.be</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author fullname="Christoph Paasch" initials="C." surname="Paasch"> | <author fullname="Christoph Paasch" initials="C." surname="Paasch"> | |||
| <organization abbrev="Apple, Inc.">Apple, Inc.</organization> | <organization abbrev="Apple, Inc.">Apple, Inc.</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street></street> | <street/> | |||
| <city>Cupertino</city> | <city>Cupertino</city> | |||
| <country>US</country> | <region>CA</region> | |||
| <country>United States of America</country> | ||||
| </postal> | </postal> | |||
| <email>cpaasch@apple.com</email> | <email>cpaasch@apple.com</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <date year="2020" month="March"/> | ||||
| <date year="2019" /> | <keyword>tcp</keyword> | |||
| <keyword>extensions</keyword> | ||||
| <area>General</area> | <keyword>multipath</keyword> | |||
| <workgroup>Internet Engineering Task Force</workgroup> | <keyword>multihomed</keyword> | |||
| <keyword>tcp extensions multipath multihomed subflow</keyword> | <keyword>subflow</keyword> | |||
| <abstract> | <abstract> | |||
| <t>TCP/IP communication is currently restricted to a single path per conne | <t>TCP/IP communication is currently restricted to a single path per conne | |||
| ction, yet multiple paths often exist between peers. The simultaneous use of the | ction, yet multiple paths often exist between peers. The simultaneous use of the | |||
| se multiple paths for a TCP/IP session would improve resource usage within the n | se multiple paths for a TCP/IP session would improve resource usage within the n | |||
| etwork and, thus, improve user experience through higher throughput and improved | etwork and thus improve user experience through higher throughput and improved r | |||
| resilience to network failure.</t> | esilience to network failure.</t> | |||
| <t>Multipath TCP provides the ability to simultaneously use multiple | ||||
| <t>Multipath TCP provides the ability to simultaneously use multiple paths | paths between peers. This document presents a set of extensions to | |||
| between peers. This document presents a set of extensions to traditional TCP to | traditional TCP to support multipath operation. The protocol offers the | |||
| support multipath operation. The protocol offers the same type of service to ap | same type of service to applications as TCP (i.e., a reliable bytestream), | |||
| plications as TCP (i.e., reliable bytestream), and it provides the components ne | and it provides the components necessary to establish and use multiple TCP flow | |||
| cessary to establish and use multiple TCP flows across potentially disjoint path | s across potentially disjoint paths.</t> | |||
| s.</t> | <t>This document specifies v1 of Multipath TCP, obsoleting v0 as | |||
| specified in RFC 6824, through clarifications and modifications primarily | ||||
| <t>This document specifies v1 of Multipath TCP, obsoleting v0 as specified | driven by deployment experience.</t> | |||
| in RFC6824, through clarifications and modifications primarily driven by deploy | ||||
| ment experience.</t> | ||||
| </abstract> | </abstract> | |||
| </front> | </front> | |||
| <middle> | <middle> | |||
| <section title="Introduction" anchor="sec_intro"> | <section anchor="sec_intro" numbered="true" toc="default"> | |||
| <t>Multipath TCP (MPTCP) is a set of extensions to regular TCP <xref targe | <name>Introduction</name> | |||
| t="RFC0793"/> to provide a Multipath TCP <xref target="RFC6182"/> service, which | <t>Multipath TCP (MPTCP) is a set of extensions to regular TCP <xref | |||
| enables a transport connection to operate across multiple paths | target="RFC0793" format="default"/> to provide a Multipath TCP service <xr | |||
| simultaneously. This document presents the protocol changes required to add mult | ef target="RFC6182" format="default"/>, which enables a transport connection to | |||
| ipath capability to TCP; specifically, those for signaling and setting up multip | operate across multiple paths | |||
| le paths ("subflows"), managing these subflows, reassembly of data, and terminat | simultaneously. This document presents the protocol changes required to add | |||
| ion of sessions. | multipath capability to TCP -- specifically, those for signaling and setting | |||
| This is not the only information required to create a Multipath TCP implem | up multiple paths ("subflows"), managing these subflows, reassembly of data, | |||
| entation, however. This document is complemented by three others: | and termination of sessions. This is not the only information required to create | |||
| <list style="symbols"> | a Multipath TCP implementation, however. This document is complemented by three | |||
| <t>Architecture <xref target="RFC6182"/>, which explains the motivatio | others: | |||
| ns behind Multipath TCP, contains a discussion of high-level design decisions on | ||||
| which this design is based, and an explanation of a functional separation throu | ||||
| gh which an extensible MPTCP implementation can be developed.</t> | ||||
| <t>Congestion control <xref target="RFC6356"/> presents a safe congest | ||||
| ion control algorithm for coupling the behavior of the multiple paths in order t | ||||
| o "do no harm" to other network users.</t> | ||||
| <t>Application considerations <xref target="RFC6897"/> discusses what | ||||
| impact MPTCP will have on applications, what applications will want to do with M | ||||
| PTCP, and as a consequence of these factors, what API extensions an MPTCP implem | ||||
| entation should present.</t> | ||||
| </list> | ||||
| This document is an update to, and obsoletes, the v0 specification of Mult | ||||
| ipath TCP (RFC6824). This document specifies MPTCP v1, which is not backward com | ||||
| patible with MPTCP v0. This document additionally defines version negotiation pr | ||||
| ocedures for implementations that support both versions. | ||||
| </t> | </t> | |||
| <ul spacing="normal"> | ||||
| <section title="Design Assumptions" anchor="sec_assum"> | <li><xref target="RFC6182" format="default"/> (MPTCP architecture), whic | |||
| <t>In order to limit the potentially huge design space, the mptcp workin | h | |||
| g group imposed two key constraints on the Multipath TCP design presented in thi | explains the motivations behind Multipath TCP, contains a discussion | |||
| s document: | of high-level design decisions on which this design is based, and provid | |||
| <list style="symbols"> | es an explanation of a functional separation through which an extensible MPTCP i | |||
| <t>It must be backwards-compatible with current, regular TCP, to inc | mplementation can be developed.</li> | |||
| rease its chances of deployment.</t> | <li><xref target="RFC6356" format="default"/> (congestion control), whic | |||
| <t>It can be assumed that one or both hosts are multihomed and multi | h presents a safe congestion control algorithm for coupling the behavior of the | |||
| addressed.</t> | multiple paths in order to "do no harm" to other network users.</li> | |||
| </list> | <li><xref target="RFC6897" | |||
| format="default"/> (application considerations), which discusses what im | ||||
| pact MPTCP will have on applications, what applications will want to do with MPT | ||||
| CP, and as a consequence of these factors, what API extensions an MPTCP implemen | ||||
| tation should present.</li> | ||||
| </ul> | ||||
| <t> | ||||
| This document obsoletes the v0 specification of | ||||
| Multipath TCP <xref target="RFC6824"/>. This document specifies MPTCP v1, | ||||
| which is not backward compatible with MPTCP v0. This document additionally defin | ||||
| es version negotiation procedures for implementations that support both versions | ||||
| . | ||||
| </t> | ||||
| <section anchor="sec_assum" numbered="true" toc="default"> | ||||
| <name>Design Assumptions</name> | ||||
| <t>In order to limit the potentially huge design space, the | ||||
| MPTCP Working Group imposed two key constraints on the Multipath TCP des | ||||
| ign presented in this document: | ||||
| </t> | </t> | |||
| <t>To simplify the design, we assume that the presence of multiple addre | <ul spacing="normal"> | |||
| sses at a host is sufficient to indicate the existence of multiple paths. These | <li>It must be backward compatible with current, regular TCP, to incre | |||
| paths need not be entirely disjoint: they may share one or many routers between | ase its chances of deployment.</li> | |||
| them. Even in such a situation, making use of multiple paths is beneficial, impr | <li>It can be assumed that one or both hosts are multihomed and multia | |||
| oving resource utilization and resilience to a subset of node failures. The cong | ddressed.</li> | |||
| estion control algorithms defined in <xref target="RFC6356"/> ensure this does n | </ul> | |||
| ot act detrimentally. Furthermore, there may be some scenarios where different T | <t>To simplify the design, we assume that the presence of multiple | |||
| CP ports on a single host can provide disjoint paths (such as through certain Eq | addresses at a host is sufficient to indicate the existence of | |||
| ual-Cost Multipath (ECMP) implementations <xref target="RFC2992"/>), and so the | multiple paths. These paths need not be entirely disjoint: they may | |||
| MPTCP design also supports the use of ports in path identifiers.</t> | share one or many routers between them. Even in such a situation, | |||
| <t>There are three aspects to the backwards-compatibility listed above ( | making use of multiple paths is beneficial, improving resource | |||
| discussed in more detail in <xref target="RFC6182"/>): | utilization and resilience to a subset of node failures. The | |||
| <list style="hanging"> | congestion control algorithm defined in <xref target="RFC6356" | |||
| <t hangText="External Constraints:"> The protocol must function thro | format="default"/> ensures that the use of multiple paths does not act d | |||
| ugh the vast majority of existing | etrimentally. | |||
| middleboxes such as NATs, firewalls, and proxies, and as such must resemble exis | Furthermore, there may be some scenarios where different TCP ports on a | |||
| ting TCP as far as possible on the | single host can provide disjoint paths (such as through certain | |||
| wire. Furthermore, the protocol must not assume the segments it sends on the wir | Equal-Cost Multipath (ECMP) implementations <xref target="RFC2992" | |||
| e arrive unmodified at the destination: | format="default"/>), and so the MPTCP design also supports the use of | |||
| they may be split or coalesced; TCP options may be removed or duplicated. </t> | ports in path identifiers.</t> | |||
| <t hangText="Application Constraints:"> The protocol must be usable | <t>There are three aspects to the backward compatibility listed above (d | |||
| with no change to existing applications that use the common TCP API (although it | iscussed in more detail in <xref target="RFC6182" format="default"/>): | |||
| is reasonable that not all features would be available to such legacy applicati | ||||
| ons). Furthermore, the protocol must provide the same service model as regular T | ||||
| CP to the application.</t> | ||||
| <t hangText="Fallback:"> The protocol should be able to fall back to | ||||
| standard TCP with no interference from the user, to be able to communicate with | ||||
| legacy hosts.</t> | ||||
| </list> | ||||
| </t> | </t> | |||
| <t>The complementary application considerations document <xref target="R | <dl newline="false" spacing="normal" indent="3"> | |||
| FC6897"/> discusses the necessary features of an API to provide backwards-compat | <dt>External Constraints:</dt> | |||
| ibility, as well as API extensions to convey the behavior of MPTCP at a level of | <dd> The protocol must function through the vast majority of existing | |||
| control and information equivalent to that available with regular, single-path | middleboxes such as NATs, firewalls, and proxies, and as such must resemble exis | |||
| TCP.</t> | ting TCP as far as possible on the | |||
| <t>Further discussion of the design constraints and associated design de | wire. Furthermore, the protocol must not assume that the segments it sends on th | |||
| cisions are given in the MPTCP Architecture document <xref target="RFC6182"/> an | e wire arrive unmodified at the destination: | |||
| d in <xref target="howhard"/>.</t> | they may be split or coalesced; TCP options may be removed or duplicated. </dd> | |||
| <dt>Application Constraints:</dt> | ||||
| <dd> The protocol must be usable with no change to existing applicatio | ||||
| ns that use the common TCP API (although it is reasonable that not all features | ||||
| would be available to such legacy applications). Furthermore, the protocol must | ||||
| provide the same service model as regular TCP to the application.</dd> | ||||
| <dt>Fallback:</dt> | ||||
| <dd> The protocol should be able to fall back to standard TCP with no | ||||
| interference from the user, to be able to communicate with legacy hosts.</dd> | ||||
| </dl> | ||||
| <t>The complementary application considerations document <xref | ||||
| target="RFC6897" format="default"/> discusses the necessary features | ||||
| of an API to provide backward compatibility, as well as API extensions t | ||||
| o convey the behavior of MPTCP at a level of control and information equivalent | ||||
| to that available with regular, single-path TCP.</t> | ||||
| <t>Further discussion of the design constraints and associated design de | ||||
| cisions is given in the MPTCP architecture document <xref target="RFC6182" forma | ||||
| t="default"/> and in <xref target="howhard" format="default"/>.</t> | ||||
| </section> | </section> | |||
| <section anchor="sec_layers" numbered="true" toc="default"> | ||||
| <section title="Multipath TCP in the Networking Stack" anchor="sec_layers" | <name>Multipath TCP in the Networking Stack</name> | |||
| > | ||||
| <t>MPTCP operates at the transport layer and aims to be transparent to both higher and lower | <t>MPTCP operates at the transport layer and aims to be transparent to both higher and lower | |||
| layers. It is a set of additional features on top of standard TCP; <xref target="fig_arch" /> illustrates | layers. It is a set of additional features on top of standard TCP; <xref target="fig_arch" format="default"/> illustrates | |||
| this layering. MPTCP is designed to be usable by legacy applications with no changes; detailed discussion | this layering. MPTCP is designed to be usable by legacy applications with no changes; detailed discussion | |||
| of its interactions with applications is given in <xref target="RFC6897"/>.</t> | of its interactions with applications is given in <xref target="RFC6897" format= | |||
| "default"/>.</t> | ||||
| <figure align="center" anchor="fig_arch" title="Comparison of Standard T | <figure anchor="fig_arch"> | |||
| CP and MPTCP Protocol Stacks"> | <name>Comparison of Standard TCP and MPTCP Protocol Stacks</name> | |||
| <artwork align="left"><![CDATA[ | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| +-------------------------------+ | +-------------------------------+ | |||
| | Application | | | Application | | |||
| +---------------+ +-------------------------------+ | +---------------+ +-------------------------------+ | |||
| | Application | | MPTCP | | | Application | | MPTCP | | |||
| +---------------+ + - - - - - - - + - - - - - - - + | +---------------+ + - - - - - - - + - - - - - - - + | |||
| | TCP | | Subflow (TCP) | Subflow (TCP) | | | TCP | | Subflow (TCP) | Subflow (TCP) | | |||
| +---------------+ +-------------------------------+ | +---------------+ +-------------------------------+ | |||
| | IP | | IP | IP | | | IP | | IP | IP | | |||
| +---------------+ +-------------------------------+ | +---------------+ +-------------------------------+ ]]></artwork> | |||
| ]]></artwork> | ||||
| </figure> | </figure> | |||
| </section> | </section> | |||
| <section numbered="true" toc="default"> | ||||
| <section title="Terminology"> | <name>Terminology</name> | |||
| <t>This document makes use of a number of terms that are either MPTCP-sp | <t>This document makes use of a number of terms that are either MPTCP sp | |||
| ecific or have defined meaning in the context of MPTCP, as follows: | ecific or have defined meaning in the context of MPTCP, as follows: | |||
| <list style="hanging"> | </t> | |||
| <t hangText="Path:"> A sequence of links between a sender and a receiv | <dl newline="false" spacing="normal" indent="3"> | |||
| er, defined in this context by a 4-tuple of source and destination address/port | <dt>Path:</dt> | |||
| pairs.</t> | <dd> A sequence of links between a sender and a receiver, defined in t | |||
| <t hangText="Subflow:"> A flow of TCP segments operating over an indiv | his context by a 4-tuple of source and destination address&wj;/port pairs.</dd> | |||
| idual path, which forms part of a larger MPTCP connection. A subflow is started | <dt>Subflow:</dt> | |||
| and terminated similar to a regular TCP connection.</t> | <dd> A flow of TCP segments operating over an individual path, which f | |||
| <t hangText="(MPTCP) Connection:"> A set of one or more subflows, over | orms part of a larger MPTCP connection. A subflow is started and terminated simi | |||
| which an application can communicate between two hosts. There is a one-to-one m | larly to a regular TCP connection.</dd> | |||
| apping between a connection and an application socket.</t> | <dt>(MPTCP) Connection:</dt> | |||
| <t hangText="Data-level:"> The payload data is nominally transferred o | <dd> A set of one or more subflows, over which an application can comm | |||
| ver a connection, which in turn is transported over subflows. Thus, the term "d | unicate between two hosts. There is a one‑to‑one mapping between a c | |||
| ata-level" is synonymous with "connection level", in contrast to "subflow-level" | onnection and an application socket.</dd> | |||
| , which refers to properties of an individual subflow.</t> | <dt>Data-level:</dt> | |||
| <t hangText="Token:"> A locally unique identifier given to a multipath | <dd> The payload data is nominally transferred over a connection, whic | |||
| connection by a host. May also be referred to as a "Connection ID".</t> | h in turn is transported over subflows. Thus, the term "data-level" is synonymo | |||
| <t hangText="Host:"> An end host operating an MPTCP implementation, an | us with "connection-level", in contrast to "subflow-level", which refers to prop | |||
| d either initiating or accepting an MPTCP connection.</t> | erties of an individual subflow.</dd> | |||
| </list> | <dt>Token:</dt> | |||
| In addition to these terms, note that MPTCP's interpretation of, and eff | <dd> A locally unique identifier given to a multipath connection by a | |||
| ect on, regular single-path TCP semantics are discussed in <xref target="sec_sem | host. May also be referred to as a "Connection ID".</dd> | |||
| antics"/>.</t> | <dt>Host:</dt> | |||
| <dd> An end host operating an MPTCP implementation, and either initiat | ||||
| ing or accepting an MPTCP connection.</dd> | ||||
| </dl> | ||||
| <t> | ||||
| In addition to these terms, note that MPTCP's interpretation of, and eff | ||||
| ect on, regular single-path TCP semantics are discussed in <xref target="sec_sem | ||||
| antics" format="default"/>.</t> | ||||
| </section> | </section> | |||
| <section anchor="sec_operation" numbered="true" toc="default"> | ||||
| <section title="MPTCP Concept" anchor="sec_operation"> | <name>MPTCP Concept</name> | |||
| <t>This section provides a high-level summary of normal | <t>This section provides a high-level summary of normal | |||
| operation of MPTCP, and is illustrated by the scenario shown in | operation of MPTCP; this type of scenario is illustrated in | |||
| <xref target="fig_scenario"/>. A detailed description of operation is given in < | <xref target="fig_scenario" format="default"/>. A detailed description of how | |||
| xref target="sec_protocol"/>. | MPTCP operates is given in <xref target="sec_protocol" format="default"/>. | |||
| <list style="symbols"> | ||||
| <t>To a non-MPTCP-aware application, MPTCP will behave the same as n | ||||
| ormal TCP. Extended APIs could provide | ||||
| additional control to MPTCP-aware applications <xref target="RFC6897"/>. | ||||
| An application begins by opening a TCP socket in the normal way. | ||||
| MPTCP signaling and operation are handled by the MPTCP implementation. | ||||
| </t> | ||||
| <t>An MPTCP connection begins similarly to a regular TCP connection. | ||||
| This is | ||||
| illustrated in <xref target="fig_scenario"/> where an MPTCP connection is establ | ||||
| ished between | ||||
| addresses A1 and B1 on Hosts A and B, respectively.</t> | ||||
| <t>If extra paths are available, additional TCP sessions (termed MPT | ||||
| CP "subflows") | ||||
| are created on these paths, and are combined with the existing session, which co | ||||
| ntinues | ||||
| to appear as a single connection to the applications at both ends. The creation | ||||
| of the | ||||
| additional TCP session is illustrated between Address A2 on Host A and Address B | ||||
| 1 on | ||||
| Host B.</t> | ||||
| <t>MPTCP identifies multiple paths by the presence of multiple addre | ||||
| sses | ||||
| at hosts. Combinations of these multiple addresses equate to the additional path | ||||
| s. | ||||
| In the example, other potential paths that could be set up are A1&lt;-&gt;B2 and | ||||
| A2&lt;-&gt;B2. | ||||
| Although this additional session is shown as being initiated from A2, it could e | ||||
| qually have | ||||
| been initiated from B1 or B2.</t> | ||||
| <t>The discovery and setup of additional subflows | ||||
| will be achieved through a path management method; this document describes a mec | ||||
| hanism | ||||
| by which a host can initiate new subflows by using its own additional addresses, | ||||
| or by | ||||
| signaling its available addresses to the other host.</t> | ||||
| <t>MPTCP adds connection-level sequence numbers to allow the reassem | ||||
| bly of | ||||
| segments arriving on multiple subflows with differing network delays. </t> | ||||
| <t>Subflows are terminated as regular TCP connections, with a four-w | ||||
| ay FIN | ||||
| handshake. The MPTCP connection is terminated by a connection-level FIN.</t> | ||||
| </list> | ||||
| </t> | </t> | |||
| <?rfc needLines='17'?> | <figure anchor="fig_scenario"> | |||
| <figure align="center" anchor="fig_scenario" title="Example MPTCP Usag | <name>Example MPTCP Usage Scenario</name> | |||
| e Scenario"> | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| <artwork align="left"><![CDATA[ | ||||
| Host A Host B | Host A Host B | |||
| ------------------------ ------------------------ | ------------------------ ------------------------ | |||
| Address A1 Address A2 Address B1 Address B2 | Address A1 Address A2 Address B1 Address B2 | |||
| ---------- ---------- ---------- ---------- | ---------- ---------- ---------- ---------- | |||
| | | | | | | | | | | |||
| | (initial connection setup) | | | | (initial connection setup) | | | |||
| |----------------------------------->| | | |----------------------------------->| | | |||
| |<-----------------------------------| | | |<-----------------------------------| | | |||
| | | | | | | | | | | |||
| | (additional subflow setup) | | | (additional subflow setup) | | |||
| | |--------------------->| | | | |--------------------->| | | |||
| | |<---------------------| | | | |<---------------------| | | |||
| | | | | | | | | | | |||
| | | | | | | | | | ]]></artwork> | |||
| ]]></artwork> | </figure> | |||
| </figure> | <ul spacing="normal"> | |||
| <li>To a non-MPTCP-aware application, MPTCP will behave the same as no | ||||
| rmal TCP. Extended APIs could provide | ||||
| additional control to MPTCP-aware applications <xref target="RFC6897" format="de | ||||
| fault"/>. | ||||
| An application begins by opening a TCP socket in the normal way. | ||||
| MPTCP signaling and operation are handled by the MPTCP implementation. | ||||
| </li> | ||||
| <li>An MPTCP connection begins similarly to a regular TCP connection. | ||||
| This is | ||||
| illustrated in <xref target="fig_scenario" format="default"/>, where an MPTCP co | ||||
| nnection is established between | ||||
| addresses A1 and B1 on Hosts A and B, respectively.</li> | ||||
| <li>If extra paths are available, additional TCP sessions (termed MPTC | ||||
| P "subflows") | ||||
| are created on these paths and are combined with the existing session, which con | ||||
| tinues | ||||
| to appear as a single connection to the applications at both ends. The creation | ||||
| of the | ||||
| additional TCP session is illustrated between Address A2 on Host A and Address B | ||||
| 1 on | ||||
| Host B.</li> | ||||
| <li>MPTCP identifies multiple paths by the presence of multiple addres | ||||
| ses | ||||
| at hosts. Combinations of these multiple addresses equate to the additional path | ||||
| s. | ||||
| In the example, other potential paths that could be set up are A1<->B2 and | ||||
| A2<->B2. | ||||
| Although this additional session is shown as being initiated from A2, it could e | ||||
| qually have | ||||
| been initiated from B1 or B2.</li> | ||||
| <li>The discovery and setup of additional subflows | ||||
| will be achieved through a path management method; this document describes a mec | ||||
| hanism | ||||
| by which a host can initiate new subflows by using its own additional addresses | ||||
| or by | ||||
| signaling its available addresses to the other host.</li> | ||||
| <li>MPTCP adds connection-level sequence numbers to allow the reassemb | ||||
| ly of | ||||
| segments arriving on multiple subflows with differing network delays. </li> | ||||
| <li>Subflows are terminated as regular TCP connections, with a four‑ | ||||
| way FIN | ||||
| handshake. The MPTCP connection is terminated by a connection-level FIN.</li> | ||||
| </ul> | ||||
| </section> | </section> | |||
| <section numbered="true" toc="default"> | ||||
| <name>Requirements Language</name> | ||||
| <section title="Requirements Language"> | <t> | |||
| <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL | The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>", | |||
| NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", | "<bcp14>REQUIRED</bcp14>", "<bcp14>SHALL</bcp14>", "<bcp14>SHALL NOT</bcp14> | |||
| "MAY", and "OPTIONAL" in this document are to be interpreted as | ", | |||
| described in BCP 14 <xref target="RFC2119"/> <xref target="RFC8174" | "<bcp14>SHOULD</bcp14>", "<bcp14>SHOULD NOT</bcp14>", | |||
| /> | "<bcp14>RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>", | |||
| when, and only when, they appear in all capitals, as shown here.</t> | "<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document are to | |||
| be | ||||
| interpreted as described in BCP 14 <xref target="RFC2119"/> <xref | ||||
| target="RFC8174"/> when, and only when, they appear in all capitals, as | ||||
| shown here. | ||||
| </t> | ||||
| </section> | </section> | |||
| </section> | </section> | |||
| <section anchor="sec_overview" numbered="true" toc="default"> | ||||
| <section title="Operation Overview" anchor="sec_overview"> | <name>Operation Overview</name> | |||
| <t>This section presents a single description of common MPTCP operation, w | <t>This section presents a single description of common MPTCP operation, w | |||
| ith reference to the protocol operation. This is a high-level overview of the ke | ith reference to the protocol operation. This is a high-level overview of the ke | |||
| y functions; the full specification follows in <xref target="sec_protocol"/>. Ex | y functions; the full specification follows in <xref target="sec_protocol" forma | |||
| tensibility and negotiated features are not discussed here. Considerable referen | t="default"/>. Extensibility and negotiated features are not discussed here. Con | |||
| ce is made to symbolic names of MPTCP options throughout this section -- these a | siderable reference is made to symbolic names of MPTCP options throughout this s | |||
| re subtypes of the IANA-assigned MPTCP option (see <xref target="IANA"/>), and t | ection -- these are subtypes of the IANA‑assigned MPTCP option (see <xref | |||
| heir formats are defined in the detailed protocol specification that follows in | target="IANA" format="default"/>), and their formats are defined in the detailed | |||
| <xref target="sec_protocol"/>.</t> | protocol specification provided in <xref target="sec_protocol" format="default" | |||
| />.</t> | ||||
| <t>A Multipath TCP connection provides a bidirectional bytestream between two ho | <t>A Multipath TCP connection provides a bidirectional bytestream between | |||
| sts communicating like normal TCP and, thus, does not require any change to the | two hosts communicating like normal TCP and thus does not require any change to | |||
| applications. However, Multipath TCP enables the hosts to use different paths wi | the applications. However, Multipath TCP enables the hosts to use different path | |||
| th different IP addresses to exchange packets belonging to the MPTCP connection. | s with different IP addresses to exchange packets belonging to the MPTCP connect | |||
| A Multipath TCP connection appears like a normal TCP connection to an applicati | ion. A Multipath TCP connection appears like a normal TCP connection to an appli | |||
| on. However, to the network layer, each MPTCP subflow looks like a regular TCP f | cation. However, to the network layer, each MPTCP subflow looks like a regular T | |||
| low whose segments carry a new TCP option type. Multipath TCP manages the creati | CP flow whose segments carry a new TCP option type. Multipath TCP manages the cr | |||
| on, removal, and utilization of these subflows to send data. The number of subfl | eation, removal, and utilization of these subflows to send data. The number of s | |||
| ows that are managed within a Multipath TCP connection is not fixed and it can f | ubflows that are managed within a Multipath TCP connection is not fixed, and it | |||
| luctuate during the lifetime of the Multipath TCP connection.</t> | can fluctuate during the lifetime of the Multipath TCP connection.</t> | |||
| <t>All MPTCP operations are signaled with a TCP option -- a single numeric | ||||
| <t>All MPTCP operations are signaled with a TCP option -- a single numerical typ | al type for MPTCP, with "subtypes" for each MPTCP message. What follows is a sum | |||
| e for MPTCP, with "sub-types" for each MPTCP message. What follows is a summary | mary of the purpose and rationale of these messages.</t> | |||
| of the purpose and rationale of these messages.</t> | <section numbered="true" toc="default"> | |||
| <section title="Initiating an MPTCP Connection"> | <name>Initiating an MPTCP Connection</name> | |||
| <t>This is the same signaling as for initiating a normal TCP connection, but the | <t>This is the same signaling as for initiating a normal TCP connection, | |||
| SYN, SYN/ACK, and initial ACK (and data) packets also carry the MP_CAPABLE opti | but the SYN, SYN/ACK, and initial ACK (and data) packets also carry the MP_CAPA | |||
| on. This option has a variable length and serves multiple purposes. Firstly, it | BLE option. This option has a variable length and serves multiple purposes. Firs | |||
| verifies whether the remote host supports Multipath TCP; secondly, this option a | tly, it verifies whether the remote host supports Multipath TCP; secondly, this | |||
| llows the hosts to exchange some information to authenticate the establishment o | option allows the hosts to exchange some information to authenticate the establi | |||
| f additional subflows. Further details are given in <xref target="sec_init"/>.</ | shment of additional subflows. Further details are given in <xref target="sec_in | |||
| t> | it" format="default"/>.</t> | |||
| <artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
| <figure><artwork align="left"><![CDATA[ | ||||
| Host A Host B | Host A Host B | |||
| ------ ------ | ------ ------ | |||
| MP_CAPABLE -> | MP_CAPABLE -> | |||
| [flags] | [flags] | |||
| <- MP_CAPABLE | <- MP_CAPABLE | |||
| [B's key, flags] | [B's key, flags] | |||
| ACK + MP_CAPABLE (+ data) -> | ACK + MP_CAPABLE (+ data) -> | |||
| [A's key, B's key, flags, (data-level details)] | [A's key, B's key, flags, (data-level details)] ]]></artwork> | |||
| ]]></artwork></figure> | <t>Retransmission of the ACK + MP_CAPABLE can occur if it is not known i | |||
| f it has been received. The following diagrams show all possible exchanges for t | ||||
| <t>Retransmission of the ACK + MP_CAPABLE can occur if it is not known if it has | he initial subflow setup to ensure this reliability.</t> | |||
| been received. The following diagrams show all possible exchanges for the initi | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| al subflow setup to ensure this reliability.</t> | ||||
| <figure><artwork align="left"><![CDATA[ | ||||
| Host A (with data to send immediately) Host B | Host A (with data to send immediately) Host B | |||
| ------ ------ | ------ ------ | |||
| MP_CAPABLE -> | MP_CAPABLE -> | |||
| [flags] | [flags] | |||
| <- MP_CAPABLE | <- MP_CAPABLE | |||
| [B's key, flags] | [B's key, flags] | |||
| ACK + MP_CAPABLE + data -> | ACK + MP_CAPABLE + data -> | |||
| [A's key, B's key, flags, data-level details] | [A's key, B's key, flags, data-level details] | |||
| Host A (with data to send later) Host B | Host A (with data to send later) Host B | |||
| skipping to change at line 316 ¶ | skipping to change at line 308 ¶ | |||
| Host A Host B (sending first) | Host A Host B (sending first) | |||
| ------ ------ | ------ ------ | |||
| MP_CAPABLE -> | MP_CAPABLE -> | |||
| [flags] | [flags] | |||
| <- MP_CAPABLE | <- MP_CAPABLE | |||
| [B's key, flags] | [B's key, flags] | |||
| ACK + MP_CAPABLE -> | ACK + MP_CAPABLE -> | |||
| [A's key, B's key, flags] | [A's key, B's key, flags] | |||
| <- ACK + DSS + data | <- ACK + DSS + data | |||
| [data-level details] | [data-level details] ]]></artwork> | |||
| ]]></artwork></figure> | </section> | |||
| </section> | <section numbered="true" toc="default"> | |||
| <name>Associating a New Subflow with an Existing MPTCP Connection</name> | ||||
| <section title="Associating a New Subflow with an Existing MPTCP Connection"> | <t>The exchange of keys in the MP_CAPABLE handshake provides material th | |||
| <t>The exchange of keys in the MP_CAPABLE handshake provides material that can b | at can be used to authenticate the endpoints when new subflows will be set up. | |||
| e used to authenticate the endpoints when new subflows will be set up. | ||||
| Additional subflows begin in the same way as initiating a normal TCP connection, but the SYN, SYN/ACK, and ACK packets also carry the MP_JOIN option. </t> | Additional subflows begin in the same way as initiating a normal TCP connection, but the SYN, SYN/ACK, and ACK packets also carry the MP_JOIN option. </t> | |||
| <t>Host A initiates a new subflow between one of its addresses and one | ||||
| <t>Host A initiates a new subflow between one of its addresses and one of Host B | of Host B's addresses. The token -- generated from the key -- is used | |||
| 's addresses. The token -- generated from the key -- is used to identify which M | to identify which MPTCP connection it is joining, and the Hash‑bas | |||
| PTCP connection it is joining, and the HMAC is used for authentication. The Hash | ed | |||
| -based Message Authentication Code (HMAC) uses the keys exchanged in the MP_CAPA | Message Authentication Code (HMAC) is used for authentication. The HMAC | |||
| BLE handshake, and the random numbers (nonces) exchanged in these MP_JOIN option | uses the keys exchanged in the MP_CAPABLE handshake and the random numbers (nonc | |||
| s. MP_JOIN also contains flags and an Address ID that can be used to refer to th | es) exchanged in these MP_JOIN options. MP_JOIN also contains flags and an Addre | |||
| e source address without the sender needing to know if it has been changed by a | ss ID that can be used to refer to the source address without the sender needing | |||
| NAT. Further details are in <xref target="sec_join"/>.</t> | to know if it has been changed by a NAT. Further details are given in <xref tar | |||
| get="sec_join" format="default"/>.</t> | ||||
| <figure><artwork align="left"><![CDATA[ | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| Host A Host B | Host A Host B | |||
| ------ ------ | ------ ------ | |||
| MP_JOIN -> | MP_JOIN -> | |||
| [B's token, A's nonce, | [B's token, A's nonce, | |||
| A's Address ID, flags] | A's Address ID, flags] | |||
| <- MP_JOIN | <- MP_JOIN | |||
| [B's HMAC, B's nonce, | [B's HMAC, B's nonce, | |||
| B's Address ID, flags] | B's Address ID, flags] | |||
| ACK + MP_JOIN -> | ACK + MP_JOIN -> | |||
| [A's HMAC] | [A's HMAC] | |||
| <- ACK | <- ACK ]]></artwork> | |||
| ]]></artwork></figure> | </section> | |||
| </section> | <section numbered="true" toc="default"> | |||
| <name>Informing the Other Host about Another Potential Address</name> | ||||
| <section title="Informing the Other Host about Another Potential Address"> | <t>The set of IP addresses associated to a multihomed host may change du | |||
| <t>The set of IP addresses associated to a multihomed host may change during the | ring the lifetime of an MPTCP connection. MPTCP supports the addition and remova | |||
| lifetime of an MPTCP connection. MPTCP supports the addition and removal of add | l of addresses on a host both implicitly and explicitly. If Host A has establish | |||
| resses on a host both implicitly and explicitly. If Host A has established a sub | ed a subflow starting at address&wj;/port pair IP#-A1 and wants to open a second | |||
| flow starting at address/port pair IP#-A1 and wants to open a second subflow sta | subflow starting at address&wj;/port pair IP#-A2, it simply initiates the estab | |||
| rting at address/port pair IP#-A2, it simply initiates the establishment of the | lishment of the subflow as explained above. The remote host will then be implici | |||
| subflow as explained above. The remote host will then be implicitly informed abo | tly informed about the new address.</t> | |||
| ut the new address.</t> | <t>In some circumstances, a host may want to advertise to the remote | |||
| host the availability of an address without establishing a new subflow | ||||
| <t>In some circumstances, a host may want to advertise to the remote host the av | -- for example, when a NAT prevents setup in one direction. In the exampl | |||
| ailability of an address without establishing a new subflow, for example, when a | e below, Host A informs Host B about its alternative IP address&wj;/port pa | |||
| NAT prevents setup in one direction. In the example below, Host A informs Host | ir (IP#-A2). Host B may later send an MP_JOIN to this new address. The ADD_ADDR | |||
| B about its alternative IP address/port pair (IP#-A2). Host B may later send an | option contains an HMAC to authenticate the address as having been sent from the | |||
| MP_JOIN to this new address. The ADD_ADDR option contains a HMAC to authenticat | originator of the connection. The receiver of this option echoes it back to the | |||
| e the address as having been sent from the originator of the connection. The rec | client to indicate successful receipt. Further details are given in <xref targe | |||
| eiver of this option echoes it back to the client to indicate successful receipt | t="sec_add_address" format="default"/>.</t> | |||
| . Further details are in <xref target="sec_add_address"/>.</t> | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| <figure><artwork align="left"><![CDATA[ | ||||
| Host A Host B | Host A Host B | |||
| ------ ------ | ------ ------ | |||
| ADD_ADDR -> | ADD_ADDR -> | |||
| [Echo-flag=0, | [Echo-flag=0, | |||
| IP#-A2, | IP#-A2, | |||
| IP#-A2's Address ID, | IP#-A2's Address ID, | |||
| HMAC of IP#-A2] | HMAC of IP#-A2] | |||
| <- ADD_ADDR | <- ADD_ADDR | |||
| [Echo-flag=1, | [Echo-flag=1, | |||
| IP#-A2, | IP#-A2, | |||
| IP#-A2's Address ID, | IP#-A2's Address ID, | |||
| HMAC of IP#-A2] | HMAC of IP#-A2] ]]></artwork> | |||
| ]]></artwork></figure> | <t>There is a corresponding signal for address removal, making use of | |||
| the Address ID that is signaled in the ADD_ADDR handshake. | ||||
| <t>There is a corresponding signal for address removal, making use of the Addres | ||||
| s ID that is signaled in the add address handshake. Further details in <xref tar | ||||
| get="sec_remove_addr"/>.</t> | ||||
| <figure><artwork align="left"><![CDATA[ | Further details are given in <xref target="sec_remove_addr" format="default"/>. | |||
| </t> | ||||
| <artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
| Host A Host B | Host A Host B | |||
| ------ ------ | ------ ------ | |||
| REMOVE_ADDR -> | REMOVE_ADDR -> | |||
| [IP#-A2's Address ID] | [IP#-A2's Address ID] ]]></artwork> | |||
| ]]></artwork></figure> | </section> | |||
| </section> | <section numbered="true" toc="default"> | |||
| <name>Data Transfer Using MPTCP</name> | ||||
| <section title="Data Transfer Using MPTCP"> | <t>To ensure reliable, in-order delivery of data over subflows that may | |||
| <t>To ensure reliable, in-order delivery of data over subflows that may appear a | appear and disappear at any time, MPTCP uses a 64-bit Data Sequence Number (DSN) | |||
| nd disappear at any time, MPTCP uses a 64-bit data sequence number (DSN) to numb | to number all data sent over the MPTCP connection. Each subflow has its own 32- | |||
| er all data sent over the MPTCP connection. Each subflow has its own 32-bit sequ | bit sequence number space, utilizing the regular TCP sequence number header, and | |||
| ence number space, utilising the regular TCP sequence number header, and an MPTC | an MPTCP option maps the subflow sequence space to the data sequence space. In | |||
| P option maps the subflow sequence space to the data sequence space. In this way | this way, data can be retransmitted on different subflows (mapped to the same DS | |||
| , data can be retransmitted on different subflows (mapped to the same DSN) in th | N) in the event of failure.</t> | |||
| e event of failure.</t> | <t>The Data Sequence Signal (DSS) carries the Data Sequence Mapping. The | |||
| Data Sequence Mapping consists of the subflow sequence number, data sequence nu | ||||
| <t>The Data Sequence Signal (DSS) carries the Data Sequence Mapping. The Data Se | mber, and length for which this mapping is valid. This option can also carry a c | |||
| quence Mapping consists of the subflow sequence number, data sequence number, an | onnection-level acknowledgment (the "Data ACK") for the received DSN.</t> | |||
| d length for which this mapping is valid. This option can also carry a connectio | <t>With MPTCP, all subflows share the same receive buffer and advertise | |||
| n-level acknowledgment (the "Data ACK") for the received DSN.</t> | the same receive window. There are two levels of acknowledgment in MPTCP. Regula | |||
| r TCP acknowledgments are used on each subflow to acknowledge the reception of t | ||||
| <t>With MPTCP, all subflows share the same receive buffer and advertise the same | he segments sent over the subflow independently of their DSN. In addition, there | |||
| receive window. There are two levels of acknowledgment in MPTCP. Regular TCP ac | are connection-level acknowledgments for the data sequence space. These acknowl | |||
| knowledgments are used on each subflow to acknowledge the reception of the segme | edgments track the advancement of the bytestream and slide the receive window.</ | |||
| nts sent over the subflow independently of their DSN. In addition, there are con | t> | |||
| nection-level acknowledgments for the data sequence space. These acknowledgments | <t>Further details are given in <xref target="sec_generalop" format="def | |||
| track the advancement of the bytestream and slide the receiving window.</t> | ault"/>.</t> | |||
| <artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
| <t>Further details are in <xref target="sec_generalop"/>.</t> | ||||
| <figure><artwork align="left"><![CDATA[ | ||||
| Host A Host B | Host A Host B | |||
| ------ ------ | ------ ------ | |||
| DSS -> | DSS -> | |||
| [Data Sequence Mapping] | [Data Sequence Mapping] | |||
| [Data ACK] | [Data ACK] | |||
| [Checksum] | [Checksum] ]]></artwork> | |||
| ]]></artwork></figure> | </section> | |||
| </section> | <section numbered="true" toc="default"> | |||
| <name>Requesting a Change in a Path's Priority</name> | ||||
| <section title="Requesting a Change in a Path's Priority"> | <t>Hosts can indicate at initial subflow setup whether they wish the sub | |||
| <t>Hosts can indicate at initial subflow setup whether they wish the subflow to | flow to be used as a regular or backup path -- a backup path only being used if | |||
| be used as a regular or backup path -- a backup path only being used if there ar | there are no regular paths available. During a connection, Host A can request a | |||
| e no regular paths available. During a connection, Host A can request a change i | change in the priority of a subflow through the MP_PRIO signal to Host B. Furthe | |||
| n the priority of a subflow through the MP_PRIO signal to Host B. Further detail | r details are given in <xref target="sec_policy" format="default"/>.</t> | |||
| s are in <xref target="sec_policy"/>.</t> | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| <figure><artwork align="left"><![CDATA[ | ||||
| Host A Host B | Host A Host B | |||
| ------ ------ | ------ ------ | |||
| MP_PRIO -> | MP_PRIO -> ]]></artwork> | |||
| ]]></artwork></figure> | </section> | |||
| </section> | <section numbered="true" toc="default"> | |||
| <name>Closing an MPTCP Connection</name> | ||||
| <section title="Closing an MPTCP Connection"> | <t>When a host wants to close an existing subflow but not the whole conn | |||
| <t>When a host wants to close an existing subflow, but not the whole connection, | ection, it can initiate a regular TCP FIN/ACK exchange.</t> | |||
| it can initiate a regular TCP FIN/ACK exchange.</t> | <t>When Host A wants to inform Host B that it has no more data to send, | |||
| it signals this "Data FIN" as part of the DSS (see above). It has the same seman | ||||
| <t>When Host A wants to inform Host B that it has no more data to send, it signa | tics and behavior as a regular TCP FIN, but at the connection level. Once all th | |||
| ls this "Data FIN" as part of the Data Sequence Signal (see above). It has the s | e data on the MPTCP connection has been successfully received, this message is a | |||
| ame semantics and behavior as a regular TCP FIN, but at the connection level. On | cknowledged at the connection level with a Data ACK. Further details are given i | |||
| ce all the data on the MPTCP connection has been successfully received, then thi | n <xref target="sec_close" format="default"/>.</t> | |||
| s message is acknowledged at the connection level with a Data ACK. Further detai | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| ls are in <xref target="sec_close"/>.</t> | ||||
| <figure><artwork align="left"><![CDATA[ | ||||
| Host A Host B | Host A Host B | |||
| ------ ------ | ------ ------ | |||
| DSS -> | DSS -> | |||
| [Data FIN] | [Data FIN] | |||
| <- DSS | <- DSS | |||
| [Data ACK] | [Data ACK] ]]></artwork> | |||
| ]]></artwork></figure> | <t>There is an additional method of connection closure, referred to as | |||
| "Fast Close", which is analogous to closing a single-path TCP | ||||
| <t>There is an additional method of connection closure, referred to as "Fast Clo | connection with a RST signal. The MP_FASTCLOSE signal is used to | |||
| se", which is analogous to closing a single-path TCP connection with a RST signa | indicate to the peer that the connection will be abruptly closed and | |||
| l. The MP_FASTCLOSE signal is used to indicate to the peer that the connection w | no data will be accepted anymore. This can be used on an ACK (which | |||
| ill be abruptly closed and no data will be accepted anymore. This can be used on | ensures reliability of the signal) or a RST (which does not). | |||
| an ACK (ensuring reliability of the signal), or a RST (which is not). Both exam | Both examples are shown in the following diagrams. Further details are given in | |||
| ples are shown in the following diagrams. Further details are in <xref target="s | <xref target="sec_fastclose" format="default"/>.</t> | |||
| ec_fastclose"/>.</t> | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| <figure><artwork align="left"><![CDATA[ | ||||
| Host A Host B | Host A Host B | |||
| ------ ------ | ------ ------ | |||
| ACK + MP_FASTCLOSE -> | ACK + MP_FASTCLOSE -> | |||
| [B's key] | [B's key] | |||
| [RST on all other subflows] -> | [RST on all other subflows] -> | |||
| <- [RST on all subflows] | <- [RST on all subflows] | |||
| Host A Host B | Host A Host B | |||
| ------ ------ | ------ ------ | |||
| RST + MP_FASTCLOSE -> | RST + MP_FASTCLOSE -> | |||
| [B's key] [on all subflows] | [B's key] [on all subflows] | |||
| <- [RST on all subflows] | <- [RST on all subflows] ]]></artwork> | |||
| ]]></artwork></figure> | </section> | |||
| </section> | <section numbered="true" toc="default"> | |||
| <name>Notable Features</name> | ||||
| <section title="Notable Features"> | <t>It is worth highlighting that MPTCP's signaling has been designed wit | |||
| <t>It is worth highlighting that MPTCP's signaling has been designed with severa | h several key requirements in mind: | |||
| l key requirements in mind: | ||||
| <list style="symbols"> | </t> | |||
| <t>To cope with NATs on the path, addresses are referred to by Address IDs, in c | <ul spacing="normal"> | |||
| ase the IP packet's source | <li>To cope with NATs on the path, addresses are referred to by Addres | |||
| s IDs, in case the IP packet's source | ||||
| address gets changed by a NAT. Setting up a new TCP flow is not possible if the receiver of the SYN is behind a NAT; | address gets changed by a NAT. Setting up a new TCP flow is not possible if the receiver of the SYN is behind a NAT; | |||
| to allow subflows to be created when either end is behind a NAT, MPTCP uses the | to allow subflows to be created when either end is behind a NAT, MPTCP uses the | |||
| ADD_ADDR message. </t> | ADD_ADDR message. </li> | |||
| <li>MPTCP falls back to ordinary TCP if MPTCP operation is not | ||||
| <t>MPTCP falls back to ordinary TCP if MPTCP operation is not possible, for exam | possible -- for example, if one host is not MPTCP capable or if a middlebox alt | |||
| ple, if one host is not MPTCP capable or if a middlebox alters the payload. This | ers the payload. This is discussed in <xref target="sec_fallback" format="defaul | |||
| is discussed in <xref target="sec_fallback"/>.</t> | t"/>.</li> | |||
| <li>To address the threats identified in <xref target="RFC6181" | ||||
| <t>To address the threats identified in <xref target="RFC6181"/>, the following | format="default"/>, the following steps are taken: keys are sent in | |||
| steps are taken: keys are sent in the clear in the MP_CAPABLE messages; MP_JOIN | the clear in the MP_CAPABLE messages; MP_JOIN messages are secured | |||
| messages are secured with HMAC-SHA256 (<xref target="RFC2104"/>, <xref target="R | with HMAC-SHA256 (<xref target="RFC2104" format="default"/> using | |||
| FC6234"/>) using those keys; and standard TCP validity checks are made on the ot | the algorithm in <xref target="RFC6234" format="default"/>) using thos | |||
| her messages (ensuring sequence numbers are in-window <xref target="RFC5961"/>). | e keys; and standard | |||
| Residual threats to MPTCP v0 were identified in <xref target="RFC7430"/>, and t | TCP validity checks are made on the other messages (ensuring that | |||
| hose affecting the protocol (i.e. modification to ADD_ADDR) have been incorporat | sequence numbers are in‑window <xref target="RFC5961" | |||
| ed in this document. Further discussion of security can be found in <xref target | format="default"/>). | |||
| ="sec_security"/>.</t> | Residual threats to MPTCP v0 were identified in <xref target="RFC7430" | |||
| </list></t> | format="default"/>, and those affecting the protocol (i.e., modifications to | |||
| </section> | ADD_ADDR) have been incorporated in this document. | |||
| Further discussion of security can be found in <xref target="sec_security" form | ||||
| at="default"/>.</li> | ||||
| </ul> | ||||
| </section> | ||||
| </section> | </section> | |||
| <section anchor="sec_protocol" numbered="true" toc="default"> | ||||
| <section title="MPTCP Protocol" anchor="sec_protocol"> | <name>MPTCP Operations: An Overview</name> | |||
| <t>This section describes the operation of the MPTCP protocol, and is subd | <t>This section describes the operation of MPTCP. The | |||
| ivided into sections for each key part of the protocol operation.</t> | subsections below discuss each key part of the protocol operation.</t> | |||
| <t>All MPTCP operations are signaled using optional TCP header fields. A s | <t>All MPTCP operations are signaled using optional TCP header fields. A s | |||
| ingle TCP option number ("Kind") has been assigned by IANA for MPTCP (see <xref | ingle TCP option number ("Kind") has been assigned by IANA for MPTCP (see <xref | |||
| target="IANA"/>), and then individual messages will be determined by a "subtype" | target="IANA" format="default"/>), and then individual messages will be determin | |||
| , the values of which are also stored in an IANA registry (and are also listed i | ed by a "subtype", the values of which are also stored in an IANA registry (and | |||
| n <xref target="IANA"/>). As with all TCP options, the Length field is specified | are also listed in <xref target="IANA" format="default"/>). As with all TCP opti | |||
| in bytes, and includes the 2 bytes of Kind and Length.</t> | ons, the Length field is specified in bytes and includes the 2 bytes of Kin | |||
| <t>Throughout this document, when reference is made to an MPTCP option by | d and Length.</t> | |||
| symbolic name, such as "MP_CAPABLE", this refers to a TCP option with the single | <t>Throughout this document, when reference is made to an MPTCP option by | |||
| MPTCP option type, and with the subtype value of the symbolic name as defined i | symbolic name, such as "MP_CAPABLE", this refers to a TCP option with the single | |||
| n <xref target="IANA"/>. This subtype is a 4-bit field -- the first 4 bits of th | MPTCP option type, and with the subtype value of the symbolic name as defined i | |||
| e option payload, as shown in <xref target="fig_option"/>. The MPTCP messages ar | n <xref target="IANA" format="default"/>. This subtype is a 4-bit field -- the f | |||
| e defined in the following sections.</t> | irst 4 bits of the option payload, as shown in <xref target="fig_option" format= | |||
| "default"/>. The MPTCP messages are defined in the following sections.</t> | ||||
| <?rfc needLines='8'?> | <figure anchor="fig_option"> | |||
| <figure align="center" anchor="fig_option" title="MPTCP Option Format"> | <name>MPTCP Option Format</name> | |||
| <artwork align="left"><![CDATA[ | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| 1 2 3 | 1 2 3 | |||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
| +---------------+---------------+-------+-----------------------+ | +---------------+---------------+-------+-----------------------+ | |||
| | Kind | Length |Subtype| | | | Kind | Length |Subtype| | | |||
| +---------------+---------------+-------+ | | +---------------+---------------+-------+ | | |||
| | Subtype-specific data | | | Subtype-specific data | | |||
| | (variable length) | | | (variable length) | | |||
| +---------------------------------------------------------------+ | +---------------------------------------------------------------+ ]]></artwork | |||
| ]]></artwork> | > | |||
| </figure> | </figure> | |||
| <t>Those MPTCP options associated with subflow initiation are used on | ||||
| <t>Those MPTCP options associated with subflow initiation are used on pack | packets with the SYN flag set. Additionally, there is one MPTCP option | |||
| ets with the SYN flag set. Additionally, there is one MPTCP option for signaling | for signaling metadata to ensure that segmented data can be recombined for | |||
| metadata to ensure segmented data can be recombined for delivery to the applica | delivery to the application.</t> | |||
| tion.</t> | <t>The remaining options, however, are signals that do not need to be on | |||
| <t>The remaining options, however, are signals that do not need to be on a | a specific packet, such as those for signaling additional | |||
| specific packet, such as those for signaling additional addresses. Whilst an im | addresses. While an implementation may desire to send MPTCP options as | |||
| plementation may desire to send MPTCP options as soon as possible, it may not be | soon as possible, it may not be possible to combine all desired options | |||
| possible to combine all desired options (both those for MPTCP and for regular T | (both those for MPTCP and for regular TCP, such as SACK (selective | |||
| CP, such as SACK (selective acknowledgment) <xref target="RFC2018"/>) on a singl | acknowledgment) <xref target="RFC2018" format="default"/>) on a single | |||
| e packet. Therefore, an implementation may choose to send duplicate ACKs contain | packet. Therefore, an implementation may choose to send duplicate ACKs | |||
| ing the additional signaling information. This changes the semantics of a duplic | containing the additional signaling information. This changes the | |||
| ate ACK; these are usually only sent as a signal of a lost segment <xref target= | semantics of a duplicate ACK; these are usually only sent as a signal of | |||
| "RFC5681"/> in regular TCP. Therefore, an MPTCP implementation receiving a dupli | a lost segment <xref target="RFC5681" format="default"/> in regular | |||
| cate ACK that contains an MPTCP option MUST NOT treat it as a signal of congesti | TCP. Therefore, an MPTCP implementation receiving a duplicate ACK that | |||
| on. Additionally, an MPTCP implementation SHOULD NOT send more than two duplicat | contains an MPTCP option <bcp14>MUST NOT</bcp14> treat it as a signal of | |||
| e ACKs in a row for the purposes of sending MPTCP options alone, in order to ens | congestion. Additionally, an MPTCP implementation <bcp14>SHOULD | |||
| ure no middleboxes misinterpret this as a sign of congestion.</t> | NOT</bcp14> send more than two duplicate ACKs in a row for the purposes | |||
| <t>Furthermore, standard TCP validity checks (such as ensuring the sequenc | of sending MPTCP options alone, in order to ensure that no middleboxes mis | |||
| e number and acknowledgment number are within window) MUST be undertaken before | interpret this as a sign of congestion.</t> | |||
| processing any MPTCP signals, as described in <xref target="RFC5961"/>, and init | <t>Furthermore, standard TCP validity checks (such as ensuring that the | |||
| ial subflow sequence numbers SHOULD be generated according to the recommendation | sequence number and acknowledgment number are within the window) <bcp14>MU | |||
| s in <xref target="RFC6528"/>.</t> | ST</bcp14> be undertaken before processing any MPTCP signals, as described in <x | |||
| ref target="RFC5961" format="default"/>, and initial subflow sequence numbers <b | ||||
| <section title="Connection Initiation" anchor="sec_init"> | cp14>SHOULD</bcp14> be generated according to the recommendations in <xref targe | |||
| t="RFC6528" format="default"/>.</t> | ||||
| <section anchor="sec_init" numbered="true" toc="default"> | ||||
| <name>Connection Initiation</name> | ||||
| <t>Connection initiation begins with a SYN, SYN/ACK, ACK exchange | <t>Connection initiation begins with a SYN, SYN/ACK, ACK exchange | |||
| on a single path. Each packet | on a single path. Each packet | |||
| contains the Multipath Capable (MP_CAPABLE) MPTCP option | contains the Multipath Capable (MP_CAPABLE) MPTCP option | |||
| (<xref target="tcpm_capable"/>). This option declares its | (<xref target="tcpm_capable" format="default"/>). This option declares i | |||
| sender is capable of performing Multipath TCP and wishes to do | ts | |||
| sender capable of performing Multipath TCP and wishes to do | ||||
| so on this particular connection.</t> | so on this particular connection.</t> | |||
| <figure anchor="tcpm_capable"> | ||||
| <t>The MP_CAPABLE exchange in this specification (v1) is different to | <name>Multipath Capable (MP_CAPABLE) Option</name> | |||
| <artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
| 1 2 3 | ||||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||||
| +---------------+---------------+-------+-------+---------------+ | ||||
| | Kind | Length |Subtype|Version|A|B|C|D|E|F|G|H| | ||||
| +---------------+---------------+-------+-------+---------------+ | ||||
| | Option Sender's Key (64 bits) | | ||||
| | (if option Length > 4) | | ||||
| | | | ||||
| +---------------------------------------------------------------+ | ||||
| | Option Receiver's Key (64 bits) | | ||||
| | (if option Length > 12) | | ||||
| | | | ||||
| +-------------------------------+-------------------------------+ | ||||
| | Data-Level Length (16 bits) | Checksum (16 bits, optional) | | ||||
| +-------------------------------+-------------------------------+ ]]></artwork | ||||
| > | ||||
| </figure> | ||||
| <t>The MP_CAPABLE exchange in this specification (v1) is different than | ||||
| that specified in v0. If a host supports multiple versions | that specified in v0. If a host supports multiple versions | |||
| of MPTCP, the sender of the MP_CAPABLE option SHOULD signal the | of MPTCP, the sender of the MP_CAPABLE option <bcp14>SHOULD</bcp14> signal the | |||
| highest version number it supports. In return, in its MP_CAPABLE option, | highest version number it supports. In return, in its MP_CAPABLE option, | |||
| the receiver will signal the version number it wishes to use, which MUST | the receiver will signal the version number it wishes to use, which <bcp14>MUST</bcp14> | |||
| be equal to or lower than the version number indicated in the initial | be equal to or lower than the version number indicated in the initial | |||
| MP_CAPABLE. | MP_CAPABLE. | |||
| There is a caveat though with respect to this version negotiation with | There is a caveat, though, with respect to this version negotiation with | |||
| old listeners that only support v0. A listener that supports v0 expects that | old listeners that only support v0. A listener that supports v0 expects that | |||
| the MP_CAPABLE option in the SYN-segment includes the initiator's key. I | the MP_CAPABLE option in the SYN segment will include the initiator's | |||
| f | key. If, however, | |||
| the initiator however already upgraded to v1, it won't include the key i | the initiator already upgraded to v1, it won't include the key in the | |||
| n the | SYN segment. Thus, the listener will ignore the MP_CAPABLE of this SYN s | |||
| SYN-segment. Thus, the listener will ignore the MP_CAPABLE of this SYN-s | egment | |||
| egment | and reply with a SYN/ACK that does not include an MP_CAPABLE. The initia | |||
| and reply with a SYN/ACK that does not include an MP_CAPABLE. The initia | tor <bcp14>MAY</bcp14> | |||
| tor MAY | choose to immediately fall back to TCP or <bcp14>MAY</bcp14> choose to a | |||
| choose to immediately fall back to TCP or MAY choose to attempt a connec | ttempt a connection | |||
| tion | ||||
| using MPTCP v0 (if the initiator supports v0), in order to discover whether the | using MPTCP v0 (if the initiator supports v0), in order to discover whether the | |||
| listener supports the earlier version of MPTCP. In general a MPTCP v0 co | listener supports the earlier version of MPTCP. In general, an MPTCP v0 | |||
| nnection | connection | |||
| is likely to be preferred to a TCP one, however in a particular deployme | will likely be preferred over a TCP connection; however, in a particular | |||
| nt scenario | deployment scenario, | |||
| it may be known that the listener is unlikely to support MPTCPv0 and so | it may be known that the listener is unlikely to support MPTCP v0 and so | |||
| the | the | |||
| initiator may prefer not to attempt a v0 connection. An initiator MAY ca | initiator may prefer not to attempt a v0 connection. An initiator <bcp14 | |||
| che | >MAY</bcp14> cache | |||
| information for a peer about what version of MPTCP it supports if any, a | information for a peer about what version of MPTCP it supports, if any, | |||
| nd use | and use | |||
| this information for future connection attempts.</t> | this information for future connection attempts.</t> | |||
| <t>The MP_CAPABLE option is of variable length, with different fields | ||||
| <t>The MP_CAPABLE option is variable-length, with different fields | included, depending on which packet the option is used on. The full | |||
| included depending on which packet the option is used on. The full | MP_CAPABLE option is shown in <xref target="tcpm_capable" format="defaul | |||
| MP_CAPABLE option is shown in <xref target="tcpm_capable"/>.</t> | t"/>.</t> | |||
| <t>The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK packets | ||||
| <?rfc needLines='10'?> | that start the first subflow of an MPTCP connection, as well as the first packe | |||
| <figure align="center" anchor="tcpm_capable" title="Multipath Capable (M | t that carries data, if the initiator wishes to send first. The data carried by | |||
| P_CAPABLE) Option"> | each option is as follows, where A = initiator and B = listener. | |||
| <artwork align="left"><![CDATA[ | </t> | |||
| 1 2 3 | <ul spacing="normal"> | |||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | <li>SYN (A->B): only the first 4 octets (Length = 4).</li> | |||
| +---------------+---------------+-------+-------+---------------+ | <li>SYN/ACK (B->A): B's key for this connection (Length = 12).</li> | |||
| | Kind | Length |Subtype|Version|A|B|C|D|E|F|G|H| | <li>ACK (no data) (A->B): A's key followed by B's key (Length = 20) | |||
| +---------------+---------------+-------+-------+---------------+ | .</li> | |||
| | Option Sender's Key (64 bits) | | <li>ACK (with first data) (A->B): A's key followed by B's key follo | |||
| | (if option Length > 4) | | wed by Data-Level Length, and optional Checksum (Length = 22 or 24).</li> | |||
| | | | </ul> | |||
| +---------------------------------------------------------------+ | <t> | |||
| | Option Receiver's Key (64 bits) | | The contents of the option are determined by the SYN and ACK flags of th | |||
| | (if option Length > 12) | | e packet, along with the option's Length field. In <xref target="tcpm_capable" f | |||
| | | | ormat="default"/>, "Sender" and "Receiver" refer to the sender or receiver of th | |||
| +-------------------------------+-------------------------------+ | e TCP packet (which can be either host).</t> | |||
| | Data-Level Length (16 bits) | Checksum (16 bits, optional) | | ||||
| +-------------------------------+-------------------------------+ | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| <t>The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK packets | ||||
| that start the first subflow of an MPTCP connection, as well as the first packe | ||||
| t that carries data, if the initiator wishes to send first. The data carried by | ||||
| each option is as follows, where A = initiator and B = listener. | ||||
| <list style="symbols"> | ||||
| <t>SYN (A->B): only the first four octets (Length = 4).</t> | ||||
| <t>SYN/ACK (B->A): B's Key for this connection (Length = 12).</t> | ||||
| <t>ACK (no data) (A->B): A's Key followed by B's Key (Length = 20 | ||||
| ).</t> | ||||
| <t>ACK (with first data) (A->B): A's Key followed by B's Key foll | ||||
| owed by Data-Level Length, and optional Checksum (Length = 22 or 24).</t> | ||||
| </list> | ||||
| The contents of the option is determined by the SYN and ACK flags of the | ||||
| packet, along with the option's length field. For the diagram shown in <xref ta | ||||
| rget="tcpm_capable"/>, "sender" and "receiver" refer to the sender or receiver o | ||||
| f the TCP packet (which can be either host).</t> | ||||
| <t>The initial SYN, containing just the MP_CAPABLE header, is used | <t>The initial SYN, containing just the MP_CAPABLE header, is used | |||
| to define the version of MPTCP being requested, as well as exchanging | to define the version of MPTCP being requested and also to exchange | |||
| flags to negotiate connection features, described later.</t> | flags to negotiate connection features, as described later.</t> | |||
| <t>This option is used to declare the 64-bit keys that the end hosts | ||||
| <t>This option is used to declare the 64-bit keys that the end hosts hav | have generated for this MPTCP connection. These keys are used to | |||
| e generated for this MPTCP connection. These keys are used to authenticate the a | authenticate the addition of future subflows to this connection. This | |||
| ddition of future subflows to this connection. This is the only time the key wil | is the only time the key will be sent in the clear on the wire (unless " | |||
| l be sent in clear on the wire (unless "fast close", <xref target="sec_fastclose | Fast Close" (<xref target="sec_fastclose" format="default"/>) is used); all futu | |||
| "/>, is used); all future subflows will identify the connection using a 32-bit " | re subflows will identify the connection using a 32-bit "token". This token is a | |||
| token". This token is a cryptographic hash of this key. The algorithm for this p | cryptographic hash of this key. The algorithm for this process is dependent on | |||
| rocess is dependent on the authentication algorithm selected; the method of sele | the authentication algorithm selected; the method of selection is defined later | |||
| ction is defined later in this section.</t> | in this section.</t> | |||
| <t>Upon reception of the initial SYN segment, a stateful server generate | ||||
| <t>Upon reception of the initial SYN-segment, a stateful server generate | s a random key and replies with a SYN/ACK. The key's method of generation is imp | |||
| s a random key and replies with a SYN/ACK. The key's method of generation is imp | lementation specific. The key <bcp14>MUST</bcp14> be hard to guess, and it <bcp1 | |||
| lementation specific. The key MUST be hard to guess, and it MUST be unique for t | 4>MUST</bcp14> be unique for the sending host across all its current MPTCP conne | |||
| he sending host across all its current MPTCP connections. Recommendations for ge | ctions. Recommendations for generating random numbers for use in keys are given | |||
| nerating random numbers for use in keys are given in <xref target="RFC4086"/>. C | in <xref target="RFC4086" format="default"/>. Connections will be indexed at eac | |||
| onnections will be indexed at each host by the token (a one-way hash of the key) | h host by the token (a one-way hash of the key). Therefore, an implementation wi | |||
| . Therefore, an implementation will require a mapping from each token to the cor | ll require a mapping from each token to the corresponding connection, and in tur | |||
| responding connection, and in turn to the keys for the connection.</t> | n to the keys for the connection.</t> | |||
| <t>There is a risk that two different keys will hash to the same | ||||
| <t>There is a risk that two different keys will hash to the same token. | token. The risk of hash collisions is usually small, unless the host | |||
| The risk of hash collisions is usually small, unless the host is handling many t | is handling many tens of thousands of connections. Therefore, an | |||
| ens of thousands of connections. Therefore, an implementation SHOULD check its l | implementation <bcp14>SHOULD</bcp14> check its list of connection | |||
| ist of connection tokens to ensure there is no collision before sending its key, | tokens to ensure that there is no collision before sending its key, | |||
| and if there is, then it should generate a new key. This would, however, be cos | and if there is, then it should generate a new key. This would, | |||
| tly for a server with thousands of connections. The subflow handshake mechanism | however, be costly for a server with thousands of connections. The | |||
| (<xref target="sec_join"/>) will ensure that new subflows only join the correct | subflow handshake mechanism (<xref target="sec_join" | |||
| connection, however, through the cryptographic handshake, as well as checking th | format="default"/>) will ensure that new subflows only join the | |||
| e connection tokens in both directions, and ensuring sequence numbers are in-win | correct connection, however, through the cryptographic handshake, as | |||
| dow. So in the worst case if there was a token collision, the new subflow would | well as checking the connection tokens in both directions, and | |||
| not succeed, but the MPTCP connection would continue to provide a regular TCP se | ensuring that sequence numbers are in-window. So, in the worst case, if | |||
| rvice.</t> | there was a token collision, the new subflow would not succeed, but the MPTCP co | |||
| nnection would continue to provide a regular TCP service.</t> | ||||
| <t>Since key generation is implementation-specific, there is no r | <t>Since key generation is implementation specific, there is no | |||
| equirement that they be simply random numbers. An implementation is free to exch | requirement that they simply be random numbers. An implementation is | |||
| ange cryptographic material out-of-band and generate these keys from this, in or | free to exchange cryptographic material out of band and generate these | |||
| der to provide additional mechanisms by which to verify the identity of the comm | keys from this material, in order to provide additional mechanisms by wh | |||
| unicating entities. For example, an implementation could choose to link its MPTC | ich to verify the identity of the communicating entities. For example, an implem | |||
| P keys to those used in higher-layer TLS or SSH connections.</t> | entation could choose to link its MPTCP keys to those used in higher-layer TLS o | |||
| r SSH connections.</t> | ||||
| <t>If the server behaves in a | <t>If the server behaves in a | |||
| stateless manner, it has to generate its own key in a verifiable | stateless manner, it has to generate its own key in a verifiable | |||
| fashion. This verifiable way of generating the key can be done by | fashion. This verifiable way of generating the key can be done by | |||
| using a hash of the 4-tuple, sequence number and a local secret | using a hash of the 4-tuple, sequence number, and a local secret | |||
| (similar to what is done for the TCP-sequence number <xref target="RFC49 | (similar to what is done for the TCP sequence number <xref target="RFC49 | |||
| 87"/>). | 87" format="default"/>). | |||
| It will thus be able to verify whether it is indeed the originator of | It will thus be able to verify whether it is indeed the originator of | |||
| the key echoed back in the later MP_CAPABLE option. | the key echoed back in the subsequent MP_CAPABLE option. | |||
| As for a stateful server, the tokens SHOULD be checked for uniqueness, h | As for a stateful server, the tokens <bcp14>SHOULD</bcp14> be checked fo | |||
| owever | r uniqueness; however, | |||
| if uniqueness is not met, and there is no way to generate an alternative | if uniqueness is not met and there is no way to generate an alternative | |||
| verifiable | verifiable | |||
| key, then the connection MUST fall back to using regular TCP by not send | key, then the connection <bcp14>MUST</bcp14> fall back to using regular | |||
| ing a | TCP by not sending an | |||
| MP_CAPABLE in the SYN/ACK.</t> | MP_CAPABLE in the SYN&wj;/ACK.</t> | |||
| <t>The ACK carries both A's key and B's key. This is the first time that A's key is seen on the wire, although it is expected that A will have generated a key locally before the initial SYN. The echoing of B's key allows B to operate statelessly, as described above. Therefore, A's key must be delivered reliably to B, and in order to do this, the transmission of this packet must be made reliable.</t> | <t>The ACK carries both A's key and B's key. This is the first time that A's key is seen on the wire, although it is expected that A will have generated a key locally before the initial SYN. The echoing of B's key allows B to operate statelessly, as described above. Therefore, A's key must be delivered reliably to B, and in order to do this, the transmission of this packet must be made reliable.</t> | |||
| <t>If B has data to send first, then the reliable delivery of the | ||||
| <t>If B has data to send first, then the reliable delivery of the ACK+MP | ACK + MP_CAPABLE is ensured by the receipt of this data with a | |||
| _CAPABLE can be inferred by the receipt of this data with a MPTCP Data Sequence | n | |||
| Signal (DSS) option (<xref target="sec_generalop"/>). If, however, A wishes to s | MPTCP Data Sequence Signal (DSS) option (<xref target="sec_generalop" | |||
| end data first, it has two options to ensure the reliable delivery of the ACK+MP | format="default"/>) containing a DATA_ACK for the MP_CAPABLE (which is | |||
| _CAPABLE. If it immediately has data to send, then the third ACK (with data) wou | the first octet of the data sequence space). If, however, A wishes to sen | |||
| ld also contain an MP_CAPABLE option with additional data parameters (the Data-L | d data first, it has | |||
| evel Length and optional Checksum as shown in <xref target="tcpm_capable"/>). If | two options to ensure the reliable delivery of the ACK + MP_CAPABLE. If | |||
| A does not immediately have data to send, it MUST include the MP_CAPABLE on the | it immediately has data to send, then the first ACK (with data) would | |||
| third ACK, but without the additional data parameters. When A does have data to | also contain an MP_CAPABLE option with additional data parameters (the | |||
| send, it must repeat the sending of the MP_CAPABLE option from the third ACK, w | Data-Level Length and optional Checksum as shown in <xref | |||
| ith additional data parameters. This MP_CAPABLE option is in place of the DSS, a | target="tcpm_capable" format="default"/>). If A does not immediately | |||
| nd simply specifies the data-level length of the payload, and the checksum (if t | have data to send, it <bcp14>MUST</bcp14> include the MP_CAPABLE on | |||
| he use of checksums is negotiated). This is the minimal data required to establi | the first ACK, but without the additional data parameters. When A does | |||
| sh a MPTCP connection - it allows validation of the payload, and given it is the | have data to send, it must repeat the sending of the MP_CAPABLE option | |||
| first data, the Initial Data Sequence Number (IDSN) is also known (as it is gen | from the first ACK, with additional data parameters. This MP_CAPABLE | |||
| erated from the key, as described below). Conveying the keys on the first data p | option is used in place of the DSS and simply specifies (1) the Dat | |||
| acket allows the TCP reliability mechanisms to ensure the packet is successfully | a-Level | |||
| delivered. The receiver will acknowledge this data at the connection level with | Length of the payload and (2) the checksum (if the use of checksums | |||
| a Data ACK, as if a DSS option has been received.</t> | is | |||
| negotiated). This is the minimal data required to establish an MPTCP | ||||
| <t>There could be situations where both A and B attempt to transmit init | connection -- it allows validation of the payload, and given that it is | |||
| ial data at the same time. For example, if A did not initially have data to send | the | |||
| , but then needed to transmit data before it had received anything from B, it wo | first data, the Initial Data Sequence Number (IDSN) is also known (as | |||
| uld use a MP_CAPABLE option with data parameters (since it would not know if the | it is generated from the key, as described below). Conveying the keys | |||
| MP_CAPABLE on the ACK was received). In such a situation, B may also have trans | on the first data packet allows the TCP reliability mechanisms to | |||
| mitted data with a DSS option, but it had not yet been received at A. Therefore, | ensure that the packet is successfully delivered. The receiver will ackn | |||
| B has received data with a MP_CAPABLE mapping after it has sent data with a DSS | owledge this data at the connection level with a Data ACK, as if a DSS option ha | |||
| option. To ensure these situations can be handled, it follows that the data par | s been received.</t> | |||
| ameters in a MP_CAPABLE are semantically equivalent to those in a DSS option and | <t>There could be situations where both A and B attempt to transmit | |||
| can be used interchangeably. Similar situations could occur when the MP_CAPABLE | initial data at the same time. For example, if A did not initially | |||
| with data is lost and retransmitted. Furthermore, in the case of TCP Segmentati | have data to send but then needed to transmit data before it had | |||
| on Offloading, the MP_CAPABLE with data parameters may be duplicated across mult | received anything from B, it would use an MP_CAPABLE option with data | |||
| iple packets, and implementations must also be able to cope with duplicate MP_CA | parameters (since it would not know if the MP_CAPABLE on the ACK was | |||
| PABLE mappings as well as duplicate DSS mappings.</t> | received). In such a situation, B may also have transmitted data with | |||
| a DSS option, but it had not yet been received at A. Therefore, B has | ||||
| <t>Additionally, the MP_CAPABLE exchange allows the safe passage of MPTC | received data with an MP_CAPABLE mapping after it has sent data with a | |||
| P options on SYN packets to be determined. If any of these options are dropped, | DSS option. To ensure that these situations can be handled, it follows t | |||
| MPTCP will gracefully fall back to regular single-path TCP, as documented in <xr | hat the data parameters in an MP_CAPABLE are semantically equivalent to those in | |||
| ef target="sec_fallback"/>. If at any point in the handshake either party think | a DSS option and can be used interchangeably. Similar situations could occur wh | |||
| s the MPTCP negotiation is compromised, for example by a middlebox corrupting th | en the MP_CAPABLE with data is lost and retransmitted. Furthermore, in the case | |||
| e TCP options, or unexpected ACK numbers being present, the host MUST stop using | of TCP segmentation offloading, the MP_CAPABLE with data parameters may be dupli | |||
| MPTCP and no longer include MPTCP options in future TCP packets. The other host | cated across multiple packets, and implementations must also be able to cope wit | |||
| will then also fall back to regular TCP using the fall back mechanism. Note th | h duplicate MP_CAPABLE mappings as well as duplicate DSS mappings.</t> | |||
| at new subflows MUST NOT be established (using the process documented in <xref t | <t>Additionally, the MP_CAPABLE exchange allows the safe passage of | |||
| arget="sec_join"/>) until a Data Sequence Signal (DSS) option has been successfu | MPTCP options on SYN packets to be determined. If any of these options | |||
| lly received across the path (as documented in <xref target="sec_generalop"/>).< | are dropped, MPTCP will gracefully fall back to regular single-path | |||
| /t> | TCP, as documented in <xref target="sec_fallback" format="default"/>. | |||
| If at any point in the handshake either party thinks the MPTCP | ||||
| <t>Like all MPTCP options, the MP_CAPABLE option starts with the Kind an | negotiation is compromised -- for example, by a middlebox corrupting | |||
| d Length to specify the TCP-option kind and its length. Followed by that is the | the TCP options or by unexpected ACK numbers being present -- the host < | |||
| MP_CAPABLE option. The first 4 bits of the first octet in the MP_CAPABLE option | bcp14>MUST</bcp14> stop using MPTCP and no longer include MPTCP options in futur | |||
| (<xref target="tcpm_capable"/>) define the MPTCP option subtype (see <xref targe | e TCP packets. The other host will then also fall back to regular TCP using the | |||
| t="IANA"/>; for MP_CAPABLE, this is 0x0), and the remaining 4 bits of this octet | fallback mechanism. Note that new subflows <bcp14>MUST NOT</bcp14> be establish | |||
| specify the MPTCP version in use (for this specification, this is 1).</t> | ed (using the process documented in <xref target="sec_join" format="default"/>) | |||
| until a DSS option has been successfully received across the path (as documented | ||||
| in <xref target="sec_generalop" format="default"/>).</t> | ||||
| <t>Like all MPTCP options, the MP_CAPABLE option starts with the Kind | ||||
| and Length to specify the TCP option's kind and length. This | ||||
| information is followed by the MP_CAPABLE option. The first 4 bits of | ||||
| the first octet in the MP_CAPABLE option (<xref target="tcpm_capable" | ||||
| format="default"/>) define the MPTCP Option Subtype (see <xref | ||||
| target="IANA" format="default"/>; for MP_CAPABLE, this value is | ||||
| 0x0), and the remaining 4 bits of this octet specify the MPTCP | ||||
| version in use (for this specification, this value is 1).</t> | ||||
| <t>The second octet is reserved for flags, allocated as follows: | <t>The second octet is reserved for flags, allocated as follows: | |||
| <list style="hanging"> | </t> | |||
| <t hangText="A:"> The leftmost bit, labeled "A", SHOULD be set to 1 to | <dl newline="false" spacing="normal" indent="14"> | |||
| indicate "Checksum Required", unless the system administrator has decided that | <dt>A:</dt> | |||
| checksums are not required (for example, if the environment is controlled and no | <dd> The leftmost bit, labeled "A", <bcp14>SHOULD</bcp14> be set to 1 | |||
| middleboxes exist that might adjust the payload).</t> | to indicate "Checksum required", unless the system administrator has decided tha | |||
| <t hangText="B:"> The second bit, labeled "B", is an extensibility fla | t checksums are not required (for example, if the environment is controlled and | |||
| g, and MUST be set to 0 for current implementations. This will be used for an ex | no middleboxes exist that might adjust the payload).</dd> | |||
| tensibility mechanism in a future specification, and the impact of this flag wil | <dt>B:</dt> | |||
| l be defined at a later date. It is expected, but not mandated, that this flag w | <dd> The second bit, labeled "B", is an extensibility flag. It | |||
| ould be used as part of an alternative security mechanism that does not require | <bcp14>MUST</bcp14> be set to 0 for current implementations. This | |||
| a full version upgrade of the protocol, but does require redefining some element | flag will be used for an extensibility mechanism in a future specifica | |||
| s of the handshake. If receiving a message with the 'B' flag set to 1, and this | tion, and the impact of this flag will be defined at a later date. It is expecte | |||
| is not understood, then the MP_CAPABLE in this SYN MUST be silently ignored, whi | d, but not mandated, that this flag would be used as part of an alternative secu | |||
| ch triggers a fallback to regular TCP; the sender is expected to retry with a fo | rity mechanism that does not require a full version upgrade of the protocol but | |||
| rmat compatible with this legacy specification. Note that the length of the MP_C | does require redefining some elements of the handshake. If receiving a message w | |||
| APABLE option, and the meanings of bits "D" through "H", may be altered by setti | ith the "B" flag set to 1 and this is not understood, then the MP_CAPABLE in thi | |||
| ng B=1.</t> | s SYN <bcp14>MUST</bcp14> be silently ignored, which triggers a fallback to regu | |||
| <t hangText="C:"> The third bit, labeled "C", is set to "1" to indicat | lar TCP; the sender is expected to retry with a format compatible with this lega | |||
| e that the sender of this option will not accept additional MPTCP subflows to th | cy specification. Note that the length of the MP_CAPABLE option, and the meaning | |||
| e source address and port, and therefore the receiver MUST NOT try to open any a | s of bits "D" through "H", may be altered by setting B=1.</dd> | |||
| dditional subflows towards this address and port. This is an efficiency improvem | <dt>C:</dt> | |||
| ent for situations where the sender knows a restriction is in place, for example | <dd> The third bit, labeled "C", is set to 1 to indicate that the | |||
| if the sender is behind a strict NAT, or operating behind a legacy Layer 4 load | sender of this option will not accept additional MPTCP subflows to | |||
| balancer.</t> | the source address and port, and therefore the receiver <bcp14>MUST | |||
| <t hangText="D through H:"> The remaining bits, labeled "D" through "H | NOT</bcp14> try to open any additional subflows toward this address | |||
| ", are used for crypto algorithm negotiation. In this specification only the ri | and port. This improves efficiency in situations where the | |||
| ghtmost bit, labeled "H", is assigned. Bit "H" indicates the use of HMAC-SHA256 | sender knows a restriction is in place -- for example, if the sender i | |||
| (as defined in <xref target="sec_join"/>). An implementation that only support | s behind a strict NAT or operating behind a legacy Layer 4 load balancer.</dd> | |||
| s this method MUST set bit "H" to 1, and bits "D" through "G" to 0.</t> | <dt>D through H:</dt> | |||
| </list> | <dd> The remaining bits, labeled "D" through "H", are used for | |||
| crypto algorithm negotiation. In this specification, only the | ||||
| A crypto algorithm MUST be specified. If flag bits D through H are all | rightmost bit, labeled "H", is assigned. Bit "H" indicates the use | |||
| 0, the MP_CAPABLE option MUST be treated as invalid and ignored (that is, it mus | of HMAC-SHA256 (as defined in <xref target="sec_join" | |||
| t be treated as a regular TCP handshake).</t> | format="default"/>). An implementation that only supports this | |||
| method <bcp14>MUST</bcp14> set bit "H" to 1 and bits "D" | ||||
| <t>The selection of the authentication algorithm also impacts the algori | through "G" to 0.</dd> | |||
| thm used to generate the token and the Initial Data Sequence Number (IDSN). In t | ||||
| his specification, with only the SHA-256 algorithm (bit "H") specified and selec | ||||
| ted, the token MUST be a truncated (most significant 32 bits) SHA-256 hash (<xre | ||||
| f target="RFC6234"/>) of the key. A different, 64-bit truncation (the least sign | ||||
| ificant 64 bits) of the SHA-256 hash of the key MUST be used as the IDSN. Note t | ||||
| hat the key MUST be hashed in network byte order. Also note that the "least sign | ||||
| ificant" bits MUST be the rightmost bits of the SHA-256 digest, as per <xref tar | ||||
| get="RFC6234"/>. Future specifications of the use of the crypto bits may choose | ||||
| to specify different algorithms for token and IDSN generation.</t> | ||||
| <t>Both the crypto and checksum bits negotiate capabilities in similar w | ||||
| ays. For the Checksum Required bit (labeled "A"), if either host requires the us | ||||
| e of checksums, checksums MUST be used. In other words, the only way for checksu | ||||
| ms not to be used is if both hosts in their SYNs set A=0. This decision is confi | ||||
| rmed by the setting of the "A" bit in the third packet (the ACK) of the handshak | ||||
| e. For example, if the initiator sets A=0 in the SYN, but the responder sets A=1 | ||||
| in the SYN/ACK, checksums MUST be used in both directions, and the initiator wi | ||||
| ll set A=1 in the ACK. The decision whether to use checksums will be stored by a | ||||
| n implementation in a per-connection binary state variable. If A=1 is received b | ||||
| y a host that does not want to use checksums, it MUST fall back to regular TCP b | ||||
| y ignoring the MP_CAPABLE option as if it was invalid.</t> | ||||
| <t>For crypto negotiation, the responder has the choice. The initiator c | ||||
| reates a proposal setting a bit for each algorithm it supports to 1 (in this ver | ||||
| sion of the specification, there is only one proposal, so bit "H" will be always | ||||
| set to 1). The responder responds with only 1 bit set -- this is the chosen alg | ||||
| orithm. The rationale for this behavior is that the responder will typically be | ||||
| a server with potentially many thousands of connections, so it may wish to choos | ||||
| e an algorithm with minimal computational complexity, depending on the load. If | ||||
| a responder does not support (or does not want to support) any of the initiator' | ||||
| s proposals, it MUST respond without an MP_CAPABLE option, thus forcing a fallba | ||||
| ck to regular TCP.</t> | ||||
| <t>The MP_CAPABLE option is only used in the first subflow of a connecti | </dl> | |||
| on, in order to identify the connection; all following subflows will use the "Jo | <t>A crypto algorithm <bcp14>MUST</bcp14> be specified. If flag bits "D | |||
| in" option (see <xref target="sec_join"/>) to join the existing connection.</t> | " through "H" are all 0, the MP_CAPABLE option <bcp14>MUST</bcp14> be treated as | |||
| invalid and ignored (that is, it must be treated as a regular TCP handshake).</ | ||||
| t> | ||||
| <t>The selection of the authentication algorithm also impacts the algori | ||||
| thm used to generate the token and the IDSN. In this specification, with only th | ||||
| e SHA-256 algorithm (bit "H") specified and selected, the token <bcp14>MUST</bcp | ||||
| 14> be a truncated (most significant 32 bits) SHA-256 hash <xref target="RF | ||||
| C6234" format="default"/> of the key. A different, 64-bit truncation (the least | ||||
| significant 64 bits) of the SHA-256 hash of the key <bcp14>MUST</bcp14> be used | ||||
| as the IDSN. Note that the key <bcp14>MUST</bcp14> be hashed in network byte ord | ||||
| er. Also note that the "least significant" bits <bcp14>MUST</bcp14> be the right | ||||
| most bits of the SHA-256 digest, as per <xref target="RFC6234" format="default"/ | ||||
| >. Future specifications of the use of the crypto bits may choose to specify dif | ||||
| ferent algorithms for token and IDSN generation.</t> | ||||
| <t>Both the crypto and checksum bits negotiate capabilities in similar | ||||
| ways. For the "Checksum required" bit (labeled "A"), if either host | ||||
| requires the use of checksums, checksums <bcp14>MUST</bcp14> be | ||||
| used. In other words, the only way for checksums not to be used is if | ||||
| both hosts in their SYNs set A=0. This decision is confirmed by the | ||||
| setting of the "A" bit in the third packet (the ACK) of the | ||||
| handshake. For example, if the initiator sets A=0 in the SYN but the | ||||
| responder sets A=1 in the SYN/ACK, checksums <bcp14>MUST</bcp14> be | ||||
| used in both directions, and the initiator will set A=1 in the | ||||
| ACK. The decision regarding whether to use checksums will be stored by a | ||||
| n implementation in a per-connection binary state variable. If A=1 is received b | ||||
| y a host that does not want to use checksums, it <bcp14>MUST</bcp14> fall back t | ||||
| o regular TCP by ignoring the MP_CAPABLE option as if it was invalid.</t> | ||||
| <t>For crypto negotiation, the responder has the choice. The initiator | ||||
| creates a proposal setting a bit for each algorithm it supports to 1 | ||||
| (in this version of the specification, there is only one proposal, so | ||||
| bit "H" will always be set to 1). The responder responds with only 1&nbsp;bit | ||||
| set -- this is the chosen algorithm. The rationale for this behavior is th | ||||
| at the responder will typically be a server with potentially many thousands of c | ||||
| onnections, so it may wish to choose an algorithm with minimal computational com | ||||
| plexity, depending on the load. If a responder does not support (or does not wan | ||||
| t to support) any of the initiator's proposals, it <bcp14>MUST</bcp14> respond w | ||||
| ithout an MP_CAPABLE option, thus forcing a fallback to regular TCP.</t> | ||||
| <t>The MP_CAPABLE option is only used in the first subflow of a | ||||
| connection, in order to identify the connection; all subsequent | ||||
| subflows will use the MP_JOIN option (see <xref target="sec_join" | ||||
| format="default"/>) to join the existing connection.</t> | ||||
| <t>If a SYN contains an MP_CAPABLE option but the | <t>If a SYN contains an MP_CAPABLE option but the | |||
| SYN/ACK does not, it is assumed that sender of the SYN/ACK is not | SYN/ACK does not, it is assumed that the sender of the SYN/ACK is not | |||
| multipath capable; thus, the MPTCP session MUST operate as | multipath capable; thus, the MPTCP session <bcp14>MUST</bcp14> operate a | |||
| a regular, single-path TCP. If a SYN does not contain a | s | |||
| MP_CAPABLE option, the SYN/ACK MUST NOT contain one | a regular, single-path TCP session. If a SYN does not contain an | |||
| MP_CAPABLE option, the SYN/ACK <bcp14>MUST NOT</bcp14> contain one | ||||
| in response. If the third packet (the ACK) does not contain | in response. If the third packet (the ACK) does not contain | |||
| the MP_CAPABLE option, then the session MUST fall back to | the MP_CAPABLE option, then the session <bcp14>MUST</bcp14> fall back to | |||
| operating as a regular, single-path TCP. This is to maintain | operating as a regular, single-path TCP session. This is done to maintai | |||
| n | ||||
| compatibility with middleboxes on the path that drop some | compatibility with middleboxes on the path that drop some | |||
| or all TCP options. Note that an implementation MAY choose | or all TCP options. Note that an implementation <bcp14>MAY</bcp14> choose | |||
| to attempt sending MPTCP options more than one time before | to attempt sending MPTCP options more than one time before | |||
| making this decision to operate as regular TCP (see | making this decision to operate as regular TCP (see | |||
| <xref target="heuristics"/>).</t> | <xref target="heuristics" format="default"/>).</t> | |||
| <t>If the SYN packets are unacknowledged, it is up to local | <t>If the SYN packets are unacknowledged, it is up to local | |||
| policy to decide how to respond. It is expected that a sender | policy to decide how to respond. It is expected that a sender | |||
| will eventually fall back to single-path TCP (i.e., without the | will eventually fall back to single-path TCP (i.e., without the | |||
| MP_CAPABLE option) in order to work around middleboxes that | MP_CAPABLE option) in order to work around middleboxes that | |||
| may drop packets with unknown options; however, the number of | may drop packets with unknown options; however, the number of | |||
| multipath-capable attempts that are made first will be up to | multipath-capable attempts that are made first will be up to | |||
| local policy. | local policy. | |||
| It is possible that MPTCP and non-MPTCP SYNs could get reordered | It is possible that MPTCP and non-MPTCP SYNs could get reordered | |||
| in the network. Therefore, the final state is inferred from the | in the network. Therefore, the final state is inferred from the | |||
| presence or absence of the MP_CAPABLE option in the third packet | presence or absence of the MP_CAPABLE option in the third packet | |||
| of the TCP handshake. If this option is not present, the | of the TCP handshake. If this option is not present, the | |||
| connection SHOULD fall back to regular TCP, as documented in | connection <bcp14>SHOULD</bcp14> fall back to regular TCP, as documented | |||
| <xref target="sec_fallback"/>.</t> | in | |||
| <xref target="sec_fallback" format="default"/>.</t> | ||||
| <t>The initial data sequence number on an MPTCP connection | <t>The IDSN on an MPTCP connection | |||
| is generated from the key. The algorithm for IDSN generation is | is generated from the key. The algorithm for IDSN generation is | |||
| also determined from the negotiated authentication algorithm. | also determined from the negotiated authentication algorithm. | |||
| In this specification, with only the SHA-256 algorithm specified and | In this specification, with only the SHA-256 algorithm specified and | |||
| selected, the IDSN of a host MUST be the least significant 64 bits of the | selected, the IDSN of a host <bcp14>MUST</bcp14> be the least significant 64 bits of the | |||
| SHA-256 hash of its key, i.e., IDSN-A = Hash(Key-A) and IDSN-B = Hash(Key-B). | SHA-256 hash of its key, i.e., IDSN-A = Hash(Key-A) and IDSN-B = Hash(Key-B). | |||
| This deterministic generation of the IDSN allows a receiver to ensure | This deterministic generation of the IDSN allows a receiver to ensure | |||
| that there are no gaps in sequence space at the start of the connection. | that there are no gaps in sequence space at the start of the connection. | |||
| The SYN with MP_CAPABLE occupies the first octet of data sequence space, | The SYN with MP_CAPABLE occupies the first octet of data sequence space, | |||
| although this does not need to be acknowledged at the connection level | although this does not need to be acknowledged at the connection level | |||
| until the first data is sent (see <xref target="sec_generalop"/>).</t> | until the first data is sent (see <xref target="sec_generalop" format="default"/>).</t> | |||
| </section> | </section> | |||
| <section anchor="sec_join" numbered="true" toc="default"> | ||||
| <section title="Starting a New Subflow" anchor="sec_join"> | <name>Starting a New Subflow</name> | |||
| <t>Once an MPTCP connection has begun with the MP_CAPABLE | <t>Once an MPTCP connection has begun with the MP_CAPABLE | |||
| exchange, further subflows can be added to the connection. | exchange, further subflows can be added to the connection. | |||
| Hosts have knowledge of their own address(es), and can | Hosts have knowledge of their own address(es) and can | |||
| become aware of the other host's addresses through | become aware of the other host's addresses through | |||
| signaling exchanges as described in | signaling exchanges as described in | |||
| <xref target="sec_pm"/>. Using this knowledge, a host | <xref target="sec_pm" format="default"/>. Using this knowledge, a host | |||
| can initiate a new subflow over a currently unused pair of | can initiate a new subflow over a currently unused pair of | |||
| addresses. It is permitted for either host in a connection | addresses. It is permissible for either host in a connection | |||
| to initiate the creation of a new subflow, but it is expected | to initiate the creation of a new subflow, but it is expected | |||
| that this will normally be the original connection initiator | that this will normally be the original connection initiator | |||
| (see <xref target="heuristics"/> for heuristics).</t> | (see <xref target="heuristics" format="default"/> for heuristics).</t> | |||
| <t>A new subflow is started as a normal TCP SYN/ACK | <t>A new subflow is started as a normal TCP SYN/ACK | |||
| exchange. The Join Connection (MP_JOIN) MPTCP option | exchange. The Join Connection (MP_JOIN) MPTCP option | |||
| is used to identify the connection to be joined by the new subflow. | is used to identify the connection to be joined by the new subflow. | |||
| It uses keying material that was exchanged in the initial MP_CAPABLE | It uses keying material that was exchanged in the initial MP_CAPABLE | |||
| handshake (<xref target="sec_init"/>), and that handshake also | handshake (<xref target="sec_init" format="default"/>), and that handshake also | |||
| negotiates the crypto algorithm in use for the MP_JOIN handshake.</t> | negotiates the crypto algorithm in use for the MP_JOIN handshake.</t> | |||
| <t>This section specifies the behavior of MP_JOIN using the HMAC-SHA256 | <t>This section specifies the behavior of MP_JOIN using the HMAC-SHA256 | |||
| algorithm. An MP_JOIN option is present in the SYN, SYN/ACK, | algorithm. An MP_JOIN option is present in the SYN, SYN/ACK, | |||
| and ACK of the three-way handshake, although in each case with a | and ACK of the three-way handshake, although in each case with a | |||
| different format.</t> | different format.</t> | |||
| <t>In the first MP_JOIN on the SYN packet, illustrated in | <t>In the first MP_JOIN on the SYN packet, illustrated in | |||
| <xref target="tcpm_join"/>, the initiator sends a token, random | <xref target="tcpm_join" format="default"/>, the initiator sends a token | |||
| number, and address ID.</t> | , random | |||
| number, and Address ID.</t> | ||||
| <figure anchor="tcpm_join"> | ||||
| <name>Join Connection (MP_JOIN) Option (for Initial SYN)</name> | ||||
| <artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
| 1 2 3 | ||||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||||
| +---------------+---------------+-------+-----+-+---------------+ | ||||
| | Kind | Length = 12 |Subtype|(rsv)|B| Address ID | | ||||
| +---------------+---------------+-------+-----+-+---------------+ | ||||
| | Receiver's Token (32 bits) | | ||||
| +---------------------------------------------------------------+ | ||||
| | Sender's Random Number (32 bits) | | ||||
| +---------------------------------------------------------------+ ]]></artwork | ||||
| > | ||||
| </figure> | ||||
| <t>The token is used to identify the MPTCP connection and is a | <t>The token is used to identify the MPTCP connection and is a | |||
| cryptographic hash of the receiver's key, as exchanged | cryptographic hash of the receiver's key, as exchanged | |||
| in the initial MP_CAPABLE handshake (<xref target="sec_init"/>). | in the initial MP_CAPABLE handshake (<xref target="sec_init" format="default"/>). | |||
| In this specification, the tokens presented in this | In this specification, the tokens presented in this | |||
| option are generated by the SHA-256 <xref target="RFC6234"/> | option are generated by the SHA-256 algorithm <xref target="RFC6234" for | |||
| algorithm, truncated to the most significant 32 bits. The token | mat="default"/>, truncated to the most significant 32 bits. The token | |||
| included in the MP_JOIN option is the token that the receiver | included in the MP_JOIN option is the token that the receiver | |||
| of the packet uses to identify this connection; i.e., Host A | of the packet uses to identify this connection; i.e., Host A | |||
| will send Token-B (which is generated from Key-B). Note that the | will send Token-B (which is generated from Key-B). Note that the | |||
| hash generation algorithm can be overridden by the choice of | hash generation algorithm can be overridden by the choice of | |||
| cryptographic handshake algorithm, as defined in <xref target="sec_init" | cryptographic handshake algorithm, as defined in <xref target="sec_init" | |||
| />.</t> | format="default"/>.</t> | |||
| <t>The MP_JOIN SYN sends not only the token (which is static for a | <t>The MP_JOIN SYN sends not only the token (which is static for a | |||
| connection) but also random numbers (nonces) that are used to prevent | connection) but also random numbers (nonces) that are used to prevent | |||
| replay attacks on the authentication method. Recommendations for the | replay attacks on the authentication method. Recommendations for the | |||
| generation of random numbers for this purpose are given in <xref target= | generation of random numbers for this purpose are given in <xref target= | |||
| "RFC4086"/>.</t> | "RFC4086" format="default"/>.</t> | |||
| <t>The MP_JOIN option includes an "Address ID". This is an identifier | <t>The MP_JOIN option includes an "Address ID". This is an identifier | |||
| generated by the sender of the option, used to identify the source address | generated by the sender of the option, used to identify the source address | |||
| of this packet, even if the IP header has been changed in transit by a middlebox. | of this packet, even if the IP header has been changed in transit by a middlebox. | |||
| The numeric value of this field is generated by the sender and must map uniquely | The numeric value of this field is generated by the sender and must map uniquely | |||
| to a source IP address for the sending host. | to a source IP address for the sending host. | |||
| The Address ID allows address removal (<xref target="sec_remove_addr"/>) | The Address ID allows address removal (<xref target="sec_remove_addr" format="default"/>) | |||
| without needing to know what the source address at the | without needing to know what the source address at the | |||
| receiver is, thus allowing address removal through NATs. | receiver is, thus allowing address removal through NATs. | |||
| The Address ID also allows correlation between new subflow setup attempts | The Address ID also allows correlation between new subflow setup attempts | |||
| and address signaling (<xref target="sec_add_address"/>), | and address signaling (<xref target="sec_add_address" format="default"/>), | |||
| to prevent setting up duplicate subflows on the same path, if an MP_JOIN | to prevent setting up duplicate subflows on the same path, if an MP_JOIN | |||
| and ADD_ADDR are sent at the same time.</t> | and ADD_ADDR are sent at the same time.</t> | |||
| <t>The Address IDs of the subflow used in the initial SYN | <t>The Address IDs of the subflow used in the initial SYN | |||
| exchange of the first subflow in the connection are implicit, | exchange of the first subflow in the connection are implicit | |||
| and have the value zero. A host MUST store the mappings between | and have the value zero. A host <bcp14>MUST</bcp14> store the mappings b | |||
| etween | ||||
| Address IDs and addresses both for itself and the remote host. | Address IDs and addresses both for itself and the remote host. | |||
| An implementation will also need to know which local and remote | An implementation will also need to know which local and remote | |||
| Address IDs are associated with which established subflows, for | Address IDs are associated with which established subflows, for | |||
| when addresses are removed from a local or remote host.</t> | when addresses are removed from a local or remote host.</t> | |||
| <t>The MP_JOIN option on packets with the SYN flag set also includes | ||||
| <t>The MP_JOIN option on packets with the SYN flag set also includes 4 b | 4 bits of flags, 3 of which are currently reserved and | |||
| its of flags, 3 of which are currently reserved and MUST be set to zero by the s | <bcp14>MUST</bcp14> be set to 0 by the sender. The final bit, labeled | |||
| ender. The final bit, labeled "B", indicates whether the sender of this option w | "B", indicates whether the sender of this option (1) wishes this | |||
| ishes this subflow to be used as a backup path (B=1) in the event of failure of | subflow to be used as a backup path (B=1) in the event of failure of | |||
| other paths, or whether it wants it to be used as part of the connection immedia | other paths or (2) wants the subflow to be used as part of the | |||
| tely. By setting B=1, the sender of the option is requesting the other host to o | connection immediately. By setting B=1, the sender of the option is | |||
| nly send data on this subflow if there are no available subflows where B=0. Subf | requesting that the other host only send data on this subflow if there | |||
| low policy is discussed in more detail in <xref target="sec_policy"/>.</t> | are no available subflows where B=0. Subflow policy is discussed in more | |||
| detail in <xref target="sec_policy" format="default"/>.</t> | ||||
| <?rfc needLines='10'?> | ||||
| <figure align="center" anchor="tcpm_join" title="Join Connection (MP_JOI | ||||
| N) Option (for Initial SYN)"> | ||||
| <artwork align="left"><![CDATA[ | ||||
| 1 2 3 | ||||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||||
| +---------------+---------------+-------+-----+-+---------------+ | ||||
| | Kind | Length = 12 |Subtype|(rsv)|B| Address ID | | ||||
| +---------------+---------------+-------+-----+-+---------------+ | ||||
| | Receiver's Token (32 bits) | | ||||
| +---------------------------------------------------------------+ | ||||
| | Sender's Random Number (32 bits) | | ||||
| +---------------------------------------------------------------+ | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| <t>When receiving a SYN with an MP_JOIN option that contains | <t>When receiving a SYN with an MP_JOIN option that contains | |||
| a valid token for an existing MPTCP connection, the recipient | a valid token for an existing MPTCP connection, the recipient | |||
| SHOULD respond with a SYN/ACK also containing an MP_JOIN | <bcp14>SHOULD</bcp14> respond with a SYN/ACK also containing an MP_JOIN | |||
| option containing a random number and a truncated (leftmost 64 | option containing a random number and a truncated (leftmost 64 bits | |||
| bits) Hash-based Message Authentication Code (HMAC). This | ) HMAC. This | |||
| version of the option is shown in <xref target="tcpm_join2"/>. | version of the option is shown in <xref target="tcpm_join2" format="defa | |||
| If the token is unknown, or the host wants to refuse subflow | ult"/>. If the token is unknown or the host wants to refuse subflow | |||
| establishment (for example, due to a limit on the number of | establishment (for example, due to a limit on the number of | |||
| subflows it will permit), the receiver will send back a reset | subflows it will permit), the receiver will send back a reset | |||
| (RST) signal, analogous to an unknown port in TCP, containing a | (RST) signal, analogous to an unknown port in TCP, containing an | |||
| MP_TCPRST option (<xref target="sec_reset"/>) with a "MPTCP | MP_TCPRST option (<xref target="sec_reset" format="default"/>) with an " | |||
| MPTCP | ||||
| specific error" reason code. Although calculating an HMAC | specific error" reason code. Although calculating an HMAC | |||
| requires cryptographic operations, it is believed that the | requires cryptographic operations, it is believed that the | |||
| 32-bit token in the MP_JOIN SYN gives sufficient protection against blind state | 32-bit token in the MP_JOIN SYN gives sufficient protection against blind state | |||
| exhaustion attacks; therefore, there is no need to provide | exhaustion attacks; therefore, there is no need to provide | |||
| mechanisms to allow a responder to operate statelessly at the | mechanisms to allow a responder to operate statelessly at the | |||
| MP_JOIN stage.</t> | MP_JOIN stage.</t> | |||
| <figure anchor="tcpm_join2"> | ||||
| <name>Join Connection (MP_JOIN) Option (for Responding SYN/ACK)</name> | ||||
| <artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
| 1 2 3 | ||||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||||
| +---------------+---------------+-------+-----+-+---------------+ | ||||
| | Kind | Length = 16 |Subtype|(rsv)|B| Address ID | | ||||
| +---------------+---------------+-------+-----+-+---------------+ | ||||
| | | | ||||
| | Sender's Truncated HMAC (64 bits) | | ||||
| | | | ||||
| +---------------------------------------------------------------+ | ||||
| | Sender's Random Number (32 bits) | | ||||
| +---------------------------------------------------------------+ ]]></artwork | ||||
| > | ||||
| </figure> | ||||
| <t>An HMAC is sent by both hosts -- by the initiator (Host A) | <t>An HMAC is sent by both hosts -- by the initiator (Host A) | |||
| in the third packet (the ACK) and by the responder (Host B) in | in the third packet (the ACK) and by the responder (Host B) in | |||
| the second packet (the SYN/ACK). Doing the HMAC exchange at this | the second packet (the SYN/ACK). Doing the HMAC exchange at this | |||
| stage allows both hosts to have first exchanged random data (in the | stage allows both hosts to have first exchanged random data (in the | |||
| first two SYN packets) that is used as the "message". This | first two SYN packets) that is used as the "message". This | |||
| specification defines that HMAC as defined in <xref target="RFC2104"/> | specification defines that HMAC as defined in <xref target="RFC2104" for | |||
| is used, along with the SHA-256 hash algorithm <xref target="RFC6234"/>, | mat="default"/> | |||
| is used, along with the SHA-256 hash algorithm <xref target="RFC6234" fo | ||||
| rmat="default"/>, | ||||
| and that the output is truncated to the leftmost 160 bits (20 octets). | and that the output is truncated to the leftmost 160 bits (20 octets). | |||
| Due to option space limitations, the HMAC included in | Due to option space limitations, the HMAC included in | |||
| the SYN/ACK is truncated to the leftmost 64 bits, but this is | the SYN/ACK is truncated to the leftmost 64 bits, but this is | |||
| acceptable since random numbers are used; thus, an attacker | acceptable, since random numbers are used; thus, an attacker | |||
| only has one chance to correctly guess the HMAC that matches the random | only has one chance to correctly guess the HMAC that matches the random | |||
| number previously sent by the peer (if the HMAC is | number previously sent by the peer (if the HMAC is | |||
| incorrect, the TCP connection is closed, so a new MP_JOIN negotiation | incorrect, the TCP connection is closed, so a new MP_JOIN negotiation | |||
| with a new random number is required).</t> | with a new random number is required).</t> | |||
| <t>The initiator's authentication information is sent in its | <t>The initiator's authentication information is sent in its | |||
| first ACK (the third packet of the handshake), as shown in | first ACK (the third packet of the handshake), as shown in | |||
| <xref target="tcpm_join3"/>. This data needs to be sent reliably, | <xref target="tcpm_join3" format="default"/>. This data needs to be sent reliably, | |||
| since it is the only time this HMAC is sent; | since it is the only time this HMAC is sent; | |||
| therefore, receipt of this packet MUST trigger a regular TCP ACK | therefore, receipt of this packet <bcp14>MUST</bcp14> trigger a regular | |||
| in response, and the packet MUST be retransmitted if this | TCP ACK | |||
| in response, and the packet <bcp14>MUST</bcp14> be retransmitted if this | ||||
| ACK is not received. In other words, sending the ACK/MP_JOIN | ACK is not received. In other words, sending the ACK/MP_JOIN | |||
| packet places the subflow in the PRE_ESTABLISHED state, and it | packet places the subflow in the PRE_ESTABLISHED state, and it | |||
| moves to the ESTABLISHED state only on receipt of an ACK from | moves to the ESTABLISHED state only on receipt of an ACK from | |||
| the receiver. It is not permitted to send data while in the | the receiver. It is not permissible to send data while in the | |||
| PRE_ESTABLISHED state. The reserved bits in this option MUST be set | PRE_ESTABLISHED state. The reserved bits in this option <bcp14>MUST</bcp | |||
| to zero by the sender.</t> | 14> be set | |||
| to 0 by the sender.</t> | ||||
| <t>The key for the HMAC algorithm, in the case of the message transmitte | <figure anchor="tcpm_join3"> | |||
| d by Host A, will be Key-A followed by Key-B, and in the case of Host B, Key-B f | <name>Join Connection (MP_JOIN) Option (for Initiator's Firs | |||
| ollowed by Key-A. These are the keys that were exchanged in the original MP_CAPA | t ACK)</name> | |||
| BLE handshake. The "message" for the HMAC algorithm in each case is the concaten | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| ations of random number for each host (denoted by R): for Host A, R-A followed b | 1 2 3 | |||
| y R-B; and for Host B, R-B followed by R-A.</t> | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
| +---------------+---------------+-------+-----------------------+ | ||||
| <?rfc needLines='10'?> | | Kind | Length = 24 |Subtype| (reserved) | | |||
| <figure align="center" anchor="tcpm_join2" title="Join Connection (MP_JO | +---------------+---------------+-------+-----------------------+ | |||
| IN) Option (for Responding SYN/ACK)"> | | | | |||
| <artwork align="left"><![CDATA[ | | | | |||
| 1 2 3 | | Sender's Truncated HMAC (160 bits) | | |||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | | | | |||
| +---------------+---------------+-------+-----+-+---------------+ | | | | |||
| | Kind | Length = 16 |Subtype|(rsv)|B| Address ID | | +---------------------------------------------------------------+ ]]></artwork | |||
| +---------------+---------------+-------+-----+-+---------------+ | > | |||
| | | | ||||
| | Sender's Truncated HMAC (64 bits) | | ||||
| | | | ||||
| +---------------------------------------------------------------+ | ||||
| | Sender's Random Number (32 bits) | | ||||
| +---------------------------------------------------------------+ | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| <?rfc needLines='12'?> | ||||
| <figure align="center" anchor="tcpm_join3" title="Join Connection (MP_JO | ||||
| IN) Option (for Third ACK)"> | ||||
| <artwork align="left"><![CDATA[ | ||||
| 1 2 3 | ||||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||||
| +---------------+---------------+-------+-----------------------+ | ||||
| | Kind | Length = 24 |Subtype| (reserved) | | ||||
| +---------------+---------------+-------+-----------------------+ | ||||
| | | | ||||
| | | | ||||
| | Sender's Truncated HMAC (160 bits) | | ||||
| | | | ||||
| | | | ||||
| +---------------------------------------------------------------+ | ||||
| ]]></artwork> | ||||
| </figure> | </figure> | |||
| <t>The key for the HMAC algorithm, in the case of the message | ||||
| transmitted by Host A, will be Key-A followed by Key-B; and in the | ||||
| case of Host B, Key-B followed by Key-A. These are the keys that were | ||||
| exchanged in the original MP_CAPABLE handshake. The "message" for the | ||||
| | HMAC algorithm in each case is the concatenation of random numbers for | |||
| each host (denoted by R): for Host A, R-A followed by R-B; and for | ||||
| Host B, R-B followed by R-A.</t> | ||||
| <t>These various MPTCP options fit together to enable authenticated subf | ||||
| low setup as illustrated in <xref target="fig_tokens" format="default"/>.</t> | ||||
| <figure anchor="fig_tokens"> | ||||
| <name>Example Use of MPTCP Authentication</name> | ||||
| <artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
| Host A Host B | ||||
| ------------------------ ---------- | ||||
| Address A1 Address A2 Address B1 | ||||
| ---------- ---------- ---------- | ||||
| | | | | ||||
| | | SYN + MP_CAPABLE | | ||||
| |--------------------------------------------->| | ||||
| |<---------------------------------------------| | ||||
| | SYN/ACK + MP_CAPABLE(Key-B) | | ||||
| | | | | ||||
| | ACK + MP_CAPABLE(Key-A, Key-B) | | ||||
| |--------------------------------------------->| | ||||
| | | | | ||||
| | | SYN + MP_JOIN(Token-B, R-A) | | ||||
| | |------------------------------->| | ||||
| | |<-------------------------------| | ||||
| | | SYN/ACK + MP_JOIN(HMAC-B, R-B) | | ||||
| | | | | ||||
| | | ACK + MP_JOIN(HMAC-A) | | ||||
| | |------------------------------->| | ||||
| | |<-------------------------------| | ||||
| | | ACK | | ||||
| <t>These various MPTCP options fit together to enable authenticated subf | HMAC-A = HMAC(Key=(Key-A + Key-B), Msg=(R-A + R-B)) | |||
| low setup as illustrated in <xref target="fig_tokens"/>.</t> | HMAC-B = HMAC(Key=(Key-B + Key-A), Msg=(R-B + R-A)) ]]></artwork> | |||
| <?rfc needLines='24'?> | ||||
| <figure align="center" anchor="fig_tokens" title="Example Use of MPTCP A | ||||
| uthentication"> | ||||
| <artwork align="left"><![CDATA[ | ||||
| Host A Host B | ||||
| ------------------------ ---------- | ||||
| Address A1 Address A2 Address B1 | ||||
| ---------- ---------- ---------- | ||||
| | | | | ||||
| | | SYN + MP_CAPABLE | | ||||
| |--------------------------------------------->| | ||||
| |<---------------------------------------------| | ||||
| | SYN/ACK + MP_CAPABLE(Key-B) | | ||||
| | | | | ||||
| | ACK + MP_CAPABLE(Key-A, Key-B) | | ||||
| |--------------------------------------------->| | ||||
| | | | | ||||
| | | SYN + MP_JOIN(Token-B, R-A) | | ||||
| | |------------------------------->| | ||||
| | |<-------------------------------| | ||||
| | | SYN/ACK + MP_JOIN(HMAC-B, R-B) | | ||||
| | | | | ||||
| | | ACK + MP_JOIN(HMAC-A) | | ||||
| | |------------------------------->| | ||||
| | |<-------------------------------| | ||||
| | | ACK | | ||||
| HMAC-A = HMAC(Key=(Key-A+Key-B), Msg=(R-A+R-B)) | ||||
| HMAC-B = HMAC(Key=(Key-B+Key-A), Msg=(R-B+R-A)) | ||||
| ]]></artwork> | ||||
| </figure> | </figure> | |||
| <t>If the token received at Host B is unknown or local policy | <t>If the token received at Host B is unknown or local policy | |||
| prohibits the acceptance of the new subflow, the recipient MUST | prohibits the acceptance of the new subflow, the recipient <bcp14>MUST</ | |||
| respond with a TCP RST for the subflow. If appropriate, a MP_TCPRST | bcp14> | |||
| option with a "Administratively prohibited" reason code | respond with a TCP RST for the subflow. If appropriate, an MP_TCPRST | |||
| (<xref target="sec_reset"/>) should be included.</t> | option with an "Administratively prohibited" reason code | |||
| (<xref target="sec_reset" format="default"/>) should be included.</t> | ||||
| <t>If the token is accepted at Host B, but the HMAC returned to | <t>If the token is accepted at Host B but the HMAC returned to | |||
| Host A does not match the one expected, Host A MUST close the | Host A does not match the one expected, Host A <bcp14>MUST</bcp14> close | |||
| subflow with a TCP RST. In this, and all following cases of sending | the | |||
| a RST in this section, the sender SHOULD send a MP_TCPRST option | subflow with a TCP RST. In this and all subsequent cases of sending | |||
| (<xref target="sec_reset"/>) on this RST packet with the reason | a RST as described in this section, the sender <bcp14>SHOULD</bcp14> sen | |||
| code for a "MPTCP specific error".</t> | d an MP_TCPRST option | |||
| (<xref target="sec_reset" format="default"/>) on this RST packet with th | ||||
| <t>If Host B does not receive the expected HMAC, or the MP_JOIN | e reason | |||
| option is missing from the ACK, it MUST close the subflow with a | code for an "MPTCP-specific error".</t> | |||
| <t>If Host B does not receive the expected HMAC or the MP_JOIN | ||||
| option is missing from the ACK, it <bcp14>MUST</bcp14> close the subflow | ||||
| with a | ||||
| TCP RST.</t> | TCP RST.</t> | |||
| <t>If the HMACs are verified as correct, then both hosts have | <t>If the HMACs are verified as correct, then both hosts have | |||
| verified each other as being the same peers as existed at | verified each other as being the same peers as those that existed at | |||
| the start of the connection, and they have agreed on which | the start of the connection, and they have agreed on which | |||
| connection this subflow will become a part of.</t> | connection this subflow will become a part of.</t> | |||
| <t>If the SYN/ACK as received at Host A does not have an MP_JOIN | <t>If the SYN/ACK as received at Host A does not have an MP_JOIN | |||
| option, Host A MUST close the subflow with a TCP RST.</t> | option, Host A <bcp14>MUST</bcp14> close the subflow with a TCP RST.</t> | |||
| <t>This covers all cases of the loss of an MP_JOIN. In more detail, | <t>This covers all cases of the loss of an MP_JOIN. In more detail, | |||
| if MP_JOIN is stripped from the SYN on the path from A to | if an MP_JOIN is stripped from the SYN on the path from A to | |||
| B, and Host B does not have a listener on the relevant | B and Host B does not have a listener on the relevant | |||
| port, it will respond with a RST in the normal way. If in | port, it will respond with a RST in the normal way. If in | |||
| response to a SYN with an MP_JOIN option, a SYN/ACK is | response to a SYN with an MP_JOIN option a SYN/ACK is | |||
| received without the MP_JOIN option (either since it was | received without the MP_JOIN option (because it was either | |||
| stripped on the return path, or it was stripped on the | stripped on the return path, or stripped on the | |||
| outgoing path but Host B responded as if | outgoing path leading to Host B responding as if | |||
| it were a new regular TCP session), then the subflow is | it was a new regular TCP session), then the subflow is | |||
| unusable and Host A MUST close it with a RST.</t> | unusable and Host A <bcp14>MUST</bcp14> close it with a RST.</t> | |||
| <t>Note that additional subflows can be created | <t>Note that additional subflows can be created | |||
| between any pair of ports (but see <xref target="heuristics"/> for | between any pair of ports (but see <xref target="heuristics" format="default"/> for | |||
| heuristics); no explicit application-level accept calls or | heuristics); no explicit application-level accept calls or | |||
| bind calls are required to open additional subflows. To | bind calls are required to open additional subflows. To | |||
| associate a new subflow with an existing connection, the token | associate a new subflow with an existing connection, the token | |||
| supplied in the subflow's SYN exchange is used for | supplied in the subflow's SYN exchange is used for | |||
| demultiplexing. This then binds the 5-tuple of the TCP | demultiplexing. This then binds the 5-tuple of the TCP | |||
| subflow to the local token of the connection. A consequence is | subflow to the local token of the connection. One consequence is | |||
| that it is possible to allow any port pairs to be used for a | that it is possible to allow any port pairs to be used for a | |||
| connection. </t> | connection. </t> | |||
| <t>Demultiplexing subflow SYNs <bcp14>MUST</bcp14> be done using the tok | ||||
| <t>Demultiplexing subflow SYNs MUST be done using the token; | en; | |||
| this is unlike traditional TCP, where the destination port is | this is unlike traditional TCP, where the destination port is | |||
| used for demultiplexing SYN packets. Once a subflow is set up, | used for demultiplexing SYN packets. Once a subflow is set up, | |||
| demultiplexing packets is done using the 5-tuple, as in | demultiplexing packets is done using the 5-tuple, as in | |||
| traditional TCP. The 5-tuples will be mapped to the local | traditional TCP. The 5-tuples will be mapped to the local | |||
| connection identifier (token). Note that Host A will know its | connection identifier (token). Note that Host A will know its | |||
| local token for the subflow even though it is not sent on the | local token for the subflow even though it is not sent on the | |||
| wire -- only the responder's token is sent.</t> | wire -- only the responder's token is sent.</t> | |||
| </section> | </section> | |||
| <section anchor="sec_generalop" numbered="true" toc="default"> | ||||
| <section title="General MPTCP Operation" anchor="sec_generalop"> | <name>MPTCP Operation and Data Transfer</name> | |||
| <t>This section discusses operation of MPTCP for data transfer. At a hig | <t>This section discusses the operation of MPTCP for data transfer. At a | |||
| h level, an MPTCP implementation will take one input data stream from an applica | high level, an MPTCP implementation will take one input data stream from an app | |||
| tion, and split it into one or more subflows, with sufficient control informatio | lication and split it into one or more subflows, with sufficient control informa | |||
| n to allow it to be reassembled and delivered reliably and in order to the recip | tion to allow it to be reassembled and delivered reliably and in order to the re | |||
| ient application. The following subsections define this behavior in detail.</t> | cipient application. The following subsections define this behavior in detail.</ | |||
| t> | ||||
| <t>The data sequence mapping and the Data ACK are signaled in the Data S | <t>The Data Sequence Mapping and the Data ACK are signaled in the DSS op | |||
| equence Signal (DSS) option (<xref target="tcpm_dsn"/>). Either or both can be s | tion (<xref target="tcpm_dsn" format="default"/>). Either or both can be signale | |||
| ignaled in one DSS, depending on the flags set. The data sequence mapping define | d in one DSS, depending on the flags set. The Data Sequence Mapping defines how | |||
| s how the sequence space on the subflow maps to the connection level, and the Da | the sequence space on the subflow maps to the connection level, and the Data ACK | |||
| ta ACK acknowledges receipt of data at the connection level. These functions are | acknowledges receipt of data at the connection level. These functions are descr | |||
| described in more detail in the following two subsections.</t> | ibed in more detail in the following two subsections.</t> | |||
| <figure anchor="tcpm_dsn"> | ||||
| <?rfc needLines='18'?> | <name>Data Sequence Signal (DSS) Option</name> | |||
| <figure align="center" anchor="tcpm_dsn" title="Data Sequence Signal (DS | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| S) Option"> | ||||
| <artwork align="left"><![CDATA[ | ||||
| 1 2 3 | 1 2 3 | |||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
| +---------------+---------------+-------+----------------------+ | +---------------+---------------+-------+----------------------+ | |||
| | Kind | Length |Subtype| (reserved) |F|m|M|a|A| | | Kind | Length |Subtype| (reserved) |F|m|M|a|A| | |||
| +---------------+---------------+-------+----------------------+ | +---------------+---------------+-------+----------------------+ | |||
| | Data ACK (4 or 8 octets, depending on flags) | | | Data ACK (4 or 8 octets, depending on flags) | | |||
| +--------------------------------------------------------------+ | +--------------------------------------------------------------+ | |||
| | Data sequence number (4 or 8 octets, depending on flags) | | | Data Sequence Number (4 or 8 octets, depending on flags) | | |||
| +--------------------------------------------------------------+ | +--------------------------------------------------------------+ | |||
| | Subflow Sequence Number (4 octets) | | | Subflow Sequence Number (4 octets) | | |||
| +-------------------------------+------------------------------+ | +-------------------------------+------------------------------+ | |||
| | Data-Level Length (2 octets) | Checksum (2 octets) | | | Data-Level Length (2 octets) | Checksum (2 octets) | | |||
| +-------------------------------+------------------------------+ | +-------------------------------+------------------------------+ ]]></artwork> | |||
| ]]></artwork> | ||||
| </figure> | </figure> | |||
| <t>The flags, when set, define the contents of this option, as follows: | <t>The flags, when set, define the contents of this option, as follows: | |||
| <list style="symbols"> | </t> | |||
| <t>A = Data ACK present</t> | <ul spacing="normal"> | |||
| <t>a = Data ACK is 8 octets (if not set, Data ACK is 4 octets)</t> | <li>A = Data ACK present</li> | |||
| <t>M = Data Sequence Number (DSN), Subflow Sequence Number (SSN), Da | <li>a = Data ACK is 8 octets (if not set, Data ACK is 4 octets)</li> | |||
| ta-Level Length, and Checksum (if negotiated) present</t> | <li>M = Data Sequence Number (DSN), Subflow Sequence Number (SSN), Dat | |||
| <t>m = Data sequence number is 8 octets (if not set, DSN is 4 octets | a-Level Length, and Checksum (if negotiated) present</li> | |||
| )</t> | <li>m = Data Sequence Number is 8 octets (if not set, DSN is 4 octets) | |||
| </list> | </li> | |||
| </ul> | ||||
| The flags 'a' and 'm' only have meaning if the corresponding 'A' or 'M' | <t> | |||
| flags are set; otherwise, they will be ignored. The maximum length of this optio | ||||
| n, with all flags set, is 28 octets.</t> | ||||
| <t>The 'F' flag indicates "Data FIN". If present, this means that this m | ||||
| apping covers the final data from the sender. This is the connection-level equiv | ||||
| alent to the FIN flag in single-path TCP. A connection is not closed unless ther | ||||
| e has been a Data FIN exchange, a MP_FASTCLOSE (<xref target="sec_fastclose"/>) | ||||
| message, or an implementation-specific, connection-level send timeout. The purpo | ||||
| se of the Data FIN and the interactions between this flag, the subflow-level FIN | ||||
| flag, and the data sequence mapping are described in <xref target="sec_close"/> | ||||
| . | ||||
| The remaining reserved bits MUST be set to zero by an implementation of | ||||
| this specification.</t> | ||||
| <t>Note that the checksum is only present in this option if the use of M | ||||
| PTCP checksumming has been negotiated at the MP_CAPABLE handshake (see <xref tar | ||||
| get="sec_init"/>). The presence of the checksum can be inferred from the length | ||||
| of the option. If a checksum is present, but its use had not been negotiated in | ||||
| the MP_CAPABLE handshake, the receiver MUST close the subflow with a RST as it n | ||||
| ot behaving as negotiated. If a checksum is not present when its use has been ne | ||||
| gotiated, the receiver MUST close the subflow with a RST as it is considered bro | ||||
| ken. In both cases, this RST SHOULD be accompanied with a MP_TCPRST option (<xre | ||||
| f target="sec_reset"/>) with the reason code for a "MPTCP specific error".</t> | ||||
| <section title="Data Sequence Mapping" anchor="sec_dsn"> | ||||
| <t>The data stream as a whole can be reassembled through the use of th | ||||
| e data sequence mapping components of the DSS option (<xref target="tcpm_dsn"/>) | ||||
| , which define the | ||||
| mapping from the subflow sequence number to the data sequence number. This is us | ||||
| ed by the receiver to ensure in-order delivery to the application layer. Meanwhi | ||||
| le, the subflow-level sequence numbers (i.e., the regular sequence numbers in th | ||||
| e TCP header) have subflow-only relevance. It is expected (but not mandated) tha | ||||
| t SACK <xref target='RFC2018'/> is used at the subflow level to improve efficien | ||||
| cy.</t> | ||||
| <t>The data sequence mapping specifies a mapping from subflow sequence s | ||||
| pace to data sequence space. This is expressed in terms of starting sequence num | ||||
| bers for the subflow and the data level, and a length of bytes for which this ma | ||||
| pping is valid. | ||||
| This explicit mapping for a range of data was chosen rather than per-packet sign | ||||
| aling to assist with compatibility with situations where TCP/IP segmentation or | ||||
| coalescing is undertaken separately from the stack that is generating the data f | ||||
| low (e.g., through the use of TCP segmentation offloading on network interface c | ||||
| ards, or by middleboxes such as performance enhancing proxies). It also allows a | ||||
| single mapping to cover many packets, which may be useful in bulk transfer situ | ||||
| ations.</t> | ||||
| <t>A mapping is fixed, in that the subflow sequence number is bound to t | ||||
| he data sequence number after the mapping has been processed. A sender MUST NOT | ||||
| change this mapping | ||||
| after it has been declared; however, the same data sequence number can be mapped | ||||
| to by different subflows for retransmission purposes (see <xref target="sec_ret | ||||
| ransmit"/>). This would also permit the same data to be sent simultaneously on m | ||||
| ultiple subflows for resilience or efficiency purposes, especially in the case o | ||||
| f lossy links. Although the detailed specification of such operation is outside | ||||
| the scope of this document, an implementation SHOULD treat the first data that i | ||||
| s received at a subflow for the data sequence space as that which should be deli | ||||
| vered to the application, and any later data for that sequence space SHOULD be i | ||||
| gnored.</t> | ||||
| <t>The data sequence number is specified as an absolute value, whereas t | ||||
| he subflow sequence numbering is relative (the SYN at the start of the subflow h | ||||
| as relative subflow sequence number 0). This is to allow middleboxes to change t | ||||
| he initial sequence number of a subflow, such as firewalls that undertake Initia | ||||
| l Sequence Number (ISN) randomization.</t> | ||||
| <t>The data sequence mapping also contains a checksum of the data that t | ||||
| his mapping covers, if use of checksums has been negotiated at the MP_CAPABLE ex | ||||
| change. Checksums are used to detect if the payload has been adjusted in any way | ||||
| by a non-MPTCP-aware middlebox. If this checksum fails, it will trigger a failu | ||||
| re of the subflow, or a fallback to regular TCP, as documented in <xref target=" | ||||
| sec_fallback"/>, since MPTCP can no longer reliably know the subflow sequence sp | ||||
| ace at the receiver to build data sequence mappings. Without checksumming enable | ||||
| d, corrupt data may be delivered to the application if a middlebox alters segmen | ||||
| t boundaries, alters content, or does not deliver all segments covered by a data | ||||
| sequence mapping. It is therefore RECOMMENDED to use checksumming unless it is | ||||
| known the network path contains no such devices.</t> | ||||
| <t>The checksum algorithm used is the standard TCP checksum <xref target | ||||
| ="RFC0793"/>, operating over the data covered by this mapping, along with a pseu | ||||
| do-header as shown in <xref target="fig_pseudo"/>.</t> | ||||
| <?rfc needLines='18'?> | The flags "a" and "m" only have meaning if the corresponding "A" or "M" | |||
| <figure align="center" anchor="fig_pseudo" title="Pseudo-Header for DSS | flags are set; otherwise, they will be ignored. The maximum length of this optio | |||
| Checksum"> | n, with all flags set, is 28 octets.</t> | |||
| <artwork align="left"><![CDATA[ | <t>The "F" flag indicates "Data FIN". If present, this means that this | |||
| mapping covers the final data from the sender. This is the | ||||
| connection-level equivalent of the FIN flag in single-path TCP. A connec | ||||
| tion is not closed unless there has been a Data FIN exchange, an MP_FASTCLOSE (< | ||||
| xref target="sec_fastclose" format="default"/>) message, or an implementation-sp | ||||
| ecific connection-level send timeout. The purpose of the Data FIN and the intera | ||||
| ctions between this flag, the subflow-level FIN flag, and the Data Sequence Mapp | ||||
| ing are described in <xref target="sec_close" format="default"/>. | ||||
| The remaining reserved bits <bcp14>MUST</bcp14> be set to 0 by an implem | ||||
| entation of this specification.</t> | ||||
| <t>Note that the checksum is only present in this option if the use of | ||||
| MPTCP checksumming has been negotiated at the MP_CAPABLE handshake | ||||
| (see <xref target="sec_init" format="default"/>). The presence of the | ||||
| checksum can be inferred from the length of the option. If a checksum | ||||
| is present but its use had not been negotiated in the MP_CAPABLE | ||||
| handshake, the receiver <bcp14>MUST</bcp14> close the subflow with a | ||||
| RST, as it is not behaving as negotiated. If a checksum is not present w | ||||
| hen its use has been negotiated, the receiver <bcp14>MUST</bcp14> close the subf | ||||
| low with a RST, as it is considered broken. In both cases, this RST <bcp14>SHOUL | ||||
| D</bcp14> be accompanied by an MP_TCPRST option (<xref target="sec_reset" format | ||||
| ="default"/>) with the reason code for an "MPTCP-specific error".</t> | ||||
| <section anchor="sec_dsn" numbered="true" toc="default"> | ||||
| <name>Data Sequence Mapping</name> | ||||
| <t>The data stream as a whole can be reassembled through the use of th | ||||
| e Data Sequence Mapping components of the DSS option (<xref target="tcpm_dsn" fo | ||||
| rmat="default"/>), which define the | ||||
| mapping from the subflow sequence number to the data sequence number. This is | ||||
| used by the receiver to ensure in-order delivery to the application | ||||
| layer. Meanwhile, the subflow-level sequence numbers (i.e., the | ||||
| regular sequence numbers in the TCP header) are only relevant to the s | ||||
| ubflow. It is expected (but not mandated) that SACK <xref | ||||
| target="RFC2018" format="default"/> will be used at the subflow level | ||||
| to improve efficiency.</t> | ||||
| <t>The Data Sequence Mapping specifies a mapping from the subflow | ||||
| sequence space to the data sequence space. This is expressed in terms | ||||
| of starting sequence numbers for the subflow and the data level, and a length of | ||||
| bytes for which this mapping is valid. | ||||
| This explicit mapping for a range of data, rather than per‑packet signalin | ||||
| g, was chosen to assist with compatibility with | ||||
| situations where TCP/IP segmentation or coalescing is undertaken | ||||
| separately from the stack that is generating the data flow (e.g., | ||||
| through the use of TCP segmentation offloading on network interface | ||||
| cards, or by middleboxes such as Performance Enhancing Proxies | ||||
| (PEPs) <xref target="RFC3135" format="default"/>). It | ||||
| also allows a single mapping to cover many packets; this may be useful | ||||
| in bulk‑transfer situations.</t> | ||||
| <t>A mapping is fixed, in that the subflow sequence number is bound to | ||||
| the data sequence number after the mapping has been processed. A sender <bcp14> | ||||
| MUST NOT</bcp14> change this mapping | ||||
| after it has been declared; however, the same data sequence number can be | ||||
| mapped to by different subflows for retransmission purposes (see | ||||
| <xref target="sec_retransmit" format="default"/>). This would also | ||||
| permit the same data to be sent simultaneously on multiple subflows | ||||
| for resilience or efficiency purposes, especially in the case of | ||||
| lossy links. Although the detailed specification of such operation | ||||
| is outside the scope of this document, an implementation | ||||
| <bcp14>SHOULD</bcp14> treat the first data that is received at a | ||||
| subflow for the data sequence space as the data that should be deliver | ||||
| ed to the application, and any subsequent data for that sequence space <bcp14>SH | ||||
| OULD</bcp14> be ignored.</t> | ||||
| <t>The data sequence number is specified as an absolute value, | ||||
| whereas the subflow sequence numbering is relative (the SYN at the | ||||
| start of the subflow has a relative subflow sequence number of | ||||
| 0). This is done to allow middleboxes to change the Initial Sequence | ||||
| Number (ISN) of a subflow, such as firewalls that undertake ISN random | ||||
| ization.</t> | ||||
| <t>The Data Sequence Mapping also contains a checksum of the data | ||||
| that this mapping covers, if the use of checksums has been negotiated | ||||
| at | ||||
| the MP_CAPABLE exchange. Checksums are used to detect if the payload | ||||
| has been adjusted in any way by a non-MPTCP-aware middlebox. If this | ||||
| checksum fails, it will trigger a failure of the subflow, or a | ||||
| fallback to regular TCP, as documented in <xref | ||||
| target="sec_fallback" format="default"/>, since MPTCP can no longer | ||||
| reliably know the subflow sequence space at the receiver to build | ||||
| Data Sequence Mappings. Without checksumming enabled, corrupt data | ||||
| may be delivered to the application if a middlebox alters segment | ||||
| boundaries, alters content, or does not deliver all segments covered | ||||
| by a Data Sequence Mapping. It is therefore | ||||
| <bcp14>RECOMMENDED</bcp14> that checksumming be used, unless it is kno | ||||
| wn | ||||
| that the network path contains no such devices.</t> | ||||
| <t>The checksum algorithm used is the standard TCP checksum <xref targ | ||||
| et="RFC0793" format="default"/>, operating over the data covered by this mapping | ||||
| , along with a pseudo‑header as shown in <xref target="fig_pseudo" format= | ||||
| "default"/>.</t> | ||||
| <figure anchor="fig_pseudo"> | ||||
| <name>Pseudo-Header for DSS Checksum</name> | ||||
| <artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
| 1 2 3 | 1 2 3 | |||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
| +--------------------------------------------------------------+ | +--------------------------------------------------------------+ | |||
| | | | | | | |||
| | Data Sequence Number (8 octets) | | | Data Sequence Number (8 octets) | | |||
| | | | | | | |||
| +--------------------------------------------------------------+ | +--------------------------------------------------------------+ | |||
| | Subflow Sequence Number (4 octets) | | | Subflow Sequence Number (4 octets) | | |||
| +-------------------------------+------------------------------+ | +-------------------------------+------------------------------+ | |||
| | Data-Level Length (2 octets) | Zeros (2 octets) | | | Data-Level Length (2 octets) | Zeros (2 octets) | | |||
| +-------------------------------+------------------------------+ | +-------------------------------+------------------------------+ ]]></artwork> | |||
| ]]></artwork> | </figure> | |||
| </figure> | <t>Note that the data sequence number used in the pseudo-header is alw | |||
| ays the 64-bit value, irrespective of what length is used in the DSS option itse | ||||
| <t>Note that the data sequence number used in the pseudo-header is alway | lf. The standard TCP checksum algorithm has been chosen, since it will be calcul | |||
| s the 64-bit value, irrespective of what length is used in the DSS option itself | ated anyway for the TCP subflow, and if calculated first over the data before ad | |||
| . The standard TCP checksum algorithm has been chosen since it will be calculate | ding the pseudo-headers, it only needs to be calculated once. Furthermore, since | |||
| d anyway for the TCP subflow, and if calculated first over the data before addin | the TCP checksum is additive, the checksum for a DSN_MAP can be constructed by | |||
| g the pseudo-headers, it only needs to be calculated once. Furthermore, since th | simply adding together the checksums for the data of each constituent TCP segmen | |||
| e TCP checksum is additive, the checksum for a DSN_MAP can be constructed by sim | t and adding the checksum for the DSS pseudo‑header.</t> | |||
| ply adding together the checksums for the data of each constituent TCP segment, | <t>Note that checksumming relies on the TCP subflow containing contigu | |||
| and adding the checksum for the DSS pseudo-header.</t> | ous data; therefore, a TCP subflow <bcp14>MUST NOT</bcp14> use the Urgent Pointe | |||
| r to interrupt an existing mapping. Further note, however, that if Urgent data i | ||||
| <t>Note that checksumming relies on the TCP subflow containing contiguou | s received on a subflow, it <bcp14>SHOULD</bcp14> be mapped to the data sequence | |||
| s data; therefore, a TCP subflow MUST NOT use the Urgent Pointer to interrupt an | space and delivered to the application, analogous to Urgent data in regular TCP | |||
| existing mapping. Further note, however, that if Urgent data is received on a s | .</t> | |||
| ubflow, it SHOULD be mapped to the data sequence space and delivered to the appl | <t>To avoid possible deadlock scenarios, subflow-level | |||
| ication analogous to Urgent data in regular TCP.</t> | processing should be undertaken separately from processing at the | |||
| <t>To avoid possible deadlock scenarios, subflow-level | ||||
| processing should be undertaken separately from that at | ||||
| connection level. Therefore, even if a mapping does not exist | connection level. Therefore, even if a mapping does not exist | |||
| from the subflow space to the data-level space, the data | from the subflow space to the data‑level space, the data | |||
| SHOULD still be ACKed at the subflow (if it is in-window). | <bcp14>SHOULD</bcp14> still be ACKed at the subflow (if it is in-window) | |||
| . | ||||
| This data cannot, however, be acknowledged at the data level | This data cannot, however, be acknowledged at the data level | |||
| (<xref target="sec_dataack"/>) because its data sequence | (<xref target="sec_dataack" format="default"/>) because its data sequenc | |||
| numbers are unknown. Implementations MAY hold onto such | e | |||
| unmapped data for a short while in the expectation that a | numbers are unknown. Implementations <bcp14>MAY</bcp14> hold onto such | |||
| unmapped data for a short while, in the expectation that a | ||||
| mapping will arrive shortly. Such unmapped data cannot be | mapping will arrive shortly. Such unmapped data cannot be | |||
| counted as being within the connection level receive window because this is | counted as being within the connection-level receive window because this is | |||
| relative to the data sequence numbers, so if the receiver runs | relative to the data sequence numbers, so if the receiver runs | |||
| out of memory to hold this data, it will have to be discarded. | out of memory to hold this data, it will have to be discarded. | |||
| If a mapping for that subflow-level sequence space does not | If a mapping for that subflow-level sequence space does not | |||
| arrive within a receive window of data, that subflow SHOULD be | arrive within a receive window of data, that subflow <bcp14>SHOULD</bcp14> be | |||
| treated as broken, closed with a RST, and any unmapped data | treated as broken, closed with a RST, and any unmapped data | |||
| silently discarded.</t> | silently discarded.</t> | |||
| <t>Data sequence numbers are always 64-bit quantities and | ||||
| <t>Data sequence numbers are always 64-bit quantities, and | <bcp14>MUST</bcp14> be maintained as such in implementations. If a | |||
| MUST be maintained as such in implementations. If a | ||||
| connection is progressing at a slow rate, so protection | connection is progressing at a slow rate, so protection | |||
| against wrapped sequence numbers is not required, | against wrapped sequence numbers is not required, | |||
| then an implementation MAY include just the lower 32 | then an implementation <bcp14>MAY</bcp14> include just the lower 32 | |||
| bits of the data sequence number in the data sequence mapping and/or | bits of the data sequence number in the Data Sequence Mapping and&wj;/or | |||
| Data ACK as an optimization, and an implementation can make this choice | Data ACK as an optimization, and an implementation can make this choice | |||
| independently for each packet. An implementation MUST be able to receive | independently for each packet. An implementation <bcp14>MUST</bcp14> be | |||
| and process both 64-bit or 32-bit sequence number values, but it is not | able to receive | |||
| required that an implementation is able to send both.</t> | and process both 64-bit and 32-bit sequence number values, but it is not | |||
| required that an implementation be able to send both.</t> | ||||
| <t>An implementation MUST send the full 64-bit data sequence number | <t>An implementation <bcp14>MUST</bcp14> send the full 64-bit data seq | |||
| uence number | ||||
| if it is transmitting at a sufficiently high rate that the 32-bit value | if it is transmitting at a sufficiently high rate that the 32-bit value | |||
| could wrap within the Maximum Segment Lifetime | could wrap within the Maximum Segment Lifetime | |||
| (MSL) <xref target="RFC7323"/>. The lengths of the DSNs used in these | (MSL) <xref target="RFC7323" format="default"/>. The lengths of the DSNs used in these | |||
| values (which may be different) are declared with flags in the | values (which may be different) are declared with flags in the | |||
| DSS option. Implementations MUST accept a 32-bit DSN and implicitly | DSS option. Implementations <bcp14>MUST</bcp14> accept a 32-bit DSN and implicitly | |||
| promote it to a 64-bit quantity by incrementing the upper 32 | promote it to a 64-bit quantity by incrementing the upper 32 | |||
| bits of sequence number each time the lower 32 | bits of the sequence number each time the lower 32 | |||
| bits wrap. A sanity check MUST be implemented to ensure that | bits wrap. A sanity check <bcp14>MUST</bcp14> be implemented to ensure t | |||
| hat | ||||
| a wrap occurs at an expected time (e.g., the sequence number jumps | a wrap occurs at an expected time (e.g., the sequence number jumps | |||
| from a very high number to a very low number) and is not triggered | from a very high number to a very low number) and is not triggered | |||
| by out-of-order packets.</t> | by out‑of-order packets.</t> | |||
| <t>As with the standard TCP sequence number, the data sequence | ||||
| <t>As with the standard TCP sequence number, the data sequence | ||||
| number should not start at zero, but at a random value to make | number should not start at zero, but at a random value to make | |||
| blind session hijacking harder. This specification requires | blind session hijacking harder. This specification requires | |||
| setting the initial data sequence number (IDSN) of each host to the | setting the IDSN of each host to the | |||
| least significant 64 bits of the SHA-256 hash of the host's key, as | least significant 64 bits of the SHA-256 hash of the host's key, as | |||
| described in <xref target="sec_init"/>. This is required also in | described in <xref target="sec_init" format="default"/>. This is also re | |||
| order for the receiver to know what the expected IDSN is, and thus | quired in | |||
| order for the receiver to know what the expected IDSN is and thus | ||||
| determine if any initial connection-level packets are missing; this | determine if any initial connection-level packets are missing; this | |||
| is particularly relevant if two subflows start transmitting simultaneously.</t> | is particularly relevant if two subflows start transmitting simultaneously.</t> | |||
| <t>The mapping provided by a Data Sequence Mapping MUST apply to | ||||
| <t>A data sequence mapping does not need to be included in | some or all of the subflow sequence space in the TCP segment that | |||
| carries the option. It does not need to be included in | ||||
| every MPTCP packet, as long as the subflow sequence space in | every MPTCP packet, as long as the subflow sequence space in | |||
| that packet is covered by a mapping known at the receiver. This | that packet is covered by a mapping known at the receiver. This | |||
| can be used to reduce overhead in cases where the mapping is | can be used to reduce overhead in cases where the mapping is | |||
| known in advance; one such case is when there is a single | known in advance. One such case is when there is a single | |||
| subflow between the hosts, another is when segments of | subflow between the hosts, and another is when segments of | |||
| data are scheduled in larger than packet-sized chunks.</t> | data are scheduled in larger-than-packet-sized chunks.</t> | |||
| <t>An "infinite" mapping can be used to fall back to regular TCP by | ||||
| <t>An "infinite" mapping can be used to fall back to regular TCP by | ||||
| mapping the subflow-level data to the connection-level data | mapping the subflow-level data to the connection-level data | |||
| for the remainder of the connection (see | for the remainder of the connection (see | |||
| <xref target="sec_fallback"/>). This is achieved by setting | <xref target="sec_fallback" format="default"/>). This is achieved by setting | |||
| the Data-Level Length field of the DSS option to the reserved value of 0. The | the Data-Level Length field of the DSS option to the reserved value of 0. The | |||
| checksum, in such a case, will also be set to zero.</t> | checksum, in such a case, will also be set to 0.</t> | |||
| </section> | </section> | |||
| <section anchor="sec_dataack" numbered="true" toc="default"> | ||||
| <section title="Data Acknowledgments" anchor="sec_dataack"> | <name>Data Acknowledgments</name> | |||
| <t>To provide full end-to-end resilience, MPTCP provides a | <t>To provide full end-to-end resilience, MPTCP provides a | |||
| connection-level acknowledgment, to act as a cumulative ACK for | connection-level acknowledgment, to act as a cumulative ACK for | |||
| the connection as a whole. This is the "Data ACK" field of | the connection as a whole. This is done via the "Data ACK" field of | |||
| the DSS option (<xref target="tcpm_dsn"/>). The Data ACK | the DSS option (<xref target="tcpm_dsn" format="default"/>). The Data AC | |||
| K | ||||
| is analogous to the behavior | is analogous to the behavior | |||
| of the standard TCP cumulative ACK -- indicating | of the standard TCP cumulative ACK -- indicating | |||
| how much data has been successfully received (with no | how much data has been successfully received (with no | |||
| holes). This is in comparison to the subflow-level ACK, which | holes). This can be compared to the subflow-level ACK, which | |||
| acts analogous to TCP SACK, given that there may still be | acts in a fashion analogous to TCP SACK, given that there may still be | |||
| holes in the data stream at the connection level. | holes in the data stream at the connection level. | |||
| The Data ACK specifies the next data sequence number | The Data ACK specifies the next data sequence number | |||
| it expects to receive.</t> | it expects to receive.</t> | |||
| <t>The Data ACK, as for the DSN, can be sent as the full 64-bit | ||||
| <t>The Data ACK, as for the DSN, can be sent as the full 64-bit | value or as the lower 32 bits. If data is received with a 64-bit DSN, | |||
| value, or as the lower 32 bits. If data is received with a 64-bit DSN, | it <bcp14>MUST</bcp14> be acknowledged with a 64-bit Data ACK. If the D | |||
| it MUST be acknowledged with a 64-bit Data ACK. If the DSN received | SN received | |||
| is 32 bits, an implementation can choose whether to send a 32-bit or | is 32 bits, an implementation can choose whether to send a 32-bit o | |||
| 64-bit Data ACK, and an implementation MUST accept either in this situat | r | |||
| ion.</t> | 64-bit Data ACK, and an implementation <bcp14>MUST</bcp14> accept either | |||
| in this situation.</t> | ||||
| <t>The Data ACK proves that the data, and all required MPTCP | <t>The Data ACK proves that the data, and all required MPTCP | |||
| signaling, has been received and accepted by the remote end. | signaling, have been received and accepted by the remote end. | |||
| One key use of the Data ACK signal is that it is used to indicate | One key use of the Data ACK signal is that it is used to indicate | |||
| the left edge of the advertised receive window. As explained in | the left edge of the advertised receive window. As explained in | |||
| <xref target="sec_rwin"/>, the receive window is shared by all | <xref target="sec_rwin" format="default"/>, the receive window is shared by all | |||
| subflows and is relative to the Data ACK. Because of this, an | subflows and is relative to the Data ACK. Because of this, an | |||
| implementation MUST NOT use the RCV.WND field of a TCP segment | implementation <bcp14>MUST NOT</bcp14> use the RCV.WND field of a TCP segment | |||
| at the connection level if it does not also carry a DSS option with | at the connection level if it does not also carry a DSS option with | |||
| a Data ACK field. Furthermore, | a Data ACK field. Furthermore, | |||
| separating the connection-level acknowledgments from the | separating the connection-level acknowledgments from the | |||
| subflow level allows processing to be done separately, and | subflow level allows processing to be done separately, and | |||
| a receiver has the freedom to drop segments after acknowledgment | a receiver has the freedom to drop segments after acknowledgment | |||
| at the subflow level, for example, due to memory constraints | at the subflow level -- for example, due to memory constraints | |||
| when many segments arrive out of order.</t> | when many segments arrive out of order.</t> | |||
| <t>An MPTCP sender <bcp14>MUST NOT</bcp14> free data from the send buf | ||||
| <t>An MPTCP sender MUST NOT free data from the send buffer until | fer until | |||
| it has been acknowledged by both a Data ACK received on any subflow | it has been acknowledged by both a Data ACK received on any subflow | |||
| and at the subflow level by all subflows on which the data was sent. | and at the subflow level by all subflows on which the data was sent. | |||
| The former condition ensures liveness of the | The former condition ensures liveness of the | |||
| connection and the latter condition ensures liveness and | connection, and the latter condition ensures liveness and | |||
| self-consistence of a subflow when data needs to be | self-consistence of a subflow when data needs to be | |||
| retransmitted. | retransmitted. | |||
| Note, however, that if some data needs to be retransmitted multiple | Note, however, that if some data needs to be retransmitted multiple | |||
| times over a subflow, there is a risk of blocking the sending | times over a subflow, there is a risk of blocking the send | |||
| window. In this case, the MPTCP sender can decide to terminate the | window. In this case, the MPTCP sender can decide to terminate the | |||
| subflow that is behaving badly by sending a RST, using an appropriate | subflow that is behaving badly by sending a RST, using an appropriate | |||
| MP_TCPRST (<xref target="sec_reset"/>) error code.</t> | MP_TCPRST (<xref target="sec_reset" format="default"/>) error code.</t> | |||
| <t>The Data ACK <bcp14>MAY</bcp14> be included in all segments; howeve | ||||
| <t>The Data ACK MAY be included in all segments; however, optimizations | r, optimizations | |||
| SHOULD be considered in more advanced implementations, where the | <bcp14>SHOULD</bcp14> be considered in more advanced implementations, wh | |||
| ere the | ||||
| Data ACK is present in segments | Data ACK is present in segments | |||
| only when the Data ACK value advances, and this behavior MUST | only when the Data ACK value advances, and this behavior <bcp14>MUST</bc | |||
| be treated as valid. This behavior ensures the sender buffer | p14> | |||
| be treated as valid. This behavior ensures that the send buffer | ||||
| is freed, while reducing overhead when the data transfer is | is freed, while reducing overhead when the data transfer is | |||
| unidirectional.</t> | unidirectional.</t> | |||
| </section> | </section> | |||
| <section anchor="sec_close" numbered="true" toc="default"> | ||||
| <section title="Closing a Connection" anchor="sec_close"> | <name>Closing a Connection</name> | |||
| <t>In regular TCP, a FIN announces the receiver that the sender has no m | <t>In regular TCP, a FIN announces to the receiver that the sender has | |||
| ore data to send. | no more data to send. | |||
| In order to allow subflows to operate independently and to keep the appearance of TCP over the wire, | In order to allow subflows to operate independently and to keep the appearance of TCP over the wire, | |||
| a FIN in MPTCP only affects the subflow on which it is sent. This | a FIN in MPTCP only affects the subflow on which it is sent. This | |||
| allows nodes to exercise considerable freedom over which paths are in use at any one time. | allows nodes to exercise considerable freedom over which paths are in use at any one time. | |||
| The semantics of a FIN remain as for regular TCP; i.e., it is not until both sides have ACKed | The semantics of a FIN remain as for regular TCP; i.e., it is not until both sides have ACKed | |||
| each other's FINs that the subflow is fully closed.</t> | each other's FINs that the subflow is fully closed.</t> | |||
| <t>When an application calls close() on a socket, this indicates that it has no more | <t>When an application calls close() on a socket, this indicates that it has no more | |||
| data to send; for regular TCP, this would result in a FIN on the connection. For MPTCP, an | data to send; for regular TCP, this would result in a FIN on the connection. For MPTCP, an | |||
| equivalent mechanism is needed, and this is referred to as the DATA_FIN.</t> | equivalent mechanism is needed; this is referred to as the DATA_FIN.</t> | |||
| <t>A DATA_FIN is an indication that the sender has no more data to sen | ||||
| <t>A DATA_FIN is an indication that the sender has no more data to send, | d, and | |||
| and | as such it can be used to verify that all data has been successfully rec | |||
| as such can be used to verify that all data has been successfully receiv | eived. A DATA_FIN, | |||
| ed. A DATA_FIN, | ||||
| as with the FIN on a regular TCP connection, is a unidirectional signal. </t> | as with the FIN on a regular TCP connection, is a unidirectional signal. </t> | |||
| <t>The DATA_FIN is signaled by setting the "F" flag in the DSS | ||||
| <t>The DATA_FIN is signaled by setting the 'F' flag in the Data Sequence | option (<xref target="tcpm_dsn" format="default"/>) | |||
| Signal option (<xref target="tcpm_dsn"/>) to 1. A DATA_FIN occupies 1 octet (th | to 1. A DATA_FIN occupies 1 octet (the final octet) of the | |||
| e final octet) of the connection-level sequence space. Note that the DATA_FIN is | connection-level sequence space. Note that the | |||
| included in the Data-Level Length, but not at the subflow level: for example, a | DATA_FIN is included in the Data-Level Length but not at the subflow | |||
| segment with DSN 80, and Data-Level Length 11, with DATA_FIN set, would map 10 | level: for example, a segment with a DSN value of 80 and a | |||
| octets from the subflow into data sequence space 80-89, the DATA_FIN is DSN 90; | Data-Level Length of 11, with DATA_FIN set, would map 10 octets from | |||
| therefore, this segment including DATA_FIN would be acknowledged with a DATA_ACK | the subflow into data sequence space 80-89, and the DATA_FIN would | |||
| of 91.</t> | be DSN 90; therefore, this segment, including DATA_FIN, would be | |||
| acknowledged with a DATA_ACK of 91.</t> | ||||
| <t>Note that when the DATA_FIN is not attached to a TCP segment containi | <t>Note that when the DATA_FIN is not attached to a TCP segment contai | |||
| ng data, the Data Sequence Signal MUST have a subflow sequence number of 0, a Da | ning data, the DSS <bcp14>MUST</bcp14> have a subflow sequence number of 0, a Da | |||
| ta-Level Length of 1, and the data sequence number that corresponds with the DAT | ta-Level Length of 1, and the data sequence number that corresponds with the DAT | |||
| A_FIN itself. The checksum in this case will only cover the pseudo-header.</t> | A_FIN itself. The checksum in this case will only cover the pseudo-header.</t> | |||
| <t>A DATA_FIN has the same semantics and behavior as a regular TCP FIN | ||||
| <t>A DATA_FIN has the semantics and behavior as a regular TCP FIN, but a | , but at the connection level. Notably, it is only DATA_ACKed once all data has | |||
| t the connection level. Notably, it is only DATA_ACKed once all data has been su | been successfully received at the connection level. Note, therefore, that a DATA | |||
| ccessfully received at the connection level. Note, therefore, that a DATA_FIN is | _FIN is decoupled from a subflow FIN. It is only permissible to combine these si | |||
| decoupled from a subflow FIN. It is only permissible to combine these signals o | gnals on one subflow if there is no data outstanding on other subflows. Otherwis | |||
| n one subflow if there is no data outstanding on other subflows. Otherwise, it m | e, it may be necessary to retransmit data on different subflows. Essentially, a | |||
| ay be necessary to retransmit data on different subflows. Essentially, a host MU | host <bcp14>MUST NOT</bcp14> close all functioning subflows unless it is safe to | |||
| ST NOT close all functioning subflows unless it is safe to do so, i.e., until al | do so, i.e., until all outstanding data has been DATA_ACKed or until the segmen | |||
| l outstanding data has been DATA_ACKed, or until the segment with the DATA_FIN f | t with the DATA_FIN flag set is the only outstanding segment.</t> | |||
| lag set is the only outstanding segment.</t> | <t>Once a DATA_FIN has been acknowledged, all remaining subflows | |||
| <bcp14>MUST</bcp14> be closed with standard FIN exchanges. Both | ||||
| <t>Once a DATA_FIN has been acknowledged, all remaining subflows MUST be | hosts <bcp14>SHOULD</bcp14> send FINs on all subflows, as a courtesy, | |||
| closed with standard FIN exchanges. Both hosts SHOULD send FINs on all subflows | to allow middleboxes to clean up state even if an individual subflow | |||
| , as a courtesy to allow middleboxes to clean up state even if an individual sub | has failed. Reducing the timeouts (MSL) on subflows at end hosts after | |||
| flow has failed. It is also encouraged to reduce the timeouts (Maximum Segment L | receiving a | |||
| ifetime) on subflows at end hosts after receiving a DATA_FIN. In particular, any | DATA_FIN is also encouraged. In particular, any subflows where there i | |||
| subflows where there is still outstanding data queued (which has been retransmi | s still | |||
| tted on other subflows in order to get the DATA_FIN acknowledged) MAY be closed | outstanding data queued (which has been retransmitted on other | |||
| with a RST with MP_TCPRST (<xref target="sec_reset"/>) error code for "too much | subflows in order to get the DATA_FIN acknowledged) | |||
| outstanding data".</t> | <bcp14>MAY</bcp14> be closed with a RST with an MP_TCPRST (<xref targe | |||
| t="sec_reset" format="default"/>) error code for "too much outstanding data".</t | ||||
| <t>A connection is considered closed once both hosts' DATA_FINs have bee | > | |||
| n acknowledged by DATA_ACKs.</t> | <t>A connection is considered closed once both hosts' DATA_FINs have b | |||
| een acknowledged by DATA_ACKs.</t> | ||||
| <t>As specified above, a standard TCP FIN on an individual subflow only | <t>As specified above, a standard TCP FIN on an individual subflow | |||
| shuts down the subflow on which it was sent. If all subflows have been closed wi | only shuts down the subflow on which it was sent. If all subflows | |||
| th a FIN exchange, but no DATA_FIN has been received and acknowledged, the MPTCP | have been closed with a FIN exchange but no DATA_FIN has been | |||
| connection is treated as closed only after a timeout. This implies that an impl | received and acknowledged, the MPTCP connection is treated as closed | |||
| ementation will have TIME_WAIT states at both the subflow and connection levels | only after a timeout. This implies that an implementation will have | |||
| (see <xref target="app_fsm"/>). This permits "break-before-make" scenarios where | TIME_WAIT states at both the subflow level and the connection level (s | |||
| connectivity is lost on all subflows before a new one can be re-established.</t | ee <xref target="app_fsm" format="default"/>). This permits "break-before-make" | |||
| > | scenarios where connectivity is lost on all subflows before a new one can be re& | |||
| </section> | #8209;established.</t> | |||
| </section> | ||||
| <section title="Receiver Considerations" anchor="sec_rwin"> | <section anchor="sec_rwin" numbered="true" toc="default"> | |||
| <name>Receiver Considerations</name> | ||||
| <t>Regular TCP advertises a receive window in each packet, telling the sender how much data the receiver | <t>Regular TCP advertises a receive window in each packet, telling the sender how much data the receiver | |||
| is willing to accept past the cumulative ack. The receive window is used to implement flow control, throttling | is willing to accept past the cumulative ACK. The receive window is used to implement flow control, throttling | |||
| down fast senders when receivers cannot keep up. </t> | down fast senders when receivers cannot keep up. </t> | |||
| <t>MPTCP also uses a unique receive window, shared between the subflows. The idea is to allow any | <t>MPTCP also uses a unique receive window, shared between the subflows. The idea is to allow any | |||
| subflow to send data as long as the receiver is willing to accept it. The altern | subflow to send data as long as the receiver is willing to accept it. The | |||
| ative, maintaining per subflow | alternative -- maintaining per-subflow | |||
| receive windows, could end up stalling some subflows while others would not use | receive windows -- could end up stalling some subflows while others would not us | |||
| up their window.</t> | e up their window.</t> | |||
| <t>The receive window is relative to the DATA_ACK. As in TCP, a receiv | ||||
| <t>The receive window is relative to the DATA_ACK. As in TCP, a receiv | er <bcp14>MUST NOT</bcp14> shrink the right edge of the receive window (i.e., DA | |||
| er MUST NOT shrink the right edge of the receive window (i.e., DATA_ACK + receiv | TA_ACK + receive window). The receiver will | |||
| e window). The receiver will | ||||
| use the data sequence number to tell if a packet should be accepted at the connection level.</t> | use the data sequence number to tell if a packet should be accepted at the connection level.</t> | |||
| <t>When deciding to accept packets at the subflow level, regular TCP c | ||||
| <t>When deciding to accept packets at subflow level, regular TCP check | hecks | |||
| s | ||||
| the sequence number in the packet against the allowed receive window. | the sequence number in the packet against the allowed receive window. | |||
| With multipath, such a check is done using only the connection-level window. A s | With MPTCP, such a check is done using only the connection-level window. A sanit | |||
| anity | y | |||
| check SHOULD be performed at subflow level to ensure that the subflow and mapped | check <bcp14>SHOULD</bcp14> be performed at the subflow level to ensure that the | |||
| sequence | subflow and mapped sequence | |||
| numbers meet the following test: SSN - SUBFLOW_ACK &lt;= DSN - DATA_ACK, where SSN is the subflow sequence number of the received packet and SUBFLOW_ACK is the RCV.NXT (next expected sequence number) of the subflow (with the equivalent connection-level definitions for DSN and DATA_ACK).</t> | numbers meet the following test: SSN - SUBFLOW_ACK &lt;= DSN - DATA_ACK, where SSN is the subflow sequence number of the received packet and SUBFLOW_ACK is the RCV.NXT (next expected sequence number) of the subflow (with the equivalent connection-level definitions for DSN and DATA_ACK).</t> | |||
| <t>In regular TCP, once a segment is deemed in-window, it is put in ei | ||||
| <t>In regular TCP, once a segment is deemed in-window, it is put either | ther | |||
| in the in-order receive queue or in the out-of-order queue. | the in-order receive queue or the out-of-order queue. | |||
| In Multipath TCP, the same happens but at the connection level: a segment | In Multipath TCP, the same thing happens, but at the connection level: a segment | |||
| is placed in the connection level in-order or out-of-order queue if | is placed in the connection-level in-order or out-of-order queue if | |||
| it is in-window at both connection and subflow levels. | it is in-window at both the connection level and the subflow level. | |||
| The stack still has to remember, for each subflow, which segments were | The stack still has to remember, for each subflow, which segments were | |||
| received successfully so that it can ACK them at subflow level appropriately. | received successfully so that it can ACK them at the subflow level appropriately | |||
| Typically, this will be implemented by keeping per subflow out-of-order | . | |||
| queues (containing only message headers, not the payloads) and remembering | Typically, this will be implemented by keeping per-subflow out-of-order | |||
| queues (containing only message headers -- not the payloads) and remembering | ||||
| the value of the cumulative ACK. | the value of the cumulative ACK. | |||
| </t> | </t> | |||
| <t>It is important for implementers to understand how large | <t>It is important for implementers to understand how large | |||
| a receiver buffer is appropriate. The lower bound for full | a receive buffer is appropriate. The lower bound for full | |||
| network utilization is the maximum bandwidth-delay product | network utilization is the maximum bandwidth-delay product | |||
| of any one of the paths. However, this might be insufficient | of any one of the paths. However, this might be insufficient | |||
| when a packet is lost on a slower subflow and needs to be | when a packet is lost on a slower subflow and needs to be | |||
| retransmitted (see <xref target="sec_retransmit"/>). A tight | retransmitted (see <xref target="sec_retransmit" format="default"/>). A tight | |||
| upper bound would be the maximum round-trip time (RTT) of any path multiplied | upper bound would be the maximum round-trip time (RTT) of any path multiplied | |||
| by the total bandwidth available across all paths. This | by the total bandwidth available across all paths. This | |||
| permits all subflows to continue at full speed while a | permits all subflows to continue at full speed while a | |||
| packet is fast-retransmitted on the maximum RTT path. Even | packet is fast-retransmitted on the maximum RTT path. Even | |||
| this might be insufficient to maintain full performance in | this might be insufficient to maintain full performance in | |||
| the event of a retransmit timeout on the maximum RTT path. | the event of a retransmit timeout on the maximum RTT path. | |||
| It is for future study to determine the relationship between | Determining the relationship between | |||
| retransmission strategies and receive buffer sizing.</t> | retransmission strategies and receive buffer sizing is left for future | |||
| study.</t> | ||||
| </section> | </section> | |||
| <section anchor="sec_sender" numbered="true" toc="default"> | ||||
| <section title="Sender Considerations" anchor="sec_sender"> | <name>Sender Considerations</name> | |||
| <t>The sender remembers receiver window advertisements from the receiv | <t>The sender remembers receive window advertisements from the | |||
| er. It should only update its local receive window values when the largest seque | receiver. It should only update its local receive window values when | |||
| nce number allowed (i.e., DATA_ACK + receive window) increases, on the receipt o | the largest sequence number allowed (i.e., DATA_ACK + receive | |||
| f a DATA_ACK. This is important to allow using paths with different RTTs, and th | window) increases on the receipt of a DATA_ACK. This is important | |||
| us different feedback loops. </t> | for allowing the use of paths with different RTTs and thus different f | |||
| eedback loops. </t> | ||||
| <t>MPTCP uses a single receive window across all subflows, and if the | <t>MPTCP uses a single receive window across all subflows, and if | |||
| receive window was guaranteed to be unchanged end-to-end, a host could always re | the receive window was guaranteed to be unchanged end to end, a host c | |||
| ad the most recent receive window value. However, some classes of middleboxes ma | ould always read the most recent receive window value. However, some classes of | |||
| y alter the TCP-level receive window. Typically, these will | middleboxes may alter the TCP-level receive window. Typically, these will | |||
| shrink the offered window, although for short periods of time it may be possible for the window to be larger (however, | shrink the offered window, although for short periods of time it may be possible for the window to be larger (however, | |||
| note that this would not continue for long periods since ultimately the middlebox must keep up with | note that this would not continue for long periods, since ultimately the middlebox must keep up with | |||
| delivering data to the receiver). Therefore, if receive window sizes differ on multiple subflows, | delivering data to the receiver). Therefore, if receive window sizes differ on multiple subflows, | |||
| when sending data MPTCP SHOULD take the largest of the most recent window sizes as the one to use in calculations. | when sending data MPTCP <bcp14>SHOULD</bcp14> take the largest of the most recent window sizes as the one to use in calculations. | |||
| This rule is implicit in the requirement not to reduce the right edge of the window.</t> | This rule is implicit in the requirement not to reduce the right edge of the window.</t> | |||
| <t>The sender <bcp14>MUST</bcp14> also remember the receive windows ad | ||||
| <t>The sender MUST also remember the receive windows advertised by eac | vertised by each subflow. | |||
| h subflow. | ||||
| The allowed window for subflow i is (ack_i, ack_i + rcv_wnd_i), where ack_i is the | The allowed window for subflow i is (ack_i, ack_i + rcv_wnd_i), where ack_i is the | |||
| subflow-level cumulative ACK of subflow i. This ensures data will not be sent to a middlebox | subflow-level cumulative ACK of subflow i. This ensures that data will not be sent to a middlebox | |||
| unless there is enough buffering for the data. </t> | unless there is enough buffering for the data. </t> | |||
| <t>Putting the two rules together, we get the following: a sender is allowed to send | <t>Putting the two rules together, we get the following: a sender is allowed to send | |||
| data segments with data-level sequence numbers between (DATA_ACK, DATA_ACK + receive_window). | data segments with data-level sequence numbers between (DATA_ACK, DATA_ACK + receive_window). | |||
| Each of these segments will be mapped onto subflows, as long as subflow sequence numbers | Each of these segments will be mapped onto subflows, as long as subflow sequence numbers | |||
| are in the allowed windows for those subflows. Note that subflow sequence numbers do not | are in the allowed windows for those subflows. Note that subflow sequence numbers do not | |||
| generally affect flow control if the same receive window is advertised across all subflows. | generally affect flow control if the same receive window is advertised across all subflows. | |||
| They will perform flow control for those subflows with a smaller advertised receive window. | They will perform flow control for those subflows with a smaller advertised receive window. | |||
| </t> | </t> | |||
| <t>The send buffer <bcp14>MUST</bcp14>, at a minimum, be as big as the | ||||
| <t>The send buffer MUST, at a minimum, be as big as the receive buffer | receive buffer, to enable the sender to reach maximum throughput.</t> | |||
| , to enable the sender to reach maximum throughput.</t> | ||||
| </section> | </section> | |||
| <section anchor="sec_retransmit" numbered="true" toc="default"> | ||||
| <section title="Reliability and Retransmissions" anchor="sec_retransmit" | <name>Reliability and Retransmissions</name> | |||
| > | <t>The Data Sequence Mapping allows senders to resend data with the | |||
| same data sequence number on a different subflow. When doing this, a | ||||
| <t>The data sequence mapping allows senders to resend data with the sa | host <bcp14>MUST</bcp14> still retransmit the original data on the | |||
| me data sequence number on a different subflow. When doing this, a host MUST sti | original subflow, in order to preserve the subflow's integrity | |||
| ll retransmit the original data on the original subflow, in order to preserve th | (middleboxes could replay old data and&wj;/or could reject holes in | |||
| e subflow integrity (middleboxes could replay old data, and/or could reject hole | subflows), and a receiver will ignore these retransmissions. While | |||
| s in subflows), and a receiver will ignore these retransmissions. While this is | this is clearly suboptimal, for compatibility reasons this is | |||
| clearly suboptimal, for compatibility reasons this is sensible behavior. Optimiz | sensible behavior. Optimizations could be negotiated in future | |||
| ations could be negotiated in future versions of this protocol. Note also that t | versions of this protocol. Note also that this property would also per | |||
| his property would also permit a sender to always send the same data, with the s | mit a sender to always send the same data, with the same data sequence number, o | |||
| ame data sequence number, on multiple subflows, if desired for reliability reaso | n multiple subflows, if desired for reliability reasons.</t> | |||
| ns.</t> | ||||
| <t>This protocol specification does not mandate any mechanisms for handling retransmissions, and much will be dependent upon local policy | <t>This protocol specification does not mandate any mechanisms for handling retransmissions, and much will be dependent upon local policy | |||
| (as discussed in <xref target="sec_policy"/>). One can imagine aggressive connec | (as discussed in <xref target="sec_policy" format="default"/>). One can imagine | |||
| tion-level retransmissions policies where every packet lost at subflow level is | aggressive connection-level retransmission policies where every packet lost at t | |||
| retransmitted on | he subflow level is retransmitted on | |||
| a different subflow (hence, wasting bandwidth but possibly reducing application- | a different subflow (hence wasting bandwidth but possibly reducing application-t | |||
| to-application delays), or conservative retransmission policies where connection | o-application delays) or conservative retransmission policies where connection-l | |||
| -level retransmits | evel retransmissions | |||
| are only used after a few subflow-level retransmission timeouts occur.</t> | are only used after a few subflow-level retransmission timeouts occur.</t> | |||
| <t>It is envisaged that a standard connection-level retransmission mechanism | <t>It is envisaged that a standard connection-level retransmission mechanism | |||
| would be implemented around a connection-level data queue: all segments that haven't | would be implemented around a connection-level data queue: all segments that haven't | |||
| been DATA_ACKed are stored. A timer is set when | been DATA_ACKed are stored. A timer is set when | |||
| the head of the connection-level is ACKed at subflow level but its corresponding | the head of the connection level is ACKed at the subflow level but is not DATA_A | |||
| data | CKed at the data level. This timer will guard against retransmission failures | |||
| is not ACKed at data level. This timer will guard against failures in retransmis | ||||
| sion | ||||
| by middleboxes that proactively ACK data.</t> | by middleboxes that proactively ACK data.</t> | |||
| <t>The sender <bcp14>MUST</bcp14> keep data in its send buffer as | ||||
| <t>The sender MUST keep data in its send buffer as long as the data ha | long as the data has not been acknowledged both (1) at the | |||
| s not been acknowledged at both connection level and on all subflows on which it | connection level and (2) on all subflows on which it | |||
| has been sent. In this way, the sender can always retransmit the data if needed, on the same subflow or on a different one. A special case is when a subflow fails: the sender | has been sent. In this way, the sender can always retransmit the data if needed, on the same subflow or on a different one. A special case is when a subflow fails: the sender | |||
| will typically resend the data on other working subflows after a timeout, and wi | will typically resend the data on other working subflows after a timeout and wil | |||
| ll keep trying to retransmit the data | l keep trying to retransmit the data | |||
| on the failed subflow too. The sender will declare the subflow failed after a pr | on the failed subflow too. The sender will declare the subflow failed after a pr | |||
| edefined upper bound on retransmissions is reached (which MAY be lower than the | edefined upper bound on retransmissions is reached (which <bcp14>MAY</bcp14> be | |||
| usual TCP limits of the Maximum Segment Life), or on the receipt of an ICMP erro | lower than the usual TCP limits of the MSL) or on the receipt of an ICMP error, | |||
| r, and only then delete the outstanding data segments. </t> | and only then delete the outstanding data segments. </t> | |||
| <t>If multiple retransmissions that indicate that a | ||||
| <t>If multiple retransmissions are triggered that indicate that a subf | subflow is performing badly are triggered, this <bcp14>MAY</bcp14> lea | |||
| low performs badly, this MAY lead to a host resetting the subflow with a RST. Ho | d to a host resetting the subflow with a RST. However, additional research is re | |||
| wever, additional research is required to understand the heuristics of how and w | quired to understand the heuristics of how and when to reset underperforming sub | |||
| hen to reset underperforming subflows. For example, a highly asymmetric path may | flows. For example, a highly asymmetric path may be misdiagnosed as underperform | |||
| be misdiagnosed as underperforming. A RST for this purpose SHOULD be accompanie | ing. A RST for this purpose <bcp14>SHOULD</bcp14> be accompanied by an "Unaccept | |||
| d with an "Unacceptable performance" MP_TCPRST option (<xref target="sec_reset"/ | able performance" MP_TCPRST option (<xref target="sec_reset" format="default"/>) | |||
| >).</t> | .</t> | |||
| </section> | </section> | |||
| <section anchor="sec_cc" numbered="true" toc="default"> | ||||
| <section title="Congestion Control Considerations" anchor="sec_cc"> | <name>Congestion Control Considerations</name> | |||
| <t>Different subflows in an MPTCP connection have different congestion windows. | <t>Different subflows in an MPTCP connection have different congestion windows. | |||
| To achieve fairness at bottlenecks and resource pooling, it is necessary to couple the | To achieve fairness at bottlenecks and resource pooling, it is necessary to couple the | |||
| congestion windows in use on each subflow, in order to push most traffic to uncongested links. | congestion windows in use on each subflow, in order to push most traffic to uncongested links. | |||
| One algorithm for achieving this is presented in <xref target="RFC6356"/>; | One algorithm for achieving this is presented in <xref target="RFC6356" format="default"/>; | |||
| the algorithm does not achieve perfect resource pooling but is "safe" in that it is readily | the algorithm does not achieve perfect resource pooling but is "safe" in that it is readily | |||
| deployable in the current Internet. By this, we mean that it does not take up more capacity | deployable in the current Internet. By this we mean that it does not take up more capacity | |||
| on any one path than if it was a single path flow using only that route, so this ensures | on any one path than if it was a single path flow using only that route, so this ensures | |||
| fair coexistence with single-path TCP at shared bottlenecks.</t> | fair coexistence with single-path TCP at shared bottlenecks.</t> | |||
| <t>It is foreseeable that different congestion controllers will be | ||||
| <t>It is foreseeable that different congestion controllers will be imp | implemented for MPTCP, each aiming to achieve different properties | |||
| lemented for MPTCP, each aiming to achieve different properties in the resource | in the resource pooling / fairness / stability design space, as well a | |||
| pooling/fairness/stability design space, as well as those for achieving differen | s those for achieving different properties in quality of service, reliability, a | |||
| t properties in quality of service, reliability, and resilience.</t> | nd resilience.</t> | |||
| <t>Regardless of the algorithm used, | <t>Regardless of the algorithm used, | |||
| the design of the MPTCP protocol aims to provide the congestion control implemen | the design of MPTCP aims to provide the congestion control | |||
| tations sufficient information | implementations with sufficient information | |||
| to take the right decisions; this information includes, for each subflow, which | to make the right decisions; this information includes, for each subflow, which | |||
| packets were lost and when. </t> | packets were lost and when. </t> | |||
| </section> | </section> | |||
| <section anchor="sec_policy" numbered="true" toc="default"> | ||||
| <section title="Subflow Policy" anchor="sec_policy"> | <name>Subflow Policy</name> | |||
| <t>Within a local MPTCP implementation, a host may use any local policy it wishes to decide how to share the traffic to be sent over the available paths.</t> | <t>Within a local MPTCP implementation, a host may use any local policy it wishes to decide how to share the traffic to be sent over the available paths.</t> | |||
| <t>In the typical use case, where the goal is to maximize throughput, | <t>In the typical use case, where the goal is to maximize throughput, | |||
| all available paths will be used simultaneously for data transfer, using coupled | all available paths will be used simultaneously for data transfer, using coupled | |||
| congestion control as described in <xref target="RFC6356"/>. It is expected, ho | congestion control as described in <xref target="RFC6356" format="default"/>. I | |||
| wever, that other use cases will appear.</t> | t is expected, however, that other use cases will appear.</t> | |||
| <t>For instance, a possibility is an 'all-or-nothing' approach, i.e., | <t>For instance, one possibility is an "all-or-nothing" approach, i.e. | |||
| have a second path ready for use in the event of | , have a second path ready for use in the event of | |||
| failure of the first path, but alternatives could include entirely saturating one path before using an additional | failure of the first path, but alternatives could include entirely saturating one path before using an additional | |||
| path (the 'overflow' case). Such choices would be most likely based on the monetary cost of links, but may also be | path (the "overflow" case). Such choices would be most likely based on the monetary cost of links but may also be | |||
| based on properties such as the delay or jitter of links, where stability (of delay or bandwidth) is more important than throughput. Application | based on properties such as the delay or jitter of links, where stability (of delay or bandwidth) is more important than throughput. Application | |||
| requirements such as these are discussed in detail in <xref target="RFC6897"/>.</t> | requirements such as these are discussed in detail in <xref target="RFC6897" format="default"/>.</t> | |||
| <t>The ability to make effective choices at the sender requires full knowledge of the path "cost", which | <t>The ability to make effective choices at the sender requires full knowledge of the path "cost", which | |||
| is unlikely to be the case. It would be desirable for a receiver to be able to signal their own preferences for paths, | is unlikely to be the case. It would be desirable for a receiver to be able to signal their own preferences for paths, | |||
| since they will often be the multihomed party, and may have to pay for metered i | since they will often be the multihomed party and may have to pay for metered in | |||
| ncoming bandwidth.</t> | coming bandwidth.</t> | |||
| <t>To enable this, the MP_JOIN option (see <xref target="sec_join"/>) | <t>To enable this behavior, the MP_JOIN option (see <xref | |||
| contains the 'B' bit, which allows a host to indicate to its peer that this path | target="sec_join" format="default"/>) contains the "B" bit, | |||
| should be treated as a backup path to use only in the event of failure of other | which allows a host to indicate to its peer that this path should be | |||
| working subflows (i.e., a subflow where the receiver has indicated B=1 SHOULD N | treated as a backup path to use only in the event of failure of | |||
| OT be used to send data unless there are no usable subflows where B=0).</t> | other working subflows (i.e., a subflow where the receiver has | |||
| <t>In the event that the available set of paths changes, a host may wi | indicated that B=1 <bcp14>SHOULD NOT</bcp14> be used to send data unle | |||
| sh to signal a change in priority of subflows to the peer (e.g., a subflow that | ss there are no usable subflows where B=0).</t> | |||
| was previously set as backup should now take priority over all remaining subflow | <t>In the event that the available set of paths changes, a host may | |||
| s). Therefore, the MP_PRIO option, shown in <xref target="tcpm_prio"/>, can be u | wish to signal a change in priority of subflows to the peer (e.g., a | |||
| sed to change the 'B' flag of the subflow on which it is sent.</t> | subflow that was previously set as a backup should now take priority | |||
| <t>Another use of the MP_PRIO option is to set the 'B' flag on a subfl | over all remaining subflows). Therefore, the MP_PRIO option, shown | |||
| ow to cleanly retire its use before closing it and removing it with REMOVE_ADDR | in <xref target="tcpm_prio" format="default"/>, can be used to | |||
| <xref target="sec_remove_addr"/>, for example to support make-before-break sessi | change the "B" flag of the subflow on which it is sent.</t> | |||
| on continuity, where new subflows are added before the previously used ones are | <figure anchor="tcpm_prio"> | |||
| closed.</t> | <name>Change Subflow Priority (MP_PRIO) Option</name> | |||
| <?rfc needLines='8'?> | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| <figure align="center" anchor="tcpm_prio" title="Change Subflow Priori | 1 2 3 | |||
| ty (MP_PRIO) Option"> | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
| <artwork align="left"><![CDATA[ | +---------------+---------------+-------+-----+-+ | |||
| 1 2 3 | | Kind | Length |Subtype|(rsv)|B| | |||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | +---------------+---------------+-------+-----+-+ ]]></artwork> | |||
| +---------------+---------------+-------+-----+-+ | ||||
| | Kind | Length |Subtype|(rsv)|B| | ||||
| +---------------+---------------+-------+-----+-+ | ||||
| ]]></artwork> | ||||
| </figure> | </figure> | |||
| <t>Another use of the MP_PRIO option is to set the "B" flag on a | ||||
| <t>It should be noted that the backup flag is a request from a data receiver to | subflow to cleanly "retire" its use before closing it and removing it | |||
| a data sender only, and the data sender SHOULD adhere to these requests. A host | with REMOVE_ADDR (<xref target="sec_remove_addr" format="default"/>) - | |||
| cannot assume that the data sender will do so, however, since local policies -- | - for example, to support make-before-break session continuity, where new subflo | |||
| or technical difficulties -- may override MP_PRIO requests. Note also that this | ws are added before the previously used subflows are closed.</t> | |||
| signal applies to a single direction, and so the sender of this option could cho | <t>It should be noted that the backup flag is a request from a data re | |||
| ose to continue using the subflow to send data even if it has signaled B=1 to th | ceiver to a data sender only, and the data sender <bcp14>SHOULD</bcp14> adhere t | |||
| e other host.</t> | o these requests. A host cannot assume that the data sender will do so, however, | |||
| since local policies -- or technical difficulties -- may override MP_PRIO reque | ||||
| sts. Note also that this signal applies to a single direction, and so the sender | ||||
| of this option could choose to continue using the subflow to send data even if | ||||
| it has signaled B=1 to the other host.</t> | ||||
| </section> | </section> | |||
| </section> | </section> | |||
| <section anchor="sec_pm" numbered="true" toc="default"> | ||||
| <section title="Address Knowledge Exchange (Path Management)" anchor="sec_ | <name>Address Knowledge Exchange (Path Management)</name> | |||
| pm"> | <t>We use the term "path management" to refer to the exchange of informa | |||
| <t>We use the term "path management" to refer to the exchange of informa | tion about additional paths between hosts, which in this design is managed by mu | |||
| tion about additional paths between hosts, which in this design is managed by mu | ltiple addresses at hosts. For more details regarding the architectural thinking | |||
| ltiple addresses at hosts. For more detail of the architectural thinking behind | behind this design, see the MPTCP architecture document <xref target="RFC6182" | |||
| this design, see the MPTCP Architecture document <xref target="RFC6182"/>.</t> | format="default"/>.</t> | |||
| <t>This design makes use of two methods of sharing such | <t>This design makes use of two methods of sharing such | |||
| information, and both can be used on a connection. | information, and both can be used on a connection. | |||
| The first is the direct | The first is the direct | |||
| setup of new subflows, already described in | setup of new subflows (described in | |||
| <xref target="sec_join"/>, where the initiator has an | <xref target="sec_join" format="default"/>), where the initiator has an | |||
| additional address. The second method, described in the | additional address. The second method (described in the | |||
| following subsections, signals addresses explicitly to the | following subsections) signals addresses explicitly to the | |||
| other host to allow it to initiate new subflows. The | other host to allow it to initiate new subflows. The | |||
| two mechanisms are complementary: the first is implicit and | two mechanisms are complementary: the first is implicit and | |||
| simple, while the explicit is more complex but is more | simple, while the second (explicit) is more complex but is more | |||
| robust. Together, the mechanisms allow addresses to change in | robust. Together, these mechanisms allow addresses to change in | |||
| flight (and thus support operation through NATs, since the | flight (and thus support operation through NATs, since the | |||
| source address need not be known), and also allow the | source address need not be known); they also allow the | |||
| signaling of previously unknown addresses, and of addresses | signaling of previously unknown addresses and of addresses | |||
| belonging to other address families (e.g., both IPv4 and IPv6).</t> | belonging to other address families (e.g., both IPv4 and IPv6).</t> | |||
| <t>Here is an example of typical operation of the protocol: | <t>Here is an example of typical operation of the protocol: | |||
| <list style="symbols"> | </t> | |||
| <t>An MPTCP connection is initially set up between address/port A1 o | <ul spacing="normal"> | |||
| f Host A | <li>An MPTCP connection is initially set up between address&wj;/port A | |||
| and address/port B1 of Host B. If Host A is multihomed and | 1 of Host A | |||
| and address&wj;/port B1 of Host B. If Host A is multihomed and | ||||
| multiaddressed, it can start an additional subflow from | multiaddressed, it can start an additional subflow from | |||
| its address A2 to B1, by sending a SYN with a Join | its address A2 to B1, by sending a SYN with an MP_JOIN | |||
| option from A2 to B1, using B's previously declared | option from A2 to B1, using B's previously declared | |||
| token for this connection. Alternatively, if B is | token for this connection. Alternatively, if B is | |||
| multihomed, it can try to set up a new subflow from B2 to | multihomed, it can try to set up a new subflow from B2 to | |||
| A1, using A's previously declared token. In either | A1, using A's previously declared token. In either | |||
| case, the SYN will be sent to the port already in use | case, the SYN will be sent to the port already in use | |||
| for the original subflow on the receiving host.</t> | for the original subflow on the receiving host.</li> | |||
| <li>Simultaneously (or after a timeout), an ADD_ADDR option | ||||
| <t>Simultaneously (or after a timeout), an ADD_ADDR option | (<xref target="sec_add_address" format="default"/>) is sent on an existing subfl | |||
| (<xref target="sec_add_address"/>) is sent on an existing subflow, informing | ow, informing | |||
| the receiver of the sender's alternative address(es). The recipient can use | the receiver of the sender's alternative address(es). The recipient can use | |||
| this information to open a new subflow to the sender's additional address. | this information to open a new subflow to the sender's additional address(es). | |||
| In our example, A will send ADD_ADDR option informing B of address/port A2. | In our example, A will send the ADD_ADDR option informing B of address&wj;/port | |||
| The mix of using the SYN-based option and the ADD_ADDR option, including | A2. | |||
| timeouts, is implementation specific and can be tailored to agree with local pol | The mix of using the SYN‑based option and the ADD_ADDR option, including | |||
| icy.</t> | timeouts, is implementation specific and can be tailored to agree with local pol | |||
| icy.</li> | ||||
| <t>If subflow A2-B1 is successfully set up, Host B can use the Addre | <li>If subflow A2-B1 is successfully set up, Host B can use the Addres | |||
| ss ID in | s ID in | |||
| the Join option to correlate this with the ADD_ADDR option that will also arrive | the MP_JOIN option to correlate this source address with the ADD_ADDR option tha | |||
| on | t will also arrive on | |||
| an existing subflow; now B knows not to open A2-B1, ignoring the ADD_ADDR. | an existing subflow; now B knows not to open A2-B1, ignoring the ADD_ADDR. | |||
| Otherwise, if B has not received the A2-B1 MP_JOIN SYN but received the ADD_ADDR, | Otherwise, if B has not received the A2-B1 MP_JOIN SYN but received the ADD_ADDR, | |||
| it can try to initiate a new subflow from one or more of its addresses to address | it can try to initiate a new subflow from one or more of its addresses to address | |||
| A2. This permits new sessions to be opened if one host is behind a NAT.</t> | A2. This permits new sessions to be opened if one host is behind a NAT.</li> | |||
| </list> | </ul> | |||
| <t> | ||||
| Other ways of using the two signaling mechanisms are possible; for instance, | Other ways of using the two signaling mechanisms are possible; for instance, | |||
| signaling addresses in other address families can only be done explicitly using | signaling addresses in other address families can only be done explicitly | |||
| the Add Address option. | using the Add Address (ADD_ADDR) option. | |||
| </t> | </t> | |||
| <section anchor="sec_add_address" numbered="true" toc="default"> | ||||
| <section title="Address Advertisement" anchor="sec_add_address"> | <name>Address Advertisement</name> | |||
| <t>The Add Address (ADD_ADDR) MPTCP option announces additional addresse | <t>The ADD_ADDR MPTCP option announces additional addresses (and, opti | |||
| s (and optionally, ports) on which a | onally, ports) on which a | |||
| host can be reached (<xref target="tcpm_address"/>). | host can be reached (<xref target="tcpm_address" format="default"/>). | |||
| This option can be used at any time during a connection, depending on when the | This option can be used at any time during a connection, depending on when the | |||
| sender wishes to enable multiple paths and/or when paths become available. As wi | sender wishes to enable multiple paths and&wj;/or when paths become available. A | |||
| th all MPTCP | s with all MPTCP | |||
| signals, the receiver MUST undertake standard TCP validity checks, e.g. <xref ta | signals, the receiver <bcp14>MUST</bcp14> undertake standard TCP validity | |||
| rget="RFC5961"/>, before acting upon it.</t> | checks, e.g., per <xref target="RFC5961" format="default"/>, before | |||
| acting upon it.</t> | ||||
| <t>Every address has an Address ID that can be used for uniquely identif | <figure anchor="tcpm_address"> | |||
| ying the address within a connection for address removal. The Address ID is also | <name>Add Address (ADD_ADDR) Option</name> | |||
| used to identify MP_JOIN options (see <xref target="sec_join"/>) relating to | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| the same address, even when address translators are in use. The Address ID MUST | 1 2 3 | |||
| uniquely | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
| identify the address for the sender of the option (within the scope of the conne | +---------------+---------------+-------+-------+---------------+ | |||
| ction), but the mechanism for | | Kind | Length |Subtype|(rsv)|E| Address ID | | |||
| allocating such IDs is implementation specific.</t> | +---------------+---------------+-------+-------+---------------+ | |||
| | Address (IPv4: 4 octets / IPv6: 16 octets) | | ||||
| +-------------------------------+-------------------------------+ | ||||
| | Port (2 octets, optional) | | | ||||
| +-------------------------------+ | | ||||
| | Truncated HMAC (8 octets, if E=0) | | ||||
| | +-------------------------------+ | ||||
| | | | ||||
| +-------------------------------+ ]]></artwork> | ||||
| </figure> | ||||
| <t>All address IDs learned via either MP_JOIN or ADD_ADDR | <t>Every address has an Address ID that can be used for uniquely ident | |||
| SHOULD be stored by the receiver in a data structure that gathers all th | ifying the address within a connection for address removal. The Address ID is al | |||
| e Address ID | so | |||
| to address mappings for a connection (identified by a token pair). In th | used to identify MP_JOIN options (see <xref target="sec_join" format="default"/> | |||
| is way, there is | ) relating to | |||
| a stored mapping between Address ID, observed source address, and token | the same address, even when address translators are in use. The Address ID <bcp1 | |||
| pair for | 4>MUST</bcp14> uniquely | |||
| identify the address for the sender of the option (within the scope of the conne | ||||
| ction); the mechanism for | ||||
| allocating such IDs is implementation specific.</t> | ||||
| <t>All Address IDs learned via either MP_JOIN or ADD_ADDR | ||||
| <bcp14>SHOULD</bcp14> be stored by the receiver in a data structure | ||||
| that gathers all the Address-ID-to-address mappings for a connection | ||||
| (identified by a token pair). In this way, there is | ||||
| a stored mapping between the Address ID, observed source address, and to | ||||
| ken pair for | ||||
| future processing of control information for a connection. Note that an implementation | future processing of control information for a connection. Note that an implementation | |||
| MAY discard incoming address advertisements at will, for example, for av | <bcp14>MAY</bcp14> discard incoming address advertisements at will -- fo | |||
| oiding updating | r example, to avoid updating | |||
| mapping state, or because advertised addresses are of no use to it (for | mapping state or because advertised addresses are of no use to it (for | |||
| example, IPv6 addresses when it has IPv4 only). Therefore, a host MUST t | example, IPv6 addresses when it has IPv4 only). Therefore, a host <bcp14 | |||
| reat address | >MUST</bcp14> treat address | |||
| advertisements as soft state, and it MAY choose to refresh advertisement | advertisements as soft state, and it <bcp14>MAY</bcp14> choose to refres | |||
| s periodically. | h advertisements periodically. | |||
| Note also that an implementation MAY choose to cache these address adver | Note also that an implementation <bcp14>MAY</bcp14> choose to cache thes | |||
| tisements even | e address advertisements even | |||
| if they are not currently relevant but may be relevant in the future, such as IPv4 | if they are not currently relevant but may be relevant in the future, such as IPv4 | |||
| addresses when IPv6 connectivity is available but IPv4 is awaiting DHCP.</t> | addresses when IPv6 connectivity is available but IPv4 is awaiting DHCP.</t> | |||
| <t>This option is shown in <xref target="tcpm_address" format="default | ||||
| <t>This option is shown in <xref target="tcpm_address"/>. The illustrati | "/>. The illustration is sized for | |||
| on is sized for | IPv4 addresses. For IPv6, the length of the address will be 16 octe | |||
| IPv4 addresses. For IPv6, the length of the address will be 16 octets (i | ts (instead of 4).</t> | |||
| nstead of 4).</t> | <t>The 2 octets that specify the TCP port number to use are optional, | |||
| and their presence | ||||
| <t>The 2 octets that specify the TCP port number to use are optional and | ||||
| their presence | ||||
| can be inferred from the length of the option. Although it is expected that the majority of | can be inferred from the length of the option. Although it is expected that the majority of | |||
| use cases will use the same port pairs as used for the initial subflow (e.g., port | use cases will use the same port pairs as those used for the initial subflow (e.g., port | |||
| 80 remains port 80 on all subflows, as does the ephemeral port at the client), there | 80 remains port 80 on all subflows, as does the ephemeral port at the client), there | |||
| may be cases (such as port-based load balancing) where the explicit specification of | may be cases (such as port-based load balancing) where the explicit specification of | |||
| a different port is required. If no port is specified, MPTCP SHOULD atte | a different port is required. If no port is specified, MPTCP <bcp14>SHOU | |||
| mpt to | LD</bcp14> attempt to | |||
| connect to the specified address on the same port as is already in use b | connect to the specified address on the same port as the port that is al | |||
| y the subflow | ready in use by the subflow | |||
| on which the ADD_ADDR signal was sent; this is discussed in more detail | on which the ADD_ADDR signal was sent; this is discussed in more detail | |||
| in <xref target="heuristics"/>.</t> | in <xref target="heuristics" format="default"/>.</t> | |||
| <t>The Truncated HMAC parameter present in this option is the rightmos | ||||
| <t>The Truncated HMAC present in this Option is the rightmost 64 bits of | t 64 bits of an HMAC, negotiated and | |||
| an HMAC, negotiated and | calculated in the same way as for MP_JOIN as described in <xref target=" | |||
| calculated in the same way as for MP_JOIN as described in <xref target=" | sec_join" format="default"/>. For this | |||
| sec_join"/>. For this | ||||
| specification of MPTCP, as there is only one hash algorithm option specified, this will be HMAC as | specification of MPTCP, as there is only one hash algorithm option specified, this will be HMAC as | |||
| defined in <xref target="RFC2104"/>, using the SHA-256 hash algorithm <xref target="RFC6234"/>. | defined in <xref target="RFC2104" format="default"/>, using the SHA-256 hash algorithm <xref target="RFC6234" format="default"/>. | |||
| In the same way as for MP_JOIN, the key for the HMAC | In the same way as for MP_JOIN, the key for the HMAC | |||
| algorithm, in the case of the message transmitted by Host A, will be Key-A followed by Key-B, and in | algorithm, in the case of the message transmitted by Host A, will be Key-A followed by Key-B, and in | |||
| the case of Host B, Key-B followed by Key-A. These are the keys that were exchanged in the original | the case of Host B, Key-B followed by Key-A. These are the keys that were exchanged in the original | |||
| MP_CAPABLE handshake. The message for the HMAC is the Address ID, IP Address, and Port which precede | MP_CAPABLE handshake. The message for the HMAC is the Address ID, IP address, and port that precede | |||
| the HMAC in the ADD_ADDR option. If the port is not present in the ADD_ADDR option, the HMAC message | the HMAC in the ADD_ADDR option. If the port is not present in the ADD_ADDR option, the HMAC message | |||
| will nevertheless include two octets of value zero. The rationale for the HMAC is to | will nevertheless include 2 octets of value zero. The rationale for the HMAC is to | |||
| prevent unauthorized entities from injecting ADD_ADDR signals in an attempt to hijack a connection. | prevent unauthorized entities from injecting ADD_ADDR signals in an attempt to hijack a connection. | |||
| Note that additionally the presence of this HMAC prevents the address be | Note that, additionally, the presence of this HMAC prevents the | |||
| ing changed in flight unless | address from being changed in flight unless | |||
| the key is known by an intermediary. If a host receives an ADD_ADDR option for which it cannot | the key is known by an intermediary. If a host receives an ADD_ADDR option for which it cannot | |||
| validate the HMAC, it SHOULD silently ignore the option.</t> | validate the HMAC, it <bcp14>SHOULD</bcp14> silently ignore the option.< | |||
| /t> | ||||
| <t>A set of four flags are present after the subtype and before the Addr | <t>A set of four flags is present after the subtype and before the Add | |||
| ess ID. Only the rightmost | ress ID. Only the rightmost | |||
| bit - labelled 'E' - is assigned in this specification. The other bits a | bit -- labeled "E" -- is assigned in this specification. The other | |||
| re currently unassigned and MUST | bits are currently unassigned; they <bcp14>MUST</bcp14> | |||
| be set to zero by a sender and MUST be ignored by the receiver.</t> | be set to 0 by a sender and <bcp14>MUST</bcp14> be ignored by the receiv | |||
| er.</t> | ||||
| <t>The 'E' flag exists to provide reliability for this option. Because t | <t>The "E" flag exists to provide reliability for this option. Because | |||
| his option will often be sent | this option will often be sent | |||
| on pure ACKs, there is no guarantee of reliability. Therefore, a receiver receiving a fresh ADD_ADDR | on pure ACKs, there is no guarantee of reliability. Therefore, a receiver receiving a fresh ADD_ADDR | |||
| option (where E=0), will send the same option back to the sender, but no | option (where E=0) will send the same option back to the sender, but not | |||
| t including the HMAC, and | including the HMAC and | |||
| with E=1, to indicate receipt. The lack of this echo can be used by the | with E=1, to indicate receipt. According to local policy, the lack of | |||
| initial ADD_ADDR sender to | this type of "echo" can indicate to the initial ADD_ADDR sender that the | |||
| retransmit the ADD_ADDR according to local policy.</t> | ADD_ADDR needs to be retransmitted.</t> | |||
| <?rfc needLines='11'?> | ||||
| <figure align="center" anchor="tcpm_address" title="Add Address (ADD_ADD | ||||
| R) Option"> | ||||
| <artwork align="left"><![CDATA[ | ||||
| 1 2 3 | ||||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||||
| +---------------+---------------+-------+-------+---------------+ | ||||
| | Kind | Length |Subtype|(rsv)|E| Address ID | | ||||
| +---------------+---------------+-------+-------+---------------+ | ||||
| | Address (IPv4 - 4 octets / IPv6 - 16 octets) | | ||||
| +-------------------------------+-------------------------------+ | ||||
| | Port (2 octets, optional) | | | ||||
| +-------------------------------+ | | ||||
| | Truncated HMAC (8 octets, if E=0) | | ||||
| | +-------------------------------+ | ||||
| | | | ||||
| +-------------------------------+ | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| <t>Due to the proliferation of NATs, it is reasonably likely that one ho | ||||
| st may attempt to advertise private addresses <xref target="RFC1918"/>. It is no | ||||
| t desirable to prohibit this, since there may be cases where both hosts have add | ||||
| itional interfaces on the same private network, and a host MAY advertise such ad | ||||
| dresses. The MP_JOIN handshake to create a new subflow (<xref target="sec_join"/ | ||||
| >) provides mechanisms to minimize security risks. The MP_JOIN message contains | ||||
| a 32-bit token that uniquely identifies the connection to the receiving host. If | ||||
| the token is unknown, the host will return with a RST. In the unlikely event th | ||||
| at the token is valid at the receiving host, subflow setup will continue, but th | ||||
| e HMAC exchange must occur for authentication. This will fail, and will provide | ||||
| sufficient protection against two unconnected hosts accidentally setting up a ne | ||||
| w subflow upon the signal of a private address. Further security considerations | ||||
| around the issue of ADD_ADDR messages that accidentally misdirect, or maliciousl | ||||
| y direct, new MP_JOIN attempts are discussed in <xref target="sec_security"/>.</ | ||||
| t> | ||||
| <t>A host that receives an ADD_ADDR but finds a connection set up to tha | ||||
| t IP address and port number is unsuccessful SHOULD NOT perform further connecti | ||||
| on attempts to this address/port combination for this connection. A sender that | ||||
| wants to trigger a new incoming connection attempt on a previously advertised ad | ||||
| dress/port combination can therefore refresh ADD_ADDR information by sending the | ||||
| option again.</t> | ||||
| <t>A host can therefore send an ADD_ADDR message with an already assigne | ||||
| d Address ID, but the Address MUST be the same as previously assigned to this Ad | ||||
| dress ID. A new ADD_ADDR may have the same, or different, port number. If the po | ||||
| rt number is different, the receiving host SHOULD try to set up a new subflow to | ||||
| this new address/port combination.</t> | ||||
| <t>A host wishing to replace an existing Address ID MUST first remove th | ||||
| e existing one (<xref target="sec_remove_addr"/>).</t> | ||||
| <t>During normal MPTCP operation, it is unlikely that there will be suff | <t>Due to the proliferation of NATs, it is reasonably likely that | |||
| icient TCP option space for ADD_ADDR to be included along with those for data se | one host may attempt to advertise private addresses <xref | |||
| quence numbering (<xref target="sec_dsn"/>). Therefore, it is expected that an M | target="RFC1918" format="default"/>. It is not desirable to prohibit | |||
| PTCP implementation will send the ADD_ADDR option on separate ACKs. As discussed | this behavior, since there may be cases where both hosts have additional | |||
| earlier, however, an MPTCP implementation MUST NOT treat duplicate ACKs with an | interfaces on the same private network, and a host | |||
| y MPTCP option, with the exception of the DSS option, as indications of congesti | <bcp14>MAY</bcp14> advertise such addresses. The MP_JOIN handshake | |||
| on <xref target="RFC5681"/>, and an MPTCP implementation SHOULD NOT send more th | to create a new subflow (<xref target="sec_join" format="default"/>) | |||
| an two duplicate ACKs in a row for signaling purposes.</t> | provides mechanisms to minimize security risks. The MP_JOIN message | |||
| contains a 32-bit token that uniquely identifies the connection to | ||||
| the receiving host. If the token is unknown, the host will respond | ||||
| with a RST. In the unlikely event that the token is valid at the | ||||
| receiving host, subflow setup will continue, but the HMAC exchange | ||||
| must occur for authentication. The HMAC exchange | ||||
| will fail and will provide | ||||
| sufficient protection against two unconnected hosts accidentally | ||||
| setting up a new subflow upon the signal of a private address. | ||||
| Further security considerations around the issue of ADD_ADDR messages that acci | ||||
| dentally misdirect, or maliciously direct, new MP_JOIN attempts are discussed in | ||||
| <xref target="sec_security" format="default"/>.</t> | ||||
| <t>A host that receives an ADD_ADDR but finds that a connection set up | ||||
| to that IP address and port number is unsuccessful <bcp14>SHOULD NOT</bcp14> pe | ||||
| rform further connection attempts to this address&wj;/port combination for this | ||||
| connection. A sender that wants to trigger a new incoming connection attempt on | ||||
| a previously advertised address&wj;/port combination can therefore refresh ADD_A | ||||
| DDR information by sending the option again.</t> | ||||
| <t>A host can therefore send an ADD_ADDR message with an | ||||
| already-assigned Address ID, but the address <bcp14>MUST</bcp14> be | ||||
| the same as the address previously assigned to this Address ID. A | ||||
| new ADD_ADDR may have the same port number or a different port number. | ||||
| If the port number is different, the receiving host <bcp14>SHOULD</bcp14> try t | ||||
| o set up a new subflow to this new address&wj;/port combination.</t> | ||||
| <t>A host wishing to replace an existing Address ID <bcp14>MUST</bcp14 | ||||
| > first remove the existing one (<xref target="sec_remove_addr" format="default" | ||||
| />).</t> | ||||
| <t>During normal MPTCP operation, it is unlikely that there will be su | ||||
| fficient TCP option space for ADD_ADDR to be included along with those for data | ||||
| sequence numbering (<xref target="sec_dsn" format="default"/>). Therefore, it is | ||||
| expected that an MPTCP implementation will send the ADD_ADDR option on separate | ||||
| ACKs. As discussed earlier, however, an MPTCP implementation <bcp14>MUST NOT</b | ||||
| cp14> treat duplicate ACKs with any MPTCP option, with the exception of the DSS | ||||
| option, as indications of congestion <xref target="RFC5681" format="default"/>, | ||||
| and an MPTCP implementation <bcp14>SHOULD NOT</bcp14> send more than two duplica | ||||
| te ACKs in a row for signaling purposes.</t> | ||||
| </section> | ||||
| <section anchor="sec_remove_addr" numbered="true" toc="default"> | ||||
| <name>Remove Address</name> | ||||
| <t>If, during the lifetime of an MPTCP connection, a previously | ||||
| announced address becomes invalid (e.g., if the interface | ||||
| disappears or an IPv6 address is no longer preferred), the affected | ||||
| host <bcp14>SHOULD</bcp14> announce this situation so that the peer ca | ||||
| n remove | ||||
| subflows related to this address. Even if an address is not in use | ||||
| by an MPTCP connection, if it has been previously announced, an | ||||
| implementation <bcp14>SHOULD</bcp14> announce its removal. A host | ||||
| <bcp14>MAY</bcp14> also choose to announce that a valid IP address | ||||
| should not be used any longer -- for example, for make‑before-br | ||||
| eak session continuity.</t> | ||||
| <t>This is achieved through the Remove Address (REMOVE_ADDR) option | ||||
| (<xref target="tcpm_remove" format="default"/>), which will remove a | ||||
| previously added address (or list of addresses) from a connection | ||||
| and terminate any subflows currently using that address.</t> | ||||
| </section> | <figure anchor="tcpm_remove"> | |||
| <section title="Remove Address" anchor="sec_remove_addr"> | <name>Remove Address (REMOVE_ADDR) Option</name> | |||
| <t>If, during the lifetime of an MPTCP connection, a previously announce | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| d address becomes invalid (e.g., if the interface disappears, or an IPv6 address | 1 2 3 | |||
| is no longer preferred), the affected host SHOULD announce this so that the pee | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
| r can remove subflows related to this address. Even if an address is not in use | +---------------+---------------+-------+-------+---------------+ | |||
| by a MPTCP connection, if it has been previously announced, an implementation SH | | Kind |Length = 3 + n |Subtype|(resvd)| Address ID | ... | |||
| OULD announce its removal. A host MAY also choose to announce that a valid IP ad | +---------------+---------------+-------+-------+---------------+ | |||
| dress should not be used any longer, for example for make-before-break session c | (followed by n-1 Address IDs, if required) ]]></artwo | |||
| ontinuity.</t> | rk> | |||
| <t>This is achieved through the Remove Address (REMOVE_ADDR) option (<xr | </figure> | |||
| ef target="tcpm_remove"/>), which will remove a previously added address (or lis | ||||
| t of addresses) from a connection and terminate any subflows currently using tha | ||||
| t address.</t> | ||||
| <t>For security purposes, if a host receives a REMOVE_ADDR option, it mu | ||||
| st ensure the affected path(s) are no longer in use before it instigates closure | ||||
| . The receipt of REMOVE_ADDR SHOULD first trigger the sending of a TCP keepalive | ||||
| <xref target="RFC1122"/> on the path, and if a response is received the path SH | ||||
| OULD NOT be removed. If the path is found to still be alive, the receiving host | ||||
| SHOULD no longer use the specified address for future connections, but it is the | ||||
| responsibility of the host which sent the REMOVE_ADDR to shut down the subflow. | ||||
| The requesting host MAY also use MP_PRIO (<xref target="sec_policy"/>) to reque | ||||
| st a path is no longer used, before removal. Typical TCP validity tests on the s | ||||
| ubflow (e.g., ensuring sequence and ACK numbers are correct) MUST also be undert | ||||
| aken. An implementation can use indications of these test failures as part of in | ||||
| trusion detection or error logging.</t> | ||||
| <t>The sending and receipt (if no keepalive response was received) of th | ||||
| is message SHOULD trigger the sending of RSTs by both hosts on the affected subf | ||||
| low(s) (if possible), as a courtesy to cleaning up middlebox state, before clean | ||||
| ing up any local state.</t> | ||||
| <t>Address removal is undertaken by ID, so as to permit the use of NATs | ||||
| and other middleboxes that rewrite source addresses. If there is no address at t | ||||
| he requested ID, the receiver will silently ignore the request.</t> | ||||
| <t>A subflow that is still functioning MUST be closed with a FIN exchang | ||||
| e as in regular TCP, rather than using this option. For more information, see <x | ||||
| ref target="sec_close"/>.</t> | ||||
| <?rfc needLines='8'?> | ||||
| <figure align="center" anchor="tcpm_remove" title="Remove Address (REMOV | ||||
| E_ADDR) Option"> | ||||
| <artwork align="left"><![CDATA[ | ||||
| 1 2 3 | ||||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||||
| +---------------+---------------+-------+-------+---------------+ | ||||
| | Kind | Length = 3+n |Subtype|(resvd)| Address ID | ... | ||||
| +---------------+---------------+-------+-------+---------------+ | ||||
| (followed by n-1 Address IDs, if required) | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| </section> | ||||
| <t>For security purposes, if a host receives a REMOVE_ADDR option, | ||||
| it must ensure that the affected path or paths are no longer in use | ||||
| before it instigates closure. The receipt of REMOVE_ADDR | ||||
| <bcp14>SHOULD</bcp14> first trigger the sending of a TCP keepalive | ||||
| <xref target="RFC1122" format="default"/> on the path, and if a | ||||
| response is received, the path <bcp14>SHOULD NOT</bcp14> be | ||||
| removed. If the path is found to still be alive, the receiving host | ||||
| <bcp14>SHOULD</bcp14> no longer use the specified address for future | ||||
| connections, but it is the responsibility of the host that sent the | ||||
| REMOVE_ADDR to shut down the subflow. Before the address is removed, | ||||
| the requesting host | ||||
| <bcp14>MAY</bcp14> also use MP_PRIO (<xref target="sec_policy" | ||||
| format="default"/>) to request that a path no longer be used. Typical | ||||
| TCP validity tests on the subflow (e.g., ensuring | ||||
| that sequence and ACK numbers are correct) <bcp14>MUST</bcp14> also be | ||||
| undertaken. An implementation can use indications of these test failures as par | ||||
| t of intrusion detection or error logging.</t> | ||||
| <t>The sending and receipt (if no keepalive response was received) | ||||
| of this message <bcp14>SHOULD</bcp14> trigger the sending of RSTs by | ||||
| both hosts on the affected subflow(s) (if possible), as a courtesy, | ||||
| to allow the cleanup of middlebox state before cleaning up any local s | ||||
| tate.</t> | ||||
| <t>Address removal is undertaken according to the Address ID, so as to | ||||
| permit the use of NATs and other middleboxes that rewrite source | ||||
| addresses. If an Address ID is not known, the receiver will | ||||
| silently ignore the request.</t> | ||||
| <t>A subflow that is still functioning <bcp14>MUST</bcp14> be closed w | ||||
| ith a FIN exchange as in regular TCP, rather than using this option. For more in | ||||
| formation, see <xref target="sec_close" format="default"/>.</t> | ||||
| </section> | ||||
| </section> | </section> | |||
| <section anchor="sec_fastclose" numbered="true" toc="default"> | ||||
| <section title="Fast Close" anchor="sec_fastclose"> | <name>Fast Close</name> | |||
| <t>Regular TCP has the means of sending a reset (RST) signal to abruptly | <t>Regular TCP has the means of sending a RST signal to abruptly | |||
| close a connection. With MPTCP, a regular RST only has the scope of the | close a connection. With MPTCP, a regular RST only has the scope of | |||
| subflow | the subflow; it | |||
| and will only close the concerned subflow but not affect the remaining | will only close the applicable subflow and will not affect the remaining | |||
| subflows. MPTCP's connection will stay alive at the data level, in order | subflows. MPTCP's connection will stay alive at the data level, in order | |||
| to permit break-before-make handover between subflows. It is therefore | to permit break-before-make handover between subflows. It is therefore | |||
| necessary to provide an MPTCP-level "reset" to allow the abrupt closure | necessary to provide an MPTCP-level "reset" to allow the abrupt closure | |||
| of the whole MPTCP connection, and this is the MP_FASTCLOSE option.</t> | of the whole MPTCP connection; this is done via the MP_FASTCLOSE option. | |||
| </t> | ||||
| <t>MP_FASTCLOSE is used to indicate to the peer that the connection will be | <t>MP_FASTCLOSE is used to indicate to the peer that the connection will be | |||
| abruptly closed and no data will be accepted anymore. The reasons for | abruptly closed and no data will be accepted anymore. The reasons for | |||
| triggering an MP_FASTCLOSE are implementation specific. Regular TCP does | triggering an MP_FASTCLOSE are implementation specific. Regular TCP does | |||
| not allow sending a RST while the connection is in a synchronized | not allow the sending of a RST while the connection is in a synchronized | |||
| state <xref target="RFC0793"/>. Nevertheless, implementations allow | state <xref target="RFC0793" format="default"/>. Nevertheless, implement | |||
| the sending of a RST in this state, if, for example, the operating | ations allow | |||
| the sending of a RST in this state if, for example, the operating | ||||
| system is running out of resources. In these cases, MPTCP should send | system is running out of resources. In these cases, MPTCP should send | |||
| the MP_FASTCLOSE. This option is illustrated in <xref target="tcpm_fastc | the MP_FASTCLOSE. This option is illustrated in <xref target="tcpm_fastc | |||
| lose"/>.</t> | lose" format="default"/>.</t> | |||
| <figure anchor="tcpm_fastclose"> | ||||
| <?rfc needLines='12'?> | <name>Fast Close (MP_FASTCLOSE) Option</name> | |||
| <figure align="center" anchor="tcpm_fastclose" title="Fast Close (MP_FAS | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| TCLOSE) Option"> | 1 2 3 | |||
| <artwork align="left"><![CDATA[ | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
| 1 2 3 | +---------------+---------------+-------+-----------------------+ | |||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | | Kind | Length |Subtype| (reserved) | | |||
| +---------------+---------------+-------+-----------------------+ | +---------------+---------------+-------+-----------------------+ | |||
| | Kind | Length |Subtype| (reserved) | | | Option Receiver's Key | | |||
| +---------------+---------------+-------+-----------------------+ | | (64 bits) | | |||
| | Option Receiver's Key | | | | | |||
| | (64 bits) | | +---------------------------------------------------------------+ ]]></artwork | |||
| | | | > | |||
| +---------------------------------------------------------------+ | ||||
| ]]></artwork> | ||||
| </figure> | </figure> | |||
| <t>If Host A wants to force the closure of an MPTCP connection, it can | ||||
| <t>If Host A wants to force the closure of an MPTCP connection, it has t | do so via two | |||
| wo | options: | |||
| different options: | </t> | |||
| <list style="symbols"> | <ul spacing="normal"> | |||
| <t>Option A (ACK) : Host A sends an ACK containing the MP_FASTCLOSE | <li>Option A (ACK): Host A sends an ACK containing the MP_FASTCLOSE | |||
| option on one subflow, containing the key of Host B as declared in | option on one subflow, containing the key of Host B as declared in | |||
| the initial connection handshake. On all the other subflows, Host A | the initial connection handshake. On all the other subflows, Host&n | |||
| sends a regular TCP RST to close these subflows, and tears them down. | bsp;A | |||
| Host A now enters FASTCLOSE_WAIT state.</t> | sends a regular TCP RST to close these subflows and tears them down. | |||
| Host A now enters FASTCLOSE_WAIT state.</li> | ||||
| <t>Option R (RST) : Host A sends a RST containing the MP_FASTCLOSE | <li>Option R (RST): Host A sends a RST containing the MP_FASTCLOSE | |||
| option on all subflows, containing the key of Host B as declared in | option on all subflows, containing the key of Host B as declared in | |||
| the initial connection handshake. Host A can tear the subflows and | the initial connection handshake. Host A can tear down the subflows | |||
| the connection down immediately.</t> | and | |||
| </list> | the connection immediately.</li> | |||
| </t> | </ul> | |||
| <t>If Host A decides to force the closure by using Option A and sending | ||||
| <t>If host A decides to force the closure by using Option A and sending | an ACK with the MP_FASTCLOSE option, the connection shall proceed as fol | |||
| an ACK with the MP_FASTCLOSE option, the connection shall proceed as foll | lows: | |||
| ows: | </t> | |||
| <list style="symbols"> | <ul spacing="normal"> | |||
| <t>Upon receipt of an ACK with MP_FASTCLOSE by Host B, containing th | <li>Upon receipt of an ACK with MP_FASTCLOSE by Host B, containing the | |||
| e valid key, Host B answers | valid key, Host B answers | |||
| on the same subflow with a TCP RST and tears down all subflows also | on the same subflow with a TCP RST and tears down all subflows | |||
| through sending TCP RST signals. Host B can | also through sending TCP RST signals. Host B can | |||
| now close the whole MPTCP connection (it transitions directly to CLO | now close the whole MPTCP connection (it transitions directly to CLO | |||
| SED state).</t> | SED state).</li> | |||
| <li>As soon as Host A has received the TCP RST on the remaining subflo | ||||
| <t>As soon as Host A has received the TCP RST on the remaining subfl | w, it | |||
| ow, it | ||||
| can close this subflow and tear down the whole connection (transition from | can close this subflow and tear down the whole connection (transition from | |||
| FASTCLOSE_WAIT to CLOSED states). If Host A receives an MP_FASTCLOSE instead | FASTCLOSE_WAIT state to CLOSED state). If Host A receives an MP_FASTCLOSE instead | |||
| of a TCP RST, both hosts attempted fast closure simultaneously. Host A should | of a TCP RST, both hosts attempted fast closure simultaneously. Host A should | |||
| reply with a TCP RST and tear down the connection.</t> | reply with a TCP RST and tear down the connection.</li> | |||
| <li>If Host A does not receive a TCP RST in reply to its MP_FASTCLOSE | ||||
| <t>If Host A does not receive a TCP RST in reply to its MP_FASTCLOSE | after one | |||
| after one | retransmission timeout (RTO) (the RTO of the subflow where the MP_FA | |||
| retransmission timeout (RTO) (the RTO of the subflow where the MP_FA | STCLOSE has been sent), it <bcp14>SHOULD</bcp14> | |||
| STCLOSE has been sent), it SHOULD | retransmit the MP_FASTCLOSE. To keep this connection from being | |||
| retransmit the MP_FASTCLOSE. The number of retransmissions SHOULD be | retained for a long time, the number of retransmissions <bcp14>SHOUL | |||
| limited to avoid this connection from being retained for a long time | D</bcp14> be | |||
| , but | limited; | |||
| this limit is implementation specific. A RECOMMENDED number is 3. If | this limit is implementation specific. A <bcp14>RECOMMENDED</bcp14> | |||
| no TCP RST | number is 3. If no TCP RST | |||
| is received in response, Host A SHOULD send a TCP RST with the MP_FA | is received in response, Host A <bcp14>SHOULD</bcp14> send a TCP RST | |||
| STCLOSE option | with the MP_FASTCLOSE option | |||
| itself when it releases state in order to clear any remaining state a | itself when it releases state in order to clear any remaining state | |||
| t middleboxes.</t> | at middleboxes.</li> | |||
| </list> | </ul> | |||
| </t> | <t>If, however, Host A decides to force the closure by using Option R an | |||
| d | ||||
| <t>If however host A decides to force the closure by using Option R and | sending a RST with the MP_FASTCLOSE option, Host B will act as follows: | |||
| sending a RST with the MP_FASTCLOSE option, Host B will act as follows: | upon receipt of a RST with MP_FASTCLOSE, containing the valid key, | |||
| Upon receipt of a RST with MP_FASTCLOSE, containing the valid key, | Host B tears down all subflows by sending a TCP RST. Host B can now | |||
| Host B tears down all subflows by sending a TCP RST. Host B can now close | close the whole MPTCP | |||
| the whole MPTCP | connection (it transitions directly to CLOSED state).</t> | |||
| connection (it transitions directly to CLOSED state).</t> | ||||
| </section> | </section> | |||
| <section anchor="sec_reset" numbered="true" toc="default"> | ||||
| <section title="Subflow Reset" anchor="sec_reset"> | <name>Subflow Reset</name> | |||
| <t>An implementation of MPTCP may also need to send a regular TCP RST to | <t>An implementation of MPTCP may also need to send a regular TCP RST to | |||
| force | force | |||
| the closure of a subflow. A host sends a TCP RST in order to close a subf | the closure of a subflow. A host sends a TCP RST in order to close a sub | |||
| low | flow | |||
| or reject an attempt to open a subflow (MP_JOIN). In order to inform the | or reject an attempt to open a subflow (MP_JOIN). In order to let the | |||
| receiving host why a subflow is being closed or rejected, the TCP RST pac | receiving host know why a subflow is being closed or rejected, the TCP R | |||
| ket | ST packet | |||
| MAY include the MP_TCPRST Option. The host MAY use this information to | <bcp14>MAY</bcp14> include the MP_TCPRST option (<xref target="tcpm_rese | |||
| decide, for example, whether it tries to re-establish the subflow | t"/>). The host <bcp14>MAY</bcp14> use this information to | |||
| immediately, later, or never.</t> | decide, for example, whether it tries to re-establish the subflow | |||
| immediately, later, or never.</t> | ||||
| <?rfc needLines='8'?> | <figure anchor="tcpm_reset"> | |||
| <figure align="center" anchor="tcpm_reset" title="TCP RST Reason (MP_TCP | <name>TCP RST Reason (MP_TCPRST) Option</name> | |||
| RST) Option"> | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| <artwork align="left"><![CDATA[ | 1 2 3 | |||
| 1 2 3 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | +---------------+---------------+-------+-----------------------+ | |||
| +---------------+---------------+-------+-----------------------+ | | Kind | Length |Subtype|U|V|W|T| Reason | | |||
| | Kind | Length |Subtype|U|V|W|T| Reason | | +---------------+---------------+-------+-----------------------+ ]]></artwork | |||
| +---------------+---------------+-------+-----------------------+ | > | |||
| ]]></artwork> | ||||
| </figure> | </figure> | |||
| <t>The MP_TCPRST option contains a reason code that allows the | <t>The MP_TCPRST option contains a reason code that allows the | |||
| sender of the option to provide more information about the reason for | sender of the option to provide more information about the reason for | |||
| the termination of the subflow. Using 12 bits of option space, the | the termination of the subflow. Using 12 bits of option space, the | |||
| first four bits are reserved for flags (only one of which is currently | first 4 bits are reserved for flags (only one of which is currently | |||
| defined), and the remaining octet is used to express a reason code for | defined), and the remaining octet is used to express a reason code for | |||
| this subflow termination, from which a receiver MAY infer information | this subflow termination, from which a receiver <bcp14>MAY</bcp14> infer information | |||
| about the usability of this path.</t> | about the usability of this path.</t> | |||
| <t>The "T" flag is used by the sender to indicate whether the error | <t>The "T" flag is used by the sender to indicate whether the error | |||
| condition that is reported is Transient (T bit set to 1) or Permanent | condition that is reported is Transient ("T" bit set to 1) or Permanent | |||
| (T bit set to 0). If the error condition is considered to be | ("T" bit set to 0). If the error condition is considered to be | |||
| Transient by the sender of the RST segment, the recipient of this | Transient by the sender of the RST segment, the recipient of this | |||
| segment MAY try to reestablish a subflow for this connection over the | segment <bcp14>MAY</bcp14> try to re-establish a subflow for this connec | |||
| failed path. The time at which a receiver may try to re-establish this | tion over the | |||
| is implementation-specific, but SHOULD take into account the properties | failed path. The time at which a receiver may try to | |||
| of the failure defined by the following reason code. If the error condi | re‑establish this subflow | |||
| tion | is implementation specific but <bcp14>SHOULD</bcp14> take into account t | |||
| is considered to be permanent, the receiver of the RST segment SHOULD NO | he properties | |||
| T try | of the failure as defined by the provided reason code. If the error con | |||
| to reestablish a subflow for this connection over this path. The "U", " | dition | |||
| V" | is considered to be Permanent, the receiver of the RST segment <bcp14>SH | |||
| OULD NOT</bcp14> try | ||||
| to re‑establish a subflow for this connection over this path. The | ||||
| "U", "V", | ||||
| and "W" flags are not defined by this specification and are reserved for | and "W" flags are not defined by this specification and are reserved for | |||
| future use. An implementation of this specification MUST set these flags | future use. An implementation of this specification <bcp14>MUST</bcp14> | |||
| to 0, and a receiver MUST ignore them.</t> | set these flags | |||
| to 0, and a receiver <bcp14>MUST</bcp14> ignore them.</t> | ||||
| <t>The "Reason" code is an 8-bit field that indicates the reason for | <t>"Reason" is an 8-bit field that indicates the reason code for | |||
| the termination of the subflow. The following codes are defined in | the termination of the subflow. The following codes are defined in | |||
| this document: | this document: | |||
| <list style="symbols"> | </t> | |||
| <t>Unspecified error (code 0x0). This is the default error implying | <ul spacing="normal"> | |||
| the | <li>Unspecified error (code 0x00). This is the default error; | |||
| it implies that the | ||||
| subflow is no longer available. The presence of this option shows | subflow is no longer available. The presence of this option shows | |||
| that the RST was generated by a MPTCP-aware device.</t> | that the RST was generated by an MPTCP-aware device.</li> | |||
| <li>MPTCP-specific error (code 0x01). An error has been detected in t | ||||
| <t>MPTCP specific error (code 0x01). An error has been detected in | he | |||
| the | ||||
| processing of MPTCP options. This is the usual reason code to return | processing of MPTCP options. This is the usual reason code to return | |||
| in the cases where a RST is being sent to close a subflow for reason | in the cases where a RST is being sent to close a subflow because | |||
| s | of an invalid response.</li> | |||
| of an invalid response.</t> | <li>Lack of resources (code 0x02). This code indicates that the | |||
| <t>Lack of resources (code 0x02). This code indicates that the | ||||
| sending host does not have enough resources to support the | sending host does not have enough resources to support the | |||
| terminated subflow.</t> | terminated subflow.</li> | |||
| <li>Administratively prohibited (code 0x03). This code indicates that | ||||
| <t>Administratively prohibited (code 0x03). This code indicates tha | ||||
| t | ||||
| the requested subflow is prohibited by the policies of the sending | the requested subflow is prohibited by the policies of the sending | |||
| host.</t> | host.</li> | |||
| <li>Too much outstanding data (code 0x04). This code indicates that | ||||
| <t>Too much outstanding data (code 0x04). This code indicates that | there is an excessive amount of data that needs to be transmitted | |||
| there is an excessive amount of data that need to be transmitted | ||||
| over the terminated subflow while having already been acknowledged | over the terminated subflow while having already been acknowledged | |||
| over one or more other subflows. This may occur if a path has been | over one or more other subflows. This may occur if a path has been | |||
| unavailable for a short period and it is more efficient to reset and | unavailable for a short period and it is more efficient to reset and | |||
| start again than it is to retransmit the queued data.</t> | start again than it is to retransmit the queued data.</li> | |||
| <li>Unacceptable performance (code 0x05). This code indicates that | ||||
| <t>Unacceptable performance (code 0x05). This code indicates that | ||||
| the performance of this subflow was too low compared to the other | the performance of this subflow was too low compared to the other | |||
| subflows of this Multipath TCP connection.</t> | subflows of this Multipath TCP connection.</li> | |||
| <li>Middlebox interference (code 0x06). Middlebox interference has | ||||
| <t>Middlebox interference (code 0x06). Middlebox interference has | been detected over this subflow, making MPTCP signaling invalid. Fo | |||
| been detected over this subflow making MPTCP signaling invalid. For | r | |||
| example, this may be sent if the checksum does not validate.</t> | example, this may be sent if the checksum does not validate.</li> | |||
| </list> | </ul> | |||
| </t> | ||||
| </section> | </section> | |||
| <section anchor="sec_fallback" numbered="true" toc="default"> | ||||
| <section title="Fallback" anchor="sec_fallback"> | <name>Fallback</name> | |||
| <t>Sometimes, middleboxes will exist on a path that could prevent the op | <t>Sometimes, middleboxes will exist on a path that could prevent the | |||
| eration of MPTCP. MPTCP has been designed in order to cope with many middlebox m | operation of MPTCP. MPTCP has been designed to cope with many | |||
| odifications (see <xref target="sec_middleboxes"/>), but there are still some ca | middlebox modifications (see <xref target="sec_middleboxes" | |||
| ses where a subflow could fail to operate within the MPTCP requirements. These c | format="default"/>), but there are still some cases where a subflow | |||
| ases are notably the following: the loss of MPTCP options on a path, and the mod | could fail to operate within the MPTCP requirements. Notably, these case | |||
| ification of payload data. If such an event occurs, it is necessary to "fall bac | s are the following: the loss of MPTCP options on a path and the modification of | |||
| k" to the previous, safe operation. This may be either falling back to regular T | payload data. If such an event occurs, it is necessary to "fall back" to the pr | |||
| CP or removing a problematic subflow.</t> | evious, safe operation. This may be either falling back to regular TCP or removi | |||
| ng a problematic subflow.</t> | ||||
| <t>At the start of an MPTCP connection (i.e., the first subflow), it is | <t>At the start of an MPTCP connection (i.e., the first subflow), it is | |||
| important to ensure that the path is fully MPTCP capable and the necessary MPTCP | important to ensure that the path is fully MPTCP capable and the necessary MPTCP | |||
| options can reach each host. The handshake as described in <xref target="sec_in | options can reach each host. The handshake as described in <xref target="sec_in | |||
| it"/> SHOULD fall back to regular TCP if either of the SYN messages do not have | it" format="default"/> <bcp14>SHOULD</bcp14> fall back to regular TCP if either | |||
| the MPTCP options: this is the same, and desired, behavior in the case where a h | of the SYN messages does not have the MPTCP options: this is the same, and desir | |||
| ost is not MPTCP capable, or the path does not support the MPTCP options. When a | ed, behavior in the case where a host is not MPTCP capable or the path does not | |||
| ttempting to join an existing MPTCP connection (<xref target="sec_join"/>), if a | support the MPTCP options. When attempting to join an existing MPTCP connection | |||
| path is not MPTCP capable and the MPTCP options do not get through on the SYNs, | (<xref target="sec_join" format="default"/>), if a path is not MPTCP capable and | |||
| the subflow will be closed according to the MP_JOIN logic.</t> | the MPTCP options do not get through on the SYNs, the subflow will be closed ac | |||
| cording to the MP_JOIN logic.</t> | ||||
| <t>There is, however, another corner case that should be addressed. That | <t>There is, however, another corner case that should be addressed: | |||
| is one of MPTCP options getting through on the SYN, but not on regular packets. | the case where MPTCP options get through on the SYN but not on regular | |||
| This can be resolved if the subflow is the first subflow, and thus all data in | packets. If the subflow is the first subflow and thus all data in | |||
| flight is contiguous, using the following rules.</t> | flight is contiguous, this situation can be resolved by using the follow | |||
| ing rules:</t> | ||||
| <t>A sender MUST include a DSS option with data sequence mapping in ever | <ul spacing="normal"> | |||
| y segment until one of the sent segments has been acknowledged with a DSS option | <li>A sender <bcp14>MUST</bcp14> include a DSS option with Data Sequence Mapping | |||
| containing a Data ACK. Upon reception of the acknowledgment, the sender has the | in every segment until one of the sent segments has been acknowledged with a DS | |||
| confirmation that the DSS option passes in both directions and may choose to se | S option containing a Data ACK. Upon reception of the acknowledgment, the sender | |||
| nd fewer DSS options than once per segment.</t> | has the confirmation that the DSS option passes in both directions and may choo | |||
| se to send fewer DSS options than once per segment.</li> | ||||
| <t>If, however, an ACK is received for data (not just for the SYN) witho | <li>If, however, an ACK is received for data (not just for the SYN) | |||
| ut a DSS option containing a Data ACK, the sender determines the path is not MPT | without a DSS option containing a Data ACK, the sender determines that t | |||
| CP capable. In the case of this occurring on an additional subflow (i.e., one st | he path is not MPTCP capable. In the case of this occurring on an additional sub | |||
| arted with MP_JOIN), the host MUST close the subflow with a RST, which SHOULD co | flow (i.e., one started with MP_JOIN), the host <bcp14>MUST</bcp14> close the su | |||
| ntain a MP_TCPRST option (<xref target="sec_reset"/>) with a "Middlebox interfer | bflow with a RST, which <bcp14>SHOULD</bcp14> contain an MP_TCPRST option (<xref | |||
| ence" reason code.</t> | target="sec_reset" format="default"/>) with a "Middlebox interference" reason c | |||
| ode.</li> | ||||
| <t>In the case of such an ACK being received on the first subflow (i.e., | <li>In the case of such an ACK being received on the first subflow | |||
| that started with MP_CAPABLE), before any additional subflows are added, the im | (i.e., that started with MP_CAPABLE), before any additional subflows | |||
| plementation MUST drop out of an MPTCP mode, back to regular TCP. The sender wil | are added, the implementation <bcp14>MUST</bcp14> drop out of MPTCP | |||
| l send one final data sequence mapping, with the Data-Level Length value of 0 in | mode and fall back to regular TCP. The sender will send one final Data S | |||
| dicating an infinite mapping (to inform the other end in case the path drops opt | equence Mapping, with the Data-Level Length value of 0 indicating an infinite ma | |||
| ions in one direction only), and then revert to sending data on the single subfl | pping (to inform the other end in case the path drops options in one direction o | |||
| ow without any MPTCP options.</t> | nly), and then revert to sending data on the single subflow without any MPTCP op | |||
| tions.</li> | ||||
| <t>If a subflow breaks during operation, e.g. if it is re-routed and MPT | <li>If a subflow breaks during operation, e.g., if it is rerouted and | |||
| CP options are no longer permitted, then once this is detected (by the subflow-l | MPTCP options are no longer permitted, then once this is detected (by | |||
| evel receive buffer filling up, since there is no mapping available in order to | the subflow-level receive buffer filling up, since there is no mapping | |||
| DATA_ACK this data), the subflow SHOULD be treated as broken and closed with a R | available in order to DATA_ACK this data), the subflow | |||
| ST, since no data can be delivered to the application layer, and no fallback sig | <bcp14>SHOULD</bcp14> be treated as broken and closed with a RST, | |||
| nal can be reliably sent. This RST SHOULD include the MP_TCPRST option (<xref ta | since no data can be delivered to the application layer and no | |||
| rget="sec_reset"/>) with a "Middlebox interference" reason code.</t> | fallback signal can be reliably sent. This RST <bcp14>SHOULD</bcp14> | |||
| include the MP_TCPRST option (<xref target="sec_reset" | ||||
| <t>These rules should cover all cases where such a failure could happen: | format="default"/>) with a "Middlebox interference" reason code.</li> | |||
| whether it's on the forward or reverse path and whether the server or the clien | </ul> | |||
| t first sends data.</t> | <t>These rules should cover all cases where such a failure could | |||
| happen -- whether it's on the forward or reverse path and whether the se | ||||
| <t>So far this section has discussed the loss of MPTCP options, either i | rver or the client first sends data.</t> | |||
| nitially, or during the course of the connection. As described in <xref target=" | <t>So far, this section has discussed the loss of MPTCP options, | |||
| sec_generalop"/>, each portion of data for which there is a mapping is protected | either initially or during the course of the connection. As described | |||
| by a checksum, if checksums have been negotiated. This mechanism is used to det | in <xref target="sec_generalop" format="default"/>, each portion of | |||
| ect if middleboxes have made any adjustments to the payload (added, removed, or | data for which there is a mapping is protected by a checksum, if | |||
| changed data). A checksum will fail if the data has been changed in any way. Thi | checksums have been negotiated. This mechanism is used to detect if | |||
| s will also detect if the length of data on the subflow is increased or decrease | middleboxes have made any adjustments to the payload (added, removed, | |||
| d, and this means the data sequence mapping is no longer valid. The sender no lo | or changed data). A checksum will fail if the data has been changed in | |||
| nger knows what subflow-level sequence number the receiver is genuinely operatin | any way. The use of a checksum will also detect whether the length of da | |||
| g at (the middlebox will be faking ACKs in return), and it cannot signal any fur | ta on the subflow is | |||
| ther mappings. Furthermore, in addition to the possibility of payload modificati | increased or decreased, and this means the Data Sequence Mapping is no | |||
| ons that are valid at the application layer, there is the possibility that such | longer valid. The sender no longer knows what subflow-level sequence | |||
| modifications could be triggered across MPTCP segment boundaries, corrupting the | number the receiver is genuinely operating at (the middlebox will be | |||
| data. Therefore, all data from the start of the segment that failed the checksu | faking ACKs in return), and it cannot signal any further | |||
| m onwards is not trustworthy.</t> | mappings. Furthermore, in addition to the possibility of payload | |||
| modifications that are valid at the application layer, it is possible th | ||||
| <t>Note that if checksum usage has not been negotiated, this fallback me | at such modifications could be triggered across MPTCP segment boundaries, corrup | |||
| chanism cannot be used unless there is some higher or lower layer signal to info | ting the data. Therefore, all data from the start of the segment that failed the | |||
| rm the MPTCP implementation that the payload has been tampered with.</t> | checksum onward is not trustworthy.</t> | |||
| <t>Note that if checksum usage has not been negotiated, this fallback me | ||||
| <t>When multiple subflows are in use, the data in flight on a subflow wi | chanism cannot be used unless there is some higher-layer or lower‑layer si | |||
| ll likely involve data that is not contiguously part of the connection-level str | gnal to inform the MPTCP implementation that the payload has been tampered with. | |||
| eam, since segments will be spread across the multiple subflows. Due to the prob | </t> | |||
| lems identified above, it is not possible to determine what adjustment has done | <t>When multiple subflows are in use, the data in flight on a subflow | |||
| to the data (notably, any changes to the subflow sequence numbering). Therefore, | will likely involve data that is not contiguously part of the | |||
| it is not possible to recover the subflow, and the affected subflow must be imm | connection-level stream, since segments will be spread across the | |||
| ediately closed with a RST, featuring an MP_FAIL option (<xref target="tcpm_fall | multiple subflows. Due to the problems identified above, it is not | |||
| back"/>), which defines the data sequence number at the start of the segment (de | possible to determine what adjustments have been done to the data (notab | |||
| fined by the data sequence mapping) that had the checksum failure. Note that the | ly, | |||
| MP_FAIL option requires the use of the full 64-bit sequence number, even if 32- | any changes to the subflow sequence numbering). Therefore, it is not | |||
| bit sequence numbers are normally in use in the DSS signals on the path.</t> | possible to recover the subflow, and the affected subflow must be | |||
| immediately closed with a RST that includes an MP_FAIL option (<xref tar | ||||
| <?rfc needLines='8'?> | get="tcpm_fallback" format="default"/>), which defines the data sequence number | |||
| <figure align="center" anchor="tcpm_fallback" title="Fallback (MP_FAIL) | at the start of the segment (defined by the Data Sequence Mapping) that had the | |||
| Option"> | checksum failure. Note that the MP_FAIL option requires the use of the full 64-b | |||
| <artwork align="left"><![CDATA[ | it sequence number, even if 32-bit sequence numbers are normally in use in the D | |||
| 1 2 3 | SS signals on the path.</t> | |||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | <figure anchor="tcpm_fallback"> | |||
| +---------------+---------------+-------+----------------------+ | <name>Fallback (MP_FAIL) Option</name> | |||
| | Kind | Length=12 |Subtype| (reserved) | | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| +---------------+---------------+-------+----------------------+ | 1 2 3 | |||
| | | | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
| | Data Sequence Number (8 octets) | | +---------------+---------------+-------+----------------------+ | |||
| | | | | Kind | Length=12 |Subtype| (reserved) | | |||
| +--------------------------------------------------------------+ | +---------------+---------------+-------+----------------------+ | |||
| | | | ||||
| ]]></artwork> | | Data Sequence Number (8 octets) | | |||
| | | | ||||
| +--------------------------------------------------------------+ ]]></artwork> | ||||
| </figure> | </figure> | |||
| <t>The receiver of this option <bcp14>MUST</bcp14> discard all data foll | ||||
| owing the data sequence number specified. | ||||
| Failed data <bcp14>MUST NOT</bcp14> be DATA_ACKed and so will be retrans | ||||
| mitted on other subflows (<xref target="sec_retransmit" format="default"/>). </t | ||||
| > | ||||
| <t>A special case is when there is a single subflow and it fails with a | ||||
| checksum error. If it is known that all unacknowledged data in | ||||
| flight is contiguous (which will usually be the case with a single | ||||
| subflow), an infinite mapping can be applied to the subflow without | ||||
| the need to close it first, essentially turning off all further | ||||
| MPTCP signaling. | ||||
| <t>The receiver of this option MUST discard all data following the data | In this case, if a receiver identifies a checksum failure | |||
| sequence number specified. | ||||
| Failed data MUST NOT be DATA_ACKed and so will be retransmitted on other | ||||
| subflows (<xref target="sec_retransmit"/>). </t> | ||||
| <t>A special case is when there is a single subflow and it fails with a | ||||
| checksum error. | ||||
| If it is known that all unacknowledged data in flight is | ||||
| contiguous (which will usually be the case with a single subflow), an infinite m | ||||
| apping can be applied to the subflow without the need to close it first, and | ||||
| essentially turn off all further MPTCP signaling. In this case, if a receiver id | ||||
| entifies a checksum failure | ||||
| when there is only one path, it will send back an MP_FAIL option on the subflow-level ACK, referring to the data-level sequence number of the start of the | when there is only one path, it will send back an MP_FAIL option on the subflow-level ACK, referring to the data-level sequence number of the start of the | |||
| segment on which the checksum error was detected. The sender will receive | segment on which the checksum error was detected. The sender will receive | |||
| this, and if all unacknowledged data in flight is contiguous, will signal an inf | this information and, if all unacknowledged data in flight is contiguous, will s | |||
| inite mapping. | ignal an infinite mapping. | |||
| This infinite mapping will be a DSS option (<xref target="sec_generalop"/>) | This infinite mapping will be a DSS option (<xref target="sec_generalop" format= | |||
| on the first new packet, containing a data sequence mapping that acts retroactiv | "default"/>) | |||
| ely, referring to the start of the subflow sequence | on the first new packet, containing a Data Sequence Mapping that acts retroactiv | |||
| number of the most recent segment that was known to be delivered intact (i.e. wa | ely, referring to the start of the subflow sequence | |||
| s successfully DATA_ACKed). From that point onwards, data can be altered | number of the most recent segment that was known to be delivered intact (i.e., w | |||
| as successfully DATA_ACKed). From that point onward, data can be altered | ||||
| by a middlebox without affecting MPTCP, as the data stream is equivalent to a regular, legacy TCP session. | by a middlebox without affecting MPTCP, as the data stream is equivalent to a regular, legacy TCP session. | |||
| Whilst in theory paths may only be damaged in one direction, and the MP_FAIL sig | While in theory paths may only be damaged in one direction -- and the MP_FAIL | |||
| nal affects only one direction of traffic, | signal affects only one direction of traffic -- | |||
| for implementation simplicity, the receiver of an MP_FAIL MUST also respond with | for simplicity of implementation, the receiver of an MP_FAIL <bcp14>MUST</bcp14> | |||
| an MP_FAIL in the reverse direction and entirely revert to a regular TCP sessio | also respond with an MP_FAIL in the reverse direction and entirely revert to a | |||
| n.</t> | regular TCP session.</t> | |||
| <t>In the rare case that the data is not contiguous (which could happen when there is only one subflow but it is retransmitting data from a subflow | <t>In the rare case that the data is not contiguous (which could happen when there is only one subflow but it is retransmitting data from a subflow | |||
| that has recently been uncleanly closed), the receiver MUST close the subflow wi | that has recently been uncleanly closed), the receiver <bcp14>MUST</bcp14> close | |||
| th a RST with MP_FAIL. The receiver MUST discard all data that follows the | the subflow with a RST with MP_FAIL. The receiver <bcp14>MUST</bcp14> discard a | |||
| data sequence number specified. The sender MAY attempt to create a new subflow b | ll data that follows the | |||
| elonging to the same connection, and, if it chooses to do so, SHOULD place | data sequence number specified. The sender <bcp14>MAY</bcp14> attempt to | |||
| the single subflow immediately in single-path mode by setting an infinite data s | create a new subflow belonging to the same connection and, if it chooses to do | |||
| equence mapping. This mapping will begin from the data-level sequence number | so, <bcp14>SHOULD</bcp14> immediately place | |||
| the single subflow in single-path mode by setting an infinite Data Sequence Mapp | ||||
| ing. This mapping will begin from the data-level sequence number | ||||
| that was declared in the MP_FAIL.</t> | that was declared in the MP_FAIL.</t> | |||
| <t>After a sender signals an infinite mapping, it <bcp14>MUST</bcp14> on | ||||
| <t>After a sender signals an infinite mapping, it MUST only use subflow | ly use subflow ACKs to clear its send buffer. | |||
| ACKs to clear its send buffer. | ||||
| This is because Data ACKs may become misaligned with the subflow ACKs when middleboxes insert or delete data. | This is because Data ACKs may become misaligned with the subflow ACKs when middleboxes insert or delete data. | |||
| The receive SHOULD stop generating Data ACKs after it receives an infinite mappi | The receiver <bcp14>SHOULD</bcp14> stop generating Data ACKs after it receives | |||
| ng. </t> | an infinite mapping.</t> | |||
| <t>When a connection has fallen back with an infinite mapping, only one | ||||
| <t>When a connection has fallen back with an infinite mapping, only one | subflow can send data; otherwise, the receiver would not know how to reorder the | |||
| subflow can send data; otherwise, the receiver would not know how to reorder the | data. In practice, this means that all MPTCP subflows will have to be terminate | |||
| data. In practice, this means that all MPTCP subflows will have to be terminate | d except one. Once MPTCP falls back to regular TCP, it <bcp14>MUST NOT</bcp14> r | |||
| d except one. Once MPTCP falls back to regular TCP, it MUST NOT revert to MPTCP | evert to MPTCP later in the connection.</t> | |||
| later in the connection.</t> | ||||
| <t>It should be emphasized that MPTCP is not attempting to prevent the use of middleboxes that want to adjust the payload. An MPTCP-aware middlebox could provide such functionality by also rewriting checksums.</t> | <t>It should be emphasized that MPTCP is not attempting to prevent the use of middleboxes that want to adjust the payload. An MPTCP-aware middlebox could provide such functionality by also rewriting checksums.</t> | |||
| </section> | </section> | |||
| <section anchor="sec_errors" numbered="true" toc="default"> | ||||
| <section title="Error Handling" anchor="sec_errors"> | <name>Error Handling</name> | |||
| <t>In addition to the fallback mechanism as described above, the standar | <t>In addition to the fallback mechanism described above, the standard c | |||
| d classes of TCP errors may need to be handled in an MPTCP-specific way. Note th | lasses of TCP errors may need to be handled in an MPTCP‑specific way. Note | |||
| at changing semantics -- such as the relevance of a RST -- are covered in <xref | that changing semantics -- such as the relevance of a RST -- are covered in <xr | |||
| target="sec_semantics"/>. Where possible, we do not want to deviate from regular | ef target="sec_semantics" format="default"/>. Where possible, we do not want to | |||
| TCP behavior.</t> | deviate from regular TCP behavior.</t> | |||
| <t>The following list covers possible errors and the appropriate MPTCP behavior: | <t>The following list covers possible errors and the appropriate MPTCP behavior: | |||
| <list style="symbols"> | ||||
| <t>Unknown token in MP_JOIN (or HMAC failure in MP_JOIN ACK, or miss | ||||
| ing MP_JOIN in SYN/ACK response): send RST (analogous to TCP's behavior on an un | ||||
| known port)</t> | ||||
| <t>DSN out of window (during normal operation): drop the data, do no | ||||
| t send Data ACKs</t> | ||||
| <t>Remove request for unknown address ID: silently ignore</t> | ||||
| </list> | ||||
| </t> | </t> | |||
| <ul spacing="normal"> | ||||
| <li>Unknown token in MP_JOIN (or HMAC failure in MP_JOIN ACK, or missi | ||||
| ng MP_JOIN in SYN/ACK response): send RST (analogous to TCP's behavior on an unk | ||||
| nown port)</li> | ||||
| <li>DSN out of window (during normal operation): drop the data; do not | ||||
| send Data ACKs</li> | ||||
| <li>Remove request for unknown Address ID: silently ignore</li> | ||||
| </ul> | ||||
| </section> | </section> | |||
| <section anchor="heuristics" numbered="true" toc="default"> | ||||
| <section title="Heuristics" anchor="heuristics"> | <name>Heuristics</name> | |||
| <t>There are a number of heuristics that are needed for | <t>There are a number of heuristics that are needed for | |||
| performance or deployment but that are not required for | performance or deployment but that are not required for | |||
| protocol correctness. In this section, we detail such | protocol correctness. In this section, we detail such | |||
| heuristics. Note that discussion of buffering and certain | heuristics. Note that discussions of buffering and certain | |||
| sender and receiver window behaviors are presented in Sections | sender and receiver window behaviors are presented in Sections | |||
| <xref target="sec_rwin" format="counter"/> and <xref target="sec_sender" | <xref target="sec_rwin" format="counter"/> and <xref | |||
| format="counter"/>, | target="sec_sender" format="counter"/>, | |||
| as well as retransmission in <xref target="sec_retransmit"/>.</t> | and retransmission is discussed in <xref target="sec_retransmit" format= | |||
| "default"/>.</t> | ||||
| <section title="Port Usage"> | <section numbered="true" toc="default"> | |||
| <t>Under typical operation, an MPTCP implementation SHOULD use | <name>Port Usage</name> | |||
| the same ports as already in use. In other words, the | <t>Under typical operation, an MPTCP implementation <bcp14>SHOULD</bcp | |||
| destination port of a SYN containing an MP_JOIN option SHOULD | 14> use | |||
| the same ports as the ports that are already in use. In other words, t | ||||
| he | ||||
| destination port of a SYN containing an MP_JOIN option <bcp14>SHOULD</ | ||||
| bcp14> | ||||
| be the same as the remote port of the first subflow in the | be the same as the remote port of the first subflow in the | |||
| connection. The local port for such SYNs SHOULD also be the | connection. The local port for such SYNs <bcp14>SHOULD</bcp14> also b | |||
| same as for the first subflow (and as such, an | e the | |||
| implementation SHOULD reserve ephemeral ports across all | same as the port for the first subflow (and as such, an | |||
| implementation <bcp14>SHOULD</bcp14> reserve ephemeral ports across al | ||||
| l | ||||
| local IP addresses), although there may be cases where this | local IP addresses), although there may be cases where this | |||
| is infeasible. This strategy is intended to maximize the | is infeasible. This strategy is intended to maximize the | |||
| probability of the SYN being permitted by a firewall or NAT | probability of the SYN being permitted by a firewall or NAT | |||
| at the recipient and to avoid confusing any network | at the recipient and to avoid confusing any network-monitoring softwar | |||
| monitoring software.</t> | e.</t> | |||
| <t>There may also be cases, however, where a host wishes to | <t>There may also be cases, however, where a host wishes to | |||
| signal that a specific port should be used, and this facility | signal that a specific port should be used; this facility | |||
| is provided in the ADD_ADDR option as documented in | is provided in the ADD_ADDR option as documented in | |||
| <xref target="sec_add_address"/>. It is therefore feasible | <xref target="sec_add_address" format="default"/>. It is therefore feasible | |||
| to allow multiple subflows between the same two addresses | to allow multiple subflows between the same two addresses | |||
| but using different port pairs, and | but using different port pairs, and | |||
| such a facility could be used to allow load balancing within | such a facility could be used to allow load balancing within | |||
| the network based on 5-tuples (e.g., some ECMP implementations <xref target="RFC2992"/>).</t> | the network based on 5-tuples (e.g., some ECMP implementations <xref target="RFC2992" format="default"/>).</t> | |||
| </section> | </section> | |||
| <section numbered="true" toc="default"> | ||||
| <section title="Delayed Subflow Start and Subflow Symmetry"> | <name>Delayed Subflow Start and Subflow Symmetry</name> | |||
| <t>Many TCP connections are short-lived and consist only of a few | <t>Many TCP connections are short-lived and consist only of a few | |||
| segments, and so the overheads | segments, and so the overhead | |||
| of using MPTCP outweigh any benefits. A heuristic is required, | of using MPTCP outweighs any benefits. A heuristic is required, | |||
| therefore, to decide when to start using additional subflows in | therefore, to decide when to start using additional subflows in | |||
| an MPTCP connection. Experimental deployments have shown that | an MPTCP connection. Experimental deployments have shown that | |||
| MPTCP can be applied in a range of scenarios so an implementation | MPTCP can be applied in a range of scenarios, so an implementation | |||
| is likely to need to take into account factors including the type of | will likely need to take into account such factors as the type of | |||
| traffic being sent and duration of session, and this information | traffic being sent and the duration of the session; this information | |||
| MAY be signalled by the application layer.</t> | <bcp14>MAY</bcp14> be signaled by the application layer.</t> | |||
| <t>However, for standard TCP traffic, a suggested general-purpose | <t>However, for standard TCP traffic, a suggested general-purpose | |||
| heuristic that an implementation MAY choose to employ is as follows.</ | heuristic that an implementation <bcp14>MAY</bcp14> choose to employ i | |||
| t> | s as follows.</t> | |||
| <t>If a host has data buffered for its peer (which implies that the | <t>If a host has data buffered for its peer (which implies that the | |||
| application has received a request for data), the host opens one | application has received a request for data), the host opens one | |||
| subflow for each initial window's worth of data that is buffered.</t> | subflow for each initial window's worth of data that is buffered.</t> | |||
| <t>Consideration should also be given to limiting the rate of adding | <t>Consideration should also be given to limiting the rate of adding | |||
| new subflows, as well as limiting the total number of subflows open | new subflows, as well as limiting the total number of subflows open | |||
| for a particular connection. A host may choose to vary these values | for a particular connection. A host may choose to vary these values | |||
| based on its load or knowledge of traffic and path characteristics.</t> | based on its load or knowledge of traffic and path characteristics.</t> | |||
| <t>Note that this heuristic alone is probably insufficient. Traffic | <t>Note that this heuristic alone is probably insufficient. Traffic | |||
| for many common applications, such as downloads, is highly asymmetric and | for many common applications, such as downloads, is highly asymmetric, and | |||
| the host that is multihomed may well be the client that will never fill | the host that is multihomed may well be the client that will never fill | |||
| its buffers, and thus never use MPTCP according to this heuristic. Advanced APIs that allow an | its buffers and thus never use MPTCP according to this heuristic. Advanced APIs that allow an | |||
| application to signal its traffic requirements would aid in these decisions.</t> | application to signal its traffic requirements would aid in these decisions.</t> | |||
| <t>An additional time-based heuristic could be applied, opening additional | <t>An additional time-based heuristic could be applied, opening additional | |||
| subflows after a given period of time has passed. This would alleviate the | subflows after a given period of time has passed. This would alleviate the | |||
| above issue, and also provide resilience for low-bandwidth but long-lived | above issue and also provide resilience for low‑bandwidth but long-lived | |||
| applications.</t> | applications.</t> | |||
| <t>Another issue is that both communicating hosts may simultaneously try to | <t>Another issue is that both communicating hosts may simultaneously try to | |||
| set up a subflow between the same pair of addresses. This leads to an | set up a subflow between the same pair of addresses. This leads to an | |||
| inefficient use of resources.</t> | inefficient use of resources.</t> | |||
| <t>If the same ports are used on all subflows, as recommended above, | <t>If the same ports are used on all subflows, as recommended above, | |||
| then standard TCP simultaneous open logic should take care of this situation | then standard TCP simultaneous-open logic should take care of this situation | |||
| and only one subflow will be established between the address pairs. However, | and only one subflow will be established between the address pairs. However, | |||
| this relies on the same ports being used at both end hosts. If a host does | this relies on the same ports being used at both end hosts. If a host does | |||
| not support TCP simultaneous open, it is RECOMMENDED that some element | not support TCP simultaneous open, it is <bcp14>RECOMMENDED</bcp14> th | |||
| of randomization is applied to the time to wait before opening new sub | at some element | |||
| flows, | of randomization be applied to the time to wait before opening new sub | |||
| flows, | ||||
| so that only one subflow is created between a given address pair. If, however, | so that only one subflow is created between a given address pair. If, however, | |||
| hosts signal additional ports to use (for example, for leveraging ECMP on-path), | hosts signal additional ports to use (for example, for leveraging ECMP on-path), | |||
| this heuristic is not appropriate.</t> | this heuristic is not appropriate.</t> | |||
| <t>This section has shown some of the factors that an implementer | ||||
| <t>This section has shown some of the considerations that an implement | should consider when developing MPTCP heuristics, but it is not intend | |||
| er | ed to be | |||
| should give when developing MPTCP heuristics, but is not intended to b | ||||
| e | ||||
| prescriptive.</t> | prescriptive.</t> | |||
| </section> | </section> | |||
| <section numbered="true" toc="default"> | ||||
| <section title="Failure Handling"> | <name>Failure Handling</name> | |||
| <t>Requirements for MPTCP's handling of unexpected signals have been | <t>Requirements for MPTCP's handling of unexpected signals are | |||
| given in <xref target="sec_errors"/>. There are other failure cases, | given in <xref target="sec_errors" format="default"/>. There are other | |||
| however, where a hosts can choose appropriate behavior.</t> | failure cases, | |||
| however, where hosts can choose appropriate behavior.</t> | ||||
| <t>For example, <xref target="sec_init"/> suggests that a host SHOULD | <t>For example, <xref target="sec_init" format="default"/> suggests th | |||
| at a host <bcp14>SHOULD</bcp14> | ||||
| fall back to trying regular TCP SYNs after one or more failures of MPTCP | fall back to trying regular TCP SYNs after one or more failures of MPTCP | |||
| SYNs for a connection. A host may keep a system-wide cache of such | SYNs for a connection. A host may keep a system-wide cache of such | |||
| information, so that it can back off from using MPTCP, firstly for that | information, so that it can back off from using MPTCP, firstly for that | |||
| particular destination host, and eventually on a whole interface, if | particular destination host and, eventually, on a whole interface, if | |||
| MPTCP connections continue failing. The duration of such a cache would | MPTCP connections continue to fail. The duration of such a cache would | |||
| be implementation-specific.</t> | be implementation specific.</t> | |||
| <t>Another failure could occur when the MP_JOIN handshake fails. | <t>Another failure could occur when the MP_JOIN handshake fails. | |||
| <xref target="sec_errors"/> specifies that an incorrect handshake MUST | <xref target="sec_errors" format="default"/> specifies that an incorrect handshake <bcp14>MUST</bcp14> | |||
| lead to the subflow being closed with a RST. A host operating an active | lead to the subflow being closed with a RST. A host operating an active | |||
| intrusion detection system may choose to start blocking MP_JOIN packets | intrusion-detection system may choose to start blocking MP_JOIN packets | |||
| from the source host if multiple failed MP_JOIN attempts are seen. From | from the source host if multiple failed MP_JOIN attempts are seen. From | |||
| the connection initiator's point of view, if an MP_JOIN fails, it SHOU | the connection initiator's point of view, if an MP_JOIN fails, it | |||
| LD | <bcp14>SHOULD NOT</bcp14> | |||
| NOT attempt to connect to the same IP address and port during the life | attempt to connect to the same IP address and port during the lifetime | |||
| time | ||||
| of the connection, unless the other host refreshes the information with | of the connection, unless the other host refreshes the information with | |||
| another ADD_ADDR option. Note that the ADD_ADDR option is informational | another ADD_ADDR option. Note that the ADD_ADDR option is informational | |||
| only, and does not guarantee the other host will attempt a connection. | only and does not guarantee that the other host will attempt a connect | |||
| </t> | ion.</t> | |||
| <t>In addition, an implementation may learn, over a number of connections, | <t>In addition, an implementation may learn, over a number of connections, | |||
| that certain interfaces or destination addresses consistently fail and | that certain interfaces or destination addresses consistently fail and | |||
| may default to not trying to use MPTCP for these. Behavior could also | may default to not trying to use MPTCP for such interfaces or | |||
| be learned for particularly badly performing subflows or subflows that | addresses. The behavior of subflows that perform particularly badly | |||
| regularly fail during use, in order to temporarily choose not to use | or subflows that regularly fail during use could also | |||
| be learned, so that an implementation can temporarily choose not to us | ||||
| e | ||||
| these paths.</t> | these paths.</t> | |||
| </section> | </section> | |||
| </section> | </section> | |||
| </section> | </section> | |||
| <section anchor="sec_semantics" numbered="true" toc="default"> | ||||
| <section title="Semantic Issues" anchor="sec_semantics"> | <name>Semantic Issues</name> | |||
| <t>In order to support multipath operation, the semantics of some TCP comp | <t>In order to support multipath operation, the semantics of some TCP | |||
| onents have changed. To aid clarity, this section collects these semantic change | components have changed. To help clarify, this section lists these | |||
| s as a reference. | semantic changes as a point of reference. | |||
| <list style="hanging"> | </t> | |||
| <t hangText="Sequence number:"> The (in-header) TCP sequence | <dl newline="false" spacing="normal" indent="3"> | |||
| <dt>Sequence number:</dt> | ||||
| <dd> The (in-header) TCP sequence | ||||
| number is specific to the subflow. To allow the receiver to | number is specific to the subflow. To allow the receiver to | |||
| reorder application data, an additional data-level | reorder application data, an additional data-level | |||
| sequence space is used. In this data-level sequence space, the initi | sequence space is used. In this data‑level sequence space, the | |||
| al SYN and | initial SYN and | |||
| the final DATA_FIN occupy 1 octet of sequence space. This is to ensu | the final DATA_FIN occupy 1 octet of sequence space. This is done to | |||
| re these | ensure that these | |||
| signals are acknowledged at the connection level. There is an explicit | signals are acknowledged at the connection level. There is an explicit | |||
| mapping of data sequence space to subflow sequence space, | mapping of data sequence space to subflow sequence space, | |||
| which is signaled through TCP options in data | which is signaled through TCP options in data | |||
| packets.</t> | packets.</dd> | |||
| <dt>ACK:</dt> | ||||
| <t hangText="ACK:"> The ACK field in the TCP header | <dd> The ACK field in the TCP header | |||
| acknowledges only the subflow sequence number, not the | acknowledges only the subflow sequence number -- not the | |||
| data-level sequence space. Implementations SHOULD NOT | data-level sequence space. Implementations <bcp14>SHOULD NOT</bcp14> | |||
| attempt to infer a data-level acknowledgment from the | attempt to infer a data-level acknowledgment from the | |||
| subflow ACKs. | subflow ACKs. | |||
| This separates subflow- and connection-level processing | This separates subflow-level and connection-level processing | |||
| at an end host.</t> | at an end host.</dd> | |||
| <dt>Duplicate ACK:</dt> | ||||
| <t hangText="Duplicate ACK:"> A duplicate ACK that includes any MPTCP | <dd> A duplicate ACK that includes any MPTCP signaling | |||
| signaling | (with the exception of the DSS option) <bcp14>MUST NOT</bcp14> be tr | |||
| (with the exception of the DSS option) MUST NOT be treated as a sign | eated as a signal of congestion. | |||
| al of congestion. | ||||
| To limit the chances of non-MPTCP-aware entities mistakenly interpreting duplicate | To limit the chances of non-MPTCP-aware entities mistakenly interpreting duplicate | |||
| ACKs as a signal of congestion, MPTCP SHOULD NOT send more than two | ACKs as a signal of congestion, MPTCP <bcp14>SHOULD NOT</bcp14> send | |||
| duplicate ACKs | more than two duplicate ACKs | |||
| containing (non-DSS) MPTCP signals in a row.</t> | containing (non-DSS) MPTCP signals in a row.</dd> | |||
| <dt>Receive Window:</dt> | ||||
| <t hangText="Receive Window:">The receive window in the TCP | <dd>The receive window in the TCP | |||
| header indicates the amount of free buffer space for the | header indicates the amount of free buffer space for the | |||
| whole data-level connection (as opposed to for this | whole data-level connection (as opposed to the amount of space for t | |||
| subflow) that is available at the receiver. This is the | his | |||
| same semantics as regular TCP, but to maintain these | subflow) that is available at the receiver. The | |||
| semantics are the same as for regular TCP, but to maintain these | ||||
| semantics the receive window must be interpreted at the | semantics the receive window must be interpreted at the | |||
| sender as relative to the sequence number given in the | sender as relative to the sequence number given in the | |||
| DATA_ACK rather than the subflow ACK in the TCP header. | DATA_ACK rather than the subflow ACK in the TCP header. | |||
| In this way, the original flow control role is preserved. | In this way, the original role of flow control is preserved. | |||
| Note that some middleboxes may change the receive window, | Note that some middleboxes may change the receive window, | |||
| and so a host SHOULD use the maximum value of those recently | and so a host <bcp14>SHOULD</bcp14> use the maximum value of those recently | |||
| seen on the constituent subflows for the connection-level | seen on the constituent subflows for the connection-level | |||
| receive window, and also needs to maintain a subflow-level | receive window and also needs to maintain a subflow-level | |||
| window for subflow-level processing.</t> | window for subflow-level processing.</dd> | |||
| <dt>FIN:</dt> | ||||
| <t hangText="FIN:"> The FIN flag in the TCP header applies | <dd> The FIN flag in the TCP header applies | |||
| only to the subflow it is sent on, not to the whole | only to the subflow it is sent on -- not to the whole | |||
| connection. For connection-level FIN semantics, the | connection. For connection-level FIN semantics, the | |||
| DATA_FIN option is used.</t> | DATA_FIN option is used.</dd> | |||
| <dt>RST:</dt> | ||||
| <t hangText="RST:"> The RST flag in the TCP header applies | <dd> The RST flag in the TCP header applies | |||
| only to the subflow it is sent on, not to the whole | only to the subflow it is sent on -- not to the whole | |||
| connection. The MP_FASTCLOSE option provides the fast close | connection. The MP_FASTCLOSE option provides the Fast Close | |||
| functionality of a RST at the MPTCP connection level.</t> | functionality of a RST at the MPTCP connection level.</dd> | |||
| <dt>Address List:</dt> | ||||
| <t hangText="Address List:"> Address list management (i.e., | <dd> Address list management (i.e., | |||
| knowledge of the local and remote hosts' lists of | knowledge of the local and remote hosts' lists of | |||
| available IP addresses) is handled | available IP addresses) is handled | |||
| on a per-connection basis (as opposed to per subflow, per | on a per-connection basis (as opposed to per subflow, per | |||
| host, or per pair of communicating hosts). This permits | host, or per pair of communicating hosts). This permits | |||
| the application of per-connection local policy. Adding an | the application of per-connection local policy. Adding an | |||
| address to one connection (either explicitly through an Add | address to one connection (either explicitly through an | |||
| Address message, or implicitly through a Join) has no implication | ADD_ADDR message or implicitly through an MP_JOIN) has no implicatio | |||
| for other connections between the same pair of hosts.</t> | ns | |||
| for other connections between the same pair of hosts.</dd> | ||||
| <t hangText="5-tuple:"> The 5-tuple (protocol, local | <dt>5-tuple:</dt> | |||
| <dd> The 5-tuple (protocol, local | ||||
| address, local port, remote address, remote port) | address, local port, remote address, remote port) | |||
| presented by kernel APIs to the application layer in a | presented by kernel APIs to the application layer in a | |||
| non-multipath-aware application is that of the first | non-multipath-aware application is that of the first | |||
| subflow, even if the subflow has since been closed and | subflow, even if the subflow has since been closed and | |||
| removed from the connection. This decision, and other | removed from the connection. This decision, and other | |||
| related API issues, are discussed in more detail in | related API issues, are discussed in more detail in | |||
| <xref target="RFC6897"/>.</t> | <xref target="RFC6897" format="default"/>.</dd> | |||
| </list> | </dl> | |||
| </t> | ||||
| </section> | </section> | |||
| <section anchor="sec_security" numbered="true" toc="default"> | ||||
| <section title="Security Considerations" anchor="sec_security"> | <name>Security Considerations</name> | |||
| <t>As identified in <xref target="RFC6181"/>, the addition of multipath ca | <t>As identified in <xref target="RFC6181" format="default"/>, the | |||
| pability to TCP will bring with it a number of new classes of threat. In order t | addition of multipath capability to TCP will bring with it a number of | |||
| o prevent these, <xref target="RFC6182"/> presents a set of requirements for a s | new classes of threats. In order to prevent these threats, <xref target="R | |||
| ecurity solution for MPTCP. The fundamental goal is for the security of MPTCP to | FC6182" | |||
| be "no worse" than regular TCP today, and the key security requirements are: | format="default"/> presents a set of requirements for a security | |||
| <list style="symbols"> | solution for MPTCP. The fundamental goal is for the security of MPTCP to | |||
| <t>Provide a mechanism to confirm that the parties in a subflow handsh | be "no worse" than regular TCP today. The key security requirements | |||
| ake are the same as in the original connection setup.</t> | are as follows: | |||
| <t>Provide verification that the peer can receive traffic at a new add | </t> | |||
| ress before using it as part of a connection.</t> | <ul spacing="normal"> | |||
| <t>Provide replay protection, i.e., ensure that a request to add/remov | <li>Provide a mechanism to confirm that the parties in a subflow | |||
| e a subflow is 'fresh'.</t> | handshake are the same as the parties in the original connection setup.< | |||
| </list> | /li> | |||
| <li>Provide verification that the peer can receive traffic at a new addr | ||||
| In order to achieve these goals, MPTCP includes a hash-based handshake a | ess before using it as part of a connection.</li> | |||
| lgorithm documented in Sections <xref target="sec_init" format="counter"/> and < | <li>Provide replay protection, i.e., ensure that a request to add&wj;/re | |||
| xref target="sec_join" format="counter"/>.</t> | move a subflow is "fresh".</li> | |||
| </ul> | ||||
| <t>The security of the MPTCP connection hangs on the use of keys that are | <t> | |||
| shared once at the start of the first subflow, and are never sent again over the | In order to achieve these goals, MPTCP includes a hash-based handshake | |||
| network (unless used in the fast close mechanism, <xref target="sec_fastclose"/ | algorithm, as documented in Sections <xref target="sec_init" format="count | |||
| >). To ease demultiplexing while not giving away any cryptographic material, fu | er"/> and <xref target="sec_join" format="counter"/>.</t> | |||
| ture subflows use a truncated cryptographic hash of this key as the connection i | <t>The security of the MPTCP connection hangs on the use of keys that | |||
| dentification "token". The keys are concatenated and used as keys for creating | are shared once at the start of the first subflow and are never sent | |||
| Hash-based Message Authentication Codes (HMACs) used on subflow setup, in order | again over the network (unless used in the Fast Close mechanism (<xref | |||
| to verify that the parties in the handshake are the same as in the original conn | target="sec_fastclose" format="default"/>)). To ease demultiplexing | |||
| ection setup. It also provides verification that the peer can receive traffic a | while not giving away any cryptographic material, future subflows use a | |||
| t this new address. Replay attacks would still be possible when only keys are u | truncated cryptographic hash of this key as the connection | |||
| sed; therefore, the handshakes use single-use random numbers (nonces) at both en | identification "token". The keys are concatenated and used as keys for | |||
| ds -- this ensures the HMAC will never be the same on two handshakes. Guidance o | creating Hash-based Message Authentication Codes (HMACs) used on subflow | |||
| n generating random numbers suitable for use as keys is given in <xref target="R | setup, in order to verify that the parties in the handshake are the same | |||
| FC4086"/> and discussed in <xref target="sec_init"/>. The nonces are valid for t | as the parties in the original connection setup. It also provides verific | |||
| he lifetime of the TCP connection attempt. HMAC is also used to secure the ADD_A | ation that | |||
| DDR option, due to the threats identified in <xref target="RFC7430"/>.</t> | the peer can receive traffic at this new address. Replay attacks would | |||
| <t>The use of crypto capability bits in the initial connection handshake t | still be possible when only keys are used; therefore, the handshakes use | |||
| o negotiate use of a particular algorithm allows the deployment of additional cr | single-use random numbers (nonces) at both ends -- this ensures that the H | |||
| ypto mechanisms in the future. This negotiation would nevertheless be susceptib | MAC will never be the same on two handshakes. Guidance on generating random numb | |||
| le to a bid-down attack by an on-path active attacker who could modify the crypt | ers suitable for use as keys is given in <xref target="RFC4086" format="default" | |||
| o capability bits in the response from the receiver to use a less secure crypto | /> and discussed in <xref target="sec_init" format="default"/>. The nonces are v | |||
| mechanism. The security mechanism presented in this document should therefore pr | alid for the lifetime of the TCP connection attempt. HMAC is also used to secure | |||
| otect against all forms of flooding and hijacking attacks discussed in <xref tar | the ADD_ADDR option, due to the threats identified in <xref target="RFC7430" fo | |||
| get="RFC6181"/>.</t> | rmat="default"/>.</t> | |||
| <t>The use of crypto capability bits in the initial connection handshake | ||||
| <t>The version negotiation specified in <xref target="sec_init"/>, if diff | to negotiate the use of a particular algorithm allows the deployment of ad | |||
| ering MPTCP versions shared a common negotiation format, would allow an on-path | ditional crypto mechanisms in the future. This negotiation would nevertheless b | |||
| attacker to apply a theoretical bid-down attack. Since the v1 and v0 protocols h | e susceptible to a bid-down attack by an on-path active attacker who could modif | |||
| ave a different handshake, such an attack would require the client to re-establi | y the crypto capability bits in the response from the receiver to use a less sec | |||
| sh the connection using v0, and this being supported by the server. Note that an | ure crypto mechanism. The security mechanism presented in this document should t | |||
| on-path attacker would have access to the raw data, negating any other TCP-leve | herefore protect against all forms of flooding and hijacking attacks discussed i | |||
| l security mechanisms. | n <xref target="RFC6181" format="default"/>.</t> | |||
| Also a change from RFC6824 has removed the subflow identifier from the MP_ | <t>The version negotiation specified in <xref target="sec_init" | |||
| PRIO option (<xref target="sec_policy"/>), to remove the theoretical attack wher | format="default"/>, if differing MPTCP versions shared a common | |||
| e a subflow could be placed in "backup" mode by an attacker.</t> | negotiation format, would allow an on-path attacker to apply a | |||
| theoretical bid-down attack. Since the v1 and v0 protocols have a | ||||
| <t>During normal operation, regular TCP protection mechanisms (such as ens | different handshake, such an attack would require that the client | |||
| uring sequence numbers are in-window) will provide the same level of protection | re-establish the connection using v0 and that the server support v0. | |||
| against attacks on individual TCP subflows as exists for regular TCP today. Impl | Note that an on-path attacker would have access to the raw data, negating any o | |||
| ementations will introduce additional buffers compared to regular TCP, to reasse | ther TCP-level security mechanisms. As also noted in <xref target="app_changelog | |||
| mble data at the connection level. The application of window sizing will minimiz | "/>, this document specifies the removal of the AddrID field <xref target="RFC68 | |||
| e the risk of denial-of-service attacks consuming resources.</t> | 24"/> in the MP_PRIO option (<xref target="sec_policy" format="default"/>). | |||
| This change eliminates the possibility of a theoretical attack where | ||||
| <t>As discussed in <xref target="sec_add_address"/>, a host may advertise | a subflow could be placed in "backup" mode by an attacker.</t> | |||
| its private addresses, but these might point to different hosts in the receiver' | <t>During normal operation, regular TCP protection mechanisms (such as | |||
| s network. The MP_JOIN handshake (<xref target="sec_join"/>) will ensure that th | ensuring that sequence numbers are in-window) will provide the same | |||
| is does not succeed in setting up a subflow to the incorrect host. However, it c | level of protection against attacks on individual TCP subflows as the | |||
| ould still create unwanted TCP handshake traffic. This feature of MPTCP could be | level of protection that exists for regular TCP today. Implementations wil | |||
| a target for denial-of-service exploits, with malicious participants in MPTCP c | l introduce additional buffers compared to regular TCP, to reassemble data at th | |||
| onnections encouraging the recipient to target other hosts in the network. There | e connection level. The application of window sizing will minimize the risk of d | |||
| fore, implementations should consider heuristics (<xref target="heuristics"/>) a | enial-of-service attacks consuming resources.</t> | |||
| t both the sender and receiver to reduce the impact of this.</t> | <t>As discussed in <xref target="sec_add_address" format="default"/>, a ho | |||
| st may advertise its private addresses, but these might point to different hosts | ||||
| in the receiver's network. The MP_JOIN handshake (<xref target="sec_join" forma | ||||
| t="default"/>) will ensure that this does not succeed in setting up a subflow to | ||||
| the incorrect host. However, it could still create unwanted TCP handshake traff | ||||
| ic. This feature of MPTCP could be a target for denial-of-service exploits, with | ||||
| malicious participants in MPTCP connections encouraging the recipient to target | ||||
| other hosts in the network. Therefore, implementations should consider heuristi | ||||
| cs (<xref target="heuristics" format="default"/>) at both the sender and receive | ||||
| r to reduce the impact of this.</t> | ||||
| <t>To further protect against malicious ADD_ADDR messages sent by an off-path attacker, the ADD_ADDR includes an HMAC using the keys negotiated during the handshake. This effectively prevents an attacker from diverting an MPTCP connection through an off-path ADD_ADDR injection into the stream.</t> | <t>To further protect against malicious ADD_ADDR messages sent by an off-path attacker, the ADD_ADDR includes an HMAC using the keys negotiated during the handshake. This effectively prevents an attacker from diverting an MPTCP connection through an off-path ADD_ADDR injection into the stream.</t> | |||
| <t>A small security risk could theoretically exist with key reuse, but in | ||||
| <t>A small security risk could theoretically exist with key reuse, but in | order to accomplish a replay attack, both the sender and receiver keys, and the | |||
| order to accomplish a replay attack, both the sender and receiver keys, and the | sender and receiver random numbers, in the MP_JOIN handshake (<xref target="sec_ | |||
| sender and receiver random numbers, in the MP_JOIN handshake (<xref target="sec_ | join" format="default"/>) would have to match.</t> | |||
| join"/>) would have to match.</t> | <t>While this specification defines a "medium" security solution, | |||
| meeting the criteria specified at the start of this section and in the | ||||
| <t>Whilst this specification defines a "medium" security solution, meeting | threat analysis document <xref target="RFC6181" format="default"/>, since | |||
| the criteria specified at the start of this section and the threat analysis (<x | attacks | |||
| ref target="RFC6181"/>), since attacks only ever get worse, it is likely that a | only ever get worse, it is likely that a future version of MPTCP would | |||
| future version of MPTCP would need to be able to support stronger security. Ther | need to be able to support stronger security. | |||
| e are several ways the security of MPTCP could potentially be improved; some of | There are several ways the security of MPTCP could potentially be improved; som | |||
| these would be compatible with MPTCP as defined in this document, whilst others | e of these would be compatible with MPTCP as defined in this document, while oth | |||
| may not be. For now, the best approach is to get experience with the current app | ers may not be. For now, the best approach is to gain experience with the curren | |||
| roach, establish what might work, and check that the threat analysis is still ac | t approach, establish what might work, and check that the threat analysis is sti | |||
| curate.</t> | ll accurate.</t> | |||
| <t>Possible ways of improving MPTCP security could include:</t> | ||||
| <t>Possible ways of improving MPTCP security could include:<list style="symbols" | <ul spacing="normal"> | |||
| > | <li>defining a new MPTCP cryptographic algorithm, as negotiated in | |||
| <t>defining a new MPCTP cryptographic algorithm, as negotiated in MP_CAPABLE. A | MP_CAPABLE. If an implementation was being deployed in a controlled | |||
| sub-case could be to include an additional deployment assumption, such as statef | environment where additional assumptions could be made, such as the | |||
| ul servers, in order to allow a more powerful algorithm to be used.</t> | ability for the servers to store state during the TCP handshake, then | |||
| <t>defining how to secure data transfer with MPTCP, whilst not changing the sign | it may be possible to use a stronger cryptographic algorithm than | |||
| aling part of the protocol.</t> | would otherwise be possible.</li> | |||
| <t>defining security that requires more option space, perhaps in conjunction wit | <li>defining how to secure data transfer with MPTCP, while not changing | |||
| h a "long options" proposal for extending the TCP options space (such as those s | the signaling part of the protocol.</li> | |||
| urveyed in <xref target="TCPLO"/>), or perhaps building on the current approach | <li>defining security that requires more option space, perhaps in | |||
| with a second stage of MPTCP-option-based security.</t> | conjunction with a "long options" proposal for extending the TCP | |||
| <t>revisiting the working group's decision to exclusively use TCP options for MP | option space (such as those surveyed in <xref | |||
| TCP signaling, and instead look at also making use of the TCP payloads.</t> | target="I-D.ananth-tcpm-tcpoptext" format="default"/>), or perhaps | |||
| </list></t> | building on the current approach with a second stage of | |||
| security based on MPTCP options.</li> | ||||
| <t>MPTCP has been designed with several methods available to indicate a new secu | <li>revisiting the working group's decision to exclusively use TCP | |||
| rity mechanism, including: | options for MPTCP signaling and instead looking at the | |||
| <list style="symbols"> | possibility of using TCP payloads as well.</li> | |||
| <t>available flags in MP_CAPABLE (<xref target="tcpm_capable"/>);</t> | </ul> | |||
| <t>available subtypes in the MPTCP option (<xref target="fig_option"/>);</t> | <t>MPTCP has been designed with several methods available to indicate a ne | |||
| <t>the version field in MP_CAPABLE (<xref target="tcpm_capable"/>);</t> | w security mechanism, including: | |||
| </list></t> | </t> | |||
| <ul spacing="normal"> | ||||
| <li>available flags in MP_CAPABLE (<xref target="tcpm_capable" format="d | ||||
| efault"/>).</li> | ||||
| <li>available subtypes in the MPTCP option (<xref target="fig_option" fo | ||||
| rmat="default"/>).</li> | ||||
| <li>the Version field in MP_CAPABLE (<xref target="tcpm_capable" format= | ||||
| "default"/>).</li> | ||||
| </ul> | ||||
| </section> | </section> | |||
| <section anchor="sec_middleboxes" numbered="true" toc="default"> | ||||
| <section title="Interactions with Middleboxes" anchor="sec_middleboxes"> | <name>Interactions with Middleboxes</name> | |||
| <t>Multipath TCP was designed to be deployable in the present world. Its d | ||||
| <t>Multipath TCP was designed to be deployable in the present world. Its | esign takes into account "reasonable" | |||
| design takes into account "reasonable" | ||||
| existing middlebox behavior. In this section, we outline a few representative middlebox-related failure scenarios and | existing middlebox behavior. In this section, we outline a few representative middlebox-related failure scenarios and | |||
| show how Multipath TCP handles them. Next, we list the design decisions multipat | show how Multipath TCP handles them. Next, we list the design decisions | |||
| h has made to accommodate the different | Multipath TCP has made to accommodate the different | |||
| middleboxes.</t> | middleboxes.</t> | |||
| <t>A primary concern is our use of a new TCP option. Middleboxes should fo | ||||
| <t>A primary concern is our use of a new TCP option. Middleboxes should | rward packets | |||
| forward packets | with unknown options unchanged, yet there are some that don't. We expect these | |||
| with unknown options unchanged, yet there are some that don't. These we expect w | middleboxes to strip options and pass the data, | |||
| ill either strip options and pass the data, | ||||
| drop packets with new options, copy the same option into multiple segments (e.g., when doing segmentation), or drop | drop packets with new options, copy the same option into multiple segments (e.g., when doing segmentation), or drop | |||
| options during segment coalescing.</t> | options during segment coalescing.</t> | |||
| <t>MPTCP uses a single new TCP option called "Kind", and all message types | ||||
| are defined by "subtype" values (see <xref target="IANA" format="default"/>). T | ||||
| his should reduce the chances of only some types of MPTCP options being passed; | ||||
| instead, the key differing characteristics are different paths and the presence | ||||
| of the SYN flag.</t> | ||||
| <t>MPTCP SYN packets on the first subflow of a connection contain the MP_C | ||||
| APABLE option (<xref target="sec_init" format="default"/>). If this is dropped, | ||||
| MPTCP <bcp14>SHOULD</bcp14> fall back to regular TCP. If packets with the MP_JOI | ||||
| N option (<xref target="sec_join" format="default"/>) are dropped, the paths wil | ||||
| l simply not be used.</t> | ||||
| <t>If a middlebox strips options but otherwise passes the packets | ||||
| unchanged, MPTCP will behave safely. If an MP_CAPABLE option is dropped | ||||
| on either the outgoing path or the return path, the initiating host can | ||||
| fall back to regular TCP, as illustrated in <xref target="fig_syn" | ||||
| format="default"/> and discussed in <xref target="sec_init" | ||||
| format="default"/>.</t> | ||||
| <figure anchor="fig_syn"> | ||||
| <name>Connection Setup with Middleboxes That Strip Options from Packets< | ||||
| /name> | ||||
| <artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
| Host A Host B | ||||
| | Middlebox M | | ||||
| | | | | ||||
| | SYN (MP_CAPABLE) | SYN | | ||||
| |-------------------|---------------->| | ||||
| | SYN/ACK | | ||||
| |<------------------------------------| | ||||
| a) MP_CAPABLE option stripped on outgoing path | ||||
| <t>MPTCP uses a single new TCP option "Kind", and all message types are | Host A Host B | |||
| defined by "subtype" values (see <xref target="IANA"/>). This should reduce the | | SYN (MP_CAPABLE) | | |||
| chances of only some types of MPTCP options being passed, and instead the key di | |-------------------------------------->| | |||
| ffering characteristics are different paths, and the presence of the SYN flag.</ | | Middlebox M | | |||
| t> | | | | | |||
| | SYN/ACK |SYN/ACK (MP_CAPABLE)| | ||||
| <t>MPTCP SYN packets on the first subflow of a connection contain the MP | |<-----------------|--------------------| | |||
| _CAPABLE option (<xref target="sec_init"/>). If this is dropped, MPTCP SHOULD fa | b) MP_CAPABLE option stripped on return path ]]></artwork> | |||
| ll back to regular TCP. If packets with the MP_JOIN option (<xref target="sec_jo | </figure> | |||
| in"/>) are dropped, the paths will simply not be used.</t> | <t>Subflow SYNs contain the MP_JOIN option. If this option is stripped on | |||
| the outgoing path, | ||||
| <t>If a middlebox strips options but otherwise passes the packets unchan | the SYN will appear to be a regular SYN to Host B. Depending on whether th | |||
| ged, MPTCP will behave safely. If an MP_CAPABLE option is dropped on either the | ere is a listening socket on | |||
| outgoing or the return path, the initiating host can fall back to regular TCP, a | the target port, Host B will reply with either a SYN/ACK or a RST (subflow conne | |||
| s illustrated in <xref target="fig_syn"/> and discussed in <xref target="sec_ini | ction fails). When Host A | |||
| t"/>.</t> | receives the SYN/ACK, it sends a RST because the SYN/ACK does not contain the MP | |||
| _JOIN option and its token. | ||||
| <t>Subflow SYNs contain the MP_JOIN option. If this option is stripped | Either way, the subflow setup fails but otherwise does not affect the MPTCP conn | |||
| on the outgoing path, | ection as a whole.</t> | |||
| the SYN will appear to be a regular SYN to Host B. Depending on whether th | <t>We now examine data flow with MPTCP, assuming that the flow is | |||
| ere is a listening socket on | correctly set up, which implies that the options in the SYN | |||
| the target port, Host B will reply either with SYN/ACK or RST (subflow connectio | ||||
| n fails). When Host A | ||||
| receives the SYN/ACK it sends a RST because the SYN/ACK does not contain the MP_ | ||||
| JOIN option and its token. | ||||
| Either way, the subflow setup fails, but otherwise does not affect the MPTCP con | ||||
| nection as a whole.</t> | ||||
| <figure align="center" anchor="fig_syn" title="Connection Setup with Mid | ||||
| dleboxes that Strip Options from Packets"> | ||||
| <artwork align="left"><![CDATA[ | ||||
| Host A Host B | ||||
| | Middlebox M | | ||||
| | | | | ||||
| | SYN(MP_CAPABLE) | SYN | | ||||
| |-------------------|---------------->| | ||||
| | SYN/ACK | | ||||
| |<------------------------------------| | ||||
| a) MP_CAPABLE option stripped on outgoing path | ||||
| Host A Host B | ||||
| | SYN(MP_CAPABLE) | | ||||
| |------------------------------------>| | ||||
| | Middlebox M | | ||||
| | | | | ||||
| | SYN/ACK |SYN/ACK(MP_CAPABLE)| | ||||
| |<----------------|-------------------| | ||||
| b) MP_CAPABLE option stripped on return path | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| <t>We now examine data flow with MPTCP, assuming the flow is correctly s | ||||
| et up, which implies the options in the SYN | ||||
| packets were allowed through by the relevant middleboxes. If options are allowed through and there is no resegmentation or | packets were allowed through by the relevant middleboxes. If options are allowed through and there is no resegmentation or | |||
| coalescing to TCP segments, Multipath TCP flows can proceed without problems.</t> | coalescing to TCP segments, Multipath TCP flows can proceed without problems.</t> | |||
| <t>The case when options get stripped on data packets is discussed | ||||
| <t>The case when options get stripped on data packets has been discussed | in <xref target="sec_fallback" format="default"/>. | |||
| in the Fallback section. | If only some MPTCP options are stripped, behavior is not deterministic. | |||
| If only some MPTCP options are stripped, behavior is not deterministic. | If some Data Sequence Mappings are lost, the connection can continue so long as | |||
| If some data sequence mappings are lost, the connection can continue so long as | mappings exist for the subflow-level data (e.g., if multiple maps have been sent | |||
| mappings exist for the subflow-level data (e.g., if multiple maps have been sent | that reinforce each other). If some subflow-level space is left unmapped, howev | |||
| that reinforce each other). If some subflow-level space is left unmapped, howev | er, the subflow is treated as broken and is closed, using the process described | |||
| er, the subflow is treated as broken and is closed, through the process describe | in <xref target="sec_fallback" format="default"/>. MPTCP should survive with a l | |||
| d in <xref target="sec_fallback"/>. MPTCP should survive with a loss of some Dat | oss of some Data ACKs, but performance will degrade as the fraction of stripped | |||
| a ACKs, but performance will degrade as the fraction of stripped options increas | options increases. | |||
| es. | ||||
| We do not expect such cases to appear in practice, though: most | We do not expect such cases to appear in practice, though: most | |||
| middleboxes will either strip all options or let them all through.</t> | middleboxes will either strip all options or let them all through.</t> | |||
| <t>We end this section with a list of middlebox classes, their behavior, a | ||||
| <t>We end this section with a list of middlebox classes, their behavior, | nd the elements in the MPTCP design | |||
| and the elements in the MPTCP design | ||||
| that allow operation through such middleboxes. Issues surrounding dropping packets with options | that allow operation through such middleboxes. Issues surrounding dropping packets with options | |||
| or stripping options were discussed above, and are not included here: | or stripping options were discussed above and are not included here: | |||
| <list style="symbols"> | </t> | |||
| <t>NATs <xref target="RFC3022"/> (Network Address (and Port) Translato | <ul spacing="normal"> | |||
| rs) change the source address (and often source port) of packets. This means tha | <li>NATs (Network Address (and port) Translators) <xref | |||
| t a host will not know its | target="RFC3022" format="default"/> change the source address (and | |||
| often the source port) of packets. This means that a host will not know | ||||
| its | ||||
| public-facing address for signaling in MPTCP. Therefore, MPTCP permits implicit address addition via the MP_JOIN option, | public-facing address for signaling in MPTCP. Therefore, MPTCP permits implicit address addition via the MP_JOIN option, | |||
| and the handshake mechanism ensures that connection attempts to private addr | and the handshake mechanism ensures that connection attempts to private addr | |||
| esses <xref target="RFC1918"/>, since they are authenticated, will only set up s | esses <xref target="RFC1918" format="default"/>, since they are authenticated, w | |||
| ubflows to the correct hosts. | ill only set up subflows to the correct hosts. | |||
| Explicit address removal is undertaken by an Address ID to allow no knowledg | Explicit address removal is undertaken by an Address ID to allow no knowledg | |||
| e of the source address.</t> | e of the source address.</li> | |||
| <li>Performance Enhancing Proxies (PEPs) <xref target="RFC3135" format=" | ||||
| <t>Performance Enhancing Proxies (PEPs) <xref target="RFC3135"/> might | default"/> might proactively ACK data to increase performance. MPTCP, however, r | |||
| proactively ACK data to increase performance. MPTCP, however, relies on accurat | elies on accurate congestion control signals from the end host, and non‑MP | |||
| e congestion control signals from the end host, and non-MPTCP-aware PEPs will no | TCP-aware PEPs will not be able to provide such signals. MPTCP will, therefore, | |||
| t be able to provide such signals. MPTCP will, therefore, fall back to single-pa | fall back to single-path TCP or close the problematic subflow (see <xref target= | |||
| th TCP, or close the problematic subflow (see <xref target="sec_fallback"/>).</t | "sec_fallback" format="default"/>).</li> | |||
| > | <li>Traffic normalizers <xref target="norm" format="default"/> may not | |||
| allow holes in sequence numbers, and they may cache packets and retransm | ||||
| <t>Traffic Normalizers <xref target="norm"/> may not allow holes in se | it the same data. | |||
| quence numbers, and may cache packets and retransmit the same data. | MPTCP looks like standard TCP on the wire and will not retransmit different data | |||
| MPTCP looks like standard TCP on the wire, and will not retransmit different dat | on the same subflow sequence number. In the event of a retransmission, the same | |||
| a on the same subflow sequence number. In the event of a retransmission, the sam | data will be retransmitted on the original TCP subflow even if it is additional | |||
| e data will be retransmitted on the original TCP subflow even if it is additiona | ly retransmitted at the connection level on a different subflow.</li> | |||
| lly retransmitted at the connection level on a different subflow.</t> | <li>Firewalls <xref target="RFC2979" format="default"/> might perform | |||
| Initial Sequence Number (ISN) randomization on TCP connections. MPTCP us | ||||
| <t>Firewalls <xref target="RFC2979"/> might perform initial sequence n | es relative | |||
| umber randomization on TCP connections. MPTCP uses relative | sequence numbers in Data Sequence Mappings to cope with this. Like NATs, firewal | |||
| sequence numbers in data sequence mapping to cope with this. Like NATs, firewall | ls will not permit many incoming connections, so | |||
| s will not permit many incoming connections, so | ||||
| MPTCP supports address signaling (ADD_ADDR) so that a multiaddressed host can invite its peer behind the firewall/NAT to connect | MPTCP supports address signaling (ADD_ADDR) so that a multiaddressed host can invite its peer behind the firewall/NAT to connect | |||
| out to its additional interface.</t> | out to its additional interface.</li> | |||
| <li>Intrusion Detection Systems / Intrusion Prevention Systems (IDSs&wj; | ||||
| <t>Intrusion Detection/Prevention Systems (IDS/IPS) observe packet str | /IPSs) observe packet streams for patterns and content that could threaten a net | |||
| eams for patterns and content that could threaten a network. MPTCP may require t | work. MPTCP may require the | |||
| he | instrumentation of additional paths, and an MPTCP-aware IDS or IPS would need to | |||
| instrumentation of additional paths, and an MPTCP-aware IDS/IPS would need to re | read MPTCP tokens to correlate data from multiple subflows to maintain comparab | |||
| ad MPTCP tokens to correlate data from mutliple subflows to maintain comparable | le visibility into all of the traffic between devices. Without such changes, an | |||
| visibility into all of the traffic between devices. Without such changes, an IDS | IDS would get an incomplete view of the traffic, increasing the risk of missing | |||
| would get an incomplete view of the traffic, increasing the risk of missing tra | traffic of interest (false negatives) and increasing the chances of erroneously | |||
| ffic of interest (false negatives), and increasing the chances of erroneously id | identifying a subflow as a risk due to only seeing partial data (false positives | |||
| entifying a subflow as a risk due to only seeing partial data (false positives). | ).</li> | |||
| </t> | <li>Application-level middleboxes such as content-aware firewalls may | |||
| alter the payload within a subflow -- for example, rewriting URIs in | ||||
| <t>Application-level middleboxes such as content-aware firewalls may a | HTTP traffic. MPTCP will detect such changes using the checksum | |||
| lter the payload within a subflow, such as rewriting URIs in HTTP traffic. MPTCP | and close the affected subflow(s), if there are other subflows that can be used. | |||
| will detect these using the checksum | If all subflows are affected, MPTCP | |||
| and close the affected subflow(s), if there are other subflows that can be used. | will fall back to TCP, allowing such middleboxes to change the payload. MPTCP-aw | |||
| If all subflows are affected, multipath | are middleboxes should be able to adjust the payload and MPTCP metadata in order | |||
| will fall back to TCP, allowing such middleboxes to change the payload. MPTCP-aw | not to break the connection.</li> | |||
| are middleboxes should be able to adjust the payload and MPTCP metadata in order | </ul> | |||
| not to break the connection.</t> | <t> | |||
| </list> | ||||
| In addition, all classes of middleboxes may affect TCP traffic in the following ways: | In addition, all classes of middleboxes may affect TCP traffic in the following ways: | |||
| <list style="symbols"> | ||||
| <t>TCP options may be removed, or packets with unknown options dropped | ||||
| , by many classes of middleboxes. It is intended | ||||
| that the initial SYN exchange, with a TCP option, will be sufficient to identify | ||||
| the path capabilities. If such a packet does | ||||
| not get through, MPTCP will end up falling back to regular TCP.</t> | ||||
| <t>Segmentation/Coalescing (e.g., TCP segmentation offloading) might c | ||||
| opy options between packets and might | ||||
| strip some options. MPTCP's data sequence mapping includes the relative subflow | ||||
| sequence number instead of using the sequence | ||||
| number in the segment. In this way, the mapping is independent of the packets th | ||||
| at carry it.</t> | ||||
| <t>The receive window may be shrunk by some middleboxes at the subflow | ||||
| level. MPTCP will use the maximum window at data level, but will also obey | ||||
| subflow-specific windows.</t> | ||||
| </list> | ||||
| </t> | </t> | |||
| <ul spacing="normal"> | ||||
| </section> | <li>TCP options may be removed, or packets with unknown options dropped, | |||
| by many classes of middleboxes. It is intended | ||||
| <section anchor="Acknowledgments" title="Acknowledgments"> | that the initial SYN exchange, with a TCP option, will be sufficient to identify | |||
| <!-- <t>The authors were originally supported by Trilogy (http://www.trilo | the path's capabilities. If such a packet does | |||
| gy-project.org), a research project (ICT-216372) partially funded by the Europea | not get through, MPTCP will end up falling back to regular TCP.</li> | |||
| n Community under its Seventh Framework Program.</t> | <li>Segmentation/coalescing (e.g., TCP segmentation offloading) might co | |||
| <t>Alan Ford was originally supported by Roke Manor Research and later Cis | py options between packets and might | |||
| co Systems.</t> --> | strip some options. MPTCP's Data Sequence Mapping includes the relative subflow | |||
| <t>The authors gratefully acknowledge significant input into this document | sequence number instead of using the sequence | |||
| from Sébastien Barré and Andrew McDonald.</t> | number in the segment. In this way, the mapping is independent of the packets th | |||
| <t>The authors also wish to acknowledge reviews and contributions from Ilj | at carry it.</li> | |||
| itsch van Beijnum, Lars Eggert, Marcelo Bagnulo, Robert Hancock, Pasi Sarolahti, | <li>The receive window may be shrunk by some middleboxes at the | |||
| Toby Moncaster, Philip Eardley, Sergio Lembo, Lawrence Conroy, Yoshifumi Nishid | subflow level. MPTCP will use the maximum window at the data level but w | |||
| a, Bob Briscoe, Stein Gjessing, Andrew McGregor, Georg Hampel, Anumita Biswas, W | ill also obey | |||
| es Eddy, Alexey Melnikov, Francis Dupont, Adrian Farrel, Barry Leiba, Robert Spa | subflow-specific windows.</li> | |||
| rks, Sean Turner, Stephen Farrell, Martin Stiemerling, Gregory Detal, Fabien Duc | </ul> | |||
| hene, Xavier de Foy, Rahul Jadhav, Klemens Schragel, Mirja Kuehlewind, Sheng Jia | ||||
| ng, Alissa Cooper, Ines Robles, Roman Danyliw, Adam Roach, Barry Leiba, Alexey M | ||||
| elnikov, Eric Vyncke, and Ben Kaduk.</t> | ||||
| </section> | ||||
| <section anchor="IANA" title="IANA Considerations"> | ||||
| <t>This document obsoletes RFC6824 and as such IANA is requested to update | ||||
| the TCP option space registry to point to this document for Multipath TCP, as f | ||||
| ollows:</t> | ||||
| <texttable anchor="table_tcpo" title="TCP Option Kind Numbers"> | ||||
| <ttcol align="center">Kind</ttcol> | ||||
| <ttcol align="center">Length</ttcol> | ||||
| <ttcol align="center">Meaning</ttcol> | ||||
| <ttcol align="center">Reference</ttcol> | ||||
| <c>30</c> | ||||
| <c>N</c> | ||||
| <c>Multipath TCP (MPTCP)</c> | ||||
| <c>This document</c> | ||||
| </texttable> | ||||
| <section anchor="IANA_subtypes" title="MPTCP Option Subtypes"> | ||||
| <t>The 4-bit MPTCP subtype sub-registry ("MPTCP Option Subtypes" under the | ||||
| "Transmission Control Protocol (TCP) Parameters" registry) was defined in RFC68 | ||||
| 24. Since RFC6824 was an Experimental not Standards Track RFC, and since no furt | ||||
| her entries have occurred beyond those pointing to RFC6824, IANA is requested to | ||||
| replace the existing registry with <xref target="table_iana"/> and with the fol | ||||
| lowing explanatory note.</t> | ||||
| <t>Note: This registry specifies the MPTCP Option Subtypes for MPTCP v1, w | ||||
| hich obsoletes the Experimental MPTCP v0. For the MPTCP v0 subtypes, please refe | ||||
| r to RFC6824.</t> | ||||
| <texttable anchor="table_iana" title="MPTCP Option Subtypes"> | ||||
| <ttcol align="center">Value</ttcol> | ||||
| <ttcol align="center">Symbol</ttcol> | ||||
| <ttcol align="center">Name</ttcol> | ||||
| <ttcol align="center">Reference</ttcol> | ||||
| <c>0x0</c> | ||||
| <c>MP_CAPABLE</c> | ||||
| <c>Multipath Capable</c> | ||||
| <c>This document, <xref target="sec_init"/></c> | ||||
| <c>0x1</c> | ||||
| <c>MP_JOIN</c> | ||||
| <c>Join Connection</c> | ||||
| <c>This document, <xref target="sec_join"/></c> | ||||
| <c>0x2</c> | ||||
| <c>DSS</c> | ||||
| <c>Data Sequence Signal (Data ACK and data sequence mapping)</c> | ||||
| <c>This document, <xref target="sec_generalop"/></c> | ||||
| <c>0x3</c> | ||||
| <c>ADD_ADDR</c> | ||||
| <c>Add Address</c> | ||||
| <c>This document, <xref target="sec_add_address"/></c> | ||||
| <c>0x4</c> | ||||
| <c>REMOVE_ADDR</c> | ||||
| <c>Remove Address</c> | ||||
| <c>This document, <xref target="sec_remove_addr"/></c> | ||||
| <c>0x5</c> | ||||
| <c>MP_PRIO</c> | ||||
| <c>Change Subflow Priority</c> | ||||
| <c>This document, <xref target="sec_policy"/></c> | ||||
| <c>0x6</c> | ||||
| <c>MP_FAIL</c> | ||||
| <c>Fallback</c> | ||||
| <c>This document, <xref target="sec_fallback"/></c> | ||||
| <c>0x7</c> | ||||
| <c>MP_FASTCLOSE</c> | ||||
| <c>Fast Close</c> | ||||
| <c>This document, <xref target="sec_fastclose"/></c> | ||||
| <c>0x8</c> | ||||
| <c>MP_TCPRST</c> | ||||
| <c>Subflow Reset</c> | ||||
| <c>This document, <xref target="sec_reset"/></c> | ||||
| <c>0xf</c> | ||||
| <c>MP_EXPERIMENTAL</c> | ||||
| <c>Reserved for private experiments</c> | ||||
| <c></c> | ||||
| </texttable> | ||||
| <t>Values 0x9 through 0xe are currently unassigned. Option 0xf is reserved | ||||
| for use by private experiments. Its use may be formalized in a future specifica | ||||
| tion. Future assignments in this registry are to be defined by Standards Action | ||||
| as defined by <xref target="RFC8126"/>. Assignments consist of the MPTCP subtyp | ||||
| e's symbolic name and its associated value, and a reference to its specification | ||||
| .</t> | ||||
| </section> | </section> | |||
| <section anchor="IANA_handshake" title="MPTCP Handshake Algorithms"> | <section anchor="IANA" numbered="true" toc="default"> | |||
| <name>IANA Considerations</name> | ||||
| <t>The "MPTCP Handshake Algorithms" sub-registry under the "Transmission C | ||||
| ontrol Protocol (TCP) Parameters" registry was defined in RFC6824. Since RFC6824 | ||||
| was an Experimental not Standards Track RFC, and since no further entries have | ||||
| occurred beyond those pointing to RFC6824, IANA is requested to replace the exis | ||||
| ting registry with <xref target="table_crypto"/> and with the following explanat | ||||
| ory note.</t> | ||||
| <t>Note: This registry specifies the MPTCP Handshake Algorithms for MPTCP | ||||
| v1, which obsoletes the Experimental MPTCP v0. For the MPTCP v0 subtypes, please | ||||
| refer to RFC6824.</t> | ||||
| <texttable anchor="table_crypto" title="MPTCP Handshake Algorithms"> | ||||
| <ttcol align="center">Flag Bit</ttcol> | ||||
| <ttcol align="center">Meaning</ttcol> | ||||
| <ttcol align="center">Reference</ttcol> | ||||
| <c>A</c> | ||||
| <c>Checksum required</c> | ||||
| <c>This document, <xref target="sec_init"/></c> | ||||
| <c>B</c> | ||||
| <c>Extensibility</c> | ||||
| <c>This document, <xref target="sec_init"/></c> | ||||
| <c>C</c> | ||||
| <c>Do not attempt to establish new subflows to the source address.</c> | ||||
| <c>This document, <xref target="sec_init"/></c> | ||||
| <c>D-G</c> | <t>This document obsoletes <xref target="RFC6824"/>. As such, IANA has upd | |||
| <c>Unassigned</c> | ated | |||
| <c></c> | several registries to point to this document. In addition, this document | |||
| creates one new registry. These topics are described in the following sub | ||||
| sections.</t> | ||||
| <c>H</c> | <section anchor="IANA-TCP-Option-Kind" numbered="true" toc="default"> | |||
| <c>HMAC-SHA256</c> | <name>TCP Option Kind Numbers</name> | |||
| <c>This document, <xref target="sec_join"/></c> | <t>IANA has | |||
| </texttable> | updated the "TCP Option Kind Numbers" registry to point to this document | |||
| for Multipath TCP, as shown in <xref target="table_tcpo"/>:</t> | ||||
| <table anchor="table_tcpo" align="center"> | ||||
| <name>TCP Option Kind Numbers</name> | ||||
| <thead> | ||||
| <tr> | ||||
| <th align="center">Kind</th> | ||||
| <th align="center">Length</th> | ||||
| <th align="center">Meaning</th> | ||||
| <th align="center">Reference</th> | ||||
| </tr> | ||||
| </thead> | ||||
| <tbody> | ||||
| <tr> | ||||
| <td align="center">30</td> | ||||
| <td align="center">N</td> | ||||
| <td align="center">Multipath TCP (MPTCP)</td> | ||||
| <td align="center">RFC 8684</td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| </section> | ||||
| <section anchor="IANA_subtypes" numbered="true" toc="default"> | ||||
| <name>MPTCP Option Subtypes</name> | ||||
| <t>The 4-bit MPTCP subtype in the "MPTCP Option Subtypes" | ||||
| subregistry under the "Transmission Control Protocol (TCP) Parameters" | ||||
| registry was defined in <xref target="RFC6824"/>. Since <xref target="RF | ||||
| C6824"/> is an | ||||
| Experimental RFC and not a Standards Track RFC, and since no further | ||||
| entries have occurred beyond those pointing to <xref target="RFC6824"/>, | ||||
| IANA has | ||||
| replaced the existing registry with the contents of | ||||
| <xref target="table_iana" format="default"/> and with the following | ||||
| explanatory note.</t> | ||||
| <t>Note that the meanings of bits D through H can be dependent upon bit B, | <t>Note: This registry specifies the MPTCP Option Subtypes for MPTCP v1, | |||
| depending on how Extensibility is defined in future specifications; see | which obsoletes the Experimental MPTCP v0. For the MPTCP v0 subtypes, please re | |||
| <xref target="sec_init"/> for more information.</t> | fer to <xref target="RFC6824"/>.</t> | |||
| <table anchor="table_iana" align="center"> | ||||
| <name>MPTCP Option Subtypes</name> | ||||
| <thead> | ||||
| <tr> | ||||
| <th align="center">Value</th> | ||||
| <th align="center">Symbol</th> | ||||
| <th align="center">Name</th> | ||||
| <th align="center">Reference</th> | ||||
| </tr> | ||||
| </thead> | ||||
| <tbody> | ||||
| <tr> | ||||
| <td align="center">0x0</td> | ||||
| <td align="center">MP_CAPABLE</td> | ||||
| <td align="center">Multipath Capable</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_init" format="defau | ||||
| lt"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0x1</td> | ||||
| <td align="center">MP_JOIN</td> | ||||
| <td align="center">Join Connection</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_join" format="defau | ||||
| lt"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0x2</td> | ||||
| <td align="center">DSS</td> | ||||
| <td align="center">Data Sequence Signal (Data ACK and Data Sequenc | ||||
| e Mapping)</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_generalop" format=" | ||||
| default"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0x3</td> | ||||
| <td align="center">ADD_ADDR</td> | ||||
| <td align="center">Add Address</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_add_address" format | ||||
| ="default"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0x4</td> | ||||
| <td align="center">REMOVE_ADDR</td> | ||||
| <td align="center">Remove Address</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_remove_addr" format | ||||
| ="default"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0x5</td> | ||||
| <td align="center">MP_PRIO</td> | ||||
| <td align="center">Change Subflow Priority</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_policy" format="def | ||||
| ault"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0x6</td> | ||||
| <td align="center">MP_FAIL</td> | ||||
| <td align="center">Fallback</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_fallback" format="d | ||||
| efault"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0x7</td> | ||||
| <td align="center">MP_FASTCLOSE</td> | ||||
| <td align="center">Fast Close</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_fastclose" format=" | ||||
| default"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0x8</td> | ||||
| <td align="center">MP_TCPRST</td> | ||||
| <td align="center">Subflow Reset</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
| ult"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0xf</td> | ||||
| <td align="center">MP_EXPERIMENTAL</td> | ||||
| <td align="center">Reserved for Private Use</td> | ||||
| <td align="center"/> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <t>Values 0x9 through 0xe are currently unassigned. Option 0xf is reserv | ||||
| ed for use by private experiments. Its use may be formalized in a future specifi | ||||
| cation. Future assignments in this registry are to be defined by Standards Actio | ||||
| n as defined by <xref target="RFC8126" format="default"/>. Assignments consist | ||||
| of the MPTCP subtype's symbolic name, its associated value, and a reference to i | ||||
| ts specification.</t> | ||||
| </section> | ||||
| <section anchor="IANA_handshake" numbered="true" toc="default"> | ||||
| <name>MPTCP Handshake Algorithms</name> | ||||
| <t>The "MPTCP Handshake Algorithms" subregistry under the | ||||
| "Transmission Control Protocol (TCP) Parameters" registry was defined | ||||
| in <xref target="RFC6824"/>. Since <xref target="RFC6824"/> is an Experi | ||||
| mental RFC and not | ||||
| a Standards Track RFC, and since no further entries have occurred | ||||
| beyond those pointing to <xref target="RFC6824"/>, IANA has replaced | ||||
| the existing registry with the contents of | ||||
| <xref target="table_crypto" format="default"/> and with the following explanato | ||||
| ry note.</t> | ||||
| <t>Note: This registry specifies the MPTCP Handshake Algorithms for MPTC | ||||
| P v1, which obsoletes the Experimental MPTCP v0. For the MPTCP v0 subtypes, plea | ||||
| se refer to <xref target="RFC6824"/>.</t> | ||||
| <table anchor="table_crypto" align="center"> | ||||
| <name>MPTCP Handshake Algorithms</name> | ||||
| <thead> | ||||
| <tr> | ||||
| <th align="center">Flag Bit</th> | ||||
| <th align="center">Meaning</th> | ||||
| <th align="center">Reference</th> | ||||
| </tr> | ||||
| </thead> | ||||
| <tbody> | ||||
| <tr> | ||||
| <td align="center">A</td> | ||||
| <td align="center">Checksum required</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_init" format="defau | ||||
| lt"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">B</td> | ||||
| <td align="center">Extensibility</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_init" format="defau | ||||
| lt"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">C</td> | ||||
| <td align="center">Do not attempt to establish new subflows to the | ||||
| source address.</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_init" format="defau | ||||
| lt"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">D-G</td> | ||||
| <td align="center">Unassigned</td> | ||||
| <td align="center"/> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">H</td> | ||||
| <td align="center">HMAC-SHA256</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_join" format="defau | ||||
| lt"/></td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <t>Future assignments in this registry are also | <t>Note that the meanings of bits "D" through "H" can be dependent upon | |||
| to be defined by Standards Action as defined by <xref target="RFC8126"/>. | bit "B", | |||
| depending on how the Extensibility parameter is defined in future specific | ||||
| ations; see | ||||
| <xref target="sec_init" format="default"/> for more information.</t> | ||||
| <t>Future assignments in this registry are also | ||||
| to be defined by Standards Action as defined by <xref target="RFC8126" for | ||||
| mat="default"/>. | ||||
| Assignments consist of the value of the flags, a symbolic name for the algorithm, | Assignments consist of the value of the flags, a symbolic name for the algorithm, | |||
| and a reference to its specification.</t> | and a reference to its specification.</t> | |||
| </section> | ||||
| <section anchor="IANA_rst" title="MP_TCPRST Reason Codes"> | ||||
| <t>IANA is requested to create a further sub-registry, "MPTCP MP_TCPRST Re | ||||
| ason Codes" under the "Transmission Control Protocol (TCP) Parameters" registry, | ||||
| based on the reason code in MP_TCPRST (<xref target="sec_reset"/>) message. Ini | ||||
| tial values for this registry are given in <xref target="table_rstcodes"/>; futu | ||||
| re assignments are to be defined by Specification Required as defined by <xref t | ||||
| arget="RFC8126"/>. Assignments consist of the value of the code, a short descrip | ||||
| tion of its meaning, and a reference to its specification. The maximum value is | ||||
| 0xff.</t> | ||||
| <t>As guidance to the Designated Expert <xref target="RFC8126"/>, assignme | ||||
| nts should not normally be refused unless codepoint space is becoming scarce, pr | ||||
| oviding that there is a clear distinction from other, already-existing codes, an | ||||
| d also providing there is sufficient guidance for implementors both sending and | ||||
| receiving these codes.</t> | ||||
| <texttable anchor="table_rstcodes" title="MPTCP MP_TCPRST Reason Codes"> | ||||
| <ttcol align="center">Code</ttcol> | ||||
| <ttcol align="center">Meaning</ttcol> | ||||
| <ttcol align="center">Reference</ttcol> | ||||
| <c>0x00</c> | ||||
| <c>Unspecified TCP error</c> | ||||
| <c>This document, <xref target="sec_reset"/></c> | ||||
| <c>0x01</c> | ||||
| <c>MPTCP specific error</c> | ||||
| <c>This document, <xref target="sec_reset"/></c> | ||||
| <c>0x02</c> | ||||
| <c>Lack of resources</c> | ||||
| <c>This document, <xref target="sec_reset"/></c> | ||||
| <c>0x03</c> | ||||
| <c>Administratively prohibited</c> | ||||
| <c>This document, <xref target="sec_reset"/></c> | ||||
| <c>0x04</c> | ||||
| <c>Too much outstanding data</c> | ||||
| <c>This document, <xref target="sec_reset"/></c> | ||||
| <c>0x05</c> | ||||
| <c>Unacceptable performance</c> | ||||
| <c>This document, <xref target="sec_reset"/></c> | ||||
| <c>0x06</c> | ||||
| <c>Middlebox interference</c> | ||||
| <c>This document, <xref target="sec_reset"/></c> | ||||
| </texttable> | ||||
| </section> | </section> | |||
| <section anchor="IANA_rst" numbered="true" toc="default"> | ||||
| <name>MP_TCPRST Reason Codes</name> | ||||
| <t>IANA has created a further subregistry, "MPTCP MP_TCPRST | ||||
| Reason Codes" under the "Transmission Control Protocol (TCP) | ||||
| Parameters" registry, based on the reason code in the MP_TCPRST (<xref t | ||||
| arget="sec_reset" format="default"/>) message. Initial values for this registry | ||||
| are given in <xref target="table_rstcodes" format="default"/>; future assignment | ||||
| s are to be defined by Specification Required as defined by <xref target="RFC812 | ||||
| 6" format="default"/>. Assignments consist of the value of the code, a short des | ||||
| cription of its meaning, and a reference to its specification. The maximum value | ||||
| is 0xff.</t> | ||||
| <table anchor="table_rstcodes" align="center"> | ||||
| <name>MPTCP MP_TCPRST Reason Codes</name> | ||||
| <thead> | ||||
| <tr> | ||||
| <th align="center">Code</th> | ||||
| <th align="center">Meaning</th> | ||||
| <th align="center">Reference</th> | ||||
| </tr> | ||||
| </thead> | ||||
| <tbody> | ||||
| <tr> | ||||
| <td align="center">0x00</td> | ||||
| <td align="center">Unspecified error</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
| ult"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0x01</td> | ||||
| <td align="center">MPTCP-specific error</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
| ult"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0x02</td> | ||||
| <td align="center">Lack of resources</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
| ult"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0x03</td> | ||||
| <td align="center">Administratively prohibited</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
| ult"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0x04</td> | ||||
| <td align="center">Too much outstanding data</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
| ult"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0x05</td> | ||||
| <td align="center">Unacceptable performance</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
| ult"/></td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">0x06</td> | ||||
| <td align="center">Middlebox interference</td> | ||||
| <td align="center">RFC 8684, <xref target="sec_reset" format="defa | ||||
| ult"/></td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <t>As guidance to the designated expert <xref target="RFC8126" | ||||
| format="default"/>, assignments should not normally be refused unless | ||||
| codepoint space is becoming scarce, provided that there is a clear | ||||
| distinction from other, already-existing codes and also provided that th | ||||
| ere is sufficient guidance for implementers both sending and receiving these cod | ||||
| es.</t> | ||||
| </section> | ||||
| </section> | </section> | |||
| </middle> | </middle> | |||
| <!-- *****BACK MATTER ***** --> | ||||
| <back> | <back> | |||
| <references title="Normative References"> | <displayreference target="I-D.ananth-tcpm-tcpoptext" to="TCPLO"/> | |||
| &RFC0793; | ||||
| &RFC2104; | ||||
| &RFC2119; | ||||
| &RFC5961; | ||||
| &RFC6234; | ||||
| &RFC8174; | ||||
| </references> | ||||
| <references title="Informative References"> | ||||
| &RFC1122; | ||||
| &RFC7323; | ||||
| &RFC1918; | ||||
| &RFC2018; | ||||
| &RFC5681; | ||||
| &RFC2979; | ||||
| &RFC2992; | ||||
| &RFC3022; | ||||
| &RFC3135; | ||||
| &RFC4086; | ||||
| &RFC4987; | ||||
| &RFC8126; | ||||
| &RFC6181; | ||||
| &RFC6356; | ||||
| &RFC6897; | ||||
| &RFC6182; | ||||
| &RFC6528; | ||||
| &RFC7413; | ||||
| &RFC7430; | ||||
| &RFC8041; | ||||
| <!-- &TCPLO; draft-ananth-tcpm-tcpoptext-00; Expired--> | ||||
| <reference anchor='TCPLO'> | ||||
| <front> | ||||
| <title>TCP option space extension</title> | ||||
| <author initials='A' surname='Ramaiah' fullname='Anantha Ramaiah'> | ||||
| <organization /> | ||||
| </author> | ||||
| <date month='March' day='26' year='2012' /> | ||||
| <abstract><t>The document goals are as follows: Firstly, this document summarize | ||||
| s the motivations for extending TCP option space. Secondly, It tries to summari | ||||
| ze the various known issues that needs to be taken into account while extending | ||||
| the TCP option space. Thirdly, it briefly provides a short summary of the vario | ||||
| us TCP option space proposals that has been proposed so far. Some additional pr | ||||
| oposals which includes variations to the existing proposals are also presented. | ||||
| The goal of this document is to rejuvenate the discussions on this topic and eve | ||||
| ntually to converge on a scheme for extending TCP option space.</t></abstract> | ||||
| </front> | <references> | |||
| <name>References</name> | ||||
| <references> | ||||
| <name>Normative References</name> | ||||
| <seriesInfo name='Work in' value='Progress' /> | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.0793. | |||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2104. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2119. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.5961. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6234. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8174. | ||||
| xml"/> | ||||
| </references> | ||||
| </reference> | <references> | |||
| <name>Informative References</name> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1122. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7323. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1918. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2018. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.5681. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2979. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2992. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.3022. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.3135. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4086. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4987. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8126. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6181. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6356. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6897. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6182. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6528. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6824. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7413. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7430. | ||||
| xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8041. | ||||
| xml"/> | ||||
| <reference anchor='norm' target="http://www.usenix.org/events/sec01/full_papers/ | <!-- draft-ananth-tcpm-tcpoptext (Expired) --> | |||
| handley/handley.pdf"><front><title abbrev="Network Intrusion Detection: Evasion, | <xi:include href="https://www.rfc-editor.org/refs/bibxml3/reference.I-D.ananth-t | |||
| Traffic Normalization, and End-to-End Protocol Semantics ">Network Intrusion De | cpm-tcpoptext.xml"/> | |||
| tection: Evasion, Traffic Normalization, and End-to-End Protocol Semantics</titl | ||||
| e><author initials='M.' surname='Handley' fullname='Mark Handley'><organization> | ||||
| ACIRI</organization></author><author initials='V.' surname='Paxson' fullname='Ve | ||||
| rn Paxson'><organization>ACIRI</organization></author><author initials='C.' surn | ||||
| ame='Kreibich' fullname='Christian Kreibich'><organization>Technische Universita | ||||
| t Munchen</organization></author><date year="2001"/></front><seriesInfo name="Us | ||||
| enix Security" value="2001"/></reference> | ||||
| <reference anchor='howhard' target="https://www.usenix.org/conference/nsdi12/how | <reference anchor="norm" target="https://www.usenix.org/legacy/events/se | |||
| -hard-can-it-be-designing-and-implementing-deployable-multipath-tcp"> | c01/full_papers/handley/handley.pdf"> | |||
| <front><title abbrev="How Hard Can It Be? Designing and Implementing a Deployabl | <front> | |||
| e Multipath TCP">How Hard Can It Be? Designing and Implementing a Deployable Mul | <title abbrev="Network Intrusion Detection: Evasion, Traffic Normali | |||
| tipath TCP</title> | zation, and End-to-End Protocol Semantics ">Network Intrusion Detection: Evasion | |||
| <author initials='C.' surname='Raiciu' fullname='Costin Raiciu'><organization>Un | , Traffic | |||
| iversitatea Politehnica Bucuresti</organization></author> | Normalization, and End-to-End Protocol Semantics</title> | |||
| <author initials='C.' surname='Paasch' fullname='Christoph Paasch'><organization | <seriesInfo name="Usenix Security Symposium" value="2001"/> | |||
| >Universite Catholique de Louvain</organization></author> | <author initials="M." surname="Handley" fullname="Mark Handley"> | |||
| <author initials='S.' surname='Barre' fullname='Sebastien Barre'><organization>U | <organization>ACIRI</organization> | |||
| niversite Catholique de Louvain</organization></author> | </author> | |||
| <author initials='A.' surname='Ford' fullname='Alan Ford'><organization/></autho | <author initials="V." surname="Paxson" fullname="Vern Paxson"> | |||
| r> | <organization>ACIRI</organization> | |||
| <author initials='M.' surname='Honda' fullname='Michio Honda'><organization>Keio | </author> | |||
| University</organization></author> | <author initials="C." surname="Kreibich" fullname="Christian Kreibic | |||
| <author initials='F.' surname='Duchene' fullname='Fabien Duchene'><organization> | h"> | |||
| Universite Catholique de Louvain</organization></author> | <organization>Technische | |||
| <author initials='O.' surname='Bonaventure' fullname='Olivier Bonaventure'><orga | Universitat Munchen</organization> | |||
| nization>Universite Catholique de Louvain</organization></author> | </author> | |||
| <author initials='M.' surname='Handley' fullname='Mark Handley'><organization>Un | <date month="August" year="2001"/> | |||
| iversity College London</organization></author> | </front> | |||
| <date year="2012" /> | </reference> | |||
| </front> | ||||
| <seriesInfo name="Usenix Symposium on Networked Systems Design and Implementatio | ||||
| n" value="2012"/> | ||||
| </reference> | ||||
| <reference anchor='deployments' target="https://www.ietfjournal.org/multipath-tc | <reference anchor="howhard" target="https://www.usenix.org/conference/ns | |||
| p-deployments/"><front><title abbrev="MPTCP Deployments">Multipath TCP Deploymen | di12/technical-sessions/presentation/raiciu"> | |||
| ts</title><author initials='O.' surname='Bonaventure' fullname='Olivier Bonavent | <front> | |||
| ure'><organization>Universite Catholique de Louvain</organization></author><auth | <title abbrev="How Hard Can It Be? Designing and Implementing a Depl | |||
| or initials='S.' surname='Seo' fullname='SungHoon Seo'></author><date day="1" mo | oyable Multipath TCP">How Hard Can It Be? Designing and Implementing a Deployabl | |||
| nth="November" year="2016"/></front><seriesInfo name="IETF Journal" value="2016" | e Multipath TCP</title> | |||
| /></reference> | <seriesInfo name="Usenix Symposium on Networked Systems Design and I | |||
| mplementation" value="2012"/> | ||||
| <author initials="C." surname="Raiciu" fullname="Costin Raiciu"> | ||||
| <organization>Universitatea Politehnica Bucuresti</organization> | ||||
| </author> | ||||
| <author initials="C." surname="Paasch" fullname="Christoph Paasch"> | ||||
| <organization>Universite Catholique de Louvain</organization> | ||||
| </author> | ||||
| <author initials="S." surname="Barre" fullname="Sebastien Barre"> | ||||
| <organization>Universite Catholique de Louvain</organization> | ||||
| </author> | ||||
| <author initials="A." surname="Ford" fullname="Alan Ford"> | ||||
| <organization/> | ||||
| </author> | ||||
| <author initials="M." surname="Honda" fullname="Michio Honda"> | ||||
| <organization>Keio University</organization> | ||||
| </author> | ||||
| <author initials="F." surname="Duchene" fullname="Fabien Duchene"> | ||||
| <organization>Universite Catholique de Louvain</organization> | ||||
| </author> | ||||
| <author initials="O." surname="Bonaventure" fullname="Olivier Bonave | ||||
| nture"> | ||||
| <organization>Universite Catholique de Louvain</organization> | ||||
| </author> | ||||
| <author initials="M." surname="Handley" fullname="Mark Handley"> | ||||
| <organization>University College London</organization> | ||||
| </author> | ||||
| <date month="April" year="2012"/> | ||||
| </front> | ||||
| </reference> | ||||
| </references> | <reference anchor="deployments" target="https://www.ietfjournal.org/mult | |||
| ipath-tcp-deployments/"> | ||||
| <front> | ||||
| <title abbrev="MPTCP Deployments">Multipath TCP Deployments</title> | ||||
| <seriesInfo name="IETF Journal" value="2016"/> | ||||
| <author initials="O." surname="Bonaventure" fullname="Olivier Bonave | ||||
| nture"> | ||||
| <organization>Universite Catholique de Louvain</organization> | ||||
| </author> | ||||
| <author initials="S." surname="Seo" fullname="SungHoon Seo"/> | ||||
| <date month="November" year="2016"/> | ||||
| </front> | ||||
| </reference> | ||||
| </references> | ||||
| </references> | ||||
| <section title="Notes on Use of TCP Options" anchor="app_options"> | <section anchor="app_options" numbered="true" toc="default"> | |||
| <name>Notes on Use of TCP Options</name> | ||||
| <t>The TCP option space is limited due to the length of the Data Offset field in the TCP header (4 bits), which defines the TCP header length in 32-bit words. With the standard TCP header being 20 bytes, this leaves a maximum of 40 bytes for options, and many of these may already be used by options such as timestamp and SACK.</t> | <t>The TCP option space is limited due to the length of the Data Offset field in the TCP header (4 bits), which defines the TCP header length in 32-bit words. With the standard TCP header being 20 bytes, this leaves a maximum of 40 bytes for options, and many of these may already be used by options such as timestamp and SACK.</t> | |||
| <t>We performed a brief study on the commonly used TCP options in SYN, | ||||
| data, and pure ACK packets and found that there is enough room | ||||
| to fit all the options discussed in this document.</t> | ||||
| <t>SYN packets typically include the following options: Maximum Segment Si | ||||
| ze (MSS) (4 bytes), | ||||
| window scale (3 bytes), SACK permitted (2 bytes), and timestamp | ||||
| (10 bytes). The sum of these options is 19 bytes. Some operating | ||||
| systems appear to pad each option up to a word boundary, thus using 24 | ||||
| bytes (a brief survey suggests that Windows XP and Mac OS X do this, where | ||||
| as Linux does not). | ||||
| <t>We have performed a brief study on the commonly used TCP options in SYN | Optimistically, therefore, we have 21 bytes available, or 16 if options ha | |||
| , data, and pure ACK packets, and found that there is enough room to fit all the | ve to be | |||
| options we propose using in this document.</t> | word-aligned. In either case, however, the SYN versions of | |||
| MP_CAPABLE (12 bytes) and MP_JOIN (12 or 16 bytes) will fit in t | ||||
| <t>SYN packets typically include Maximum Segment Size (MSS) (4 bytes), win | his remaining space.</t> | |||
| dow scale (3 bytes), SACK permitted (2 bytes), and timestamp (10 bytes) options. | <t>Note that due to the use of a 64-bit data-level sequence space, it is | |||
| Together these sum to 19 bytes. Some operating systems appear to pad each optio | feasible that MPTCP will not require the timestamp option for | |||
| n up to a word boundary, thus using 24 bytes (a brief survey suggests Windows XP | protection against wrapped sequence numbers (per the Protection | |||
| and Mac OS X do this, whereas Linux does not). | Against Wrapped Sequences (PAWS) mechanism, as described in <xref target=" | |||
| RFC7323" | ||||
| Optimistically, therefore, we have 21 bytes spare, or 16 if it has to be w | format="default"/>), since the data-level sequence space has far less | |||
| ord-aligned. In either case, however, the SYN versions of Multipath Capable (12 | chance of wrapping. Confirmation of the validity of this optimization is | |||
| bytes) and Join (12 or 16 bytes) options will fit in this remaining space.</t> | left for further study.</t> | |||
| <t>TCP data packets typically carry timestamp options in every packet, | ||||
| <t>Note that due to the use of a 64-bit data-level sequence space, it is f | taking 10 bytes (or 12, with padding). That leaves 30 bytes (or 28, if | |||
| easible that MPTCP will not require the timestamp option for protection against | word-aligned). The DSS option varies in length, depending on (1) whet | |||
| wrapped sequence numbers (PAWS <xref target="RFC7323"/>), since the data-level s | her the | |||
| equence space has far less chance of wrapping. Confirmation of the validity of t | Data Sequence Mapping, DATA_ACK, or both are included, (2) whether th | |||
| his optimisation is for further study.</t> | e | |||
| sequence numbers in use are 4 or 8 octets, and (3) whether the | ||||
| <t>TCP data packets typically carry timestamp options in every packet, tak | checksum is present. The maximum size of the DSS option is 28 bytes, so ev | |||
| ing 10 bytes (or 12 with padding). That leaves 30 bytes (or 28, if word-aligned) | en that will fit in the available space. But unless a connection is both bidirec | |||
| . The Data Sequence Signal (DSS) option varies in length depending on whether th | tional and high-bandwidth, it is unlikely that all that option space will be req | |||
| e data sequence mapping and DATA_ACK are included, and whether the sequence numb | uired on each DSS option.</t> | |||
| ers in use are 4 or 8 octets. The maximum size of the DSS option is 28 bytes, so | <t>Within the DSS option, it is not necessary to include the Data Sequence | |||
| even that will fit in the available space. But unless a connection is both bidi | Mapping and DATA_ACK in each packet, and in many cases it may be possible to al | |||
| rectional and high-bandwidth, it is unlikely that all that option space will be | ternate their presence (so long as the mapping covers the data being sent in the | |||
| required on each DSS option.</t> | subsequent packet). It would also be possible to alternate between 4-byte and 8 | |||
| -byte sequence numbers in each option.</t> | ||||
| <t>Within the DSS option, it is not necessary to include the data sequence | <t>On subflow and connection setup, an MPTCP option is also set on the thi | |||
| mapping and DATA_ACK in each packet, and in many cases it may be possible to al | rd packet (an ACK). These are 20 bytes (for MP_CAPABLE) and 24 bytes (for M | |||
| ternate their presence (so long as the mapping covers the data being sent in the | P_JOIN), both of which will fit in the available option space.</t> | |||
| following packet). It would also be possible to alternate between 4- and 8-byte | ||||
| sequence numbers in each option.</t> | ||||
| <t>On subflow and connection setup, an MPTCP option is also set on the thi | ||||
| rd packet (an ACK). These are 20 bytes (for Multipath Capable) and 24 bytes (for | ||||
| Join), both of which will fit in the available option space.</t> | ||||
| <t>Pure ACKs in TCP typically contain only timestamps (10 bytes). Here, Multipath TCP typically | <t>Pure ACKs in TCP typically contain only timestamps (10 bytes). Here, Multipath TCP typically | |||
| needs to encode only the DATA_ACK (maximum of 12 bytes). Occasionally, ACKs will contain SACK information. Depending | needs to encode only the DATA_ACK (maximum of 12 bytes). Occasionally, ACKs will contain SACK information. Depending | |||
| on the number of lost packets, SACK may utilize the entire option space. If a DATA_ACK had to be | on the number of lost packets, SACK may utilize the entire option space. If a DATA_ACK had to be | |||
| included, then it is probably necessary to reduce the number of SACK blocks to accommodate the | included, then it is probably necessary to reduce the number of SACK blocks to accommodate the | |||
| DATA_ACK. However, the presence of the DATA_ACK is unlikely to be necessary in a case where SACK is | DATA_ACK. However, the presence of the DATA_ACK is unlikely to be necessary in a case where SACK is | |||
| in use, since until at least some of the SACK blocks have been retransmitted, the cumulative | in use, since until at least some of the SACK blocks have been retransmitted, the cumulative | |||
| data-level ACK will not be moving forward (or if it does, due to retransmissions on another path, | data-level ACK will not be moving forward (or if it does, due to retransmissions on another path, | |||
| then that path can also be used to transmit the new DATA_ACK).</t> | then that path can also be used to transmit the new DATA_ACK).</t> | |||
| <t>The ADD_ADDR option can be between 16 and 30 bytes, depending on | ||||
| <t>The ADD_ADDR option can be between 16 and 30 bytes, depending on whethe | (1) whether IPv4 or IPv6 is used and (2) whether or not the port | |||
| r IPv4 or IPv6 is used, and whether or not the port number is present. It is unl | number is | |||
| ikely that such signaling would fit in a data packet (although if there is space | present. It is unlikely that such signaling would fit in a data packet | |||
| , it is fine to include it). It is recommended to use duplicate ACKs with no oth | (although if there is space, it is fine to include it). It is | |||
| er payload or options in order to transmit these rare signals. Note this is the | recommended that duplicate ACKs with no other payload or options be used | |||
| reason for mandating that duplicate ACKs with MPTCP options are not taken as a s | in | |||
| ignal of congestion.</t> | order to transmit these rare signals. Note that this is the reason for | |||
| mandating that duplicate ACKs with MPTCP options not be taken as a signal | ||||
| of congestion.</t> | ||||
| </section> | </section> | |||
| <section anchor="app_tfo" numbered="true" toc="default"> | ||||
| <section title="TCP Fast Open and MPTCP" anchor="app_tfo"> | <name>TCP Fast Open and MPTCP</name> | |||
| <t>TCP Fast Open (TFO) is an experimental TCP extension, described in | <t>TCP Fast Open (TFO) is an experimental TCP extension, described in | |||
| <xref target="RFC7413"/>, which has been introduced to allow sending data | <xref target="RFC7413" format="default"/>, which has been introduced to | |||
| allow the sending of data | ||||
| one RTT earlier than with regular TCP. This is | one RTT earlier than with regular TCP. This is | |||
| considered a valuable gain as very short connections are very common, | considered a valuable gain, as very short connections are very common, | |||
| especially for HTTP request/response schemes. It achieves this by sending | especially for HTTP request/response schemes. It achieves this by sending | |||
| the SYN-segment together with the application's data and allowing the list | the SYN segment together with the application's data and allowing the list | |||
| ener to reply | ener to reply | |||
| immediately with data after the SYN/ACK. <xref target="RFC7413"/> secures | immediately with data after the SYN/ACK. <xref target="RFC7413" format="de | |||
| this mechanism, by using a new TCP option that includes a cookie which | fault"/> secures | |||
| this mechanism by using a new TCP option that includes a cookie that | ||||
| is negotiated in a preceding connection.</t> | is negotiated in a preceding connection.</t> | |||
| <t>When using TFO in conjunction with MPTCP, there are two key | ||||
| points to take into account, as detailed below.</t> | ||||
| <section anchor="tfocookie" numbered="true" toc="default"> | ||||
| <name>TFO Cookie Request with MPTCP</name> | ||||
| <t>When a TFO initiator first connects to a listener, it cannot immediat | ||||
| ely | ||||
| include data in the SYN for security reasons <xref target="RFC7413" fo | ||||
| rmat="default"/>. | ||||
| Instead, it requests a cookie that will be used in subsequent | ||||
| connections. This is done with the TCP cookie request/response options | ||||
| , | ||||
| of 2 bytes and 6-18 bytes, respectively (depending on the chosen cooki | ||||
| e length).</t> | ||||
| <t>TFO and MPTCP can be combined, provided that the total length of all | ||||
| the | ||||
| options does not exceed the maximum 40 bytes possible in TCP: | ||||
| <t>When using TCP Fast Open in conjunction with MPTCP, there are two key | </t> | |||
| points to take into account, detailed hereafter.</t> | <ul spacing="normal"> | |||
| <li>In the SYN: MPTCP uses a 4-byte MP_CAPABLE option. The sum | ||||
| <section title="TFO cookie request with MPTCP" anchor="tfocookie"> | of the MPTCP and TFO options is 6 bytes. With typical TCP options usin | |||
| <t>When a TFO initiator first connects to a listener, it cannot immedia | g up | |||
| tely | to 19 bytes in the SYN (24 bytes if options are padded at a word bound | |||
| include data in the SYN for security reasons <xref target="RFC7413"/>. | ary), | |||
| Instead, it requests a cookie that will be used in subsequent | there is enough space to combine the MP_CAPABLE with the TFO cookie re | |||
| connections. This is done with the TCP cookie request/response options, | quest.</li> | |||
| of respectively 2 bytes and 6-18 bytes (depending on the chosen cookie | <li>In the SYN + ACK: MPTCP uses a 12-byte MP_CAPABLE option, but | |||
| length).</t> | now the TFO option can be as long as 18 bytes. Since the maximum optio | |||
| n length | ||||
| <t>TFO and MPTCP can be combined provided that the total length of all | may be exceeded, it is up to the listener to avoid this problem by usi | |||
| the | ng a | |||
| options does not exceed the maximum 40 bytes possible in TCP: | shorter cookie. | |||
| As an example, if we consider that 19 bytes are used for classical | ||||
| <list style="symbols"> | TCP options, the maximum possible cookie length would be | |||
| <t>In the SYN: MPTCP uses a 4-bytes long MP_CAPABLE option. The MPTCP | 7 bytes. Note that, for the SYN packet, the same limitation applies to | |||
| and TFO options sum up to 6 bytes. With typical TCP-options using up | subsequent | |||
| to 19 bytes in the SYN (24 bytes if options are padded at a word bounda | connections (because the initiator then echoes | |||
| ry), | the cookie back to the listener). Finally, if the security impact of r | |||
| there is enough space to combine the MP_CAPABLE with the TFO Cookie Req | educing | |||
| uest.</t> | the cookie size is not deemed acceptable, the listener can reduce the | |||
| amount of space used by other TCP options by omitting the TCP timestam | ||||
| <t>In the SYN+ACK: MPTCP uses a 12-bytes long MP_CAPABLE option, but | ps (as | |||
| now TFO can be as long as 18 bytes. Since the maximum option length | outlined in <xref target="app_options" format="default"/>).</li> | |||
| may be exceeded, it is up to the listener to solve this by using a | </ul> | |||
| shorter cookie. | </section> | |||
| As an example, if we consider that 19 bytes are used for classical | <section anchor="tfodata" numbered="true" toc="default"> | |||
| TCP options, the maximum possible cookie length would be | <name>Data Sequence Mapping under TFO</name> | |||
| of 7 bytes. Note that the same limitation applies to subsequent | <t>In the TCP establishment phase, MPTCP uses a key exchange that is | |||
| connections, for the SYN packet (because the initiator then echoes back | used to generate the Initial Data Sequence Numbers (IDSNs). In particu | |||
| the cookie to the listener). Finally, if the security impact of reducin | lar, | |||
| g | the SYN with MP_CAPABLE occupies the first octet of data sequence | |||
| the cookie size is not deemed acceptable, the listener can reduce the | space. With TFO, one way to handle the data sent together with the SYN | |||
| amount of other TCP-options by omitting the TCP timestamps (as | would be to consider an implicit DSS mapping that covers that SYN segm | |||
| outlined in <xref target="app_options"/>).</t> | ent | |||
| </list></t> | (since there is not enough space in the SYN to include a DSS option). | |||
| </section> | The problem with that approach is that if a middlebox modifies the TFO | |||
| data, this will not be noticed by MPTCP because of the absence of a | ||||
| <section title="Data sequence mapping under TFO" anchor="tfodata"> | DSS checksum. For example, a TCP‑aware (but not MPTCP-aware) mid | |||
| <t>MPTCP uses, in the TCP establishment phase, a key exchange that is | dlebox could | |||
| used to generate the Initial Data Sequence Numbers (IDSNs). In particul | insert bytes at the beginning of the stream and adapt the TCP checksum | |||
| ar, | and sequence numbers accordingly. With an implicit mapping, this infor | |||
| the SYN with MP_CAPABLE occupies the first octet of the data sequence | mation would | |||
| space. With TFO, one way to handle the data sent together with the SYN | give the initiator and listener a different view of the DSS | |||
| would be to consider an implicit DSS mapping that covers that SYN segme | mapping; there would be no | |||
| nt | way to detect this inconsistency, because the DSS checksum is not pres | |||
| (since there is not enough space in the SYN to include a DSS option). | ent.</t> | |||
| The problem with that approach is that if a middlebox modifies the TFO | <t>To solve this issue, the TFO data must not be considered part of the | |||
| data, this will not be noticed by MPTCP because of the absence of a | data sequence number space: the SYN with MP_CAPABLE still occupies | |||
| DSS-checksum. For example, a TCP (but not MPTCP)-aware middlebox could | the first octet of data sequence space, but then the first non-TFO | |||
| insert bytes at the beginning of the stream and adapt the TCP checksum | data byte occupies the second octet. This guarantees that, if the | |||
| and sequence numbers accordingly. With an implicit mapping, this would | use of the DSS checksum is negotiated, all data in the data sequence | |||
| give to initiator and listener a different view on the DSS-mapping, wit | number space is checksummed. We also note that this does not entail | |||
| h no | a loss of functionality, because TFO data is always only sent on the | |||
| way to detect this inconsistency as the DSS checksum is not present.</t | initial subflow, before any attempt to create additional subflows.</t> | |||
| > | </section> | |||
| <section anchor="tfoexamples" numbered="true" toc="default"> | ||||
| <t>To solve this, the TFO data must not be considered part of the | <name>Connection Establishment Examples</name> | |||
| Data Sequence Number space: the SYN with MP_CAPABLE still occupies | <t>A few examples of possible "TFO + MPTCP" | |||
| the first octet of data sequence space, but then the first non-TFO | establishment scenarios are shown below.</t> | |||
| data byte occupies the second octet. This guarantees that, if the | <t>Before an initiator can send data together with the SYN, it must requ | |||
| use of DSS-checksum is negotiated, all data in the data sequence | est | |||
| number space is checksummed. We also note that this does not entail | a cookie from the listener, as shown in <xref target="fig_tfocookie" | |||
| a loss of functionality, because TFO-data is always only sent on the | format="default"/>. (Note: The sequence number | |||
| initial subflow before any attempt to create additional subflows.</t> | and length are annotated in <xref target="fig_tfocookie" format="default"/> as | |||
| </section> | Seq(Length) (e.g., "S. 0(0)") and used as such in the subsequent figures | |||
| (e.g., "S 0(20)" in <xref target="fig_tfodata"/>).) This is done b | ||||
| <section title="Connection establishment examples" anchor="tfoexamples"> | y simply combining the TFO and MPTCP options.</t> | |||
| <t>The following shows a few examples of possible TFO+MPTCP | <figure anchor="fig_tfocookie"> | |||
| establishment scenarios.</t> | <name>Cookie Request</name> | |||
| <artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
| <t>Before an initiator can send data together with the SYN, it must re | initiator listener | |||
| quest | | | | |||
| a cookie to the listener, as shown in <xref target="fig_tfocookie"/>. | | S Seq=0(Length=0) <MP_CAPABLE>, <TFO cookie request> | | |||
| This is done by simply combining the TFO and MPTCP options.</t> | | --------------------------------------------------------> | | |||
| | | | ||||
| <figure align="center" anchor="fig_tfocookie" title="Cookie request - | | S. 0(0) ack 1 <MP_CAPABLE>, <TFO cookie> | | |||
| sequence number and length are annotated as Seq(Length) and used hereafter in th | | <-------------------------------------------------------- | | |||
| e figures."> | | | | |||
| <artwork align="left"><![CDATA[ | | . 0(0) ack 1 <MP_CAPABLE> | | |||
| initiator listener | | --------------------------------------------------------> | | |||
| | | | | | ]]></artwork> | |||
| | S Seq=0(Length=0) <MP_CAPABLE>, <TFO cookie request> | | </figure> | |||
| | -----------------------------------------------------------> | | <t>Once this is done, the received cookie can be used for TFO, as shown | |||
| | | | in <xref target="fig_tfodata" format="default"/>. In this example, the | |||
| | S. 0(0) ack 1 <MP_CAPABLE>, <TFO cookie> | | initiator first | |||
| | <----------------------------------------------------------- | | sends 20 bytes in the SYN. The listener immediately replies with 100 b | |||
| | | | ytes | |||
| | . 0(0) ack 1 <MP_CAPABLE> | | following the SYN-ACK, to which the initiator replies with 20 more byt | |||
| | -----------------------------------------------------------> | | es. | |||
| | | | Note that the last segment in the figure | |||
| ]]></artwork> | ||||
| </figure> | ||||
| <t>Once this is done, the received cookie can be used for TFO, as show | ||||
| n | ||||
| in <xref target="fig_tfodata"/>. In this example, the initiator first | ||||
| sends 20 bytes in the SYN. The listener immediately replies with 100 by | ||||
| tes | ||||
| following the SYN-ACK upon which the initiator replies with 20 more byt | ||||
| es. | ||||
| Note that the last segment in the figure | ||||
| has a TCP sequence number of 21, while the DSS subflow sequence | has a TCP sequence number of 21, while the DSS subflow sequence | |||
| number is 1 (because the TFO data is not part of the data sequence | number is 1 (because the TFO data is not part of the data sequence | |||
| number space, as explained in Section <xref target="tfodata"/>).</t> | number space, as explained in <xref target="tfodata" format="default"/ | |||
| >).</t> | ||||
| <figure align="center" anchor="fig_tfodata" title="The listener support | ||||
| s TFO"> | ||||
| <artwork align="left"><![CDATA[ | ||||
| initiator listener | ||||
| | | | ||||
| | S 0(20) <MP_CAPABLE>, <TFO cookie> | | ||||
| | -----------------------------------------------------------> | | ||||
| | | | ||||
| | S. 0(0) ack 21 <MP_CAPABLE> | | ||||
| | <----------------------------------------------------------- | | ||||
| | | | ||||
| | . 1(100) ack 21 <DSS ack=1 seq=1 ssn=1 dlen=100> | | ||||
| | <----------------------------------------------------------- | | ||||
| | | | ||||
| | . 21(0) ack 1 <MP_CAPABLE> | | ||||
| | -----------------------------------------------------------> | | ||||
| | | | ||||
| | . 21(20) ack 101 <DSS ack=101 seq=1 ssn=1 dlen=20> | | ||||
| | -----------------------------------------------------------> | | ||||
| | | | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| <t>In <xref target="fig_tfofallback"/>, the listener does not support | <figure anchor="fig_tfodata"> | |||
| TFO. The initiator detects | <name>The Listener Supports TFO</name> | |||
| that no state is created in the listener (as no data is acked), and no | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| w | initiator listener | |||
| sends the MP_CAPABLE in the third ack, in order for the listener to | | | | |||
| build its MPTCP context at then end of the establishment. Now, the | | S 0(20) <MP_CAPABLE>, <TFO cookie> | | |||
| tfo data, retransmitted, becomes part of the data sequence mapping | | --------------------------------------------------------> | | |||
| because it is effectively sent (in fact re-sent) after the | | | | |||
| | S. 0(0) ack 21 <MP_CAPABLE> | | ||||
| | <-------------------------------------------------------- | | ||||
| | | | ||||
| | . 1(100) ack 21 <DSS ack=1 seq=1 ssn=1 dlen=100> | | ||||
| | <-------------------------------------------------------- | | ||||
| | | | ||||
| | . 21(0) ack 1 <MP_CAPABLE> | | ||||
| | --------------------------------------------------------> | | ||||
| | | | ||||
| | . 21(20) ack 101 <DSS ack=101 seq=1 ssn=1 dlen=20> | | ||||
| | --------------------------------------------------------> | | ||||
| | | ]]></artwork> | ||||
| </figure> | ||||
| <t>In <xref target="fig_tfofallback" format="default"/>, the listener do | ||||
| es not support TFO. The initiator detects | ||||
| that no state is created in the listener (as no data is ACKed) and now | ||||
| sends the MP_CAPABLE in the third packet, in order for the listener to | ||||
| build its MPTCP context at the end of the establishment. Now, the | ||||
| TFO data, when retransmitted, becomes part of the Data Sequence Mappin | ||||
| g | ||||
| because it is effectively sent (in fact re‑sent) after the | ||||
| establishment.</t> | establishment.</t> | |||
| <figure anchor="fig_tfofallback"> | ||||
| <figure align="center" anchor="fig_tfofallback" title="The listener doe | <name>The Listener Does Not Support TFO</name> | |||
| s not support TFO"> | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| <artwork align="left"><![CDATA[ | initiator listener | |||
| initiator listener | | | | |||
| | | | | S 0(20) <MP_CAPABLE>, <TFO cookie> | | |||
| | S 0(20) <MP_CAPABLE>, <TFO cookie> | | | --------------------------------------------------------> | | |||
| | -----------------------------------------------------------> | | | | | |||
| | | | | S. 0(0) ack 1 <MP_CAPABLE> | | |||
| | S. 0(0) ack 1 <MP_CAPABLE> | | | <-------------------------------------------------------- | | |||
| | <----------------------------------------------------------- | | | | | |||
| | | | | . 1(0) ack 1 <MP_CAPABLE> | | |||
| | . 1(0) ack 1 <MP_CAPABLE> | | | --------------------------------------------------------> | | |||
| | -----------------------------------------------------------> | | | | | |||
| | | | | . 1(20) ack 1 <DSS ack=1 seq=1 ssn=1 dlen=20> | | |||
| | . 1(20) ack 1 <DSS ack=1 seq=1 ssn=1 dlen=20> | | | --------------------------------------------------------> | | |||
| | -----------------------------------------------------------> | | | | | |||
| | | | | . 0(0) ack 21 <DSS ack=21 seq=1 ssn=1 dlen=0> | | |||
| | . 0(0) ack 21 <DSS ack=21 seq=1 ssn=1 dlen=0> | | | <-------------------------------------------------------- | | |||
| | <----------------------------------------------------------- | | | | ]]></artwork> | |||
| | | | </figure> | |||
| ]]></artwork> | <t>It is also possible that the listener acknowledges only part of the T | |||
| </figure> | FO | |||
| data, as illustrated in <xref target="fig_tfopartial" format="default" | ||||
| <t>It is also possible that the listener acknowledges only part of the | />. The | |||
| TFO | initiator will simply retransmit the missing data together with a | |||
| data, as illustrated in <xref target="fig_tfopartial"/>. The | DSS mapping.</t> | |||
| initiator will simply retransmit the missing data together with a DSS-m | <figure anchor="fig_tfopartial"> | |||
| apping.</t> | <name>Partial Data Acknowledgment</name> | |||
| <artwork align="left" name="" type="" alt=""><![CDATA[ | ||||
| <figure align="center" anchor="fig_tfopartial" title="Partial data ackn | initiator listener | |||
| owledgement"> | | | | |||
| <artwork align="left"><![CDATA[ | | S 0(1000) <MP_CAPABLE>, <TFO cookie> | | |||
| initiator listener | | --------------------------------------------------------> | | |||
| | | | | | | |||
| | S 0(1000) <MP_CAPABLE>, <TFO cookie> | | | S. 0(0) ack 501 <MP_CAPABLE> | | |||
| | -----------------------------------------------------------> | | | <-------------------------------------------------------- | | |||
| | | | | | | |||
| | S. 0(0) ack 501 <MP_CAPABLE> | | | . 501(0) ack 1 <MP_CAPABLE> | | |||
| | <----------------------------------------------------------- | | | --------------------------------------------------------> | | |||
| | | | | | | |||
| | . 501(0) ack 1 <MP_CAPABLE> | | | . 501(500) ack 1 <DSS ack=1 seq=1 ssn=1 dlen=500> | | |||
| | -----------------------------------------------------------> | | | --------------------------------------------------------> | | |||
| | | | | | ]]></artwork> | |||
| | . 501(500) ack 1 <DSS ack=1 seq=1 ssn=1 dlen=500> | | </figure> | |||
| | -----------------------------------------------------------> | | </section> | |||
| | | | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| </section> | ||||
| </section> | </section> | |||
| <section anchor="app_tcb" numbered="true" toc="default"> | ||||
| <section title="Control Blocks" anchor="app_tcb"> | <name>Control Blocks</name> | |||
| <t>Conceptually, an MPTCP connection can be represented as an MPTCP protocol con | <t>Conceptually, an MPTCP connection can be represented as an MPTCP protoc | |||
| trol | ol control | |||
| block (PCB) that contains several variables that track the progress and the | block (PCB) that contains several variables that track the progress and the | |||
| state of the MPTCP connection and a set of linked TCP control blocks | state of the MPTCP connection and a set of linked TCP control blocks | |||
| that correspond to the subflows that have been established.</t> | that correspond to the subflows that have been established.</t> | |||
| <t>RFC 793 <xref target="RFC0793" format="default"/> specifies several sta | ||||
| <t>RFC 793 <xref target="RFC0793"/> specifies several state variables. Whenever | te variables. Whenever possible, we reuse | |||
| possible, we reuse | the same terminology as RFC 793 to describe the state variables that are | |||
| the same terminology as RFC 793 to describe the state variables that are | ||||
| maintained by MPTCP.</t> | maintained by MPTCP.</t> | |||
| <section numbered="true" toc="default"> | ||||
| <section title="MPTCP Control Block"> | <name>MPTCP Control Block</name> | |||
| <t>The MPTCP control block contains the following variable per connection.</t> | <t>The MPTCP control block contains the following variables per connecti | |||
| on.</t> | ||||
| <section title="Authentication and Metadata"> | <section numbered="true" toc="default"> | |||
| <t><list style="hanging"> | <name>Authentication and Metadata</name> | |||
| <t hangText="Local.Token (32 bits):"> This is the token chosen by the local host | <dl newline="false" spacing="normal" indent="3"> | |||
| on | <dt>Local.Token (32 bits):</dt> | |||
| <dd> This is the token chosen by the local host on | ||||
| this MPTCP connection. The token must be unique among all established | this MPTCP connection. The token must be unique among all established | |||
| MPTCP connections, and is generated from the local key.</t> | MPTCP connections and is generated from the local key.</dd> | |||
| <t hangText="Local.Key (64 bits):"> This is the key sent by the local host on th | <dt>Local.Key (64 bits):</dt> | |||
| is | <dd> This is the key sent by the local host on this | |||
| MPTCP connection.</t> | MPTCP connection.</dd> | |||
| <t hangText="Remote.Token (32 bits):"> This is the token chosen by the remote ho | <dt>Remote.Token (32 bits):</dt> | |||
| st on | <dd> This is the token chosen by the remote host on | |||
| this MPTCP connection, generated from the remote key.</t> | this MPTCP connection, generated from the remote key.</dd> | |||
| <t hangText="Remote.Key (64 bits):"> This is the key chosen by the remote host o | <dt>Remote.Key (64 bits):</dt> | |||
| n | <dd> This is the key chosen by the remote host on | |||
| this MPTCP connection</t> | this MPTCP connection.</dd> | |||
| <t hangText="MPTCP.Checksum (flag):"> This flag is set to true if at least one o | <dt>MPTCP.Checksum (flag):</dt> | |||
| f the | <dd> This flag is set to true if at least one of the | |||
| hosts has set the A bit in the MP_CAPABLE options exchanged during connection es | hosts has set the "A" bit in the MP_CAPABLE options exchanged during | |||
| tablishment, | connection establishment; otherwise, | |||
| and is set to false otherwise. If this flag is set, the checksum must be comput | it is set to false. If this flag is set, the checksum must be computed in | |||
| ed in | all DSS options.</dd> | |||
| all DSS options.</t> | </dl> | |||
| </list></t> | </section> | |||
| </section> | <section numbered="true" toc="default"> | |||
| <name>Sending Side</name> | ||||
| <section title="Sending Side"> | <dl newline="false" spacing="normal" indent="3"> | |||
| <t><list style="hanging"> | <dt>SND.UNA (64 bits):</dt> | |||
| <t hangText="SND.UNA (64 bits):"> This is the data sequence number of the next b | <dd> This is the data sequence number of the next byte to be | |||
| yte to be | ||||
| acknowledged, at the MPTCP connection level. This variable is updated | acknowledged, at the MPTCP connection level. This variable is updated | |||
| upon reception of a DSS option containing a DATA_ACK.</t> | upon reception of a DSS option containing a DATA_ACK.</dd> | |||
| <t hangText="SND.NXT (64 bits):"> This is the data sequence number of the next b | <dt>SND.NXT (64 bits):</dt> | |||
| yte to be | <dd> This is the data sequence number of the next byte to be | |||
| sent. SND.NXT is used to determine the value of the DSN in the DSS option.</t> | sent. SND.NXT is used to determine the value of the DSN in the DSS option.</dd> | |||
| <t hangText="SND.WND (32 bits with RFC 7323, 16 bits otherwise):"> This is the s | <dt>SND.WND (32 bits):</dt> | |||
| ending window. MPTCP | <dd> This is the send window. 32 bits if the features in RFC | |||
| maintains the sending window at the MPTCP connection level and the same | 7323 are used; 16 bits otherwise. MPTCP maintains the send window at | |||
| window is shared by all subflows. All subflows use the MPTCP connection | the MPTCP connection level, and the same | |||
| level SND.WND to compute the SEG.WND value that is sent in each | window is shared by all subflows. All subflows use the MPTCP connection-level | |||
| transmitted segment.</t> | SND.WND to compute the SEG.WND value that is sent in each | |||
| </list></t> | transmitted segment.</dd> | |||
| </section> | </dl> | |||
| </section> | ||||
| <section title="Receiving Side"> | <section numbered="true" toc="default"> | |||
| <t><list style="hanging"> | <name>Receiving Side</name> | |||
| <t hangText="RCV.NXT (64 bits):"> This is the data sequence number of the next b | <dl newline="false" spacing="normal" indent="3"> | |||
| yte that | <dt>RCV.NXT (64 bits):</dt> | |||
| <dd> This is the data sequence number of the next byte that | ||||
| is expected on the MPTCP connection. This state variable is modified | is expected on the MPTCP connection. This state variable is modified | |||
| upon reception of in-order data. The value of RCV.NXT is used to specify | upon reception of in-order data. The value of RCV.NXT is used to specify | |||
| the DATA_ACK that is sent in the DSS option on all subflows.</t> | the DATA_ACK that is sent in the DSS option on all subflows.</dd> | |||
| <t hangText="RCV.WND (32 bits with RFC 7323, 16 bits otherwise):"> This is the c | <dt>RCV.WND (32 bits):</dt> | |||
| onnection-level | <dd> This is the connection-level receive window, which is the | |||
| receive window, which is the maximum of the RCV.WND on all the subflows.</t> | maximum of the RCV.WND on all the subflows. 32 bits if the fea | |||
| </list></t> | tures in RFC 7323 are used; 16 bits otherwise.</dd> | |||
| </section> | </dl> | |||
| </section> | </section> | |||
| </section> | ||||
| <section title="TCP Control Blocks"> | <section numbered="true" toc="default"> | |||
| <t>The MPTCP control block also contains a list of the TCP control blocks | <name>TCP Control Blocks</name> | |||
| <t>The MPTCP control block also contains a list of the TCP control block | ||||
| s | ||||
| that are associated with the MPTCP connection.</t> | that are associated with the MPTCP connection.</t> | |||
| <t>Note that the TCP control block on the TCP subflows does not contain | ||||
| <t>Note that the TCP control block on the TCP subflows does not contain the | the | |||
| RCV.WND and SND.WND state variables as these are maintained at the MPTCP | RCV.WND and SND.WND state variables, as these are maintained at the MPTCP | |||
| connection level and not at the subflow level.</t> | connection level and not at the subflow level.</t> | |||
| <t>Inside each TCP control block, the following state variables are defi | ||||
| <t>Inside each TCP control block, the following state variables are defined.</t> | ned.</t> | |||
| <section numbered="true" toc="default"> | ||||
| <section title="Sending Side"> | <name>Sending Side</name> | |||
| <t><list style="hanging"> | <dl newline="false" spacing="normal" indent="3"> | |||
| <t hangText="SND.UNA (32 bits):"> This is the sequence number of the next byte t | <dt>SND.UNA (32 bits):</dt> | |||
| o be | <dd> This is the sequence number of the next byte to be | |||
| acknowledged on the subflow. This variable is updated upon reception of | acknowledged on the subflow. This variable is updated upon reception of | |||
| each TCP acknowledgment on the subflow.</t> | each TCP acknowledgment on the subflow.</dd> | |||
| <t hangText="SND.NXT (32 bits):"> This is the sequence number of the next byte t | <dt>SND.NXT (32 bits):</dt> | |||
| o be | <dd> This is the sequence number of the next byte to be | |||
| sent on the subflow. SND.NXT is used to set the value of SEG.SEQ upon | sent on the subflow. SND.NXT is used to set the value of SEG.SEQ upon | |||
| transmission of the next segment.</t> | transmission of the next segment.</dd> | |||
| </list></t> | </dl> | |||
| </section> | </section> | |||
| <section numbered="true" toc="default"> | ||||
| <section title="Receiving Side"> | <name>Receiving Side</name> | |||
| <t><list style="hanging"> | <dl newline="false" spacing="normal" indent="3"> | |||
| <t hangText="RCV.NXT (32 bits):"> This is the sequence number of the next byte t | <dt>RCV.NXT (32 bits):</dt> | |||
| hat | <dd> This is the sequence number of the next byte that | |||
| is expected on the subflow. This state variable is modified upon | is expected on the subflow. This state variable is modified upon | |||
| reception of in-order segments. The value of RCV.NXT is copied to the | reception of in-order segments. The value of RCV.NXT is copied to the | |||
| SEG.ACK field of the next segments transmitted on the subflow.</t> | SEG.ACK field of the next segments transmitted on the subflow.</dd> | |||
| <t hangText="RCV.WND (32 bits with RFC 7323, 16 bits otherwise):"> This is the | <dt>RCV.WND (32 bits):</dt> | |||
| subflow-level receive window that is updated with the window field from the | <dd>This is the subflow-level receive window that is updated with | |||
| segments received on this subflow.</t> | the window field from the segments received on this subflow. | |||
| </list></t> | 32 | |||
| </section> | bits if the features in RFC 7323 are used; 16 bits otherwise.</dd> | |||
| </section> | </dl> | |||
| </section> | ||||
| </section> | </section> | |||
| </section> | ||||
| <section title="Finite State Machine" anchor="app_fsm"> | <section anchor="app_fsm" numbered="true" toc="default"> | |||
| <t>The diagram in <xref target="fig_fsm"/> shows the Finite State Machine | <name>Finite State Machine</name> | |||
| for connection-level closure. This illustrates how the DATA_FIN connection-leve | <t>The diagram in <xref target="fig_fsm" format="default"/> shows the | |||
| l signal (indicated in the diagram as the DFIN flag on a DATA_ACK) interacts wit | Finite State Machine for connection-level closure. This illustrates how | |||
| h subflow-level FINs, and permits "break-before-make" handover between subflows. | the DATA_FIN connection-level signal (indicated in the diagram as the | |||
| </t> | DFIN flag on a DATA_ACK) (1) interacts with subflow-level FINs and (2) per | |||
| mits break-before-make handover between subflows.</t> | ||||
| <figure align="center" anchor="fig_fsm" title="Finite State Machine for Co | <figure anchor="fig_fsm"> | |||
| nnection Closure"> | <name>Finite State Machine for Connection Closure</name> | |||
| <artwork align="left"><![CDATA[ | <artwork align="left" name="" type="" alt=""><![CDATA[ | |||
| +---------+ | +---------+ | |||
| | M_ESTAB | | | M_ESTAB | | |||
| +---------+ | +---------+ | |||
| M_CLOSE | | rcv DATA_FIN | M_CLOSE | | rcv DATA_FIN | |||
| ------- | | ------- | ------- | | ------- | |||
| +---------+ snd DATA_FIN / \ snd DATA_ACK[DFIN] +---------+ | +---------+ snd DATA_FIN / \ snd DATA_ACK[DFIN] +-------+ | |||
| | M_FIN |<----------------- ------------------->| M_CLOSE | | | M_FIN |<----------------- ------------------->|M_CLOSE| | |||
| | WAIT-1 |--------------------------- | WAIT | | | WAIT-1 |--------------------------- | WAIT | | |||
| +---------+ rcv DATA_FIN \ +---------+ | +---------+ rcv DATA_FIN \ +-------+ | |||
| | rcv DATA_ACK[DFIN] ------- | M_CLOSE | | | rcv DATA_ACK[DFIN] ------- | M_CLOSE | | |||
| | -------------- snd DATA_ACK | ------- | | | -------------- snd DATA_ACK | ------- | | |||
| | CLOSE all subflows | snd DATA_FIN | | | CLOSE all subflows | snd DATA_FIN | | |||
| V V V | V V V | |||
| +-----------+ +-----------+ +-----------+ | +-----------+ +-----------+ +----------+ | |||
| |M_FINWAIT-2| | M_CLOSING | | M_LAST-ACK| | |M_FINWAIT-2| | M_CLOSING | |M_LAST-ACK| | |||
| +-----------+ +-----------+ +-----------+ | +-----------+ +-----------+ +----------+ | |||
| | rcv DATA_ACK[DFIN] | rcv DATA_ACK[DFIN] | | | rcv DATA_ACK[DFIN] | rcv DATA_ACK[DFIN] | | |||
| | rcv DATA_FIN -------------- | -------------- | | | rcv DATA_FIN -------------- | -------------- | | |||
| | ------- CLOSE all subflows | CLOSE all subflows | | | ------- CLOSE all subflows | CLOSE all subflows | | |||
| | snd DATA_ACK[DFIN] V delete MPTCP PCB V | | snd DATA_ACK[DFIN] V delete MPTCP PCB V | |||
| \ +-----------+ +---------+ | \ +-----------+ +--------+ | |||
| ------------------------>|M_TIME WAIT|----------------->| M_CLOSED| | ------------------------>|M_TIME WAIT|---------------->|M_CLOSED| | |||
| +-----------+ +---------+ | +-----------+ +--------+ | |||
| All subflows in CLOSED | All subflows in CLOSED | |||
| ------------ | ------------ | |||
| delete MPTCP PCB | delete MPTCP PCB ]]></artwork> | |||
| ]]></artwork> | ||||
| </figure> | </figure> | |||
| </section> | </section> | |||
| <section anchor="app_changelog" numbered="true" toc="default"> | ||||
| <name>Changes from RFC 6824</name> | ||||
| <t>This appendix lists the key technical changes between <xref target="RFC | ||||
| 6824"/>, | ||||
| which specifies MPTCP v0; and this document, which obsoletes <xref target= | ||||
| "RFC6824"/> and specifies MPTCP v1. Note that this specification is not backward | ||||
| compatible with <xref target="RFC6824"/>. | ||||
| <section title="Changes from RFC6824" anchor="app_changelog"> | </t> | |||
| <t>This section lists the key technical changes between RFC6824, specifyin | <ul spacing="normal"> | |||
| g MPTCP v0, and this document, which obsoletes RFC6824 and specifies MPTCP v1. N | <li>This document incorporates lessons learned from the various implemen | |||
| ote that this specification is not backwards compatible with RFC6824. | tations, deployments, and experiments gathered in the documents "Use Cases and O | |||
| perational Experience with Multipath TCP" <xref target="RFC8041" format="default | ||||
| <list style="symbols"> | "/> and the IETF Journal article "Multipath TCP Deployments" <xref target="deplo | |||
| <t>The document incorporates lessons learnt from the various implementat | yments" format="default"/>.</li> | |||
| ions, deployments and experiments gathered in the documents "Use Cases and Opera | <li>Connection initiation, through the exchange of the MP_CAPABLE | |||
| tional Experience with Multipath TCP" <xref target="RFC8041"/> and the IETF Jour | MPTCP option, is different from <xref target="RFC6824"/>. The SYN no lon | |||
| nal article "Multipath TCP Deployments" <xref target="deployments"/>.</t> | ger | |||
| <t>Connection initiation, through the exchange of the MP_CAPABLE MPTCP o | includes the initiator's key, to allow the MP_CAPABLE option on the SYN | |||
| ption, is different from RFC6824. The SYN no longer includes the initiator's key | to be shorter in length and to avoid duplicating the sending of keying material. | |||
| , allowing the MP_CAPABLE option on the SYN to be shorter in length, and to avoi | </li> | |||
| d duplicating the sending of keying material.</t> | <li>This also ensures reliable delivery of the key on the MP_CAPABLE | |||
| <t>This also ensures reliable delivery of the key on the MP_CAPABLE opti | option by allowing its transmission to be combined with data and thus | |||
| on by allowing its transmission to be combined with data and thus using TCP's in | using TCP's built-in reliability mechanism. If the initiator does not | |||
| -built reliability mechanism. If the initiator does not immediately have data to | immediately have data to send, the MP_CAPABLE option with the keys | |||
| send, the MP_CAPABLE option with the keys will be repeated on the first data pa | will be repeated on the first data packet. If the other end is the first | |||
| cket. If the other end is first to send, then the presence of the DSS option imp | to send, then the presence of the DSS option implicitly confirms the receipt of | |||
| licitly confirms the receipt of the MP_CAPABLE.</t> | the MP_CAPABLE.</li> | |||
| <t>In the Flags field of MP_CAPABLE, C is now assigned to mean that the | <li>In the Flags field of MP_CAPABLE, "C" is now assigned to mean that | |||
| sender of this option will not accept additional MPTCP subflows to the source ad | the sender of this option will not accept additional MPTCP subflows to | |||
| dress and port. This is an efficiency improvement, for example where the sender | the source address and port. This improves efficiency -- for example, | |||
| is behind a strict NAT.</t> | in cases where the sender is behind a strict NAT.</li> | |||
| <t>In the Flags field of MP_CAPABLE, H now indicates the use of HMAC-SHA | <li>In the Flags field of MP_CAPABLE, "H" now indicates the use of HMAC- | |||
| 256 (rather than HMAC-SHA1).</t> | SHA256 (rather than HMAC-SHA1).</li> | |||
| <t>Connection initiation also defines the procedure for version negotiat | <li>Connection initiation also defines the procedure for version negotia | |||
| ion, for implementations that support both v0 (RFC6824) and v1 (this document).< | tion, for implementations that support both v0 <xref target="RFC6824"/> and v1 ( | |||
| /t> | this document).</li> | |||
| <t>The HMAC-SHA256 (rather than HMAC-SHA1) algorithm is used, as the alg | <li>The HMAC-SHA256 (rather than HMAC-SHA1) algorithm is used, as it pro | |||
| orithm provides better security. It is used to generate the token in the MP_JOIN | vides better security. It is used to generate the token in the MP_JOIN and ADD_A | |||
| and ADD_ADDR messages, and to set the initial data sequence number.</t> | DDR messages and to set the IDSN.</li> | |||
| <t>A new subflow-level option exists to signal reasons for sending a RST | <li>A new subflow-level option exists to signal reasons for sending a | |||
| on a subflow (MP_TCPRST <xref target="sec_reset"/>), which can help an implemen | RST on a subflow (MP_TCPRST (<xref target="sec_reset" | |||
| tation decide whether to attempt later re-connection.</t> | format="default"/>)); this can help an implementation decide whether to | |||
| <t>The MP_PRIO option (<xref target="sec_policy"/>), which is used to si | attempt later reconnection.</li> | |||
| gnal a change of priority for a subflow, no longer includes the AddrID field. It | <li>The MP_PRIO option (<xref target="sec_policy" format="default"/>), | |||
| s purpose was to allow the changed priority to be applied on a subflow other tha | which is used to signal a change of priority for a subflow, no longer | |||
| n the one it was sent on. However, it has been realised that this could be used | includes the AddrID field. Its purpose was to allow the changed | |||
| by a man-in-the-middle to divert all traffic on to its own path, and MP_PRIO doe | priority to be applied on a subflow other than the one it was sent | |||
| s not include a token or other security mechanism.</t> | on. However, it was determined that this could be used by a | |||
| <t>The ADD_ADDR option (<xref target="sec_add_address"/>), which is used | man-in-the-middle to divert all traffic onto its own path, and MP_PRIO | |||
| to inform the other host about another potential address, is different in sever | does not include a token or other type of security mechanism.</li> | |||
| al ways. It now includes an HMAC of the added address, for enhanced security. In | <li>The ADD_ADDR option (<xref target="sec_add_address" format="default" | |||
| addition, reliability for the ADD_ADDR option has been added: the IPVer field i | />), which is used to inform the other host about another potential address, is | |||
| s replaced with a flag field, and one flag is assigned (E) which is used as an ' | different in several ways. It now includes an HMAC of the added address, for enh | |||
| Echo' so a host can indicate that it has received the option.</t> | anced security. In addition, reliability for the ADD_ADDR option has been added: | |||
| <t>An additional way of performing a Fast Close is described, by sending | the IPVer field is replaced with a flag field, and one flag is assigned ("E") t | |||
| a MP_FASTCLOSE option on a RST on all subflows. This allows the host to tear do | hat is used as an "echo" so a host can indicate that it has received the option. | |||
| wn the subflows and the connection immediately.</t> | </li> | |||
| <t>In the IANA registry a new MPTCP subtype option, MP_EXPERIMENTAL, is | <li>This document describes an additional way of performing a Fast | |||
| reserved for private experiments. However, the document doesn't define how to us | Close -- by sending an MP_FASTCLOSE option on a RST on all subflows. Thi | |||
| e the subtype option.</t> | s allows the host to tear down the subflows and the connection immediately.</li> | |||
| <t>A new Appendix discusses the usage of both the MPTCP and TCP Fast Ope | <li>IANA has reserved the MPTCP option subtype of value 0xf for | |||
| n on the same packet (<xref target="app_tfo"/>).</t> | Private Use (<xref target="IANA_subtypes"/>). This document doesn't defi | |||
| </list></t> | ne how to use that value.</li> | |||
| <li>This document adds a new appendix (<xref target="app_tfo" | ||||
| format="default"/>), which discusses the usage of both MPTCP options | ||||
| and TFO options on the same packet.</li> | ||||
| </ul> | ||||
| </section> | ||||
| <section anchor="Acknowledgments" numbered="false" toc="default"> | ||||
| <name>Acknowledgments</name> | ||||
| <t>The authors gratefully acknowledge significant input into this | ||||
| document from <contact fullname="Sebastien Barre"/> and <contact fullname= | ||||
| "Andrew McDonald"/>.</t> | ||||
| <t>The authors also wish to acknowledge reviews and contributions from | ||||
| <contact fullname="Iljitsch van Beijnum"/>, <contact fullname="Lars | ||||
| Eggert"/>, <contact fullname="Marcelo Bagnulo"/>, <contact | ||||
| fullname="Robert Hancock"/>, <contact fullname="Pasi Sarolahti"/>, | ||||
| <contact fullname="Toby Moncaster"/>, <contact fullname="Philip | ||||
| Eardley"/>, <contact fullname="Sergio Lembo"/>, <contact | ||||
| fullname="Lawrence Conroy"/>, <contact fullname="Yoshifumi Nishida"/>, | ||||
| <contact fullname="Bob Briscoe"/>, <contact fullname="Stein Gjessing"/>, | ||||
| <contact fullname="Andrew McGregor"/>, <contact fullname="Georg | ||||
| Hampel"/>, <contact fullname="Anumita Biswas"/>, <contact fullname="Wes | ||||
| Eddy"/>, <contact fullname="Alexey Melnikov"/>, <contact | ||||
| fullname="Francis Dupont"/>, <contact fullname="Adrian Farrel"/>, | ||||
| <contact fullname="Barry Leiba"/>, <contact fullname="Robert Sparks"/>, | ||||
| <contact fullname="Sean Turner"/>, <contact fullname="Stephen | ||||
| Farrell"/>, <contact fullname="Martin Stiemerling"/>, <contact | ||||
| fullname="Gregory Detal"/>, <contact fullname="Fabien Duchene"/>, | ||||
| <contact fullname="Xavier de Foy"/>, <contact fullname="Rahul Jadhav"/>, | ||||
| <contact fullname="Klemens Schragel"/>, <contact fullname="Mirja | ||||
| Kühlewind"/>, <contact fullname="Sheng Jiang"/>, <contact | ||||
| fullname="Alissa Cooper"/>, <contact fullname="Ines Robles"/>, <contact | ||||
| fullname="Roman Danyliw"/>, <contact fullname="Adam Roach"/>, | ||||
| <contact fullname="Eric Vyncke"/>, and <contact fullname="Ben Kaduk"/>.</t | ||||
| > | ||||
| </section> | </section> | |||
| </back> | </back> | |||
| </rfc> | </rfc> | |||
| End of changes. 343 change blocks. | ||||
| 3074 lines changed or deleted | 3156 lines changed or added | |||
This html diff was produced by rfcdiff 1.45. The latest version is available from http://tools.ietf.org/tools/rfcdiff/ | ||||