| rfc9040xml2.original.xml | rfc9040.xml | |||
|---|---|---|---|---|
| <?xml version='1.0' encoding='utf-8'?> | <?xml version="1.0" encoding="UTF-8"?> | |||
| <!-- [rfced] Change log section removed from draft-ietf-tcpm-2140bis-11-manual.t | ||||
| xt --> | ||||
| <!DOCTYPE rfc SYSTEM "rfc2629.dtd" [ | ||||
| <!ENTITY RFC0793 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.0793.xml"> | ||||
| <!ENTITY RFC1122 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.1122.xml"> | ||||
| <!ENTITY RFC1191 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.1191.xml"> | ||||
| <!ENTITY RFC2119 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.2119.xml"> | ||||
| <!ENTITY RFC4821 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.4821.xml"> | ||||
| <!ENTITY RFC5681 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.5681.xml"> | ||||
| <!ENTITY RFC6298 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.6298.xml"> | ||||
| <!ENTITY RFC7413 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.7413.xml"> | ||||
| <!ENTITY RFC8174 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.8174.xml"> | ||||
| <!ENTITY RFC8201 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.8201.xml"> | ||||
| <!ENTITY I-D.allman-tcpm-bump-initcwnd SYSTEM "https://xml2rfc.ietf.org/public/r | ||||
| fc/bibxml3/reference.I-D.draft-allman-tcpm-bump-initcwnd-00.xml"> | ||||
| <!ENTITY I-D.ietf-tcpm-generalized-ecn SYSTEM "https://xml2rfc.ietf.org/public/r | ||||
| fc/bibxml3/reference.I-D.draft-ietf-tcpm-generalized-ecn-07.xml"> | ||||
| <!ENTITY I-D.hughes-restart SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml3/ | ||||
| reference.I-D.draft-hughes-restart-00.xml"> | ||||
| <!ENTITY RFC1644 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.1644.xml"> | ||||
| <!ENTITY RFC1379 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.1379.xml"> | ||||
| <!ENTITY RFC2001 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.2001.xml"> | ||||
| <!ENTITY RFC2140 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.2140.xml"> | ||||
| <!ENTITY RFC2414 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.2414.xml"> | ||||
| <!ENTITY RFC2663 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.2663.xml"> | ||||
| <!ENTITY RFC3390 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.3390.xml"> | ||||
| <!ENTITY RFC3124 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.3124.xml"> | ||||
| <!ENTITY RFC4340 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.4340.xml"> | ||||
| <!ENTITY RFC4960 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.4960.xml"> | ||||
| <!ENTITY RFC5925 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.5925.xml"> | ||||
| <!ENTITY RFC6437 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.6437.xml"> | ||||
| <!ENTITY RFC6691 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.6691.xml"> | ||||
| <!ENTITY RFC6928 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.6928.xml"> | ||||
| <!ENTITY RFC7231 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.7231.xml"> | ||||
| <!ENTITY RFC7323 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.7323.xml"> | ||||
| <!ENTITY RFC7424 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.7424.xml"> | ||||
| <!ENTITY RFC7540 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.7540.xml"> | ||||
| <!ENTITY RFC7661 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.7661.xml"> | ||||
| <!ENTITY RFC8684 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RF | ||||
| C.8684.xml"> | ||||
| ]> | ||||
| <rfc submissionType="IETF" docName="draft-ietf-tcpm-2140bis-11" category="info" | ||||
| obsoletes="2140" ipr="trust200902"> | ||||
| <!-- Generated by id2xml 1.5.0 on 2021-05-03T23:46:00Z --> | ||||
| <?rfc strict="yes"?> | ||||
| <?rfc compact="yes"?> | ||||
| <?rfc subcompact="no"?> | ||||
| <?rfc symrefs="yes"?> | ||||
| <?rfc sortrefs="no"?> | ||||
| <?rfc text-list-symbols="o*+-"?> | ||||
| <?rfc toc="yes"?> | ||||
| <front> | ||||
| <title>TCP Control Block Interdependence</title> | ||||
| <author initials="J." surname="Touch" fullname="Joe Touch"> | ||||
| <organization abbrev="Independent"></organization> | ||||
| <address> | ||||
| <postal> | ||||
| <street/> | ||||
| <city>Manhattan Beach</city> | ||||
| <region>CA</region> | ||||
| <code>90266</code> | ||||
| <country>United States of America</country> | ||||
| </postal> | ||||
| <phone>+1 (310) 560-0334</phone> | ||||
| <email>touch@strayalpha.com</email> | ||||
| </address> | ||||
| </author> | ||||
| <author initials="M." surname="Welzl" fullname="Michael Welzl"> | ||||
| <organization>University of Oslo</organization> | ||||
| <address> | ||||
| <postal> | ||||
| <street>PO Box 1080 Blindern</street> | ||||
| <city>Oslo</city> | ||||
| <region/> | ||||
| <code>N-0316</code> | ||||
| <country>Norway</country> | ||||
| </postal> | ||||
| <phone>+47 22 85 24 20</phone> | ||||
| <email>michawe@ifi.uio.no</email> | ||||
| </address> | ||||
| </author> | ||||
| <author initials="S." surname="Islam" fullname="Safiqul Islam"> | ||||
| <organization>University of Oslo</organization> | ||||
| <address><postal><street>PO Box 1080 Blindern</street> | ||||
| <street>Oslo N-0316</street> | ||||
| <street>Norway</street> | ||||
| </postal> | ||||
| <phone>+47 22 84 08 37</phone> | ||||
| <email>safiquli@ifi.uio.no</email> | ||||
| </address> | ||||
| </author> | ||||
| <date year="2021" month="May"/> | <!DOCTYPE rfc SYSTEM "rfc2629-xhtml.ent"> | |||
| <workgroup>TCPM WG</workgroup> | ||||
| <!-- [rfced] Please insert any keywords (beyond those that appear in | <rfc xmlns:xi="http://www.w3.org/2001/XInclude" docName="draft-ietf-tcpm-2140bis | |||
| the title) for use on https://www.rfc-editor.org/search. --> | -11" | |||
| number="9040" submissionType="IETF" category="info" consensus="true" obsoletes=" | ||||
| 2140" | ||||
| ipr="trust200902" updates="" xml:lang="en" symRefs="true" sortRefs="true" tocInc | ||||
| lude="true" | ||||
| version="3"> | ||||
| <keyword>example</keyword> | <front> | |||
| <title>TCP Control Block Interdependence</title> | ||||
| <seriesInfo name="RFC" value="9040"/> | ||||
| <author initials="J." surname="Touch" fullname="Joe Touch"> | ||||
| <organization abbrev="Independent"/> | ||||
| <address> | ||||
| <postal> | ||||
| <street/> | ||||
| <city>Manhattan Beach</city> | ||||
| <region>CA</region> | ||||
| <code>90266</code> | ||||
| <country>United States of America</country> | ||||
| </postal> | ||||
| <phone>+1 (310) 560-0334</phone> | ||||
| <email>touch@strayalpha.com</email> | ||||
| </address> | ||||
| </author> | ||||
| <author initials="M." surname="Welzl" fullname="Michael Welzl"> | ||||
| <organization>University of Oslo</organization> | ||||
| <address> | ||||
| <postal> | ||||
| <street>PO Box 1080 Blindern</street> | ||||
| <city>Oslo</city> | ||||
| <region/> | ||||
| <code>N-0316</code> | ||||
| <country>Norway</country> | ||||
| </postal> | ||||
| <phone>+47 22 85 24 20</phone> | ||||
| <email>michawe@ifi.uio.no</email> | ||||
| </address> | ||||
| </author> | ||||
| <author initials="S." surname="Islam" fullname="Safiqul Islam"> | ||||
| <organization>University of Oslo</organization> | ||||
| <address> | ||||
| <postal> | ||||
| <street>PO Box 1080 Blindern</street> | ||||
| <street>Oslo N-0316</street> | ||||
| <street>Norway</street> | ||||
| </postal> | ||||
| <phone>+47 22 84 08 37</phone> | ||||
| <email>safiquli@ifi.uio.no</email> | ||||
| </address> | ||||
| </author> | ||||
| <date year="2021" month="July"/> | ||||
| <workgroup>TCPM WG</workgroup> | ||||
| <abstract><t> | <abstract> | |||
| <t> | ||||
| This memo provides guidance to TCP implementers that is intended to | This memo provides guidance to TCP implementers that is intended to | |||
| help improve connection convergence to steady-state operation | help improve connection convergence to steady-state operation | |||
| without affecting interoperability. It updates and replaces RFC | without affecting interoperability. It updates and replaces RFC | |||
| 2140's description of sharing TCP state, as typically represented in | 2140's description of sharing TCP state, as typically represented in | |||
| TCP Control Blocks, among similar concurrent or consecutive | TCP Control Blocks, among similar concurrent or consecutive | |||
| connections.</t> | connections.</t> | |||
| </abstract> | ||||
| </front> | ||||
| <middle> | ||||
| <section anchor="sect-1" numbered="true" toc="default"> | ||||
| <name>Introduction</name> | ||||
| </abstract> | <t> | |||
| </front> | TCP is a connection-oriented reliable transport protocol layered over IP | |||
| <xref target="RFC0793" format="default"/>. Each TCP connection maintains | ||||
| <middle> | state, usually in a data structure called the "TCP Control Block (TCB)". The | |||
| <section title="Introduction" anchor="sect-1"><t> | TCB contains information about the connection state, its associated local | |||
| TCP is a connection-oriented reliable transport protocol layered | ||||
| over IP <xref target="RFC0793"/>. Each TCP connection maintains state, usuall | ||||
| y in a | ||||
| data structure called the TCP Control Block (TCB). The TCB contains | ||||
| information about the connection state, its associated local | ||||
| process, and feedback parameters about the connection's transmission | process, and feedback parameters about the connection's transmission | |||
| properties. As originally specified and usually implemented, most | properties. As originally specified and usually implemented, most TCB | |||
| TCB information is maintained on a per-connection basis. Some | information is maintained on a per-connection basis. Some implementations | |||
| implementations share certain TCB information across connections to | share certain TCB information across connections to the same host <xref | |||
| the same host <xref target="RFC2140"/>. Such sharing is intended to lead to b | target="RFC2140" format="default"/>. Such sharing is intended to lead to | |||
| etter | better overall transient performance, especially for numerous short-lived | |||
| overall transient performance, especially for numerous short-lived | and simultaneous connections, as can be used in the World Wide Web and | |||
| and simultaneous connections, as can be used in the World-Wide Web | other applications <xref target="Be94" format="default"/> <xref | |||
| and other applications <xref target="Be94"/><xref target="Br02"/>. This shari | target="Br02" format="default"/>. This sharing of state is intended to help | |||
| ng of state is | TCP connections converge to long-term behavior (assuming stable application | |||
| intended to help TCP connections converge to long-term behavior | load, i.e., so-called "steady-state") more quickly without affecting TCP | |||
| (assuming stable application load, i.e., so-called "steady-state") | interoperability.</t> | |||
| more quickly without affecting TCP interoperability.</t> | ||||
| <t> | <t> | |||
| This document updates RFC 2140's discussion of TCB state sharing and | This document updates RFC 2140's discussion of TCB state sharing and | |||
| provides a complete replacement for that document. This state | provides a complete replacement for that document. This state sharing | |||
| sharing affects only TCB initialization <xref target="RFC2140"/> and thus has | affects only TCB initialization <xref target="RFC2140" format="default"/> | |||
| no | and thus has no effect on the long-term behavior of TCP after a connection | |||
| effect on the long-term behavior of TCP after a connection has been | has been established or on interoperability. Path information shared | |||
| established nor on interoperability. Path information shared across | across SYN destination port numbers assumes that TCP segments having the | |||
| SYN destination port numbers assumes that TCP segments having the | same host-pair experience the same path properties, i.e., that traffic is | |||
| same host-pair experience the same path properties, i.e., that | not routed differently based on port numbers or other connection parameters | |||
| traffic is not routed differently based on port numbers or other | (also addressed further in <xref target="sect-8.1" format="default"/>). The | |||
| connection parameters (also addressed further in <xref target="sect-8.1"/>). | observations about TCB sharing in this document apply similarly to any | |||
| The | protocol with congestion state, including the Stream Control Transmission | |||
| observations about TCB sharing in this document apply similarly to | Protocol (SCTP) <xref target="RFC4960" format="default"/> and the Datagram | |||
| any protocol with congestion state, including SCTP <xref target="RFC4960"/> a | Congestion Control Protocol (DCCP) <xref target="RFC4340" | |||
| nd | format="default"/>, as well as to individual subflows in Multipath TCP | |||
| DCCP <xref target="RFC4340"/>, as well as for individual subflows in Multipat | <xref target="RFC8684" format="default"/>.</t> | |||
| h TCP | </section> | |||
| <xref target="RFC8684"/>.</t> | ||||
| </section> | ||||
| <section title="Conventions Used in This Document" anchor="sect-2"><t> | ||||
| The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", | ||||
| "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and | ||||
| "OPTIONAL" in this document are to be interpreted as described in | ||||
| BCP 14 <xref target="RFC2119"/> <xref target="RFC8174"/> when, and only when, | ||||
| they appear in all | ||||
| capitals, as shown here.</t> | ||||
| <t> | ||||
| The core of this document describes behavior that is already | ||||
| permitted by TCP standards. As a result, it provides informative | ||||
| guidance but does not use normative language, except when quoting | ||||
| other documents. Normative language is used in Appendix C as | ||||
| examples of requirements for future consideration.</t> | ||||
| </section> | ||||
| <section title="Terminology" anchor="sect-3"><t> | ||||
| The following terminology is used frequently in this document. Items | ||||
| preceded with a "+" may be part of the state maintained as TCP | ||||
| connection state in the associated connections TCB and are the focus | ||||
| of sharing as described in this document. Note that terms are used | ||||
| as originally introduced where possible; in some cases, direction is | ||||
| indicated with a suffix (_S for send, _R for receive) and in other | ||||
| cases spelled out (sendcwnd). | ||||
| <list style="hanging" hangIndent="6"> | ||||
| <t hangText="+cwnd:">TCP congestion window size <xref target="RFC5681"/>< | ||||
| /t> | ||||
| <t hangText="host:">a source or sink of TCP segments associated with a si | ||||
| ngle IP | ||||
| address</t> | ||||
| <t hangText="host-pair:">a pair of hosts and their corresponding IP addre | ||||
| sses</t> | ||||
| <t hangText="+MMS_R:">maximum message size that can be received, the larg | ||||
| est | ||||
| received transport payload of an IP datagram <xref target="RFC1122"/></t> | ||||
| <t hangText="+MMS_S:">maximum message size that can be sent, the largest | ||||
| transmitted transport payload of an IP datagram <xref target="RFC1122"/>< | ||||
| /t> | ||||
| <t hangText="path:">an Internet path between the IP addresses of two host | ||||
| s</t> | ||||
| <t hangText="PCB:">protocol control block, the data associated with | <section anchor="sect-2" numbered="true" toc="default"> | |||
| a protocol as maintained by an endpoint; a TCP PCB is called a TCB</t> | <name>Conventions Used in This Document</name> | |||
| <t hangText="PLPMTUD:">packetization-layer path MTU discovery, a mechanism that | ||||
| uses transport packets to discover the PMTU <xref | ||||
| target="RFC4821"/></t> | ||||
| <t hangText="+PMTU:">largest IP datagram that can traverse a path | <t> | |||
| <xref target="RFC1191"/><xref target="RFC8201"/></t> | The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>", | |||
| "<bcp14>REQUIRED</bcp14>", "<bcp14>SHALL</bcp14>", "<bcp14>SHALL | ||||
| NOT</bcp14>", "<bcp14>SHOULD</bcp14>", "<bcp14>SHOULD NOT</bcp14>", | ||||
| "<bcp14>RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>", | ||||
| "<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document are | ||||
| to be interpreted as described in BCP 14 <xref target="RFC2119"/> | ||||
| <xref target="RFC8174"/> when, and only when, they appear in all capitals, | ||||
| as shown here. | ||||
| </t> | ||||
| <t hangText="PMTUD:">path-layer MTU discovery, a mechanism that | <t> | |||
| relies on ICMP error messages to discover the PMTU <xref | The core of this document describes behavior that is already permitted by | |||
| target="RFC1191"/><xref target="RFC8201"/></t> | TCP standards. As a result, this document provides informative guidance but d | |||
| oes not | ||||
| use normative language except when quoting other documents. Normative | ||||
| language is used in <xref target="sect-c"/> as examples of requirements for | ||||
| future consideration.</t> | ||||
| </section> | ||||
| <t hangText="+RTT:">round-trip time of a TCP packet exchange <xref | <section anchor="sect-3" numbered="true" toc="default"> | |||
| target="RFC0793"/></t> | <name>Terminology</name> | |||
| <t hangText="+RTTVAR:">variation of round-trip times of a TCP packet | <t> | |||
| exchange <xref target="RFC6298"/></t> | The following terminology is used frequently in this document. Items | |||
| preceded with a "+" may be part of the state maintained as TCP connection | ||||
| state in the TCB of associated connections and are the focus of sharing as | ||||
| described in this document. Note that terms are used as originally | ||||
| introduced where possible; in some cases, direction is indicated with a | ||||
| suffix (_S for send, _R for receive) and in other cases spelled out | ||||
| (sendcwnd). | ||||
| <t hangText="+rwnd:">TCP receive window size <xref | </t> | |||
| target="RFC5681"/></t> | ||||
| <t hangText="+sendcwnd:">TCP send-side congestion window (cwnd) size | <dl newline="false" spacing="normal" indent="6"> | |||
| <xref target="RFC5681"/></t> | <dt>+cwnd:</dt> | |||
| <dd>TCP congestion window size <xref target="RFC5681" format="default"/> | ||||
| </dd> | ||||
| <dt>host:</dt> | ||||
| <dd>a source or sink of TCP segments associated with a single IP | ||||
| address</dd> | ||||
| <dt>host-pair:</dt> | ||||
| <dd>a pair of hosts and their corresponding IP addresses</dd> | ||||
| <dt>ISN: | ||||
| </dt> | ||||
| <dd>Initial Sequence Number | ||||
| </dd> | ||||
| <dt>+MMS_R:</dt> | ||||
| <dd>maximum message size that can be received, the largest | ||||
| received transport payload of an IP datagram <xref target="RFC1122" forma | ||||
| t="default"/></dd> | ||||
| <dt>+MMS_S:</dt> | ||||
| <dd>maximum message size that can be sent, the largest | ||||
| transmitted transport payload of an IP datagram <xref target="RFC1122" fo | ||||
| rmat="default"/></dd> | ||||
| <dt>path:</dt> | ||||
| <dd>an Internet path between the IP addresses of two hosts</dd> | ||||
| <t hangText="+sendMSS:">TCP maximum segment size, a value | <dt>PCB:</dt> | |||
| <dd>protocol control block, the data associated with a protocol as | ||||
| maintained by an endpoint; a TCP PCB is called a "TCB"</dd> | ||||
| <dt>PLPMTUD:</dt><dd>packetization-layer path MTU discovery, a mechanism | ||||
| that | ||||
| uses transport packets to discover the Path Maximum Transmission Unit (P | ||||
| MTU) <xref target="RFC4821" | ||||
| format="default"/></dd> | ||||
| <dt>+PMTU:</dt> | ||||
| <dd>largest IP datagram that can traverse a path | ||||
| <xref target="RFC1191" format="default"/> <xref target="RFC8201" format=" | ||||
| default"/></dd> | ||||
| <dt>PMTUD:</dt> | ||||
| <dd>path-layer MTU discovery, a mechanism that | ||||
| relies on ICMP error messages to discover the PMTU <xref target="RFC1191" | ||||
| format="default"/> <xref target="RFC8201" format="default"/></dd> | ||||
| <dt>+RTT:</dt> | ||||
| <dd>round-trip time of a TCP packet exchange <xref target="RFC0793" form | ||||
| at="default"/></dd> | ||||
| <dt>+RTTVAR:</dt> | ||||
| <dd>variation of round-trip times of a TCP packet | ||||
| exchange <xref target="RFC6298" format="default"/></dd> | ||||
| <dt>+rwnd:</dt> | ||||
| <dd>TCP receive window size <xref target="RFC5681" format="default"/></d | ||||
| d> | ||||
| <dt>+sendcwnd:</dt> | ||||
| <dd>TCP send-side congestion window (cwnd) size | ||||
| <xref target="RFC5681" format="default"/></dd> | ||||
| <dt>+sendMSS:</dt> | ||||
| <dd>TCP maximum segment size, a value | ||||
| transmitted in a TCP option that represents the largest TCP user data | transmitted in a TCP option that represents the largest TCP user data | |||
| payload that can be received <xref target="RFC6691"/></t> | payload that can be received <xref target="RFC6691" format="default"/></d | |||
| d> | ||||
| <t hangText="+ssthresh:">TCP slow-start threshold <xref | <dt>+ssthresh:</dt> | |||
| target="RFC5681"/></t> | <dd>TCP slow-start threshold <xref target="RFC5681" format="default"/></ | |||
| dd> | ||||
| <t hangText="TCB:">TCP Control Block, the data associated with a TCP | <dt>TCB:</dt> | |||
| connection as maintained by an endpoint</t> | <dd>TCP Control Block, the data associated with a TCP | |||
| connection as maintained by an endpoint</dd> | ||||
| <t hangText="TCP-AO:">TCP Authentication Option <xref | <dt>TCP-AO:</dt> | |||
| target="RFC5925"/></t> | <dd>TCP Authentication Option <xref target="RFC5925" format="default"/>< | |||
| /dd> | ||||
| <t hangText="TFO:">TCP Fast Open option <xref target="RFC7413"/></t> | <dt>TFO:</dt> | |||
| <dd>TCP Fast Open option <xref target="RFC7413" format="default"/></dd> | ||||
| <t hangText="+TFO_cookie:">TCP Fast Open cookie, state that is used | <dt>+TFO_cookie:</dt> | |||
| as part of the TFO mechanism, when TFO is supported <xref | <dd>TCP Fast Open cookie, state that is used | |||
| target="RFC7413"/></t> | as part of the TFO mechanism, when TFO is supported <xref target="RFC7413 | |||
| " format="default"/></dd> | ||||
| <t hangText="+TFO_failure:">an indication of when TFO option | <dt>+TFO_failure:</dt> | |||
| negotiation failed, when TFO is supported</t> | <dd>an indication of when TFO option | |||
| negotiation failed, when TFO is supported</dd> | ||||
| <t hangText="+TFOinfo:">information cached when a TFO connection is | <dt>+TFOinfo:</dt> | |||
| established, which includes the TFO_cookie <xref | <dd>information cached when a TFO connection is | |||
| target="RFC7413"/></t> | established, which includes the TFO_cookie <xref target="RFC7413" format= | |||
| "default"/></dd> | ||||
| </list> | </dl> | |||
| </t> | </section> | |||
| <section anchor="sect-4" numbered="true" toc="default"> | ||||
| </section> | <name>The TCP Control Block (TCB)</name> | |||
| <t> | ||||
| <section title="The TCP Control Block (TCB)" anchor="sect-4"><t> | ||||
| A TCB describes the data associated with each connection, i.e., with | A TCB describes the data associated with each connection, i.e., with | |||
| each association of a pair of applications across the network. The | each association of a pair of applications across the network. The | |||
| TCB contains at least the following information <xref target="RFC0793"/>:</t> | TCB contains at least the following information <xref target="RFC0793" format="default"/>:</t> | |||
| <figure><artwork><![CDATA[ | <ul empty="true"> | |||
| Local process state | <li><t>Local process state</t> | |||
| pointers to send and receive buffers | <ul empty="true" spacing="compact"> | |||
| pointers to retransmission queue and current segment | <li>pointers to send and receive buffers</li> | |||
| pointers to Internet Protocol (IP) PCB | <li>pointers to retransmission queue and current segment</li> | |||
| Per-connection shared state | <li>pointers to Internet Protocol (IP) PCB</li> | |||
| macro-state | </ul> | |||
| connection state | </li> | |||
| timers | <li><t>Per-connection shared state</t> | |||
| flags | <ul empty="true" spacing="compact"> | |||
| local and remote host numbers and ports | <li><t>macro-state</t> | |||
| TCP option state | <ul empty="true" spacing="compact"> | |||
| micro-state | <li>connection state</li> | |||
| send and receive window state (size*, current number) | <li>timers</li> | |||
| congestion window size (sendcwnd)* | <li>flags</li> | |||
| congestion window size threshold (ssthresh)* | <li>local and remote host numbers and ports</li> | |||
| max window size seen* | <li>TCP option state</li> | |||
| sendMSS# | </ul> | |||
| MMS_S# | </li> | |||
| MMS_R# | <li><t>micro-state</t> | |||
| PMTU# | <ul empty="true" spacing="compact"> | |||
| round-trip time and its variation# | <li>send and receive window state (size*, current number)</li> | |||
| ]]></artwork></figure> | <li>congestion window size (sendcwnd)*</li> | |||
| <li>congestion window size threshold (ssthresh)*</li> | ||||
| <li>max window size seen*</li> | ||||
| <li>sendMSS#</li> | ||||
| <li>MMS_S#</li> | ||||
| <li>MMS_R#</li> | ||||
| <li>PMTU#</li> | ||||
| <li>round-trip time and its variation#</li> | ||||
| </ul> | ||||
| </li> | ||||
| </ul> | ||||
| </li> | ||||
| </ul> | ||||
| <t> | <t> | |||
| The per-connection information is shown as split into macro-state | The per-connection information is shown as split into macro-state and | |||
| and micro-state, terminology borrowed from <xref target="Co91"/>. Macro-state | micro-state, terminology borrowed from <xref target="Co91" | |||
| describes the protocol for establishing the initial shared state | format="default"/>. Macro-state describes the protocol for establishing the | |||
| about the connection; we include the endpoint numbers and components | initial shared state about the connection; we include the endpoint numbers | |||
| (timers, flags) required upon commencement that are later used to | and components (timers, flags) required upon commencement that are later | |||
| help maintain that state. Micro-state describes the protocol after a | used to help maintain that state. Micro-state describes the protocol after | |||
| connection has been established, to maintain the reliability and | a connection has been established, to maintain the reliability and | |||
| congestion control of the data transferred in the connection.</t> | congestion control of the data transferred in the connection.</t> | |||
| <t> | <t> | |||
| We distinguish two other classes of shared micro-state that are | We distinguish two other classes of shared micro-state that are associated | |||
| associated more with host-pairs than with application pairs. One | more with host-pairs than with application pairs. | |||
| class is clearly host-pair dependent (shown above as "#", e.g., | ||||
| sendMSS, MMS_R, MMS_S, PMTU, RTT), because these parameters are | ||||
| defined by the endpoint or endpoint pair (sendMSS, MMS_R, MMS_S, | ||||
| RTT) or are already cached and shared on that basis (PMTU | ||||
| <xref target="RFC1191"/><xref target="RFC4821"/>). The other is host-pair dep | ||||
| endent in its | ||||
| aggregate (shown above as "*", e.g., congestion window information, | ||||
| current window sizes, etc.) because they depend on the total | ||||
| capacity between the two endpoints.</t> | ||||
| <t> | One class is clearly host-pair dependent (shown above as "#", e.g., | |||
| Not all of the TCB state is necessarily shareable. In particular, | sendMSS, MMS_R, MMS_S, PMTU, RTT), because these parameters are defined by | |||
| the endpoint or endpoint pair (of the given example: sendMSS, MMS_R, MMS_S, | ||||
| RTT) or are already cached and shared on that basis (of the given example: | ||||
| PMTU <xref target="RFC1191" format="default"/> <xref target="RFC4821" | ||||
| format="default"/>). | ||||
| The other is host-pair dependent in its aggregate (shown above as "*", e.g., | ||||
| congestion window information, current window sizes, etc.) because they depend | ||||
| on the total capacity between the two endpoints.</t> | ||||
| <t> | ||||
| Not all of the TCB state is necessarily shareable. In particular, | ||||
| some TCP options are negotiated only upon request by the application | some TCP options are negotiated only upon request by the application | |||
| layer, so their use may not be correlated across connections. Other | layer, so their use may not be correlated across connections. Other | |||
| options negotiate connection-specific parameters, which are | options negotiate connection-specific parameters, which are | |||
| similarly not shareable. These are discussed further in Appendix B.</t> | similarly not shareable. These are discussed further in <xref target="sect-b" | |||
| />.</t> | ||||
| <t> | <t> | |||
| Finally, we exclude rwnd from further discussion because its value | Finally, we exclude rwnd from further discussion because its value | |||
| should depend on the send window size, so it is already addressed by | should depend on the send window size, so it is already addressed by | |||
| send window sharing and is not independently affected by sharing.</t> | send window sharing and is not independently affected by sharing.</t> | |||
| </section> | ||||
| </section> | <section anchor="sect-5" numbered="true" toc="default"> | |||
| <name>TCB Interdependence</name> | ||||
| <section title="TCB Interdependence" anchor="sect-5"><t> | <t> | |||
| There are two cases of TCB interdependence. Temporal sharing occurs | There are two cases of TCB interdependence. Temporal sharing occurs | |||
| when the TCB of an earlier (now CLOSED) connection to a host is used | when the TCB of an earlier (now CLOSED) connection to a host is used | |||
| to initialize some parameters of a new connection to that same host, | to initialize some parameters of a new connection to that same host, | |||
| i.e., in sequence. Ensemble sharing occurs when a currently active | i.e., in sequence. Ensemble sharing occurs when a currently active | |||
| connection to a host is used to initialize another (concurrent) | connection to a host is used to initialize another (concurrent) | |||
| connection to that host.</t> | connection to that host.</t> | |||
| </section> | ||||
| <section anchor="sect-6" numbered="true" toc="default"> | ||||
| <name>Temporal Sharing</name> | ||||
| </section> | <t> | |||
| <section title="Temporal Sharing" anchor="sect-6"><t> | ||||
| The TCB data cache is accessed in two ways: it is read to initialize | The TCB data cache is accessed in two ways: it is read to initialize | |||
| new TCBs and written when more current per-host state is available.</t> | new TCBs and written when more current per-host state is available.</t> | |||
| <section anchor="sect-6.1" numbered="true" toc="default"> | ||||
| <section title="Initialization of a new TCB" anchor="sect-6.1"><t> | <name>Initialization of a New TCB</name> | |||
| <t> | ||||
| TCBs for new connections can be initialized using cached context | TCBs for new connections can be initialized using cached context | |||
| from past connections as follows:</t> | from past connections as follows:</t> | |||
| <figure><artwork><![CDATA[ | <table anchor="TCB_initialization"> | |||
| TEMPORAL SHARING - TCB Initialization | <name>Temporal Sharing - TCB Initialization</name> | |||
| <thead> | ||||
| Cached TCB New TCB | <tr> | |||
| -------------------------------------- | <th>Cached TCB</th> | |||
| old_MMS_S old_MMS_S or not cached* | <th>New TCB</th> | |||
| </tr> | ||||
| old_MMS_R old_MMS_R or not cached* | </thead> | |||
| <tbody> | ||||
| old_sendMSS old_sendMSS | <tr> | |||
| <td>old_MMS_S</td> | ||||
| old_PMTU old_PMTU+ | <td>old_MMS_S or not cached (2)</td> | |||
| </tr> | ||||
| old_RTT old_RTT | <tr> | |||
| <td>old_MMS_R</td> | ||||
| old_RTTVAR old_RTTVAR | <td>old_MMS_R or not cached (2)</td> | |||
| </tr> | ||||
| old_option (option specific) | <tr> | |||
| <td>old_sendMSS</td> | ||||
| old_ssthresh old_ssthresh | <td>old_sendMSS</td> | |||
| </tr> | ||||
| old_sendcwnd old_sendcwnd | <tr> | |||
| ]]></artwork></figure> | <td>old_PMTU</td> | |||
| <td>old_PMTU (1)</td> | ||||
| <t> | </tr> | |||
| +Note that PMTU is cached at the IP layer <xref target="RFC1191"/><xref targe | <tr> | |||
| t="RFC4821"/>. | <td>old_RTT</td> | |||
| *Note that some values are not cached when they are computed locally | <td>old_RTT</td> | |||
| (MMS_R) or indicated in the connection itself (MMS_S in the SYN).</t> | </tr> | |||
| <tr> | ||||
| <t> | <td>old_RTTVAR</td> | |||
| The table below gives an overview of option-specific information | <td>old_RTTVAR</td> | |||
| that can be shared. Additional information on some specific TCP | </tr> | |||
| options and sharing is provided in Appendix B.</t> | <tr> | |||
| <td>old_option</td> | ||||
| <figure><artwork><![CDATA[ | <td>(option specific)</td> | |||
| TEMPORAL SHARING - Option Info Initialization | </tr> | |||
| <tr> | ||||
| <td>old_ssthresh</td> | ||||
| <td>old_ssthresh</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_sendcwnd</td> | ||||
| <td>old_sendcwnd</td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| Cached New | <dl> | |||
| ------------------------------------ | <dt>(1)</dt><dd>Note that PMTU is cached at the IP layer <xref target="RFC1191" | |||
| old_TFO_cookie old_TFO_cookie | format="default"/> <xref target="RFC4821" format="default"/>. | |||
| </dd> | ||||
| <dt>(2)</dt><dd>Note that some values are not cached when they are computed loca | ||||
| lly | ||||
| (MMS_R) or indicated in the connection itself (MMS_S in the SYN).</dd> | ||||
| </dl> | ||||
| <t> | ||||
| old_TFO_failure old_TFO_failure | <xref target="Option_Info_Initialization"/> gives an overview of | |||
| ]]></artwork> | option-specific information that can be shared. Additional information on | |||
| </figure> | some specific TCP options and sharing is provided in <xref | |||
| target="sect-b"/>.</t> | ||||
| </section> | <table anchor="Option_Info_Initialization"> | |||
| <name>Temporal Sharing - Option Info Initialization</name> | ||||
| <thead> | ||||
| <tr> | ||||
| <th>Cached</th> | ||||
| <th>New</th> | ||||
| </tr> | ||||
| </thead> | ||||
| <tbody> | ||||
| <tr> | ||||
| <td>old_TFO_cookie</td> | ||||
| <td>old_TFO_cookie</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_TFO_failure</td> | ||||
| <td>old_TFO_failure</td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <section title="Updates to the TCB cache" anchor="sect-6.2"><t> | </section> | |||
| <section anchor="sect-6.2" numbered="true" toc="default"> | ||||
| <name>Updates to the TCB Cache</name> | ||||
| <t> | ||||
| During a connection, the TCB cache can be updated based on events of | During a connection, the TCB cache can be updated based on events of | |||
| current connections and their TCBs as they progress over time, as | current connections and their TCBs as they progress over time, as shown in | |||
| shown below:</t> | <xref target="Cache_Updates"/>.</t> | |||
| <figure><artwork><![CDATA[ | ||||
| TEMPORAL SHARING - Cache Updates | ||||
| Cached TCB Current TCB when? New Cached TCB | ||||
| ---------------------------------------------------------- | ||||
| old_MMS_S curr_MMS_S OPEN curr_MMS_S | ||||
| old_MMS_R curr_MMS_R OPEN curr_MMS_R | ||||
| old_sendMSS curr_sendMSS MSSopt curr_sendMSS | ||||
| old_PMTU curr_PMTU PMTUD+ / curr_PMTU | ||||
| PLPMTUD+ | ||||
| old_RTT curr_RTT CLOSE merge(curr,old) | ||||
| old_RTTVAR curr_RTTVAR CLOSE merge(curr,old) | ||||
| old_option curr_option ESTAB (depends on option) | ||||
| old_ssthresh curr_ssthresh CLOSE merge(curr,old) | ||||
| old_sendcwnd curr_sendcwnd CLOSE merge(curr,old) | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| <t> | <table anchor="Cache_Updates"> | |||
| +Note that PMTU is cached at the IP layer <xref target="RFC1191"/><xref targe | <name>Temporal Sharing - Cache Updates</name> | |||
| t="RFC4821"/>.</t> | <thead> | |||
| <tr> | ||||
| <th>Cached TCB</th> | ||||
| <th>Current TCB</th> | ||||
| <th>When?</th> | ||||
| <th>New Cached TCB</th> | ||||
| </tr> | ||||
| </thead> | ||||
| <tbody> | ||||
| <tr> | ||||
| <td>old_MMS_S</td> | ||||
| <td>curr_MMS_S</td> | ||||
| <td>OPEN</td> | ||||
| <td>curr_MMS_S</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_MMS_R</td> | ||||
| <td>curr_MMS_R</td> | ||||
| <td>OPEN</td> | ||||
| <td>curr_MMS_R</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_sendMSS</td> | ||||
| <td>curr_sendMSS</td> | ||||
| <td>MSSopt</td> | ||||
| <td>curr_sendMSS</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_PMTU</td> | ||||
| <td>curr_PMTU</td> | ||||
| <td>PMTUD (1) / PLPMTUD (1)</td> | ||||
| <td>curr_PMTU</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_RTT</td> | ||||
| <td>curr_RTT</td> | ||||
| <td>CLOSE</td> | ||||
| <td>merge(curr,old)</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_RTTVAR</td> | ||||
| <td>curr_RTTVAR</td> | ||||
| <td>CLOSE</td> | ||||
| <td>merge(curr,old)</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_option</td> | ||||
| <td>curr_option</td> | ||||
| <td>ESTAB</td> | ||||
| <td>(depends on option)</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_ssthresh</td> | ||||
| <td>curr_ssthresh</td> | ||||
| <td>CLOSE</td> | ||||
| <td>merge(curr,old)</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_sendcwnd</td> | ||||
| <td>curr_sendcwnd</td> | ||||
| <td>CLOSE</td> | ||||
| <td>merge(curr,old)</td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <t> | <dl> | |||
| <dt>(1)</dt><dd>Note that PMTU is cached at the IP layer <xref target="RFC1191" | ||||
| format="default"/> <xref target="RFC4821" format="default"/>.</dd> | ||||
| </dl> | ||||
| <t> | ||||
| Merge() is the function that combines the current and previous (old) | Merge() is the function that combines the current and previous (old) | |||
| values and may vary for each parameter of the TCB cache. The | values and may vary for each parameter of the TCB cache. The | |||
| particular function is not specified in this document; examples | particular function is not specified in this document; examples | |||
| include windowed averages (mean of the past N values, for some N) | include windowed averages (mean of the past N values, for some N) | |||
| and exponential decay (new = (1-alpha)*old + alpha *new, where alpha | and exponential decay (new = (1-alpha)*old + alpha *new, where alpha | |||
| is in the range [0..1]).</t> | is in the range [0..1]).</t> | |||
| <t> | ||||
| <xref target="Option_Info_Updates"/> gives an overview of option-specific | ||||
| information that can be similarly shared. The TFO cookie is maintained | ||||
| until the client explicitly requests it be updated as a separate event.</t> | ||||
| <t> | <table anchor="Option_Info_Updates"> | |||
| The table below gives an overview of option-specific information | <name>Temporal Sharing - Option Info Updates</name> | |||
| that can be similarly shared. The TFO cookie is maintained until the | <thead> | |||
| client explicitly requests it be updated as a separate event.</t> | <tr> | |||
| <th>Cached</th> | ||||
| <figure><artwork><![CDATA[ | <th>Current</th> | |||
| TEMPORAL SHARING - Option Info Updates | <th>When?</th> | |||
| <th>New Cached</th> | ||||
| Cached Current when? New Cached | </tr> | |||
| --------------------------------------------------------- | </thead> | |||
| old_TFO_cookie old_TFO_cookie ESTAB old_TFO_cookie | <tbody> | |||
| <tr> | ||||
| old_TFO_failure old_TFO_failure ESTAB old_TFO_failure | <td>old_TFO_cookie</td> | |||
| ]]></artwork> | <td>old_TFO_cookie</td> | |||
| </figure> | <td>ESTAB</td> | |||
| <td>old_TFO_cookie</td> | ||||
| </section> | </tr> | |||
| <tr> | ||||
| <section title="Discussion" anchor="sect-6.3"><t> | <td>old_TFO_failure</td> | |||
| As noted, there is no particular benefit to caching MMS_S and MMS_R | <td>old_TFO_failure</td> | |||
| as these are reported by the local IP stack. Caching sendMSS and | <td>ESTAB</td> | |||
| PMTU is trivial; reported values are cached (PMTU at the IP layer), | <td>old_TFO_failure</td> | |||
| and the most recent values are used. The cache is updated when the | </tr> | |||
| MSS option is received in a SYN or after PMTUD (i.e., when an ICMPv4 | </tbody> | |||
| Fragmentation Needed <xref target="RFC1191"/> or ICMPv6 Packet Too Big messag | </table> | |||
| e is | ||||
| received <xref target="RFC8201"/> or the equivalent is inferred, e.g., as fro | ||||
| m | ||||
| PLPMTUD <xref target="RFC4821"/>), respectively, so the cache always has the | ||||
| most | ||||
| recent values from any connection. For sendMSS, the cache is | ||||
| consulted only at connection establishment and not otherwise | ||||
| updated, which means that MSS options do not affect current | ||||
| connections. The default sendMSS is never saved; only reported MSS | ||||
| values update the cache, so an explicit override is required to | ||||
| reduce the sendMSS. Cached sendMSS affects only data sent in the SYN | ||||
| segment, i.e., during client connection initiation or during | ||||
| simultaneous open; all other segment MSS are based on the value | ||||
| updated as included in the SYN.</t> | ||||
| <t> | </section> | |||
| RTT values are updated by formulae that merges the old and new | <section anchor="sect-6.3" numbered="true" toc="default"> | |||
| values, as noted in <xref target="sect-6.2"/>. Dynamic RTT estimation require | <name>Discussion</name> | |||
| s a | <t> | |||
| sequence of RTT measurements. As a result, the cached RTT (and its | As noted, there is no particular benefit to caching MMS_S and MMS_R as | |||
| variation) is an average of its previous value with the contents of | these are reported by the local IP stack. Caching sendMSS and PMTU is | |||
| the currently active TCB for that host, when a TCB is closed. RTT | trivial; reported values are cached (PMTU at the IP layer), and the most | |||
| values are updated only when a connection is closed. The method for | recent values are used. The cache is updated when the MSS option is | |||
| merging old and current values needs to attempt to reduce the | received in a SYN or after PMTUD (i.e., when an ICMPv4 Fragmentation Needed | |||
| transient effects of the new connections.</t> | <xref target="RFC1191" format="default"/> or ICMPv6 Packet Too Big message | |||
| is received <xref target="RFC8201" format="default"/> or the equivalent is | ||||
| inferred, e.g., as from PLPMTUD <xref target="RFC4821" format="default"/>), | ||||
| respectively, so the cache always has the most recent values from any | ||||
| connection. For sendMSS, the cache is consulted only at connection | ||||
| establishment and not otherwise updated, which means that MSS options do | ||||
| not affect current connections. The default sendMSS is never saved; only | ||||
| reported MSS values update the cache, so an explicit override is required | ||||
| to reduce the sendMSS. Cached sendMSS affects only data sent in the SYN | ||||
| segment, i.e., during client connection initiation or during simultaneous | ||||
| open; the MSS of all other segments is constrained by the value updated as | ||||
| included in the SYN. | ||||
| </t> | ||||
| <t> | <t> | |||
| The updates for RTT, RTTVAR and ssthresh rely on existing | RTT values are updated by formulae that merge the old and new values, as | |||
| noted in <xref target="sect-6.2" format="default"/>. Dynamic RTT estimation | ||||
| requires a sequence of RTT measurements. As a result, the cached RTT (and | ||||
| its variation) is an average of its previous value with the contents of the | ||||
| currently active TCB for that host, when a TCB is closed. RTT values are | ||||
| updated only when a connection is closed. The method for merging old and | ||||
| current values needs to attempt to reduce the transient effects of the new | ||||
| connections.</t> | ||||
| <t> | ||||
| The updates for RTT, RTTVAR, and ssthresh rely on existing | ||||
| information, i.e., old values. Should no such values exist, the | information, i.e., old values. Should no such values exist, the | |||
| current values are cached instead.</t> | current values are cached instead.</t> | |||
| <t> | ||||
| <t> | ||||
| TCP options are copied or merged depending on the details of each | TCP options are copied or merged depending on the details of each | |||
| option. E.g., TFO state is updated when a connection is established | option. For example, TFO state is updated when a connection is established | |||
| and read before establishing a new connection.</t> | and read before establishing a new connection.</t> | |||
| <t> | <t> | |||
| Sections 8 and 9 discuss compatibility issues and implications of | Sections <xref target="sect-8" format="counter"/> and <xref target="sect-9" | |||
| sharing the specific information listed above. <xref target="sect-10"/> gives | format="counter"/> discuss compatibility issues and implications of sharing | |||
| an | the specific information listed above. <xref target="sect-10" | |||
| overview of known implementations.</t> | format="default"/> gives an overview of known implementations.</t> | |||
| <t> | ||||
| <t> | Most cached TCB values are updated when a connection closes. The exceptions | |||
| Most cached TCB values are updated when a connection closes. The | are MMS_R and MMS_S, which are reported by IP <xref target="RFC1122" | |||
| exceptions are MMS_R and MMS_S, which are reported by IP <xref target="RFC112 | format="default"/>; PMTU, which is updated after Path MTU Discovery and | |||
| 2"/>, | also reported by IP <xref target="RFC1191" format="default"/> <xref | |||
| PMTU which is updated after Path MTU Discovery and also reported by | target="RFC4821" format="default"/> <xref target="RFC8201" | |||
| IP <xref target="RFC1191"/><xref target="RFC4821"/><xref target="RFC8201"/>, | format="default"/>; and sendMSS, which is updated if the MSS option is | |||
| and sendMSS, which is updated if the | received in the TCP SYN header.</t> | |||
| MSS option is received in the TCP SYN header.</t> | <t> | |||
| <t> | ||||
| Sharing sendMSS information affects only data in the SYN of the next | Sharing sendMSS information affects only data in the SYN of the next | |||
| connection, because sendMSS information is typically included in | connection, because sendMSS information is typically included in | |||
| most TCP SYN segments. Caching PMTU can accelerate the efficiency of | most TCP SYN segments. Caching PMTU can accelerate the efficiency of | |||
| PMTUD but can also result in black-holing until corrected if in | PMTUD but can also result in black-holing until corrected if in | |||
| error. Caching MMS_R and MMS_S may be of little direct value as they | error. Caching MMS_R and MMS_S may be of little direct value as they | |||
| are reported by the local IP stack anyway.</t> | are reported by the local IP stack anyway.</t> | |||
| <t> | <t> | |||
| The way in which other TCP option state can be shared depends on the | The way in which state related to other TCP options can be shared depends on | |||
| details of that option. E.g., TFO state includes the TCP Fast Open | the | |||
| Cookie <xref target="RFC7413"/> or, in case TFO fails, a negative TCP Fast Op | details of that option. For example, TFO state includes the TCP Fast Open | |||
| en | cookie <xref target="RFC7413" format="default"/> or, in case TFO fails, a neg | |||
| response. RFC 7413 states, "The client MUST cache negative responses from the | ative TCP Fast Open | |||
| server in order to avoid potential connection failures. Negative responses incl | response. RFC 7413 states, </t> | |||
| ude the server not acknowledging the data in the SYN, ICMP error messages, and ( | ||||
| most importantly) no response (SYN-ACK) from the server at all, i.e., connection | ||||
| timeout." [RFC 7413]. TFOinfo is cached when a connection is established.</t> | ||||
| <t> | ||||
| Other TCP option state might not be as readily cached. E.g., TCP-AO | ||||
| <xref target="RFC5925"/> success or failure between a host pair for a single | ||||
| SYN | ||||
| destination port might be usefully cached. TCP-AO success or failure | ||||
| to other SYN destination ports on that host pair is never useful to | ||||
| cache because TCP-AO security parameters can vary per service.</t> | ||||
| </section> | ||||
| </section> | ||||
| <section title="Ensemble Sharing" anchor="sect-7"><t> | <blockquote>The client <bcp14>MUST</bcp14> cache negative responses from the ser | |||
| ver in order to avoid potential connection failures. Negative responses include | ||||
| the server not acknowledging the data in the SYN, ICMP error messages, and (most | ||||
| importantly) no response (SYN-ACK) from the server at all, i.e., connection tim | ||||
| eout. | ||||
| </blockquote> | ||||
| <t>TFOinfo is cached when a connection is established.</t> | ||||
| <t> | ||||
| State related to other TCP options might not be as readily cached. For | ||||
| example, TCP-AO <xref target="RFC5925" format="default"/> success or | ||||
| failure between a host-pair for a single SYN destination port might be | ||||
| usefully cached. TCP-AO success or failure to other SYN destination ports | ||||
| on that host-pair is never useful to cache because TCP-AO security | ||||
| parameters can vary per service.</t> | ||||
| </section> | ||||
| </section> | ||||
| <section anchor="sect-7" numbered="true" toc="default"> | ||||
| <name>Ensemble Sharing</name> | ||||
| <t> | ||||
| Sharing cached TCB data across concurrent connections requires | Sharing cached TCB data across concurrent connections requires | |||
| attention to the aggregate nature of some of the shared state. For | attention to the aggregate nature of some of the shared state. For | |||
| example, although MSS and RTT values can be shared by copying, it | example, although MSS and RTT values can be shared by copying, it | |||
| may not be appropriate to simply copy congestion window or ssthresh | may not be appropriate to simply copy congestion window or ssthresh | |||
| information; instead, the new values can be a function (f) of the | information; instead, the new values can be a function (f) of the | |||
| cumulative values and the number of connections (N).</t> | cumulative values and the number of connections (N).</t> | |||
| <section anchor="sect-7.1" numbered="true" toc="default"> | ||||
| <section title="Initialization of a new TCB" anchor="sect-7.1"><t> | <name>Initialization of a New TCB</name> | |||
| <t> | ||||
| TCBs for new connections can be initialized using cached context | TCBs for new connections can be initialized using cached context | |||
| from concurrent connections as follows:</t> | from concurrent connections as follows:</t> | |||
| <figure><artwork><![CDATA[ | <table anchor="TCB_Initialization"> | |||
| ENSEMBLE SHARING - TCB Initialization | <name>Ensemble Sharing - TCB Initialization</name> | |||
| <thead> | ||||
| Cached TCB New TCB | <tr> | |||
| ------------------------------------------ | <th>Cached TCB</th> | |||
| old_MMS_S old_MMS_S | <th>New TCB</th> | |||
| </tr> | ||||
| old_MMS_R old_MMS_R | </thead> | |||
| <tbody> | ||||
| old_sendMSS old_sendMSS | <tr> | |||
| <td>old_MMS_S</td> | ||||
| old_PMTU old_PMTU+ | <td>old_MMS_S</td> | |||
| </tr> | ||||
| old_RTT old_RTT | <tr> | |||
| <td>old_MMS_R</td> | ||||
| old_RTTVAR old_RTTVAR | <td>old_MMS_R</td> | |||
| </tr> | ||||
| sum(old_ssthresh) f(sum(old_ssthresh), N) | <tr> | |||
| <td>old_sendMSS</td> | ||||
| sum(old_sendcwnd) f(sum(old_sendcwnd), N) | <td>old_sendMSS</td> | |||
| _ | </tr> | |||
| old_option (option specific) | <tr> | |||
| ]]></artwork> | <td>old_PMTU</td> | |||
| </figure> | <td>old_PMTU (1)</td> | |||
| </tr> | ||||
| <t> | <tr> | |||
| +Note that PMTU is cached at the IP layer <xref target="RFC1191"/><xref targe | <td>old_RTT</td> | |||
| t="RFC4821"/>.</t> | <td>old_RTT</td> | |||
| </tr> | ||||
| <tr> | ||||
| <td>old_RTTVAR</td> | ||||
| <td>old_RTTVAR</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>sum(old_ssthresh)</td> | ||||
| <td>f(sum(old_ssthresh), N)</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>sum(old_sendcwnd)</td> | ||||
| <td>f(sum(old_sendcwnd), N)</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_option</td> | ||||
| <td>(option specific)</td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <t> | <dl> | |||
| In the table, the cached sum() is a total across all active | <dt>(1)</dt> | |||
| connections because these parameters act in aggregate; similarly f() | <dd>Note that PMTU is cached at the IP layer <xref target="RFC1191" format="defa | |||
| ult"/> <xref target="RFC4821" format="default"/>.</dd> | ||||
| </dl> | ||||
| <t> | ||||
| In <xref target="TCB_Initialization"/>, the cached sum() is a total across al | ||||
| l active | ||||
| connections because these parameters act in aggregate; similarly, f() | ||||
| is a function that updates that sum based on the new connection's | is a function that updates that sum based on the new connection's | |||
| values, represented as "N".</t> | values, represented as "N".</t> | |||
| <t> | ||||
| <xref target="Ensemble_Option_Info_Initialization"/> gives an overview of | ||||
| option-specific information that can be similarly shared. Again, the | ||||
| TFO_cookie is updated upon explicit client request, which is a separate | ||||
| event.</t> | ||||
| <t> | <table anchor="Ensemble_Option_Info_Initialization"> | |||
| The table below gives an overview of option-specific information | <name>Ensemble Sharing - Option Info Initialization</name> | |||
| that can be similarly shared. Again, The TFO_cookie is updated upon | <thead> | |||
| explicit client request, which is a separate event.</t> | <tr> | |||
| <th>Cached</th> | ||||
| <figure><artwork><![CDATA[ | <th>New</th> | |||
| ENSEMBLE SHARING - Option Info Initialization | </tr> | |||
| </thead> | ||||
| Cached New | <tbody> | |||
| ------------------------------------ | <tr> | |||
| old_TFO_cookie old_TFO_cookie | <td>old_TFO_cookie</td> | |||
| <td>old_TFO_cookie</td> | ||||
| old_TFO_failure old_TFO_failure | </tr> | |||
| ]]></artwork> | <tr> | |||
| </figure> | <td>old_TFO_failure</td> | |||
| <td>old_TFO_failure</td> | ||||
| </section> | </tr> | |||
| </tbody> | ||||
| <section title="Updates to the TCB cache" anchor="sect-7.2"><t> | </table> | |||
| During a connection, the TCB cache can be updated based on changes | ||||
| to concurrent connections and their TCBs, as shown below:</t> | ||||
| <figure><artwork><![CDATA[ | ||||
| ENSEMBLE SHARING - Cache Updates | ||||
| Cached TCB Current TCB when? New Cached TCB | ||||
| --------------------------------------------------------------- | ||||
| old_MMS_S curr_MMS_S OPEN curr_MMS_S | ||||
| old_MMS_R curr_MMS_R OPEN curr_MMS_R | ||||
| old_sendMSS curr_sendMSS MSSopt curr_sendMSS | ||||
| old_PMTU curr_PMTU PMTUD+ / curr_PMTU | ||||
| PLPMTUD+ | ||||
| old_RTT curr_RTT update rtt_update(old, curr) | ||||
| old_RTTVAR curr_RTTVAR update rtt_update(old, curr) | ||||
| old_ssthresh curr_ssthresh update adjust sum as appropriate | ||||
| old_sendcwnd curr_sendcwnd update adjust sum as appropriate | ||||
| old_option curr_option (depends) (option specific) | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| <t> | </section> | |||
| +Note that the PMTU is cached at the IP layer <xref target="RFC1191"/><xref t | <section anchor="sect-7.2" numbered="true" toc="default"> | |||
| arget="RFC4821"/>.</t> | <name>Updates to the TCB Cache</name> | |||
| <t> | ||||
| During a connection, the TCB cache can be updated based on changes to | ||||
| concurrent connections and their TCBs, as shown in <xref target="Ensemble_Cache_Updates"/>.</t> | ||||
| <t> | <table anchor="Ensemble_Cache_Updates"> | |||
| In the table, rtt_update() is the function used to combine old and | <name>Ensemble Sharing - Cache Updates</name> | |||
| current values, e.g., as a windowed average or exponentially decayed | <thead> | |||
| average.</t> | <tr> | |||
| <th>Cached TCB</th> | ||||
| <th>Current TCB</th> | ||||
| <th>When?</th> | ||||
| <th>New Cached TCB</th> | ||||
| </tr> | ||||
| </thead> | ||||
| <tbody> | ||||
| <tr> | ||||
| <td>old_MMS_S</td> | ||||
| <td>curr_MMS_S</td> | ||||
| <td>OPEN</td> | ||||
| <td>curr_MMS_S</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_MMS_R</td> | ||||
| <td>curr_MMS_R</td> | ||||
| <td>OPEN</td> | ||||
| <td>curr_MMS_R</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_sendMSS</td> | ||||
| <td>curr_sendMSS</td> | ||||
| <td>MSSopt</td> | ||||
| <td>curr_sendMSS</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_PMTU</td> | ||||
| <td>curr_PMTU</td> | ||||
| <td>PMTUD (1) / PLPMTUD (1)</td> | ||||
| <td>curr_PMTU</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_RTT</td> | ||||
| <td>curr_RTT</td> | ||||
| <td>update</td> | ||||
| <td>rtt_update(old, curr)</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_RTTVAR</td> | ||||
| <td>curr_RTTVAR</td> | ||||
| <td>update</td> | ||||
| <td>rtt_update(old, curr)</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_ssthresh</td> | ||||
| <td>curr_ssthresh</td> | ||||
| <td>update</td> | ||||
| <td>adjust sum as appropriate</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_sendcwnd</td> | ||||
| <td>curr_sendcwnd</td> | ||||
| <td>update</td> | ||||
| <td>adjust sum as appropriate</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_option</td> | ||||
| <td>curr_option</td> | ||||
| <td>(depends)</td> | ||||
| <td>(option specific)</td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <t> | <dl> | |||
| The table below gives an overview of option-specific information | <dt>(1)</dt> | |||
| <dd>Note that the PMTU is cached at the IP layer <xref target="RFC1191" format=" | ||||
| default"/> <xref target="RFC4821" format="default"/>.</dd> | ||||
| </dl> | ||||
| <t> | ||||
| In <xref target="Ensemble_Cache_Updates"/>, rtt_update() is the function | ||||
| used to combine old and current values, e.g., as a windowed average or | ||||
| exponentially decayed average.</t> | ||||
| <t> | ||||
| <xref target="Ensemble_Option_Info_Updates"/> gives an overview of opti | ||||
| on-specific information | ||||
| that can be similarly shared.</t> | that can be similarly shared.</t> | |||
| <figure><artwork><![CDATA[ | <table anchor="Ensemble_Option_Info_Updates"> | |||
| ENSEMBLE SHARING - Option Info Updates | <name>Ensemble Sharing - Option Info Updates</name> | |||
| <thead> | ||||
| Cached Current when? New Cached | <tr> | |||
| ---------------------------------------------------------- | <th>Cached</th> | |||
| old_TFO_cookie old_TFO_cookie ESTAB old_TFO_cookie | <th>Current</th> | |||
| <th>When?</th> | ||||
| old_TFO_failure old_TFO_failure ESTAB old_TFO_failure | <th>New Cached</th> | |||
| ]]></artwork></figure> | </tr> | |||
| </thead> | ||||
| <tbody> | ||||
| <tr> | ||||
| <td>old_TFO_cookie</td> | ||||
| <td>old_TFO_cookie</td> | ||||
| <td>ESTAB</td> | ||||
| <td>old_TFO_cookie</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_TFO_failure</td> | ||||
| <td>old_TFO_failure</td> | ||||
| <td>ESTAB</td> | ||||
| <td>old_TFO_failure</td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| </section> | </section> | |||
| <section anchor="sect-7.3" numbered="true" toc="default"> | ||||
| <name>Discussion</name> | ||||
| <section title="Discussion" anchor="sect-7.3"><t> | <t> | |||
| For ensemble sharing, TCB information should be cached as early as | For ensemble sharing, TCB information should be cached as early as | |||
| possible, sometimes before a connection is closed. Otherwise, | possible, sometimes before a connection is closed. Otherwise, | |||
| opening multiple concurrent connections may not result in TCB data | opening multiple concurrent connections may not result in TCB data | |||
| sharing if no connection closes before others open. The amount of | sharing if no connection closes before others open. The amount of | |||
| work involved in updating the aggregate average should be minimized, | work involved in updating the aggregate average should be minimized, | |||
| but the resulting value should be equivalent to having all values | but the resulting value should be equivalent to having all values | |||
| measured within a single connection. The function "rtt_update" in | measured within a single connection. | |||
| the ensemble sharing table indicates this operation, which occurs | ||||
| whenever the RTT would have been updated in the individual TCP | ||||
| connection. As a result, the cache contains the shared RTT | ||||
| variables, which no longer need to reside in the TCB.</t> | ||||
| <t> | The function "rtt_update" in <xref target="Ensemble_Cache_Updates" | |||
| format="default"/> indicates this operation, which occurs whenever the RTT | ||||
| would have been updated in the individual TCP connection. As a result, the | ||||
| cache contains the shared RTT variables, which no longer need to reside in the | ||||
| TCB.</t> | ||||
| <t> | ||||
| Congestion window size and ssthresh aggregation are more complicated | Congestion window size and ssthresh aggregation are more complicated | |||
| in the concurrent case. When there is an ensemble of connections, we | in the concurrent case. When there is an ensemble of connections, we | |||
| need to decide how that ensemble would have shared these variables, | need to decide how that ensemble would have shared these variables, | |||
| in order to derive initial values for new TCBs.</t> | in order to derive initial values for new TCBs.</t> | |||
| <t> | ||||
| <t> | Sections <xref target="sect-8" format="counter"/> and <xref target="sect-9" | |||
| Sections 8 and 9 discuss compatibility issues and implications of | format="counter"/> discuss compatibility issues and implications of sharing | |||
| sharing the specific information listed above.</t> | the specific information listed above.</t> | |||
| <t> | ||||
| <t> | There are several ways to initialize the congestion window in a new TCB | |||
| There are several ways to initialize the congestion window in a new | among an ensemble of current connections to a host. Current TCP | |||
| TCB among an ensemble of current connections to a host. Current TCP | implementations initialize it to 4 segments as standard <xref | |||
| implementations initialize it to four segments as standard <xref target="RFC3 | target="RFC3390" format="default"/> and 10 segments experimentally <xref | |||
| 390"/> | target="RFC6928" format="default"/>. These approaches assume that new | |||
| and 10 segments experimentally <xref target="RFC6928"/>. These approaches ass | connections should behave as conservatively as possible. The algorithm | |||
| ume | described in <xref target="Ba12" format="default"/> adjusts the initial | |||
| that new connections should behave as conservatively as possible. | cwnd depending on the cwnd values of ongoing connections. It is also | |||
| The algorithm described in <xref target="Ba12"/> adjusts the initial cwnd dep | possible to use sharing mechanisms over long timescales to adapt TCP's | |||
| ending | initial window automatically, as described further in <xref | |||
| on the cwnd values of ongoing connections. It is also possible to | target="sect-c"/>.</t> | |||
| use sharing mechanisms over long timescales to adapt TCP's initial | </section> | |||
| window automatically, as described further in Appendix C.</t> | </section> | |||
| <section anchor="sect-8" numbered="true" toc="default"> | ||||
| </section> | <name>Issues with TCB Information Sharing</name> | |||
| <t> | ||||
| </section> | ||||
| <section title="Issues with TCB information sharing" anchor="sect-8"><t> | ||||
| Here, we discuss various types of problems that may arise with TCB | Here, we discuss various types of problems that may arise with TCB | |||
| information sharing.</t> | information sharing.</t> | |||
| <t> | ||||
| <t> | ||||
| For the congestion and current window information, the initial | For the congestion and current window information, the initial | |||
| values computed by TCB interdependence may not be consistent with | values computed by TCB interdependence may not be consistent with | |||
| the long-term aggregate behavior of a set of concurrent connections | the long-term aggregate behavior of a set of concurrent connections | |||
| between the same endpoints. Under conventional TCP congestion | between the same endpoints. | |||
| control, if the congestion window of a single existing connection | ||||
| has converged to 40 segments, two newly joining concurrent | ||||
| connections assume initial windows of 10 segments <xref target="RFC6928"/>, a | ||||
| nd the | ||||
| current connection's window doesn't decrease to accommodate this | ||||
| additional load and connections can mutually interfere. One example | ||||
| of this is seen on low-bandwidth, high-delay links, where concurrent | ||||
| connections supporting Web traffic can collide because their initial | ||||
| windows were too large, even when set at one segment.</t> | ||||
| <t> | Under conventional TCP congestion control, if the congestion window of a | |||
| The authors of <xref target="Hu12"/> recommend caching ssthresh for temporal | single existing connection has converged to 40 segments, two newly joining | |||
| sharing only when flows are long. Some studies suggest that sharing | concurrent connections will assume initial windows of 10 segments <xref | |||
| ssthresh between short flows can deteriorate the performance of | target="RFC6928"/> and the existing connection's window will not decrease | |||
| individual connections [Hu12, <xref target="Du16"/>], although this may benef | to accommodate this additional load. As a consequence, the three | |||
| it | connections can mutually interfere. | |||
| aggregate network performance.</t> | ||||
| <section title="Traversing the same network path" anchor="sect-8.1"><t> | One example of this is seen on low-bandwidth, high-delay links, where | |||
| concurrent connections supporting Web traffic can collide because their | ||||
| initial windows were too large, even when set at 1 segment.</t> | ||||
| <t> | ||||
| The authors of <xref target="Hu12" format="default"/> recommend caching | ||||
| ssthresh for temporal sharing only when flows are long. Some studies | ||||
| suggest that sharing ssthresh between short flows can deteriorate the | ||||
| performance of individual connections <xref target="Hu12"/> <xref | ||||
| target="Du16" format="default"/>, although this may benefit aggregate | ||||
| network performance.</t> | ||||
| <section anchor="sect-8.1" numbered="true" toc="default"> | ||||
| <name>Traversing the Same Network Path</name> | ||||
| <t> | ||||
| TCP is sometimes used in situations where packets of the same host-pair do | TCP is sometimes used in situations where packets of the same host-pair do | |||
| not always take the same path, such as when connection- specific parameters | not always take the same path, such as when connection-specific parameters | |||
| are used for routing (e.g., for load balancing). Multipath routing that | are used for routing (e.g., for load balancing). Multipath routing that | |||
| relies on examining transport headers, such as ECMP and LAG <xref target="RFC | relies on examining transport headers, such as ECMP and Link Aggregation | |||
| 7424"/>, may | Group (LAG) <xref target="RFC7424" format="default"/>, may not result in | |||
| not result in repeatable path selection when TCP segments are encapsulated, | repeatable path selection when TCP segments are encapsulated, encrypted, or | |||
| encrypted, or altered - for example, in some Virtual Private Network (VPN) | altered -- for example, in some Virtual Private Network (VPN) tunnels that | |||
| tunnels that rely on proprietary encapsulation. Similarly, such approaches | rely on proprietary encapsulation. Similarly, such approaches cannot | |||
| cannot operate deterministically when the TCP header is encrypted, e.g., | operate deterministically when the TCP header is encrypted, e.g., when | |||
| when using IPsec ESP (although TCB interdependence among the entire set | using IPsec Encapsulating Security Payload (ESP) (although TCB | |||
| sharing the same endpoint IP addresses should work without problems when | interdependence among the entire set sharing the same endpoint IP addresses | |||
| the TCP header is encrypted). Measures to increase the probability that | should work without problems when the TCP header is encrypted). Measures to | |||
| connections use the same path could be applied: e.g., the connections could | increase the probability that connections use the same path could be | |||
| be given the same IPv6 flow label <xref target="RFC6437"/>. TCB interdependen | applied; for example, the connections could be given the same IPv6 flow | |||
| ce can also | label <xref target="RFC6437" format="default"/>. TCB interdependence can | |||
| be extended to sets of host IP address pairs that share the same network | also be extended to sets of host IP address pairs that share the same | |||
| path conditions, such as when a group of addresses is on the same LAN (see | network path conditions, such as when a group of addresses is on the same | |||
| <xref target="sect-9"/>).</t> | LAN (see <xref target="sect-9" format="default"/>).</t> | |||
| <t> | ||||
| Traversing the same path is not important for host-specific information | ||||
| (e.g., rwnd), TCP option state (e.g., TFOinfo), or for information that is | ||||
| already cached per-host (e.g., path MTU). | ||||
| <t> | ||||
| Traversing the same path is not important for host-specific | ||||
| information such as rwnd and TCP option state, such as TFOinfo, or | ||||
| for information that is already cached per-host, such as path MTU. | ||||
| When TCB information is shared across different SYN destination | When TCB information is shared across different SYN destination | |||
| ports, path-related information can be incorrect; however, the | ports, path-related information can be incorrect; however, the | |||
| impact of this error is potentially diminished if (as discussed | impact of this error is potentially diminished if (as discussed | |||
| here) TCB sharing affects only the transient event of a connection | here) TCB sharing affects only the transient event of a connection | |||
| start or if TCB information is shared only within connections to the | start or if TCB information is shared only within connections to the | |||
| same SYN destination port.</t> | same SYN destination port.</t> | |||
| <t> | ||||
| In the case of temporal sharing, TCB information could also become invalid | ||||
| over time, i.e., indicating that although the path remains the same, path | ||||
| properties have changed. Because this is similar to the case when a | ||||
| connection becomes idle, mechanisms that address idle TCP connections | ||||
| (e.g., <xref target="RFC7661" format="default"/>) could also be applied to | ||||
| TCB cache management, especially when TCP Fast Open is used <xref | ||||
| target="RFC7413" format="default"/>.</t> | ||||
| </section> | ||||
| <section anchor="sect-8.2" numbered="true" toc="default"> | ||||
| <name>State Dependence</name> | ||||
| <t> | <t> | |||
| In case of Temporal Sharing, TCB information could also become | There may be additional considerations to the way in which TCB | |||
| invalid over time, i.e., indicating that although the path remains | interdependence rebalances congestion feedback among the current | |||
| the same, path properties have changed. Because this is similar to | connections. For example, it may be appropriate to consider the impact of a | |||
| the case when a connection becomes idle, mechanisms that address | connection being in Fast Recovery <xref target="RFC5681" format="default"/> | |||
| idle TCP connections (e.g., <xref target="RFC7661"/>) could also be applied t | or some other similar unusual feedback state that could inhibit or affect the | |||
| o TCB | calculations described herein. | |||
| cache management, especially when TCP Fast Open is used <xref target="RFC7413 | </t> | |||
| "/>.</t> | </section> | |||
| <section anchor="sect-8.3" numbered="true" toc="default"> | ||||
| </section> | <name>Problems with Sharing Based on IP Address</name> | |||
| <t> | ||||
| <section title="State dependence" anchor="sect-8.2"><t> | ||||
| There may be additional considerations to the way in which TCB | ||||
| interdependence rebalances congestion feedback among the current | ||||
| connections, e.g., it may be appropriate to consider the impact of a | ||||
| connection being in Fast Recovery <xref target="RFC5681"/> or some other simi | ||||
| lar | ||||
| unusual feedback state, e.g., as inhibiting or affecting the | ||||
| calculations described herein.</t> | ||||
| </section> | ||||
| <section title="Problems with sharing based on IP address" anchor="sect-8 | It can be wrong to share TCB information between TCP connections on the | |||
| .3"><t> | same host as identified by the IP address if an IP address is assigned to a | |||
| It can be wrong to share TCB information between TCP connections on | new host (e.g., IP address spinning, as is used by ISPs to inhibit running | |||
| the same host as identified by the IP address if an IP address is | servers). | |||
| assigned to a new host (e.g., IP address spinning, as is used by | ||||
| ISPs to inhibit running servers). It can be wrong if Network Address | ||||
| (and Port) Translation (NA(P)T) <xref target="RFC2663"/> or any other IP shar | ||||
| ing | ||||
| mechanism is used. Such mechanisms are less likely to be used with | ||||
| IPv6. Other methods to identify a host could also be considered to | ||||
| make correct TCB sharing more likely. Moreover, some TCB information | ||||
| is about dominant path properties rather than the specific host. IP | ||||
| addresses may differ, yet the relevant part of the path may be the | ||||
| same.</t> | ||||
| </section> | It can be wrong if Network Address Translation (NAT) <xref target="RFC2663" | |||
| format="default"/>, Network Address and Port Translation (NAPT) <xref | ||||
| target="RFC2663" format="default"/>, or any other IP sharing mechanism is | ||||
| used. | ||||
| </section> | Such mechanisms are less likely to be used with IPv6. Other methods to | |||
| identify a host could also be considered to make correct TCB sharing more | ||||
| likely. Moreover, some TCB information is about dominant path properties | ||||
| rather than the specific host. IP addresses may differ, yet the relevant | ||||
| part of the path may be the same.</t> | ||||
| </section> | ||||
| <section title="Implications" anchor="sect-9"><t> | </section> | |||
| <section anchor="sect-9" numbered="true" toc="default"> | ||||
| <name>Implications</name> | ||||
| <t> | ||||
| There are several implications to incorporating TCB interdependence in TCP | There are several implications to incorporating TCB interdependence in TCP | |||
| implementations. First, it may reduce the need for application-layer | implementations. First, it may reduce the need for application-layer | |||
| multiplexing for performance enhancement <xref target="RFC7231"/>. Protocols | multiplexing for performance enhancement <xref target="RFC7231" format="defau | |||
| like HTTP/2 | lt"/>. Protocols like HTTP/2 | |||
| <xref target="RFC7540"/> avoid connection reestablishment costs by serializin | <xref target="RFC7540" format="default"/> avoid connection re-establishment c | |||
| g or | osts by serializing or | |||
| multiplexing a set of per-host connections across a single TCP | multiplexing a set of per-host connections across a single TCP | |||
| connection. This avoids TCP's per-connection OPEN handshake and also avoids | connection. This avoids TCP's per-connection OPEN handshake and also avoids | |||
| recomputing the MSS, RTT, and congestion window values. By avoiding the | recomputing the MSS, RTT, and congestion window values. By avoiding the | |||
| so-called "slow-start restart", performance can be optimized <xref target="I-D.hughes-restart"/>. TCB | so-called "slow-start restart", performance can be optimized <xref target="I-D.hughes-restart" format="default"/>. TCB | |||
| interdependence can provide the "slow-start restart avoidance" of | interdependence can provide the "slow-start restart avoidance" of | |||
| multiplexing, without requiring a multiplexing mechanism at the application | multiplexing, without requiring a multiplexing mechanism at the application | |||
| layer.</t> | layer.</t> | |||
| <t> | ||||
| <t> | Like the initial version of this document <xref target="RFC2140" | |||
| Like the initial version of this document <xref target="RFC2140"/>, this upda | format="default"/>, this update's approach to TCB interdependence focuses | |||
| te's | on sharing a set of TCBs by updating the TCB state to reduce the impact of | |||
| approach to TCB interdependence focuses on sharing a set of TCBs by | transients when connections begin, end, or otherwise significantly change | |||
| updating the TCB state to reduce the impact of transients when | state. | |||
| connections begin, end, or otherwise significantly change state. | ||||
| Other mechanisms have since been proposed to continuously share | ||||
| information between all ongoing communication (including | ||||
| connectionless protocols), updating the congestion state during any | ||||
| congestion-related event (e.g., timeout, loss confirmation, etc.) | ||||
| <xref target="RFC3124"/>. By dealing exclusively with transients, the approac | ||||
| h in | ||||
| this document is more likely to exhibit the "steady-state" behavior | ||||
| as unmodified, independent TCP connections.</t> | ||||
| <section title="Layering" anchor="sect-9.1"><t> | Other mechanisms have since been proposed to continuously share information | |||
| TCB interdependence pushes some of the TCP implementation from the | between all ongoing communication (including connectionless protocols) and | |||
| traditional transport layer (in the ISO model), to the network | update the congestion state during any congestion-related event (e.g., | |||
| layer. This acknowledges that some state is in fact per-host-pair or | timeout, loss confirmation, etc.) <xref target="RFC3124" | |||
| can be per-path as indicated solely by that host-pair. Transport | format="default"/>. | |||
| protocols typically manage per-application-pair associations (per | ||||
| stream), and network protocols manage per-host-pair and path | ||||
| associations (routing). Round-trip time, MSS, and congestion | ||||
| information could be more appropriately handled at the network | ||||
| layer, aggregated among concurrent connections, and shared across | ||||
| connection instances <xref target="RFC3124"/>.</t> | ||||
| <t> | By dealing exclusively with transients, the approach in this document is | |||
| An earlier version of RTT sharing suggested implementing RTT state | more likely to exhibit the same "steady-state" behavior as unmodified, | |||
| at the IP layer, rather than at the TCP layer. Our observations | independent TCP connections.</t> | |||
| describe sharing state among TCP connections, which avoids some of | <section anchor="sect-9.1" numbered="true" toc="default"> | |||
| the difficulties in an IP-layer solution. One such problem of an IP | <name>Layering</name> | |||
| layer solution is determining the correspondence between packet | ||||
| exchanges using IP header information alone, where such | ||||
| correspondence is needed to compute RTT. Because TCB sharing | ||||
| computes RTTs inside the TCP layer using TCP header information, it | ||||
| can be implemented more directly and simply than at the IP layer. | ||||
| This is a case where information should be computed at the transport | ||||
| layer but could be shared at the network layer.</t> | ||||
| </section> | <t> | |||
| <section title="Other possibilities" anchor="sect-9.2"><t> | TCB interdependence pushes some of the TCP implementation from its typical | |||
| Per-host-pair associations are not the limit of these techniques. It | placement solely within the transport layer (in the ISO model) to the | |||
| is possible that TCBs could be similarly shared between hosts on a | network layer. | |||
| subnet or within a cluster, because the predominant path can be | ||||
| subnet-subnet, rather than host-host. Additionally, TCB | This acknowledges that some components of state are, in fact, per-host-pair | |||
| interdependence can be applied to any protocol with congestion | or can be per-path as indicated solely by that host-pair. | |||
| state, including SCTP <xref target="RFC4960"/> and DCCP <xref target="RFC4340 | ||||
| "/>, as well as for | Transport protocols typically manage per-application-pair associations (per | |||
| individual subflows in Multipath TCP <xref target="RFC8684"/>.</t> | stream), and network protocols manage per-host-pair and path associations | |||
| (routing). Round-trip time, MSS, and congestion information could be more | ||||
| appropriately handled at the network layer, aggregated among concurrent | ||||
| connections, and shared across connection instances <xref target="RFC3124" | ||||
| format="default"/>.</t> | ||||
| <t> | ||||
| An earlier version of RTT sharing suggested implementing RTT state at the | ||||
| IP layer rather than at the TCP layer. Our observations describe sharing | ||||
| state among TCP connections, which avoids some of the difficulties in an | ||||
| IP-layer solution. One such problem of an IP-layer solution is determining | ||||
| the correspondence between packet exchanges using IP header information | ||||
| alone, where such correspondence is needed to compute RTT. Because TCB | ||||
| sharing computes RTTs inside the TCP layer using TCP header information, it | ||||
| can be implemented more directly and simply than at the IP layer. This is | ||||
| a case where information should be computed at the transport layer but | ||||
| could be shared at the network layer.</t> | ||||
| </section> | ||||
| <section anchor="sect-9.2" numbered="true" toc="default"> | ||||
| <name>Other Possibilities</name> | ||||
| <t> | ||||
| Per-host-pair associations are not the limit of these techniques. It is | ||||
| possible that TCBs could be similarly shared between hosts on a subnet or | ||||
| within a cluster, because the predominant path can be subnet-subnet rather | ||||
| than host-host. Additionally, TCB interdependence can be applied to any | ||||
| protocol with congestion state, including SCTP <xref target="RFC4960" | ||||
| format="default"/> and DCCP <xref target="RFC4340" format="default"/>, as | ||||
| well as to individual subflows in Multipath TCP <xref target="RFC8684" | ||||
| format="default"/>.</t> | ||||
| <t> | ||||
| <t> | ||||
| There may be other information that can be shared between concurrent | There may be other information that can be shared between concurrent | |||
| connections. For example, knowing that another connection has just | connections. For example, knowing that another connection has just | |||
| tried to expand its window size and failed, a connection may not | tried to expand its window size and failed, a connection may not | |||
| attempt to do the same for some period. The idea is that existing | attempt to do the same for some period. The idea is that existing | |||
| TCP implementations infer the behavior of all competing connections, | TCP implementations infer the behavior of all competing connections, | |||
| including those within the same host or subnet. One possible | including those within the same host or subnet. One possible | |||
| optimization is to make that implicit feedback explicit, via | optimization is to make that implicit feedback explicit, via | |||
| extended information associated with the endpoint IP address and its | extended information associated with the endpoint IP address and its | |||
| TCP implementation, rather than per-connection state in the TCB.</t> | TCP implementation, rather than per-connection state in the TCB.</t> | |||
| <t> | ||||
| <t> | ||||
| This document focuses on sharing TCB information at connection | This document focuses on sharing TCB information at connection | |||
| initialization. Subsequent to RFC 2140, there have been numerous approaches | initialization. Subsequent to RFC 2140, there have been numerous approaches | |||
| that attempt to coordinate ongoing state across concurrent connections, | that attempt to coordinate ongoing state across concurrent connections, | |||
| both within TCP and other congestion-reactive protocols, which are | both within TCP and other congestion-reactive protocols, which are | |||
| summarized in <xref target="Is18"/>. These approaches are more complex to imp | summarized in <xref target="Is18" format="default"/>. These approaches are | |||
| lement and | more complex to implement, and their comparison to steady-state TCP | |||
| their comparison to steady-state TCP equivalence can be more difficult to | equivalence can be more difficult to establish, sometimes intentionally | |||
| establish, sometimes intentionally (i.e., they sometimes intend to provide | (i.e., they sometimes intend to provide a different kind of "fairness" than | |||
| a different kind of "fairness" than emerges from TCP operation).</t> | emerges from TCP operation).</t> | |||
| </section> | ||||
| </section> | </section> | |||
| </section> | ||||
| <section title="Implementation Observations" anchor="sect-10"><t> | ||||
| The observation that some TCB state is host-pair specific rather | ||||
| than application-pair dependent is not new and is a common | ||||
| engineering decision in layered protocol implementations. Although | ||||
| now deprecated, T/TCP <xref target="RFC1644"/> was the first to propose using | ||||
| caches in order to maintain TCB states (see Appendix A).</t> | ||||
| <t> | ||||
| The table below describes the current implementation status for TCB | ||||
| temporal sharing in Windows as of December 2020, Apple variants | ||||
| (macOS, iOS, iPadOS, tvOS, watchOS) as of January 2021, Linux kernel | ||||
| version 5.10.3, and FreeBSD 12. Ensemble sharing is not yet | ||||
| implemented.</t> | ||||
| <figure><artwork><![CDATA[ | ||||
| KNOWN IMPLEMENTATION STATUS | ||||
| TCB data Status | ||||
| ------------------------------------------------------------ | ||||
| old_MMS_S Not shared | ||||
| old_MMS_R Not shared | ||||
| old_sendMSS Cached and shared in Apple, Linux (MSS) | ||||
| old_PMTU Cached and shared in Apple, FreeBSD, Windows (PMTU) | ||||
| old_RTT Cached and shared in Apple, FreeBSD, Linux, Windows | ||||
| old_RTTVAR Cached and shared in Apple, FreeBSD, Windows | ||||
| old_TFOinfo Cached and shared in Apple, Linux, Windows | ||||
| old_sendcwnd Not shared | ||||
| old_ssthresh Cached and shared in Apple, FreeBSD*, Linux* | <section anchor="sect-10" numbered="true" toc="default"> | |||
| <name>Implementation Observations</name> | ||||
| <t> | ||||
| The observation that some TCB state is host-pair specific rather than | ||||
| application-pair dependent is not new and is a common engineering decision | ||||
| in layered protocol implementations. Although now deprecated, T/TCP <xref | ||||
| target="RFC1644" format="default"/> was the first to propose using caches | ||||
| in order to maintain TCB states (see <xref target="sect-a"/>).</t> | ||||
| <t> | ||||
| <xref target="Known_Implementation_Status"/> describes the current | ||||
| implementation status for TCB temporal sharing in Windows as of | ||||
| December 2020, Apple variants (macOS, iOS, iPadOS, tvOS, and watchOS) | ||||
| as of January 2021, Linux kernel version 5.10.3, and FreeBSD | ||||
| 12. Ensemble sharing is not yet implemented.</t> | ||||
| TFO failure Cached and shared in Apple | <table anchor="Known_Implementation_Status"> | |||
| ]]></artwork> | <name>KNOWN IMPLEMENTATION STATUS</name> | |||
| </figure> | <thead> | |||
| <tr> | ||||
| <th>TCB data</th> | ||||
| <th>Status</th> | ||||
| </tr> | ||||
| </thead> | ||||
| <tbody> | ||||
| <tr> | ||||
| <td>old_MMS_S</td> | ||||
| <td>Not shared</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_MMS_R</td> | ||||
| <td>Not shared</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_sendMSS</td> | ||||
| <td>Cached and shared in Apple, Linux (MSS)</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_PMTU</td> | ||||
| <td>Cached and shared in Apple, FreeBSD, Windows (PMTU)</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_RTT</td> | ||||
| <td>Cached and shared in Apple, FreeBSD, Linux, Windows</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_RTTVAR</td> | ||||
| <td>Cached and shared in Apple, FreeBSD, Windows</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_TFOinfo</td> | ||||
| <td>Cached and shared in Apple, Linux, Windows</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_sendcwnd</td> | ||||
| <td>Not shared</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>old_ssthresh</td> | ||||
| <td>Cached and shared in Apple, FreeBSD*, Linux*</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td>TFO failure</td> | ||||
| <td>Cached and shared in Apple</td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <t> | <dl> | |||
| In the table above, "Apple" refers to all Apple OSes, i.e., | <dt>*</dt> | |||
| desktop/laptop macOS, phone iOS, pad iPadOS, video player tvOS, and | <dd>Note: | |||
| watch watchOS, which all share the same Internet protocol stack.</t> | ||||
| <t> | In FreeBSD, new ssthresh is the mean of curr_ssthresh and its previous value | |||
| *Note: In FreeBSD, new ssthresh is the mean of curr_ssthresh and | if a previous value exists; in Linux, the calculation depends on state and is | |||
| previous value if a previous value exists; in Linux, the calculation | max(curr_cwnd/2, old_ssthresh) in most cases.</dd> | |||
| depends on state and is max(curr_cwnd/2, old_ssthresh) in most | </dl> | |||
| cases.</t> | ||||
| </section> | <t>In <xref target="Known_Implementation_Status"/>, "Apple" refers to all | |||
| Apple OSes, i.e., macOS (desktop/laptop), iOS (phone), iPadOS (tablet), tvOS | ||||
| (video player), and watchOS (smart watch), which all share the same Internet | ||||
| protocol stack. | ||||
| </t> | ||||
| <section title="Changes Compared to RFC 2140" anchor="sect-11"><t> | </section> | |||
| This document updates the description of TCB sharing in RFC 2140 and | <section anchor="sect-11" numbered="true" toc="default"> | |||
| its associated impact on existing and new connection state, | <name>Changes Compared to RFC 2140</name> | |||
| providing a complete replacement for that document <xref target="RFC2140"/>. | <t> | |||
| It | This document updates the description of TCB sharing in RFC 2140 and its | |||
| clarifies the previous description and terminology and extends the | associated impact on existing and new connection state, providing a | |||
| mechanism to its impact on new protocols and mechanisms, including | complete replacement for that document <xref target="RFC2140" | |||
| multipath TCP, fast open, PLPMTUD, NAT, and the TCP Authentication | format="default"/>. It clarifies the previous description and terminology | |||
| Option.</t> | and extends the mechanism to its impact on new protocols and mechanisms, | |||
| including multipath TCP, Fast Open, PLPMTUD, NAT, and the TCP | ||||
| Authentication Option.</t> | ||||
| <t> | <t> | |||
| The detailed impact on TCB state addresses TCB parameters in greater | The detailed impact on TCB state addresses TCB parameters with greater | |||
| detail, addressing MSS in both the send and receive direction, MSS | specificity. It separates the way MSS is used in both send and receive | |||
| and sendMSS separately, adds path MTU and ssthresh, and addresses | directions, it separates the way both of these MSS values differ from | |||
| the impact on TCP option state.</t> | sendMSS, it adds both path MTU and ssthresh, and it addresses the impact on | |||
| state associated with TCP options. | ||||
| </t> | ||||
| <t> | <t> | |||
| New sections have been added to address compatibility issues and | New sections have been added to address compatibility issues and | |||
| implementation observations. The relation of this work to T/TCP has | implementation observations. | |||
| been moved to 0 on history, partly to reflect the deprecation of | ||||
| that protocol.</t> | ||||
| <t> | The relation of this work to T/TCP has been moved to <xref | |||
| Appendix C has been added to discuss the potential to use temporal | target="sect-a"/> (which describes the history of TCB sharing) partly to | |||
| reflect the deprecation of that protocol. | ||||
| </t> | ||||
| <t> | ||||
| <xref target="sect-c"/> has been added to discuss the potential to use tempor | ||||
| al | ||||
| sharing over long timescales to adapt TCP's initial window | sharing over long timescales to adapt TCP's initial window | |||
| automatically, avoiding the need to periodically revise a single | automatically, avoiding the need to periodically revise a single | |||
| global constant value.</t> | global constant value.</t> | |||
| <t> | ||||
| <t> | ||||
| Finally, this document updates and significantly expands the | Finally, this document updates and significantly expands the | |||
| referenced literature.</t> | referenced literature.</t> | |||
| </section> | ||||
| </section> | <section anchor="sect-12" numbered="true" toc="default"> | |||
| <name>Security Considerations</name> | ||||
| <section title="Security Considerations" anchor="sect-12"><t> | <t> | |||
| These presented implementation methods do not have additional | These presented implementation methods do not have additional ramifications | |||
| ramifications for direct (connection-aborting or information | for direct (connection-aborting or information-injecting) attacks on | |||
| injecting) attacks on individual connections. Individual | individual connections. Individual connections, whether using sharing or | |||
| connections, whether using sharing or not, also may be susceptible | not, also may be susceptible to denial-of-service attacks that reduce | |||
| to denial-of-service attacks that reduce performance or completely | performance or completely deny connections and transfers if not otherwise | |||
| deny connections and transfers if not otherwise secured.</t> | secured.</t> | |||
| <t> | ||||
| <t> | TCB sharing may create additional denial-of-service attacks that affect the | |||
| TCB sharing may create additional denial-of-service attacks that | performance of other connections by polluting the cached information. This | |||
| affect the performance of other connections by polluting the cached | can occur across any set of connections in which the TCB is shared, | |||
| information. This can occur across whatever set of connections where | between connections in a single host, or between hosts if TCB sharing is | |||
| the TCB is shared, between connections in a single host, or between | implemented within a subnet (see <xref target="sect-9" | |||
| hosts if TCB sharing is implemented within a subnet (see | sectionFormat="bare">"Implications"</xref>). Some shared TCB parameters are | |||
| Implications section). Some shared TCB parameters are used only to | used only to create new TCBs; others are shared among the TCBs of ongoing | |||
| create new TCBs, others are shared among the TCBs of ongoing | connections. New connections can join the ongoing set, e.g., to optimize | |||
| connections. New connections can join the ongoing set, e.g., to | send window size among a set of connections to the same host. PMTU is | |||
| optimize send window size among a set of connections to the same | defined as shared at the IP layer and is already susceptible in this | |||
| host. PMTU is defined as shared at the IP layer, and is already | way.</t> | |||
| susceptible in this way.</t> | <t> | |||
| <t> | ||||
| Options in client SYNs can be easier to forge than complete, two-way | Options in client SYNs can be easier to forge than complete, two-way | |||
| connections. As a result, their values may not be safely | connections. As a result, their values may not be safely | |||
| incorporated in shared values until after the three-way handshake | incorporated in shared values until after the three-way handshake | |||
| completes.</t> | completes.</t> | |||
| <t> | ||||
| <t> | ||||
| Attacks on parameters used only for initialization affect only the | Attacks on parameters used only for initialization affect only the | |||
| transient performance of a TCP connection. For short connections, the | transient performance of a TCP connection. For short connections, the | |||
| performance ramification can approach that of a denial-of-service | performance ramification can approach that of a denial-of-service | |||
| attack. E.g., if an application changes its TCB to have a false and small | attack. For example, if an application changes its TCB to have a false and small | |||
| window size, subsequent connections will experience performance degradation | window size, subsequent connections will experience performance degradation | |||
| until their window grew appropriately.</t> | until their window grows appropriately.</t> | |||
| <t> | ||||
| <t> | ||||
| TCB sharing reuses and mixes information from past and current | TCB sharing reuses and mixes information from past and current | |||
| connections. Although reusing information could create a potential | connections. Although reusing information could create a potential | |||
| for fingerprinting to identify hosts, the mixing reduces that | for fingerprinting to identify hosts, the mixing reduces that | |||
| potential. There has been no evidence of fingerprinting based on | potential. There has been no evidence of fingerprinting based on | |||
| this technique and it is currently considered safe in that regard. | this technique, and it is currently considered safe in that regard. | |||
| Further, information about the performance of a TCP connection has | Further, information about the performance of a TCP connection has | |||
| not been considered as private.</t> | not been considered as private.</t> | |||
| </section> | ||||
| <section anchor="sect-13" numbered="true" toc="default"> | ||||
| <name>IANA Considerations</name> | ||||
| <t> | ||||
| This document has no IANA actions.</t> | ||||
| </section> | </section> | |||
| </middle> | ||||
| <section title="IANA Considerations" anchor="sect-13"><t> | <back> | |||
| There are no IANA implications or requests in this document.</t> | ||||
| <t> | <displayreference target="I-D.allman-tcpm-bump-initcwnd" to="Al10"/> | |||
| This section should be removed upon final publication as an RFC.</t> | <displayreference target="I-D.ietf-tcpm-generalized-ecn" to="Ba20"/> | |||
| <displayreference target="I-D.hughes-restart" to="Hu01"/> | ||||
| </section> | <references> | |||
| <name>References</name> | ||||
| <references> | ||||
| <name>Normative References</name> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.0793.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.1122.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.1191.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.2119.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.4821.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.5681.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.6298.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.7413.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.8174.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.8201.xml"/> | ||||
| </references> | ||||
| <references> | ||||
| <name>Informative References</name> | ||||
| </middle> | <xi:include href="https://datatracker.ietf.org/doc/bibxml3/reference.I-D.allman-tcpm-bump-initcwnd.xml"/> | |||
| <back> | <reference anchor="Ba12"> | |||
| <references title="Normative References"> | <front> | |||
| &RFC0793; | <title>LISA: A linked slow-start algorithm for MPTCP</title> | |||
| &RFC1122; | <author initials="R." surname="Barik" fullname="Runa Barik"> | |||
| &RFC1191; | ||||
| &RFC2119; | ||||
| &RFC4821; | ||||
| &RFC5681; | ||||
| &RFC6298; | ||||
| &RFC7413; | ||||
| &RFC8174; | ||||
| &RFC8201; | ||||
| </references> | ||||
| <references title="Informative References"> | ||||
| &I-D.allman-tcpm-bump-initcwnd; | ||||
| <reference anchor="Ba12"><front> | ||||
| <title>LISA: A Linked Slow-Start Algorithm for MPTCP</title> | ||||
| <author initials="R." surname="Barik" fullname="R. Barik"> | ||||
| </author> | </author> | |||
| <author initials="M." surname="Welzl" fullname="Michael Welzl"> | ||||
| <author initials="M." surname="Welzl" fullname="M. Welzl"> | ||||
| </author> | </author> | |||
| <author initials="S." surname="Ferlin" fullname="Simone Ferlin"> | ||||
| <author initials="S." surname="Ferlin" fullname="S. Ferlin"> | ||||
| </author> | </author> | |||
| <author initials="O." surname="Alay" fullname="Ozgu Alay"> | ||||
| <author initials="O." surname="Alay" fullname="O. Alay"> | ||||
| </author> | </author> | |||
| <date month="May" year="2016"/> | ||||
| </front> | ||||
| <refcontent>IEEE ICC | ||||
| </refcontent> | ||||
| <seriesInfo name="DOI" value="10.1109/ICC.2016.7510786"/> | ||||
| </reference> | ||||
| <date month="May" year="2016"/> | <xi:include href="https://datatracker.ietf.org/doc/bibxml3/reference.I-D | |||
| </front> | .ietf-tcpm-generalized-ecn.xml"/> | |||
| <seriesInfo name="IEEE" value="ICC"/> | <reference anchor="Be94"> | |||
| </reference> | <front> | |||
| &I-D.ietf-tcpm-generalized-ecn; | <title>The World-Wide Web</title> | |||
| <reference anchor="Be94"><front> | <author initials="T." surname="Berners-Lee" fullname="Tim Berners-Le | |||
| <title>The World-Wide Web</title> | e"> | |||
| <author initials="T." surname="Berners-Lee" fullname="T. Berners-Lee"> | ||||
| </author> | </author> | |||
| <author initials="C." surname="Cailliau" fullname="Robert Cailliau"/ | ||||
| > | ||||
| <author initials="A." surname="Luotonen" fullname="Ari Luotonen"/> | ||||
| <author initials="H." surname="Nielsen" fullname="Henrik Frystyk Niel | ||||
| sen"/> | ||||
| <author initials="A." surname="Secret" fullname="Arthur Secret"/> | ||||
| <date month="August" year="1994"/> | <date month="August" year="1994"/> | |||
| </front> | </front> | |||
| <seriesInfo name="DOI" value="10.1145/179606.179671"/> | ||||
| <seriesInfo name="Communications" value="of the ACM"/> | <refcontent>Communications of the ACM V37, pp. 76-82</refcontent> | |||
| </reference> | ||||
| <reference anchor="Br94"><front> | ||||
| <title>T/TCP -- Transaction TCP: Source Changes for Sun OS 4.1.3</title> | ||||
| <author initials="B." surname="Braden" fullname="B. Braden"> | ||||
| </author> | ||||
| <date month="September" year="1994"/> | </reference> | |||
| </front> | ||||
| </reference> | <reference anchor="Br94"> | |||
| <reference anchor="Br02"><front> | <front> | |||
| <title>Understanding Internet Traffic Streams: Dragonflies and Tortoises< | <title>T/TCP -- Transaction TCP: Source Changes for Sun OS 4.1.3</ti | |||
| /title> | tle> | |||
| <author initials="N." surname="Brownlee" fullname="N. Brownlee"> | <author initials="B." surname="Braden" fullname="Bob Braden"> | |||
| </author> | </author> | |||
| <date month="September" year="1994"/> | ||||
| </front> | ||||
| <refcontent>USC/ISI Release 1.0</refcontent> | ||||
| </reference> | ||||
| <author initials="K." surname="Claffy" fullname="K. Claffy"> | <reference anchor="Br02"> | |||
| <front> | ||||
| <title>Understanding Internet traffic streams: dragonflies and torto | ||||
| ises</title> | ||||
| <author initials="N" surname="Brownlee" fullname="Nevil Brownlee"> | ||||
| </author> | </author> | |||
| <author initials="KC" surname="Claffy" fullname="KC Claffy"> | ||||
| <date year="2002"/> | ||||
| </front> | ||||
| <seriesInfo name="IEEE" value="Communications Magazine p110-117"/> | ||||
| </reference> | ||||
| <reference anchor="Co91"><front> | ||||
| <title>Internetworking with TCP/IP</title> | ||||
| <author initials="D." surname="Comer" fullname="D. Comer"> | ||||
| </author> | </author> | |||
| <date year="2002"/> | ||||
| </front> | ||||
| <seriesInfo name="DOI" value="10.1109/MCOM.2002.1039865"/> | ||||
| <refcontent>IEEE Communications Magazine, pp. 110-117</refcontent> | ||||
| </reference> | ||||
| <author initials="D." surname="Stevens" fullname="D. Stevens"> | <reference anchor="Co91"> | |||
| <front> | ||||
| <title>Internetworking with TCP/IP</title> | ||||
| <author initials="D" surname="Comer" fullname="Douglas Comer"> | ||||
| </author> | </author> | |||
| <author initials="D" surname="Stevens" fullname="David Stevens"> | ||||
| <date year="1991"/> | ||||
| </front> | ||||
| </reference> | ||||
| <reference anchor="Du16"><front> | ||||
| <title>Research Impacting the Practice of Congestion Control</title> | ||||
| <author> | ||||
| <organization>Dukkipati, N., Yuchung C. and V. Amin</organization> | ||||
| </author> | </author> | |||
| <date year="1991"/> | ||||
| </front> | ||||
| <seriesInfo name='ISBN 10:' value='0134685059' /> | ||||
| <seriesInfo name='ISBN 13:' value='9780134685052' /> | ||||
| </reference> | ||||
| <date month="July" year="2016"/> | <reference anchor="Du16"> | |||
| </front> | <front> | |||
| <title>Research Impacting the Practice of Congestion Control</title> | ||||
| <author initials="N" surname="Dukkipati" fullname="Nandita Dukkipati | ||||
| "/> | ||||
| <author initials="Y" surname="Cheng" fullname="Yuchung Cheng"/> | ||||
| <author initials="A" surname="Vahdat" fullname="Amin Vahdat"/> | ||||
| <date month="July" year="2016"/> | ||||
| </front> | ||||
| <refcontent>Computer Communication Review</refcontent> | ||||
| <refcontent>The ACM SIGCOMM newsletter</refcontent> | ||||
| </reference> | ||||
| <seriesInfo name="ACM" value="SIGCOMM CCR editorial"/> | <reference anchor="FreeBSD" target="https://www.freebsd.org/"> | |||
| </reference> | <front> | |||
| <reference anchor="FreeBSD" target="http://www.freebsd.org/"><front> | <title>The FreeBSD Project</title> | |||
| <title>FreeBSD source code</title> | <author> | |||
| <author> | <organization>FreeBSD</organization> | |||
| </author> | </author> | |||
| <date/> | ||||
| </front> | ||||
| </reference> | ||||
| <date/> | <reference anchor="I-D.hughes-restart"> | |||
| </front> | <front> | |||
| <title>Issues in TCP Slow-Start Restart After Idle</title> | ||||
| </reference> | <author initials="A" surname="Hughes" fullname="Amy Hughes"/> | |||
| &I-D.hughes-restart; | <author initials="J" surname="Touch" fullname="Joe Touch"/> | |||
| <reference anchor="Hu12"><front> | <author initials="J" surname="Heidemann" fullname="John Heidemann"/> | |||
| <title>Enhanced metric caching for short TCP flows</title> | ||||
| <author initials="P." surname="Hurtig" fullname="P. Hurtig"> | ||||
| </author> | ||||
| <author initials="A." surname="Brunstrom" fullname="A. Brunstrom"> | <date month="December" year="2001" /> | |||
| </author> | </front> | |||
| <date year="2012"/> | <seriesInfo name="Internet-Draft" value="draft-hughes-restart-00" /> | |||
| </front> | </reference> | |||
| <seriesInfo name="IEEE" value="International Conference on Communications | <reference anchor="Hu12"> | |||
| "/> | <front> | |||
| </reference> | <title>Enhanced metric caching for short TCP flows</title> | |||
| <reference anchor="IANA" target="https://www.iana.org/assignments/tcp-par | <author initials="P." surname="Hurtig" fullname="Per Hurtig"> | |||
| ameters"><front> | ||||
| <title>IANA TCP Parameters (options) registry</title> | ||||
| <author> | ||||
| </author> | </author> | |||
| <author initials="A." surname="Brunstrom" fullname="Anna Brunstrom"> | ||||
| <date/> | ||||
| </front> | ||||
| </reference> | ||||
| <reference anchor="Is18"><front> | ||||
| <title>ctrlTCP: Reducing Latency through Coupled Heterogeneous Multi-Flow | ||||
| TCP Congestion Control</title> | ||||
| <author initials="S." surname="Islam" fullname="S. Islam"> | ||||
| </author> | </author> | |||
| <date year="2012"/> | ||||
| </front> | ||||
| <seriesInfo name="DOI" value="10.1109/ICC.2012.6364516"/> | ||||
| <refcontent>IEEE International Conference on Communications</refcontent> | ||||
| </reference> | ||||
| <author initials="M." surname="Welzl" fullname="M. Welzl"> | <reference anchor="IANA" target="https://www.iana.org/assignments/tcp-pa | |||
| rameters"> | ||||
| <front> | ||||
| <title>Transmission Control Protocol (TCP) Parameters</title> | ||||
| <author> | ||||
| <organization>IANA</organization> | ||||
| </author> | </author> | |||
| <date/> | ||||
| </front> | ||||
| </reference> | ||||
| <author initials="K." surname="Hiorth" fullname="K. Hiorth"> | <reference anchor="Is18"> | |||
| <front> | ||||
| <title>ctrlTCP: Reducing latency through coupled, heterogeneous | ||||
| multi-flow TCP congestion control</title> | ||||
| <author initials="S." surname="Islam" fullname="Safiqul Islam"> | ||||
| </author> | </author> | |||
| <author initials="M." surname="Welzl" fullname="Michael Welzl"> | ||||
| <author initials="D." surname="Hayes" fullname="D. Hayes"> | ||||
| </author> | </author> | |||
| <author initials="K." surname="Hiorth" fullname="Kristian Hiorth"> | ||||
| <author initials="G." surname="Armitage" fullname="G. Armitage"> | ||||
| </author> | </author> | |||
| <author initials="D." surname="Hayes" fullname="David Hayes"> | ||||
| <author initials="S." surname="Gjessing" fullname="S. Gjessing"> | ||||
| </author> | </author> | |||
| <author initials="G." surname="Armitage" fullname="Grenville Armitag | ||||
| <date month="April" year="2018"/> | e"> | |||
| </front> | ||||
| <seriesInfo name="Proc" value="IEEE INFOCOM Global Internet Symposium GI | ||||
| workshop"/> | ||||
| </reference> | ||||
| <reference anchor="Ja88"><front> | ||||
| <title>Congestion Avoidance and Control</title> | ||||
| <author initials="V." surname="Jacobson" fullname="V. Jacobson"> | ||||
| </author> | </author> | |||
| <author initials="S." surname="Gjessing" fullname="Stein Gjessing"> | ||||
| <author initials="M." surname="Karels" fullname="M. Karels"> | ||||
| </author> | </author> | |||
| <date month="April" year="2018"/> | ||||
| </front> | ||||
| <seriesInfo name="DOI" value="10.1109/INFCOMW.2018.8406887"/> | ||||
| <refcontent>IEEE INFOCOM 2018 - IEEE Conference on Computer | ||||
| Communications Workshops (INFOCOM WKSHPS)</refcontent> | ||||
| </reference> | ||||
| <date year="1988"/> | <reference anchor="Ja88"> | |||
| </front> | <front> | |||
| <title>Congestion Avoidance and Control</title> | ||||
| <seriesInfo name="Proc" value="Sigcomm"/> | <author initials="V." surname="Jacobson" fullname="Van Jacobson"> | |||
| </reference> | </author> | |||
| &RFC1644; | <author initials="M." surname="Karels" fullname="Michael Karels"> | |||
| &RFC1379; | </author> | |||
| &RFC2001; | <date month="November" year="1988"/> | |||
| &RFC2140; | </front> | |||
| &RFC2414; | <refcontent>SIGCOMM Symposium proceedings on Communications | |||
| &RFC2663; | architectures and protocols | |||
| &RFC3390; | </refcontent> | |||
| &RFC3124; | </reference> | |||
| &RFC4340; | ||||
| &RFC4960; | ||||
| &RFC5925; | ||||
| &RFC6437; | ||||
| &RFC6691; | ||||
| &RFC6928; | ||||
| &RFC7231; | ||||
| &RFC7323; | ||||
| &RFC7424; | ||||
| &RFC7540; | ||||
| &RFC7661; | ||||
| &RFC8684; | ||||
| </references> | ||||
| <section title="TCB Sharing History" anchor="sect-a"><t> | ||||
| T/TCP proposed using caches to maintain TCB information across | ||||
| instances (temporal sharing), e.g., smoothed RTT, RTT variation, | ||||
| congestion avoidance threshold, and MSS <xref target="RFC1644"/>. These value | ||||
| s were | ||||
| in addition to connection counts used by T/TCP to accelerate data | ||||
| delivery prior to the full three-way handshake during an OPEN. The | ||||
| goal was to aggregate TCB components where they reflect one | ||||
| association - that of the host-pair, rather than artificially | ||||
| separating those components by connection.</t> | ||||
| <t> | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
| FC.1644.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.1379.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.2001.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.2140.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.2414.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.2663.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.3390.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.3124.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.4340.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.4960.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.5925.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.6437.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.6691.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.6928.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.7231.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.7323.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.7424.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.7540.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.7661.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.8684.xml"/> | ||||
| </references> | ||||
| </references> | ||||
| <section anchor="sect-a" numbered="true" toc="default"> | ||||
| <name>TCB Sharing History</name> | ||||
| <t> | ||||
| T/TCP proposed using caches to maintain TCB information across instances | ||||
| (temporal sharing), e.g., smoothed RTT, RTT variation, congestion-avoidance | ||||
| threshold, and MSS <xref target="RFC1644" format="default"/>. These values | ||||
| were in addition to connection counts used by T/TCP to accelerate data | ||||
| delivery prior to the full three-way handshake during an OPEN. The goal was | ||||
| to aggregate TCB components where they reflect one association -- that of the | ||||
| host-pair rather than artificially separating those components by | ||||
| connection.</t> | ||||
| <t> | ||||
| At least one T/TCP implementation saved the MSS and aggregated the | At least one T/TCP implementation saved the MSS and aggregated the | |||
| RTT parameters across multiple connections but omitted caching the | RTT parameters across multiple connections but omitted caching the | |||
| congestion window information <xref target="Br94"/>, as originally specified | congestion window information <xref target="Br94" format="default"/>, as orig | |||
| in | inally specified in | |||
| <xref target="RFC1379"/>. Some T/TCP implementations immediately updated MSS | <xref target="RFC1379" format="default"/>. Some T/TCP implementations immedia | |||
| when | tely updated MSS when | |||
| the TCP MSS header option was received <xref target="Br94"/>, although this w | the TCP MSS header option was received <xref target="Br94" format="default"/> | |||
| as not | , although this was not | |||
| addressed specifically in the concepts or functional specification | addressed specifically in the concepts or functional specification | |||
| <xref target="RFC1379"/><xref target="RFC1644"/>. In later T/TCP implementations, RTT values were | <xref target="RFC1379" format="default"/> <xref target="RFC1644" format="default"/>. In later T/TCP implementations, RTT values were | |||
| updated only after a CLOSE, which does not benefit concurrent | updated only after a CLOSE, which does not benefit concurrent | |||
| sessions.</t> | sessions.</t> | |||
| <t> | ||||
| Temporal sharing of cached TCB data was originally implemented in the Sun | ||||
| OS 4.1.3 T/TCP extensions <xref target="Br94" format="default"/> and the | ||||
| FreeBSD port of same <xref target="FreeBSD" format="default"/>. As | ||||
| mentioned before, only the MSS and RTT parameters were cached, as originally | ||||
| specified in <xref target="RFC1379" format="default"/>. Later discussion of | ||||
| T/TCP suggested including congestion control parameters in this cache; for | ||||
| example, <xref target="RFC1644" sectionFormat="of" section="3.1" | ||||
| format="default"/> hints at initializing the congestion window to the old | ||||
| window size.</t> | ||||
| </section> | ||||
| <section anchor="sect-b" numbered="true" toc="default"> | ||||
| <t> | <name>TCP Option Sharing and Caching</name> | |||
| Temporal sharing of cached TCB data was originally implemented in | <t> | |||
| the SunOS 4.1.3 T/TCP extensions <xref target="Br94"/> and the FreeBSD port o | In addition to the options that can be cached and shared, this memo also | |||
| f same | lists known TCP options <xref target="IANA" format="default"/> for which | |||
| <xref target="FreeBSD"/>. As mentioned before, only the MSS and RTT parameter | state is unsafe to be kept. This list is not intended to be authoritative | |||
| s were | or exhaustive.</t> | |||
| cached, as originally specified in <xref target="RFC1379"/>. Later discussion | ||||
| of | ||||
| T/TCP suggested including congestion control parameters in this | ||||
| cache; for example, <xref target="RFC1644"/> (Section 3.1) hints at initializ | ||||
| ing | ||||
| the congestion window to the old window size.</t> | ||||
| </section> | ||||
| <section title="TCP Option Sharing and Caching" anchor="sect-b"><t> | ||||
| In addition to the options that can be cached and shared, this memo | ||||
| also lists known TCP options <xref target="IANA"/> for which state is unsafe | ||||
| to be | ||||
| kept. This list is not intended to be authoritative or exhaustive.</t> | ||||
| <figure><artwork><![CDATA[ | ||||
| Obsolete (unsafe to keep state): | ||||
| ECHO | ||||
| ECHO REPLY | <t>Obsolete (unsafe to keep state): | |||
| </t> | ||||
| <ul empty="true"> | ||||
| PO Conn permitted | <li>Echo | |||
| </li> | ||||
| PO service profile | <li>Echo Reply | |||
| </li> | ||||
| CC | <li>Partial Order Connection Permitted | |||
| </li> | ||||
| CC.NEW | <li>Partial Order Service Profile | |||
| </li> | ||||
| CC.ECHO | <li>CC | |||
| </li> | ||||
| Alt CS req | <li>CC.NEW | |||
| </li> | ||||
| Alt CS data | <li>CC.ECHO | |||
| </li> | ||||
| No state to keep: | <li>TCP Alternate Checksum Request | |||
| </li> | ||||
| EOL | <li>TCP Alternate Checksum Data | |||
| </li> | ||||
| NOP | </ul> | |||
| WS | <t>No state to keep: | |||
| </t> | ||||
| SACK | <ul empty="true"> | |||
| <li>End of Option List (EOL) | ||||
| </li> | ||||
| <li>No-Operation (NOP) | ||||
| </li> | ||||
| <li>Window Scale (WS) | ||||
| </li> | ||||
| <li>SACK | ||||
| </li> | ||||
| <li>Timestamps (TS) | ||||
| </li> | ||||
| <li>MD5 Signature Option | ||||
| </li> | ||||
| <li>TCP Authentication Option (TCP-AO) | ||||
| </li> | ||||
| <li>RFC3692-style Experiment 1 | ||||
| </li> | ||||
| <li>RFC3692-style Experiment 2 | ||||
| </li> | ||||
| </ul> | ||||
| TS | <t>Unsafe to keep state: | |||
| </t> | ||||
| MD5 | <ul empty="true"> | |||
| TCP-AO | <li>Skeeter (DH exchange, known to be vulnerable) | |||
| </li> | ||||
| EXP1 | <li>Bubba (DH exchange, known to be vulnerable) | |||
| </li> | ||||
| EXP2 | <li>Trailer Checksum Option | |||
| </li> | ||||
| Unsafe to keep state: | <li>SCPS capabilities | |||
| </li> | ||||
| Skeeter (DH exchange, known to be vulnerable) | <li>Selective Negative Acknowledgements (S-NACK) | |||
| </li> | ||||
| Bubba (DH exchange, known to be vulnerable) | <li>Records Boundaries | |||
| </li> | ||||
| Trailer CS | <li>Corruption experienced | |||
| </li> | ||||
| SCPS capabilities | <li>SNAP | |||
| </li> | ||||
| S-NACK | <li>TCP Compression Filter | |||
| </li> | ||||
| Records boundaries | <li>Quick-Start Response | |||
| </li> | ||||
| Corruption experienced | <li>User Timeout Option (UTO) | |||
| </li> | ||||
| SNAP | <li>Multipath TCP (MPTCP) negotiation success (see below for negotiation failure | |||
| ) | ||||
| </li> | ||||
| TCP Compression | <li>TCP Fast Open (TFO) negotiation success (see below for negotiation failure) | |||
| </li> | ||||
| Quickstart response | </ul> | |||
| UTO | <t>Safe but optional to keep state: | |||
| </t> | ||||
| MPTCP negotiation success (see below for negotiation failure) | <ul empty="true"> | |||
| <li>Multipath TCP (MPTCP) negotiation failure (to avoid negotiation retries) | ||||
| </li> | ||||
| TFO negotiation success (see below for negotiation failure) | <li>Maximum Segment Size (MSS) | |||
| </li> | ||||
| Safe but optional to keep state: | <li>TCP Fast Open (TFO) negotiation failure (to avoid negotiation retries) | |||
| </li> | ||||
| MPTCP negotiation failure (to avoid negotiation retries) | </ul> | |||
| MSS | <t>Safe and necessary to keep state: | |||
| </t> | ||||
| TFO negotiation failure (to avoid negotiation retries) | <ul empty="true"> | |||
| Safe and necessary to keep state: | <li>TCP Fast Open (TFO) Cookie (if TFO succeeded in the past) | |||
| </li> | ||||
| TFO cookie (if TFO succeeded in the past) | </ul> | |||
| ]]></artwork> | ||||
| </figure> | ||||
| </section> | ||||
| <section title="Automating the Initial Window in TCP over Long Timescales | </section> | |||
| " anchor="sect-c"><section title="Introduction" anchor="sect-c.1"><t> | <section anchor="sect-c" numbered="true" toc="default"> | |||
| <name>Automating the Initial Window in TCP over Long Timescales</name> | ||||
| <section anchor="sect-c.1" numbered="true" toc="default"> | ||||
| <name>Introduction</name> | ||||
| <t> | ||||
| Temporal sharing, as described earlier in this document, builds on | Temporal sharing, as described earlier in this document, builds on | |||
| the assumption that multiple consecutive connections between the | the assumption that multiple consecutive connections between the | |||
| same host pair are somewhat likely to be exposed to similar | same host-pair are somewhat likely to be exposed to similar | |||
| environment characteristics. The stored information can become less | environment characteristics. The stored information can become less | |||
| accurate over time and suitable precautions should take this ageing | accurate over time and suitable precautions should take this aging | |||
| into consideration (this is discussed further in section 8.1). | into consideration (this is discussed further in <xref target="sect-8.1"/>). | |||
| However, there are also cases where it can make sense to track these | However, there are also cases where it can make sense to track these | |||
| values over longer periods, observing properties of TCP connections | values over longer periods, observing properties of TCP connections | |||
| to gradually influence evolving trends in TCP parameters. This | to gradually influence evolving trends in TCP parameters. This | |||
| appendix describes an example of such a case.</t> | appendix describes an example of such a case.</t> | |||
| <t> | ||||
| <t> | ||||
| TCP's congestion control algorithm uses an initial window value | TCP's congestion control algorithm uses an initial window value | |||
| (IW), both as a starting point for new connections and as an upper | (IW) both as a starting point for new connections and as an upper | |||
| limit for restarting after an idle period <xref target="RFC5681"/><xref targe | limit for restarting after an idle period <xref target="RFC5681" format="defa | |||
| t="RFC7661"/>. This | ult"/> <xref target="RFC7661" format="default"/>. This | |||
| value has evolved over time, originally one maximum segment size | value has evolved over time; it was originally 1 maximum segment size | |||
| (MSS), and increased to the lesser of four MSS or 4,380 bytes | (MSS) and increased to the lesser of 4 MSSs or 4,380 bytes | |||
| <xref target="RFC3390"/><xref target="RFC5681"/>. For a typical Internet conn | <xref target="RFC3390" format="default"/> <xref target="RFC5681" format="defa | |||
| ection with a maximum | ult"/>. For a typical Internet connection with a maximum | |||
| transmission unit (MTU) of 1500 bytes, this permits three segments | transmission unit (MTU) of 1500 bytes, this permits 3 segments | |||
| of 1,460 bytes each.</t> | of 1,460 bytes each.</t> | |||
| <t> | ||||
| <t> | The IW value was originally implied in the original TCP congestion control | |||
| The IW value was originally implied in the original TCP congestion | description and documented as a standard in 1997 <xref target="RFC2001" | |||
| control description and documented as a standard in 1997 | format="default"/> <xref target="Ja88" format="default"/>. The value was | |||
| <xref target="RFC2001"/><xref target="Ja88"/>. The value was updated in 1998 | updated in 1998 experimentally and moved to the Standards Track in 2002 | |||
| experimentally and | <xref target="RFC2414" format="default"/> <xref target="RFC3390" | |||
| moved to the standards track in 2002 <xref target="RFC2414"/><xref target="RF | format="default"/>. In 2013, it was experimentally increased to 10 <xref | |||
| C3390"/>. In 2013, it | target="RFC6928" format="default"/>.</t> | |||
| was experimentally increased to 10 <xref target="RFC6928"/>.</t> | <t> | |||
| <t> | ||||
| This appendix discusses how TCP can objectively measure when an IW | This appendix discusses how TCP can objectively measure when an IW | |||
| is too large, and that such feedback should be used over long | is too large and that such feedback should be used over long | |||
| timescales to adjust the IW automatically. The result should be | timescales to adjust the IW automatically. The result should be | |||
| safer to deploy and might avoid the need to repeatedly revisit IW | safer to deploy and might avoid the need to repeatedly revisit IW | |||
| over time.</t> | over time.</t> | |||
| <t> | ||||
| <t> | ||||
| Note that this mechanism attempts to make the IW more adaptive over | Note that this mechanism attempts to make the IW more adaptive over | |||
| time. It can increase the IW beyond that which is currently | time. It can increase the IW beyond that which is currently | |||
| recommended for widescale deployment, and so its use should be | recommended for wide-scale deployment, so its use should be | |||
| carefully monitored.</t> | carefully monitored.</t> | |||
| </section> | ||||
| </section> | <section anchor="sect-c.2" numbered="true" toc="default"> | |||
| <name>Design Considerations</name> | ||||
| <section title="Design Considerations" anchor="sect-c.2"><t> | <t> | |||
| TCP's IW value has existed statically for over two decades, so any | TCP's IW value has existed statically for over two decades, so any | |||
| solution to adjusting the IW dynamically should have similarly | solution to adjusting the IW dynamically should have similarly | |||
| stable, non-invasive effects on the performance and complexity of | stable, non-invasive effects on the performance and complexity of | |||
| TCP. In order to be fair, the IW should be similar for most machines | TCP. In order to be fair, the IW should be similar for most machines | |||
| on the public Internet. Finally, a desirable goal is to develop a | on the public Internet. Finally, a desirable goal is to develop a | |||
| self-correcting algorithm, so that IW values that cause network | self-correcting algorithm so that IW values that cause network | |||
| problems can be avoided. To that end, we propose the following | problems can be avoided. To that end, we propose the following | |||
| design goals:</t> | design goals:</t> | |||
| <ul spacing="normal"> | ||||
| <t><list style="symbols"><t>Impart little to no impact to TCP in the abse | <li>Impart little to no impact to TCP in the absence of loss, i.e., | |||
| nce of loss, i.e., | ||||
| it should not increase the complexity of default packet | it should not increase the complexity of default packet | |||
| processing in the normal case.</t> | processing in the normal case.</li> | |||
| <li>Adapt to network feedback over long timescales, avoiding values | ||||
| <t>Adapt to network feedback over long timescales, avoiding values | that persistently cause network problems.</li> | |||
| that persistently cause network problems.</t> | <li>Decrease the IW in the presence of sustained loss of IW segments, | |||
| as determined over a number of different connections.</li> | ||||
| <t>Decrease the IW in the presence of sustained loss of IW segments, | <li>Increase the IW in the absence of sustained loss of IW segments, | |||
| as determined over a number of different connections.</t> | as determined over a number of different connections.</li> | |||
| <li>Operate conservatively, i.e., tend towards leaving the IW the | ||||
| <t>Increase the IW in the absence of sustained loss of IW segments, | ||||
| as determined over a number of different connections.</t> | ||||
| <t>Operate conservatively, i.e., tend towards leaving the IW the | ||||
| same in the absence of sufficient information, and give greater | same in the absence of sufficient information, and give greater | |||
| consideration to IW segment loss than IW segment success.</t> | consideration to IW segment loss than IW segment success.</li> | |||
| </ul> | ||||
| </list> | <t> | |||
| </t> | ||||
| <t> | ||||
| We expect that, without other context, a good IW algorithm will | We expect that, without other context, a good IW algorithm will | |||
| converge to a single value, but this is not required. An endpoint | converge to a single value, but this is not required. An endpoint | |||
| with additional context or information, or deployed in a constrained | with additional context or information, or deployed in a constrained | |||
| environment, can always use a different value. In particular, | environment, can always use a different value. In particular, | |||
| information from previous connections, or sets of connections with a | information from previous connections, or sets of connections with a | |||
| similar path, can already be used as context for such decisions (as | similar path, can already be used as context for such decisions (as | |||
| noted in the core of this document).</t> | noted in the core of this document).</t> | |||
| <t> | ||||
| <t> | ||||
| However, if a given IW value persistently causes packet loss during | However, if a given IW value persistently causes packet loss during | |||
| the initial burst of packets, it is clearly inappropriate and could | the initial burst of packets, it is clearly inappropriate and could | |||
| be inducing unnecessary loss in other competing connections. This | be inducing unnecessary loss in other competing connections. This | |||
| might happen for sites behind very slow boxes with small buffers, | might happen for sites behind very slow boxes with small buffers, | |||
| which may or may not be the first hop.</t> | which may or may not be the first hop.</t> | |||
| </section> | ||||
| </section> | <section anchor="sect-c.3" numbered="true" toc="default"> | |||
| <name>Proposed IW Algorithm</name> | ||||
| <section title="Proposed IW Algorithm" anchor="sect-c.3"><t> | <t> | |||
| Below is a simple description of the proposed IW algorithm. It | Below is a simple description of the proposed IW algorithm. It | |||
| relies on the following parameters:</t> | relies on the following parameters:</t> | |||
| <ul spacing="normal"> | ||||
| <t><list style="symbols"><t>MinIW = 3 MSS or 4,380 bytes (as per <xref ta | <li>MinIW = 3 MSS or 4,380 bytes (as per <xref target="RFC3390" format | |||
| rget="RFC3390"/>)</t> | ="default"/>)</li> | |||
| <li>MaxIW = 10 MSS (as per <xref target="RFC6928" format="default"/>)< | ||||
| <t>MaxIW = 10 MSS (as per <xref target="RFC6928"/>)</t> | /li> | |||
| <li>MulDecr = 0.5</li> | ||||
| <t>MulDecr = 0.5</t> | <li>AddIncr = 2 MSS</li> | |||
| <li>Threshold = 0.05</li> | ||||
| <t>AddIncr = 2 MSS</t> | </ul> | |||
| <t> | ||||
| <t>Threshold = 0.05</t> | ||||
| </list> | ||||
| </t> | ||||
| <t> | ||||
| We assume that the minimum IW (MinIW) should be as currently specified as | We assume that the minimum IW (MinIW) should be as currently specified as | |||
| standard <xref target="RFC3390"/>. The maximum IW can be set to a fixed | standard <xref target="RFC3390" format="default"/>. The maximum IW (MaxIW) ca | |||
| value (we suggest using the experimental and now somewhat de- facto | n be | |||
| standard in <xref target="RFC6928"/>) or set based on a schedule if trusted | set to a fixed value (we suggest using the experimental and now somewhat de | |||
| time references are available <xref | facto standard in <xref target="RFC6928" format="default"/>) or set based | |||
| target="I-D.allman-tcpm-bump-initcwnd"/>; here we prefer a fixed value. We | on a schedule if trusted time references are available <xref | |||
| also propose to use an AIMD algorithm, with increases and decreases as | target="I-D.allman-tcpm-bump-initcwnd" format="default"/>; here, we prefer | |||
| noted.</t> | a fixed value. We also propose to use an Additive Increase Multiplicative | |||
| Decrease (AIMD) algorithm, with increases and decreases as noted.</t> | ||||
| <t> | <t> | |||
| Although these parameters are somewhat arbitrary, their initial | Although these parameters are somewhat arbitrary, their initial | |||
| values are not important except that the algorithm is AIMD and the | values are not important except that the algorithm is AIMD and the | |||
| MaxIW should not exceed that recommended for other systems on the | MaxIW should not exceed that recommended for other systems on the | |||
| Internet (here we selected the current de-facto standard rather than | Internet (here, we selected the current de facto standard rather than | |||
| the actual standard). Current proposals, including default current | the actual standard). Current proposals, including default current | |||
| operation, are degenerate cases of the algorithm below for given | operation, are degenerate cases of the algorithm below for given | |||
| parameters - notably MulDecr = 1.0 and AddIncr = 0 MSS, thus | parameters, notably MulDecr = 1.0 and AddIncr = 0 MSS, thus | |||
| disabling the automatic part of the algorithm.</t> | disabling the automatic part of the algorithm.</t> | |||
| <t> | ||||
| <t> | ||||
| The proposed algorithm is as follows:</t> | The proposed algorithm is as follows:</t> | |||
| <figure><artwork><![CDATA[ | <ol> | |||
| 1. On boot: | ||||
| IW = MaxIW; # assume this is in bytes, and indicates an integer | ||||
| multiple of 2 MSS (an even number to support ACK compression) | ||||
| 2. Upon starting a new connection: | ||||
| CWND = IW; | <li> | |||
| conncount++; | <t>On boot:</t> | |||
| IWnotchecked = 1; # true | <sourcecode type="pseudocode"> | |||
| IW = MaxIW; # assume this is in bytes and indicates an integer | ||||
| # multiple of 2 MSS (an even number to support | ||||
| # ACK compression) | ||||
| </sourcecode> | ||||
| </li> | ||||
| 3. During a connection's SYN-ACK processing, if SYN-ACK includes ECN | <li><t>Upon starting a new connection:</t> | |||
| (as similarly addressed in Sec 5 of ECN++ for TCP [Ba20]), treat | <sourcecode type="pseudocode"> | |||
| as if the IW is too large: | CWND = IW; | |||
| conncount++; | ||||
| IWnotchecked = 1; # true | ||||
| </sourcecode> | ||||
| </li> | ||||
| if (IWnotchecked && (synackecn == 1)) { | <li> | |||
| losscount++; | <t>During a connection's SYN-ACK processing, if SYN-ACK includes ECN (as | |||
| IWnotchecked = 0; # never check again | similarly addressed in Section 5 of ECN++ for TCP <xref | |||
| } | target="I-D.ietf-tcpm-generalized-ecn"/>), treat as if the IW is too large: | |||
| </t> | ||||
| <sourcecode type="pseudocode"> | ||||
| if (IWnotchecked && (synackecn == 1)) { | ||||
| losscount++; | ||||
| IWnotchecked = 0; # never check again | ||||
| } | ||||
| </sourcecode> | ||||
| </li> | ||||
| 4. During a connection, if retransmission occurs, check the seqno of | <li><t>During a connection, if retransmission occurs, check the seqno of the | |||
| the outgoing packet (in bytes) to see if the resent segment fixes | outgoing packet (in bytes) to see if the re-sent segment fixes an IW loss:</t> | |||
| an IW loss: | <sourcecode type="pseudocode"> | |||
| if (Retransmitting && IWnotchecked && ((seqno - ISN) < IW) | ||||
| ) { | ||||
| losscount++; | ||||
| IWnotchecked = 0; # never do this entire "if" again | ||||
| } else { | ||||
| IWnotchecked = 0; # you're beyond the IW so stop checking | ||||
| } | ||||
| </sourcecode> | ||||
| </li> | ||||
| if (Retransmitting && IWnotchecked && ((seqno - ISN) < IW)) { | <li> | |||
| losscount++; | <t>Once every 1000 connections, as a separate process (i.e., not as part of | |||
| IWnotchecked = 0; # never do this entire "if" again | processing a given connection): | |||
| </t> | ||||
| <sourcecode type="pseudocode"> | ||||
| if (conncount > 1000) { | ||||
| if (losscount/conncount > Threshold) { | ||||
| # the number of connections with errors is too high | ||||
| IW = IW * MulDecr; | ||||
| } else { | } else { | |||
| IWnotchecked = 0; # you're beyond the IW so stop checking | IW = IW + AddIncr; | |||
| } | } | |||
| } | ||||
| </sourcecode> | ||||
| </li> | ||||
| 5. Once every 1000 connections, as a separate process (i.e., not as | </ol> | |||
| part of processing a given connection): | ||||
| if (conncount > 1000) { | ||||
| if (losscount/conncount > Threshold) { | ||||
| # the number of connections with errors is too high | ||||
| IW = IW * MulDecr; | ||||
| } else { | ||||
| IW = IW + AddIncr; | ||||
| } | ||||
| } | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| <t> | ||||
| As presented, this algorithm can yield a false positive when the | ||||
| sequence number wraps around, e.g., the code might increment | ||||
| losscount in step 4 when no loss occurred or fail to increment | ||||
| losscount when a loss did occur. This can be avoided using either | ||||
| PAWS <xref target="RFC7323"/> context or internal extended sequence number | ||||
| representations (as in TCP-AO <xref target="RFC5925"/>). Alternately, false | ||||
| positives can be tolerated because they are expected to be | ||||
| infrequent and thus will not significantly impact the algorithm.</t> | ||||
| <t> | <t> | |||
| As presented, this algorithm can yield a false positive when the sequence | ||||
| number wraps around, e.g., the code might increment losscount in step 4 | ||||
| when no loss occurred or fail to increment losscount when a loss did | ||||
| occur. This can be avoided using either Protection Against Wrapped | ||||
| Sequences (PAWS) <xref target="RFC7323" format="default"/> context or | ||||
| internal extended sequence number representations (as in TCP Authentication | ||||
| Option (TCP-AO) <xref target="RFC5925" format="default"/>). Alternately, | ||||
| false positives can be tolerated because they are expected to be infrequent | ||||
| and thus will not significantly impact the algorithm.</t> | ||||
| <t> | ||||
| A number of additional constraints need to be imposed if this | A number of additional constraints need to be imposed if this | |||
| mechanism is implemented to ensure that it defaults to values that | mechanism is implemented to ensure that it defaults to values that | |||
| comply with current Internet standards, is conservative in how it | comply with current Internet standards, is conservative in how it | |||
| extends those values, and returns to those values in the absence of | extends those values, and returns to those values in the absence of | |||
| positive feedback (i.e., success). To that end, we recommend the | positive feedback (i.e., success). To that end, we recommend the | |||
| following list of example constraints:</t> | following list of example constraints:</t> | |||
| <t> | <ul> | |||
| >> The automatic IW algorithm MUST initialize MaxIW to a value no | <li> <t> The automatic IW algorithm <bcp14>MUST</bcp14> initialize MaxIW to a | |||
| larger than the currently recommended Internet default, in the | value no larger than the currently recommended Internet default in the | |||
| absence of other context information.</t> | absence of other context information.</t> | |||
| <t> | ||||
| <t> | ||||
| Thus, if there are too few connections to make a decision or if | Thus, if there are too few connections to make a decision or if | |||
| there is otherwise insufficient information to increase the IW, then | there is otherwise insufficient information to increase the IW, then | |||
| the MaxIW defaults to the current recommended value.</t> | the MaxIW defaults to the current recommended value.</t></li> | |||
| <t> | <li> <t> | |||
| >> An implementation MAY allow the MaxIW to grow beyond the | An implementation <bcp14>MAY</bcp14> allow the MaxIW to grow beyond the | |||
| currently recommended Internet default, but not more than 2 segments | currently recommended Internet default but not more than 2 segments | |||
| per calendar year.</t> | per calendar year.</t> | |||
| <t> | ||||
| <t> | Thus, if an endpoint has a persistent history of successfully transmitting | |||
| Thus, if an endpoint has a persistent history of successfully | IW segments without loss, then it is allowed to probe the Internet to | |||
| transmitting IW segments without loss, then it is allowed to probe | determine if larger IW values have similar success. This probing is | |||
| the Internet to determine if larger IW values have similar success. | limited and requires a trusted time source; otherwise, the MaxIW remains | |||
| This probing is limited and requires a trusted time source, | constant.</t></li> | |||
| otherwise the MaxIW remains constant.</t> | <li> | |||
| <t> | ||||
| <t> | An implementation <bcp14>MUST</bcp14> adjust the IW based on loss statistics | |||
| >> An implementation MUST adjust the IW based on loss statistics at | at | |||
| least once every 1000 connections.</t> | least once every 1000 connections.</t> | |||
| <t> | ||||
| <t> | ||||
| An endpoint needs to be sufficiently reactive to IW loss.</t> | An endpoint needs to be sufficiently reactive to IW loss.</t> | |||
| </li> | ||||
| <t> | <li> <t> | |||
| >> An implementation MUST decrease the IW by at least one MSS when | An implementation <bcp14>MUST</bcp14> decrease the IW by at least 1 MSS when | |||
| indicated during an evaluation interval.</t> | indicated during an evaluation interval.</t> | |||
| <t> | ||||
| <t> | ||||
| An endpoint that detects loss needs to decrease its IW by at least | An endpoint that detects loss needs to decrease its IW by at least | |||
| one MSS, otherwise it is not participating in an automatic reactive | 1 MSS; otherwise, it is not participating in an automatic reactive | |||
| algorithm.</t> | algorithm.</t></li> | |||
| <li> | ||||
| <t> | <t> | |||
| >> An implementation MUST increase by no more than 2 MSS per | An implementation <bcp14>MUST</bcp14> increase by no more than 2 MSSs per | |||
| evaluation interval.</t> | evaluation interval.</t> | |||
| <t> | ||||
| <t> | ||||
| An endpoint that does not experience IW loss needs to probe the | An endpoint that does not experience IW loss needs to probe the | |||
| network incrementally.</t> | network incrementally.</t> | |||
| </li> | ||||
| <t> | <li> | |||
| >> An implementation SHOULD use an IW that is an integer multiple of | <t> | |||
| 2 MSS.</t> | An implementation <bcp14>SHOULD</bcp14> use an IW that is an integer multiple | |||
| of | ||||
| <t> | 2 MSSs.</t> | |||
| The IW should remain a multiple of 2 MSS segments, to enable | <t> | |||
| The IW should remain a multiple of 2 MSS segments to enable | ||||
| efficient ACK compression without incurring unnecessary timeouts.</t> | efficient ACK compression without incurring unnecessary timeouts.</t> | |||
| </li> | ||||
| <t> | <li> <t> | |||
| >> An implementation MUST decrease the IW if more than 95% of | An implementation <bcp14>MUST</bcp14> decrease the IW if more than 95% of | |||
| connections have IW losses.</t> | connections have IW losses.</t> | |||
| <t> | ||||
| Again, this is to ensure an implementation is sufficiently reactive.</t></li> | ||||
| <t> | <li | |||
| Again, this is to ensure an implementation is sufficiently reactive.</t> | > <t> | |||
| An implementation <bcp14>MAY</bcp14> group IW values and statistics within | ||||
| <t> | subsets of connections. Such grouping <bcp14>MAY</bcp14> use any information | |||
| >> An implementation MAY group IW values and statistics within | about | |||
| subsets of connections. Such grouping MAY use any information about | ||||
| connections to form groups except loss statistics.</t> | connections to form groups except loss statistics.</t> | |||
| </li> | ||||
| <t> | </ul> | |||
| There are some TCP connections which might not be counted at all, | <t> | |||
| such as those to/from loopback addresses, or those within the same | There are some TCP connections that might not be counted at all, | |||
| such as those to/from loopback addresses or those within the same | ||||
| subnet as that of a local interface (for which congestion control is | subnet as that of a local interface (for which congestion control is | |||
| sometimes disabled anyway). This may also include connections that | sometimes disabled anyway). This may also include connections that | |||
| terminate before the IW is full, i.e., as a separate check at the | terminate before the IW is full, i.e., as a separate check at the | |||
| time of the connection closing.</t> | time of the connection closing.</t> | |||
| <t> | ||||
| <t> | The period over which the IW is updated is intended to be a long timescale, | |||
| The period over which the IW is updated is intended to be a long | e.g., a month or so, or 1,000 connections, whichever is longer. An | |||
| timescale, e.g., a month or so, or 1,000 connections, whichever is | implementation might check the IW once a month and simply not update the IW | |||
| longer. An implementation might check the IW once a month, and | or clear the connection counts in months where the number of connections is | |||
| simply not update the IW or clear the connection counts in months | too small.</t> | |||
| where the number of connections is too small.</t> | </section> | |||
| <section anchor="sect-c.4" numbered="true" toc="default"> | ||||
| </section> | <name>Discussion</name> | |||
| <t> | ||||
| <section title="Discussion" anchor="sect-c.4"><t> | ||||
| There are numerous parameters to the above algorithm that are | There are numerous parameters to the above algorithm that are | |||
| compliant with the given requirements; this is intended to allow | compliant with the given requirements; this is intended to allow | |||
| variation in configuration and implementation while ensuring that | variation in configuration and implementation while ensuring that | |||
| all such algorithms are reactive and safe.</t> | all such algorithms are reactive and safe.</t> | |||
| <t> | ||||
| <t> | ||||
| This algorithm continues to assume segments because that is the | This algorithm continues to assume segments because that is the | |||
| basis of most TCP implementations. It might be useful to consider | basis of most TCP implementations. It might be useful to consider | |||
| revising the specifications to allow byte-based congestion control given | revising the specifications to allow byte-based congestion control given | |||
| sufficient experience.</t> | sufficient experience.</t> | |||
| <t> | ||||
| <t> | ||||
| The algorithm checks for IW losses only during the first IW after a | The algorithm checks for IW losses only during the first IW after a | |||
| connection start; it does not check for IW losses at other points where the IW | connection start; it does not check for IW losses at other points where the IW | |||
| is used, e.g., during slow-start restarts.</t> | is used, e.g., during slow-start restarts.</t> | |||
| <t> | <ul> | |||
| >> An implementation MAY detect IW losses during slow-start restarts | <li> <t> An implementation <bcp14>MAY</bcp14> detect IW losses during | |||
| in addition to losses during the first IW of a connection. In this | slow-start restarts in addition to losses during the first IW of a | |||
| case, the implementation MUST count each restart as a "connection" | connection. In this case, the implementation <bcp14>MUST</bcp14> count | |||
| for the purposes of connection counts and periodic rechecking of the | each restart as a "connection" for the purposes of connection counts and | |||
| IW value.</t> | periodic rechecking of the IW value.</t> | |||
| </li> | ||||
| <t> | </ul> | |||
| <t> | ||||
| False positives can occur during some kinds of segment reordering, | False positives can occur during some kinds of segment reordering, | |||
| e.g., reordering that might trigger spurious retransmissions even without a | e.g., reordering that might trigger spurious retransmissions even without a | |||
| true segment loss. These are not expected to be sufficiently common | true segment loss. These are not expected to be sufficiently common | |||
| to dominate the algorithm and its conclusions.</t> | to dominate the algorithm and its conclusions.</t> | |||
| <t> | <t> | |||
| This mechanism does require additional per-connection state, which | This mechanism does require additional per-connection state, which is | |||
| is currently common in some implementations, and is useful for other | currently common in some implementations and is useful for other reasons | |||
| reasons (e.g., the ISN is used in TCP-AO <xref target="RFC5925"/>). The mecha | (e.g., the ISN is used in TCP-AO <xref target="RFC5925" | |||
| nism | format="default"/>). | |||
| also benefits from persistent state kept across reboots, as would be | ||||
| other state sharing mechanisms (e.g., TCP Control Block Sharing per | ||||
| the main body of this document).</t> | ||||
| <t> | The mechanism in this appendix also benefits from persistent state kept across | |||
| reboots, which would also be useful to other state sharing mechanisms (e.g., | ||||
| TCP Control Block Sharing per the main body of this document). | ||||
| </t> | ||||
| <t> | ||||
| The receive window (rwnd) is not involved in this calculation. The | The receive window (rwnd) is not involved in this calculation. The | |||
| size of rwnd is determined by receiver resources and provides space | size of rwnd is determined by receiver resources and provides space | |||
| to accommodate segment reordering. It is not involved with | to accommodate segment reordering. | |||
| congestion control, which is the focus of this document and its | ||||
| management of the IW.</t> | ||||
| </section> | Also, rwnd is not involved with congestion control, which is the focus of the wa | |||
| y | ||||
| this appendix manages the IW. | ||||
| <section title="Observations" anchor="sect-c.5"><t> | </t> | |||
| The IW may not converge to a single, global value. It also may not | </section> | |||
| converge at all, but rather may oscillate by a few MSS as it | <section anchor="sect-c.5" numbered="true" toc="default"> | |||
| <name>Observations</name> | ||||
| <t> | ||||
| The IW may not converge to a single global value. It also may not | ||||
| converge at all but rather may oscillate by a few MSSs as it | ||||
| repeatedly probes the Internet for larger IWs and fails. Both | repeatedly probes the Internet for larger IWs and fails. Both | |||
| properties are consistent with TCP behavior during each individual | properties are consistent with TCP behavior during each individual | |||
| connection.</t> | connection.</t> | |||
| <t> | ||||
| <t> | ||||
| This mechanism assumes that losses during the IW are due to IW size. | This mechanism assumes that losses during the IW are due to IW size. | |||
| Persistent errors that drop packets for other reasons - e.g., OS | Persistent errors that drop packets for other reasons, e.g., OS | |||
| bugs, can cause false positives. Again, this is consistent with | bugs, can cause false positives. Again, this is consistent with | |||
| TCP's basic assumption that loss is caused by congestion and | TCP's basic assumption that loss is caused by congestion and | |||
| requires backoff. This algorithm treats the IW of new connections as | requires backoff. This algorithm treats the IW of new connections as | |||
| a long-timescale backoff system.</t> | a long-timescale backoff system.</t> | |||
| </section> | ||||
| </section> | </section> | |||
| <section numbered="false" anchor="acknowledgments" toc="default"> | ||||
| </section> | <name>Acknowledgments</name> | |||
| <t> | ||||
| <section title="Acknowledgments" numbered="no" anchor="acknowledgments">< | The authors would like to thank <contact fullname="Praveen | |||
| t> | Balasubramanian"/> for information regarding TCB sharing in Windows; | |||
| The authors would like to thank for Praveen Balasubramanian for | <contact fullname="Christoph Paasch"/> for information regarding TCB | |||
| information regarding TCB sharing in Windows, Christoph Paasch for | sharing in Apple OSs; <contact fullname="Yuchung Cheng"/>, <contact | |||
| information regarding TCB sharing in Apple OSes, and Yuchung Cheng, | fullname="Lars Eggert"/>, <contact fullname="Ilpo Jarvinen"/>, and <contact | |||
| Lars Eggert, Ilpo Jarvinen and Michael Scharf for comments on | fullname="Michael Scharf"/> for comments on earlier draft versions of this | |||
| earlier versions of the draft, as well as members of the TCPM WG. | document; as well as members of the TCPM WG. Earlier revisions of this | |||
| Earlier revisions of this work received funding from a collaborative | work received funding from a collaborative research project between the | |||
| research project between the University of Oslo and Huawei | University of Oslo and Huawei Technologies Co., Ltd. and were partly | |||
| Technologies Co., Ltd. and were partly supported by USC/ISI's Postel | supported by USC/ISI's Postel Center.</t> | |||
| Center.</t> | <t> | |||
| <t> | ||||
| This document was prepared using 2-Word-v2.0.template.dot.</t> | This document was prepared using 2-Word-v2.0.template.dot.</t> | |||
| </section> | ||||
| </back> | ||||
| </section> | </rfc> | |||
| </back> | ||||
| </rfc> | ||||
| End of changes. 247 change blocks. | ||||
| 1365 lines changed or deleted | 1653 lines changed or added | |||
This html diff was produced by rfcdiff 1.48. The latest version is available from http://tools.ietf.org/tools/rfcdiff/ | ||||