rfc9040xml2.original.xml   rfc9040.xml 
<?xml version='1.0' encoding='utf-8'?> <?xml version="1.0" encoding="UTF-8"?>
<!-- [rfced] Change log section removed from draft-ietf-tcpm-2140bis-11-manual.txt -->
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
<!ENTITY RFC0793 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.0793.xml">
<!ENTITY RFC1122 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1122.xml">
<!ENTITY RFC1191 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1191.xml">
<!ENTITY RFC2119 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2119.xml">
<!ENTITY RFC4821 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4821.xml">
<!ENTITY RFC5681 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.5681.xml">
<!ENTITY RFC6298 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6298.xml">
<!ENTITY RFC7413 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7413.xml">
<!ENTITY RFC8174 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8174.xml">
<!ENTITY RFC8201 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8201.xml">
<!ENTITY I-D.allman-tcpm-bump-initcwnd SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml3/reference.I-D.draft-allman-tcpm-bump-initcwnd-00.xml">
<!ENTITY I-D.ietf-tcpm-generalized-ecn SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml3/reference.I-D.draft-ietf-tcpm-generalized-ecn-07.xml">
<!ENTITY I-D.hughes-restart SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml3/reference.I-D.draft-hughes-restart-00.xml">
<!ENTITY RFC1644 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1644.xml">
<!ENTITY RFC1379 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1379.xml">
<!ENTITY RFC2001 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2001.xml">
<!ENTITY RFC2140 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2140.xml">
<!ENTITY RFC2414 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2414.xml">
<!ENTITY RFC2663 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2663.xml">
<!ENTITY RFC3390 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.3390.xml">
<!ENTITY RFC3124 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.3124.xml">
<!ENTITY RFC4340 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4340.xml">
<!ENTITY RFC4960 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4960.xml">
<!ENTITY RFC5925 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.5925.xml">
<!ENTITY RFC6437 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6437.xml">
<!ENTITY RFC6691 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6691.xml">
<!ENTITY RFC6928 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6928.xml">
<!ENTITY RFC7231 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7231.xml">
<!ENTITY RFC7323 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7323.xml">
<!ENTITY RFC7424 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7424.xml">
<!ENTITY RFC7540 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7540.xml">
<!ENTITY RFC7661 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7661.xml">
<!ENTITY RFC8684 SYSTEM "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8684.xml">
]>
<rfc submissionType="IETF" docName="draft-ietf-tcpm-2140bis-11" category="info"
obsoletes="2140" ipr="trust200902">
<!-- Generated by id2xml 1.5.0 on 2021-05-03T23:46:00Z -->
<?rfc strict="yes"?>
<?rfc compact="yes"?>
<?rfc subcompact="no"?>
<?rfc symrefs="yes"?>
<?rfc sortrefs="no"?>
<?rfc text-list-symbols="o*+-"?>
<?rfc toc="yes"?>
<front>
<title>TCP Control Block Interdependence</title>
<author initials="J." surname="Touch" fullname="Joe Touch">
<organization abbrev="Independent"></organization>
<address>
<postal>
<street/>
<city>Manhattan Beach</city>
<region>CA</region>
<code>90266</code>
<country>United States of America</country>
</postal>
<phone>+1 (310) 560-0334</phone>
<email>touch@strayalpha.com</email>
</address>
</author>
<author initials="M." surname="Welzl" fullname="Michael Welzl">
<organization>University of Oslo</organization>
<address>
<postal>
<street>PO Box 1080 Blindern</street>
<city>Oslo</city>
<region/>
<code>N-0316</code>
<country>Norway</country>
</postal>
<phone>+47 22 85 24 20</phone>
<email>michawe@ifi.uio.no</email>
</address>
</author>
<author initials="S." surname="Islam" fullname="Safiqul Islam">
<organization>University of Oslo</organization>
<address><postal><street>PO Box 1080 Blindern</street>
<street>Oslo N-0316</street>
<street>Norway</street>
</postal>
<phone>+47 22 84 08 37</phone>
<email>safiquli@ifi.uio.no</email>
</address>
</author>
<date year="2021" month="May"/> <!DOCTYPE rfc SYSTEM "rfc2629-xhtml.ent">
<workgroup>TCPM WG</workgroup>
<!-- [rfced] Please insert any keywords (beyond those that appear in <rfc xmlns:xi="http://www.w3.org/2001/XInclude" docName="draft-ietf-tcpm-2140bis
the title) for use on https://www.rfc-editor.org/search. --> -11"
number="9040" submissionType="IETF" category="info" consensus="true" obsoletes="
2140"
ipr="trust200902" updates="" xml:lang="en" symRefs="true" sortRefs="true" tocInc
lude="true"
version="3">
<keyword>example</keyword> <front>
<title>TCP Control Block Interdependence</title>
<seriesInfo name="RFC" value="9040"/>
<author initials="J." surname="Touch" fullname="Joe Touch">
<organization abbrev="Independent"/>
<address>
<postal>
<street/>
<city>Manhattan Beach</city>
<region>CA</region>
<code>90266</code>
<country>United States of America</country>
</postal>
<phone>+1 (310) 560-0334</phone>
<email>touch@strayalpha.com</email>
</address>
</author>
<author initials="M." surname="Welzl" fullname="Michael Welzl">
<organization>University of Oslo</organization>
<address>
<postal>
<street>PO Box 1080 Blindern</street>
<city>Oslo</city>
<region/>
<code>N-0316</code>
<country>Norway</country>
</postal>
<phone>+47 22 85 24 20</phone>
<email>michawe@ifi.uio.no</email>
</address>
</author>
<author initials="S." surname="Islam" fullname="Safiqul Islam">
<organization>University of Oslo</organization>
<address>
<postal>
<street>PO Box 1080 Blindern</street>
<street>Oslo N-0316</street>
<street>Norway</street>
</postal>
<phone>+47 22 84 08 37</phone>
<email>safiquli@ifi.uio.no</email>
</address>
</author>
<date year="2021" month="July"/>
<workgroup>TCPM WG</workgroup>
<abstract><t> <abstract>
<t>
This memo provides guidance to TCP implementers that is intended to This memo provides guidance to TCP implementers that is intended to
help improve connection convergence to steady-state operation help improve connection convergence to steady-state operation
without affecting interoperability. It updates and replaces RFC without affecting interoperability. It updates and replaces RFC
2140's description of sharing TCP state, as typically represented in 2140's description of sharing TCP state, as typically represented in
TCP Control Blocks, among similar concurrent or consecutive TCP Control Blocks, among similar concurrent or consecutive
connections.</t> connections.</t>
</abstract>
</front>
<middle>
<section anchor="sect-1" numbered="true" toc="default">
<name>Introduction</name>
</abstract> <t>
</front> TCP is a connection-oriented reliable transport protocol layered over IP
<xref target="RFC0793" format="default"/>. Each TCP connection maintains
<middle> state, usually in a data structure called the "TCP Control Block (TCB)". The
<section title="Introduction" anchor="sect-1"><t> TCB contains information about the connection state, its associated local
TCP is a connection-oriented reliable transport protocol layered
over IP <xref target="RFC0793"/>. Each TCP connection maintains state, usually in a
data structure called the TCP Control Block (TCB). The TCB contains
information about the connection state, its associated local
process, and feedback parameters about the connection's transmission process, and feedback parameters about the connection's transmission
properties. As originally specified and usually implemented, most properties. As originally specified and usually implemented, most TCB
TCB information is maintained on a per-connection basis. Some information is maintained on a per-connection basis. Some implementations
implementations share certain TCB information across connections to share certain TCB information across connections to the same host <xref
the same host <xref target="RFC2140"/>. Such sharing is intended to lead to b target="RFC2140" format="default"/>. Such sharing is intended to lead to
etter better overall transient performance, especially for numerous short-lived
overall transient performance, especially for numerous short-lived and simultaneous connections, as can be used in the World Wide Web and
and simultaneous connections, as can be used in the World-Wide Web other applications <xref target="Be94" format="default"/> <xref
and other applications <xref target="Be94"/><xref target="Br02"/>. This shari target="Br02" format="default"/>. This sharing of state is intended to help
ng of state is TCP connections converge to long-term behavior (assuming stable application
intended to help TCP connections converge to long term behavior load, i.e., so-called "steady-state") more quickly without affecting TCP
(assuming stable application load, i.e., so-called "steady-state") interoperability.</t>
more quickly without affecting TCP interoperability.</t>
<t> <t>
This document updates RFC 2140's discussion of TCB state sharing and This document updates RFC 2140's discussion of TCB state sharing and
provides a complete replacement for that document. This state provides a complete replacement for that document. This state sharing
sharing affects only TCB initialization <xref target="RFC2140"/> and thus has affects only TCB initialization <xref target="RFC2140" format="default"/>
no and thus has no effect on the long-term behavior of TCP after a connection
effect on the long-term behavior of TCP after a connection has been has been established or on interoperability. Path information shared
established nor on interoperability. Path information shared across across SYN destination port numbers assumes that TCP segments having the
SYN destination port numbers assumes that TCP segments having the same host-pair experience the same path properties, i.e., that traffic is
same host-pair experience the same path properties, i.e., that not routed differently based on port numbers or other connection parameters
traffic is not routed differently based on port numbers or other (also addressed further in <xref target="sect-8.1" format="default"/>). The
connection parameters (also addressed further in <xref target="sect-8.1"/>). observations about TCB sharing in this document apply similarly to any
The protocol with congestion state, including the Stream Control Transmission
observations about TCB sharing in this document apply similarly to Protocol (SCTP) <xref target="RFC4960" format="default"/> and the Datagram
any protocol with congestion state, including SCTP <xref target="RFC4960"/> a Congestion Control Protocol (DCCP) <xref target="RFC4340"
nd format="default"/>, as well as to individual subflows in Multipath TCP
DCCP <xref target="RFC4340"/>, as well as for individual subflows in Multipat <xref target="RFC8684" format="default"/>.</t>
h TCP </section>
<xref target="RFC8684"/>.</t>
</section>
<section title="Conventions Used in This Document" anchor="sect-2"><t>
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
"SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and
"OPTIONAL" in this document are to be interpreted as described in
BCP 14 <xref target="RFC2119"/> <xref target="RFC8174"/> when, and only when,
they appear in all
capitals, as shown here.</t>
<t>
The core of this document describes behavior that is already
permitted by TCP standards. As a result, it provides informative
guidance but does not use normative language, except when quoting
other documents. Normative language is used in Appendix C as
examples of requirements for future consideration.</t>
</section>
<section title="Terminology" anchor="sect-3"><t>
The following terminology is used frequently in this document. Items
preceded with a "+" may be part of the state maintained as TCP
connection state in the associated connections TCB and are the focus
of sharing as described in this document. Note that terms are used
as originally introduced where possible; in some cases, direction is
indicated with a suffix (_S for send, _R for receive) and in other
cases spelled out (sendcwnd).
<list style="hanging" hangIndent="6">
<t hangText="+cwnd:">TCP congestion window size <xref target="RFC5681"/></t>
<t hangText="host:">a source or sink of TCP segments associated with a single IP
address</t>
<t hangText="host-pair:">a pair of hosts and their corresponding IP addresses</t>
<t hangText="+MMS_R:">maximum message size that can be received, the largest
received transport payload of an IP datagram <xref target="RFC1122"/></t>
<t hangText="+MMS_S:">maximum message size that can be sent, the largest
transmitted transport payload of an IP datagram <xref target="RFC1122"/></t>
<t hangText="path:">an Internet path between the IP addresses of two hosts</t>
<t hangText="PCB:">protocol control block, the data associated with <section anchor="sect-2" numbered="true" toc="default">
a protocol as maintained by an endpoint; a TCP PCB is called a TCB <name>Conventions Used in This Document</name>
PLPMTUD - packetization-layer path MTU discovery, a mechanism that
uses transport packets to discover the PMTU <xref
target="RFC4821"/></t>
<t hangText="+PMTU:">largest IP datagram that can traverse a path <t>
<xref target="RFC1191"/><xref target="RFC8201"/></t> The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>",
"<bcp14>REQUIRED</bcp14>", "<bcp14>SHALL</bcp14>", "<bcp14>SHALL
NOT</bcp14>", "<bcp14>SHOULD</bcp14>", "<bcp14>SHOULD NOT</bcp14>",
"<bcp14>RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>",
"<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document are
to be interpreted as described in BCP&nbsp;14 <xref target="RFC2119"/>
<xref target="RFC8174"/> when, and only when, they appear in all capitals,
as shown here.
</t>
<t hangText="PMTUD:">path-layer MTU discovery, a mechanism that <t>
relies on ICMP error messages to discover the PMTU <xref The core of this document describes behavior that is already permitted by
target="RFC1191"/><xref target="RFC8201"/></t> TCP standards. As a result, this document provides informative guidance but does not
use normative language except when quoting other documents. Normative
language is used in <xref target="sect-c"/> as examples of requirements for
future consideration.</t>
</section>
<t hangText="+RTT:">round-trip time of a TCP packet exchange <xref <section anchor="sect-3" numbered="true" toc="default">
target="RFC0793"/></t> <name>Terminology</name>
<t hangText="+RTTVAR:">variation of round-trip times of a TCP packet <t>
exchange <xref target="RFC6298"/></t> The following terminology is used frequently in this document. Items
preceded with a "+" may be part of the state maintained as TCP connection
state in the TCB of associated connections and are the focus of sharing as
described in this document. Note that terms are used as originally
introduced where possible; in some cases, direction is indicated with a
suffix (_S for send, _R for receive) and in other cases spelled out
(sendcwnd).
<t hangText="+rwnd:">TCP receive window size <xref </t>
target="RFC5681"/></t>
<t hangText="+sendcwnd:">TCP send-side congestion window (cwnd) size <dl newline="false" spacing="normal" indent="6">
<xref target="RFC5681"/></t> <dt>+cwnd:</dt>
<dd>TCP congestion window size <xref target="RFC5681" format="default"/>
</dd>
<dt>host:</dt>
<dd>a source or sink of TCP segments associated with a single IP
address</dd>
<dt>host-pair:</dt>
<dd>a pair of hosts and their corresponding IP addresses</dd>
<dt>ISN:
</dt>
<dd>Initial Sequence Number
</dd>
<dt>+MMS_R:</dt>
<dd>maximum message size that can be received, the largest
received transport payload of an IP datagram <xref target="RFC1122" format="default"/></dd>
<dt>+MMS_S:</dt>
<dd>maximum message size that can be sent, the largest
transmitted transport payload of an IP datagram <xref target="RFC1122" format="default"/></dd>
<dt>path:</dt>
<dd>an Internet path between the IP addresses of two hosts</dd>
<t hangText="+sendMSS:">TCP maximum segment size, a value <dt>PCB:</dt>
<dd>protocol control block, the data associated with a protocol as
maintained by an endpoint; a TCP PCB is called a "TCB"</dd>
<dt>PLPMTUD:</dt><dd>packetization-layer path MTU discovery, a mechanism that
uses transport packets to discover the Path Maximum Transmission Unit (PMTU) <xref target="RFC4821"
format="default"/></dd>
<dt>+PMTU:</dt>
<dd>largest IP datagram that can traverse a path
<xref target="RFC1191" format="default"/> <xref target="RFC8201" format="default"/></dd>
<dt>PMTUD:</dt>
<dd>path-layer MTU discovery, a mechanism that
relies on ICMP error messages to discover the PMTU <xref target="RFC1191"
format="default"/> <xref target="RFC8201" format="default"/></dd>
<dt>+RTT:</dt>
<dd>round-trip time of a TCP packet exchange <xref target="RFC0793" format="default"/></dd>
<dt>+RTTVAR:</dt>
<dd>variation of round-trip times of a TCP packet
exchange <xref target="RFC6298" format="default"/></dd>
<dt>+rwnd:</dt>
<dd>TCP receive window size <xref target="RFC5681" format="default"/></dd>
<dt>+sendcwnd:</dt>
<dd>TCP send-side congestion window (cwnd) size
<xref target="RFC5681" format="default"/></dd>
<dt>+sendMSS:</dt>
<dd>TCP maximum segment size, a value
transmitted in a TCP option that represents the largest TCP user data transmitted in a TCP option that represents the largest TCP user data
payload that can be received <xref target="RFC6691"/></t> payload that can be received <xref target="RFC6691" format="default"/></d
d>
<t hangText="+ssthresh:">TCP slow-start threshold <xref <dt>+ssthresh:</dt>
target="RFC5681"/></t> <dd>TCP slow-start threshold <xref target="RFC5681" format="default"/></
dd>
<t hangText="TCB:">TCP Control Block, the data associated with a TCP <dt>TCB:</dt>
connection as maintained by an endpoint</t> <dd>TCP Control Block, the data associated with a TCP
connection as maintained by an endpoint</dd>
<t hangText="TCP-AO:">TCP Authentication Option <xref <dt>TCP-AO:</dt>
target="RFC5925"/></t> <dd>TCP Authentication Option <xref target="RFC5925" format="default"/><
/dd>
<t hangText="TFO:">TCP Fast Open option <xref target="RFC7413"/></t> <dt>TFO:</dt>
<dd>TCP Fast Open option <xref target="RFC7413" format="default"/></dd>
<t hangText="+TFO_cookie:">TCP Fast Open cookie, state that is used <dt>+TFO_cookie:</dt>
as part of the TFO mechanism, when TFO is supported <xref <dd>TCP Fast Open cookie, state that is used
target="RFC7413"/></t> as part of the TFO mechanism, when TFO is supported <xref target="RFC7413
" format="default"/></dd>
<t hangText="+TFO_failure:">an indication of when TFO option <dt>+TFO_failure:</dt>
negotiation failed, when TFO is supported</t> <dd>an indication of when TFO option
negotiation failed, when TFO is supported</dd>
<t hangText="+TFOinfo:">information cached when a TFO connection is <dt>+TFOinfo:</dt>
established, which includes the TFO_cookie <xref <dd>information cached when a TFO connection is
target="RFC7413"/></t> established, which includes the TFO_cookie <xref target="RFC7413" format=
"default"/></dd>
</list> </dl>
</t> </section>
<section anchor="sect-4" numbered="true" toc="default">
</section> <name>The TCP Control Block (TCB)</name>
<t>
<section title="The TCP Control Block (TCB)" anchor="sect-4"><t>
A TCB describes the data associated with each connection, i.e., with A TCB describes the data associated with each connection, i.e., with
each association of a pair of applications across the network. The each association of a pair of applications across the network. The
TCB contains at least the following information <xref target="RFC0793"/>:</t> TCB contains at least the following information <xref target="RFC0793" format ="default"/>:</t>
<figure><artwork><![CDATA[ <ul empty="true">
Local process state <li><t>Local process state</t>
pointers to send and receive buffers <ul empty="true" spacing="compact">
pointers to retransmission queue and current segment <li>pointers to send and receive buffers</li>
pointers to Internet Protocol (IP) PCB <li>pointers to retransmission queue and current segment</li>
Per-connection shared state <li>pointers to Internet Protocol (IP) PCB</li>
macro-state </ul>
connection state </li>
timers <li><t>Per-connection shared state</t>
flags <ul empty="true" spacing="compact">
local and remote host numbers and ports <li><t>macro-state</t>
TCP option state <ul empty="true" spacing="compact">
micro-state <li>connection state</li>
send and receive window state (size*, current number) <li>timers</li>
congestion window size (sendcwnd)* <li>flags</li>
congestion window size threshold (ssthresh)* <li>local and remote host numbers and ports</li>
max window size seen* <li>TCP option state</li>
sendMSS# </ul>
MMS_S# </li>
MMS_R# <li><t>micro-state</t>
PMTU# <ul empty="true" spacing="compact">
round-trip time and its variation# <li>send and receive window state (size*, current number)</li>
]]></artwork></figure> <li>congestion window size (sendcwnd)*</li>
<li>congestion window size threshold (ssthresh)*</li>
<li>max window size seen*</li>
<li>sendMSS#</li>
<li>MMS_S#</li>
<li>MMS_R#</li>
<li>PMTU#</li>
<li>round-trip time and its variation#</li>
</ul>
</li>
</ul>
</li>
</ul>
<t> <t>
The per-connection information is shown as split into macro-state The per-connection information is shown as split into macro-state and
and micro-state, terminology borrowed from <xref target="Co91"/>. Macro-state micro-state, terminology borrowed from <xref target="Co91"
describes the protocol for establishing the initial shared state format="default"/>. Macro-state describes the protocol for establishing the
about the connection; we include the endpoint numbers and components initial shared state about the connection; we include the endpoint numbers
(timers, flags) required upon commencement that are later used to and components (timers, flags) required upon commencement that are later
help maintain that state. Micro-state describes the protocol after a used to help maintain that state. Micro-state describes the protocol after
connection has been established, to maintain the reliability and a connection has been established, to maintain the reliability and
congestion control of the data transferred in the connection.</t> congestion control of the data transferred in the connection.</t>
<t> <t>
We distinguish two other classes of shared micro-state that are We distinguish two other classes of shared micro-state that are associated
associated more with host-pairs than with application pairs. One more with host-pairs than with application pairs.
class is clearly host-pair dependent (shown above as "#", e.g.,
sendMSS, MMS_R, MMS_S, PMTU, RTT), because these parameters are
defined by the endpoint or endpoint pair (sendMSS, MMS_R, MMS_S,
RTT) or are already cached and shared on that basis (PMTU
<xref target="RFC1191"/><xref target="RFC4821"/>). The other is host-pair dependent in its
aggregate (shown above as "*", e.g., congestion window information,
current window sizes, etc.) because they depend on the total
capacity between the two endpoints.</t>
<t> One class is clearly host-pair dependent (shown above as "#", e.g.,
Not all of the TCB state is necessarily sharable. In particular, sendMSS, MMS_R, MMS_S, PMTU, RTT), because these parameters are defined by
the endpoint or endpoint pair (of the given example: sendMSS, MMS_R, MMS_S,
RTT) or are already cached and shared on that basis (of the given example:
PMTU <xref target="RFC1191" format="default"/> <xref target="RFC4821"
format="default"/>).
The other is host-pair dependent in its aggregate (shown above as "*", e.g.,
congestion window information, current window sizes, etc.) because they depend
on the total capacity between the two endpoints.</t>
<t>
Not all of the TCB state is necessarily shareable. In particular,
some TCP options are negotiated only upon request by the application some TCP options are negotiated only upon request by the application
layer, so their use may not be correlated across connections. Other layer, so their use may not be correlated across connections. Other
options negotiate connection-specific parameters, which are options negotiate connection-specific parameters, which are
similarly not shareable. These are discussed further in Appendix B.</t> similarly not shareable. These are discussed further in <xref target="sect-b"
/>.</t>
<t> <t>
Finally, we exclude rwnd from further discussion because its value Finally, we exclude rwnd from further discussion because its value
should depend on the send window size, so it is already addressed by should depend on the send window size, so it is already addressed by
send window sharing and is not independently affected by sharing.</t> send window sharing and is not independently affected by sharing.</t>
</section>
</section> <section anchor="sect-5" numbered="true" toc="default">
<name>TCB Interdependence</name>
<section title="TCB Interdependence" anchor="sect-5"><t> <t>
There are two cases of TCB interdependence. Temporal sharing occurs There are two cases of TCB interdependence. Temporal sharing occurs
when the TCB of an earlier (now CLOSED) connection to a host is used when the TCB of an earlier (now CLOSED) connection to a host is used
to initialize some parameters of a new connection to that same host, to initialize some parameters of a new connection to that same host,
i.e., in sequence. Ensemble sharing occurs when a currently active i.e., in sequence. Ensemble sharing occurs when a currently active
connection to a host is used to initialize another (concurrent) connection to a host is used to initialize another (concurrent)
connection to that host.</t> connection to that host.</t>
</section>
<section anchor="sect-6" numbered="true" toc="default">
<name>Temporal Sharing</name>
</section> <t>
<section title="Temporal Sharing" anchor="sect-6"><t>
The TCB data cache is accessed in two ways: it is read to initialize The TCB data cache is accessed in two ways: it is read to initialize
new TCBs and written when more current per-host state is available.</t> new TCBs and written when more current per-host state is available.</t>
<section anchor="sect-6.1" numbered="true" toc="default">
<section title="Initialization of a new TCB" anchor="sect-6.1"><t> <name>Initialization of a New TCB</name>
<t>
TCBs for new connections can be initialized using cached context TCBs for new connections can be initialized using cached context
from past connections as follows:</t> from past connections as follows:</t>
<figure><artwork><![CDATA[ <table anchor="TCB_initialization">
TEMPORAL SHARING - TCB Initialization <name>Temporal Sharing - TCB Initialization</name>
<thead>
Cached TCB New TCB <tr>
-------------------------------------- <th>Cached TCB</th>
old_MMS_S old_MMS_S or not cached* <th>New TCB</th>
</tr>
old_MMS_R old_MMS_R or not cached* </thead>
<tbody>
old_sendMSS old_sendMSS <tr>
<td>old_MMS_S</td>
old_PMTU old_PMTU+ <td>old_MMS_S or not cached (2)</td>
</tr>
old_RTT old_RTT <tr>
<td>old_MMS_R</td>
old_RTTVAR old_RTTVAR <td>old_MMS_R or not cached (2)</td>
</tr>
old_option (option specific) <tr>
<td>old_sendMSS</td>
old_ssthresh old_ssthresh <td>old_sendMSS</td>
</tr>
old_sendcwnd old_sendcwnd <tr>
]]></artwork></figure> <td>old_PMTU</td>
<td>old_PMTU (1)</td>
<t> </tr>
+Note that PMTU is cached at the IP layer <xref target="RFC1191"/><xref targe <tr>
t="RFC4821"/>. <td>old_RTT</td>
*Note that some values are not cached when they are computed locally <td>old_RTT</td>
(MMS_R) or indicated in the connection itself (MMS_S in the SYN).</t> </tr>
<tr>
<t> <td>old_RTTVAR</td>
The table below gives an overview of option-specific information <td>old_RTTVAR</td>
that can be shared. Additional information on some specific TCP </tr>
options and sharing is provided in Appendix B.</t> <tr>
<td>old_option</td>
<figure><artwork><![CDATA[ <td>(option specific)</td>
TEMPORAL SHARING - Option Info Initialization </tr>
<tr>
<td>old_ssthresh</td>
<td>old_ssthresh</td>
</tr>
<tr>
<td>old_sendcwnd</td>
<td>old_sendcwnd</td>
</tr>
</tbody>
</table>
Cached New <dl>
------------------------------------ <dt>(1)</dt><dd>Note that PMTU is cached at the IP layer <xref target="RFC1191"
old_TFO_cookie old_TFO_cookie format="default"/> <xref target="RFC4821" format="default"/>.
</dd>
<dt>(2)</dt><dd>Note that some values are not cached when they are computed locally
(MMS_R) or indicated in the connection itself (MMS_S in the SYN).</dd>
</dl>
<t>
old_TFO_failure old_TFO_failure <xref target="Option_Info_Initialization"/> gives an overview of
]]></artwork> option-specific information that can be shared. Additional information on
</figure> some specific TCP options and sharing is provided in <xref
target="sect-b"/>.</t>
</section> <table anchor="Option_Info_Initialization">
<name>Temporal Sharing - Option Info Initialization</name>
<thead>
<tr>
<th>Cached</th>
<th>New</th>
</tr>
</thead>
<tbody>
<tr>
<td>old_TFO_cookie</td>
<td>old_TFO_cookie</td>
</tr>
<tr>
<td>old_TFO_failure</td>
<td>old_TFO_failure</td>
</tr>
</tbody>
</table>
<section title="Updates to the TCB cache" anchor="sect-6.2"><t> </section>
<section anchor="sect-6.2" numbered="true" toc="default">
<name>Updates to the TCB Cache</name>
<t>
During a connection, the TCB cache can be updated based on events of During a connection, the TCB cache can be updated based on events of
current connections and their TCBs as they progress over time, as current connections and their TCBs as they progress over time, as shown in
shown below:</t> <xref target="Cache_Updates"/>.</t>
<figure><artwork><![CDATA[
TEMPORAL SHARING - Cache Updates
Cached TCB Current TCB when? New Cached TCB
----------------------------------------------------------
old_MMS_S curr_MMS_S OPEN curr_MMS_S
old_MMS_R curr_MMS_R OPEN curr_MMS_R
old_sendMSS curr_sendMSS MSSopt curr_sendMSS
old_PMTU curr_PMTU PMTUD+ / curr_PMTU
PLPMTUD+
old_RTT curr_RTT CLOSE merge(curr,old)
old_RTTVAR curr_RTTVAR CLOSE merge(curr,old)
old_option curr_option ESTAB (depends on option)
old_ssthresh curr_ssthresh CLOSE merge(curr,old)
old_sendcwnd curr_sendcwnd CLOSE merge(curr,old)
]]></artwork>
</figure>
<t> <table anchor="Cache_Updates">
+Note that PMTU is cached at the IP layer <xref target="RFC1191"/><xref targe <name>Temporal Sharing - Cache Updates</name>
t="RFC4821"/>.</t> <thead>
<tr>
<th>Cached TCB</th>
<th>Current TCB</th>
<th>When?</th>
<th>New Cached TCB</th>
</tr>
</thead>
<tbody>
<tr>
<td>old_MMS_S</td>
<td>curr_MMS_S</td>
<td>OPEN</td>
<td>curr_MMS_S</td>
</tr>
<tr>
<td>old_MMS_R</td>
<td>curr_MMS_R</td>
<td>OPEN</td>
<td>curr_MMS_R</td>
</tr>
<tr>
<td>old_sendMSS</td>
<td>curr_sendMSS</td>
<td>MSSopt</td>
<td>curr_sendMSS</td>
</tr>
<tr>
<td>old_PMTU</td>
<td>curr_PMTU</td>
<td>PMTUD (1) / PLPMTUD (1)</td>
<td>curr_PMTU</td>
</tr>
<tr>
<td>old_RTT</td>
<td>curr_RTT</td>
<td>CLOSE</td>
<td>merge(curr,old)</td>
</tr>
<tr>
<td>old_RTTVAR</td>
<td>curr_RTTVAR</td>
<td>CLOSE</td>
<td>merge(curr,old)</td>
</tr>
<tr>
<td>old_option</td>
<td>curr_option</td>
<td>ESTAB</td>
<td>(depends on option)</td>
</tr>
<tr>
<td>old_ssthresh</td>
<td>curr_ssthresh</td>
<td>CLOSE</td>
<td>merge(curr,old)</td>
</tr>
<tr>
<td>old_sendcwnd</td>
<td>curr_sendcwnd</td>
<td>CLOSE</td>
<td>merge(curr,old)</td>
</tr>
</tbody>
</table>
<t> <dl>
<dt>(1)</dt><dd>Note that PMTU is cached at the IP layer <xref target="RFC1191"
format="default"/> <xref target="RFC4821" format="default"/>.</dd>
</dl>
<t>
Merge() is the function that combines the current and previous (old) Merge() is the function that combines the current and previous (old)
values and may vary for each parameter of the TCB cache. The values and may vary for each parameter of the TCB cache. The
particular function is not specified in this document; examples particular function is not specified in this document; examples
include windowed averages (mean of the past N values, for some N) include windowed averages (mean of the past N values, for some N)
and exponential decay (new = (1-alpha)*old + alpha *new, where alpha and exponential decay (new = (1-alpha)*old + alpha *new, where alpha
is in the range [0..1]).</t> is in the range [0..1]).</t>
<t>
<xref target="Option_Info_Updates"/> gives an overview of option-specific
information that can be similarly shared. The TFO cookie is maintained
until the client explicitly requests it be updated as a separate event.</t>
<t> <table anchor="Option_Info_Updates">
The table below gives an overview of option-specific information <name>Temporal Sharing - Option Info Updates</name>
that can be similarly shared. The TFO cookie is maintained until the <thead>
client explicitly requests it be updated as a separate event.</t> <tr>
<th>Cached</th>
<figure><artwork><![CDATA[ <th>Current</th>
TEMPORAL SHARING - Option Info Updates <th>When?</th>
<th>New Cached</th>
Cached Current when? New Cached </tr>
--------------------------------------------------------- </thead>
old_TFO_cookie old_TFO_cookie ESTAB old_TFO_cookie <tbody>
<tr>
old_TFO_failure old_TFO_failure ESTAB old_TFO_failure <td>old_TFO_cookie</td>
]]></artwork> <td>old_TFO_cookie</td>
</figure> <td>ESTAB</td>
<td>old_TFO_cookie</td>
</section> </tr>
<tr>
<section title="Discussion" anchor="sect-6.3"><t> <td>old_TFO_failure</td>
As noted, there is no particular benefit to caching MMS_S and MMS_R <td>old_TFO_failure</td>
as these are reported by the local IP stack. Caching sendMSS and <td>ESTAB</td>
PMTU is trivial; reported values are cached (PMTU at the IP layer), <td>old_TFO_failure</td>
and the most recent values are used. The cache is updated when the </tr>
MSS option is received in a SYN or after PMTUD (i.e., when an ICMPv4 </tbody>
Fragmentation Needed <xref target="RFC1191"/> or ICMPv6 Packet Too Big messag </table>
e is
received <xref target="RFC8201"/> or the equivalent is inferred, e.g., as fro
m
PLPMTUD <xref target="RFC4821"/>), respectively, so the cache always has the
most
recent values from any connection. For sendMSS, the cache is
consulted only at connection establishment and not otherwise
updated, which means that MSS options do not affect current
connections. The default sendMSS is never saved; only reported MSS
values update the cache, so an explicit override is required to
reduce the sendMSS. Cached sendMSS affects only data sent in the SYN
segment, i.e., during client connection initiation or during
simultaneous open; all other segment MSS are based on the value
updated as included in the SYN.</t>
<t> </section>
RTT values are updated by formulae that merges the old and new <section anchor="sect-6.3" numbered="true" toc="default">
values, as noted in <xref target="sect-6.2"/>. Dynamic RTT estimation require <name>Discussion</name>
s a <t>
sequence of RTT measurements. As a result, the cached RTT (and its As noted, there is no particular benefit to caching MMS_S and MMS_R as
variation) is an average of its previous value with the contents of these are reported by the local IP stack. Caching sendMSS and PMTU is
the currently active TCB for that host, when a TCB is closed. RTT trivial; reported values are cached (PMTU at the IP layer), and the most
values are updated only when a connection is closed. The method for recent values are used. The cache is updated when the MSS option is
merging old and current values needs to attempt to reduce the received in a SYN or after PMTUD (i.e., when an ICMPv4 Fragmentation Needed
transient effects of the new connections.</t> <xref target="RFC1191" format="default"/> or ICMPv6 Packet Too Big message
is received <xref target="RFC8201" format="default"/> or the equivalent is
inferred, e.g., as from PLPMTUD <xref target="RFC4821" format="default"/>),
respectively, so the cache always has the most recent values from any
connection. For sendMSS, the cache is consulted only at connection
establishment and not otherwise updated, which means that MSS options do
not affect current connections. The default sendMSS is never saved; only
reported MSS values update the cache, so an explicit override is required
to reduce the sendMSS. Cached sendMSS affects only data sent in the SYN
segment, i.e., during client connection initiation or during simultaneous
open; the MSS of all other segments are constrained by the value updated as
included in the SYN.
</t>
<t> <t>
The updates for RTT, RTTVAR and ssthresh rely on existing RTT values are updated by formulae that merge the old and new values, as
noted in <xref target="sect-6.2" format="default"/>. Dynamic RTT estimation
requires a sequence of RTT measurements. As a result, the cached RTT (and
its variation) is an average of its previous value with the contents of the
currently active TCB for that host, when a TCB is closed. RTT values are
updated only when a connection is closed. The method for merging old and
current values needs to attempt to reduce the transient effects of the new
connections.</t>
<t>
The updates for RTT, RTTVAR, and ssthresh rely on existing
information, i.e., old values. Should no such values exist, the information, i.e., old values. Should no such values exist, the
current values are cached instead.</t> current values are cached instead.</t>
<t>
<t>
TCP options are copied or merged depending on the details of each TCP options are copied or merged depending on the details of each
option. E.g., TFO state is updated when a connection is established option. For example, TFO state is updated when a connection is established
and read before establishing a new connection.</t> and read before establishing a new connection.</t>
<t> <t>
Sections 8 and 9 discuss compatibility issues and implications of Sections <xref target="sect-8" format="counter"/> and <xref target="sect-9"
sharing the specific information listed above. <xref target="sect-10"/> gives format="counter"/> discuss compatibility issues and implications of sharing
an the specific information listed above. <xref target="sect-10"
overview of known implementations.</t> format="default"/> gives an overview of known implementations.</t>
<t>
<t> Most cached TCB values are updated when a connection closes. The exceptions
Most cached TCB values are updated when a connection closes. The are MMS_R and MMS_S, which are reported by IP <xref target="RFC1122"
exceptions are MMS_R and MMS_S, which are reported by IP <xref target="RFC112 format="default"/>; PMTU, which is updated after Path MTU Discovery and
2"/>, also reported by IP <xref target="RFC1191" format="default"/> <xref
PMTU which is updated after Path MTU Discovery and also reported by target="RFC4821" format="default"/> <xref target="RFC8201"
IP <xref target="RFC1191"/><xref target="RFC4821"/><xref target="RFC8201"/>, format="default"/>; and sendMSS, which is updated if the MSS option is
and sendMSS, which is updated if the received in the TCP SYN header.</t>
MSS option is received in the TCP SYN header.</t> <t>
<t>
Sharing sendMSS information affects only data in the SYN of the next Sharing sendMSS information affects only data in the SYN of the next
connection, because sendMSS information is typically included in connection, because sendMSS information is typically included in
most TCP SYN segments. Caching PMTU can accelerate the efficiency of most TCP SYN segments. Caching PMTU can accelerate the efficiency of
PMTUD but can also result in black-holing until corrected if in PMTUD but can also result in black-holing until corrected if in
error. Caching MMS_R and MMS_S may be of little direct value as they error. Caching MMS_R and MMS_S may be of little direct value as they
are reported by the local IP stack anyway.</t> are reported by the local IP stack anyway.</t>
<t> <t>
The way in which other TCP option state can be shared depends on the The way in which state related to other TCP options can be shared depends on
details of that option. E.g., TFO state includes the TCP Fast Open the
Cookie <xref target="RFC7413"/> or, in case TFO fails, a negative TCP Fast Op details of that option. For example, TFO state includes the TCP Fast Open
en cookie <xref target="RFC7413" format="default"/> or, in case TFO fails, a neg
response. RFC 7413 states, "The client MUST cache negative responses from the ative TCP Fast Open
server in order to avoid potential connection failures. Negative responses incl response. RFC 7413 states, </t>
ude the server not acknowledging the data in the SYN, ICMP error messages, and (
most importantly) no response (SYN-ACK) from the server at all, i.e., connection
timeout." [RFC 7413]. TFOinfo is cached when a connection is established.</t>
<t>
Other TCP option state might not be as readily cached. E.g., TCP-AO
<xref target="RFC5925"/> success or failure between a host pair for a single
SYN
destination port might be usefully cached. TCP-AO success or failure
to other SYN destination ports on that host pair is never useful to
cache because TCP-AO security parameters can vary per service.</t>
</section>
</section>
<section title="Ensemble Sharing" anchor="sect-7"><t> <blockquote>The client <bcp14>MUST</bcp14> cache negative responses from the ser
ver in order to avoid potential connection failures. Negative responses include
the server not acknowledging the data in the SYN, ICMP error messages, and (most
importantly) no response (SYN-ACK) from the server at all, i.e., connection tim
eout.
</blockquote>
<t>TFOinfo is cached when a connection is established.</t>
<t>
State related to other TCP options might not be as readily cached. For
example, TCP-AO <xref target="RFC5925" format="default"/> success or
failure between a host-pair for a single SYN destination port might be
usefully cached. TCP-AO success or failure to other SYN destination ports
on that host-pair is never useful to cache because TCP-AO security
parameters can vary per service.</t>
</section>
</section>
<section anchor="sect-7" numbered="true" toc="default">
<name>Ensemble Sharing</name>
<t>
Sharing cached TCB data across concurrent connections requires Sharing cached TCB data across concurrent connections requires
attention to the aggregate nature of some of the shared state. For attention to the aggregate nature of some of the shared state. For
example, although MSS and RTT values can be shared by copying, it example, although MSS and RTT values can be shared by copying, it
may not be appropriate to simply copy congestion window or ssthresh may not be appropriate to simply copy congestion window or ssthresh
information; instead, the new values can be a function (f) of the information; instead, the new values can be a function (f) of the
cumulative values and the number of connections (N).</t> cumulative values and the number of connections (N).</t>
<section anchor="sect-7.1" numbered="true" toc="default">
<section title="Initialization of a new TCB" anchor="sect-7.1"><t> <name>Initialization of a New TCB</name>
<t>
TCBs for new connections can be initialized using cached context TCBs for new connections can be initialized using cached context
from concurrent connections as follows:</t> from concurrent connections as follows:</t>
<figure><artwork><![CDATA[ <table anchor="TCB_Initialization">
ENSEMBLE SHARING - TCB Initialization <name>Ensemble Sharing - TCB Initialization</name>
<thead>
Cached TCB New TCB <tr>
------------------------------------------ <th>Cached TCB</th>
old_MMS_S old_MMS_S <th>New TCB</th>
</tr>
old_MMS_R old_MMS_R </thead>
<tbody>
old_sendMSS old_sendMSS <tr>
<td>old_MMS_S</td>
old_PMTU old_PMTU+ <td>old_MMS_S</td>
</tr>
old_RTT old_RTT <tr>
<td>old_MMS_R</td>
old_RTTVAR old_RTTVAR <td>old_MMS_R</td>
</tr>
sum(old_ssthresh) f(sum(old_ssthresh), N) <tr>
<td>old_sendMSS</td>
sum(old_sendcwnd) f(sum(old_sendcwnd), N) <td>old_sendMSS</td>
_ </tr>
old_option (option specific) <tr>
]]></artwork> <td>old_PMTU</td>
</figure> <td>old_PMTU (1)</td>
</tr>
<t> <tr>
+Note that PMTU is cached at the IP layer <xref target="RFC1191"/><xref targe <td>old_RTT</td>
t="RFC4821"/>.</t> <td>old_RTT</td>
</tr>
<tr>
<td>old_RTTVAR</td>
<td>old_RTTVAR</td>
</tr>
<tr>
<td>sum(old_ssthresh)</td>
<td>f(sum(old_ssthresh), N)</td>
</tr>
<tr>
<td>sum(old_sendcwnd)</td>
<td>f(sum(old_sendcwnd), N)</td>
</tr>
<tr>
<td>old_option</td>
<td>(option specific)</td>
</tr>
</tbody>
</table>
<t> <dl>
In the table, the cached sum() is a total across all active <dt>(1)</dt>
connections because these parameters act in aggregate; similarly f() <dd>Note that PMTU is cached at the IP layer <xref target="RFC1191" format="defa
ult"/> <xref target="RFC4821" format="default"/>.</dd>
</dl>
<t>
In <xref target="TCB_Initialization"/>, the cached sum() is a total across al
l active
connections because these parameters act in aggregate; similarly, f()
is a function that updates that sum based on the new connection's is a function that updates that sum based on the new connection's
values, represented as "N".</t> values, represented as "N".</t>
<t>
<xref target="Ensemble_Option_Info_Initialization"/> gives an overview of
option-specific information that can be similarly shared. Again, the
TFO_cookie is updated upon explicit client request, which is a separate
event.</t>
<t> <table anchor="Ensemble_Option_Info_Initialization">
The table below gives an overview of option-specific information <name>Ensemble Sharing - Option Info Initialization</name>
that can be similarly shared. Again, The TFO_cookie is updated upon <thead>
explicit client request, which is a separate event.</t> <tr>
<th>Cached</th>
<figure><artwork><![CDATA[ <th>New</th>
ENSEMBLE SHARING - Option Info Initialization </tr>
</thead>
Cached New <tbody>
------------------------------------ <tr>
old_TFO_cookie old_TFO_cookie <td>old_TFO_cookie</td>
<td>old_TFO_cookie</td>
old_TFO_failure old_TFO_failure </tr>
]]></artwork> <tr>
</figure> <td>old_TFO_failure</td>
<td>old_TFO_failure</td>
</section> </tr>
</tbody>
<section title="Updates to the TCB cache" anchor="sect-7.2"><t> </table>
During a connection, the TCB cache can be updated based on changes
to concurrent connections and their TCBs, as shown below:</t>
<figure><artwork><![CDATA[
ENSEMBLE SHARING - Cache Updates
Cached TCB Current TCB when? New Cached TCB
---------------------------------------------------------------
old_MMS_S curr_MMS_S OPEN curr_MMS_S
old_MMS_R curr_MMS_R OPEN curr_MMS_R
old_sendMSS curr_sendMSS MSSopt curr_sendMSS
old_PMTU curr_PMTU PMTUD+ / curr_PMTU
PLPMTUD+
old_RTT curr_RTT update rtt_update(old, curr)
old_RTTVAR curr_RTTVAR update rtt_update(old, curr)
old_ssthresh curr_ssthresh update adjust sum as appropriate
old_sendcwnd curr_sendcwnd update adjust sum as appropriate
old_option curr_option (depends) (option specific)
]]></artwork>
</figure>
<t> </section>
+Note that the PMTU is cached at the IP layer <xref target="RFC1191"/><xref t <section anchor="sect-7.2" numbered="true" toc="default">
arget="RFC4821"/>.</t> <name>Updates to the TCB Cache</name>
<t>
During a connection, the TCB cache can be updated based on changes to
concurrent connections and their TCBs, as shown below:</t>
<t> <table anchor="Ensemble_Cache_Updates">
In the table, rtt_update() is the function used to combine old and <name>Ensemble Sharing - Cache Updates</name>
current values, e.g., as a windowed average or exponentially decayed <thead>
average.</t> <tr>
<th>Cached TCB</th>
<th>Current TCB</th>
<th>When?</th>
<th>New Cached TCB</th>
</tr>
</thead>
<tbody>
<tr>
<td>old_MMS_S</td>
<td>curr_MMS_S</td>
<td>OPEN</td>
<td>curr_MMS_S</td>
</tr>
<tr>
<td>old_MMS_R</td>
<td>curr_MMS_R</td>
<td>OPEN</td>
<td>curr_MMS_R</td>
</tr>
<tr>
<td>old_sendMSS</td>
<td>curr_sendMSS</td>
<td>MSSopt</td>
<td>curr_sendMSS</td>
</tr>
<tr>
<td>old_PMTU</td>
<td>curr_PMTU</td>
<td>PMTUD (1) / PLPMTUD (1)</td>
<td>curr_PMTU</td>
</tr>
<tr>
<td>old_RTT</td>
<td>curr_RTT</td>
<td>update</td>
<td>rtt_update(old, curr)</td>
</tr>
<tr>
<td>old_RTTVAR</td>
<td>curr_RTTVAR</td>
<td>update</td>
<td>rtt_update(old, curr)</td>
</tr>
<tr>
<td>old_ssthresh</td>
<td>curr_ssthresh</td>
<td>update</td>
<td>adjust sum as appropriate</td>
</tr>
<tr>
<td>old_sendcwnd</td>
<td>curr_sendcwnd</td>
<td>update</td>
<td>adjust sum as appropriate</td>
</tr>
<tr>
<td>old_option</td>
<td>curr_option</td>
<td>(depends)</td>
<td>(option specific)</td>
</tr>
</tbody>
</table>
<t> <dl>
The table below gives an overview of option-specific information <dt>(1)</dt>
<dd>Note that the PMTU is cached at the IP layer <xref target="RFC1191" format="
default"/> <xref target="RFC4821" format="default"/>.</dd>
</dl>
<t>
In <xref target="Ensemble_Cache_Updates"/>, rtt_update() is the function
used to combine old and current values, e.g., as a windowed average or
exponentially decayed average.</t>
<t>
<xref target="Ensemble_Option_Info_Updates"/> gives an overview of opti
on-specific information
that can be similarly shared.</t> that can be similarly shared.</t>
<figure><artwork><![CDATA[ <table anchor="Ensemble_Option_Info_Updates">
ENSEMBLE SHARING - Option Info Updates <name>Ensemble Sharing - Option Info Updates</name>
<thead>
Cached Current when? New Cached <tr>
---------------------------------------------------------- <th>Cached</th>
old_TFO_cookie old_TFO_cookie ESTAB old_TFO_cookie <th>Current</th>
<th>When?</th>
old_TFO_failure old_TFO_failure ESTAB old_TFO_failure <th>New Cached</th>
]]></artwork></figure> </tr>
</thead>
<tbody>
<tr>
<td>old_TFO_cookie</td>
<td>old_TFO_cookie</td>
<td>ESTAB</td>
<td>old_TFO_cookie</td>
</tr>
<tr>
<td>old_TFO_failure</td>
<td>old_TFO_failure</td>
<td>ESTAB</td>
<td>old_TFO_failure</td>
</tr>
</tbody>
</table>
</section> </section>
<section anchor="sect-7.3" numbered="true" toc="default">
<name>Discussion</name>
<section title="Discussion" anchor="sect-7.3"><t> <t>
For ensemble sharing, TCB information should be cached as early as For ensemble sharing, TCB information should be cached as early as
possible, sometimes before a connection is closed. Otherwise, possible, sometimes before a connection is closed. Otherwise,
opening multiple concurrent connections may not result in TCB data opening multiple concurrent connections may not result in TCB data
sharing if no connection closes before others open. The amount of sharing if no connection closes before others open. The amount of
work involved in updating the aggregate average should be minimized, work involved in updating the aggregate average should be minimized,
but the resulting value should be equivalent to having all values but the resulting value should be equivalent to having all values
measured within a single connection. The function "rtt_update" in measured within a single connection.
the ensemble sharing table indicates this operation, which occurs
whenever the RTT would have been updated in the individual TCP
connection. As a result, the cache contains the shared RTT
variables, which no longer need to reside in the TCB.</t>
<t> The function "rtt_update" in <xref target="Ensemble_Cache_Updates"
format="default"/> indicates this operation, which occurs whenever the RTT
would have been updated in the individual TCP connection. As a result, the
cache contains the shared RTT variables, which no longer need to reside in the
TCB.</t>
<t>
Congestion window size and ssthresh aggregation are more complicated Congestion window size and ssthresh aggregation are more complicated
in the concurrent case. When there is an ensemble of connections, we in the concurrent case. When there is an ensemble of connections, we
need to decide how that ensemble would have shared these variables, need to decide how that ensemble would have shared these variables,
in order to derive initial values for new TCBs.</t> in order to derive initial values for new TCBs.</t>
<t>
<t> Sections <xref target="sect-8" format="counter"/> and <xref target="sect-9"
Sections 8 and 9 discuss compatibility issues and implications of format="counter"/> discuss compatibility issues and implications of sharing
sharing the specific information listed above.</t> the specific information listed above.</t>
<t>
<t> There are several ways to initialize the congestion window in a new TCB
There are several ways to initialize the congestion window in a new among an ensemble of current connections to a host. Current TCP
TCB among an ensemble of current connections to a host. Current TCP implementations initialize it to 4 segments as standard <xref
implementations initialize it to four segments as standard <xref target="RFC3 target="RFC3390" format="default"/> and 10 segments experimentally <xref
390"/> target="RFC6928" format="default"/>. These approaches assume that new
and 10 segments experimentally <xref target="RFC6928"/>. These approaches ass connections should behave as conservatively as possible. The algorithm
ume described in <xref target="Ba12" format="default"/> adjusts the initial
that new connections should behave as conservatively as possible. cwnd depending on the cwnd values of ongoing connections. It is also
The algorithm described in <xref target="Ba12"/> adjusts the initial cwnd dep possible to use sharing mechanisms over long timescales to adapt TCP's
ending initial window automatically, as described further in <xref
on the cwnd values of ongoing connections. It is also possible to target="sect-c"/>.</t>
use sharing mechanisms over long timescales to adapt TCP's initial </section>
window automatically, as described further in Appendix C.</t> </section>
<section anchor="sect-8" numbered="true" toc="default">
</section> <name>Issues with TCB Information Sharing</name>
<t>
</section>
<section title="Issues with TCB information sharing" anchor="sect-8"><t>
Here, we discuss various types of problems that may arise with TCB Here, we discuss various types of problems that may arise with TCB
information sharing.</t> information sharing.</t>
<t>
<t>
For the congestion and current window information, the initial For the congestion and current window information, the initial
values computed by TCB interdependence may not be consistent with values computed by TCB interdependence may not be consistent with
the long-term aggregate behavior of a set of concurrent connections the long-term aggregate behavior of a set of concurrent connections
between the same endpoints. Under conventional TCP congestion between the same endpoints.
control, if the congestion window of a single existing connection
has converged to 40 segments, two newly joining concurrent
connections assume initial windows of 10 segments <xref target="RFC6928"/>, a
nd the
current connection's window doesn't decrease to accommodate this
additional load and connections can mutually interfere. One example
of this is seen on low-bandwidth, high-delay links, where concurrent
connections supporting Web traffic can collide because their initial
windows were too large, even when set at one segment.</t>
<t> Under conventional TCP congestion control, if the congestion window of a
The authors of <xref target="Hu12"/> recommend caching ssthresh for temporal single existing connection has converged to 40 segments, two newly joining
sharing only when flows are long. Some studies suggest that sharing concurrent connections will assume initial windows of 10 segments <xref
ssthresh between short flows can deteriorate the performance of target="RFC6928"/> and the existing connection's window will not decrease
individual connections [Hu12, <xref target="Du16"/>], although this may benef to accommodate this additional load. As a consequence, the three
it connections can mutually interfere.
aggregate network performance.</t>
<section title="Traversing the same network path" anchor="sect-8.1"><t> One example of this is seen on low-bandwidth, high-delay links, where
concurrent connections supporting Web traffic can collide because their
initial windows were too large, even when set at 1 segment.</t>
<t>
The authors of <xref target="Hu12" format="default"/> recommend caching
ssthresh for temporal sharing only when flows are long. Some studies
suggest that sharing ssthresh between short flows can deteriorate the
performance of individual connections <xref target="Hu12"/> <xref
target="Du16" format="default"/>, although this may benefit aggregate
network performance.</t>
<section anchor="sect-8.1" numbered="true" toc="default">
<name>Traversing the Same Network Path</name>
<t>
TCP is sometimes used in situations where packets of the same host-pair do TCP is sometimes used in situations where packets of the same host-pair do
not always take the same path, such as when connection- specific parameters not always take the same path, such as when connection-specific parameters
are used for routing (e.g., for load balancing). Multipath routing that are used for routing (e.g., for load balancing). Multipath routing that
relies on examining transport headers, such as ECMP and LAG <xref target="RFC relies on examining transport headers, such as ECMP and Link Aggregation
7424"/>, may Group (LAG) <xref target="RFC7424" format="default"/>, may not result in
not result in repeatable path selection when TCP segments are encapsulated, repeatable path selection when TCP segments are encapsulated, encrypted, or
encrypted, or altered - for example, in some Virtual Private Network (VPN) altered -- for example, in some Virtual Private Network (VPN) tunnels that
tunnels that rely on proprietary encapsulation. Similarly, such approaches rely on proprietary encapsulation. Similarly, such approaches cannot
cannot operate deterministically when the TCP header is encrypted, e.g., operate deterministically when the TCP header is encrypted, e.g., when
when using IPsec ESP (although TCB interdependence among the entire set using IPsec Encapsulating Security Payload (ESP) (although TCB
sharing the same endpoint IP addresses should work without problems when interdependence among the entire set sharing the same endpoint IP addresses
the TCP header is encrypted). Measures to increase the probability that should work without problems when the TCP header is encrypted). Measures to
connections use the same path could be applied: e.g., the connections could increase the probability that connections use the same path could be
be given the same IPv6 flow label <xref target="RFC6437"/>. TCB interdependen applied; for example, the connections could be given the same IPv6 flow
ce can also label <xref target="RFC6437" format="default"/>. TCB interdependence can
be extended to sets of host IP address pairs that share the same network also be extended to sets of host IP address pairs that share the same
path conditions, such as when a group of addresses is on the same LAN (see network path conditions, such as when a group of addresses is on the same
<xref target="sect-9"/>).</t> LAN (see <xref target="sect-9" format="default"/>).</t>
<t>
Traversing the same path is not important for host-specific information
(e.g., rwnd), TCP option state (e.g., TFOinfo), or for information that is
already cached per-host (e.g., path MTU).
<t>
Traversing the same path is not important for host-specific
information such as rwnd and TCP option state, such as TFOinfo, or
for information that is already cached per-host, such as path MTU.
When TCB information is shared across different SYN destination When TCB information is shared across different SYN destination
ports, path-related information can be incorrect; however, the ports, path-related information can be incorrect; however, the
impact of this error is potentially diminished if (as discussed impact of this error is potentially diminished if (as discussed
here) TCB sharing affects only the transient event of a connection here) TCB sharing affects only the transient event of a connection
start or if TCB information is shared only within connections to the start or if TCB information is shared only within connections to the
same SYN destination port.</t> same SYN destination port.</t>
<t>
In the case of temporal sharing, TCB information could also become invalid
over time, i.e., indicating that although the path remains the same, path
properties have changed. Because this is similar to the case when a
connection becomes idle, mechanisms that address idle TCP connections
(e.g., <xref target="RFC7661" format="default"/>) could also be applied to
TCB cache management, especially when TCP Fast Open is used <xref
target="RFC7413" format="default"/>.</t>
</section>
<section anchor="sect-8.2" numbered="true" toc="default">
<name>State Dependence</name>
<t> <t>
In case of Temporal Sharing, TCB information could also become There may be additional considerations to the way in which TCB
invalid over time, i.e., indicating that although the path remains interdependence rebalances congestion feedback among the current
the same, path properties have changed. Because this is similar to connections. For example, it may be appropriate to consider the impact of a
the case when a connection becomes idle, mechanisms that address connection being in Fast Recovery <xref target="RFC5681" format="default"/>
idle TCP connections (e.g., <xref target="RFC7661"/>) could also be applied t or some other similar unusual feedback state that could inhibit or affect the
o TCB calculations described herein.
cache management, especially when TCP Fast Open is used <xref target="RFC7413 </t>
"/>.</t> </section>
<section anchor="sect-8.3" numbered="true" toc="default">
</section> <name>Problems with Sharing Based on IP Address</name>
<t>
<section title="State dependence" anchor="sect-8.2"><t>
There may be additional considerations to the way in which TCB
interdependence rebalances congestion feedback among the current
connections, e.g., it may be appropriate to consider the impact of a
connection being in Fast Recovery <xref target="RFC5681"/> or some other simi
lar
unusual feedback state, e.g., as inhibiting or affecting the
calculations described herein.</t>
</section>
<section title="Problems with sharing based on IP address" anchor="sect-8 It can be wrong to share TCB information between TCP connections on the
.3"><t> same host as identified by the IP address if an IP address is assigned to a
It can be wrong to share TCB information between TCP connections on new host (e.g., IP address spinning, as is used by ISPs to inhibit running
the same host as identified by the IP address if an IP address is servers).
assigned to a new host (e.g., IP address spinning, as is used by
ISPs to inhibit running servers). It can be wrong if Network Address
(and Port) Translation (NA(P)T) <xref target="RFC2663"/> or any other IP shar
ing
mechanism is used. Such mechanisms are less likely to be used with
IPv6. Other methods to identify a host could also be considered to
make correct TCB sharing more likely. Moreover, some TCB information
is about dominant path properties rather than the specific host. IP
addresses may differ, yet the relevant part of the path may be the
same.</t>
</section> It can be wrong if Network Address Translation (NAT) <xref target="RFC2663"
format="default"/>, Network Address and Port Translation (NAPT) <xref
target="RFC2663" format="default"/>, or any other IP sharing mechanism is
used.
</section> Such mechanisms are less likely to be used with IPv6. Other methods to
identify a host could also be considered to make correct TCB sharing more
likely. Moreover, some TCB information is about dominant path properties
rather than the specific host. IP addresses may differ, yet the relevant
part of the path may be the same.</t>
</section>
<section title="Implications" anchor="sect-9"><t> </section>
<section anchor="sect-9" numbered="true" toc="default">
<name>Implications</name>
<t>
There are several implications to incorporating TCB interdependence in TCP There are several implications to incorporating TCB interdependence in TCP
implementations. First, it may reduce the need for application-layer implementations. First, it may reduce the need for application-layer
multiplexing for performance enhancement <xref target="RFC7231"/>. Protocols multiplexing for performance enhancement <xref target="RFC7231" format="defau
like HTTP/2 lt"/>. Protocols like HTTP/2
<xref target="RFC7540"/> avoid connection reestablishment costs by serializin <xref target="RFC7540" format="default"/> avoid connection re-establishment c
g or osts by serializing or
multiplexing a set of per-host connections across a single TCP multiplexing a set of per-host connections across a single TCP
connection. This avoids TCP's per-connection OPEN handshake and also avoids connection. This avoids TCP's per-connection OPEN handshake and also avoids
recomputing the MSS, RTT, and congestion window values. By avoiding the recomputing the MSS, RTT, and congestion window values. By avoiding the
so-called "slow-start restart", performance can be optimized <xref target="I-D.hughes-restart"/>. TCB so-called "slow-start restart", performance can be optimized <xref target="I-D.hughes-restart" format="default"/>. TCB
interdependence can provide the "slow-start restart avoidance" of interdependence can provide the "slow-start restart avoidance" of
multiplexing, without requiring a multiplexing mechanism at the application multiplexing, without requiring a multiplexing mechanism at the application
layer.</t> layer.</t>
<t>
<t> Like the initial version of this document <xref target="RFC2140"
Like the initial version of this document <xref target="RFC2140"/>, this upda format="default"/>, this update's approach to TCB interdependence focuses
te's on sharing a set of TCBs by updating the TCB state to reduce the impact of
approach to TCB interdependence focuses on sharing a set of TCBs by transients when connections begin, end, or otherwise significantly change
updating the TCB state to reduce the impact of transients when state.
connections begin, end, or otherwise significantly change state.
Other mechanisms have since been proposed to continuously share
information between all ongoing communication (including
connectionless protocols), updating the congestion state during any
congestion-related event (e.g., timeout, loss confirmation, etc.)
<xref target="RFC3124"/>. By dealing exclusively with transients, the approac
h in
this document is more likely to exhibit the same "steady-state" behavior
as unmodified, independent TCP connections.</t>
<section title="Layering" anchor="sect-9.1"><t> Other mechanisms have since been proposed to continuously share information
TCB interdependence pushes some of the TCP implementation from the between all ongoing communication (including connectionless protocols) and
traditional transport layer (in the ISO model), to the network update the congestion state during any congestion-related event (e.g.,
layer. This acknowledges that some state is in fact per-host-pair or timeout, loss confirmation, etc.) <xref target="RFC3124"
can be per-path as indicated solely by that host-pair. Transport format="default"/>.
protocols typically manage per-application-pair associations (per
stream), and network protocols manage per-host-pair and path
associations (routing). Round-trip time, MSS, and congestion
information could be more appropriately handled at the network
layer, aggregated among concurrent connections, and shared across
connection instances <xref target="RFC3124"/>.</t>
<t> By dealing exclusively with transients, the approach in this document is
An earlier version of RTT sharing suggested implementing RTT state more likely to exhibit the same "steady-state" behavior as unmodified,
at the IP layer, rather than at the TCP layer. Our observations independent TCP connections.</t>
describe sharing state among TCP connections, which avoids some of <section anchor="sect-9.1" numbered="true" toc="default">
the difficulties in an IP-layer solution. One such problem of an IP <name>Layering</name>
layer solution is determining the correspondence between packet
exchanges using IP header information alone, where such
correspondence is needed to compute RTT. Because TCB sharing
computes RTTs inside the TCP layer using TCP header information, it
can be implemented more directly and simply than at the IP layer.
This is a case where information should be computed at the transport
layer but could be shared at the network layer.</t>
</section> <t>
<section title="Other possibilities" anchor="sect-9.2"><t> TCB interdependence pushes some of the TCP implementation from its typical
Per-host-pair associations are not the limit of these techniques. It placement solely within the transport layer (in the ISO model) to the
is possible that TCBs could be similarly shared between hosts on a network layer.
subnet or within a cluster, because the predominant path can be
subnet-subnet, rather than host-host. Additionally, TCB This acknowledges that some components of state are, in fact, per-host-pair
interdependence can be applied to any protocol with congestion or can be per-path as indicated solely by that host-pair.
state, including SCTP <xref target="RFC4960"/> and DCCP <xref target="RFC4340
"/>, as well as for Transport protocols typically manage per-application-pair associations (per
individual subflows in Multipath TCP <xref target="RFC8684"/>.</t> stream), and network protocols manage per-host-pair and path associations
(routing). Round-trip time, MSS, and congestion information could be more
appropriately handled at the network layer, aggregated among concurrent
connections, and shared across connection instances <xref target="RFC3124"
format="default"/>.</t>
<t>
An earlier version of RTT sharing suggested implementing RTT state at the
IP layer rather than at the TCP layer. Our observations describe sharing
state among TCP connections, which avoids some of the difficulties in an
IP-layer solution. One such problem of an IP-layer solution is determining
the correspondence between packet exchanges using IP header information
alone, where such correspondence is needed to compute RTT. Because TCB
sharing computes RTTs inside the TCP layer using TCP header information, it
can be implemented more directly and simply than at the IP layer. This is
a case where information should be computed at the transport layer but
could be shared at the network layer.</t>
</section>
<section anchor="sect-9.2" numbered="true" toc="default">
<name>Other Possibilities</name>
<t>
Per-host-pair associations are not the limit of these techniques. It is
possible that TCBs could be similarly shared between hosts on a subnet or
within a cluster, because the predominant path can be subnet-subnet rather
than host-host. Additionally, TCB interdependence can be applied to any
protocol with congestion state, including SCTP <xref target="RFC4960"
format="default"/> and DCCP <xref target="RFC4340" format="default"/>, as
well as to individual subflows in Multipath TCP <xref target="RFC8684"
format="default"/>.</t>
<t>
<t>
There may be other information that can be shared between concurrent There may be other information that can be shared between concurrent
connections. For example, knowing that another connection has just connections. For example, knowing that another connection has just
tried to expand its window size and failed, a connection may not tried to expand its window size and failed, a connection may not
attempt to do the same for some period. The idea is that existing attempt to do the same for some period. The idea is that existing
TCP implementations infer the behavior of all competing connections, TCP implementations infer the behavior of all competing connections,
including those within the same host or subnet. One possible including those within the same host or subnet. One possible
optimization is to make that implicit feedback explicit, via optimization is to make that implicit feedback explicit, via
extended information associated with the endpoint IP address and its extended information associated with the endpoint IP address and its
TCP implementation, rather than per-connection state in the TCB.</t> TCP implementation, rather than per-connection state in the TCB.</t>
<t>
<t>
This document focuses on sharing TCB information at connection This document focuses on sharing TCB information at connection
initialization. Subsequent to RFC 2140, there have been numerous approaches initialization. Subsequent to RFC 2140, there have been numerous approaches
that attempt to coordinate ongoing state across concurrent connections, that attempt to coordinate ongoing state across concurrent connections,
both within TCP and other congestion-reactive protocols, which are both within TCP and other congestion-reactive protocols, which are
summarized in <xref target="Is18"/>. These approaches are more complex to imp summarized in <xref target="Is18" format="default"/>. These approaches are
lement and more complex to implement, and their comparison to steady-state TCP
their comparison to steady-state TCP equivalence can be more difficult to equivalence can be more difficult to establish, sometimes intentionally
establish, sometimes intentionally (i.e., they sometimes intend to provide (i.e., they sometimes intend to provide a different kind of "fairness" than
a different kind of "fairness" than emerges from TCP operation).</t> emerges from TCP operation).</t>
</section>
</section> </section>
</section>
<section title="Implementation Observations" anchor="sect-10"><t>
The observation that some TCB state is host-pair specific rather
than application-pair dependent is not new and is a common
engineering decision in layered protocol implementations. Although
now deprecated, T/TCP <xref target="RFC1644"/> was the first to propose using
caches in order to maintain TCB states (see Appendix A).</t>
<t>
The table below describes the current implementation status for TCB
temporal sharing in Windows as of December 2020, Apple variants
(macOS, iOS, iPadOS, tvOS, watchOS) as of January 2021, Linux kernel
version 5.10.3, and FreeBSD 12. Ensemble sharing is not yet
implemented.</t>
<figure><artwork><![CDATA[
KNOWN IMPLEMENTATION STATUS
TCB data Status
------------------------------------------------------------
old_MMS_S Not shared
old_MMS_R Not shared
old_sendMSS Cached and shared in Apple, Linux (MSS)
old_PMTU Cached and shared in Apple, FreeBSD, Windows (PMTU)
old_RTT Cached and shared in Apple, FreeBSD, Linux, Windows
old_RTTVAR Cached and shared in Apple, FreeBSD, Windows
old_TFOinfo Cached and shared in Apple, Linux, Windows
old_sendcwnd Not shared
old_ssthresh Cached and shared in Apple, FreeBSD*, Linux* <section anchor="sect-10" numbered="true" toc="default">
<name>Implementation Observations</name>
<t>
The observation that some TCB state is host-pair specific rather than
application-pair dependent is not new and is a common engineering decision
in layered protocol implementations. Although now deprecated, T/TCP <xref
target="RFC1644" format="default"/> was the first to propose using caches
in order to maintain TCB states (see <xref target="sect-a"/>).</t>
<t>
<xref target="Known_Implementation_Status"/> describes the current
implementation status for TCB temporal sharing in Windows as of
December 2020, Apple variants (macOS, iOS, iPadOS, tvOS, and watchOS)
as of January 2021, Linux kernel version 5.10.3, and FreeBSD
12. Ensemble sharing is not yet implemented.</t>
TFO failure Cached and shared in Apple <table anchor="Known_Implementation_Status">
]]></artwork> <name>KNOWN IMPLEMENTATION STATUS</name>
</figure> <thead>
<tr>
<th>TCB data</th>
<th>Status</th>
</tr>
</thead>
<tbody>
<tr>
<td>old_MMS_S</td>
<td>Not shared</td>
</tr>
<tr>
<td>old_MMS_R</td>
<td>Not shared</td>
</tr>
<tr>
<td>old_sendMSS</td>
<td>Cached and shared in Apple, Linux (MSS)</td>
</tr>
<tr>
<td>old_PMTU</td>
<td>Cached and shared in Apple, FreeBSD, Windows (PMTU)</td>
</tr>
<tr>
<td>old_RTT</td>
<td>Cached and shared in Apple, FreeBSD, Linux, Windows</td>
</tr>
<tr>
<td>old_RTTVAR</td>
<td>Cached and shared in Apple, FreeBSD, Windows</td>
</tr>
<tr>
<td>old_TFOinfo</td>
<td>Cached and shared in Apple, Linux, Windows</td>
</tr>
<tr>
<td>old_sendcwnd</td>
<td>Not shared</td>
</tr>
<tr>
<td>old_ssthresh</td>
<td>Cached and shared in Apple, FreeBSD*, Linux*</td>
</tr>
<tr>
<td>TFO failure</td>
<td>Cached and shared in Apple</td>
</tr>
</tbody>
</table>
<t> <dl>
In the table above, "Apple" refers to all Apple OSes, i.e., <dt>*</dt>
desktop/laptop macOS, phone iOS, pad iPadOS, video player tvOS, and <dd>Note:
watch watchOS, which all share the same Internet protocol stack.</t>
<t> In FreeBSD, new ssthresh is the mean of curr_ssthresh and its previous value
*Note: In FreeBSD, new ssthresh is the mean of curr_ssthresh and if a previous value exists; in Linux, the calculation depends on state and is
previous value if a previous value exists; in Linux, the calculation max(curr_cwnd/2, old_ssthresh) in most cases.</dd>
depends on state and is max(curr_cwnd/2, old_ssthresh) in most </dl>
cases.</t>
</section> <t>In <xref target="Known_Implementation_Status"/>, "Apple" refers to all
Apple OSes, i.e., macOS (desktop/laptop), iOS (phone), iPadOS (tablet), tvOS
(video player), and watchOS (smart watch), which all share the same Internet
protocol stack.
</t>
<section title="Changes Compared to RFC 2140" anchor="sect-11"><t> </section>
This document updates the description of TCB sharing in RFC 2140 and <section anchor="sect-11" numbered="true" toc="default">
its associated impact on existing and new connection state, <name>Changes Compared to RFC 2140</name>
providing a complete replacement for that document <xref target="RFC2140"/>. <t>
It This document updates the description of TCB sharing in RFC 2140 and its
clarifies the previous description and terminology and extends the associated impact on existing and new connection state, providing a
mechanism to its impact on new protocols and mechanisms, including complete replacement for that document <xref target="RFC2140"
multipath TCP, fast open, PLPMTUD, NAT, and the TCP Authentication format="default"/>. It clarifies the previous description and terminology
Option.</t> and extends the mechanism to its impact on new protocols and mechanisms,
including multipath TCP, Fast Open, PLPMTUD, NAT, and the TCP
Authentication Option.</t>
<t> <t>
The detailed impact on TCB state addresses TCB parameters in greater The detailed impact on TCB state addresses TCB parameters with greater
detail, addressing MSS in both the send and receive direction, MSS specificity. It separates the way MSS is used in both send and receive
and sendMSS separately, adds path MTU and ssthresh, and addresses directions, it separates the way both of these MSS values differ from
the impact on TCP option state.</t> sendMSS, it adds both path MTU and ssthresh, and it addresses the impact on
state associated with TCP options.
</t>
<t> <t>
New sections have been added to address compatibility issues and New sections have been added to address compatibility issues and
implementation observations. The relation of this work to T/TCP has implementation observations.
been moved to Appendix A on history, partly to reflect the deprecation of
that protocol.</t>
<t> The relation of this work to T/TCP has been moved to <xref
Appendix C has been added to discuss the potential to use temporal target="sect-a"/> (which describes the history of TCB sharing) partly to
reflect the deprecation of that protocol.
</t>
<t>
<xref target="sect-c"/> has been added to discuss the potential to use tempor
al
sharing over long timescales to adapt TCP's initial window sharing over long timescales to adapt TCP's initial window
automatically, avoiding the need to periodically revise a single automatically, avoiding the need to periodically revise a single
global constant value.</t> global constant value.</t>
<t>
<t>
Finally, this document updates and significantly expands the Finally, this document updates and significantly expands the
referenced literature.</t> referenced literature.</t>
</section>
</section> <section anchor="sect-12" numbered="true" toc="default">
<name>Security Considerations</name>
<section title="Security Considerations" anchor="sect-12"><t> <t>
These presented implementation methods do not have additional These presented implementation methods do not have additional ramifications
ramifications for direct (connection-aborting or information for direct (connection-aborting or information-injecting) attacks on
injecting) attacks on individual connections. Individual individual connections. Individual connections, whether using sharing or
connections, whether using sharing or not, also may be susceptible not, also may be susceptible to denial-of-service attacks that reduce
to denial-of-service attacks that reduce performance or completely performance or completely deny connections and transfers if not otherwise
deny connections and transfers if not otherwise secured.</t> secured.</t>
<t>
<t> TCB sharing may create additional denial-of-service attacks that affect the
TCB sharing may create additional denial-of-service attacks that performance of other connections by polluting the cached information. This
affect the performance of other connections by polluting the cached can occur across any set of connections in which the TCB is shared,
information. This can occur across whatever set of connections where between connections in a single host, or between hosts if TCB sharing is
the TCB is shared, between connections in a single host, or between implemented within a subnet (see <xref target="sect-9"
hosts if TCB sharing is implemented within a subnet (see sectionFormat="bare">"Implications"</xref>). Some shared TCB parameters are
Implications section). Some shared TCB parameters are used only to used only to create new TCBs; others are shared among the TCBs of ongoing
create new TCBs, others are shared among the TCBs of ongoing connections. New connections can join the ongoing set, e.g., to optimize
connections. New connections can join the ongoing set, e.g., to send window size among a set of connections to the same host. PMTU is
optimize send window size among a set of connections to the same defined as shared at the IP layer and is already susceptible in this
host. PMTU is defined as shared at the IP layer, and is already way.</t>
susceptible in this way.</t> <t>
<t>
Options in client SYNs can be easier to forge than complete, two-way Options in client SYNs can be easier to forge than complete, two-way
connections. As a result, their values may not be safely connections. As a result, their values may not be safely
incorporated in shared values until after the three-way handshake incorporated in shared values until after the three-way handshake
completes.</t> completes.</t>
<t>
<t>
Attacks on parameters used only for initialization affect only the Attacks on parameters used only for initialization affect only the
transient performance of a TCP connection. For short connections, the transient performance of a TCP connection. For short connections, the
performance ramification can approach that of a denial-of-service performance ramification can approach that of a denial-of-service
attack. E.g., if an application changes its TCB to have a false and small attack. For example, if an application changes its TCB to have a false and small
window size, subsequent connections will experience performance degradation window size, subsequent connections will experience performance degradation
until their window grew appropriately.</t> until their window grows appropriately.</t>
<t>
<t>
TCB sharing reuses and mixes information from past and current TCB sharing reuses and mixes information from past and current
connections. Although reusing information could create a potential connections. Although reusing information could create a potential
for fingerprinting to identify hosts, the mixing reduces that for fingerprinting to identify hosts, the mixing reduces that
potential. There has been no evidence of fingerprinting based on potential. There has been no evidence of fingerprinting based on
this technique and it is currently considered safe in that regard. this technique, and it is currently considered safe in that regard.
Further, information about the performance of a TCP connection has Further, information about the performance of a TCP connection has
not been considered as private.</t> not been considered as private.</t>
</section>
<section anchor="sect-13" numbered="true" toc="default">
<name>IANA Considerations</name>
<t>
This document has no IANA actions.</t>
</section> </section>
</middle>
<section title="IANA Considerations" anchor="sect-13"><t> <back>
There are no IANA implications or requests in this document.</t>
<t> <displayreference target="I-D.allman-tcpm-bump-initcwnd" to="Al10"/>
This section should be removed upon final publication as an RFC.</t> <displayreference target="I-D.ietf-tcpm-generalized-ecn" to="Ba20"/>
<displayreference target="I-D.hughes-restart" to="Hu01"/>
</section> <references>
<name>References</name>
<references>
<name>Normative References</name>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.0793.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.1122.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.1191.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.2119.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.4821.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.5681.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.6298.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.7413.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.8174.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.8201.xml"/>
</references>
<references>
<name>Informative References</name>
</middle> <xi:include href="https://datatracker.ietf.org/doc/bibxml3/reference.I-D.allman-tcpm-bump-initcwnd.xml"/>
<back> <reference anchor="Ba12">
<references title="Normative References"> <front>
&RFC0793; <title>LISA: A linked slow-start algorithm for MPTCP</title>
&RFC1122; <author initials="R." surname="Barik" fullname="Runa Barik">
&RFC1191;
&RFC2119;
&RFC4821;
&RFC5681;
&RFC6298;
&RFC7413;
&RFC8174;
&RFC8201;
</references>
<references title="Informative References">
&I-D.allman-tcpm-bump-initcwnd;
<reference anchor="Ba12"><front>
<title>LISA: A Linked Slow-Start Algorithm for MPTCP</title>
<author initials="R." surname="Barik" fullname="R. Barik">
</author> </author>
<author initials="M." surname="Welzl" fullname="Michael Welzl">
<author initials="M." surname="Welzl" fullname="M. Welzl">
</author> </author>
<author initials="S." surname="Ferlin" fullname="Simone Ferlin">
<author initials="S." surname="Ferlin" fullname="S. Ferlin">
</author> </author>
<author initials="O." surname="Alay" fullname="Ozgu Alay">
<author initials="O." surname="Alay" fullname="O. Alay">
</author> </author>
<date month="May" year="2016"/>
</front>
<refcontent>IEEE ICC
</refcontent>
<seriesInfo name="DOI" value="10.1109/ICC.2016.7510786"/>
</reference>
<date month="May" year="2016"/> <xi:include href="https://datatracker.ietf.org/doc/bibxml3/reference.I-D
</front> .ietf-tcpm-generalized-ecn.xml"/>
<seriesInfo name="IEEE" value="ICC"/> <reference anchor="Be94">
</reference> <front>
&I-D.ietf-tcpm-generalized-ecn; <title>The World-Wide Web</title>
<reference anchor="Be94"><front> <author initials="T." surname="Berners-Lee" fullname="Tim Berners-Le
<title>The World-Wide Web</title> e">
<author initials="T." surname="Berners-Lee" fullname="T. Berners-Lee">
</author> </author>
<author initials="C." surname="Cailliau" fullname="Robert Cailliau"/
>
<author initials="A." surname="Luotonen" fullname="Ari Luotonen"/>
<author initials="H." surname="Nielsen" fullname="Henrik Frystyk Niel
sen"/>
<author initials="A." surname="Secret" fullname="Arthur Secret"/>
<date month="August" year="1994"/> <date month="August" year="1994"/>
</front> </front>
<seriesInfo name="DOI" value="10.1145/179606.179671"/>
<seriesInfo name="Communications" value="of the ACM"/> <refcontent>Communications of the ACM V37, pp. 76-82</refcontent>
</reference>
<reference anchor="Br94"><front>
<title>T/TCP -- Transaction TCP: Source Changes for Sun OS 4.1.3</title>
<author initials="B." surname="Braden" fullname="B. Braden">
</author>
<date month="September" year="1994"/> </reference>
</front>
</reference> <reference anchor="Br94">
<reference anchor="Br02"><front> <front>
<title>Understanding Internet Traffic Streams: Dragonflies and Tortoises< <title>T/TCP -- Transaction TCP: Source Changes for Sun OS 4.1.3</ti
/title> tle>
<author initials="N." surname="Brownlee" fullname="N. Brownlee"> <author initials="B." surname="Braden" fullname="Bob Braden">
</author> </author>
<date month="September" year="1994"/>
</front>
<refcontent>USC/ISI Release 1.0</refcontent>
</reference>
<author initials="K." surname="Claffy" fullname="K. Claffy"> <reference anchor="Br02">
<front>
<title>Understanding Internet traffic streams: dragonflies and torto
ises</title>
<author initials="N" surname="Brownlee" fullname="Nevil Brownlee">
</author> </author>
<author initials="KC" surname="Claffy" fullname="KC Claffy">
<date year="2002"/>
</front>
<seriesInfo name="IEEE" value="Communications Magazine p110-117"/>
</reference>
<reference anchor="Co91"><front>
<title>Internetworking with TCP/IP</title>
<author initials="D." surname="Comer" fullname="D. Comer">
</author> </author>
<date year="2002"/>
</front>
<seriesInfo name="DOI" value="10.1109/MCOM.2002.1039865"/>
<refcontent>IEEE Communications Magazine, pp. 110-117</refcontent>
</reference>
<author initials="D." surname="Stevens" fullname="D. Stevens"> <reference anchor="Co91">
<front>
<title>Internetworking with TCP/IP</title>
<author initials="D" surname="Comer" fullname="Douglas Comer">
</author> </author>
<author initials="D" surname="Stevens" fullname="David Stevens">
<date year="1991"/>
</front>
</reference>
<reference anchor="Du16"><front>
<title>Research Impacting the Practice of Congestion Control</title>
<author>
<organization>Dukkipati, N., Yuchung C. and V. Amin</organization>
</author> </author>
<date year="1991"/>
</front>
<seriesInfo name='ISBN 10:' value='0134685059' />
<seriesInfo name='ISBN 13:' value='9780134685052' />
</reference>
<date month="July" year="2016"/> <reference anchor="Du16">
</front> <front>
<title>Research Impacting the Practice of Congestion Control</title>
<author initials="N" surname="Dukkipati" fullname="Nandita Dukkipati
"/>
<author initials="Y" surname="Cheng" fullname="Yuchung Cheng"/>
<author initials="A" surname="Vahdat" fullname="Amin Vahdat"/>
<date month="July" year="2016"/>
</front>
<refcontent>Computer Communication Review</refcontent>
<refcontent>The ACM SIGCOMM newsletter</refcontent>
</reference>
<seriesInfo name="ACM" value="SIGCOMM CCR editorial"/> <reference anchor="FreeBSD" target="https://www.freebsd.org/">
</reference> <front>
<reference anchor="FreeBSD" target="http://www.freebsd.org/"><front> <title>The FreeBSD Project</title>
<title>FreeBSD source code</title> <author>
<author> <organization>FreeBSD</organization>
</author> </author>
<date/>
</front>
</reference>
<date/> <reference anchor="I-D.hughes-restart">
</front> <front>
<title>Issues in TCP Slow-Start Restart After Idle</title>
</reference> <author initials="A" surname="Hughes" fullname="Amy Hughes"/>
&I-D.hughes-restart; <author initials="J" surname="Touch" fullname="Joe Touch"/>
<reference anchor="Hu12"><front> <author initials="J" surname="Heidemann" fullname="John Heidemann"/>
<title>Enhanced metric caching for short TCP flows</title>
<author initials="P." surname="Hurtig" fullname="P. Hurtig">
</author>
<author initials="A." surname="Brunstrom" fullname="A. Brunstrom"> <date month="December" year="2001" />
</author> </front>
<date year="2012"/> <seriesInfo name="Internet-Draft" value="draft-hughes-restart-00" />
</front> </reference>
<seriesInfo name="IEEE" value="International Conference on Communications <reference anchor="Hu12">
"/> <front>
</reference> <title>Enhanced metric caching for short TCP flows</title>
<reference anchor="IANA" target="https://www.iana.org/assignments/tcp-par <author initials="P." surname="Hurtig" fullname="Per Hurtig">
ameters"><front>
<title>IANA TCP Parameters (options) registry</title>
<author>
</author> </author>
<author initials="A." surname="Brunstrom" fullname="Anna Brunstrom">
<date/>
</front>
</reference>
<reference anchor="Is18"><front>
<title>ctrlTCP: Reducing Latency through Coupled Heterogeneous Multi-Flow
TCP Congestion Control</title>
<author initials="S." surname="Islam" fullname="S. Islam">
</author> </author>
<date year="2012"/>
</front>
<seriesInfo name="DOI" value="10.1109/ICC.2012.6364516"/>
<refcontent>IEEE International Conference on Communications</refcontent>
</reference>
<author initials="M." surname="Welzl" fullname="M. Welzl"> <reference anchor="IANA" target="https://www.iana.org/assignments/tcp-pa
rameters">
<front>
<title>Transmission Control Protocol (TCP) Parameters</title>
<author>
<organization>IANA</organization>
</author> </author>
<date/>
</front>
</reference>
<author initials="K." surname="Hiorth" fullname="K. Hiorth"> <reference anchor="Is18">
<front>
<title>ctrlTCP: Reducing latency through coupled, heterogeneous
multi-flow TCP congestion control</title>
<author initials="S." surname="Islam" fullname="Safiqul Islam">
</author> </author>
<author initials="M." surname="Welzl" fullname="Michael Welzl">
<author initials="D." surname="Hayes" fullname="D. Hayes">
</author> </author>
<author initials="K." surname="Hiorth" fullname="Kristian Hiorth">
<author initials="G." surname="Armitage" fullname="G. Armitage">
</author> </author>
<author initials="D." surname="Hayes" fullname="David Hayes">
<author initials="S." surname="Gjessing" fullname="S. Gjessing">
</author> </author>
<author initials="G." surname="Armitage" fullname="Grenville Armitag
<date month="April" year="2018"/> e">
</front>
<seriesInfo name="Proc" value="IEEE INFOCOM Global Internet Symposium GI
workshop"/>
</reference>
<reference anchor="Ja88"><front>
<title>Congestion Avoidance and Control</title>
<author initials="V." surname="Jacobson" fullname="V. Jacobson">
</author> </author>
<author initials="S." surname="Gjessing" fullname="Stein Gjessing">
<author initials="M." surname="Karels" fullname="M. Karels">
</author> </author>
<date month="April" year="2018"/>
</front>
<seriesInfo name="DOI" value="10.1109/INFCOMW.2018.8406887"/>
<refcontent>IEEE INFOCOM 2018 - IEEE Conference on Computer
Communications Workshops (INFOCOM WKSHPS)</refcontent>
</reference>
<date year="1988"/> <reference anchor="Ja88">
</front> <front>
<title>Congestion Avoidance and Control</title>
<seriesInfo name="Proc" value="Sigcomm"/> <author initials="V." surname="Jacobson" fullname="Van Jacobson">
</reference> </author>
&RFC1644; <author initials="M." surname="Karels" fullname="Michael Karels">
&RFC1379; </author>
&RFC2001; <date month="November" year="1988"/>
&RFC2140; </front>
&RFC2414; <refcontent>SIGCOMM Symposium proceedings on Communications
&RFC2663; architectures and protocols
&RFC3390; </refcontent>
&RFC3124; </reference>
&RFC4340;
&RFC4960;
&RFC5925;
&RFC6437;
&RFC6691;
&RFC6928;
&RFC7231;
&RFC7323;
&RFC7424;
&RFC7540;
&RFC7661;
&RFC8684;
</references>
<section title="TCB Sharing History" anchor="sect-a"><t>
T/TCP proposed using caches to maintain TCB information across
instances (temporal sharing), e.g., smoothed RTT, RTT variation,
congestion avoidance threshold, and MSS <xref target="RFC1644"/>. These value
s were
in addition to connection counts used by T/TCP to accelerate data
delivery prior to the full three-way handshake during an OPEN. The
goal was to aggregate TCB components where they reflect one
association - that of the host-pair, rather than artificially
separating those components by connection.</t>
<t> <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.1644.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.1379.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.2001.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.2140.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.2414.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.2663.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.3390.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.3124.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.4340.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.4960.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.5925.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.6437.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.6691.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.6928.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.7231.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.7323.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.7424.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.7540.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.7661.xml"/>
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R
FC.8684.xml"/>
</references>
</references>
<section anchor="sect-a" numbered="true" toc="default">
<name>TCB Sharing History</name>
<t>
T/TCP proposed using caches to maintain TCB information across instances
(temporal sharing), e.g., smoothed RTT, RTT variation, congestion-avoidance
threshold, and MSS <xref target="RFC1644" format="default"/>. These values
were in addition to connection counts used by T/TCP to accelerate data
delivery prior to the full three-way handshake during an OPEN. The goal was
to aggregate TCB components where they reflect one association -- that of the
host-pair rather than artificially separating those components by
connection.</t>
<t>
At least one T/TCP implementation saved the MSS and aggregated the At least one T/TCP implementation saved the MSS and aggregated the
RTT parameters across multiple connections but omitted caching the RTT parameters across multiple connections but omitted caching the
congestion window information <xref target="Br94"/>, as originally specified congestion window information <xref target="Br94" format="default"/>, as orig
in inally specified in
<xref target="RFC1379"/>. Some T/TCP implementations immediately updated MSS <xref target="RFC1379" format="default"/>. Some T/TCP implementations immedia
when tely updated MSS when
the TCP MSS header option was received <xref target="Br94"/>, although this w the TCP MSS header option was received <xref target="Br94" format="default"/>
as not , although this was not
addressed specifically in the concepts or functional specification addressed specifically in the concepts or functional specification
<xref target="RFC1379"/><xref target="RFC1644"/>. In later T/TCP implementati ons, RTT values were <xref target="RFC1379" format="default"/> <xref target="RFC1644" format="defa ult"/>. In later T/TCP implementations, RTT values were
updated only after a CLOSE, which does not benefit concurrent updated only after a CLOSE, which does not benefit concurrent
sessions.</t> sessions.</t>
<t>
Temporal sharing of cached TCB data was originally implemented in the Sun
OS 4.1.3 T/TCP extensions <xref target="Br94" format="default"/> and the
FreeBSD port of same <xref target="FreeBSD" format="default"/>. As
mentioned before, only the MSS and RTT parameters were cached, as originally
specified in <xref target="RFC1379" format="default"/>. Later discussion of
T/TCP suggested including congestion control parameters in this cache; for
example, <xref target="RFC1644" sectionFormat="of" section="3.1"
format="default"/> hints at initializing the congestion window to the old
window size.</t>
</section>
<section anchor="sect-b" numbered="true" toc="default">
<t> <name>TCP Option Sharing and Caching</name>
Temporal sharing of cached TCB data was originally implemented in <t>
the SunOS 4.1.3 T/TCP extensions <xref target="Br94"/> and the FreeBSD port o In addition to the options that can be cached and shared, this memo also
f same lists known TCP options <xref target="IANA" format="default"/> for which
<xref target="FreeBSD"/>. As mentioned before, only the MSS and RTT parameter state is unsafe to be kept. This list is not intended to be authoritative
s were or exhaustive.</t>
cached, as originally specified in <xref target="RFC1379"/>. Later discussion
of
T/TCP suggested including congestion control parameters in this
cache; for example, <xref target="RFC1644"/> (Section 3.1) hints at initializ
ing
the congestion window to the old window size.</t>
</section>
<section title="TCP Option Sharing and Caching" anchor="sect-b"><t>
In addition to the options that can be cached and shared, this memo
also lists known TCP options <xref target="IANA"/> for which state is unsafe
to be
kept. This list is not intended to be authoritative or exhaustive.</t>
<figure><artwork><![CDATA[
Obsolete (unsafe to keep state):
ECHO
ECHO REPLY <t>Obsolete (unsafe to keep state):
</t>
<ul empty="true">
PO Conn permitted <li>Echo
</li>
PO service profile <li>Echo Reply
</li>
CC <li>Partial Order Connection Permitted
</li>
CC.NEW <li>Partial Order Service Profile
</li>
CC.ECHO <li>CC
</li>
Alt CS req <li>CC.NEW
</li>
Alt CS data <li>CC.ECHO
</li>
No state to keep: <li>TCP Alternate Checksum Request
</li>
EOL <li>TCP Alternate Checksum Data
</li>
NOP </ul>
WS <t>No state to keep:
</t>
SACK <ul empty="true">
<li>End of Option List (EOL)
</li>
<li>No-Operation (NOP)
</li>
<li>Window Scale (WS)
</li>
<li>SACK
</li>
<li>Timestamps (TS)
</li>
<li>MD5 Signature Option
</li>
<li>TCP Authentication Option (TCP-AO)
</li>
<li>RFC3692-style Experiment 1
</li>
<li>RFC3692-style Experiment 2
</li>
</ul>
TS <t>Unsafe to keep state:
</t>
MD5 <ul empty="true">
TCP-AO <li>Skeeter (DH exchange, known to be vulnerable)
</li>
EXP1 <li>Bubba (DH exchange, known to be vulnerable)
</li>
EXP2 <li>Trailer Checksum Option
</li>
Unsafe to keep state: <li>SCPS capabilities
</li>
Skeeter (DH exchange, known to be vulnerable) <li>Selective Negative Acknowledgements (S-NACK)
</li>
Bubba (DH exchange, known to be vulnerable) <li>Records Boundaries
</li>
Trailer CS <li>Corruption experienced
</li>
SCPS capabilities <li>SNAP
</li>
S-NACK <li>TCP Compression Filter
</li>
Records boundaries <li>Quick-Start Response
</li>
Corruption experienced <li>User Timeout Option (UTO)
</li>
SNAP <li>Multipath TCP (MPTCP) negotiation success (see below for negotiation failure
)
</li>
TCP Compression <li>TCP Fast Open (TFO) negotiation success (see below for negotiation failure)
</li>
Quickstart response </ul>
UTO <t>Safe but optional to keep state:
</t>
MPTCP negotiation success (see below for negotiation failure) <ul empty="true">
<li>Multipath TCP (MPTCP) negotiation failure (to avoid negotiation retries)
</li>
TFO negotiation success (see below for negotiation failure) <li>Maximum Segment Size (MSS)
</li>
Safe but optional to keep state: <li>TCP Fast Open (TFO) negotiation failure (to avoid negotiation retries)
</li>
MPTCP negotiation failure (to avoid negotiation retries) </ul>
MSS <t>Safe and necessary to keep state:
</t>
TFO negotiation failure (to avoid negotiation retries) <ul empty="true">
Safe and necessary to keep state: <li>TCP Fast Open (TFO) Cookie (if TFO succeeded in the past)
</li>
TFO cookie (if TFO succeeded in the past) </ul>
]]></artwork>
</figure>
</section>
<section title="Automating the Initial Window in TCP over Long Timescales </section>
" anchor="sect-c"><section title="Introduction" anchor="sect-c.1"><t> <section anchor="sect-c" numbered="true" toc="default">
<name>Automating the Initial Window in TCP over Long Timescales</name>
<section anchor="sect-c.1" numbered="true" toc="default">
<name>Introduction</name>
<t>
Temporal sharing, as described earlier in this document, builds on Temporal sharing, as described earlier in this document, builds on
the assumption that multiple consecutive connections between the the assumption that multiple consecutive connections between the
same host pair are somewhat likely to be exposed to similar same host-pair are somewhat likely to be exposed to similar
environment characteristics. The stored information can become less environment characteristics. The stored information can become less
accurate over time and suitable precautions should take this ageing accurate over time and suitable precautions should take this aging
into consideration (this is discussed further in section 8.1). into consideration (this is discussed further in <xref target="sect-8.1"/>).
However, there are also cases where it can make sense to track these However, there are also cases where it can make sense to track these
values over longer periods, observing properties of TCP connections values over longer periods, observing properties of TCP connections
to gradually influence evolving trends in TCP parameters. This to gradually influence evolving trends in TCP parameters. This
appendix describes an example of such a case.</t> appendix describes an example of such a case.</t>
<t>
<t>
TCP's congestion control algorithm uses an initial window value TCP's congestion control algorithm uses an initial window value
(IW), both as a starting point for new connections and as an upper (IW) both as a starting point for new connections and as an upper
limit for restarting after an idle period <xref target="RFC5681"/><xref targe limit for restarting after an idle period <xref target="RFC5681" format="defa
t="RFC7661"/>. This ult"/> <xref target="RFC7661" format="default"/>. This
value has evolved over time, originally one maximum segment size value has evolved over time; it was originally 1 maximum segment size
(MSS), and increased to the lesser of four MSS or 4,380 bytes (MSS) and increased to the lesser of 4 MSSs or 4,380 bytes
<xref target="RFC3390"/><xref target="RFC5681"/>. For a typical Internet conn <xref target="RFC3390" format="default"/> <xref target="RFC5681" format="defa
ection with a maximum ult"/>. For a typical Internet connection with a maximum
transmission unit (MTU) of 1500 bytes, this permits three segments transmission unit (MTU) of 1500 bytes, this permits 3 segments
of 1,460 bytes each.</t> of 1,460 bytes each.</t>
<t>
<t> The IW value was originally implied in the original TCP congestion control
The IW value was originally implied in the original TCP congestion description and documented as a standard in 1997 <xref target="RFC2001"
control description and documented as a standard in 1997 format="default"/> <xref target="Ja88" format="default"/>. The value was
<xref target="RFC2001"/><xref target="Ja88"/>. The value was updated in 1998 updated in 1998 experimentally and moved to the Standards Track in 2002
experimentally and <xref target="RFC2414" format="default"/> <xref target="RFC3390"
moved to the standards track in 2002 <xref target="RFC2414"/><xref target="RF format="default"/>. In 2013, it was experimentally increased to 10 <xref
C3390"/>. In 2013, it target="RFC6928" format="default"/>.</t>
was experimentally increased to 10 <xref target="RFC6928"/>.</t> <t>
<t>
This appendix discusses how TCP can objectively measure when an IW This appendix discusses how TCP can objectively measure when an IW
is too large, and that such feedback should be used over long is too large and that such feedback should be used over long
timescales to adjust the IW automatically. The result should be timescales to adjust the IW automatically. The result should be
safer to deploy and might avoid the need to repeatedly revisit IW safer to deploy and might avoid the need to repeatedly revisit IW
over time.</t> over time.</t>
<t>
<t>
Note that this mechanism attempts to make the IW more adaptive over Note that this mechanism attempts to make the IW more adaptive over
time. It can increase the IW beyond that which is currently time. It can increase the IW beyond that which is currently
recommended for widescale deployment, and so its use should be recommended for wide-scale deployment, so its use should be
carefully monitored.</t> carefully monitored.</t>
</section>
</section> <section anchor="sect-c.2" numbered="true" toc="default">
<name>Design Considerations</name>
<section title="Design Considerations" anchor="sect-c.2"><t> <t>
TCP's IW value has existed statically for over two decades, so any TCP's IW value has existed statically for over two decades, so any
solution to adjusting the IW dynamically should have similarly solution to adjusting the IW dynamically should have similarly
stable, non-invasive effects on the performance and complexity of stable, non-invasive effects on the performance and complexity of
TCP. In order to be fair, the IW should be similar for most machines TCP. In order to be fair, the IW should be similar for most machines
on the public Internet. Finally, a desirable goal is to develop a on the public Internet. Finally, a desirable goal is to develop a
self-correcting algorithm, so that IW values that cause network self-correcting algorithm so that IW values that cause network
problems can be avoided. To that end, we propose the following problems can be avoided. To that end, we propose the following
design goals:</t> design goals:</t>
<ul spacing="normal">
<t><list style="symbols"><t>Impart little to no impact to TCP in the abse <li>Impart little to no impact to TCP in the absence of loss, i.e.,
nce of loss, i.e.,
it should not increase the complexity of default packet it should not increase the complexity of default packet
processing in the normal case.</t> processing in the normal case.</li>
<li>Adapt to network feedback over long timescales, avoiding values
<t>Adapt to network feedback over long timescales, avoiding values that persistently cause network problems.</li>
that persistently cause network problems.</t> <li>Decrease the IW in the presence of sustained loss of IW segments,
as determined over a number of different connections.</li>
<t>Decrease the IW in the presence of sustained loss of IW segments, <li>Increase the IW in the absence of sustained loss of IW segments,
as determined over a number of different connections.</t> as determined over a number of different connections.</li>
<li>Operate conservatively, i.e., tend towards leaving the IW the
<t>Increase the IW in the absence of sustained loss of IW segments,
as determined over a number of different connections.</t>
<t>Operate conservatively, i.e., tend towards leaving the IW the
same in the absence of sufficient information, and give greater same in the absence of sufficient information, and give greater
consideration to IW segment loss than IW segment success.</t> consideration to IW segment loss than IW segment success.</li>
</ul>
</list> <t>
</t>
<t>
We expect that, without other context, a good IW algorithm will We expect that, without other context, a good IW algorithm will
converge to a single value, but this is not required. An endpoint converge to a single value, but this is not required. An endpoint
with additional context or information, or deployed in a constrained with additional context or information, or deployed in a constrained
environment, can always use a different value. In particular, environment, can always use a different value. In particular,
information from previous connections, or sets of connections with a information from previous connections, or sets of connections with a
similar path, can already be used as context for such decisions (as similar path, can already be used as context for such decisions (as
noted in the core of this document).</t> noted in the core of this document).</t>
<t>
<t>
However, if a given IW value persistently causes packet loss during However, if a given IW value persistently causes packet loss during
the initial burst of packets, it is clearly inappropriate and could the initial burst of packets, it is clearly inappropriate and could
be inducing unnecessary loss in other competing connections. This be inducing unnecessary loss in other competing connections. This
might happen for sites behind very slow boxes with small buffers, might happen for sites behind very slow boxes with small buffers,
which may or may not be the first hop.</t> which may or may not be the first hop.</t>
</section>
</section> <section anchor="sect-c.3" numbered="true" toc="default">
<name>Proposed IW Algorithm</name>
<section title="Proposed IW Algorithm" anchor="sect-c.3"><t> <t>
Below is a simple description of the proposed IW algorithm. It Below is a simple description of the proposed IW algorithm. It
relies on the following parameters:</t> relies on the following parameters:</t>
<ul spacing="normal">
<t><list style="symbols"><t>MinIW = 3 MSS or 4,380 bytes (as per <xref ta <li>MinIW = 3 MSS or 4,380 bytes (as per <xref target="RFC3390" format
rget="RFC3390"/>)</t> ="default"/>)</li>
<li>MaxIW = 10 MSS (as per <xref target="RFC6928" format="default"/>)<
<t>MaxIW = 10 MSS (as per <xref target="RFC6928"/>)</t> /li>
<li>MulDecr = 0.5</li>
<t>MulDecr = 0.5</t> <li>AddIncr = 2 MSS</li>
<li>Threshold = 0.05</li>
<t>AddIncr = 2 MSS</t> </ul>
<t>
<t>Threshold = 0.05</t>
</list>
</t>
<t>
We assume that the minimum IW (MinIW) should be as currently specified as We assume that the minimum IW (MinIW) should be as currently specified as
standard <xref target="RFC3390"/>. The maximum IW can be set to a fixed standard <xref target="RFC3390" format="default"/>. The maximum IW (MaxIW) ca
value (we suggest using the experimental and now somewhat de- facto n be
standard in <xref target="RFC6928"/>) or set based on a schedule if trusted set to a fixed value (we suggest using the experimental and now somewhat de
time references are available <xref facto standard in <xref target="RFC6928" format="default"/>) or set based
target="I-D.allman-tcpm-bump-initcwnd"/>; here we prefer a fixed value. We on a schedule if trusted time references are available <xref
also propose to use an AIMD algorithm, with increases and decreases as target="I-D.allman-tcpm-bump-initcwnd" format="default"/>; here, we prefer
noted.</t> a fixed value. We also propose to use an Additive Increase Multiplicative
Decrease (AIMD) algorithm, with increases and decreases as noted.</t>
<t> <t>
Although these parameters are somewhat arbitrary, their initial Although these parameters are somewhat arbitrary, their initial
values are not important except that the algorithm is AIMD and the values are not important except that the algorithm is AIMD and the
MaxIW should not exceed that recommended for other systems on the MaxIW should not exceed that recommended for other systems on the
Internet (here we selected the current de-facto standard rather than Internet (here, we selected the current de facto standard rather than
the actual standard). Current proposals, including default current the actual standard). Current proposals, including default current
operation, are degenerate cases of the algorithm below for given operation, are degenerate cases of the algorithm below for given
parameters - notably MulDecr = 1.0 and AddIncr = 0 MSS, thus parameters, notably MulDecr = 1.0 and AddIncr = 0 MSS, thus
disabling the automatic part of the algorithm.</t> disabling the automatic part of the algorithm.</t>
<t>
<t>
The proposed algorithm is as follows:</t> The proposed algorithm is as follows:</t>
<figure><artwork><![CDATA[ <ol>
1. On boot:
IW = MaxIW; # assume this is in bytes, and indicates an integer
multiple of 2 MSS (an even number to support ACK compression)
2. Upon starting a new connection:
CWND = IW; <li>
conncount++; <t>On boot:</t>
IWnotchecked = 1; # true <sourcecode type="pseudocode">
IW = MaxIW; # assume this is in bytes and indicates an integer
# multiple of 2 MSS (an even number to support
# ACK compression)
</sourcecode>
</li>
3. During a connection's SYN-ACK processing, if SYN-ACK includes ECN <li><t>Upon starting a new connection:</t>
(as similarly addressed in Sec 5 of ECN++ for TCP [Ba20]), treat <sourcecode type="pseudocode">
as if the IW is too large: CWND = IW;
conncount++;
IWnotchecked = 1; # true
</sourcecode>
</li>
if (IWnotchecked && (synackecn == 1)) { <li>
losscount++; <t>During a connection's SYN-ACK processing, if SYN-ACK includes ECN (as
IWnotchecked = 0; # never check again similarly addressed in Section 5 of ECN++ for TCP <xref
} target="I-D.ietf-tcpm-generalized-ecn"/>), treat as if the IW is too large:
</t>
<sourcecode type="pseudocode">
if (IWnotchecked &amp;&amp; (synackecn == 1)) {
losscount++;
IWnotchecked = 0; # never check again
}
</sourcecode>
</li>
4. During a connection, if retransmission occurs, check the seqno of <li><t>During a connection, if retransmission occurs, check the seqno of the
the outgoing packet (in bytes) to see if the resent segment fixes outgoing packet (in bytes) to see if the re-sent segment fixes an IW loss:</t>
an IW loss: <sourcecode type="pseudocode">
if (Retransmitting &amp;&amp; IWnotchecked &amp;&amp; ((seqno - ISN) &lt; IW)
) {
losscount++;
IWnotchecked = 0; # never do this entire "if" again
} else {
IWnotchecked = 0; # you're beyond the IW so stop checking
}
</sourcecode>
</li>
if (Retransmitting && IWnotchecked && ((seqno - ISN) < IW)) { <li>
losscount++; <t>Once every 1000 connections, as a separate process (i.e., not as part of
IWnotchecked = 0; # never do this entire "if" again processing a given connection):
</t>
<sourcecode type="pseudocode">
if (conncount > 1000) {
if (losscount/conncount > threshold) {
# the number of connections with errors is too high
IW = IW * MulDecr;
} else { } else {
IWnotchecked = 0; # you're beyond the IW so stop checking IW = IW + AddIncr;
} }
}
</sourcecode>
</li>
5. Once every 1000 connections, as a separate process (i.e., not as </ol>
part of processing a given connection):
if (conncount > 1000) {
if (losscount/conncount > threshold) {
# the number of connections with errors is too high
IW = IW * MulDecr;
} else {
IW = IW + AddIncr;
}
}
]]></artwork>
</figure>
<t>
As presented, this algorithm can yield a false positive when the
sequence number wraps around, e.g., the code might increment
losscount in step 4 when no loss occurred or fail to increment
losscount when a loss did occur. This can be avoided using either
PAWS <xref target="RFC7323"/> context or internal extended sequence number
representations (as in TCP-AO <xref target="RFC5925"/>). Alternately, false
positives can be tolerated because they are expected to be
infrequent and thus will not significantly impact the algorithm.</t>
<t> <t>
As presented, this algorithm can yield a false positive when the sequence
number wraps around, e.g., the code might increment losscount in step 4
when no loss occurred or fail to increment losscount when a loss did
occur. This can be avoided using either Protection Against Wrapped
Sequences (PAWS) <xref target="RFC7323" format="default"/> context or
internal extended sequence number representations (as in TCP Authentication
Option (TCP-AO) <xref target="RFC5925" format="default"/>). Alternately,
false positives can be tolerated because they are expected to be infrequent
and thus will not significantly impact the algorithm.</t>
<t>
A number of additional constraints need to be imposed if this A number of additional constraints need to be imposed if this
mechanism is implemented to ensure that it defaults to values that mechanism is implemented to ensure that it defaults to values that
comply with current Internet standards, is conservative in how it comply with current Internet standards, is conservative in how it
extends those values, and returns to those values in the absence of extends those values, and returns to those values in the absence of
positive feedback (i.e., success). To that end, we recommend the positive feedback (i.e., success). To that end, we recommend the
following list of example constraints:</t> following list of example constraints:</t>
<t> <ul>
&gt;&gt; The automatic IW algorithm MUST initialize MaxIW to a value no <li> <t> The automatic IW algorithm <bcp14>MUST</bcp14> initialize MaxIW to a
larger than the currently recommended Internet default, in the value no larger than the currently recommended Internet default in the
absence of other context information.</t> absence of other context information.</t>
<t>
<t>
Thus, if there are too few connections to make a decision or if Thus, if there are too few connections to make a decision or if
there is otherwise insufficient information to increase the IW, then there is otherwise insufficient information to increase the IW, then
the MaxIW defaults to the current recommended value.</t> the MaxIW defaults to the current recommended value.</t></li>
<t> <li> <t>
&gt;&gt; An implementation MAY allow the MaxIW to grow beyond the An implementation <bcp14>MAY</bcp14> allow the MaxIW to grow beyond the
currently recommended Internet default, but not more than 2 segments currently recommended Internet default but not more than 2 segments
per calendar year.</t> per calendar year.</t>
<t>
<t> Thus, if an endpoint has a persistent history of successfully transmitting
Thus, if an endpoint has a persistent history of successfully IW segments without loss, then it is allowed to probe the Internet to
transmitting IW segments without loss, then it is allowed to probe determine if larger IW values have similar success. This probing is
the Internet to determine if larger IW values have similar success. limited and requires a trusted time source; otherwise, the MaxIW remains
This probing is limited and requires a trusted time source, constant.</t></li>
otherwise the MaxIW remains constant.</t> <li>
<t>
<t> An implementation <bcp14>MUST</bcp14> adjust the IW based on loss statistics
&gt;&gt; An implementation MUST adjust the IW based on loss statistics at at
least once every 1000 connections.</t> least once every 1000 connections.</t>
<t>
<t>
An endpoint needs to be sufficiently reactive to IW loss.</t> An endpoint needs to be sufficiently reactive to IW loss.</t>
</li>
<t> <li> <t>
&gt;&gt; An implementation MUST decrease the IW by at least one MSS when An implementation <bcp14>MUST</bcp14> decrease the IW by at least 1 MSS when
indicated during an evaluation interval.</t> indicated during an evaluation interval.</t>
<t>
<t>
An endpoint that detects loss needs to decrease its IW by at least An endpoint that detects loss needs to decrease its IW by at least
one MSS, otherwise it is not participating in an automatic reactive 1 MSS; otherwise, it is not participating in an automatic reactive
algorithm.</t> algorithm.</t></li>
<li>
<t> <t>
&gt;&gt; An implementation MUST increase by no more than 2 MSS per An implementation <bcp14>MUST</bcp14> increase by no more than 2 MSSs per
evaluation interval.</t> evaluation interval.</t>
<t>
<t>
An endpoint that does not experience IW loss needs to probe the An endpoint that does not experience IW loss needs to probe the
network incrementally.</t> network incrementally.</t>
</li>
<t> <li>
&gt;&gt; An implementation SHOULD use an IW that is an integer multiple of <t>
2 MSS.</t> An implementation <bcp14>SHOULD</bcp14> use an IW that is an integer multiple
of
<t> 2 MSSs.</t>
The IW should remain a multiple of 2 MSS segments, to enable <t>
The IW should remain a multiple of 2 MSS segments to enable
efficient ACK compression without incurring unnecessary timeouts.</t> efficient ACK compression without incurring unnecessary timeouts.</t>
</li>
<t> <li> <t>
&gt;&gt; An implementation MUST decrease the IW if more than 95% of An implementation <bcp14>MUST</bcp14> decrease the IW if more than 95% of
connections have IW losses.</t> connections have IW losses.</t>
<t>
Again, this is to ensure an implementation is sufficiently reactive.</t></li>
<t> <li
Again, this is to ensure an implementation is sufficiently reactive.</t> > <t>
An implementation <bcp14>MAY</bcp14> group IW values and statistics within
<t> subsets of connections. Such grouping <bcp14>MAY</bcp14> use any information
&gt;&gt; An implementation MAY group IW values and statistics within about
subsets of connections. Such grouping MAY use any information about
connections to form groups except loss statistics.</t> connections to form groups except loss statistics.</t>
</li>
<t> </ul>
There are some TCP connections which might not be counted at all, <t>
such as those to/from loopback addresses, or those within the same There are some TCP connections that might not be counted at all,
such as those to/from loopback addresses or those within the same
subnet as that of a local interface (for which congestion control is subnet as that of a local interface (for which congestion control is
sometimes disabled anyway). This may also include connections that sometimes disabled anyway). This may also include connections that
terminate before the IW is full, i.e., as a separate check at the terminate before the IW is full, i.e., as a separate check at the
time of the connection closing.</t> time of the connection closing.</t>
<t>
<t> The period over which the IW is updated is intended to be a long timescale,
The period over which the IW is updated is intended to be a long e.g., a month or so, or 1,000 connections, whichever is longer. An
timescale, e.g., a month or so, or 1,000 connections, whichever is implementation might check the IW once a month and simply not update the IW
longer. An implementation might check the IW once a month, and or clear the connection counts in months where the number of connections is
simply not update the IW or clear the connection counts in months too small.</t>
where the number of connections is too small.</t> </section>
<section anchor="sect-c.4" numbered="true" toc="default">
</section> <name>Discussion</name>
<t>
<section title="Discussion" anchor="sect-c.4"><t>
There are numerous parameters to the above algorithm that are There are numerous parameters to the above algorithm that are
compliant with the given requirements; this is intended to allow compliant with the given requirements; this is intended to allow
variation in configuration and implementation while ensuring that variation in configuration and implementation while ensuring that
all such algorithms are reactive and safe.</t> all such algorithms are reactive and safe.</t>
<t>
<t>
This algorithm continues to assume segments because that is the This algorithm continues to assume segments because that is the
basis of most TCP implementations. It might be useful to consider basis of most TCP implementations. It might be useful to consider
revising the specifications to allow byte-based congestion given revising the specifications to allow byte-based congestion given
sufficient experience.</t> sufficient experience.</t>
<t>
<t>
The algorithm checks for IW losses only during the first IW after a The algorithm checks for IW losses only during the first IW after a
connection start; it does not check for IW losses elsewhere the IW connection start; it does not check for IW losses elsewhere the IW
is used, e.g., during slow-start restarts.</t> is used, e.g., during slow-start restarts.</t>
<t> <ul>
&gt;&gt; An implementation MAY detect IW losses during slow-start restarts <li> <t> An implementation <bcp14>MAY</bcp14> detect IW losses during
in addition to losses during the first IW of a connection. In this slow-start restarts in addition to losses during the first IW of a
case, the implementation MUST count each restart as a "connection" connection. In this case, the implementation <bcp14>MUST</bcp14> count
for the purposes of connection counts and periodic rechecking of the each restart as a "connection" for the purposes of connection counts and
IW value.</t> periodic rechecking of the IW value.</t>
</li>
<t> </ul>
<t>
False positives can occur during some kinds of segment reordering, False positives can occur during some kinds of segment reordering,
e.g., that might trigger spurious retransmissions even without a e.g., that might trigger spurious retransmissions even without a
true segment loss. These are not expected to be sufficiently common true segment loss. These are not expected to be sufficiently common
to dominate the algorithm and its conclusions.</t> to dominate the algorithm and its conclusions.</t>
<t> <t>
This mechanism does require additional per-connection state, which This mechanism does require additional per-connection state, which is
is currently common in some implementations, and is useful for other currently common in some implementations and is useful for other reasons
reasons (e.g., the ISN is used in TCP-AO <xref target="RFC5925"/>). The mechanism (e.g., the ISN is used in TCP-AO <xref target="RFC5925"
format="default"/>).
also benefits from persistent state kept across reboots, as would be
other state sharing mechanisms (e.g., TCP Control Block Sharing per
the main body of this document).</t>
<t> The mechanism in this appendix also benefits from persistent state kept across
reboots, which would also be useful to other state sharing mechanisms (e.g.,
TCP Control Block Sharing per the main body of this document).
</t>
<t>
The receive window (rwnd) is not involved in this calculation. The The receive window (rwnd) is not involved in this calculation. The
size of rwnd is determined by receiver resources and provides space size of rwnd is determined by receiver resources and provides space
to accommodate segment reordering. It is not involved with to accommodate segment reordering.
congestion control, which is the focus of this document and its
management of the IW.</t>
</section> Also, rwnd is not involved with congestion control, which is the focus of the way
this appendix manages the IW.
<section title="Observations" anchor="sect-c.5"><t> </t>
The IW may not converge to a single, global value. It also may not </section>
converge at all, but rather may oscillate by a few MSS as it <section anchor="sect-c.5" numbered="true" toc="default">
<name>Observations</name>
<t>
The IW may not converge to a single global value. It also may not
converge at all but rather may oscillate by a few MSSs as it
repeatedly probes the Internet for larger IWs and fails. Both repeatedly probes the Internet for larger IWs and fails. Both
properties are consistent with TCP behavior during each individual properties are consistent with TCP behavior during each individual
connection.</t> connection.</t>
<t>
<t>
This mechanism assumes that losses during the IW are due to IW size. This mechanism assumes that losses during the IW are due to IW size.
Persistent errors that drop packets for other reasons - e.g., OS Persistent errors that drop packets for other reasons, e.g., OS
bugs, can cause false positives. Again, this is consistent with bugs, can cause false positives. Again, this is consistent with
TCP's basic assumption that loss is caused by congestion and TCP's basic assumption that loss is caused by congestion and
requires backoff. This algorithm treats the IW of new connections as requires backoff. This algorithm treats the IW of new connections as
a long-timescale backoff system.</t> a long-timescale backoff system.</t>
</section>
</section> </section>
<section numbered="false" anchor="acknowledgments" toc="default">
</section> <name>Acknowledgments</name>
<t>
<section title="Acknowledgments" numbered="no" anchor="acknowledgments"><t> The authors would like to thank <contact fullname="Praveen
Balasubramanian"/> for information regarding TCB sharing in Windows;
The authors would like to thank for Praveen Balasubramanian for <contact fullname="Christoph Paasch"/> for information regarding TCB
information regarding TCB sharing in Windows, Christoph Paasch for sharing in Apple OSs; <contact fullname="Yuchung Cheng"/>, <contact
information regarding TCB sharing in Apple OSes, and Yuchung Cheng, fullname="Lars Eggert"/>, <contact fullname="Ilpo Jarvinen"/>, and <contact
Lars Eggert, Ilpo Jarvinen and Michael Scharf for comments on fullname="Michael Scharf"/> for comments on earlier draft versions of this
earlier versions of the draft, as well as members of the TCPM WG. document; as well as members of the TCPM WG. Earlier revisions of this
Earlier revisions of this work received funding from a collaborative work received funding from a collaborative research project between the
research project between the University of Oslo and Huawei University of Oslo and Huawei Technologies Co., Ltd. and were partly
Technologies Co., Ltd. and were partly supported by USC/ISI's Postel supported by USC/ISI's Postel Center.</t>
Center.</t> <t>
<t>
This document was prepared using 2-Word-v2.0.template.dot.</t> This document was prepared using 2-Word-v2.0.template.dot.</t>
</section>
</back>
</section> </rfc>
</back>
</rfc>
 End of changes. 247 change blocks. 
1365 lines changed or deleted 1653 lines changed or added

This html diff was produced by rfcdiff 1.48. The latest version is available from http://tools.ietf.org/tools/rfcdiff/