| rfc9309xml2.original.xml | rfc9309.xml | |||
|---|---|---|---|---|
| <?xml version="1.0" encoding="US-ASCII"?> | <?xml version="1.0" encoding="UTF-8"?> | |||
| <!DOCTYPE rfc SYSTEM "rfc2629.dtd" [ | <!DOCTYPE rfc [ | |||
| <!ENTITY RFC1945 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.1945.xml"> | <!ENTITY nbsp "&#160;"> | |||
| <!ENTITY RFC2046 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.2046.xml"> | <!ENTITY zwsp "&#8203;"> | |||
| <!ENTITY RFC2119 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.2119.xml"> | <!ENTITY nbhy "&#8209;"> | |||
| <!ENTITY RFC3629 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.3629.xml"> | <!ENTITY wj "&#8288;"> | |||
| <!ENTITY RFC3986 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.3986.xml"> | | |||
| <!ENTITY RFC5234 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.5234.xml"> | | |||
| <!ENTITY RFC8174 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.8174.xml"> | | |||
| <!ENTITY RFC8288 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.8288.xml"> | | |||
| <!ENTITY RFC9110 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.9110.xml"> | | |||
| <!ENTITY RFC9111 PUBLIC "" "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.9111.xml"> | | |||
| ]> | ]> | |||
| <rfc ipr="trust200902" category="std" docName="draft-koster-rep-12" > | <rfc xmlns:xi="http://www.w3.org/2001/XInclude" ipr="trust200902" docName="draft | |||
| -koster-rep-12" number="9309" obsoletes="" updates="" submissionType="IETF" cate | ||||
| <?xml-stylesheet type="text/xsl" href="rfc2629.xslt" ?> | gory="std" consensus="true" xml:lang="en" tocInclude="true" tocDepth="4" symRefs | |||
| ="true" sortRefs="true" version="3"> | ||||
| <?rfc toc="yes" ?> | ||||
| <?rfc tocdepth="4" ?> | ||||
| <?rfc symrefs="yes" ?> | ||||
| <?rfc sortrefs="yes"?> | ||||
| <?rfc compact="yes" ?> | ||||
| <?rfc subcompact="no"?> | ||||
| <front> | <!-- xml2rfc v2v3 conversion 3.13.0 --> | |||
| <title abbrev="REP">Robots Exclusion Protocol</title> | ||||
| <author initials="M." surname="Koster" fullname="Martijn Koster" role="edito | <front> | |||
| r"> | <title abbrev="Robots Exclusion Protocol (REP)">Robots Exclusion Protocol</t | |||
| <organization>Stalworthy Computing, Ltd.</organization> | itle> | |||
| <seriesInfo name="RFC" value="9309"/> | ||||
| <author initials="M." surname="Koster" fullname="Martijn Koster"> | ||||
| <address> | <address> | |||
| <postal> | <postal> | |||
| | <extaddr>Stalworthy Manor Farm</extaddr> | |||
| <street>Suton Lane</street> | <street>Suton Lane</street> | |||
| <city>Wymondham, Norfolk</city> | <city>Wymondham, Norfolk</city> | |||
| <code>NR18 9JG</code> | <code>NR18 9JG</code> | |||
| <country>United Kingdom</country> | <country>United Kingdom</country> | |||
| </postal> | </postal> | |||
| <email>m.koster@greenhills.co.uk</email> | <email>m.koster@greenhills.co.uk</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author initials="G." surname="Illyes" fullname="Gary Illyes" role="editor"> | <author initials="G." surname="Illyes" fullname="Gary Illyes"> | |||
| <organization>Google LLC.</organization> | <organization>Google LLC</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street>Brandschenkestrasse 110</street> | <street>Brandschenkestrasse 110</street> | |||
| <city>Zurich</city> | <city>Zürich</city> | |||
| <code>8002</code> | <code>8002</code> | |||
| <country>Switzerland</country> | <country>Switzerland</country> | |||
| </postal> | </postal> | |||
| <email>garyillyes@google.com</email> | <email>garyillyes@google.com</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author initials="H." surname="Zeller" fullname="Henner Zeller" role="editor | <author initials="H." surname="Zeller" fullname="Henner Zeller"> | |||
| "> | <organization>Google LLC</organization> | |||
| <organization>Google LLC.</organization> | ||||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street>1600 Amphitheatre Pkwy</street> | <street>1600 Amphitheatre Pkwy</street> | |||
| <city>Mountain View, CA</city> | <city>Mountain View</city> | |||
| | <region>CA</region> | |||
| <code>94043</code> | <code>94043</code> | |||
| <country>USA</country> | <country>United States of America</country> | |||
| </postal> | </postal> | |||
| <email>henner@google.com</email> | <email>henner@google.com</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author initials="L." surname="Sassman" fullname="Lizzi Sassman" role="edito | <author initials="L." surname="Sassman" fullname="Lizzi Sassman"> | |||
| r"> | <organization>Google LLC</organization> | |||
| <organization>Google LLC.</organization> | ||||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street>Brandschenkestrasse 110</street> | <street>Brandschenkestrasse 110</street> | |||
| <city>Zurich</city> | <city>Zürich</city> | |||
| <code>8002</code> | <code>8002</code> | |||
| <country>Switzerland</country> | <country>Switzerland</country> | |||
| </postal> | </postal> | |||
| <email>lizzi@google.com</email> | <email>lizzi@google.com</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <date year="2022" month="September"/> | ||||
| <date year="2022" month="July" day="06"/> | <keyword>robot</keyword> | |||
| <keyword>crawler</keyword> | ||||
| <area>General</area> | <keyword>robots.txt</keyword> | |||
| <keyword>internet-drafts</keyword> | ||||
| <abstract> | <abstract> | |||
| <t> This document specifies and extends the "Robots Exclusion Protocol" | <t> This document specifies and extends the "Robots Exclusion Protocol" | |||
| method originally defined by Martijn Koster in 1996 for service owners | method originally defined by Martijn Koster in 1994 for service owners | |||
| to control how content served by their services may be accessed, if at | to control how content served by their services may be accessed, if at | |||
| all, by automatic clients known as crawlers. Specifically, it adds | all, by automatic clients known as crawlers. Specifically, it adds | |||
| definition language for the protocol and instructions for handling | definition language for the protocol, instructions for handling | |||
| errors and caching. </t> | errors, and instructions for caching. </t> | |||
| </abstract> | </abstract> | |||
| </front> | </front> | |||
| <middle> | <middle> | |||
| <section anchor="introduction" title="Introduction"> | <section anchor="introduction" numbered="true" toc="default"> | |||
| | <name>Introduction</name> | |||
| <t> This document applies to services that provide resources that clients | <t> This document applies to services that provide resources that clients | |||
| can access through URIs as defined in <xref target="RFC3986"/>. For example, | can access through URIs as defined in <xref target="RFC3986" format="default"/>. For example, | |||
| in the context of HTTP, a browser is a client that displays the conten t of a | in the context of HTTP, a browser is a client that displays the conten t of a | |||
| web page. </t> | web page. </t> | |||
| <t> Crawlers are automated clients. Search engines for instance have crawlers to | <t> Crawlers are automated clients. Search engines, for instance, have crawlers to | |||
| recursively traverse links for indexing as defined in | recursively traverse links for indexing as defined in | |||
| <xref target="RFC8288"/>. </t> | <xref target="RFC8288" format="default"/>. </t> | |||
| <t> It may be inconvenient for service owners if crawlers visit the entirety of | <t> It may be inconvenient for service owners if crawlers visit the entirety of | |||
| their URI space. This document specifies the rules originally defined by | their URI space. This document specifies the rules originally defined by | |||
| the "Robots Exclusion Protocol" <xref target="ROBOTSTXT"/> t hat crawlers | the "Robots Exclusion Protocol" <xref target="ROBOTSTXT" format="defau lt"/> that crawlers | |||
| are requested to honor when accessing URIs. </t> | are requested to honor when accessing URIs. </t> | |||
| <t> These rules are not a form of access authorization. </t> | <t> These rules are not a form of access authorization. </t> | |||
| <section anchor="requirements-language" numbered="true" toc="default"> | ||||
| <section anchor="requirements-language" title="Requirements Language"> | <name>Requirements Language</name> | |||
| <t> The key words "<bcp14>MUST</bcp14>", | <t>The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>", | |||
| "<bcp14>MUST NOT</bcp14>", "<bcp14>REQUIRED</bcp14>&q | "<bcp14>REQUIRED</bcp14>", "<bcp14>SHALL</bcp14>", | |||
| uot;, | "<bcp14>SHALL NOT</bcp14>", "<bcp14>SHOULD</bcp14>", | |||
| "<bcp14>SHALL</bcp14>", "<bcp14>SHALL NOT</bcp14>&quo | "<bcp14>SHOULD NOT</bcp14>", | |||
| t;, | "<bcp14>RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>", | |||
| "<bcp14>SHOULD</bcp14>", "<bcp14>SHOULD NOT</bcp14>&q | "<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document | |||
| uot;, | are to be interpreted as described in BCP 14 | |||
| "<bcp14>RECOMMENDED</bcp14>", | <xref target="RFC2119"/> <xref target="RFC8174"/> when, and only | |||
| "<bcp14>NOT RECOMMENDED</bcp14>", "<bcp14>MAY</bcp14> | when, they appear in all capitals, as shown here.</t> | |||
| ", | ||||
| and "<bcp14>OPTIONAL</bcp14>" in this document are to be | ||||
| interpreted as described in | ||||
| BCP 14 <xref target="RFC2119"/> <xref target="RFC8174"/> when, and o | ||||
| nly | ||||
| when, they appear in all capitals, as shown here. </t> | ||||
| </section> | </section> | |||
| </section> | </section> | |||
| <section anchor="specification" title="Specification"> | <section anchor="specification" numbered="true" toc="default"> | |||
| <section anchor="protocol-definition" title="Protocol Definition"> | <name>Specification</name> | |||
| <section anchor="protocol-definition" numbered="true" toc="default"> | ||||
| <name>Protocol Definition</name> | ||||
| <t> The protocol language consists of rule(s) and group(s) that the service | <t> The protocol language consists of rule(s) and group(s) that the service | |||
| makes available in a file named 'robots.txt' as described in | makes available in a file named "robots.txt" as described in | |||
| <xref target="access-method" />: </t> | <xref target="access-method" format="default"/>: </t> | |||
| <t> | <dl spacing="normal"> | |||
| <list style="symbols"> | <dt> Rule:</dt><dd> A line with a key-value pair that defines how a | |||
| <t> Rule: A line with a key-value pair that defines how a | | |||
| crawler may access URIs. See | crawler may access URIs. See | |||
| <xref target="the-allow-and-disallow-lines" />. </t> | <xref target="the-allow-and-disallow-lines" format="default"/>. | |||
| <t> Group: One or more user-agent lines that is followed by | </dd> | |||
| | <dt> Group:</dt><dd> One or more user-agent lines that are followed by | |||
| one or more rules. The group is terminated by a user-agent line | one or more rules. The group is terminated by a user-agent line | |||
| or end of file. See <xref target="the-user-agent-line" />. | or end of file. See <xref target="the-user-agent-line" format="d efault"/>. | |||
| The last group may have no rules, which means it implicitly | The last group may have no rules, which means it implicitly | |||
| allows everything. </t> | allows everything. </dd> | |||
| </list> </t> | </dl> | |||
| </section> | </section> | |||
| <section anchor="formal-syntax" title="Formal Syntax"> | <section anchor="formal-syntax" numbered="true" toc="default"> | |||
| | <name>Formal Syntax</name> | |||
| <t> Below is an Augmented Backus-Naur Form (ABNF) description, as described | <t> Below is an Augmented Backus-Naur Form (ABNF) description, as described | |||
| in <xref target="RFC5234"/>. </t> | in <xref target="RFC5234" format="default"/>. </t> | |||
| <sourcecode name="" type="abnf"><![CDATA[ | ||||
| <figure><artwork> | robotstxt = *(group / emptyline) | |||
| <![CDATA[ | group = startgroupline ; We start with a user-agent | |||
| robotstxt = *(group / emptyline) | ; line | |||
| group = startgroupline ; We start with a user-agent | *(startgroupline / emptyline) ; ... and possibly more | |||
| *(startgroupline / emptyline) ; ... and possibly more | ; user-agent lines | |||
| ; user-agents | *(rule / emptyline) ; followed by rules relevant | |||
| *(rule / emptyline) ; followed by rules relevant | ; for the preceding | |||
| ; for UAs | ; user-agent lines | |||
| startgroupline = *WS "user-agent" *WS ":" *WS product-token EOL | startgroupline = *WS "user-agent" *WS ":" *WS product-token EOL | |||
| rule = *WS ("allow" / "disallow") *WS ":" | rule = *WS ("allow" / "disallow") *WS ":" | |||
| *WS (path-pattern / empty-pattern) EOL | *WS (path-pattern / empty-pattern) EOL | |||
| ; parser implementors: define additional lines you need (for | ; parser implementors: define additional lines you need (for | |||
| ; example, sitemaps). | ; example, Sitemaps). | |||
| product-token = identifier / "*" | product-token = identifier / "*" | |||
| path-pattern = "/" *UTF8-char-noctl ; valid URI path pattern | path-pattern = "/" *UTF8-char-noctl ; valid URI path pattern | |||
| empty-pattern = *WS | empty-pattern = *WS | |||
| identifier = 1*(%x2D / %x41-5A / %x5F / %x61-7A) | identifier = 1*(%x2D / %x41-5A / %x5F / %x61-7A) | |||
| comment = "#" *(UTF8-char-noctl / WS / "#") | comment = "#" *(UTF8-char-noctl / WS / "#") | |||
| emptyline = EOL | emptyline = EOL | |||
| EOL = *WS [comment] NL ; end-of-line may have | EOL = *WS [comment] NL ; end-of-line may have | |||
| ; optional trailing comment | ; optional trailing comment | |||
| NL = %x0D / %x0A / %x0D.0A | NL = %x0D / %x0A / %x0D.0A | |||
| WS = %x20 / %x09 | WS = %x20 / %x09 | |||
| ; UTF8 derived from RFC3629, but excluding control characters | ; UTF8 derived from RFC 3629, but excluding control characters | |||
| UTF8-char-noctl = UTF8-1-noctl / UTF8-2 / UTF8-3 / UTF8-4 | UTF8-char-noctl = UTF8-1-noctl / UTF8-2 / UTF8-3 / UTF8-4 | |||
| UTF8-1-noctl = %x21 / %x22 / %x24-7F ; excluding control, space, '#' | UTF8-1-noctl = %x21 / %x22 / %x24-7F ; excluding control, space, "#" | |||
| UTF8-2 = %xC2-DF UTF8-tail | UTF8-2 = %xC2-DF UTF8-tail | |||
| UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2UTF8-tail / | UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2UTF8-tail / | |||
| %xED %x80-9F UTF8-tail / %xEE-EF 2UTF8-tail | %xED %x80-9F UTF8-tail / %xEE-EF 2UTF8-tail | |||
| UTF8-4 = %xF0 %x90-BF 2UTF8-tail / %xF1-F3 3UTF8-tail / | UTF8-4 = %xF0 %x90-BF 2UTF8-tail / %xF1-F3 3UTF8-tail / | |||
| %xF4 %x80-8F 2UTF8-tail | %xF4 %x80-8F 2UTF8-tail | |||
| UTF8-tail = %x80-BF | UTF8-tail = %x80-BF | |||
| ]]> | ]]></sourcecode> | |||
| </artwork></figure> | <section anchor="the-user-agent-line" numbered="true" toc="default"> | |||
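
The grammar above maps almost line-for-line onto a small scanner. Below is a minimal Python sketch of that idea (an illustration added for this review, not code from either XML file; the regex and the `parse_robots_txt` helper are my own): field names match case-insensitively, `#` opens a trailing comment, and lines that do not parse are skipped rather than treated as fatal, as the "Parsing Errors" section later requires.

```python
import re

# key, optional whitespace, ":", then a value that stops at a "#" comment
LINE_RE = re.compile(r"^\s*([A-Za-z_-]+)\s*:\s*([^#]*)")

def parse_robots_txt(body: str):
    """Yield (key, value) pairs for parseable lines; skip everything else."""
    for line in body.splitlines():
        match = LINE_RE.match(line)
        if match:
            key = match.group(1).lower()    # field names are case-insensitive
            value = match.group(2).strip()  # "#" starts a trailing comment
            yield key, value

example = "User-Agent: ExampleBot # group start\nDisallow: /foo\nnot a rule\n"
print(list(parse_robots_txt(example)))
# [('user-agent', 'ExampleBot'), ('disallow', '/foo')]
```

| rfc9309xml2.original.xml | rfc9309.xml | |||
|---|---|---|---|---|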
| <section anchor="the-user-agent-line" title="The User-Agent Line"> | <name>The User-Agent Line</name> | |||
| <t> Crawlers set their own name, which is called a product token, to find | <t> Crawlers set their own name, which is called a product token, to find | |||
| relevant groups. The product token <bcp14>MUST</bcp14> contain only | relevant groups. The product token <bcp14>MUST</bcp14> contain only | |||
| upper and lowercase letters ("a-z" and "A-Z"), | uppercase and lowercase letters ("a-z" and "A-Z"), | |||
| underscores ("_"), and hyphens ("-"). | underscores ("_"), and hyphens ("-"). | |||
| The product token <bcp14>SHOULD</bcp14> | The product token <bcp14>SHOULD</bcp14> | |||
| be a substring of the identification string that the crawler sends to | be a substring of the identification string that the crawler sends to | |||
| the service (for example, in the case of HTTP, the product token | the service. For example, in the case of HTTP | |||
| <bcp14>SHOULD</bcp14> be a substring in the user-agent header). | <xref target="RFC9110" format="default"/>, the product token | |||
| | <bcp14>SHOULD</bcp14> be a substring in the User-Agent header. | |||
| The identification string <bcp14>SHOULD</bcp14> describe the purpose of | The identification string <bcp14>SHOULD</bcp14> describe the purpose of | |||
| the crawler. Here's an example of a user-agent HTTP request header | the crawler. Here's an example of a User-Agent HTTP request header | |||
| with a link pointing to a page describing the purpose of the | with a link pointing to a page describing the purpose of the | |||
| ExampleBot crawler, which appears as a substring in the user-agent HTTP | ExampleBot crawler, which appears as a substring in the User-Agent HTTP | |||
| header and as a product token in the robots.txt user-agent line: < /t> | header and as a product token in the robots.txt user-agent line: < /t> | |||
| <texttable title="Example of a user-agent HTTP header and | <figure anchor="fig-1"> | |||
| robots.txt user-agent line for the ExampleBot produc | <name>Example of a User-Agent HTTP header and | |||
| t token. | robots.txt user-agent line for the ExampleBot product token</n | |||
| Note that the product token (ExampleBot) is a substr | ame> | |||
| ing of the | <artwork name="" type="" align="center" alt=""><![CDATA[ | |||
| user-agent HTTP header"> | +==========================================+========================+ | |||
| <ttcol align="left">user-agent HTTP header</ttcol> | | User-Agent HTTP header | robots.txt user-agent | | |||
| <ttcol align="left">robots.txt user-agent line</ttcol> | | | line | | |||
| <c>user-agent: Mozilla/5.0 (compatible; ExampleBot/0.1; https://www. | +==========================================+========================+ | |||
| example.com/bot.html)</c> | | User-Agent: Mozilla/5.0 (compatible; | user-agent: ExampleBot | | |||
| <c>user-agent: ExampleBot</c> | | ExampleBot/0.1; | | | |||
| </texttable> | | https://www.example.com/bot.html) | | | |||
| +------------------------------------------+------------------------+ | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| <t> Note that the product token (ExampleBot) is a substring of | ||||
| the User-Agent HTTP header.</t> | ||||
| <t> Crawlers <bcp14>MUST</bcp14> use case-insensitive matching | <t> Crawlers <bcp14>MUST</bcp14> use case-insensitive matching | |||
| to find the group that matches the product token, and then | to find the group that matches the product token and then | |||
| obey the rules of the group. If there is more than one | obey the rules of the group. If there is more than one | |||
| group matching the user-agent, the matching groups' rules | group matching the user-agent, the matching groups' rules | |||
| <bcp14>MUST</bcp14> be combined into one group and parsed | <bcp14>MUST</bcp14> be combined into one group and parsed | |||
| according to | according to | |||
| <xref target="the-allow-and-disallow-lines" />. </t> | <xref target="the-allow-and-disallow-lines" format="default"/>.</t | |||
| > | ||||
| <texttable title="Example of how to merge two robots.txt | ||||
| groups that match the same product token"> | ||||
| <ttcol align="left">Two groups that match the same product token exa | ||||
| ctly</ttcol> | ||||
| <ttcol align="left">Merged group</ttcol> | ||||
| <c>user-agent: ExampleBot<br /> | ||||
| disallow: /foo<br /> | ||||
| disallow: /bar<br /> | ||||
| <br /> | ||||
| user-agent: ExampleBot<br /> | ||||
| disallow: /baz | ||||
| </c> | ||||
| <c>user-agent: ExampleBot<br /> | ||||
| disallow: /foo<br /> | ||||
| disallow: /bar<br /> | ||||
| disallow: /baz</c> | ||||
| </texttable> | ||||
| <figure anchor="fig-2"> | ||||
| <name>Example of how to merge two robots.txt | ||||
| groups that match the same product token</name> | ||||
| <artwork name="" type="" align="center" alt=""><![CDATA[ | ||||
| +========================================+========================+ | ||||
| | Two groups that match the same product | Merged group | | ||||
| | token exactly | | | ||||
| +========================================+========================+ | ||||
| | user-agent: ExampleBot | user-agent: ExampleBot | | ||||
| | disallow: /foo | disallow: /foo | | ||||
| | disallow: /bar | disallow: /bar | | ||||
| | | disallow: /baz | | ||||
| | user-agent: ExampleBot | | | ||||
| | disallow: /baz | | | ||||
| +----------------------------------------+------------------------+ | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| <t> If no matching group exists, crawlers <bcp14>MUST</bcp14> obey the group | <t> If no matching group exists, crawlers <bcp14>MUST</bcp14> obey the group | |||
| with a user-agent line with the "*" value, if present. </t> | with a user-agent line with the "*" value, if present. </t> | |||
| <figure anchor="fig-3"> | ||||
| <texttable title="Example of no matching groups other than the '*' | <name>Example of no matching groups other than the "*" | |||
| for the ExampleBot product token"> | for the ExampleBot product token</name> | |||
| <ttcol align="left">Two groups that don't explicitly match ExampleBo | <artwork name="" type="" align="center" alt=""><![CDATA[ | |||
| t</ttcol> | +==================================+======================+ | |||
| <ttcol align="left">Applicable group for ExampleBot</ttcol> | | Two groups that don't explicitly | Applicable group for | | |||
| <c>user-agent: *<br /> | | match ExampleBot | ExampleBot | | |||
| disallow: /foo<br /> | +==================================+======================+ | |||
| disallow: /bar<br /> | | user-agent: * | user-agent: * | | |||
| <br /> | | disallow: /foo | disallow: /foo | | |||
| user-agent: BazBot<br /> | | disallow: /bar | disallow: /bar | | |||
| disallow: /baz | | | | | |||
| </c> | | user-agent: BazBot | | | |||
| <c>user-agent: *<br /> | | disallow: /baz | | | |||
| disallow: /foo<br /> | +----------------------------------+----------------------+ | |||
| disallow: /bar</c> | ]]></artwork> | |||
| </texttable> | </figure> | |||
| <t> If no group matches the product token and there is no group with a user-agent | <t> If no group matches the product token and there is no group with a user-agent | |||
| line with the "*" value, or no groups are present at all, no | line with the "*" value, or no groups are present at all, no | |||
| rules apply. </t> | rules apply. </t> | |||
| </section> | </section> | |||
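
To make the group-selection rules above concrete, here is a small sketch in Python (the `rules_for` helper and its data layout are assumptions of mine, not part of the protocol text): product tokens compare case-insensitively, all groups naming the same token are merged, and the "*" group is used only when nothing else matched.

```python
def rules_for(product_token, groups):
    """groups is a list of (product-token, rules) pairs, in file order."""
    token = product_token.lower()
    merged = [rule for name, rules in groups   # case-insensitive matching;
              if name.lower() == token         # all matching groups merge
              for rule in rules]
    if merged:
        return merged
    # No explicit match: fall back to the "*" group, if one exists.
    return next((rules for name, rules in groups if name == "*"), [])

groups = [
    ("ExampleBot", [("disallow", "/foo"), ("disallow", "/bar")]),
    ("ExampleBot", [("disallow", "/baz")]),  # merged with the group above
    ("*", [("disallow", "/quz")]),
]
print(rules_for("examplebot", groups))  # all three ExampleBot rules
print(rules_for("OtherBot", groups))    # [('disallow', '/quz')]
```

| rfc9309xml2.original.xml | rfc9309.xml | |||
|---|---|---|---|---|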
| <section anchor="the-allow-and-disallow-lines" title="The Allow and Disa | <section anchor="the-allow-and-disallow-lines" numbered="true" toc="defa | |||
| llow Lines"> | ult"> | |||
| <name>The "Allow" and "Disallow" Lines</name> | ||||
| <t> These lines indicate whether accessing a URI that matches the | <t> These lines indicate whether accessing a URI that matches the | |||
| corresponding path is allowed or disallowed. </t> | corresponding path is allowed or disallowed. </t> | |||
| <t> To evaluate if access to a URI is allowed, a crawler <bcp14>MUST</bcp14> | <t> To evaluate if access to a URI is allowed, a crawler <bcp14>MUST</bcp14> | |||
| match the paths in allow and disallow rules against the URI. | match the paths in "allow" and "disallow" rules against the URI. | |||
| The matching <bcp14>SHOULD</bcp14> be case sensitive. The matching | The matching <bcp14>SHOULD</bcp14> be case sensitive. The matching | |||
| <bcp14>MUST</bcp14> start with the first octet of the path. The most | <bcp14>MUST</bcp14> start with the first octet of the path. The most | |||
| specific match found <bcp14>MUST</bcp14> be used. The most specific | specific match found <bcp14>MUST</bcp14> be used. The most specific | |||
| match is the match that has the most octets. Duplicate rules in a | match is the match that has the most octets. Duplicate rules in a | |||
| group <bcp14>MAY</bcp14> be deduplicated. If an allow and disallow | group <bcp14>MAY</bcp14> be deduplicated. If an "allow" rule and a | |||
| rule are equivalent, then the allow rule <bcp14>SHOULD</bcp14> be | "disallow" | |||
| used. If no | rule are equivalent, then the "allow" rule <bcp14>SHOULD</bcp14> be used. If no | |||
| match is found amongst the rules in a group for a matching user-agent, | match is found amongst the rules in a group for a matching user-agent | |||
| or there are no rules in the group, the URI is allowed. The | or there are no rules in the group, the URI is allowed. The | |||
| /robots.txt URI is implicitly allowed. </t> | /robots.txt URI is implicitly allowed. </t> | |||
| <t> Octets in the URI and robots.txt paths outside the range of the | <t> Octets in the URI and robots.txt paths outside the range of the | |||
| US-ASCII coded character set, and those in the reserved range defined | ASCII coded character set, and those in the reserved range defined | |||
| by <xref target="RFC3986"/>, <bcp14>MUST</bcp14> be percent-encoded as | by <xref target="RFC3986" format="default"/>, <bcp14>MUST</bcp14> be percent-encoded as | |||
| defined by <xref target="RFC3986"></xref> prior to comparison. </t> | defined by <xref target="RFC3986" format="default"/> prior to comparison. </t> | |||
| <t> If a percent-encoded US-ASCII octet is encountered in the URI, it | <t> If a percent-encoded ASCII octet is encountered in the URI, it | |||
| <bcp14>MUST</bcp14> be unencoded prior to comparison, unless it is a | <bcp14>MUST</bcp14> be unencoded prior to comparison, unless it is a | |||
| reserved character in the URI as defined by <xref target="RFC3986" /> | reserved character in the URI as defined by <xref target="RFC3986" format="default"/> | |||
| or the character is outside the unreserved character range. The match | or the character is outside the unreserved character range. The match | |||
| evaluates positively if and only if the end of the path from the rule | evaluates positively if and only if the end of the path from the rule | |||
| is reached before a difference in octets is encountered. </t> | is reached before a difference in octets is encountered. </t> | |||
| <t> For example: </t> | <t> For example: </t> | |||
| <texttable title="Examples of matching percent-encoded URI components" | <figure anchor="fig-4"> | |||
| > | <name>Examples of matching percent-encoded URI components</name> | |||
| <ttcol align='left'>Path</ttcol> | <artwork name="" type="" align="center" alt=""><![CDATA[ | |||
| <ttcol align='left'>Encoded Path</ttcol> | +==================+=======================+=======================+ | |||
| <ttcol align='left'>Path to Match</ttcol> | | Path | Encoded Path | Path to Match | | |||
| <c>/foo/bar?baz=quz</c> | +==================+=======================+=======================+ | |||
| <c>/foo/bar?baz=quz</c> | | /foo/bar?baz=quz | /foo/bar?baz=quz | /foo/bar?baz=quz | | |||
| <c>/foo/bar?baz=quz</c> | +------------------+-----------------------+-----------------------+ | |||
| <c>/foo/bar?baz=http<br />://foo.bar</c> | | /foo/bar?baz= | /foo/bar?baz= | /foo/bar?baz= | | |||
| <c>/foo/bar?baz=http%3A<br />%2F%2Ffoo.bar</c> | | https://foo.bar | https%3A%2F%2Ffoo.bar | https%3A%2F%2Ffoo.bar | | |||
| <c>/foo/bar?baz=http%3A<br />%2F%2Ffoo.bar</c> | +------------------+-----------------------+-----------------------+ | |||
| <c>/foo/bar/U+E38384</c> | | /foo/bar/ | /foo/bar/%E3%83%84 | /foo/bar/%E3%83%84 | | |||
| <c>/foo/bar/%E3%83%84</c> | | U+E38384 | | | | |||
| <c>/foo/bar/%E3%83%84</c> | +------------------+-----------------------+-----------------------+ | |||
| <c>/foo/bar/%E3%83%84</c> | | /foo/ | /foo/bar/%E3%83%84 | /foo/bar/%E3%83%84 | | |||
| <c>/foo/bar/%E3%83%84</c> | | bar/%E3%83%84 | | | | |||
| <c>/foo/bar/%E3%83%84</c> | +------------------+-----------------------+-----------------------+ | |||
| <c>/foo/bar/%62%61%7A</c> | | /foo/ | /foo/bar/%62%61%7A | /foo/bar/baz | | |||
| <c>/foo/bar/%62%61%7A</c> | | bar/%62%61%7A | | | | |||
| <c>/foo/bar/baz</c> | +------------------+-----------------------+-----------------------+ | |||
| </texttable> | ]]></artwork> | |||
| </figure> | ||||
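
A sketch of the normalization the preceding paragraphs and the table above imply (my own reading, in Python; the helper names are hypothetical): raw non-ASCII characters are percent-encoded, and only unreserved percent-escapes such as %62 are decoded before the octet-by-octet comparison.

```python
import re

UNRESERVED = set("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                 "abcdefghijklmnopqrstuvwxyz0123456789-._~")

def _decode_unreserved(match):
    char = chr(int(match.group(1), 16))
    # Only unreserved octets may be unencoded; %2F, %3A, %E3... stay escaped.
    return char if char in UNRESERVED else match.group(0).upper()

def normalize_path(path: str) -> str:
    """Normalize a rule path or URI path before comparing octets."""
    # Percent-encode raw characters outside the ASCII range (e.g. U+30C4).
    path = "".join(c if ord(c) < 0x80 else
                   "".join(f"%{b:02X}" for b in c.encode("utf-8"))
                   for c in path)
    # Then decode only the unreserved percent-escapes.
    return re.sub(r"%([0-9A-Fa-f]{2})", _decode_unreserved, path)

print(normalize_path("/foo/bar/%62%61%7A"))  # /foo/bar/baz
print(normalize_path("/foo/bar/\u30c4"))     # /foo/bar/%E3%83%84
```

| rfc9309xml2.original.xml | rfc9309.xml | |||
|---|---|---|---|---|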
| <t> The crawler <bcp14>SHOULD</bcp14> ignore "disallow" and | <t> The crawler <bcp14>SHOULD</bcp14> ignore "disallow" and | |||
| "allow" rules that are not in any group (for example, an | "allow" rules that are not in any group (for example, any | |||
| y | ||||
| rule that precedes the first user-agent line). </t> | rule that precedes the first user-agent line). </t> | |||
| <t> Implementers <bcp14>MAY</bcp14> bridge encoding mismatches if they | <t> Implementors <bcp14>MAY</bcp14> bridge encoding mismatches if they | |||
| detect that the robots.txt file is not UTF8 encoded. </t> | detect that the robots.txt file is not UTF-8 encoded. </t> | |||
| </section> | </section> | |||
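
Putting this section's matching rules together, a minimal evaluator might look like the following Python sketch (illustrative only; wildcard handling is deliberately omitted, and the function name is mine): the most specific, i.e. longest, matching rule wins; an equally specific "allow" beats "disallow"; and no match at all means the URI is allowed.

```python
def is_allowed(uri_path: str, rules) -> bool:
    """rules is a list of ('allow' | 'disallow', path) pairs."""
    if uri_path == "/robots.txt":
        return True                   # the robots.txt URI itself is allowed
    best_len, verdict = -1, True      # no match at all means "allowed"
    for kind, rule_path in rules:
        if uri_path.startswith(rule_path):
            allowed = kind == "allow"
            if len(rule_path) > best_len or (
                    len(rule_path) == best_len and allowed):
                best_len, verdict = len(rule_path), allowed  # allow wins ties
    return verdict

rules = [("allow", "/example/page/"),
         ("disallow", "/example/page/disallowed.gif")]
print(is_allowed("/example/page/other.gif", rules))       # True
print(is_allowed("/example/page/disallowed.gif", rules))  # False
```

| rfc9309xml2.original.xml | rfc9309.xml | |||
|---|---|---|---|---|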
| <section anchor="special-characters" title="Special Characters"> | <section anchor="special-characters" numbered="true" toc="default"> | |||
| <t> Crawlers <bcp14>MUST</bcp14> allow the following special characters: </t> | <name>Special Characters</name> | |||
| | <t> Crawlers <bcp14>MUST</bcp14> support the following special characters: </t> | |||
| <texttable title="List of special characters in robots.txt files"> | <figure anchor="fig-5"> | |||
| <ttcol align='left'>Character</ttcol> | <name>List of special characters in robots.txt files</name> | |||
| <ttcol align='left'>Description</ttcol> | <artwork name="" type="" align="center" alt=""><![CDATA[ | |||
| <ttcol align='left'>Example</ttcol> | +===========+===================+==============================+ | |||
| <c>"#"</c> | | Character | Description | Example | | |||
| <c>Designates an end of line comment.</c> | +===========+===================+==============================+ | |||
| <c>"allow: / # comment in line"<br /><br />"# comment | | # | Designates a line | allow: / # comment in line | | |||
| on its own line"</c> | | | comment. | | | |||
| <c>"$"</c> | | | | # comment on its own line | | |||
| <c>Designates the end of the match pattern.</c> | +-----------+-------------------+------------------------------+ | |||
| <c>"allow: /this/path/exactly$"</c> | | $ | Designates the | allow: /this/path/exactly$ | | |||
| <c>"*"</c> | | | end of the match | | | |||
| <c>Designates 0 or more instances of any character.</c> | | | pattern. | | | |||
| <c>"allow: /this/*/exactly"</c> | +-----------+-------------------+------------------------------+ | |||
| </texttable> | | * | Designates 0 or | allow: /this/*/exactly | | |||
| | | more instances of | | | ||||
| | | any character. | | | ||||
| +-----------+-------------------+------------------------------+ | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| <t> If crawlers match special characters verbatim in the URI, crawlers | <t> If crawlers match special characters verbatim in the URI, crawlers | |||
| <bcp14>SHOULD</bcp14> use "%" encoding. For example: </t | <bcp14>SHOULD</bcp14> use "%" encoding. For example: </t> | |||
| > | <figure anchor="fig-6"> | |||
| <name>Example of percent-encoding</name> | ||||
| <texttable title="Example of percent-encoding"> | <artwork name="" type="" align="center" alt=""><![CDATA[ | |||
| <ttcol align='left'>Percent-encoded Pattern</ttcol> | +============================+====================================+ | |||
| <ttcol align='left'>URI</ttcol> | | Percent-encoded Pattern | URI | | |||
| <c>/path/file-with-a-%2A.html</c> | +============================+====================================+ | |||
| <c>https://www.example.com/path/file-with-a-*.html</c> | | /path/file-with-a-%2A.html | https://www.example.com/path/ | | |||
| <c>/path/foo-%24</c> | | | file-with-a-*.html | | |||
| <c>https://www.example.com/path/foo-$</c> | +----------------------------+------------------------------------+ | |||
| </texttable> | | /path/foo-%24 | https://www.example.com/path/foo-$ | | |||
| +----------------------------+------------------------------------+ | ||||
| ]]></artwork> | ||||
| </figure> | ||||
| </section> | </section> | |||
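
One common way to honor "*" and "$" (an implementation choice, not something the RFC mandates) is to translate each rule path into a regular expression, matching every other character verbatim. A hedged Python sketch:

```python
import re

def pattern_to_regex(rule_path: str):
    anchored = rule_path.endswith("$")  # "$" ends the match pattern
    core = rule_path[:-1] if anchored else rule_path
    # Escape everything, then turn the escaped "*" wildcard into ".*".
    regex = re.escape(core).replace(r"\*", ".*")
    return re.compile("^" + regex + ("$" if anchored else ""))

print(bool(pattern_to_regex("/this/*/exactly")
           .match("/this/a/b/exactly")))          # True
print(bool(pattern_to_regex("/this/path/exactly$")
           .match("/this/path/exactly?page=1")))  # False
```

| rfc9309xml2.original.xml | rfc9309.xml | |||
|---|---|---|---|---|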
| <section anchor="other-records" title="Other Records"> | <section anchor="other-records" numbered="true" toc="default"> | |||
| | <name>Other Records</name> | |||
| <t> Crawlers <bcp14>MAY</bcp14> interpret other records that are not | <t> Crawlers <bcp14>MAY</bcp14> interpret other records that are not | |||
| part of the robots.txt protocol. For example, 'sitemap' | part of the robots.txt protocol -- for example, "Sitemaps" | |||
| <xref target="SITEMAPS"/>. Crawlers MAY be lenient when | <xref target="SITEMAPS" format="default"/>. Crawlers <bcp14>MAY</b | |||
| cp14> be lenient when | ||||
| interpreting other records. For example, crawlers may accept | interpreting other records. For example, crawlers may accept | |||
| common typos of the record. </t> | common misspellings of the record. </t> | |||
| <t> Parsing of other records | <t> Parsing of other records | |||
| <bcp14>MUST NOT</bcp14> interfere with the parsing of explicitly | <bcp14>MUST NOT</bcp14> interfere with the parsing of explicitly | |||
| defined records in <xref target="specification" />. </t> | defined records in <xref target="specification" format="default"/> | |||
| . | ||||
| For example, a "Sitemaps" record <bcp14>MUST NOT</bcp14> terminate | ||||
| a | ||||
| group. </t> | ||||
| </section> | </section> | |||
| </section> | </section> | |||
| <section anchor="access-method" title="Access Method"> | <section anchor="access-method" numbered="true" toc="default"> | |||
| <t> The rules <bcp14>MUST</bcp14> be accessible in a file named | <name>Access Method</name> | |||
| "/robots.txt" (all lower case) in the top level path of | <t> The rules <bcp14>MUST</bcp14> be accessible in a file named | |||
| "/robots.txt" (all lowercase) in the top-level path of | ||||
| the service. The file <bcp14>MUST</bcp14> be UTF-8 encoded (as | the service. The file <bcp14>MUST</bcp14> be UTF-8 encoded (as | |||
| defined in <xref target="RFC3629"/>) and Internet Media Type | defined in <xref target="RFC3629" format="default"/>) and Internet Med | |||
| "text/plain" | ia Type | |||
| (as defined in <xref target="RFC2046"/>). </t> | "text/plain" | |||
| <t> As per <xref target="RFC3986"/>, the URI of the robots.txt is: </t> | (as defined in <xref target="RFC2046" format="default"/>). </t> | |||
| <t> "scheme:[//authority]/robots.txt" </t> | <t> As per <xref target="RFC3986" format="default"/>, the URI of the rob | |||
| <t> For example, in the context of HTTP or FTP, the URI is: </t> | ots.txt file is: </t> | |||
| <t> "scheme:[//authority]/robots.txt" </t> | ||||
| <figure> | <t> For example, in the context of HTTP or FTP, the URI is: </t> | |||
| <artwork><![CDATA[ | <artwork name="" type="" align="left" alt=""><![CDATA[ | |||
| https://www.example.com/robots.txt | https://www.example.com/robots.txt | |||
| ftp://ftp.example.com/robots.txt | ftp://ftp.example.com/robots.txt | |||
| ]]></artwork> | ]]></artwork> | |||
| </figure> | <section anchor="access-results" numbered="true" toc="default"> | |||
| | <name>Access Results</name> | |||
| <section anchor="access-results" title="Access Results"> | <section anchor="successful-access" numbered="true" toc="default"> | |||
| <section anchor="successful-access" title="Successful Access"> | <name>Successful Access</name> | |||
| <t> If the crawler successfully downloads the robots.txt, the | <t> If the crawler successfully downloads the robots.txt file, the | |||
| crawler <bcp14>MUST</bcp14> follow the parseable rules. </t> | crawler <bcp14>MUST</bcp14> follow the parseable rules. </t> | |||
| </section> | </section> | |||
| <section anchor="redirects" title="Redirects"> | <section anchor="redirects" numbered="true" toc="default"> | |||
| <t> It's possible that a server responds to a robots.txt fetch | <name>Redirects</name> | |||
| request with a redirect, such as HTTP 301 and HTTP 302 in | <t> It's possible that a server responds to a robots.txt fetch | |||
| | request with a redirect, such as HTTP 301 or HTTP 302 in the | |||
| case of HTTP. The crawlers <bcp14>SHOULD</bcp14> follow at | case of HTTP. The crawlers <bcp14>SHOULD</bcp14> follow at | |||
| least five consecutive redirects, even across authorities | least five consecutive redirects, even across authorities | |||
| (for example, hosts in case of HTTP), as defined in | (for example, hosts in the case of HTTP). </t> | |||
| <xref target="RFC1945"/>. </t> | <t> If a robots.txt file is reached within five consecutive | |||
| <t> If a robots.txt file is reached within five consecutive | | |||
| redirects, the robots.txt file <bcp14>MUST</bcp14> be fetched, | redirects, the robots.txt file <bcp14>MUST</bcp14> be fetched, | |||
| parsed, and its rules followed in the context of the initial | parsed, and its rules followed in the context of the initial | |||
| authority. </t> | authority. </t> | |||
| <t> If there are more than five consecutive redirects, crawlers | <t> If there are more than five consecutive redirects, crawlers | |||
| <bcp14>MAY</bcp14> assume that the robots.txt is | <bcp14>MAY</bcp14> assume that the robots.txt file is | |||
| unavailable. </t> | unavailable. </t> | |||
| </section> | </section> | |||
| <section anchor="unavailable-status" title="Unavailable Status"> | <section anchor="unavailable-status" numbered="true" toc="default"> | |||
| <t> Unavailable means the crawler tries to fetch the robots.txt, | <name>"Unavailable" Status</name> | |||
| and the server responds with unavailable status codes. For | <t> "Unavailable" means the crawler tries to fetch the robots.txt file | |||
| example, in the context of HTTP, unavailable status codes are | and the server responds with status codes indicating that the resource in question is unavailable. For | |||
| | example, in the context of HTTP, such status codes are | |||
| in the 400-499 range. </t> | in the 400-499 range. </t> | |||
| <t> If a server status code indicates that the robots.txt file is | <t> If a server status code indicates that the robots.txt file is | |||
| unavailable to the crawler, then the crawler MAY access any | unavailable to the crawler, then the crawler <bcp14>MAY</bcp14> access any | |||
| resources on the server. </t> | resources on the server. </t> | |||
| </section> | </section> | |||
| <section anchor="unreachable-status" title="Unreachable Status"> | <section anchor="unreachable-status" numbered="true" toc="default"> | |||
| <t> If the robots.txt is unreachable due to server or network | <name>"Unreachable" Status</name> | |||
| errors, this means the robots.txt is undefined and the crawler | <t> If the robots.txt file is unreachable due to server or network | |||
| | errors, this means the robots.txt file is undefined and the crawler | |||
| <bcp14>MUST</bcp14> assume complete disallow. For example, in | <bcp14>MUST</bcp14> assume complete disallow. For example, in | |||
| the context of HTTP, an unreachable robots.txt has a response | the context of HTTP, server errors are identified by status codes | |||
| code in the 500-599 range. </t> | in the 500-599 range. </t> | |||
| <t> If the robots.txt is undefined for a reasonably long period of | <t> If the robots.txt file is undefined for a reasonably long period of | |||
| time (for example, 30 days), crawlers <bcp14>MAY</bcp14> assume | time (for example, 30 days), crawlers <bcp14>MAY</bcp14> assume that | |||
| the robots.txt is unavailable as defined in | the robots.txt file is unavailable as defined in | |||
| <xref target="unavailable-status"/> or continue to use a cached | <xref target="unavailable-status" format="default"/> or continue to use a cached | |||
| copy. </t> | copy. </t> | |||
| </section> | </section> | |||
| <section anchor="parsing-errors" title="Parsing Errors"> | <section anchor="parsing-errors" numbered="true" toc="default"> | |||
| <t> Crawlers <bcp14>MUST</bcp14> try to parse each line of the | <name>Parsing Errors</name> | |||
| | <t> Crawlers <bcp14>MUST</bcp14> try to parse each line of the | |||
| robots.txt file. Crawlers <bcp14>MUST</bcp14> use the parseable | robots.txt file. Crawlers <bcp14>MUST</bcp14> use the parseable | |||
| rules. </t> | rules. </t> | |||
| | </section> | |||
| </section> | </section> | |||
| </section> | </section> | |||
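
The access results above collapse into a small policy table: a successful fetch means parse, 4xx means nothing is disallowed, and 5xx or a network failure means everything is. A sketch under stated assumptions (HTTP, Python's standard urllib, and the 500-kibibyte cap from the later "Limits" section folded into the read):

```python
import urllib.error
import urllib.request

PARSE, ALLOW_ALL, DISALLOW_ALL = "parse", "allow-all", "disallow-all"

def fetch_robots_txt(authority: str):
    """Map the outcome of a robots.txt fetch to a crawling policy."""
    try:
        # urllib follows redirects on its own; crawlers SHOULD follow at
        # least five before treating the file as unavailable.
        with urllib.request.urlopen(authority + "/robots.txt",
                                    timeout=10) as resp:
            return PARSE, resp.read(500 * 1024)  # cap the bytes we parse
    except urllib.error.HTTPError as err:
        if 400 <= err.code <= 499:
            return ALLOW_ALL, b""    # "unavailable": access is unrestricted
        return DISALLOW_ALL, b""     # 5xx "unreachable": complete disallow
    except urllib.error.URLError:
        return DISALLOW_ALL, b""     # network error: also "unreachable"

print(fetch_robots_txt("https://www.example.com")[0])
```

| rfc9309xml2.original.xml | rfc9309.xml | |||
|---|---|---|---|---|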
| </section> | <section anchor="caching" numbered="true" toc="default"> | |||
| <section anchor="caching" title="Caching"> | <name>Caching</name> | |||
| <t> Crawlers <bcp14>MAY</bcp14> cache the fetched robots.txt file's | <t> Crawlers <bcp14>MAY</bcp14> cache the fetched robots.txt file's | |||
| contents. Crawlers <bcp14>MAY</bcp14> use standard cache control as | contents. Crawlers <bcp14>MAY</bcp14> use standard cache control as | |||
| defined in <xref target="RFC9111"/>. Crawlers | defined in <xref target="RFC9111" format="default"/>. Crawlers | |||
| <bcp14>SHOULD NOT</bcp14> use the cached version for more than 24 | <bcp14>SHOULD NOT</bcp14> use the cached version for more than 24 | |||
| hours, unless the robots.txt is unreachable. </t> | hours, unless the robots.txt file is unreachable. </t> | |||
| </section> | </section> | |||
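
The 24-hour guidance reduces to a one-line freshness check. A sketch under the stated assumptions (wall-clock age only; a real crawler would also honor Cache-Control headers per RFC 9111):

```python
import time

MAX_AGE = 24 * 60 * 60  # SHOULD NOT serve from cache beyond 24 hours

def cache_is_fresh(fetched_at: float, robots_unreachable: bool) -> bool:
    """A cached copy may outlive 24h only while refetching keeps failing."""
    return robots_unreachable or (time.time() - fetched_at) < MAX_AGE
```

| rfc9309xml2.original.xml | rfc9309.xml | |||
|---|---|---|---|---|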
| <section anchor="limits" title="Limits"> | <section anchor="limits" numbered="true" toc="default"> | |||
| <t> Crawlers SHOULD impose a parsing limit to protect their systems; | <name>Limits</name> | |||
| see <xref target="security"/>. The parsing limit MUST be at least | <t> Crawlers <bcp14>SHOULD</bcp14> impose a parsing limit to protect the | |||
| 500 kibibytes <xref target="KiB"/>. </t> | ir systems; | |||
| see <xref target="security" format="default"/>. The parsing limit <bcp | ||||
| 14>MUST</bcp14> be at least | ||||
| 500 kibibytes <xref target="KiB" format="default"/>. </t> | ||||
| </section> | </section> | |||
| </section> | </section> | |||
| <section anchor="security" title="Security Considerations"> | <section anchor="security" numbered="true" toc="default"> | |||
| <t> The Robots Exclusion Protocol is not a substitute for more valid | <name>Security Considerations</name> | |||
| | <t> The Robots Exclusion Protocol is not a substitute for valid | |||
| content security measures. Listing paths in the robots.txt file | content security measures. Listing paths in the robots.txt file | |||
| exposes them publicly and thus makes the paths discoverable. To | exposes them publicly and thus makes the paths discoverable. To | |||
| control access to the URI paths in a robots.txt file, users of | control access to the URI paths in a robots.txt file, users of | |||
| the protocol should employ a valid security measure relevant to | the protocol should employ a valid security measure relevant to | |||
| the application layer on which the robots.txt file is served. | the application layer on which the robots.txt file is served -- | |||
| For example, in case of HTTP, HTTP Authentication defined in | for example, in the case of HTTP, HTTP Authentication as defined in | |||
| <xref target="RFC9110"/>. </t> | <xref target="RFC9110" format="default"/>. </t> | |||
| <t> To protect against attacks against their system, implementors | <t> To protect against attacks against their system, implementors | |||
| of robots.txt parsing and matching logic should take the | of robots.txt parsing and matching logic should take the | |||
| following considerations into account: </t> | following considerations into account: </t> | |||
| <t> | <dl spacing="normal"> | |||
| <list style="symbols"> | <dt> Memory management:</dt><dd> <xref target="limits" format="default"/ | |||
| <t> Memory management: <xref target="limits" /> defines the lower | > defines the lower | |||
| limit of bytes that must be processed, which inherently also | limit of bytes that must be processed, which inherently also | |||
| protects the parser from out of memory scenarios. </t> | protects the parser from out-of-memory scenarios. </dd> | |||
| <t> Invalid characters: <xref target="formal-syntax" /> defines | <dt> Invalid characters:</dt><dd> <xref target="formal-syntax" format="default"/> defines | |||
| a set of characters that parsers and matchers can expect in | a set of characters that parsers and matchers can expect in | |||
| robots.txt files. Out of bound characters should be rejected | robots.txt files. Out-of-bound characters should be rejected | |||
| as invalid, which limits the available attack vectors that | as invalid, which limits the available attack vectors that | |||
| attempt to compromise the system. </t> | attempt to compromise the system. </dd> | |||
| <t> Untrusted content: Implementors should treat the content of | <dt> Untrusted content:</dt><dd> Implementors should treat the content of | |||
| a robots.txt file as untrusted content, as defined by the | a robots.txt file as untrusted content, as defined by the | |||
| specification of the application layer used. For example, | specification of the application layer used. For example, | |||
| in the context of HTTP, implementors should follow the | in the context of HTTP, implementors should follow the | |||
| security considerations section of | Security Considerations section of | |||
| <xref target="RFC9110"/>. </t> | <xref target="RFC9110" format="default"/>. </dd> | |||
| </list> | </dl> | |||
| </t> | | |||
| </section> | </section> | |||
| <section anchor="IANA" title="IANA Considerations"> | <section anchor="IANA" numbered="true" toc="default"> | |||
| <t> This document has no actions for IANA. </t> | <name>IANA Considerations</name> | |||
| | <t> This document has no IANA actions. </t> | |||
| </section> | </section> | |||
| <section anchor="examples" title="Examples"> | <section anchor="examples" numbered="true" toc="default"> | |||
| <section anchor="simple-example" title="Simple Example"> | <name>Examples</name> | |||
| <section anchor="simple-example" numbered="true" toc="default"> | ||||
| <name>Simple Example</name> | ||||
| <t> The following example shows: </t> | <t> The following example shows: </t> | |||
| <t> | <dl spacing="normal"> | |||
| <list style="symbols"> | <dt> *:</dt><dd> A group that's relevant to all user agents that | |||
| <t> *: A group that's relevant to all user-agents that | | |||
| don't have an explicitly defined matching group. It allows | don't have an explicitly defined matching group. It allows | |||
| access to the URLs with the /publications/ path prefix, and | access to the URLs with the /publications/ path prefix, and it | |||
| restricts access to the URLs with the /example/ path prefix | restricts access to the URLs with the /example/ path prefix | |||
| and to all URLs with .gif suffix. The * character designates | and to all URLs with a .gif suffix. The "*" character designates | |||
| any character, including the otherwise required forward | any character, including the otherwise-required forward | |||
| slash; see <xref target="formal-syntax" />. </t> | slash; see <xref target="formal-syntax" format="default"/>. </dd | |||
| <t> foobot: A regular case. A single user-agent followed | > | |||
| <dt> foobot:</dt><dd> A regular case. A single user agent followed | ||||
| by rules. The crawler only has access to two URL path | by rules. The crawler only has access to two URL path | |||
| prefixes on the site, /example/page.html and | prefixes on the site -- /example/page.html and | |||
| /example/allowed.gif. The rules of the group are missing | /example/allowed.gif. The rules of the group are missing | |||
| the optional whitespace character, which is acceptable as | the optional space character, which is acceptable as | |||
| defined in <xref target="formal-syntax" />. </t> | defined in <xref target="formal-syntax" format="default"/>. </dd | |||
| <t> barbot and bazbot: A group that's relevant for more | > | |||
| than one user-agent. The crawlers are not allowed to access | <dt> barbot and bazbot:</dt><dd> A group that's relevant for more | |||
| the URLs with the /example/page.html path prefix, but | than one user agent. The crawlers are not allowed to access | |||
| the URLs with the /example/page.html path prefix but | ||||
| otherwise have unrestricted access to the rest of the URLs | otherwise have unrestricted access to the rest of the URLs | |||
| on the site. </t> | on the site. </dd> | |||
| <t> quxbot: An empty group at end of the file. The crawler has | <dt> quxbot:</dt><dd> An empty group at the end of the file. The crawl | |||
| unrestricted access to the URLs on the site. </t> | er has | |||
| </list> | unrestricted access to the URLs on the site. </dd> | |||
| </t> | </dl> | |||
| <figure> | <artwork name="" type="" align="left" alt=""><![CDATA[ | |||
| <artwork><![CDATA[ | User-Agent: * | |||
| User-agent: * | | |||
| Disallow: *.gif$ | Disallow: *.gif$ | |||
| Disallow: /example/ | Disallow: /example/ | |||
| Allow: /publications/ | Allow: /publications/ | |||
| User-Agent: foobot | User-Agent: foobot | |||
| Disallow:/ | Disallow:/ | |||
| Allow:/example/page.html | Allow:/example/page.html | |||
| Allow:/example/allowed.gif | Allow:/example/allowed.gif | |||
| User-Agent: barbot | User-Agent: barbot | |||
| User-Agent: bazbot | User-Agent: bazbot | |||
| Disallow: /example/page.html | Disallow: /example/page.html | |||
| User-Agent: quxbot | User-Agent: quxbot | |||
| EOF | EOF | |||
| ]]></artwork> | ]]></artwork> | |||
| </figure> | | |||
| </section> | </section> | |||
| <section anchor="longest-match" title="Longest Match"> | <section anchor="longest-match" numbered="true" toc="default"> | |||
| | <name>Longest Match</name> | |||
| <t> The following example shows that in the case of two rules, the | <t> The following example shows that in the case of two rules, the | |||
| longest one is used for matching. In the following case, | longest one is used for matching. In the following case, | |||
| /example/page/disallowed.gif <bcp14>MUST</bcp14> be used for | /example/page/disallowed.gif <bcp14>MUST</bcp14> be used for | |||
| the URI example.com/example/page/disallow.gif. </t> | the URI example.com/example/page/disallow.gif. </t> | |||
| <figure> | <artwork name="" type="" align="left" alt=""><![CDATA[ | |||
| <artwork><![CDATA[ | | |||
| User-Agent: foobot | User-Agent: foobot | |||
| Allow: /example/page/ | Allow: /example/page/ | |||
| Disallow: /example/page/disallowed.gif | Disallow: /example/page/disallowed.gif | |||
| ]]></artwork> | ]]></artwork> | |||
| </figure> | | |||
| </section> | </section> | |||
| </section> | </section> | |||
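
To tie the examples back to the matching rules, here is a compact, self-contained Python demonstration (my own composition, not from either file) that parses the foobot group from the Simple Example and evaluates two URIs; group selection and the allow-on-tie rule are omitted for brevity:

```python
ROBOTS = """\
User-Agent: foobot
Disallow:/
Allow:/example/page.html
Allow:/example/allowed.gif
"""

def evaluate(body: str, path: str) -> bool:
    rules = []
    for line in body.splitlines():
        key, _, value = line.partition(":")
        if key.strip().lower() in ("allow", "disallow"):
            rules.append((key.strip().lower(), value.strip()))
    verdict, best = True, -1            # no match at all means "allowed"
    for kind, rule in rules:
        if rule and path.startswith(rule) and len(rule) > best:
            verdict, best = (kind == "allow"), len(rule)
    return verdict

print(evaluate(ROBOTS, "/example/page.html"))   # True: longest match is Allow
print(evaluate(ROBOTS, "/example/other.html"))  # False: only "Disallow:/" hits
```

| rfc9309xml2.original.xml | rfc9309.xml | |||
|---|---|---|---|---|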
| </middle> | </middle> | |||
| <back> | <back> | |||
| <references title='Normative References'> | <references> | |||
| &RFC1945; | <name>References</name> | |||
| &RFC2046; | <references> | |||
| &RFC2119; | <name>Normative References</name> | |||
| &RFC3629; | <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.2046.xml"/> | |||
| &RFC3986; | <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.2119.xml"/> | |||
| &RFC5234; | <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.3629.xml"/> | |||
| &RFC8174; | <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.3986.xml"/> | |||
| &RFC8288; | <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.5234.xml"/> | |||
| &RFC9110; | <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8174.xml"/> | |||
| &RFC9111; | <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8288.xml"/> | |||
| </references> | <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.9110.xml"/> | |||
| | <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.9111.xml"/> | |||
| | </references> | |||
| | <references> | |||
| | <name>Informative References</name> | |||
| <references title='Informative References'> | <reference anchor="ROBOTSTXT" target="https://www.robotstxt.org/"> | |||
| <reference anchor="ROBOTSTXT" target="http://www.robotstxt.org/"> | <front> | |||
| <front> | <title>The Web Robots Pages (including /robots.txt)</title> | |||
| <title>Robots Exclusion Protocol</title> | <author> | |||
| <author> | <organization/> | |||
| <organization></organization> | </author> | |||
| </author> | <date>2007</date> | |||
| <date year="n.d."/> | </front> | |||
| </front> | </reference> | |||
| </reference> | <reference anchor="SITEMAPS" target="https://www.sitemaps.org/index.html | |||
| <reference anchor="SITEMAPS" target="https://www.sitemaps.org/index.html"> | "> | |||
| <front> | <front> | |||
| <title>Sitemaps Protocol</title> | <title>What are Sitemaps? (Sitemap protocol)</title> | |||
| <author> | <author> | |||
| <organization></organization> | <organization/> | |||
| </author> | </author> | |||
| <date year="n.d."/> | <date>April 2020</date> | |||
| </front> | </front> | |||
| </reference> | </reference> | |||
| <reference anchor="KiB" target="https://simple.wikipedia.org/wiki/Kibibyte | <reference anchor="KiB" target="https://simple.wikipedia.org/wiki/Kibiby | |||
| "> | te"> | |||
| <front> | <front> | |||
| <title>Kibibyte - Simple English Wikipedia, the free encyclopedia</title> | <title>Kibibyte</title> | |||
| | <author> | |||
| <author> | <organization/> | |||
| <organization></organization> | </author> | |||
| </author> | <date day="17" month="September" year="2020"/> | |||
| <date year="n.d."/> | </front> | |||
| </front> | <refcontent>Simple English Wikipedia, the free encyclopedia</refcontent> | |||
| </reference> | </reference> | |||
| | </references> | |||
| </references> | </references> | |||
| </back> | </back> | |||
| </rfc> | </rfc> | |||

End of changes: 104 change blocks; 425 lines changed or deleted; 438 lines changed or added.