Commit 960e848f authored by Berker Peksag

Issue #16099: RobotFileParser now supports Crawl-delay and Request-rate extensions.

Patch by Nikolay Bogoychev.
parent 2137dc15
@@ -53,15 +53,41 @@ structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html.

      Sets the time the ``robots.txt`` file was last fetched to the current
      time.

   .. method:: crawl_delay(useragent)

      Returns the value of the ``Crawl-delay`` parameter from ``robots.txt``
      for the *useragent* in question.  If there is no such parameter or it
      doesn't apply to the *useragent* specified or the ``robots.txt`` entry
      for this parameter has invalid syntax, return ``None``.

      .. versionadded:: 3.6

   .. method:: request_rate(useragent)

      Returns the contents of the ``Request-rate`` parameter from
      ``robots.txt`` in the form of a :func:`~collections.namedtuple`
      ``(requests, seconds)``.  If there is no such parameter or it doesn't
      apply to the *useragent* specified or the ``robots.txt`` entry for this
      parameter has invalid syntax, return ``None``.

      .. versionadded:: 3.6

The following example demonstrates basic use of the :class:`RobotFileParser`
class::

   >>> import urllib.robotparser
   >>> rp = urllib.robotparser.RobotFileParser()
   >>> rp.set_url("http://www.musi-cal.com/robots.txt")
   >>> rp.read()
   >>> rrate = rp.request_rate("*")
   >>> rrate.requests
   3
   >>> rrate.seconds
   20
   >>> rp.crawl_delay("*")
   6
   >>> rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco")
   False
   >>> rp.can_fetch("*", "http://www.musi-cal.com/")
   True
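Not part of the patch, but for orientation: a minimal sketch of how a crawler
loop might combine the existing :meth:`can_fetch` with the new
:meth:`crawl_delay` and :meth:`request_rate` accessors. The host and paths are
the illustrative ones from the example above, and the pacing policy is an
assumption, not something the module prescribes::

   import time
   import urllib.robotparser

   rp = urllib.robotparser.RobotFileParser()
   rp.set_url("http://www.musi-cal.com/robots.txt")
   rp.read()

   agent = "*"
   # Prefer Request-rate (requests per seconds) if given, fall back to
   # Crawl-delay, otherwise do not pause between requests (assumed policy).
   rrate = rp.request_rate(agent)
   if rrate is not None:
       pause = rrate.seconds / rrate.requests
   else:
       pause = rp.crawl_delay(agent) or 0

   for path in ("/", "/cgi-bin/search?city=San+Francisco"):
       url = "http://www.musi-cal.com" + path
       if rp.can_fetch(agent, url):
           # ... fetch url here ...
           time.sleep(pause)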
@@ -119,6 +119,14 @@ datetime

(Contributed by Ashley Anderson in :issue:`12006`.)

urllib.robotparser
------------------

:class:`~urllib.robotparser.RobotFileParser` now supports ``Crawl-delay`` and
``Request-rate`` extensions.
(Contributed by Nikolay Bogoychev in :issue:`16099`.)

Optimizations
=============
...
import io
import unittest
import urllib.robotparser
from collections import namedtuple
from urllib.error import URLError, HTTPError
from urllib.request import urlopen
from test import support

@@ -12,7 +13,8 @@ except ImportError:

class RobotTestCase(unittest.TestCase):
    def __init__(self, index=None, parser=None, url=None, good=None,
                 agent=None, request_rate=None, crawl_delay=None):
        # workaround to make unittest discovery work (see #17066)
        if not isinstance(index, int):
            return

@@ -25,6 +27,8 @@ class RobotTestCase(unittest.TestCase):

        self.url = url
        self.good = good
        self.agent = agent
        self.request_rate = request_rate
        self.crawl_delay = crawl_delay

    def runTest(self):
        if isinstance(self.url, tuple):

@@ -34,6 +38,18 @@ class RobotTestCase(unittest.TestCase):

        agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
            self.assertEqual(self.parser.crawl_delay(agent), self.crawl_delay)
            # if we have actual values for request rate
            if self.request_rate and self.parser.request_rate(agent):
                self.assertEqual(
                    self.parser.request_rate(agent).requests,
                    self.request_rate.requests
                )
                self.assertEqual(
                    self.parser.request_rate(agent).seconds,
                    self.request_rate.seconds
                )
            self.assertEqual(self.parser.request_rate(agent), self.request_rate)
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))
@@ -43,15 +59,17 @@ class RobotTestCase(unittest.TestCase):

tests = unittest.TestSuite()

def RobotTest(index, robots_txt, good_urls, bad_urls,
              request_rate, crawl_delay, agent="test_robotparser"):
    lines = io.StringIO(robots_txt).readlines()
    parser = urllib.robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent,
                      request_rate, crawl_delay))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent,
                      request_rate, crawl_delay))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)
@@ -65,14 +83,18 @@ Disallow: /foo.html

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']
request_rate = None
crawl_delay = None

RobotTest(1, doc, good, bad, request_rate, crawl_delay)

# 2.
doc = """
# robots.txt for http://www.example.com/
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
# Cybermapper knows where to go.

@@ -83,8 +105,10 @@ Disallow:

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']
request_rate = None  # The parameters should be equal to None since they
crawl_delay = None   # don't apply to the cybermapper user agent

RobotTest(2, doc, good, bad, request_rate, crawl_delay)

# 3.
doc = """

@@ -95,14 +119,18 @@ Disallow: /

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']
request_rate = None
crawl_delay = None

RobotTest(3, doc, good, bad, request_rate, crawl_delay)
# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html

@@ -115,8 +143,17 @@ bad = ['/tmp','/tmp.html','/tmp/a.html',

       '/~joe/index.html'
       ]

request_rate = namedtuple('req_rate', 'requests seconds')
request_rate.requests = 9
request_rate.seconds = 30
crawl_delay = 3
request_rate_bad = None  # not actually tested, but we still need to parse it
crawl_delay_bad = None  # in order to accommodate the input parameters

RobotTest(4, doc, good, bad, request_rate, crawl_delay, 'figtree')
RobotTest(5, doc, good, bad, request_rate_bad, crawl_delay_bad,
          'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
@@ -125,14 +162,18 @@ Disallow: /tmp/

Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']
crawl_delay = 3
request_rate = None  # since request rate has invalid syntax, return None

RobotTest(6, doc, good, bad, None, None)

# From bug report #523041

@@ -140,12 +181,16 @@ RobotTest(6, doc, good, bad)

doc = """
User-Agent: *
Disallow: /.
Crawl-delay: pears
"""

good = ['/foo.html']
bad = []  # bug report says "/" should be denied, but that is not in the RFC
crawl_delay = None  # since crawl delay has invalid syntax, return None
request_rate = None

RobotTest(7, doc, good, bad, crawl_delay, request_rate)
@@ -154,12 +199,15 @@ doc = """

User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']
crawl_delay = None
request_rate = None  # invalid syntax, return none

RobotTest(8, doc, good, bad, crawl_delay, request_rate, agent="Googlebot")

# 9. This file is incorrect because "Googlebot" is a substring of
# "Googlebot-Mobile", so test 10 works just like test 9.
@@ -174,12 +222,12 @@ Allow: /

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, None, None, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, None, None, agent="Googlebot-Mobile")

# 11. Get the order correct.
doc = """

@@ -193,12 +241,12 @@ Disallow: /

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, None, None, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, None, None, agent="Googlebot-Mobile")

# 13. Google also got the order wrong in #8. You need to specify the
@@ -212,7 +260,7 @@ Disallow: /folder1/

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, None, None, agent="googlebot")

# 14. For issue #6325 (query string support)

@@ -224,7 +272,7 @@ Disallow: /some/path?name=value

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad, None, None)

# 15. For issue #4108 (obey first * entry)
doc = """

@@ -238,7 +286,7 @@ Disallow: /another/path

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad, None, None)

# 16. Empty query (issue #17403). Normalizing the url first.
doc = """

@@ -250,7 +298,7 @@ Disallow: /another/path?

good = ['/some/path?']
bad = ['/another/path?']

RobotTest(16, doc, good, bad, None, None)

class RobotHandler(BaseHTTPRequestHandler):
...
@@ -10,7 +10,9 @@

    http://www.robotstxt.org/norobots-rfc.txt
"""

import collections
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

@@ -120,10 +122,29 @@ class RobotFileParser:

                if state != 0:
                    entry.rulelines.append(RuleLine(line[1], True))
                    state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax otherwise
                        # it will crash
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                            and numbers[1].strip().isdigit()):
                            req_rate = collections.namedtuple('req_rate',
                                                              'requests seconds')
                            entry.req_rate = req_rate
                            entry.req_rate.requests = int(numbers[0])
                            entry.req_rate.seconds = int(numbers[1])
                        state = 2
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
@@ -153,6 +174,18 @@ class RobotFileParser:

            # agent not found ==> access granted
            return True

    def crawl_delay(self, useragent):
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        return None

    def request_rate(self, useragent):
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        return None

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])

@@ -180,6 +213,8 @@ class Entry:

    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
...
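Not part of the commit: a quick sketch of how the new parsing branches above
behave, assuming a hand-written robots.txt with one well-formed and one
malformed group (the agent names figtree and lumpy are only illustrative).
Well-formed directives surface through crawl_delay()/request_rate(), while
values that fail the isdigit()/split('/') checks are skipped and the accessors
return None:

    import io
    import urllib.robotparser

    robots_txt = """\
    User-agent: figtree
    Crawl-delay: 3
    Request-rate: 9/30
    Disallow: /tmp

    User-agent: lumpy
    Crawl-delay: pears
    Request-rate: whale/banana
    Disallow: /tmp
    """

    parser = urllib.robotparser.RobotFileParser()
    parser.parse(io.StringIO(robots_txt).readlines())

    # Well-formed values are exposed through the new accessors.
    print(parser.crawl_delay("figtree"))        # 3
    rate = parser.request_rate("figtree")
    print(rate.requests, rate.seconds)          # 9 30

    # Malformed values never get stored, so the accessors return None.
    print(parser.crawl_delay("lumpy"))          # None
    print(parser.request_rate("lumpy"))         # None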
@@ -151,6 +151,7 @@ Finn Bock

Paul Boddie
Matthew Boedicker
Robin Boerdijk
Nikolay Bogoychev
David Bolen
Wouter Bolsterlee
Gawain Bolton
...
+++++++++++
Python News
+++++++++++

@@ -46,6 +46,9 @@ Core and Builtins

Library
-------

- Issue #16099: RobotFileParser now supports Crawl-delay and Request-rate
  extensions. Patch by Nikolay Bogoychev.

- Issue #25316: distutils raises OSError instead of DistutilsPlatformError
  when MSVC is not installed.
...