Commit 9a7bbb2e authored by Berker Peksag

Issue #25400: RobotFileParser now correctly returns default values for crawl_delay and request_rate

Initial patch by Peter Wirtz.
parent 85c98bf9
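For context, here is a minimal sketch (not part of the commit) of the behavior being fixed, fed the same robots.txt rules as the new DefaultEntryTest added below. Rules under "User-agent: *" are kept on the parser's default entry, and after this change crawl_delay() and request_rate() fall back to that entry for agents with no entry of their own instead of returning None:

    import urllib.robotparser

    lines = [
        'User-agent: *',
        'Crawl-delay: 1',
        'Request-rate: 3/15',
        'Disallow: /cyberworld/map/',
    ]

    parser = urllib.robotparser.RobotFileParser()
    parser.parse(lines)

    # 'figtree' has no entry of its own, so the default ('*') entry applies.
    print(parser.crawl_delay('figtree'))   # 1 (was None before this fix)
    rate = parser.request_rate('figtree')
    print(rate.requests, rate.seconds)     # 3 15 (rate was None before this fix)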
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -79,32 +79,17 @@ Disallow: /
     bad = ['/cyberworld/map/index.html', '/', '/tmp/']
 
 
-class CrawlDelayAndRequestRateTest(BaseRobotTest, unittest.TestCase):
-    robots_txt = """\
-User-agent: figtree
-Crawl-delay: 3
-Request-rate: 9/30
-Disallow: /tmp
-Disallow: /a%3cd.html
-Disallow: /a%2fb.html
-Disallow: /%7ejoe/index.html
-    """
-    agent = 'figtree'
-    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
-    crawl_delay = 3
-    good = [('figtree', '/foo.html')]
-    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
-           '/a%2fb.html', '/~joe/index.html']
-
+class BaseRequestRateTest(BaseRobotTest):
 
     def test_request_rate(self):
-        for url in self.good:
+        for url in self.good + self.bad:
             agent, url = self.get_agent_and_url(url)
             with self.subTest(url=url, agent=agent):
                 if self.crawl_delay:
                     self.assertEqual(
                         self.parser.crawl_delay(agent), self.crawl_delay
                     )
-                if self.request_rate and self.parser.request_rate(agent):
+                if self.request_rate:
                     self.assertEqual(
                         self.parser.request_rate(agent).requests,
                         self.request_rate.requests
@@ -115,6 +100,24 @@ Disallow: /%7ejoe/index.html
                     )
 
 
+class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: figtree
+Crawl-delay: 3
+Request-rate: 9/30
+Disallow: /tmp
+Disallow: /a%3cd.html
+Disallow: /a%2fb.html
+Disallow: /%7ejoe/index.html
+    """
+    agent = 'figtree'
+    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
+    crawl_delay = 3
+    good = [('figtree', '/foo.html')]
+    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
+           '/a%2fb.html', '/~joe/index.html']
+
+
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
     # these are not actually tested, but we still need to parse it
@@ -230,6 +233,19 @@ Disallow: /another/path?
     bad = ['/another/path?']
 
 
+class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Crawl-delay: 1
+Request-rate: 3/15
+Disallow: /cyberworld/map/
+    """
+    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
+    crawl_delay = 1
+    good = ['/', '/test.html']
+    bad = ['/cyberworld/map/index.html']
+
+
 class RobotHandler(BaseHTTPRequestHandler):
 
     def do_GET(self):
@@ -309,6 +325,8 @@ class NetworkTestCase(unittest.TestCase):
         self.assertTrue(parser.allow_all)
         self.assertFalse(parser.disallow_all)
         self.assertEqual(parser.mtime(), 0)
+        self.assertIsNone(parser.crawl_delay('*'))
+        self.assertIsNone(parser.request_rate('*'))
 
 
 if __name__=='__main__':
     unittest.main()
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -175,16 +175,20 @@ class RobotFileParser:
         return True
 
     def crawl_delay(self, useragent):
+        if not self.mtime():
+            return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.delay
-        return None
+        return self.default_entry.delay
 
     def request_rate(self, useragent):
+        if not self.mtime():
+            return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.req_rate
-        return None
+        return self.default_entry.req_rate
 
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
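The new mtime() guard matters because on a fresh parser mtime() is 0 and default_entry is still None, so without the early return the new fallback would raise AttributeError instead of returning None; that is exactly what the NetworkTestCase assertions added above check. A minimal sketch (the URL is a placeholder; nothing is fetched):

    import urllib.robotparser

    # The constructor only records the URL; no network access happens here.
    parser = urllib.robotparser.RobotFileParser('http://example.com/robots.txt')

    print(parser.mtime())            # 0 -- nothing read or parsed yet
    print(parser.crawl_delay('*'))   # None, via the mtime() guard
    print(parser.request_rate('*'))  # None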
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -29,6 +29,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #25400: RobotFileParser now correctly returns default values for
+  crawl_delay and request_rate. Initial patch by Peter Wirtz.
+
 - Issue #27932: Prevent memory leak in win32_ver().
 
 - Fix UnboundLocalError in socket._sendfile_use_sendfile.