Commit 5db5c066 authored by Christopher Beacham, committed by Ned Deily

bpo-21475: Support the Sitemap extension in robotparser (GH-6883)

parent 7a1c0275
...@@ -76,6 +76,15 @@ structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html.
   .. versionadded:: 3.6
.. method:: site_maps()

   Returns the contents of the ``Sitemap`` parameter from
   ``robots.txt`` in the form of a :func:`list`. If there is no such
   parameter or the ``robots.txt`` entry for this parameter has
   invalid syntax, returns ``None``.

   .. versionadded:: 3.8
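   A brief, illustrative sketch of the method (the URL is hypothetical,
   and the result depends on the fetched file)::

      >>> import urllib.robotparser
      >>> rp = urllib.robotparser.RobotFileParser()
      >>> rp.set_url('https://www.example.com/robots.txt')
      >>> rp.read()
      >>> rp.site_maps()  # list of sitemap URLs, or None if none declared
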
The following example demonstrates basic use of the :class:`RobotFileParser`
class::
......
...@@ -12,6 +12,7 @@ class BaseRobotTest:
    agent = 'test_robotparser'
    good = []
    bad = []
    site_maps = None

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
...@@ -36,6 +37,9 @@ class BaseRobotTest:
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))

    def test_site_maps(self):
        self.assertEqual(self.parser.site_maps(), self.site_maps)
class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
...@@ -65,6 +69,23 @@ Disallow:
    bad = ['/cyberworld/map/index.html']
class SitemapTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/
User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
"""
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']
    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
                 'http://www.google.com/hostednews/sitemap_index.xml']
class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
......
...@@ -27,6 +27,7 @@ class RobotFileParser:
    def __init__(self, url=''):
        self.entries = []
        self.sitemaps = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
...@@ -141,6 +142,12 @@ class RobotFileParser:
                                and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                        state = 2
                elif line[0] == "sitemap":
                    # According to http://www.sitemaps.org/protocol.html
                    # "This directive is independent of the user-agent line,
                    #  so it doesn't matter where you place it in your file."
                    # Therefore we do not change the state of the parser.
                    self.sitemaps.append(line[1])
        if state == 2:
            self._add_entry(entry)
...@@ -189,6 +196,11 @@ class RobotFileParser:
                return entry.req_rate
        return self.default_entry.req_rate

    def site_maps(self):
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
......
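Since the new ``sitemap`` branch appends to ``self.sitemaps`` without touching
the parser ``state``, ``Sitemap`` lines are collected no matter where they
appear relative to user-agent groups. A minimal sketch against Python 3.8+
(the robots.txt content and URLs below are illustrative):

import urllib.robotparser

robots_txt = """\
Sitemap: https://example.com/sitemap-before.xml

User-agent: *
Disallow: /private/

Sitemap: https://example.com/sitemap-after.xml
"""

rp = urllib.robotparser.RobotFileParser()
rp.parse(robots_txt.splitlines())

# Both URLs are collected regardless of their position in the file;
# with no Sitemap lines at all, site_maps() would return None.
print(rp.site_maps())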
...@@ -109,6 +109,7 @@ Anthony Baxter
Mike Bayer
Samuel L. Bayer
Bo Bayles
Christopher Beacham AKA Lady Red
Tommy Beadle
Donald Beaudry
David Beazley
...@@ -1760,6 +1761,7 @@ Dik Winter
Blake Winton
Jean-Claude Wippler
Stéphane Wirtel
Peter Wirtz
Lars Wirzenius
John Wiseman
Chris Withers
......
Added support for Site Maps to urllib's ``RobotFileParser`` as
:meth:`RobotFileParser.site_maps() <urllib.robotparser.RobotFileParser.site_maps>`.
Patch by Lady Red, based on patch by Peter Wirtz.