Commit 73fd46d2, authored by Jeremy Hylton

Bug 3347: robotparser failed because it didn't convert bytes to string.

The solution is to convert bytes to text via utf-8.  I'm not entirely
sure if this is safe, but it looks like robots.txt is expected to be
ascii.
parent 48577d19
...@@ -136,8 +136,9 @@ bad = [] # Bug report says "/" should be denied, but that is not in the RFC ...@@ -136,8 +136,9 @@ bad = [] # Bug report says "/" should be denied, but that is not in the RFC
RobotTest(7, doc, good, bad) RobotTest(7, doc, good, bad)
class TestCase(unittest.TestCase): class NetworkTestCase(unittest.TestCase):
def runTest(self):
def testPasswordProtectedSite(self):
support.requires('network') support.requires('network')
# whole site is password-protected. # whole site is password-protected.
url = 'http://mueblesmoraleda.com' url = 'http://mueblesmoraleda.com'
...@@ -146,9 +147,17 @@ class TestCase(unittest.TestCase): ...@@ -146,9 +147,17 @@ class TestCase(unittest.TestCase):
parser.read() parser.read()
self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False) self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)
def testPythonOrg(self):
support.requires('network')
parser = urllib.robotparser.RobotFileParser(
"http://www.python.org/robots.txt")
parser.read()
self.assertTrue(parser.can_fetch("*",
"http://www.python.org/robots.txt"))
def test_main(): def test_main():
support.run_unittest(NetworkTestCase)
support.run_unittest(tests) support.run_unittest(tests)
TestCase().run()
if __name__=='__main__': if __name__=='__main__':
support.Verbose = 1 support.Verbose = 1
......
...@@ -60,7 +60,8 @@ class RobotFileParser: ...@@ -60,7 +60,8 @@ class RobotFileParser:
elif err.code >= 400: elif err.code >= 400:
self.allow_all = True self.allow_all = True
else: else:
self.parse(f.read().splitlines()) raw = f.read()
self.parse(raw.decode("utf-8").splitlines())
def _add_entry(self, entry): def _add_entry(self, entry):
if "*" in entry.useragents: if "*" in entry.useragents:
...@@ -123,7 +124,10 @@ class RobotFileParser: ...@@ -123,7 +124,10 @@ class RobotFileParser:
return True return True
# search for given user agent matches # search for given user agent matches
# the first match counts # the first match counts
url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/" url = urllib.parse.quote(
urllib.parse.urlparse(urllib.parse.unquote(url))[2])
if not url:
url = "/"
for entry in self.entries: for entry in self.entries:
if entry.applies_to(useragent): if entry.applies_to(useragent):
return entry.allowance(url) return entry.allowance(url)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment