Fix Issue754016 - urlparse goes wrong with IP:port without scheme

0b5019fe · Senthil Kumaran · 75a292e5 · 0b5019fe · 0b5019fe · 0b5019fe
Kaydet (Commit) 0b5019fe authored Agu 04, 2010 tarafından Senthil Kumaran
Show whitespace changes
Inline Side-by-side

Showing with 39 additions and 0 deletions

urlparse.rst Doc/library/urlparse.rst +18 -0

test_urlparse.py Lib/test/test_urlparse.py +20 -0

urlparse.py Lib/urlparse.py +1 -0

No files found.
--- a/Doc/library/urlparse.rst
+++ b/Doc/library/urlparse.rst
@@ -58,6 +58,24 @@ The :mod:`urlparse` module defines the following functions:
      >>> o.geturl()
      'http://www.cwi.nl:80/%7Eguido/Python.html'
+   If the scheme value is not specified, urlparse, following the syntax
+   specifications from RFC 1808, expects the netloc value to start with '//'.
+   Otherwise, it is not possible to distinguish between the netloc and path
+   components, and the ambiguous component is classified as a path, as in
+   a relative URL.
+       >>> from urlparse import urlparse
+       >>> urlparse('//www.cwi.nl:80/%7Eguido/Python.html')
+       ParseResult(scheme='', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html',
+                  params='', query='', fragment='')
+       >>> urlparse('www.cwi.nl:80/%7Eguido/Python.html')
+       ParseResult(scheme='', netloc='', path='www.cwi.nl:80/%7Eguido/Python.html',
+                  params='', query='', fragment='')
+       >>> urlparse('help/Python.html')
+       ParseResult(scheme='', netloc='', path='help/Python.html', params='',
+                  query='', fragment='')
   If the *scheme* argument is specified, it gives the default addressing
   scheme, to be used only if the URL does not specify one.  The default value for
   this argument is the empty string.

--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -478,6 +478,26 @@ class UrlParseTestCase(unittest.TestCase):
        self.assertEqual(urlparse.urlparse("x-newscheme://foo.com/stuff"),
                         ('x-newscheme','foo.com','/stuff','','',''))
+    def test_withoutscheme(self):
+        # Test urlparse without scheme
+        # Issue 754016: urlparse goes wrong with IP:port without scheme
+        # RFC 1808 specifies that netloc should start with //, urlparse expects
+        # the same, otherwise it classifies the portion of url as path.
+        self.assertEqual(urlparse.urlparse("path"),
+                ('','','path','','',''))
+        self.assertEqual(urlparse.urlparse("//www.python.org:80"),
+                ('','www.python.org:80','','','',''))
+        self.assertEqual(urlparse.urlparse("http://www.python.org:80"),
+                ('http','www.python.org:80','','','',''))
+    def test_portseparator(self):
+        # Issue 754016: distinguish the port separator ':' from the scheme
+        # separator ':' when no '//' precedes the netloc.
+        self.assertEqual(urlparse.urlparse("path:80"),
+                ('','','path:80','','',''))
+        self.assertEqual(urlparse.urlparse("http:"),('http','','','','',''))
+        self.assertEqual(urlparse.urlparse("https:"),('https','','','','',''))
+        self.assertEqual(urlparse.urlparse("http://www.python.org:80"),
+                ('http','www.python.org:80','','','',''))
 def test_main():

--- a/Lib/urlparse.py
+++ b/Lib/urlparse.py
@@ -187,6 +187,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
+        if url.endswith(':') or not url[i+1].isdigit():
            for c in url[:i]:
                if c not in scheme_chars:
                    break