"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The testcases in
test_urlparse.py provide a good indicator of parsing behavior.
"""
29

30
import re
31
import sys
32
import collections
33

34
# Public API of this module (names exported by ``from ... import *``).
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
40

41
# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# XXX: Consider replacing with functools.lru_cache
# The parse cache is emptied wholesale once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    _safe_quoters.clear()
78 79


80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
# Helpers for bytes handling
# For 3.2, applications that deal with improperly quoted URLs are
# deliberately required to do their own decoding and encoding.
# If valid use cases are presented, this may be relaxed by using
# latin-1 decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'


def _noop(obj):
    """Result coercion for str inputs: hand the object back untouched."""
    return obj


def _encode_result(obj, encoding=_implicit_encoding, errors=_implicit_errors):
    """Result coercion for non-str inputs: encode the result."""
    return obj.encode(encoding, errors)


def _decode_args(args, encoding=_implicit_encoding, errors=_implicit_errors):
    """Decode every truthy item of *args*; falsy items become ''."""
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)


def _coerce_args(*args):
    """Return the arguments as str plus a matching result coercion function.

    The type of the first argument decides the mode: str inputs are passed
    through and paired with _noop; anything else is decoded and paired with
    _encode_result.  Mixing str and non-str raises TypeError.
    """
    first_is_str = isinstance(args[0], str)
    for extra in args[1:]:
        # Falsy values (notably the "scheme=''" default argument of some
        # functions) are exempt from the type-consistency check.
        if not extra:
            continue
        if isinstance(extra, str) != first_is_str:
            raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)

# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        # Encode each field, then rebuild as the paired bytes result type.
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)


class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        # Decode each field, then rebuild as the paired str result type.
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)


class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    # Subclasses supply ``_userinfo`` -> (username, password) and
    # ``_hostinfo`` -> (hostname, port); the properties below only post-process
    # those pairs.

    @property
    def username(self):
        return self._userinfo[0]

    @property
    def password(self):
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when it is empty or missing."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        return hostname.lower()

    @property
    def port(self):
        """Port as an int, or None when absent.

        Raises ValueError if the port is not a base-10 integer or lies
        outside 0-65535.
        """
        port = self._hostinfo[1]
        if port is not None:
            port = int(port, 10)
            if not (0 <= port <= 65535):
                raise ValueError("Port out of range 0-65535")
        return port


class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc accessors for str results: splits ``self.netloc`` on demand."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the text before the last '@'.

        Both are None when netloc has no '@'; password is None when the
        user info has no ':'.
        """
        netloc = self.netloc
        userinfo, have_info, _ = netloc.rpartition('@')
        if have_info:
            username, have_password, password = userinfo.partition(':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the text after the last '@'.

        A bracketed host ("[...]") is returned without the brackets, and its
        port is whatever follows the first ':' after the closing ']'.
        An empty port is reported as None.
        """
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition('@')
        _, have_open_br, bracketed = hostinfo.partition('[')
        if have_open_br:
            hostname, _, port = bracketed.partition(']')
            _, _, port = port.partition(':')
        else:
            hostname, _, port = hostinfo.partition(':')
        if not port:
            port = None
        return hostname, port


class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc accessors for bytes results: splits ``self.netloc`` on demand."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the bytes before the last b'@'.

        Both are None when netloc has no b'@'; password is None when the
        user info has no b':'.
        """
        netloc = self.netloc
        userinfo, have_info, _ = netloc.rpartition(b'@')
        if have_info:
            username, have_password, password = userinfo.partition(b':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the bytes after the last b'@'.

        A bracketed host (b"[...]") is returned without the brackets, and its
        port is whatever follows the first b':' after the closing b']'.
        An empty port is reported as None.
        """
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition(b'@')
        _, have_open_br, bracketed = hostinfo.partition(b'[')
        if have_open_br:
            hostname, _, port = bracketed.partition(b']')
            _, _, port = port.partition(b':')
        else:
            hostname, _, port = hostinfo.partition(b':')
        if not port:
            port = None
        return hostname, port

223

224
from collections import namedtuple
225

226
# Plain namedtuple bases; the public result classes add the netloc
# accessors and str/bytes transcoding on top of these.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params,  query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Fields shared with SplitResult reuse its field documentation.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__

292 293 294 295 296

# For backwards compatibility, ResultBase is kept as an alias of
# _NetlocResultMixinStr.  It is no longer part of the documented API,
# but it is retained since deprecating it isn't worth the hassle.
ResultBase = _NetlocResultMixinStr
297

298 299
# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()

    def geturl(self):
        """Return the URL with '#fragment' re-attached when a fragment exists."""
        if self.fragment:
            return self.url + '#' + self.fragment
        else:
            return self.url
306

307 308
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    __slots__ = ()

    def geturl(self):
        """Reassemble the five components into a URL via urlunsplit()."""
        return urlunsplit(self)

312 313 314 315
# str result of urlparse(): the six-field tuple plus the netloc accessors.
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    # Reassemble the six components into a URL via urlunparse().
    def geturl(self):
        return urlunparse(self)
316

317 318 319 320 321 322 323 324
# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        # Nothing to re-attach when the fragment is empty.
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
325

326
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        """Reassemble the five components into a URL via urlunsplit()."""
        return urlunsplit(self)
330

331 332
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        """Reassemble the six components into a URL via urlunparse()."""
        return urlunparse(self)

336 337 338 339 340 341 342 343 344 345 346 347 348
# Set up the encode/decode result pairs
def _fix_result_transcoding():
    # Each str result class learns its bytes twin (used by encode()) and
    # each bytes result class learns its str twin (used by decode()).
    for str_cls, bytes_cls in (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    ):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

_fix_result_transcoding()
del _fix_result_transcoding
349 350

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    splitresult = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = splitresult
    # Params are only split off for schemes listed in uses_params.
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    result = ParseResult(scheme, netloc, url, params, query, fragment)
    return _coerce_result(result)
365 366 367 368 369 370 371 372 373 374

def _splitparams(url):
    """Split *url* (a path) into (path, params) at a ';'.

    When the path contains '/', only a ';' at or after the last '/' counts,
    and a path without one is returned with empty params.  When there is no
    '/', the split happens at the first ';' with no not-found check.
    """
    last_slash = url.rfind('/')
    if last_slash < 0:
        cut = url.find(';')
    else:
        cut = url.find(';', last_slash)
        if cut < 0:
            return url, ''
    return url[:cut], url[cut + 1:]

375
def _splitnetloc(url, start=0):
    """Split *url* into (netloc, rest), with the netloc starting at *start*.

    The netloc ends at the earliest '/', '?' or '#' found at or after
    *start*, or at the end of the string when none is present.
    """
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)
382

383
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Raises ValueError("Invalid IPv6 URL") when the netloc contains only one
    of '[' and ']'."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        candidate = url[:i]
        if candidate == 'http': # optimize the common case
            # A literal 'http' prefix is always taken as the scheme; the
            # port-number check below is intentionally skipped for it.
            scheme, url = candidate, url[i+1:]
        elif all(c in scheme_chars for c in candidate):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = candidate.lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
438

439
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment, _coerce_result = (
                                                  _coerce_args(*components))
    if params:
        # Params travel with the path, separated by ';'.
        url = "%s;%s" % (url, params)
    return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))
449

450
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
                                          _coerce_args(*components))
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        # A non-empty path placed after the netloc must start with '/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
468

469
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty, the other one is returned unchanged."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    # A different scheme, or one without relative-URL support, means the
    # second URL stands on its own.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        # Only a query and/or fragment was given: keep the base path.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an IndexError
                # when popped from resolved_path if resolving for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))
536

537

538
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
    else:
        # No '#': the URL is returned as given, without a parse round-trip.
        frag = ''
        defrag = url
    return _coerce_result(DefragResult(defrag, frag))
553

554
_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    Accepts str or bytes.  A '%' that is not followed by two hex digits is
    left in the output unchanged.
    """
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        # (attribute access only: raises AttributeError for non-strings)
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    bits = string.split(b'%')
    if len(bits) == 1:
        return string
    res = [bits[0]]
    append = res.append
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes([int(a + b, 16)])
                      for a in _hexdig for b in _hexdig}
    for item in bits[1:]:
        try:
            append(_hextobyte[item[:2]])
            append(item[2:])
        except KeyError:
            # Not a valid escape: put the '%' back and keep the text as is.
            append(b'%')
            append(item)
    return b''.join(res)

_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    string may be either a str or a bytes object; a str is always returned.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    if isinstance(string, bytes):
        # bytes input: unescape, then decode the whole result in one go.
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Attribute access only: raises AttributeError for non-strings.
        string.split
        return string
    # Split into alternating non-ASCII / ASCII runs; only the ASCII runs
    # (odd indices) can contain escapes and are unescaped and decoded.
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)
613

614 615
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        Returns a dict mapping each name to the list of its values, in the
        order they appear in the query.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors)
    for name, value in pairs:
        parsed_result.setdefault(name, []).append(value)
    return parsed_result
645

646 647
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were  not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a list of (name, value) tuples.
    """
    qs, _coerce_result = _coerce_args(qs)
    # Fields may be separated by either '&' or ';'.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r

694 695 696 697 698 699 700 701 702 703 704 705 706
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are first turned into spaces, as
    required for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)

# Byte values that are never percent-encoded.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of Quoter lookup functions, keyed by the 'safe' bytes.
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding and errors
    must not be specified if string is a bytes object.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if not string:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        # Non-str input is handed to quote_from_bytes() as is, so the text
        # encoding options make no sense here.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
        (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    # The space marker has to be the same type as safe so the two can be
    # concatenated below.
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    # Let spaces pass through quote() untouched, then turn them into '+'.
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')
792 793 794 795

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3f'

    safe may be a str or a bytes-like iterable of ints; non-ASCII entries
    in it are discarded.

    Raises TypeError if bs is not a bytes or bytearray object.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    # Fast path: if stripping safe bytes leaves nothing, every byte is safe
    # and the input can be returned as text without any quoting.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    # One quoter callable is cached per distinct 'safe' value.
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])
814

815 816
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Raises TypeError if query has no items() method and is not a sized,
    indexable object that is either empty or starts with a tuple.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            # Re-raise with a clearer message but keep the original traceback.
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    def quote_part(part):
        # bytes are handed to quote_via as they are (encoding/errors do not
        # apply to them); anything else is converted with str() first.
        if isinstance(part, bytes):
            return quote_via(part, safe)
        return quote_via(str(part), safe, encoding, errors)

    expand_sequences = bool(doseq)
    pairs = []
    for k, v in query:
        k = quote_part(k)
        if not expand_sequences or isinstance(v, bytes):
            pairs.append(k + '=' + quote_part(v))
        elif isinstance(v, str):
            # A str is a sequence too, but it is a single value here; it is
            # passed on as-is, without going through str().
            pairs.append(k + '=' + quote_via(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(k + '=' + quote_part(v))
            else:
                # one key=value parameter per element of the sequence
                pairs.extend(k + '=' + quote_part(elt) for elt in v)
    return '&'.join(pairs)

895 896
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    Despite the name, a str argument comes back as a str: it is only
    round-tripped through ASCII to verify that it contains no non-ASCII
    characters.  Any non-str argument is returned unchanged.

    Raises UnicodeError if a str argument contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if isinstance(url, str):
        try:
            url = url.encode("ASCII").decode()
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace.  One enclosing '<...>' pair and then a leading 'URL:'
    marker are removed, stripping whitespace again after each removal.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text

# Compiled lazily on the first splittype() call and reused afterwards.
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type (scheme) is returned lower-cased.  If url does not start with
    a run of characters other than '/' and ':' followed by ':', the result
    is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    match = _typeprog.match(url)
    if match:
        scheme, data = match.groups()
        return scheme.lower(), data
    return None, url

# Compiled lazily on the first splithost() call and reused afterwards.
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/' or '?'.  A non-empty remainder that
    does not begin with '/' gets one prepended.  If url does not start with
    '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and path[0] != '/':
            # e.g. '//host?query': make the remainder an absolute path
            path = '/' + path
        return host_port, path
    return None, url

def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without any '@' the user part is
    None and the whole argument is returned as the host part.
    """
    user, delim, host = host.rpartition('@')
    return (user if delim else None), host
948 949 950

def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without any ':' the password part
    is None.
    """
    user, delim, passwd = user.partition(':')
    return user, (passwd if delim else None)
953 954 955 956 957 958 959

# Pattern used by splitport(); compiled on first use and cached here.
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is whatever run of ASCII digits follows the last ':' and must
    extend to the very end of the string.  If there is no such suffix the
    result is (host, None) with host unchanged; if the suffix is an empty
    port ('host:'), the result is ('host', None).
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    # fullmatch() rather than match() with '$': '$' also matches just before
    # a trailing newline, which would silently drop that newline and accept
    # 'host:80\n' as port '80'.
    match = _portprog.fullmatch(host)
    if match:
        host, port = match.groups()
        if port:
            return host, port
    return host, None

def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found or if nothing follows the
    last ':'; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    host, delim, port = host.rpartition(':')
    if not delim:
        # No ':' at all: rpartition() put the whole input in the last slot.
        host = port
    elif port:
        try:
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport

def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without any '?' the result is
    (url, None).
    """
    path, delim, query = url.rpartition('?')
    if delim:
        return path, query
    return url, None

def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without any '#' the result is
    (url, None).
    """
    path, delim, tag = url.rpartition('#')
    if delim:
        return path, tag
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs

def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without any '=' the value part
    is None.
    """
    attr, delim, value = attr.partition('=')
    return attr, (value if delim else None)