"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info().  The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""

import string
import socket
import os
import stat
import time
import sys
import types

__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype", "getproxies"]

__version__ = '1.15'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        return unquote(pathname)
    def pathname2url(pathname):
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object"""
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    if data is None:
        return _urlopener.open(url)
    else:
        return _urlopener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()
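
# A minimal usage sketch of the shortcuts above (the URL below is only an
# illustration; any reachable http URL would do):
#
#     f = urlopen('http://www.python.org/')
#     print f.info().getheader('Content-Type')
#     data = f.read()
#     f.close()
#
#     filename, headers = urlretrieve('http://www.python.org/')
#     urlcleanup()        # remove temporary files created by urlretrieve()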


ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(toBytes(fullurl))
        if self.tempcache and self.tempcache.has_key(fullurl):
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if self.proxies.has_key(urltype):
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        if '-' in name:
            # replace - with _
            name = '_'.join(name.split('-'))
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, None) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(toBytes(url))
        if self.tempcache and self.tempcache.has_key(url):
            return self.tempcache[url]
        type, url1 = splittype(url)
        if not filename and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url, data)
        headers = fp.info()
        if not filename:
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            filename = tempfile.mktemp(suffix)
            self.__tempfiles.append(filename)
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        tfp = open(filename, 'wb')
        bs = 1024*8
        size = -1
        blocknum = 1
        if reporthook:
            if headers.has_key("content-length"):
                size = int(headers["Content-Length"])
            reporthook(0, bs, size)
        block = fp.read(bs)
        if reporthook:
            reporthook(1, bs, size)
        while block:
            tfp.write(block)
            block = fp.read(bs)
            blocknum = blocknum + 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp
        return result

    # Each method named open_<type> knows how to open that type of URL

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        import httplib
        user_passwd = None
        if type(url) is types.StringType:
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')
        if user_passwd:
            import base64
            auth = base64.encodestring(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTP(host)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-type', 'application/x-www-form-urlencoded')
            h.putheader('Content-length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: apply(h.putheader, args)
        h.endheaders()
        if data is not None:
            h.send(data)
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == 200:
            return addinfourl(fp, headers, "http:" + url)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise IOError, ('http error', errcode, errmsg, headers)
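
    # A sketch of the http_error_DDD convention described in http_error()
    # above; the subclass name is hypothetical:
    #
    #     class MyURLopener(FancyURLopener):
    #         def http_error_404(self, url, fp, errcode, errmsg, headers,
    #                            data=None):
    #             fp.close()
    #             return None     # falling through runs http_error_default()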

    if hasattr(socket, "ssl"):
        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            import httplib
            user_passwd = None
            if type(url) is types.StringType:
                host, selector = splithost(url)
                if host:
                    user_passwd, host = splituser(host)
                    host = unquote(host)
                realhost = host
            else:
                host, selector = url
                urltype, rest = splittype(selector)
                url = rest
                user_passwd = None
                if urltype.lower() != 'https':
                    realhost = None
                else:
                    realhost, rest = splithost(rest)
                    if realhost:
                        user_passwd, realhost = splituser(realhost)
                    if user_passwd:
                        selector = "%s://%s%s" % (urltype, realhost, rest)
                #print "proxy via https:", host, selector
            if not host: raise IOError, ('https error', 'no host given')
            if user_passwd:
                import base64
                auth = base64.encodestring(user_passwd).strip()
            else:
                auth = None
            h = httplib.HTTPS(host, 0,
                              key_file=self.key_file,
                              cert_file=self.cert_file)
            if data is not None:
                h.putrequest('POST', selector)
                h.putheader('Content-type',
                            'application/x-www-form-urlencoded')
                h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', selector)
            if auth: h.putheader('Authorization', 'Basic %s' % auth)
            if realhost: h.putheader('Host', realhost)
            for args in self.addheaders: apply(h.putheader, args)
            h.endheaders()
            if data is not None:
                h.send(data)
            errcode, errmsg, headers = h.getreply()
            fp = h.getfile()
            if errcode == 200:
                return addinfourl(fp, headers, url)
            else:
                if data is None:
                    return self.http_error(url, fp, errcode, errmsg, headers)
                else:
                    return self.http_error(url, fp, errcode, errmsg, headers,
                                           data)

    def open_gopher(self, url):
        """Use Gopher protocol."""
        import gopherlib
        host, selector = splithost(url)
        if not host: raise IOError, ('gopher error', 'no host given')
        host = unquote(host)
        type, selector = splitgophertype(selector)
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if query:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        else:
            fp = gopherlib.send_selector(selector, host)
        return addinfourl(fp, noheaders(), "gopher:" + url)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if url[:2] == '//' and url[2:3] != '/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, mimetools, rfc822, StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        stats = os.stat(localname)
        size = stats[stat.ST_SIZE]
        modified = rfc822.formatdate(stats[stat.ST_MTIME])
        mtype = mimetypes.guess_type(url)[0]
        headers = mimetools.Message(StringIO.StringIO(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        host, port = splitport(host)
        if not port \
           and socket.gethostbyname(host) in (localhost(), thishost()):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        raise IOError, ('local file error', 'not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        import mimetypes, mimetools, StringIO
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if not self.ftpcache.has_key(key):
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = mimetools.Message(StringIO.StringIO(headers))
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors(), msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]
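
    # For illustration, the FTP URL form accepted above (host name made up;
    # every component except the path is optional):
    #
    #     ftp://user:password@ftp.example.com:2121/pub/file.txt;type=i
    #
    # where ';type=a', ';type=i' or ';type=d' selects ASCII, binary or
    # directory-listing retrieval in the attribute loop above.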

    def open_data(self, url, data=None):
        """Use "data" URL."""
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        import StringIO, mimetools, time
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError, ('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            import base64
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        f = StringIO.StringIO(msg)
        headers = mimetools.Message(f, 0)
        f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
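
    # A small illustration of the data URL syntax handled above (values are
    # examples only):
    #
    #     urlopen('data:,Hello%20World').read()              -> 'Hello World'
    #     urlopen('data:text/plain;base64,SGVsbG8=').read()  -> 'Hello'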


class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args):
        apply(URLopener.__init__, (self,) + args)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        if not headers.has_key('www-authenticate'):
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        key = realm + '@' + host.lower()
        if self.auth_cache.has_key(key):
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
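
# A sketch of the override suggested in prompt_user_passwd() above; the
# subclass name and the fixed credentials are purely illustrative:
#
#     class MyOpener(FancyURLopener):
#         def prompt_user_passwd(self, host, realm):
#             # look the credentials up instead of prompting on a terminal
#             return 'user', 'secret'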


# Utility functions

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if not _localhost:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if not _thishost:
        _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if not _ftperrors:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object."""
    global _noheaders
    if not _noheaders:
        import mimetools
        import StringIO
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders


# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.init()

    def init(self):
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Use nlst to see if the file exists at all
            try:
                self.ftp.nlst(file)
            except ftplib.error_perm, reason:
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                if str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file: cmd = 'LIST ' + file
            else: cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])
    def endtransfer(self):
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass

class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno

    def __repr__(self):
        return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
                                             `id(self)`, `self.fp`)

    def close(self):
        self.read = None
        self.readline = None
        self.readlines = None
        self.fileno = None
        if self.fp: self.fp.close()
        self.fp = None

class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        if self.closehook:
            apply(self.closehook, self.hookargs)
            self.closehook = None
            self.hookargs = None

class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        return self.headers

class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        return self.headers

    def geturl(self):
        return self.url


def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL."""
    type, path = splittype(url)
    if type:
        # if url is complete (i.e., it contains a type), return it
        return url
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    if host:
        # if url contains host, just inherit type
        if type: return type + '://' + host + path
        else:
            # no type inherited, so url must have started with //
            # just return it
            return url
    host, basepath = splithost(basepath) # inherit host
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    if path[:1] != '/':
        # non-absolute path name
        if path[:1] in ('#', '?'):
            # path is just a tag or query, attach to basepath
            i = len(basepath)
        else:
            # else replace last component
            i = basepath.rfind('/')
        if i < 0:
            # basepath not absolute
            if host:
                # host present, make absolute
                basepath = '/'
            else:
                # else keep non-absolute
                basepath = ''
        else:
            # remove last file component
            basepath = basepath[:i+1]
        # Interpret ../ (important because of symlinks)
        while basepath and path[:3] == '../':
            path = path[3:]
            i = basepath[:-1].rfind('/')
            if i > 0:
                basepath = basepath[:i+1]
            elif i == 0:
                basepath = '/'
                break
            else:
                basepath = ''

        path = basepath + path
    if host and path and path[0] != '/':
        path = '/' + path
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
    else: return path
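
# A few illustrative cases (inputs invented for the example):
#
#     basejoin('http://host/a/b.html', 'c.html')    -> 'http://host/a/c.html'
#     basejoin('http://host/a/b.html', '/c.html')   -> 'http://host/c.html'
#     basejoin('http://host/a/b.html', '#frag')     -> 'http://host/a/b.html#frag'
#     basejoin('http://host/a/b.html', 'ftp://x/y') -> 'ftp://x/y'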


# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
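#
# For illustration, one made-up URL threaded through some of these helpers:
#
#   splittype('http://user:pw@host:8080/path?q=1#frag')
#       --> 'http', '//user:pw@host:8080/path?q=1#frag'
#   splithost('//user:pw@host:8080/path?q=1#frag')
#       --> 'user:pw@host:8080', '/path?q=1#frag'
#   splituser('user:pw@host:8080') --> 'user:pw', 'host:8080'
#   splitport('host:8080') --> 'host', '8080'
#   splitquery('/path?q=1') --> '/path', 'q=1'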

def toBytes(url):
    """toBytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if type(url) is types.UnicodeType:
        try:
            url = url.encode("ASCII")
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = url.strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^([^@]*)@(.*)$')

    match = _userprog.match(host)
    if match: return map(unquote, match.group(1, 2))
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None

# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError, "no digits"
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    if selector[:1] == '/' and selector[1:2]:
        return selector[1], selector[2:]
    return None, selector

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    mychr = chr
    myatoi = int
    list = s.split('%')
    res = [list[0]]
    myappend = res.append
    del list[0]
    for item in list:
        if item[1:2]:
            try:
                myappend(mychr(myatoi(item[:2], 16))
                     + item[2:])
            except ValueError:
                myappend('%' + item)
        else:
            myappend('%' + item)
    return "".join(res)

def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
    if '+' in s:
        # replace '+' with ' '
        s = ' '.join(s.split('+'))
    return unquote(s)

always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')

_fast_safe_test = always_safe + '/'
_fast_safe = None

def _fast_quote(s):
    global _fast_safe
    if _fast_safe is None:
        _fast_safe = {}
        for c in _fast_safe_test:
            _fast_safe[c] = c
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if not _fast_safe.has_key(c):
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    safe = always_safe + safe
    if _fast_safe_test == safe:
        return _fast_quote(s)
    res = list(s)
    for i in range(len(res)):
        c = res[i]
        if c not in safe:
            res[i] = '%%%02X' % ord(c)
    return ''.join(res)

def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' in s:
        l = s.split(' ')
        for i in range(len(l)):
            l[i] = quote(l[i], safe)
        return '+'.join(l)
    else:
        return quote(s, safe)
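
# A couple of illustrative values (chosen only as examples):
#
#     quote('/~user/file name.txt')    -> '/%7Euser/file%20name.txt'
#     quote_plus('a b & c')            -> 'a+b+%26+c'
#     quote_plus('a&b c', safe='&')    -> 'a&b+c'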

def urlencode(query,doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query,"items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            x = len(query)
            # non-empty strings will fail this
            if len(query) and type(query[0]) != types.TupleType:
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty,va,tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if type(v) == types.StringType:
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif type(v) == types.UnicodeType:
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII","replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
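
# Illustrative calls (values invented for the example):
#
#     urlencode([('q', 'python urllib'), ('page', 2)])  -> 'q=python+urllib&page=2'
#     urlencode({'tag': ['a', 'b']}, doseq=1)           -> 'tag=a&tag=b'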

# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies
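
# For example (hypothetical values), with the environment containing
#
#     http_proxy=http://proxy.example.com:3128/
#     ftp_proxy=http://proxy.example.com:3128/
#
# this would return
#
#     {'http': 'http://proxy.example.com:3128/',
#      'ftp': 'http://proxy.example.com:3128/'}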

if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

    def proxy_bypass(x):
        return 0

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        proxies[protocol] = '%s://%s' % (protocol, address)
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass(host):
        try:
            import _winreg
            import re
            import socket
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        host = [host]
        try:
            addr = socket.gethostbyname(host[0])
            if addr != host:
                host.append(addr)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # print proxyOverride
        # now check if we match one of the registry values.
        for test in proxyOverride:
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

else:
    # By default use environment variables
    getproxies = getproxies_environment

    def proxy_bypass(host):
        return 0

# Test and time quote() and unquote()
def test1():
    import time
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
    print `s`
    print `qs`
    print `uqs`
    print round(t1 - t0, 3), 'sec'


def reporthook(blocknum, blocksize, totalsize):
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)

# Test program
def test(args=[]):
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/etc/passwd',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print data
            fn, h = None, None
        print '-'*40
    finally:
        urlcleanup()

def main():
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':
    main()