Commit 986abac1 authored by Guido van Rossum

Give in to tabnanny

parent 36dfbcf3
@@ -9,79 +9,79 @@ fetchability of other URLs.

class RobotFileParser:

    def __init__(self):
        self.rules = {}
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def mtime(self):
        return self.last_checked

    def modified(self):
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        self.url = url
##      import urlmisc
##      self.url = urlmisc.canonical_url(url)

    def read(self):
        import urllib
        self.parse(urllib.urlopen(self.url).readlines())

    def parse(self, lines):
        import regsub, string, regex
        active = []
        for line in lines:
            if self.debug: print '>', line,
            # blank line terminates current record
            if not line[:-1]:
                active = []
                continue
            # remove optional comment and strip line
            line = string.strip(line[:string.find(line, '#')])
            if not line:
                continue
            line = regsub.split(line, ' *: *')
            if len(line) == 2:
                line[0] = string.lower(line[0])
                if line[0] == 'user-agent':
                    # this record applies to this user agent
                    if self.debug: print '>> user-agent:', line[1]
                    active.append(line[1])
                    if not self.rules.has_key(line[1]):
                        self.rules[line[1]] = []
                elif line[0] == 'disallow':
                    if line[1]:
                        if self.debug: print '>> disallow:', line[1]
                        for agent in active:
                            self.rules[agent].append(regex.compile(line[1]))
                    else:
                        for agent in active:
                            if self.debug: print '>> allow', agent
                            self.rules[agent] = []
                else:
                    if self.debug: print '>> unknown:', line
        self.modified()

    # returns true if agent is allowed to fetch url
    def can_fetch(self, agent, url):
        import urlparse
        ag = agent
        if not self.rules.has_key(ag): ag = '*'
        if not self.rules.has_key(ag):
            if self.debug: print '>> allowing', url, 'fetch by', agent
            return 1
        path = urlparse.urlparse(url)[2]
        for rule in self.rules[ag]:
            if rule.match(path) != -1:
                if self.debug: print '>> disallowing', url, 'fetch by', agent
                return 0
        if self.debug: print '>> allowing', url, 'fetch by', agent
        return 1

def test():
    rp = RobotFileParser()

@@ -91,7 +91,7 @@ def test():
    print rp.rules
    print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
    print rp.can_fetch('Musi-Cal-Robot',
                       'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')
    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')
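
For orientation, the class is driven exactly as test() does: set_url() points it at a robots.txt, read() fetches and parses the file, and can_fetch() answers per-agent queries. A minimal sketch of the same flow against this module's modern descendant, urllib.robotparser in Python 3 (the example host is made up for illustration; the regex-based rule matching above was later replaced by plain prefix matching):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://www.example.com/robots.txt')   # hypothetical site
rp.read()    # fetch the file and build the per-agent rule table
# Allowed unless a Disallow rule for this agent (or the '*' fallback)
# matches the URL's path component.
print(rp.can_fetch('*', 'http://www.example.com/concerts/'))
print(rp.can_fetch('Musi-Cal-Robot', 'http://www.example.com/cgi-bin/search'))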
"""Guess the MIME type of a file.
This module defines one useful function:
guess_type(url) -- guess the MIME type and encoding of a URL.
It also contains the following, for tuning the behavior:
Data:
knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffixes_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types
Functions:
init([files]) -- parse a list of files, default knownfiles
read_mime_types(file) -- parse one file, return a dictionary or None
"""
import string
import posixpath
knownfiles = [
"/usr/local/etc/httpd/conf/mime.types",
"/usr/local/lib/netscape/mime.types",
]
inited = 0
def guess_type(url):
"""Guess the type of a file based on its URL.
Return value is a tuple (type, encoding) where type is None if the
type can't be guessed (no or unknown suffix) or a string of the
form type/subtype, usable for a MIME Content-type header; and
encoding is None for no encoding or the name of the program used
to encode (e.g. compress or gzip). The mappings are table
driven. Encoding suffixes are case sensitive; type suffixes are
first tried case sensitive, then case insensitive.
The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
to ".tar.gz". (This is table-driven too, using the dictionary
suffixes_map).
"""
    if not inited:
        init()
    base, ext = posixpath.splitext(url)
    while suffix_map.has_key(ext):
        base, ext = posixpath.splitext(base + suffix_map[ext])
    if encodings_map.has_key(ext):
        encoding = encodings_map[ext]
        base, ext = posixpath.splitext(base)
    else:
        encoding = None
    if types_map.has_key(ext):
        return types_map[ext], encoding
    elif types_map.has_key(string.lower(ext)):
        return types_map[string.lower(ext)], encoding
    else:
        return None, encoding

def init(files=None):
    global inited
    for file in files or knownfiles:
        s = read_mime_types(file)
        if s:
            for key, value in s.items():
                types_map[key] = value
    inited = 1

def read_mime_types(file):
    try:
        f = open(file)
    except IOError:
        return None
    map = {}
    while 1:
        line = f.readline()
        if not line: break
        words = string.split(line)
        for i in range(len(words)):
            if words[i][0] == '#':
                del words[i:]
                break
        if not words: continue
        type, suffixes = words[0], words[1:]
        for suff in suffixes:
            map['.'+suff] = type
    f.close()
    return map

suffix_map = {
    '.tgz': '.tar.gz',
    '.taz': '.tar.gz',
    '.tz': '.tar.gz',
    }

encodings_map = {
    '.gz': 'gzip',
    '.Z': 'compress',
    }

types_map = {
    '.a': 'application/octet-stream',
    '.ai': 'application/postscript',
    '.aif': 'audio/x-aiff',
    '.aifc': 'audio/x-aiff',
    '.aiff': 'audio/x-aiff',
    '.au': 'audio/basic',
    '.avi': 'video/x-msvideo',
    '.bcpio': 'application/x-bcpio',
    '.bin': 'application/octet-stream',
    '.cdf': 'application/x-netcdf',
    '.cpio': 'application/x-cpio',
    '.csh': 'application/x-csh',
    '.dll': 'application/octet-stream',
    '.dvi': 'application/x-dvi',
    '.exe': 'application/octet-stream',
    '.eps': 'application/postscript',
    '.etx': 'text/x-setext',
    '.gif': 'image/gif',
    '.gtar': 'application/x-gtar',
    '.hdf': 'application/x-hdf',
    '.htm': 'text/html',
    '.html': 'text/html',
    '.shtml': 'text/html',
    '.ief': 'image/ief',
    '.jpe': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.jpg': 'image/jpeg',
    '.latex': 'application/x-latex',
    '.man': 'application/x-troff-man',
    '.me': 'application/x-troff-me',
    '.mif': 'application/x-mif',
    '.mov': 'video/quicktime',
    '.movie': 'video/x-sgi-movie',
    '.mpe': 'video/mpeg',
    '.mpeg': 'video/mpeg',
    '.mpg': 'video/mpeg',
    '.ms': 'application/x-troff-ms',
    '.nc': 'application/x-netcdf',
    '.o': 'application/octet-stream',
    '.obj': 'application/octet-stream',
    '.oda': 'application/oda',
    '.pbm': 'image/x-portable-bitmap',
    '.pdf': 'application/pdf',
    '.pgm': 'image/x-portable-graymap',
    '.pnm': 'image/x-portable-anymap',
    '.png': 'image/png',
    '.ppm': 'image/x-portable-pixmap',
    '.py': 'text/x-python',
    '.pyc': 'application/x-python-code',
    '.ps': 'application/postscript',
    '.qt': 'video/quicktime',
    '.ras': 'image/x-cmu-raster',
    '.rgb': 'image/x-rgb',
    '.roff': 'application/x-troff',
    '.rtf': 'application/rtf',
    '.rtx': 'text/richtext',
    '.sgm': 'text/x-sgml',
    '.sgml': 'text/x-sgml',
    '.sh': 'application/x-sh',
    '.shar': 'application/x-shar',
    '.snd': 'audio/basic',
    '.so': 'application/octet-stream',
    '.src': 'application/x-wais-source',
    '.sv4cpio': 'application/x-sv4cpio',
    '.sv4crc': 'application/x-sv4crc',
    '.t': 'application/x-troff',
    '.tar': 'application/x-tar',
    '.tcl': 'application/x-tcl',
    '.tex': 'application/x-tex',
    '.texi': 'application/x-texinfo',
    '.texinfo': 'application/x-texinfo',
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
    '.tr': 'application/x-troff',
    '.tsv': 'text/tab-separated-values',
    '.txt': 'text/plain',
    '.ustar': 'application/x-ustar',
    '.wav': 'audio/x-wav',
    '.xbm': 'image/x-xbitmap',
    '.xpm': 'image/x-xpixmap',
    '.xwd': 'image/x-xwindowdump',
    '.zip': 'application/zip',
    }
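
To make the table-driven lookup concrete: for a name like backup.tgz, suffix_map first rewrites '.tgz' to '.tar.gz', encodings_map then peels off '.gz' as the 'gzip' encoding, and the remaining '.tar' is resolved through types_map. A short sketch with the modern mimetypes module, which kept this three-table design (expected output shown in comments):

import mimetypes

print(mimetypes.guess_type('backup.tgz'))   # ('application/x-tar', 'gzip')
print(mimetypes.guess_type('index.html'))   # ('text/html', None)
print(mimetypes.guess_type('README'))       # (None, None) -- no known suffix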
@@ -16,29 +16,29 @@ import webchecker

if __version__[0] == '$':
    _v = string.split(__version__)
    if len(_v) == 3:
        __version__ = _v[1]

def main():
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error, msg:
        print msg
        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
        return 2
    for o, a in opts:
        if o == "-q":
            verbose = 0
        if o == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
        ]
    for arg in args:
        print "Adding root", arg
        c.addroot(arg)
    print "Run..."
    c.run()

@@ -47,57 +47,57 @@ class Sucker(webchecker.Checker):

    checkext = 0

    def readhtml(self, url):
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            f = self.openpage(url)
            if f:
                info = f.info()
                nurl = f.geturl()
                if nurl != url:
                    url = nurl
                    path = self.savefilename(url)
                text = f.read()
                f.close()
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            if self.checkforhtml({}, url):
                text = f.read()
            f.close()
        return text, url

    def savefile(self, text, path):
        dir, base = os.path.split(path)
        makedirs(dir)
        f = open(path, "wb")
        f.write(text)
        f.close()
        print "saved", path

    def savefilename(self, url):
        type, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        while path[:1] == "/": path = path[1:]
        user, host = urllib.splituser(host)
        host, port = urllib.splitnport(host)
        host = string.lower(host)
        path = os.path.join(host, path)
        if path[-1] == "/": path = path + "index.html"
        if os.sep != "/":
            path = string.join(string.split(path, "/"), os.sep)
        return path

def makedirs(dir):
    if not dir or os.path.exists(dir):
        return
    head, tail = os.path.split(dir)
    if not tail:
        print "Huh? Don't know how to make dir", dir
        return
    makedirs(head)
    os.mkdir(dir, 0777)
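
savefilename() above turns a URL into a relative local path: the scheme is stripped, leading slashes dropped, any user info and port removed, the host lowercased, and a trailing slash replaced by index.html. A hedged sketch of the same mapping using urllib.parse from Python 3 (the input URL is purely illustrative):

import os
from urllib.parse import urlsplit

def save_filename(url):
    parts = urlsplit(url)
    host = (parts.hostname or '').lower()   # hostname drops user:pass@ and :port
    path = parts.path.lstrip('/')
    if path == '' or path.endswith('/'):
        path = path + 'index.html'          # directory URL -> index page
    return os.path.join(host, *path.split('/'))

print(save_filename('http://User@WWW.Example.COM:80/docs/'))
# -> www.example.com/docs/index.html (with os.sep on Windows)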