Commit 986abac1 authored by Guido van Rossum

Give in to tabnanny

parent 36dfbcf3
@@ -9,79 +9,79 @@ fetchability of other URLs.

class RobotFileParser:

    def __init__(self):
        self.rules = {}
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def mtime(self):
        return self.last_checked

    def modified(self):
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        self.url = url
##      import urlmisc
##      self.url = urlmisc.canonical_url(url)

    def read(self):
        import urllib
        self.parse(urllib.urlopen(self.url).readlines())

    def parse(self, lines):
        import regsub, string, regex
        active = []
        for line in lines:
            if self.debug: print '>', line,
            # blank line terminates current record
            if not line[:-1]:
                active = []
                continue
            # remove optional comment and strip line
            line = string.strip(line[:string.find(line, '#')])
            if not line:
                continue
            line = regsub.split(line, ' *: *')
            if len(line) == 2:
                line[0] = string.lower(line[0])
                if line[0] == 'user-agent':
                    # this record applies to this user agent
                    if self.debug: print '>> user-agent:', line[1]
                    active.append(line[1])
                    if not self.rules.has_key(line[1]):
                        self.rules[line[1]] = []
                elif line[0] == 'disallow':
                    if line[1]:
                        if self.debug: print '>> disallow:', line[1]
                        for agent in active:
                            self.rules[agent].append(regex.compile(line[1]))
                    else:
                        for agent in active:
                            if self.debug: print '>> allow', agent
                            self.rules[agent] = []
                else:
                    if self.debug: print '>> unknown:', line
        self.modified()

    # returns true if agent is allowed to fetch url
    def can_fetch(self, agent, url):
        import urlparse
        ag = agent
        if not self.rules.has_key(ag): ag = '*'
        if not self.rules.has_key(ag):
            if self.debug: print '>> allowing', url, 'fetch by', agent
            return 1
        path = urlparse.urlparse(url)[2]
        for rule in self.rules[ag]:
            if rule.match(path) != -1:
                if self.debug: print '>> disallowing', url, 'fetch by', agent
                return 0
        if self.debug: print '>> allowing', url, 'fetch by', agent
        return 1

def test():
    rp = RobotFileParser()

@@ -91,7 +91,7 @@ def test():
    print rp.rules
    print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
    print rp.can_fetch('Musi-Cal-Robot',
                       'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')
    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')
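
For orientation, the class is driven exactly as test() does: set_url() points it at a robots.txt, read() fetches and parses the file, and can_fetch() answers per-agent queries. A minimal sketch of the same flow against this module's modern descendant, urllib.robotparser in Python 3 (the example host is made up for illustration; the regex-based rule matching above was later replaced by plain prefix matching):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://www.example.com/robots.txt')   # hypothetical site
rp.read()    # fetch the file and build the per-agent rule table
# Allowed unless a Disallow rule for this agent (or the '*' fallback)
# matches the URL's path component.
print(rp.can_fetch('*', 'http://www.example.com/concerts/'))
print(rp.can_fetch('Musi-Cal-Robot', 'http://www.example.com/cgi-bin/search'))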
"""Guess the MIME type of a file.
This module defines one useful function:
guess_type(url) -- guess the MIME type and encoding of a URL.
It also contains the following, for tuning the behavior:
Data:
knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffixes_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types
Functions:
init([files]) -- parse a list of files, default knownfiles
read_mime_types(file) -- parse one file, return a dictionary or None
"""
import string
import posixpath
knownfiles = [
"/usr/local/etc/httpd/conf/mime.types",
"/usr/local/lib/netscape/mime.types",
]
inited = 0
def guess_type(url):
"""Guess the type of a file based on its URL.
Return value is a tuple (type, encoding) where type is None if the
type can't be guessed (no or unknown suffix) or a string of the
form type/subtype, usable for a MIME Content-type header; and
encoding is None for no encoding or the name of the program used
to encode (e.g. compress or gzip). The mappings are table
driven. Encoding suffixes are case sensitive; type suffixes are
first tried case sensitive, then case insensitive.
The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
to ".tar.gz". (This is table-driven too, using the dictionary
suffixes_map).
"""
    if not inited:
        init()
    base, ext = posixpath.splitext(url)
    while suffix_map.has_key(ext):
        base, ext = posixpath.splitext(base + suffix_map[ext])
    if encodings_map.has_key(ext):
        encoding = encodings_map[ext]
        base, ext = posixpath.splitext(base)
    else:
        encoding = None
    if types_map.has_key(ext):
        return types_map[ext], encoding
    elif types_map.has_key(string.lower(ext)):
        return types_map[string.lower(ext)], encoding
    else:
        return None, encoding

def init(files=None):
    global inited
    for file in files or knownfiles:
        s = read_mime_types(file)
        if s:
            for key, value in s.items():
                types_map[key] = value
    inited = 1

def read_mime_types(file):
    try:
        f = open(file)
    except IOError:
        return None
    map = {}
    while 1:
        line = f.readline()
        if not line: break
        words = string.split(line)
        for i in range(len(words)):
            if words[i][0] == '#':
                del words[i:]
                break
        if not words: continue
        type, suffixes = words[0], words[1:]
        for suff in suffixes:
            map['.'+suff] = type
    f.close()
    return map

suffix_map = {
    '.tgz': '.tar.gz',
    '.taz': '.tar.gz',
    '.tz': '.tar.gz',
    }

encodings_map = {
    '.gz': 'gzip',
    '.Z': 'compress',
    }

types_map = {
    '.a': 'application/octet-stream',
    '.ai': 'application/postscript',
    '.aif': 'audio/x-aiff',
    '.aifc': 'audio/x-aiff',
    '.aiff': 'audio/x-aiff',
    '.au': 'audio/basic',
    '.avi': 'video/x-msvideo',
    '.bcpio': 'application/x-bcpio',
    '.bin': 'application/octet-stream',
    '.cdf': 'application/x-netcdf',
    '.cpio': 'application/x-cpio',
    '.csh': 'application/x-csh',
    '.dll': 'application/octet-stream',
    '.dvi': 'application/x-dvi',
    '.exe': 'application/octet-stream',
    '.eps': 'application/postscript',
    '.etx': 'text/x-setext',
    '.gif': 'image/gif',
    '.gtar': 'application/x-gtar',
    '.hdf': 'application/x-hdf',
    '.htm': 'text/html',
    '.html': 'text/html',
    '.shtml': 'text/html',
    '.ief': 'image/ief',
    '.jpe': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.jpg': 'image/jpeg',
    '.latex': 'application/x-latex',
    '.man': 'application/x-troff-man',
    '.me': 'application/x-troff-me',
    '.mif': 'application/x-mif',
    '.mov': 'video/quicktime',
    '.movie': 'video/x-sgi-movie',
    '.mpe': 'video/mpeg',
    '.mpeg': 'video/mpeg',
    '.mpg': 'video/mpeg',
    '.ms': 'application/x-troff-ms',
    '.nc': 'application/x-netcdf',
    '.o': 'application/octet-stream',
    '.obj': 'application/octet-stream',
    '.oda': 'application/oda',
    '.pbm': 'image/x-portable-bitmap',
    '.pdf': 'application/pdf',
    '.pgm': 'image/x-portable-graymap',
    '.pnm': 'image/x-portable-anymap',
    '.png': 'image/png',
    '.ppm': 'image/x-portable-pixmap',
    '.py': 'text/x-python',
    '.pyc': 'application/x-python-code',
    '.ps': 'application/postscript',
    '.qt': 'video/quicktime',
    '.ras': 'image/x-cmu-raster',
    '.rgb': 'image/x-rgb',
    '.roff': 'application/x-troff',
    '.rtf': 'application/rtf',
    '.rtx': 'text/richtext',
    '.sgm': 'text/x-sgml',
    '.sgml': 'text/x-sgml',
    '.sh': 'application/x-sh',
    '.shar': 'application/x-shar',
    '.snd': 'audio/basic',
    '.so': 'application/octet-stream',
    '.src': 'application/x-wais-source',
    '.sv4cpio': 'application/x-sv4cpio',
    '.sv4crc': 'application/x-sv4crc',
    '.t': 'application/x-troff',
    '.tar': 'application/x-tar',
    '.tcl': 'application/x-tcl',
    '.tex': 'application/x-tex',
    '.texi': 'application/x-texinfo',
    '.texinfo': 'application/x-texinfo',
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
    '.tr': 'application/x-troff',
    '.tsv': 'text/tab-separated-values',
    '.txt': 'text/plain',
    '.ustar': 'application/x-ustar',
    '.wav': 'audio/x-wav',
    '.xbm': 'image/x-xbitmap',
    '.xpm': 'image/x-xpixmap',
    '.xwd': 'image/x-xwindowdump',
    '.zip': 'application/zip',
    }
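
To make the table-driven lookup concrete: for a name like backup.tgz, suffix_map first rewrites '.tgz' to '.tar.gz', encodings_map then peels off '.gz' as the 'gzip' encoding, and the remaining '.tar' is resolved through types_map. A short sketch with the modern mimetypes module, which kept this three-table design (expected output shown in comments):

import mimetypes

print(mimetypes.guess_type('backup.tgz'))   # ('application/x-tar', 'gzip')
print(mimetypes.guess_type('index.html'))   # ('text/html', None)
print(mimetypes.guess_type('README'))       # (None, None) -- no known suffix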
@@ -16,29 +16,29 @@ import webchecker

if __version__[0] == '$':
    _v = string.split(__version__)
    if len(_v) == 3:
        __version__ = _v[1]

def main():
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error, msg:
        print msg
        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
        return 2
    for o, a in opts:
        if o == "-q":
            verbose = 0
        if o == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
        ]
    for arg in args:
        print "Adding root", arg
        c.addroot(arg)
    print "Run..."
    c.run()

@@ -47,57 +47,57 @@ class Sucker(webchecker.Checker):

    checkext = 0

    def readhtml(self, url):
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            f = self.openpage(url)
            if f:
                info = f.info()
                nurl = f.geturl()
                if nurl != url:
                    url = nurl
                    path = self.savefilename(url)
                text = f.read()
                f.close()
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            if self.checkforhtml({}, url):
                text = f.read()
            f.close()
        return text, url

    def savefile(self, text, path):
        dir, base = os.path.split(path)
        makedirs(dir)
        f = open(path, "wb")
        f.write(text)
        f.close()
        print "saved", path

    def savefilename(self, url):
        type, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        while path[:1] == "/": path = path[1:]
        user, host = urllib.splituser(host)
        host, port = urllib.splitnport(host)
        host = string.lower(host)
        path = os.path.join(host, path)
        if path[-1] == "/": path = path + "index.html"
        if os.sep != "/":
            path = string.join(string.split(path, "/"), os.sep)
        return path

def makedirs(dir):
    if not dir or os.path.exists(dir):
        return
    head, tail = os.path.split(dir)
    if not tail:
        print "Huh? Don't know how to make dir", dir
        return
    makedirs(head)
    os.mkdir(dir, 0777)
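
savefilename() above turns a URL into a relative local path: the scheme is stripped, leading slashes dropped, any user info and port removed, the host lowercased, and a trailing slash replaced by index.html. A hedged sketch of the same mapping using urllib.parse from Python 3 (the input URL is purely illustrative):

import os
from urllib.parse import urlsplit

def save_filename(url):
    parts = urlsplit(url)
    host = (parts.hostname or '').lower()   # hostname drops user:pass@ and :port
    path = parts.path.lstrip('/')
    if path == '' or path.endswith('/'):
        path = path + 'index.html'          # directory URL -> index page
    return os.path.join(host, *path.split('/'))

print(save_filename('http://User@WWW.Example.COM:80/docs/'))
# -> www.example.com/docs/index.html (with os.sep on Windows)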