Commit 00756bd4 authored by Guido van Rossum

Major overhaul.  Don't use global variables (e.g. verbose); use
instance variables.  Make all global functions methods, for easy
overriding.  Restructure getpage() for easy overriding.  Add
save_pickle() method and load_pickle() global function to make it
easier for other programs to emulate the toplevel interface.
parent 1ee492e5
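The commit message above says the new save_pickle()/load_pickle() pair makes
it easier for other programs to emulate the toplevel interface. A minimal
sketch of such a driver, assuming webchecker is importable as a module and a
checkpoint file already exists; the flag values are illustrative, not defaults:

    import webchecker

    # Load a previously saved checkpoint (raises IOError if none exists).
    c = webchecker.load_pickle(dumpfile="@webchecker.pickle", verbose=0)

    # Adjust the tunables through the new setflags() interface;
    # unknown keywords raise NameError.
    c.setflags(checkext=0, verbose=2, maxpage=100000, roundsize=25)

    c.run()                               # process pending links
    c.report()                            # print the errors found so far
    c.save_pickle("@webchecker.pickle")   # checkpoint for a later -R run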
@@ -94,7 +94,7 @@ rooturl -- URL to start checking
"""
__version__ = "0.5"
__version__ = "$Revision$"
import sys
@@ -112,9 +112,17 @@ import sgmllib
import mimetypes
import robotparser
# Extract real version number if necessary
if __version__[0] == '$':
_v = string.split(__version__)
if len(_v) == 3:
__version__ = _v[1]
# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
CHECKEXT = 1 # Check external references (1 deep)
VERBOSE = 1 # Verbosity level (0-3)
MAXPAGE = 150000 # Ignore files bigger than this
ROUNDSIZE = 50 # Number of links processed per round
DUMPFILE = "@webchecker.pickle" # Pickled checkpoint
@@ -122,16 +130,15 @@ AGENTNAME = "webchecker" # Agent name for robots.txt parser
# Global variables
verbose = 1
maxpage = MAXPAGE
roundsize = ROUNDSIZE
def main():
global verbose, maxpage, roundsize
checkext = CHECKEXT
verbose = VERBOSE
maxpage = MAXPAGE
roundsize = ROUNDSIZE
dumpfile = DUMPFILE
restart = 0
checkext = 1
norun = 0
try:
@@ -163,17 +170,14 @@ def main():
print AGENTNAME, "version", __version__
if restart:
if verbose > 0:
print "Loading checkpoint from %s ..." % dumpfile
f = open(dumpfile, "rb")
c = pickle.load(f)
f.close()
if verbose > 0:
print "Done."
print "Root:", string.join(c.roots, "\n ")
c = load_pickle(dumpfile=dumpfile, verbose=verbose)
else:
c = Checker(checkext)
if not args:
c = Checker()
c.setflags(checkext=checkext, verbose=verbose,
maxpage=maxpage, roundsize=roundsize)
if not restart and not args:
args.append(DEFROOT)
for arg in args:
@@ -192,40 +196,43 @@ def main():
if verbose > 0:
print "[report interrupted]"
if not c.changed:
if verbose > 0:
print
print "No need to save checkpoint"
elif not dumpfile:
if verbose > 0:
print "No dumpfile, won't save checkpoint"
if c.save_pickle(dumpfile):
if dumpfile == DUMPFILE:
print "Use ``%s -R'' to restart." % sys.argv[0]
else:
print "Use ``%s -R -d %s'' to restart." % (sys.argv[0], dumpfile)
def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
if verbose > 0:
print
print "Saving checkpoint to %s ..." % dumpfile
newfile = dumpfile + ".new"
f = open(newfile, "wb")
pickle.dump(c, f)
print "Loading checkpoint from %s ..." % dumpfile
f = open(dumpfile, "rb")
c = pickle.load(f)
f.close()
try:
os.unlink(dumpfile)
except os.error:
pass
os.rename(newfile, dumpfile)
if verbose > 0:
print "Done."
if dumpfile == DUMPFILE:
print "Use ``%s -R'' to restart." % sys.argv[0]
else:
print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
dumpfile)
print "Root:", string.join(c.roots, "\n ")
return c
class Checker:
def __init__(self, checkext=1):
checkext = CHECKEXT
verbose = VERBOSE
maxpage = MAXPAGE
roundsize = ROUNDSIZE
validflags = tuple(dir())
def __init__(self):
self.reset()
self.checkext = checkext
def setflags(self, **kw):
for key in kw.keys():
if key not in self.validflags:
raise NameError, "invalid keyword argument: %s" % str(key)
for key, value in kw.items():
setattr(self, key, value)
def reset(self):
self.roots = []
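(A note on the setflags() machinery above: validflags = tuple(dir()) is
evaluated inside the class body, so it snapshots the names of the class-level
defaults defined before it, and setflags() then rejects any keyword outside
that set. A standalone sketch of the pattern, with illustrative names:

    class Tunable:
        verbose = 1                 # class-level defaults double as the flag list
        maxpage = 150000
        validflags = tuple(dir())   # names bound so far in the class body

        def setflags(self, **kw):
            for key in kw.keys():
                if key not in self.validflags:
                    raise NameError("invalid keyword argument: %s" % key)
            for key, value in kw.items():
                setattr(self, key, value)

Note that dir() with no arguments in a class body lists the names bound in the
class namespace so far, including implicit ones such as __module__.)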
@@ -243,6 +250,7 @@ class Checker:
return (self.roots, self.todo, self.done, self.bad, self.round)
def __setstate__(self, state):
self.reset()
(self.roots, self.todo, self.done, self.bad, self.round) = state
for root in self.roots:
self.addrobot(root)
@@ -268,24 +276,24 @@ class Checker:
if self.robots.has_key(root): return
url = urlparse.urljoin(root, "/robots.txt")
self.robots[root] = rp = robotparser.RobotFileParser()
if verbose > 2:
if self.verbose > 2:
print "Parsing", url
rp.debug = verbose > 3
rp.debug = self.verbose > 3
rp.set_url(url)
try:
rp.read()
except IOError, msg:
if verbose > 1:
if self.verbose > 1:
print "I/O error parsing", url, ":", msg
def run(self):
while self.todo:
self.round = self.round + 1
if verbose > 0:
if self.verbose > 0:
print
print "Round %d (%s)" % (self.round, self.status())
print
urls = self.todo.keys()[:roundsize]
urls = self.todo.keys()[:self.roundsize]
for url in urls:
self.dopage(url)
@@ -325,9 +333,9 @@ class Checker:
print " msg", msg
def dopage(self, url):
if verbose > 1:
if verbose > 2:
show("Check ", url, " from", self.todo[url])
if self.verbose > 1:
if self.verbose > 2:
self.show("Check ", url, " from", self.todo[url])
else:
print "Check ", url
page = self.getpage(url)
@@ -346,17 +354,17 @@ class Checker:
def newdonelink(self, url, origin):
self.done[url].append(origin)
if verbose > 3:
if self.verbose > 3:
print " Done link", url
def newtodolink(self, url, origin):
if self.todo.has_key(url):
self.todo[url].append(origin)
if verbose > 3:
if self.verbose > 3:
print " Seen todo link", url
else:
self.todo[url] = [origin]
if verbose > 3:
if self.verbose > 3:
print " New todo link", url
def markdone(self, url):
@@ -373,56 +381,79 @@ class Checker:
def getpage(self, url):
if url[:7] == 'mailto:' or url[:5] == 'news:':
if verbose > 1: print " Not checking mailto/news URL"
if self.verbose > 1: print " Not checking mailto/news URL"
return None
isint = self.inroots(url)
if not isint and not self.checkext:
if verbose > 1: print " Not checking ext link"
if not isint:
if not self.checkext:
if self.verbose > 1: print " Not checking ext link"
return None
f = self.openpage(url)
if f:
self.safeclose(f)
return None
text, nurl = self.readhtml(url)
if nurl != url:
if self.verbose > 1:
print " Redirected to", nurl
url = nurl
if text:
return Page(text, url, verbose=self.verbose, maxpage=self.maxpage)
def readhtml(self, url):
text = None
f, url = self.openhtml(url)
if f:
text = f.read()
f.close()
return text, url
def openhtml(self, url):
f = self.openpage(url)
if f:
url = f.geturl()
info = f.info()
if not self.checkforhtml(info, url):
self.safeclose(f)
f = None
return f, url
def openpage(self, url):
try:
f = self.urlopener.open(url)
return self.urlopener.open(url)
except IOError, msg:
msg = sanitize(msg)
if verbose > 0:
msg = self.sanitize(msg)
if self.verbose > 0:
print "Error ", msg
if verbose > 0:
show(" HREF ", url, " from", self.todo[url])
if self.verbose > 0:
self.show(" HREF ", url, " from", self.todo[url])
self.setbad(url, msg)
return None
if not isint:
if verbose > 1: print " Not gathering links from ext URL"
safeclose(f)
return None
nurl = f.geturl()
info = f.info()
def checkforhtml(self, info, url):
if info.has_key('content-type'):
ctype = string.lower(info['content-type'])
else:
ctype = None
if nurl != url:
if verbose > 1:
print " Redirected to", nurl
if not ctype:
ctype, encoding = mimetypes.guess_type(nurl)
if ctype != 'text/html':
safeclose(f)
if verbose > 1:
if url[-1:] == "/":
return 1
ctype, encoding = mimetypes.guess_type(url)
if ctype == 'text/html':
return 1
else:
if self.verbose > 1:
print " Not HTML, mime type", ctype
return None
text = f.read()
f.close()
return Page(text, nurl)
return 0
def setgood(self, url):
if self.bad.has_key(url):
del self.bad[url]
self.changed = 1
if verbose > 0:
if self.verbose > 0:
print "(Clear previously seen error)"
def setbad(self, url, msg):
if self.bad.has_key(url) and self.bad[url] == msg:
if verbose > 0:
if self.verbose > 0:
print "(Seen this error before)"
return
self.bad[url] = msg
@@ -444,23 +475,88 @@ class Checker:
except KeyError:
self.errors[url] = [triple]
# The following used to be toplevel functions; they have been
# changed into methods so they can be overridden in subclasses.
def show(self, p1, link, p2, origins):
print p1, link
i = 0
for source, rawlink in origins:
i = i+1
if i == 2:
p2 = ' '*len(p2)
print p2, source,
if rawlink != link: print "(%s)" % rawlink,
print
def sanitize(self, msg):
if isinstance(IOError, ClassType) and isinstance(msg, IOError):
# Do the other branch recursively
msg.args = self.sanitize(msg.args)
elif isinstance(msg, TupleType):
if len(msg) >= 4 and msg[0] == 'http error' and \
isinstance(msg[3], InstanceType):
# Remove the Message instance -- it may contain
# a file object which prevents pickling.
msg = msg[:3] + msg[4:]
return msg
def safeclose(self, f):
try:
url = f.geturl()
except AttributeError:
pass
else:
if url[:4] == 'ftp:' or url[:7] == 'file://':
# Apparently ftp connections don't like to be closed
# prematurely...
text = f.read()
f.close()
def save_pickle(self, dumpfile=DUMPFILE):
if not self.changed:
if self.verbose > 0:
print
print "No need to save checkpoint"
elif not dumpfile:
if self.verbose > 0:
print "No dumpfile, won't save checkpoint"
else:
if self.verbose > 0:
print
print "Saving checkpoint to %s ..." % dumpfile
newfile = dumpfile + ".new"
f = open(newfile, "wb")
pickle.dump(self, f)
f.close()
try:
os.unlink(dumpfile)
except os.error:
pass
os.rename(newfile, dumpfile)
if self.verbose > 0:
print "Done."
return 1
class Page:
def __init__(self, text, url):
def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE):
self.text = text
self.url = url
self.verbose = verbose
self.maxpage = maxpage
def getlinkinfos(self):
size = len(self.text)
if size > maxpage:
if verbose > 0:
if size > self.maxpage:
if self.verbose > 0:
print "Skip huge file", self.url
print " (%.0f Kbytes)" % (size*0.001)
return []
if verbose > 2:
if self.verbose > 2:
print " Parsing", self.url, "(%d bytes)" % size
parser = MyHTMLParser()
parser = MyHTMLParser(verbose=self.verbose)
parser.feed(self.text)
parser.close()
rawlinks = parser.getlinks()
@@ -529,10 +625,11 @@ class MyURLopener(urllib.FancyURLopener):
class MyHTMLParser(sgmllib.SGMLParser):
def __init__(self):
def __init__(self, verbose=VERBOSE):
self.base = None
self.links = {}
sgmllib.SGMLParser.__init__ (self)
self.myverbose = verbose
sgmllib.SGMLParser.__init__(self)
def start_a(self, attributes):
self.link_attr(attributes, 'href')
@@ -559,7 +656,7 @@ class MyHTMLParser(sgmllib.SGMLParser):
if name == 'href':
if value: value = string.strip(value)
if value:
if verbose > 1:
if self.myverbose > 1:
print " Base", value
self.base = value
@@ -570,41 +667,5 @@ class MyHTMLParser(sgmllib.SGMLParser):
return self.base
def show(p1, link, p2, origins):
print p1, link
i = 0
for source, rawlink in origins:
i = i+1
if i == 2:
p2 = ' '*len(p2)
print p2, source,
if rawlink != link: print "(%s)" % rawlink,
print
def sanitize(msg):
if (type(msg) == TupleType and
len(msg) >= 4 and
msg[0] == 'http error' and
type(msg[3]) == InstanceType):
# Remove the Message instance -- it may contain
# a file object which prevents pickling.
msg = msg[:3] + msg[4:]
return msg
def safeclose(f):
try:
url = f.geturl()
except AttributeError:
pass
else:
if url[:4] == 'ftp:' or url[:7] == 'file://':
# Apparently ftp connections don't like to be closed
# prematurely...
text = f.read()
f.close()
if __name__ == '__main__':
main()
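A closing note on the new save_pickle() method above: it writes the pickle to
dumpfile + ".new" and only renames it over the old file once the dump has
succeeded, so a crash mid-save cannot clobber the previous checkpoint. The
unlink-then-rename dance is needed because os.rename() refuses to overwrite an
existing file on some platforms. A self-contained sketch of the same
write-then-rename idea in modern Python, where os.replace() (added in 3.3)
subsumes both steps; the names are illustrative:

    import os
    import pickle

    def save_checkpoint(obj, dumpfile):
        # Dump to a sibling temp file first; an interrupted dump leaves
        # the existing checkpoint untouched.
        newfile = dumpfile + ".new"
        with open(newfile, "wb") as f:
            pickle.dump(obj, f)
        # Atomically replace the old checkpoint (works even if it exists).
        os.replace(newfile, dumpfile)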