Commit e284b214 authored by Guido van Rossum

Integrated Sam Bayer's wcnew.py code.  It seems silly to keep two
files.  Removed Sam's "SLB" change comments; otherwise this is the
same as wcnew.py.

parent 61b95db3
 #! /usr/bin/env python
+# Original code by Guido van Rossum; extensive changes by Sam Bayer,
+# including code to check URL fragments.
 """Web tree checker.

 This utility is handy to check a subweb of the world-wide web for
@@ -64,14 +67,18 @@ directory) has a built-in table mapping most currently known suffixes,
 and in addition attempts to read the mime.types configuration files in
 the default locations of Netscape and the NCSA HTTP daemon.

-- We follows links indicated by <A>, <FRAME> and <IMG> tags.  We also
+- We follow links indicated by <A>, <FRAME> and <IMG> tags.  We also
   honor the <BASE> tag.

+- We now check internal NAME anchor links, as well as toplevel links.
+
 - Checking external links is now done by default; use -x to *disable*
   this feature.  External links are now checked during normal
   processing.  (XXX The status of a checked link could be categorized
   better.  Later...)

+- If external links are not checked, you can use the -t flag to
+  provide specific overrides to -x.
+
 Usage: webchecker.py [option] ... [rooturl] ...
@@ -83,8 +90,10 @@ Options:
 -n        -- reports only, no checking (use with -R)
 -q        -- quiet operation (also suppresses external links report)
 -r number -- number of links processed per round (default %(ROUNDSIZE)d)
+-t root   -- specify root dir which should be treated as internal (can repeat)
 -v        -- verbose operation; repeating -v will increase verbosity
 -x        -- don't check external links (these are often slow to check)
+-a        -- don't check name anchors

 Arguments:
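By way of illustration, the new flags map onto Checker state roughly as
follows.  This is a hedged sketch, not part of the commit: the URLs are
invented, and it assumes the calls mirror what main() does for -x, -t
and -a.

    # Roughly: webchecker.py -x -t http://example.com/private/ http://example.com/
    c = Checker()
    c.setflags(checkext=0, nonames=0)         # -x given, -a not given
    c.addroot("http://example.com/")          # positional root: crawled
    c.addroot("http://example.com/private/",  # -t root: internal for link
              add_to_do=0)                    # classification, never queued
    c.run()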
@@ -127,6 +136,7 @@ MAXPAGE = 150000                        # Ignore files bigger than this
 ROUNDSIZE = 50                          # Number of links processed per round
 DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
 AGENTNAME = "webchecker"                # Agent name for robots.txt parser
+NONAMES = 0                             # Force name anchor checking

 # Global variables

@@ -142,12 +152,17 @@ def main():
     norun = 0
     try:
-        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:vx')
+        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
     except getopt.error, msg:
         sys.stdout = sys.stderr
         print msg
         print __doc__%globals()
         sys.exit(2)
+
+    # The extra_roots variable collects extra roots.
+    extra_roots = []
+    nonames = NONAMES
+
     for o, a in opts:
         if o == '-R':
             restart = 1
@@ -161,6 +176,10 @@ def main():
             verbose = 0
         if o == '-r':
             roundsize = string.atoi(a)
+        if o == '-t':
+            extra_roots.append(a)
+        if o == '-a':
+            nonames = not nonames
         if o == '-v':
             verbose = verbose + 1
         if o == '-x':
@@ -175,7 +194,9 @@ def main():
     c = Checker()
     c.setflags(checkext=checkext, verbose=verbose,
-               maxpage=maxpage, roundsize=roundsize)
+               maxpage=maxpage, roundsize=roundsize,
+               nonames=nonames
+               )

     if not restart and not args:
         args.append(DEFROOT)

@@ -183,6 +204,17 @@
     for arg in args:
         c.addroot(arg)
+
+    # The -t flag is only needed if external links are not to be
+    # checked.  So -t values are ignored unless -x was specified.
+    if not checkext:
+        for root in extra_roots:
+            # Make sure it's terminated by a slash,
+            # so that addroot doesn't discard the last
+            # directory component.
+            if root[-1] != "/":
+                root = root + "/"
+            c.addroot(root, add_to_do = 0)
+
     try:
         if not norun:
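The slash normalization above matters because relative-URL resolution
discards the last path component when the base does not end in a slash.
A quick demonstration of the underlying urlparse behavior (the URLs are
invented):

    import urlparse

    # Base without a trailing slash: "docs" is treated as a leaf and dropped.
    print urlparse.urljoin("http://example.com/docs", "sub.html")
    # -> http://example.com/sub.html

    # Base with a trailing slash: "docs" survives as a directory.
    print urlparse.urljoin("http://example.com/docs/", "sub.html")
    # -> http://example.com/docs/sub.html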
@@ -225,6 +257,7 @@ class Checker:
     verbose = VERBOSE
     maxpage = MAXPAGE
     roundsize = ROUNDSIZE
+    nonames = NONAMES

     validflags = tuple(dir())
@@ -243,19 +276,24 @@ class Checker:
         self.todo = {}
         self.done = {}
         self.bad = {}
+
+        # Add a name table, so that the name URLs can be checked.  Also
+        # serves as an implicit cache for which URLs are done.
+        self.name_table = {}
+
         self.round = 0
         # The following are not pickled:
         self.robots = {}
         self.errors = {}
         self.urlopener = MyURLopener()
         self.changed = 0

     def note(self, level, format, *args):
         if self.verbose > level:
             if args:
                 format = format%args
             self.message(format)

     def message(self, format, *args):
         if args:
             format = format%args
@@ -272,7 +310,7 @@ class Checker:
         for url in self.bad.keys():
             self.markerror(url)

-    def addroot(self, root):
+    def addroot(self, root, add_to_do = 1):
         if root not in self.roots:
             troot = root
             scheme, netloc, path, params, query, fragment = \

@@ -284,7 +322,8 @@ class Checker:
                                          params, query, fragment))
             self.roots.append(troot)
             self.addrobot(root)
-            self.newlink(root, ("<root>", root))
+            if add_to_do:
+                self.newlink((root, ""), ("<root>", root))

     def addrobot(self, root):
         root = urlparse.urljoin(root, "/")
@@ -336,24 +375,53 @@ class Checker:
             self.message("%d Errors in %s", len(triples), source)
         else:
             self.message("Error in %s", source)
-        for url, rawlink, msg in triples:
-            if rawlink != url: s = " (%s)" % rawlink
+
+        # Call self.format_url() instead of referring to the URL
+        # directly, since the URLs in these triples are now (URL,
+        # fragment) pairs.  The value of the "source" variable comes
+        # from the list of origins, and is a URL, not a pair.
+        for url, rawlink, msg in triples:
+            if rawlink != self.format_url(url): s = " (%s)" % rawlink
             else: s = ""
-            self.message("  HREF %s%s\n    msg %s", url, s, msg)
+            self.message("  HREF %s%s\n    msg %s",
+                         self.format_url(url), s, msg)

-    def dopage(self, url):
+    def dopage(self, url_pair):
+
+        # All printing of URLs uses format_url(); argument changed to
+        # url_pair for clarity.
         if self.verbose > 1:
             if self.verbose > 2:
-                self.show("Check ", url, "  from", self.todo[url])
+                self.show("Check ", self.format_url(url_pair),
+                          "  from", self.todo[url_pair])
             else:
-                self.message("Check %s", url)
-        page = self.getpage(url)
+                self.message("Check %s", self.format_url(url_pair))
+        url, local_fragment = url_pair
+        if local_fragment and self.nonames:
+            self.markdone(url_pair)
+            return
+        page = self.getpage(url_pair)
         if page:
+            # Store the page which corresponds to this URL.
+            self.name_table[url] = page
+            # If there is a fragment in this url_pair, and it's not
+            # in the list of names for the page, call setbad(), since
+            # it's a missing anchor.
+            if local_fragment and local_fragment not in page.getnames():
+                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
             for info in page.getlinkinfos():
-                link, rawlink = info
+                # getlinkinfos() now returns the fragment as well,
+                # and we store that fragment here in the "todo" dictionary.
+                link, rawlink, fragment = info
+                # However, we don't want the fragment as the origin, since
+                # the origin is logically a page.
                 origin = url, rawlink
-                self.newlink(link, origin)
-        self.markdone(url)
+                self.newlink((link, fragment), origin)
+        else:
+            # If no page has been created yet, we want to
+            # record that fact.
+            self.name_table[url_pair[0]] = None
+
+        self.markdone(url_pair)
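The missing-anchor test above reduces to a membership check against the
names collected at parse time.  A minimal sketch with made-up data
standing in for a parsed Page:

    names = ["intro", "usage"]          # as page.getnames() might return
    url_pair = ("http://example.com/doc.html", "install")

    url, local_fragment = url_pair
    if local_fragment and local_fragment not in names:
        # Fires: the page defines no <A NAME="install"> anchor.
        print "Missing name anchor `%s'" % local_fragment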

     def newlink(self, url, origin):
         if self.done.has_key(url):

@@ -362,21 +430,34 @@
             self.newtodolink(url, origin)

     def newdonelink(self, url, origin):
-        self.done[url].append(origin)
-        self.note(3, "  Done link %s", url)
+        if origin not in self.done[url]:
+            self.done[url].append(origin)
+
+        # Call self.format_url(), since the URL here
+        # is now a (URL, fragment) pair.
+        self.note(3, "  Done link %s", self.format_url(url))
+
+        # Make sure that if it's bad, the origin gets added.
         if self.bad.has_key(url):
             source, rawlink = origin
             triple = url, rawlink, self.bad[url]
             self.seterror(source, triple)

     def newtodolink(self, url, origin):
+        # Call self.format_url(), since the URL here
+        # is now a (URL, fragment) pair.
         if self.todo.has_key(url):
             if origin not in self.todo[url]:
                 self.todo[url].append(origin)
-            self.note(3, "  Seen todo link %s", url)
+            self.note(3, "  Seen todo link %s", self.format_url(url))
         else:
             self.todo[url] = [origin]
-            self.note(3, "  New todo link %s", url)
+            self.note(3, "  New todo link %s", self.format_url(url))
+
+    def format_url(self, url):
+        link, fragment = url
+        if fragment: return link + "#" + fragment
+        else: return link

     def markdone(self, url):
         self.done[url] = self.todo[url]
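Since todo, done and bad are now keyed by (URL, fragment) pairs, the
same page reached under two fragments is tracked as two distinct work
items; format_url() only reassembles the pair for display.  A small
sketch of the consequence (dictionary contents invented):

    todo = {}
    todo[("http://example.com/a.html", "sec1")] = [("<root>", "a.html#sec1")]
    todo[("http://example.com/a.html", "sec2")] = [("<root>", "a.html#sec2")]
    print len(todo)   # 2 -- one fetchable page, two anchors to verify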
@@ -388,41 +469,57 @@
         if url[:len(root)] == root:
             return self.isallowed(root, url)
         return 0

     def isallowed(self, root, url):
         root = urlparse.urljoin(root, "/")
         return self.robots[root].can_fetch(AGENTNAME, url)

-    def getpage(self, url):
+    def getpage(self, url_pair):
+        # Incoming argument name is a (URL, fragment) pair.
+        # The page may have been cached in the name_table variable.
+        url, fragment = url_pair
+        if self.name_table.has_key(url):
+            return self.name_table[url]
+
         if url[:7] == 'mailto:' or url[:5] == 'news:':
             self.note(1, " Not checking mailto/news URL")
             return None
         isint = self.inroots(url)
+
+        # Ensure that openpage gets the URL pair to
+        # print out its error message and record the error pair
+        # correctly.
         if not isint:
             if not self.checkext:
                 self.note(1, " Not checking ext link")
                 return None
-            f = self.openpage(url)
+            f = self.openpage(url_pair)
             if f:
                 self.safeclose(f)
             return None
-        text, nurl = self.readhtml(url)
+        text, nurl = self.readhtml(url_pair)
+
         if nurl != url:
             self.note(1, " Redirected to %s", nurl)
             url = nurl
         if text:
             return Page(text, url, maxpage=self.maxpage, checker=self)

-    def readhtml(self, url):
+    # These next three functions take (URL, fragment) pairs as
+    # arguments, so that openpage() receives the appropriate tuple to
+    # record error messages.
+    def readhtml(self, url_pair):
+        url, fragment = url_pair
         text = None
-        f, url = self.openhtml(url)
+        f, url = self.openhtml(url_pair)
         if f:
             text = f.read()
             f.close()
         return text, url

-    def openhtml(self, url):
-        f = self.openpage(url)
+    def openhtml(self, url_pair):
+        url, fragment = url_pair
+        f = self.openpage(url_pair)
         if f:
             url = f.geturl()
             info = f.info()
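The name_table lookup makes getpage() behave like a memo: one fetch per
URL no matter how many fragments point at it, with failures cached as
None (dopage() stores the entries).  A hedged sketch of the pattern,
with fetch() standing in for the real page construction:

    name_table = {}

    def get_cached(url_pair, fetch):
        url, fragment = url_pair          # cache key ignores the fragment
        if name_table.has_key(url):
            return name_table[url]        # hit: may be a Page or None
        page = fetch(url)                 # miss: fetch exactly once
        name_table[url] = page            # negative results cached too
        return page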
@@ -431,15 +528,16 @@
             f = None
         return f, url

-    def openpage(self, url):
+    def openpage(self, url_pair):
+        url, fragment = url_pair
         try:
             return self.urlopener.open(url)
         except IOError, msg:
             msg = self.sanitize(msg)
             self.note(0, "Error %s", msg)
             if self.verbose > 0:
-                self.show(" HREF ", url, "  from", self.todo[url])
-            self.setbad(url, msg)
+                self.show(" HREF ", url, "  from", self.todo[url_pair])
+            self.setbad(url_pair, msg)
             return None

     def checkforhtml(self, info, url):

@@ -468,7 +566,7 @@
         self.bad[url] = msg
         self.changed = 1
         self.markerror(url)

     def markerror(self, url):
         try:
             origins = self.todo[url]

@@ -480,7 +578,13 @@
     def seterror(self, url, triple):
         try:
-            self.errors[url].append(triple)
+            # Because of the way the URLs are now processed, I need to
+            # check to make sure the URL hasn't been entered in the
+            # error list.  The first element of the triple here is a
+            # (URL, fragment) pair, but the URL key is not, since it's
+            # from the list of origins.
+            if triple not in self.errors[url]:
+                self.errors[url].append(triple)
         except KeyError:
             self.errors[url] = [triple]

@@ -551,6 +655,21 @@ class Page:
         self.maxpage = maxpage
         self.checker = checker

+        # The parsing of the page is done in the __init__() routine in
+        # order to initialize the list of names the file contains.
+        # The parser is stored in an instance variable, and the URL is
+        # passed to MyHTMLParser().
+        size = len(self.text)
+        if size > self.maxpage:
+            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
+            self.parser = None
+            return
+        self.checker.note(2, "  Parsing %s (%d bytes)", self.url, size)
+        self.parser = MyHTMLParser(url, verbose=self.verbose,
+                                   checker=self.checker)
+        self.parser.feed(self.text)
+        self.parser.close()
+
     def note(self, level, msg, *args):
         if self.checker:
             apply(self.checker.note, (level, msg) + args)

@@ -560,24 +679,30 @@ class Page:
             msg = msg%args
             print msg

+    # Method to retrieve names.
+    def getnames(self):
+        return self.parser.names
+
     def getlinkinfos(self):
-        size = len(self.text)
-        if size > self.maxpage:
-            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
-            return []
-        self.checker.note(2, "  Parsing %s (%d bytes)", self.url, size)
-        parser = MyHTMLParser(verbose=self.verbose, checker=self.checker)
-        parser.feed(self.text)
-        parser.close()
-        rawlinks = parser.getlinks()
-        base = urlparse.urljoin(self.url, parser.getbase() or "")
+        # File reading is done in the __init__() routine; the stored
+        # parser indicates whether parsing succeeded.
+
+        # If no parser was stored, fail.
+        if not self.parser: return []
+
+        rawlinks = self.parser.getlinks()
+        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
         infos = []
         for rawlink in rawlinks:
             t = urlparse.urlparse(rawlink)
+            # DON'T DISCARD THE FRAGMENT!  Instead, include it in the
+            # tuples which are returned.  See Checker.dopage().
+            fragment = t[-1]
             t = t[:-1] + ('',)
             rawlink = urlparse.urlunparse(t)
             link = urlparse.urljoin(base, rawlink)
-            infos.append((link, rawlink))
+            infos.append((link, rawlink, fragment))
         return infos
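The fragment handling above can be seen in isolation: the same urlparse
dance on an invented link, showing the fragment preserved alongside the
defragmented URL:

    import urlparse

    t = urlparse.urlparse("subdir/page.html#usage")
    fragment = t[-1]                    # 'usage' -- kept, not discarded
    t = t[:-1] + ('',)                  # blank it out of the tuple
    rawlink = urlparse.urlunparse(t)    # 'subdir/page.html'
    link = urlparse.urljoin("http://example.com/doc/", rawlink)
    print (link, rawlink, fragment)
    # ('http://example.com/doc/subdir/page.html', 'subdir/page.html', 'usage')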

@@ -635,16 +760,30 @@ class MyURLopener(urllib.FancyURLopener):

 class MyHTMLParser(sgmllib.SGMLParser):

-    def __init__(self, verbose=VERBOSE, checker=None):
+    def __init__(self, url, verbose=VERBOSE, checker=None):
         self.myverbose = verbose # now unused
         self.checker = checker
         self.base = None
         self.links = {}
+        self.names = []
+        self.url = url
         sgmllib.SGMLParser.__init__(self)

     def start_a(self, attributes):
         self.link_attr(attributes, 'href')
+
+        # We must rescue the NAME attributes from the anchor, in order
+        # to cache the internal anchors which are made available in
+        # the page.
+        for name, value in attributes:
+            if name == "name":
+                if value in self.names:
+                    self.checker.message("WARNING: duplicate name %s in %s",
+                                         value, self.url)
+                else: self.names.append(value)
+                break

     def end_a(self): pass

     def do_area(self, attributes):
...
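The NAME-rescue logic in start_a() is easiest to see in a stripped-down
parser.  A self-contained sketch against this era's sgmllib, with the
duplicate-warning plumbing omitted:

    import sgmllib

    class NameCollector(sgmllib.SGMLParser):
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.names = []
        def start_a(self, attributes):
            # sgmllib hands attributes over as lowercased (name, value) pairs
            for name, value in attributes:
                if name == "name" and value not in self.names:
                    self.names.append(value)
        def end_a(self): pass

    p = NameCollector()
    p.feed('<a name="intro">Intro</a> <a href="#intro">back</a>')
    p.close()
    print p.names    # ['intro']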