Commit f97eeccc, authored by Guido van Rossum

Samuel L. Bayer:

- same fixes from webchecker.py
- incorporated small diff between current webchecker.py and 1.5.2
- fixed bug where "extra roots" added with the -t argument were being
  checked as real roots, not just as possible continuations
- added -a argument to suppress checking of name anchors

[And untabified --GvR]
parent dbd5c3e6
...@@ -124,6 +124,7 @@ Options: ...@@ -124,6 +124,7 @@ Options:
-t root -- specify root dir which should be treated as internal (can repeat) -t root -- specify root dir which should be treated as internal (can repeat)
-v -- verbose operation; repeating -v will increase verbosity -v -- verbose operation; repeating -v will increase verbosity
-x -- don't check external links (these are often slow to check) -x -- don't check external links (these are often slow to check)
-a -- don't check name anchors
Arguments: Arguments:
...@@ -166,6 +167,7 @@ MAXPAGE = 150000 # Ignore files bigger than this ...@@ -166,6 +167,7 @@ MAXPAGE = 150000 # Ignore files bigger than this
ROUNDSIZE = 50 # Number of links processed per round ROUNDSIZE = 50 # Number of links processed per round
DUMPFILE = "@webchecker.pickle" # Pickled checkpoint DUMPFILE = "@webchecker.pickle" # Pickled checkpoint
AGENTNAME = "webchecker" # Agent name for robots.txt parser AGENTNAME = "webchecker" # Agent name for robots.txt parser
NONAMES = 0 # Force name anchor checking
# Global variables # Global variables
...@@ -183,7 +185,7 @@ def main(): ...@@ -183,7 +185,7 @@ def main():
try: try:
# Begin SLB 2/24/99: Added -t option here. # Begin SLB 2/24/99: Added -t option here.
opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vx') opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
# End SLB 2/24/99 # End SLB 2/24/99
except getopt.error, msg: except getopt.error, msg:
...@@ -195,6 +197,7 @@ def main(): ...@@ -195,6 +197,7 @@ def main():
# Begin SLB 2/24/99: Added extra_roots variable to # Begin SLB 2/24/99: Added extra_roots variable to
# collect extra roots. # collect extra roots.
extra_roots = [] extra_roots = []
nonames = NONAMES
# End SLB 2/24/99 # End SLB 2/24/99
for o, a in opts: for o, a in opts:
...@@ -215,6 +218,8 @@ def main(): ...@@ -215,6 +218,8 @@ def main():
# -t option. # -t option.
if o == '-t': if o == '-t':
extra_roots.append(a) extra_roots.append(a)
if o == '-a':
nonames = not nonames
# End SLB 2/24/99 # End SLB 2/24/99
if o == '-v': if o == '-v':
...@@ -231,7 +236,9 @@ def main(): ...@@ -231,7 +236,9 @@ def main():
c = Checker() c = Checker()
c.setflags(checkext=checkext, verbose=verbose, c.setflags(checkext=checkext, verbose=verbose,
maxpage=maxpage, roundsize=roundsize) maxpage=maxpage, roundsize=roundsize,
nonames=nonames
)
if not restart and not args: if not restart and not args:
args.append(DEFROOT) args.append(DEFROOT)
...@@ -249,7 +256,7 @@ def main(): ...@@ -249,7 +256,7 @@ def main():
# directory component. # directory component.
if root[-1] != "/": if root[-1] != "/":
root = root + "/" root = root + "/"
c.addroot(root) c.addroot(root, add_to_do = 0)
# End SLB 2/24/99 # End SLB 2/24/99
try: try:
...@@ -294,6 +301,7 @@ class Checker: ...@@ -294,6 +301,7 @@ class Checker:
verbose = VERBOSE verbose = VERBOSE
maxpage = MAXPAGE maxpage = MAXPAGE
roundsize = ROUNDSIZE roundsize = ROUNDSIZE
nonames = NONAMES
validflags = tuple(dir()) validflags = tuple(dir())
...@@ -348,7 +356,7 @@ class Checker: ...@@ -348,7 +356,7 @@ class Checker:
for url in self.bad.keys(): for url in self.bad.keys():
self.markerror(url) self.markerror(url)
def addroot(self, root): def addroot(self, root, add_to_do = 1):
if root not in self.roots: if root not in self.roots:
troot = root troot = root
scheme, netloc, path, params, query, fragment = \ scheme, netloc, path, params, query, fragment = \
...@@ -363,6 +371,7 @@ class Checker: ...@@ -363,6 +371,7 @@ class Checker:
# Begin SLB 2/24/99: Modified this call to respect # Begin SLB 2/24/99: Modified this call to respect
# the fact that the "done" and "todo" dictionaries # the fact that the "done" and "todo" dictionaries
# are now (URL, fragment) pairs # are now (URL, fragment) pairs
if add_to_do:
self.newlink((root, ""), ("<root>", root)) self.newlink((root, ""), ("<root>", root))
# End SLB 2/24/99 # End SLB 2/24/99
...@@ -441,9 +450,12 @@ class Checker: ...@@ -441,9 +450,12 @@ class Checker:
" from", self.todo[url_pair]) " from", self.todo[url_pair])
else: else:
self.message("Check %s", self.format_url(url_pair)) self.message("Check %s", self.format_url(url_pair))
url, local_fragment = url_pair
if local_fragment and self.nonames:
self.markdone(url_pair)
return
page = self.getpage(url_pair) page = self.getpage(url_pair)
if page: if page:
url, local_fragment = url_pair
# Store the page which corresponds to this URL. # Store the page which corresponds to this URL.
self.name_table[url] = page self.name_table[url] = page
# If there is a fragment in this url_pair, and it's not # If there is a fragment in this url_pair, and it's not
...@@ -473,12 +485,23 @@ class Checker: ...@@ -473,12 +485,23 @@ class Checker:
self.newtodolink(url, origin) self.newtodolink(url, origin)
def newdonelink(self, url, origin): def newdonelink(self, url, origin):
if origin not in self.done[url]:
self.done[url].append(origin) self.done[url].append(origin)
# Begin SLB 2/24/99: changed reference to URL # Begin SLB 2/24/99: changed reference to URL
# to call self.format_url(), since the URL here # to call self.format_url(), since the URL here
# is now a (URL, fragment) pair. # is now a (URL, fragment) pair.
self.note(3, " Done link %s", self.format_url(url)) self.note(3, " Done link %s", self.format_url(url))
# SLB 11/11/99: Make sure that if it's bad, that
# the origin gets added.
if self.bad.has_key(url):
source, rawlink = origin
triple = url, rawlink, self.bad[url]
self.seterror(source, triple)
# End SLB 2/24/99 # End SLB 2/24/99
def newtodolink(self, url, origin): def newtodolink(self, url, origin):
...@@ -487,6 +510,7 @@ class Checker: ...@@ -487,6 +510,7 @@ class Checker:
# to call self.format_url(), since the URL here # to call self.format_url(), since the URL here
# is now a (URL, fragment) pair. # is now a (URL, fragment) pair.
if self.todo.has_key(url): if self.todo.has_key(url):
if origin not in self.todo[url]:
self.todo[url].append(origin) self.todo[url].append(origin)
self.note(3, " Seen todo link %s", self.format_url(url)) self.note(3, " Seen todo link %s", self.format_url(url))
else: else:
...@@ -793,9 +817,9 @@ class MyURLopener(urllib.FancyURLopener): ...@@ -793,9 +817,9 @@ class MyURLopener(urllib.FancyURLopener):
def open_file(self, url): def open_file(self, url):
path = urllib.url2pathname(urllib.unquote(url)) path = urllib.url2pathname(urllib.unquote(url))
if os.path.isdir(path):
if path[-1] != os.sep: if path[-1] != os.sep:
url = url + '/' url = url + '/'
if os.path.isdir(path):
indexpath = os.path.join(path, "index.html") indexpath = os.path.join(path, "index.html")
if os.path.exists(indexpath): if os.path.exists(indexpath):
return self.open_file(url + "index.html") return self.open_file(url + "index.html")
...@@ -812,7 +836,7 @@ class MyURLopener(urllib.FancyURLopener): ...@@ -812,7 +836,7 @@ class MyURLopener(urllib.FancyURLopener):
s.write('<A HREF="%s">%s</A>\n' % (q, q)) s.write('<A HREF="%s">%s</A>\n' % (q, q))
s.seek(0) s.seek(0)
return s return s
return urllib.FancyURLopener.open_file(self, path) return urllib.FancyURLopener.open_file(self, url)
class MyHTMLParser(sgmllib.SGMLParser): class MyHTMLParser(sgmllib.SGMLParser):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment