Commit 0b0b5f02, authored by Guido van Rossum

Spin off the checking of an external page into a subroutine.

Increase MAXPAGE to 150K.
Add back printing of __doc__ for usage message.
parent 42218ce3
...@@ -121,7 +121,7 @@ import robotparser ...@@ -121,7 +121,7 @@ import robotparser
# Tunable parameters # Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
MAXPAGE = 50000 # Ignore files bigger than this MAXPAGE = 150000 # Ignore files bigger than this
ROUNDSIZE = 50 # Number of links processed per round ROUNDSIZE = 50 # Number of links processed per round
DUMPFILE = "@webchecker.pickle" # Pickled checkpoint DUMPFILE = "@webchecker.pickle" # Pickled checkpoint
AGENTNAME = "webchecker" # Agent name for robots.txt parser AGENTNAME = "webchecker" # Agent name for robots.txt parser
...@@ -145,6 +145,7 @@ def main(): ...@@ -145,6 +145,7 @@ def main():
except getopt.error, msg: except getopt.error, msg:
sys.stdout = sys.stderr sys.stdout = sys.stderr
print msg print msg
print __doc__%globals()
sys.exit(2) sys.exit(2)
for o, a in opts: for o, a in opts:
if o == '-R': if o == '-R':
...@@ -314,22 +315,24 @@ class Checker: ...@@ -314,22 +315,24 @@ class Checker:
for url in urls: for url in urls:
if verbose > 0: if verbose > 0:
show("HREF ", url, " from", self.ext[url]) show("HREF ", url, " from", self.ext[url])
if not checkext: if checkext:
continue self.checkextpage(url)
if url[:7] == 'mailto:':
if verbose > 2: print "Not checking", url def checkextpage(self, url):
continue if url[:7] == 'mailto:' or url[:5] == 'news:':
if verbose > 2: print "Checking", url, "..." if verbose > 2: print "Not checking", url
try: return
f = self.urlopener.open(url) if verbose > 2: print "Checking", url, "..."
safeclose(f) try:
if verbose > 3: print "OK" f = self.urlopener.open(url)
if self.bad.has_key(url): safeclose(f)
self.setgood(url) if verbose > 3: print "OK"
except IOError, msg: if self.bad.has_key(url):
msg = sanitize(msg) self.setgood(url)
if verbose > 0: print "Error", msg except IOError, msg:
self.setbad(url, msg) msg = sanitize(msg)
if verbose > 0: print "Error", msg
self.setbad(url, msg)
def report_errors(self): def report_errors(self):
if not self.bad: if not self.bad:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment