Commit 0b0b5f02, authored by Guido van Rossum

Spin off checking of external page in a subroutine.

Increase MAXPAGE to 150K.
Add back printing of __doc__ for usage message.
parent 42218ce3
......@@ -121,7 +121,7 @@ import robotparser
# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
MAXPAGE = 50000 # Ignore files bigger than this
MAXPAGE = 150000 # Ignore files bigger than this
ROUNDSIZE = 50 # Number of links processed per round
DUMPFILE = "@webchecker.pickle" # Pickled checkpoint
AGENTNAME = "webchecker" # Agent name for robots.txt parser
......@@ -145,6 +145,7 @@ def main():
except getopt.error, msg:
sys.stdout = sys.stderr
print msg
print __doc__%globals()
sys.exit(2)
for o, a in opts:
if o == '-R':
......@@ -314,11 +315,13 @@ class Checker:
for url in urls:
if verbose > 0:
show("HREF ", url, " from", self.ext[url])
if not checkext:
continue
if url[:7] == 'mailto:':
if checkext:
self.checkextpage(url)
def checkextpage(self, url):
if url[:7] == 'mailto:' or url[:5] == 'news:':
if verbose > 2: print "Not checking", url
continue
return
if verbose > 2: print "Checking", url, "..."
try:
f = self.urlopener.open(url)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment