Commit 1a7eae91, authored by Guido van Rossum

Adapt to new webchecker structure. Due to the better structure of
getpage(), much less duplicate code is needed -- we only need to
override readhtml().
parent 00756bd4
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
"""A variant on webchecker that creates a mirror copy of a remote site.""" """A variant on webchecker that creates a mirror copy of a remote site."""
__version__ = "0.1" __version__ = "$Revision$"
import os import os
import sys import sys
...@@ -11,22 +11,28 @@ import urllib ...@@ -11,22 +11,28 @@ import urllib
import getopt import getopt
import webchecker import webchecker
verbose = webchecker.verbose
# Extract real version number if necessary
if __version__[0] == '$':
_v = string.split(__version__)
if len(_v) == 3:
__version__ = _v[1]
def main(): def main():
global verbose verbose = webchecker.VERBOSE
try: try:
opts, args = getopt.getopt(sys.argv[1:], "qv") opts, args = getopt.getopt(sys.argv[1:], "qv")
except getopt.error, msg: except getopt.error, msg:
print msg print msg
print "usage:", sys.argv[0], "[-v] ... [rooturl] ..." print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
return 2 return 2
for o, a in opts: for o, a in opts:
if o == "-q": if o == "-q":
webchecker.verbose = verbose = 0 verbose = 0
if o == "-v": if o == "-v":
webchecker.verbose = verbose = verbose + 1 verbose = verbose + 1
c = Sucker(0) c = Sucker()
c.setflags(verbose=verbose)
c.urlopener.addheaders = [ c.urlopener.addheaders = [
('User-agent', 'websucker/%s' % __version__), ('User-agent', 'websucker/%s' % __version__),
] ]
...@@ -38,63 +44,31 @@ def main(): ...@@ -38,63 +44,31 @@ def main():
class Sucker(webchecker.Checker): class Sucker(webchecker.Checker):
# Alas, had to copy this to make one change... checkext = 0
def getpage(self, url):
if url[:7] == 'mailto:' or url[:5] == 'news:': def readhtml(self, url):
if verbose > 1: print " Not checking mailto/news URL" text = None
return None
isint = self.inroots(url)
if not isint and not self.checkext:
if verbose > 1: print " Not checking ext link"
return None
path = self.savefilename(url) path = self.savefilename(url)
saved = 0
try: try:
f = open(path, "rb") f = open(path, "rb")
except IOError: except IOError:
try: f = self.openpage(url)
f = self.urlopener.open(url) if f:
except IOError, msg: info = f.info()
msg = webchecker.sanitize(msg)
if verbose > 0:
print "Error ", msg
if verbose > 0:
webchecker.show(" HREF ", url, " from", self.todo[url])
self.setbad(url, msg)
return None
if not isint:
if verbose > 1: print " Not gathering links from ext URL"
safeclose(f)
return None
nurl = f.geturl() nurl = f.geturl()
if nurl != url: if nurl != url:
path = self.savefilename(nurl) url = nurl
info = f.info() path = self.savefilename(url)
else:
if verbose: print "Loading cached URL", url
saved = 1
nurl = url
info = {}
if url[-1:] == "/":
info["content-type"] = "text/html"
text = f.read() text = f.read()
if not saved: self.savefile(text, path) f.close()
if info.has_key('content-type'): self.savefile(text, path)
ctype = string.lower(info['content-type']) if not self.checkforhtml(info, url):
text = None
else: else:
ctype = None if self.checkforhtml({}, url):
if nurl != url: text = f.read()
if verbose > 1:
print " Redirected to", nurl
if not ctype:
ctype, encoding = webchecker.mimetypes.guess_type(nurl)
if ctype != 'text/html':
webchecker.safeclose(f)
if verbose > 1:
print " Not HTML, mime type", ctype
return None
f.close() f.close()
return webchecker.Page(text, nurl) return text, url
def savefile(self, text, path): def savefile(self, text, path):
dir, base = os.path.split(path) dir, base = os.path.split(path)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment