Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
986abac1
Kaydet (Commit)
986abac1
authored
Nis 06, 1998
tarafından
Guido van Rossum
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
Give in to tabnanny
üst
36dfbcf3
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
910 additions
and
1101 deletions
+910
-1101
robotparser.py
Lib/robotparser.py
+60
-60
mimetypes.py
Tools/webchecker/mimetypes.py
+0
-191
robotparser.py
Tools/webchecker/robotparser.py
+60
-60
tktools.py
Tools/webchecker/tktools.py
+82
-82
wcgui.py
Tools/webchecker/wcgui.py
+274
-274
webchecker.py
Tools/webchecker/webchecker.py
+379
-379
websucker.py
Tools/webchecker/websucker.py
+55
-55
No files found.
Lib/robotparser.py
Dosyayı görüntüle @
986abac1
...
...
@@ -9,79 +9,79 @@ fetchability of other URLs.
class RobotFileParser:
    """Parse a robots.txt file and answer "may this agent fetch this URL?".

    Attributes:
        rules: dict mapping a User-agent value (exactly as written in the
            file) to a list of compiled patterns, one per Disallow path.
            An empty list means the agent may fetch everything.
        debug: when true, parsing and decisions are traced on stdout.
        url: location of the robots.txt file as given to set_url().
        last_checked: time.time() of the most recent parse(), 0 if none.
    """

    def __init__(self):
        self.rules = {}
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def _trace(self, message):
        """Print *message* when self.debug is true."""
        if self.debug:
            print(message)

    def mtime(self):
        """Return the time recorded by the last modified() call (0 if none)."""
        return self.last_checked

    def modified(self):
        """Record the current time as the moment the rules were refreshed."""
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Remember *url* as the robots.txt location; it is stored verbatim."""
        self.url = url

    def read(self):
        """Fetch self.url and hand its lines to parse().

        Errors raised while opening or reading the URL propagate to the
        caller; the connection is closed in either case.
        """
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2
        stream = urlopen(self.url)
        try:
            lines = stream.readlines()
        finally:
            stream.close()
        self.parse(lines)

    def parse(self, lines):
        """Rebuild self.rules entries from an iterable of robots.txt lines.

        Lines may or may not end in a newline and may be bytes (as
        returned by urlopen() on Python 3) or text.  Field names are
        case-insensitive; only ``User-agent`` and ``Disallow`` are
        understood, everything else is ignored.  A ``Disallow`` with an
        empty value clears the rules of every agent in the current record.
        """
        import re
        active = []  # agents named by the record currently being read
        for line in lines:
            if isinstance(line, bytes) and not isinstance(line, str):
                line = line.decode('utf-8', 'replace')
            self._trace('> ' + line.rstrip('\r\n'))
            # A blank line (including "\r\n" or stray spaces) ends the record.
            if not line.strip():
                active = []
                continue
            # Drop an optional comment.  Splitting instead of slicing at
            # find('#') keeps the last character when there is no comment.
            line = line.split('#', 1)[0].strip()
            if not line:
                # Comment-only line: skipped, does not end the record.
                continue
            # Split on the first colon only so values may contain ':'.
            fields = [part.strip() for part in line.split(':', 1)]
            if len(fields) != 2:
                self._trace('>> unknown: %s' % (fields,))
                continue
            name, value = fields[0].lower(), fields[1]
            if name == 'user-agent':
                # This record applies to this user agent.
                self._trace('>> user-agent: %s' % value)
                active.append(value)
                if value not in self.rules:
                    self.rules[value] = []
            elif name == 'disallow':
                if value:
                    self._trace('>> disallow: %s' % value)
                    # Disallow values are literal path prefixes, so escape
                    # them; match() anchors at the start of the path.
                    pattern = re.compile(re.escape(value))
                    for agent in active:
                        self.rules[agent].append(pattern)
                else:
                    # Empty Disallow: everything is allowed for these agents.
                    for agent in active:
                        self._trace('>> allow %s' % agent)
                        self.rules[agent] = []
            else:
                self._trace('>> unknown: %s' % (fields,))
        self.modified()

    def can_fetch(self, agent, url):
        """Return 1 if *agent* may fetch *url*, else 0.

        The agent name is looked up exactly as given; if it has no record
        the ``*`` record is used, and if that is missing too the fetch is
        allowed.  Only the path component of *url* is compared.
        """
        try:
            from urllib.parse import urlparse  # Python 3
        except ImportError:
            from urlparse import urlparse  # Python 2
        ag = agent
        if ag not in self.rules:
            ag = '*'
        if ag not in self.rules:
            self._trace('>> allowing %s fetch by %s' % (url, agent))
            return 1
        # "http://host" has an empty path; treat it as the root so that
        # "Disallow: /" covers it.
        path = urlparse(url)[2] or '/'
        for rule in self.rules[ag]:
            if rule.match(path) is not None:
                self._trace('>> disallowing %s fetch by %s' % (url, agent))
                return 0
        self._trace('>> allowing %s fetch by %s' % (url, agent))
        return 1
def
test
():
rp
=
RobotFileParser
()
...
...
@@ -91,7 +91,7 @@ def test():
print
rp
.
rules
print
rp
.
can_fetch
(
'*'
,
'http://www.calendar.com/concerts/'
)
print
rp
.
can_fetch
(
'Musi-Cal-Robot'
,
'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones'
)
'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones'
)
print
rp
.
can_fetch
(
'Lycos'
,
'http://www/~skip/volkswagen/'
)
print
rp
.
can_fetch
(
'Lycos'
,
'http://www/~skip/volkswagen/vanagon-list-001'
)
Tools/webchecker/mimetypes.py
deleted
100644 → 0
Dosyayı görüntüle @
36dfbcf3
"""Guess the MIME type of a file.
This module defines one useful function:
guess_type(url) -- guess the MIME type and encoding of a URL.
It also contains the following, for tuning the behavior:
Data:
knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types
Functions:
init([files]) -- parse a list of files, default knownfiles
read_mime_types(file) -- parse one file, return a dictionary or None
"""
import
string
import
posixpath
# Files consulted by init() when it is called without an explicit list.
# Each one is parsed with read_mime_types(); files that cannot be opened
# are skipped.
knownfiles = [
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    ]

# Set to 1 by init(); guess_type() calls init() while this is still 0.
inited = 0
def guess_type(url):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    first tried case sensitive, then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    """
    if not inited:
        init()
    base, ext = posixpath.splitext(url)
    # Expand shorthand suffixes (e.g. ".tgz" -> ".tar.gz") until none apply.
    while ext in suffix_map:
        base, ext = posixpath.splitext(base + suffix_map[ext])
    # Peel off an encoding suffix, exposing the type suffix beneath it.
    if ext in encodings_map:
        encoding = encodings_map[ext]
        base, ext = posixpath.splitext(base)
    else:
        encoding = None
    if ext in types_map:
        return types_map[ext], encoding
    lowered = ext.lower()
    if lowered in types_map:
        return types_map[lowered], encoding
    return None, encoding
def init(files=None):
    """Load MIME type tables from *files* (default: knownfiles).

    Every file is handed to read_mime_types(); each non-empty mapping it
    returns is merged into the module-level types_map, later files
    overriding earlier ones.  Afterwards the module flag ``inited`` is
    set to 1.
    """
    global inited
    sources = files or knownfiles
    for path in sources:
        parsed = read_mime_types(path)
        if parsed:
            types_map.update(parsed)
    inited = 1
def read_mime_types(file):
    """Parse one mime.types-style file.

    Each non-empty line has the form ``type suffix [suffix ...]``.  A word
    that starts with '#' begins a comment running to the end of the line.

    Arguments:
        file: name of the file to read.

    Returns:
        A dictionary mapping '.' + suffix to the type string, or None if
        the file cannot be opened.  Lines naming a type without any
        suffix contribute nothing.
    """
    try:
        f = open(file)
    except IOError:
        return None
    mapping = {}
    # The with-block closes the file even if reading or parsing fails.
    with f:
        for line in f:
            words = line.split()
            # Cut the line at the first word that starts a comment.
            for i, word in enumerate(words):
                if word[0] == '#':
                    words = words[:i]
                    break
            if not words:
                continue
            mime_type, suffixes = words[0], words[1:]
            for suff in suffixes:
                mapping['.' + suff] = mime_type
    return mapping
# Shorthand suffixes and the full suffix each one stands for.  guess_type()
# substitutes these (case sensitively) before looking up encoding and type.
suffix_map = {
    '.tgz': '.tar.gz',
    '.taz': '.tar.gz',
    '.tz': '.tar.gz',
}

# Suffixes that denote an encoding layered over the real file type, mapped
# to the name reported as the encoding.  Looked up case sensitively.
encodings_map = {
    '.gz': 'gzip',
    '.Z': 'compress',
}
# Built-in table mapping a file suffix (including the leading dot) to its
# MIME type.  init() adds to and overrides these entries with whatever it
# reads from the mime.types files.
types_map = {
    '.a': 'application/octet-stream',
    '.ai': 'application/postscript',
    '.aif': 'audio/x-aiff',
    '.aifc': 'audio/x-aiff',
    '.aiff': 'audio/x-aiff',
    '.au': 'audio/basic',
    '.avi': 'video/x-msvideo',
    '.bcpio': 'application/x-bcpio',
    '.bin': 'application/octet-stream',
    '.cdf': 'application/x-netcdf',
    '.cpio': 'application/x-cpio',
    '.csh': 'application/x-csh',
    '.dll': 'application/octet-stream',
    '.dvi': 'application/x-dvi',
    '.exe': 'application/octet-stream',
    '.eps': 'application/postscript',
    '.etx': 'text/x-setext',
    '.gif': 'image/gif',
    '.gtar': 'application/x-gtar',
    '.hdf': 'application/x-hdf',
    '.htm': 'text/html',
    '.html': 'text/html',
    '.shtml': 'text/html',
    '.ief': 'image/ief',
    '.jpe': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.jpg': 'image/jpeg',
    '.latex': 'application/x-latex',
    '.man': 'application/x-troff-man',
    '.me': 'application/x-troff-me',
    '.mif': 'application/x-mif',
    '.mov': 'video/quicktime',
    '.movie': 'video/x-sgi-movie',
    '.mpe': 'video/mpeg',
    '.mpeg': 'video/mpeg',
    '.mpg': 'video/mpeg',
    '.ms': 'application/x-troff-ms',
    '.nc': 'application/x-netcdf',
    '.o': 'application/octet-stream',
    '.obj': 'application/octet-stream',
    '.oda': 'application/oda',
    '.pbm': 'image/x-portable-bitmap',
    '.pdf': 'application/pdf',
    '.pgm': 'image/x-portable-graymap',
    '.pnm': 'image/x-portable-anymap',
    '.png': 'image/png',
    '.ppm': 'image/x-portable-pixmap',
    '.py': 'text/x-python',
    '.pyc': 'application/x-python-code',
    '.ps': 'application/postscript',
    '.qt': 'video/quicktime',
    '.ras': 'image/x-cmu-raster',
    '.rgb': 'image/x-rgb',
    '.roff': 'application/x-troff',
    '.rtf': 'application/rtf',
    '.rtx': 'text/richtext',
    '.sgm': 'text/x-sgml',
    '.sgml': 'text/x-sgml',
    '.sh': 'application/x-sh',
    '.shar': 'application/x-shar',
    '.snd': 'audio/basic',
    '.so': 'application/octet-stream',
    '.src': 'application/x-wais-source',
    '.sv4cpio': 'application/x-sv4cpio',
    '.sv4crc': 'application/x-sv4crc',
    '.t': 'application/x-troff',
    '.tar': 'application/x-tar',
    '.tcl': 'application/x-tcl',
    '.tex': 'application/x-tex',
    '.texi': 'application/x-texinfo',
    '.texinfo': 'application/x-texinfo',
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
    '.tr': 'application/x-troff',
    '.tsv': 'text/tab-separated-values',
    '.txt': 'text/plain',
    '.ustar': 'application/x-ustar',
    '.wav': 'audio/x-wav',
    '.xbm': 'image/x-xbitmap',
    '.xpm': 'image/x-xpixmap',
    '.xwd': 'image/x-xwindowdump',
    '.zip': 'application/zip',
}
Tools/webchecker/robotparser.py
Dosyayı görüntüle @
986abac1
...
...
@@ -9,79 +9,79 @@ fetchability of other URLs.
class RobotFileParser:
    """Parse a robots.txt file and answer "may this agent fetch this URL?".

    Attributes:
        rules: dict mapping a User-agent value (exactly as written in the
            file) to a list of compiled patterns, one per Disallow path.
            An empty list means the agent may fetch everything.
        debug: when true, parsing and decisions are traced on stdout.
        url: location of the robots.txt file as given to set_url().
        last_checked: time.time() of the most recent parse(), 0 if none.
    """

    def __init__(self):
        self.rules = {}
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def _trace(self, message):
        """Print *message* when self.debug is true."""
        if self.debug:
            print(message)

    def mtime(self):
        """Return the time recorded by the last modified() call (0 if none)."""
        return self.last_checked

    def modified(self):
        """Record the current time as the moment the rules were refreshed."""
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Remember *url* as the robots.txt location; it is stored verbatim."""
        self.url = url

    def read(self):
        """Fetch self.url and hand its lines to parse().

        Errors raised while opening or reading the URL propagate to the
        caller; the connection is closed in either case.
        """
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2
        stream = urlopen(self.url)
        try:
            lines = stream.readlines()
        finally:
            stream.close()
        self.parse(lines)

    def parse(self, lines):
        """Rebuild self.rules entries from an iterable of robots.txt lines.

        Lines may or may not end in a newline and may be bytes (as
        returned by urlopen() on Python 3) or text.  Field names are
        case-insensitive; only ``User-agent`` and ``Disallow`` are
        understood, everything else is ignored.  A ``Disallow`` with an
        empty value clears the rules of every agent in the current record.
        """
        import re
        active = []  # agents named by the record currently being read
        for line in lines:
            if isinstance(line, bytes) and not isinstance(line, str):
                line = line.decode('utf-8', 'replace')
            self._trace('> ' + line.rstrip('\r\n'))
            # A blank line (including "\r\n" or stray spaces) ends the record.
            if not line.strip():
                active = []
                continue
            # Drop an optional comment.  Splitting instead of slicing at
            # find('#') keeps the last character when there is no comment.
            line = line.split('#', 1)[0].strip()
            if not line:
                # Comment-only line: skipped, does not end the record.
                continue
            # Split on the first colon only so values may contain ':'.
            fields = [part.strip() for part in line.split(':', 1)]
            if len(fields) != 2:
                self._trace('>> unknown: %s' % (fields,))
                continue
            name, value = fields[0].lower(), fields[1]
            if name == 'user-agent':
                # This record applies to this user agent.
                self._trace('>> user-agent: %s' % value)
                active.append(value)
                if value not in self.rules:
                    self.rules[value] = []
            elif name == 'disallow':
                if value:
                    self._trace('>> disallow: %s' % value)
                    # Disallow values are literal path prefixes, so escape
                    # them; match() anchors at the start of the path.
                    pattern = re.compile(re.escape(value))
                    for agent in active:
                        self.rules[agent].append(pattern)
                else:
                    # Empty Disallow: everything is allowed for these agents.
                    for agent in active:
                        self._trace('>> allow %s' % agent)
                        self.rules[agent] = []
            else:
                self._trace('>> unknown: %s' % (fields,))
        self.modified()

    def can_fetch(self, agent, url):
        """Return 1 if *agent* may fetch *url*, else 0.

        The agent name is looked up exactly as given; if it has no record
        the ``*`` record is used, and if that is missing too the fetch is
        allowed.  Only the path component of *url* is compared.
        """
        try:
            from urllib.parse import urlparse  # Python 3
        except ImportError:
            from urlparse import urlparse  # Python 2
        ag = agent
        if ag not in self.rules:
            ag = '*'
        if ag not in self.rules:
            self._trace('>> allowing %s fetch by %s' % (url, agent))
            return 1
        # "http://host" has an empty path; treat it as the root so that
        # "Disallow: /" covers it.
        path = urlparse(url)[2] or '/'
        for rule in self.rules[ag]:
            if rule.match(path) is not None:
                self._trace('>> disallowing %s fetch by %s' % (url, agent))
                return 0
        self._trace('>> allowing %s fetch by %s' % (url, agent))
        return 1
def
test
():
rp
=
RobotFileParser
()
...
...
@@ -91,7 +91,7 @@ def test():
print
rp
.
rules
print
rp
.
can_fetch
(
'*'
,
'http://www.calendar.com/concerts/'
)
print
rp
.
can_fetch
(
'Musi-Cal-Robot'
,
'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones'
)
'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones'
)
print
rp
.
can_fetch
(
'Lycos'
,
'http://www/~skip/volkswagen/'
)
print
rp
.
can_fetch
(
'Lycos'
,
'http://www/~skip/volkswagen/vanagon-list-001'
)
Tools/webchecker/tktools.py
Dosyayı görüntüle @
986abac1
...
...
@@ -7,8 +7,8 @@ from Tkinter import *
def
_clear_entry_widget
(
event
):
try
:
widget
=
event
.
widget
widget
.
delete
(
0
,
INSERT
)
widget
=
event
.
widget
widget
.
delete
(
0
,
INSERT
)
except
:
pass
def
install_keybindings
(
root
):
root
.
bind_class
(
'Entry'
,
'<Control-u>'
,
_clear_entry_widget
)
...
...
@@ -23,12 +23,12 @@ def make_toplevel(master, title=None, class_=None):
"""
if
class_
:
widget
=
Toplevel
(
master
,
class_
=
class_
)
widget
=
Toplevel
(
master
,
class_
=
class_
)
else
:
widget
=
Toplevel
(
master
)
widget
=
Toplevel
(
master
)
if
title
:
widget
.
title
(
title
)
widget
.
iconname
(
title
)
widget
.
title
(
title
)
widget
.
iconname
(
title
)
return
widget
def
set_transient
(
widget
,
master
,
relx
=
0.5
,
rely
=
0.3
,
expose
=
1
):
...
...
@@ -43,26 +43,26 @@ def set_transient(widget, master, relx=0.5, rely=0.3, expose=1):
widget
.
transient
(
master
)
widget
.
update_idletasks
()
# Actualize geometry information
if
master
.
winfo_ismapped
():
m_width
=
master
.
winfo_width
()
m_height
=
master
.
winfo_height
()
m_x
=
master
.
winfo_rootx
()
m_y
=
master
.
winfo_rooty
()
m_width
=
master
.
winfo_width
()
m_height
=
master
.
winfo_height
()
m_x
=
master
.
winfo_rootx
()
m_y
=
master
.
winfo_rooty
()
else
:
m_width
=
master
.
winfo_screenwidth
()
m_height
=
master
.
winfo_screenheight
()
m_x
=
m_y
=
0
m_width
=
master
.
winfo_screenwidth
()
m_height
=
master
.
winfo_screenheight
()
m_x
=
m_y
=
0
w_width
=
widget
.
winfo_reqwidth
()
w_height
=
widget
.
winfo_reqheight
()
x
=
m_x
+
(
m_width
-
w_width
)
*
relx
y
=
m_y
+
(
m_height
-
w_height
)
*
rely
widget
.
geometry
(
"+
%
d+
%
d"
%
(
x
,
y
))
if
expose
:
widget
.
deiconify
()
# Become visible at the desired location
widget
.
deiconify
()
# Become visible at the desired location
return
widget
def
make_scrollbars
(
parent
,
hbar
,
vbar
,
pack
=
1
,
class_
=
None
,
name
=
None
,
takefocus
=
0
):
takefocus
=
0
):
"""Subroutine to create a frame with scrollbars.
...
...
@@ -76,38 +76,38 @@ def make_scrollbars(parent, hbar, vbar, pack=1, class_=None, name=None,
"""
if
class_
:
if
name
:
frame
=
Frame
(
parent
,
class_
=
class_
,
name
=
name
)
else
:
frame
=
Frame
(
parent
,
class_
=
class_
)
if
name
:
frame
=
Frame
(
parent
,
class_
=
class_
,
name
=
name
)
else
:
frame
=
Frame
(
parent
,
class_
=
class_
)
else
:
if
name
:
frame
=
Frame
(
parent
,
name
=
name
)
else
:
frame
=
Frame
(
parent
)
if
name
:
frame
=
Frame
(
parent
,
name
=
name
)
else
:
frame
=
Frame
(
parent
)
if
pack
:
frame
.
pack
(
fill
=
BOTH
,
expand
=
1
)
frame
.
pack
(
fill
=
BOTH
,
expand
=
1
)
corner
=
None
if
vbar
:
if
not
hbar
:
vbar
=
Scrollbar
(
frame
,
takefocus
=
takefocus
)
vbar
.
pack
(
fill
=
Y
,
side
=
RIGHT
)
else
:
vbarframe
=
Frame
(
frame
,
borderwidth
=
0
)
vbarframe
.
pack
(
fill
=
Y
,
side
=
RIGHT
)
vbar
=
Scrollbar
(
frame
,
name
=
"vbar"
,
takefocus
=
takefocus
)
vbar
.
pack
(
in_
=
vbarframe
,
expand
=
1
,
fill
=
Y
,
side
=
TOP
)
sbwidth
=
vbar
.
winfo_reqwidth
()
corner
=
Frame
(
vbarframe
,
width
=
sbwidth
,
height
=
sbwidth
)
corner
.
propagate
(
0
)
corner
.
pack
(
side
=
BOTTOM
)
if
not
hbar
:
vbar
=
Scrollbar
(
frame
,
takefocus
=
takefocus
)
vbar
.
pack
(
fill
=
Y
,
side
=
RIGHT
)
else
:
vbarframe
=
Frame
(
frame
,
borderwidth
=
0
)
vbarframe
.
pack
(
fill
=
Y
,
side
=
RIGHT
)
vbar
=
Scrollbar
(
frame
,
name
=
"vbar"
,
takefocus
=
takefocus
)
vbar
.
pack
(
in_
=
vbarframe
,
expand
=
1
,
fill
=
Y
,
side
=
TOP
)
sbwidth
=
vbar
.
winfo_reqwidth
()
corner
=
Frame
(
vbarframe
,
width
=
sbwidth
,
height
=
sbwidth
)
corner
.
propagate
(
0
)
corner
.
pack
(
side
=
BOTTOM
)
else
:
vbar
=
None
vbar
=
None
if
hbar
:
hbar
=
Scrollbar
(
frame
,
orient
=
HORIZONTAL
,
name
=
"hbar"
,
takefocus
=
takefocus
)
hbar
.
pack
(
fill
=
X
,
side
=
BOTTOM
)
hbar
=
Scrollbar
(
frame
,
orient
=
HORIZONTAL
,
name
=
"hbar"
,
takefocus
=
takefocus
)
hbar
.
pack
(
fill
=
X
,
side
=
BOTTOM
)
else
:
hbar
=
None
hbar
=
None
return
hbar
,
vbar
,
frame
...
...
@@ -121,20 +121,20 @@ def set_scroll_commands(widget, hbar, vbar):
"""
if
vbar
:
widget
[
'yscrollcommand'
]
=
(
vbar
,
'set'
)
vbar
[
'command'
]
=
(
widget
,
'yview'
)
widget
[
'yscrollcommand'
]
=
(
vbar
,
'set'
)
vbar
[
'command'
]
=
(
widget
,
'yview'
)
if
hbar
:
widget
[
'xscrollcommand'
]
=
(
hbar
,
'set'
)
hbar
[
'command'
]
=
(
widget
,
'xview'
)
widget
[
'xscrollcommand'
]
=
(
hbar
,
'set'
)
hbar
[
'command'
]
=
(
widget
,
'xview'
)
widget
.
vbar
=
vbar
widget
.
hbar
=
hbar
def
make_text_box
(
parent
,
width
=
0
,
height
=
0
,
hbar
=
0
,
vbar
=
1
,
fill
=
BOTH
,
expand
=
1
,
wrap
=
WORD
,
pack
=
1
,
class_
=
None
,
name
=
None
,
takefocus
=
None
):
fill
=
BOTH
,
expand
=
1
,
wrap
=
WORD
,
pack
=
1
,
class_
=
None
,
name
=
None
,
takefocus
=
None
):
"""Subroutine to create a text box.
...
...
@@ -148,8 +148,8 @@ def make_text_box(parent, width=0, height=0, hbar=0, vbar=1,
"""
hbar
,
vbar
,
frame
=
make_scrollbars
(
parent
,
hbar
,
vbar
,
pack
,
class_
=
class_
,
name
=
name
,
takefocus
=
takefocus
)
class_
=
class_
,
name
=
name
,
takefocus
=
takefocus
)
widget
=
Text
(
frame
,
wrap
=
wrap
,
name
=
"text"
)
if
width
:
widget
.
config
(
width
=
width
)
...
...
@@ -162,16 +162,16 @@ def make_text_box(parent, width=0, height=0, hbar=0, vbar=1,
def
make_list_box
(
parent
,
width
=
0
,
height
=
0
,
hbar
=
0
,
vbar
=
1
,
fill
=
BOTH
,
expand
=
1
,
pack
=
1
,
class_
=
None
,
name
=
None
,
takefocus
=
None
):
fill
=
BOTH
,
expand
=
1
,
pack
=
1
,
class_
=
None
,
name
=
None
,
takefocus
=
None
):
"""Subroutine to create a list box.
Like make_text_box().
"""
hbar
,
vbar
,
frame
=
make_scrollbars
(
parent
,
hbar
,
vbar
,
pack
,
class_
=
class_
,
name
=
name
,
takefocus
=
takefocus
)
class_
=
class_
,
name
=
name
,
takefocus
=
takefocus
)
widget
=
Listbox
(
frame
,
name
=
"listbox"
)
if
width
:
widget
.
config
(
width
=
width
)
...
...
@@ -184,8 +184,8 @@ def make_list_box(parent, width=0, height=0, hbar=0, vbar=1,
def
make_canvas
(
parent
,
width
=
0
,
height
=
0
,
hbar
=
1
,
vbar
=
1
,
fill
=
BOTH
,
expand
=
1
,
pack
=
1
,
class_
=
None
,
name
=
None
,
takefocus
=
None
):
fill
=
BOTH
,
expand
=
1
,
pack
=
1
,
class_
=
None
,
name
=
None
,
takefocus
=
None
):
"""Subroutine to create a canvas.
...
...
@@ -194,8 +194,8 @@ def make_canvas(parent, width=0, height=0, hbar=1, vbar=1,
"""
hbar
,
vbar
,
frame
=
make_scrollbars
(
parent
,
hbar
,
vbar
,
pack
,
class_
=
class_
,
name
=
name
,
takefocus
=
takefocus
)
class_
=
class_
,
name
=
name
,
takefocus
=
takefocus
)
widget
=
Canvas
(
frame
,
scrollregion
=
(
0
,
0
,
width
,
height
),
name
=
"canvas"
)
if
width
:
widget
.
config
(
width
=
width
)
...
...
@@ -228,9 +228,9 @@ def make_form_entry(parent, label, borderwidth=None):
label
.
pack
(
side
=
LEFT
)
if
borderwidth
is
None
:
entry
=
Entry
(
frame
,
relief
=
SUNKEN
)
entry
=
Entry
(
frame
,
relief
=
SUNKEN
)
else
:
entry
=
Entry
(
frame
,
relief
=
SUNKEN
,
borderwidth
=
borderwidth
)
entry
=
Entry
(
frame
,
relief
=
SUNKEN
,
borderwidth
=
borderwidth
)
entry
.
pack
(
side
=
LEFT
,
fill
=
X
,
expand
=
1
)
return
entry
,
frame
...
...
@@ -243,8 +243,8 @@ def make_form_entry(parent, label, borderwidth=None):
# expandable while still aligning the colons. This doesn't work yet.
#
def
make_labeled_form_entry
(
parent
,
label
,
entrywidth
=
20
,
entryheight
=
1
,
labelwidth
=
0
,
borderwidth
=
None
,
takefocus
=
None
):
labelwidth
=
0
,
borderwidth
=
None
,
takefocus
=
None
):
"""Subroutine to create a form entry.
Create:
...
...
@@ -261,32 +261,32 @@ def make_labeled_form_entry(parent, label, entrywidth=20, entryheight=1,
label
=
Label
(
frame
,
text
=
label
,
width
=
labelwidth
,
anchor
=
E
)
label
.
pack
(
side
=
LEFT
)
if
entryheight
==
1
:
if
borderwidth
is
None
:
entry
=
Entry
(
frame
,
relief
=
SUNKEN
,
width
=
entrywidth
)
else
:
entry
=
Entry
(
frame
,
relief
=
SUNKEN
,
width
=
entrywidth
,
borderwidth
=
borderwidth
)
entry
.
pack
(
side
=
RIGHT
,
expand
=
1
,
fill
=
X
)
frame
.
pack
(
fill
=
X
)
if
borderwidth
is
None
:
entry
=
Entry
(
frame
,
relief
=
SUNKEN
,
width
=
entrywidth
)
else
:
entry
=
Entry
(
frame
,
relief
=
SUNKEN
,
width
=
entrywidth
,
borderwidth
=
borderwidth
)
entry
.
pack
(
side
=
RIGHT
,
expand
=
1
,
fill
=
X
)
frame
.
pack
(
fill
=
X
)
else
:
entry
=
make_text_box
(
frame
,
entrywidth
,
entryheight
,
1
,
1
,
takefocus
=
takefocus
)
frame
.
pack
(
fill
=
BOTH
,
expand
=
1
)
entry
=
make_text_box
(
frame
,
entrywidth
,
entryheight
,
1
,
1
,
takefocus
=
takefocus
)
frame
.
pack
(
fill
=
BOTH
,
expand
=
1
)
return
entry
,
frame
,
label
def
make_double_frame
(
master
=
None
,
class_
=
None
,
name
=
None
,
relief
=
RAISED
,
borderwidth
=
1
):
borderwidth
=
1
):
"""Create a pair of frames suitable for 'hosting' a dialog."""
if
name
:
if
class_
:
frame
=
Frame
(
master
,
class_
=
class_
,
name
=
name
)
else
:
frame
=
Frame
(
master
,
name
=
name
)
if
class_
:
frame
=
Frame
(
master
,
class_
=
class_
,
name
=
name
)
else
:
frame
=
Frame
(
master
,
name
=
name
)
else
:
if
class_
:
frame
=
Frame
(
master
,
class_
=
class_
)
else
:
frame
=
Frame
(
master
)
if
class_
:
frame
=
Frame
(
master
,
class_
=
class_
)
else
:
frame
=
Frame
(
master
)
top
=
Frame
(
frame
,
name
=
"topframe"
,
relief
=
relief
,
borderwidth
=
borderwidth
)
borderwidth
=
borderwidth
)
bottom
=
Frame
(
frame
,
name
=
"bottomframe"
)
bottom
.
pack
(
fill
=
X
,
padx
=
'1m'
,
pady
=
'1m'
,
side
=
BOTTOM
)
top
.
pack
(
expand
=
1
,
fill
=
BOTH
,
padx
=
'1m'
,
pady
=
'1m'
)
...
...
@@ -298,7 +298,7 @@ def make_double_frame(master=None, class_=None, name=None, relief=RAISED,
def
make_group_frame
(
master
,
name
=
None
,
label
=
None
,
fill
=
Y
,
side
=
None
,
expand
=
None
,
font
=
None
):
side
=
None
,
expand
=
None
,
font
=
None
):
"""Create nested frames with a border and optional label.
The outer frame is only used to provide the decorative border, to
...
...
@@ -311,7 +311,7 @@ def make_group_frame(master, name=None, label=None, fill=Y,
outer
=
Frame
(
master
,
borderwidth
=
2
,
relief
=
GROOVE
)
outer
.
pack
(
expand
=
expand
,
fill
=
fill
,
side
=
side
)
if
label
:
Label
(
outer
,
text
=
label
,
font
=
font
,
anchor
=
W
)
.
pack
(
fill
=
X
)
Label
(
outer
,
text
=
label
,
font
=
font
,
anchor
=
W
)
.
pack
(
fill
=
X
)
inner
=
Frame
(
master
,
borderwidth
=
'1m'
,
name
=
name
)
inner
.
pack
(
expand
=
1
,
fill
=
BOTH
,
in_
=
outer
)
inner
.
forget
=
outer
.
forget
...
...
@@ -326,20 +326,20 @@ def unify_button_widths(*buttons):
"""
wid
=
0
for
btn
in
buttons
:
wid
=
max
(
wid
,
len
(
btn
[
"text"
]))
wid
=
max
(
wid
,
len
(
btn
[
"text"
]))
for
btn
in
buttons
:
btn
[
"width"
]
=
wid
btn
[
"width"
]
=
wid
def flatten(msg):
    """Turn a list or tuple into a single string -- recursively.

    Lists and tuples (including subclasses) are flattened element by
    element and joined with single spaces; a class is replaced by its
    __name__; anything else is converted with str().
    """
    import inspect
    if isinstance(msg, (list, tuple)):
        return ' '.join(map(flatten, msg))
    if inspect.isclass(msg):
        return msg.__name__
    return str(msg)
...
...
@@ -356,8 +356,8 @@ def test():
entry
,
eframe
=
make_form_entry
(
root
,
'Boolean:'
)
text
,
tframe
=
make_text_box
(
root
)
def
enter
(
event
,
entry
=
entry
,
text
=
text
):
s
=
boolean
(
entry
.
get
())
and
'
\n
yes'
or
'
\n
no'
text
.
insert
(
'end'
,
s
)
s
=
boolean
(
entry
.
get
())
and
'
\n
yes'
or
'
\n
no'
text
.
insert
(
'end'
,
s
)
entry
.
bind
(
'<Return>'
,
enter
)
entry
.
insert
(
END
,
flatten
(
sys
.
argv
))
root
.
mainloop
()
...
...
Tools/webchecker/wcgui.py
Dosyayı görüntüle @
986abac1
...
...
@@ -72,365 +72,365 @@ if sys.platform == 'mac':
def
main
():
try
:
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
'm:qv'
)
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
'm:qv'
)
except
getopt
.
error
,
msg
:
sys
.
stdout
=
sys
.
stderr
print
msg
print
__doc__
%
vars
(
webchecker
)
sys
.
exit
(
2
)
sys
.
stdout
=
sys
.
stderr
print
msg
print
__doc__
%
vars
(
webchecker
)
sys
.
exit
(
2
)
for
o
,
a
in
opts
:
if
o
==
'-m'
:
webchecker
.
maxpage
=
string
.
atoi
(
a
)
if
o
==
'-q'
:
webchecker
.
verbose
=
0
if
o
==
'-v'
:
webchecker
.
verbose
=
webchecker
.
verbose
+
1
if
o
==
'-m'
:
webchecker
.
maxpage
=
string
.
atoi
(
a
)
if
o
==
'-q'
:
webchecker
.
verbose
=
0
if
o
==
'-v'
:
webchecker
.
verbose
=
webchecker
.
verbose
+
1
root
=
Tk
(
className
=
'Webchecker'
)
root
.
protocol
(
"WM_DELETE_WINDOW"
,
root
.
quit
)
c
=
CheckerWindow
(
root
)
if
args
:
for
arg
in
args
[:
-
1
]:
c
.
addroot
(
arg
)
c
.
suggestroot
(
args
[
-
1
])
for
arg
in
args
[:
-
1
]:
c
.
addroot
(
arg
)
c
.
suggestroot
(
args
[
-
1
])
root
.
mainloop
()
class
CheckerWindow
(
webchecker
.
Checker
):
def
__init__
(
self
,
parent
,
root
=
webchecker
.
DEFROOT
):
self
.
__parent
=
parent
self
.
__topcontrols
=
Frame
(
parent
)
self
.
__topcontrols
.
pack
(
side
=
TOP
,
fill
=
X
)
self
.
__label
=
Label
(
self
.
__topcontrols
,
text
=
"Root URL:"
)
self
.
__label
.
pack
(
side
=
LEFT
)
self
.
__rootentry
=
Entry
(
self
.
__topcontrols
,
width
=
60
)
self
.
__rootentry
.
pack
(
side
=
LEFT
)
self
.
__rootentry
.
bind
(
'<Return>'
,
self
.
enterroot
)
self
.
__rootentry
.
focus_set
()
self
.
__controls
=
Frame
(
parent
)
self
.
__controls
.
pack
(
side
=
TOP
,
fill
=
X
)
self
.
__running
=
0
self
.
__start
=
Button
(
self
.
__controls
,
text
=
"Run"
,
command
=
self
.
start
)
self
.
__start
.
pack
(
side
=
LEFT
)
self
.
__stop
=
Button
(
self
.
__controls
,
text
=
"Stop"
,
command
=
self
.
stop
,
state
=
DISABLED
)
self
.
__stop
.
pack
(
side
=
LEFT
)
self
.
__step
=
Button
(
self
.
__controls
,
text
=
"Check one"
,
command
=
self
.
step
)
self
.
__step
.
pack
(
side
=
LEFT
)
self
.
__cv
=
BooleanVar
(
parent
)
self
.
__cv
.
set
(
self
.
checkext
)
self
.
__checkext
=
Checkbutton
(
self
.
__controls
,
variable
=
self
.
__cv
,
command
=
self
.
update_checkext
,
text
=
"Check nonlocal links"
,)
self
.
__checkext
.
pack
(
side
=
LEFT
)
self
.
__reset
=
Button
(
self
.
__controls
,
text
=
"Start over"
,
command
=
self
.
reset
)
self
.
__reset
.
pack
(
side
=
LEFT
)
if
__name__
==
'__main__'
:
# No Quit button under Grail!
self
.
__quit
=
Button
(
self
.
__controls
,
text
=
"Quit"
,
command
=
self
.
__parent
.
quit
)
self
.
__quit
.
pack
(
side
=
RIGHT
)
self
.
__status
=
Label
(
parent
,
text
=
"Status: initial"
,
anchor
=
W
)
self
.
__status
.
pack
(
side
=
TOP
,
fill
=
X
)
self
.
__checking
=
Label
(
parent
,
text
=
"Idle"
,
anchor
=
W
)
self
.
__checking
.
pack
(
side
=
TOP
,
fill
=
X
)
self
.
__mp
=
mp
=
MultiPanel
(
parent
)
sys
.
stdout
=
self
.
__log
=
LogPanel
(
mp
,
"Log"
)
self
.
__todo
=
ListPanel
(
mp
,
"To check"
,
self
.
showinfo
)
self
.
__done
=
ListPanel
(
mp
,
"Checked"
,
self
.
showinfo
)
self
.
__bad
=
ListPanel
(
mp
,
"Bad links"
,
self
.
showinfo
)
self
.
__errors
=
ListPanel
(
mp
,
"Pages w/ bad links"
,
self
.
showinfo
)
self
.
__details
=
LogPanel
(
mp
,
"Details"
)
webchecker
.
Checker
.
__init__
(
self
)
if
root
:
root
=
string
.
strip
(
str
(
root
))
if
root
:
self
.
suggestroot
(
root
)
self
.
newstatus
()
self
.
__parent
=
parent
self
.
__topcontrols
=
Frame
(
parent
)
self
.
__topcontrols
.
pack
(
side
=
TOP
,
fill
=
X
)
self
.
__label
=
Label
(
self
.
__topcontrols
,
text
=
"Root URL:"
)
self
.
__label
.
pack
(
side
=
LEFT
)
self
.
__rootentry
=
Entry
(
self
.
__topcontrols
,
width
=
60
)
self
.
__rootentry
.
pack
(
side
=
LEFT
)
self
.
__rootentry
.
bind
(
'<Return>'
,
self
.
enterroot
)
self
.
__rootentry
.
focus_set
()
self
.
__controls
=
Frame
(
parent
)
self
.
__controls
.
pack
(
side
=
TOP
,
fill
=
X
)
self
.
__running
=
0
self
.
__start
=
Button
(
self
.
__controls
,
text
=
"Run"
,
command
=
self
.
start
)
self
.
__start
.
pack
(
side
=
LEFT
)
self
.
__stop
=
Button
(
self
.
__controls
,
text
=
"Stop"
,
command
=
self
.
stop
,
state
=
DISABLED
)
self
.
__stop
.
pack
(
side
=
LEFT
)
self
.
__step
=
Button
(
self
.
__controls
,
text
=
"Check one"
,
command
=
self
.
step
)
self
.
__step
.
pack
(
side
=
LEFT
)
self
.
__cv
=
BooleanVar
(
parent
)
self
.
__cv
.
set
(
self
.
checkext
)
self
.
__checkext
=
Checkbutton
(
self
.
__controls
,
variable
=
self
.
__cv
,
command
=
self
.
update_checkext
,
text
=
"Check nonlocal links"
,)
self
.
__checkext
.
pack
(
side
=
LEFT
)
self
.
__reset
=
Button
(
self
.
__controls
,
text
=
"Start over"
,
command
=
self
.
reset
)
self
.
__reset
.
pack
(
side
=
LEFT
)
if
__name__
==
'__main__'
:
# No Quit button under Grail!
self
.
__quit
=
Button
(
self
.
__controls
,
text
=
"Quit"
,
command
=
self
.
__parent
.
quit
)
self
.
__quit
.
pack
(
side
=
RIGHT
)
self
.
__status
=
Label
(
parent
,
text
=
"Status: initial"
,
anchor
=
W
)
self
.
__status
.
pack
(
side
=
TOP
,
fill
=
X
)
self
.
__checking
=
Label
(
parent
,
text
=
"Idle"
,
anchor
=
W
)
self
.
__checking
.
pack
(
side
=
TOP
,
fill
=
X
)
self
.
__mp
=
mp
=
MultiPanel
(
parent
)
sys
.
stdout
=
self
.
__log
=
LogPanel
(
mp
,
"Log"
)
self
.
__todo
=
ListPanel
(
mp
,
"To check"
,
self
.
showinfo
)
self
.
__done
=
ListPanel
(
mp
,
"Checked"
,
self
.
showinfo
)
self
.
__bad
=
ListPanel
(
mp
,
"Bad links"
,
self
.
showinfo
)
self
.
__errors
=
ListPanel
(
mp
,
"Pages w/ bad links"
,
self
.
showinfo
)
self
.
__details
=
LogPanel
(
mp
,
"Details"
)
webchecker
.
Checker
.
__init__
(
self
)
if
root
:
root
=
string
.
strip
(
str
(
root
))
if
root
:
self
.
suggestroot
(
root
)
self
.
newstatus
()
def
reset
(
self
):
webchecker
.
Checker
.
reset
(
self
)
for
p
in
self
.
__todo
,
self
.
__done
,
self
.
__bad
,
self
.
__errors
:
p
.
clear
()
webchecker
.
Checker
.
reset
(
self
)
for
p
in
self
.
__todo
,
self
.
__done
,
self
.
__bad
,
self
.
__errors
:
p
.
clear
()
def
suggestroot
(
self
,
root
):
self
.
__rootentry
.
delete
(
0
,
END
)
self
.
__rootentry
.
insert
(
END
,
root
)
self
.
__rootentry
.
select_range
(
0
,
END
)
self
.
__rootentry
.
delete
(
0
,
END
)
self
.
__rootentry
.
insert
(
END
,
root
)
self
.
__rootentry
.
select_range
(
0
,
END
)
def
enterroot
(
self
,
event
=
None
):
root
=
self
.
__rootentry
.
get
()
root
=
string
.
strip
(
root
)
if
root
:
self
.
__checking
.
config
(
text
=
"Adding root "
+
root
)
self
.
__checking
.
update_idletasks
()
self
.
addroot
(
root
)
self
.
__checking
.
config
(
text
=
"Idle"
)
try
:
i
=
self
.
__todo
.
items
.
index
(
root
)
except
(
ValueError
,
IndexError
):
pass
else
:
self
.
__todo
.
list
.
select_clear
(
0
,
END
)
self
.
__todo
.
list
.
select_set
(
i
)
self
.
__todo
.
list
.
yview
(
i
)
self
.
__rootentry
.
delete
(
0
,
END
)
root
=
self
.
__rootentry
.
get
()
root
=
string
.
strip
(
root
)
if
root
:
self
.
__checking
.
config
(
text
=
"Adding root "
+
root
)
self
.
__checking
.
update_idletasks
()
self
.
addroot
(
root
)
self
.
__checking
.
config
(
text
=
"Idle"
)
try
:
i
=
self
.
__todo
.
items
.
index
(
root
)
except
(
ValueError
,
IndexError
):
pass
else
:
self
.
__todo
.
list
.
select_clear
(
0
,
END
)
self
.
__todo
.
list
.
select_set
(
i
)
self
.
__todo
.
list
.
yview
(
i
)
self
.
__rootentry
.
delete
(
0
,
END
)
def
start
(
self
):
self
.
__start
.
config
(
state
=
DISABLED
,
relief
=
SUNKEN
)
self
.
__stop
.
config
(
state
=
NORMAL
)
self
.
__step
.
config
(
state
=
DISABLED
)
self
.
enterroot
()
self
.
__running
=
1
self
.
go
()
self
.
__start
.
config
(
state
=
DISABLED
,
relief
=
SUNKEN
)
self
.
__stop
.
config
(
state
=
NORMAL
)
self
.
__step
.
config
(
state
=
DISABLED
)
self
.
enterroot
()
self
.
__running
=
1
self
.
go
()
def
stop
(
self
):
self
.
__stop
.
config
(
state
=
DISABLED
,
relief
=
SUNKEN
)
self
.
__running
=
0
self
.
__stop
.
config
(
state
=
DISABLED
,
relief
=
SUNKEN
)
self
.
__running
=
0
def
step
(
self
):
self
.
__start
.
config
(
state
=
DISABLED
)
self
.
__step
.
config
(
state
=
DISABLED
,
relief
=
SUNKEN
)
self
.
enterroot
()
self
.
__running
=
0
self
.
dosomething
()
self
.
__start
.
config
(
state
=
DISABLED
)
self
.
__step
.
config
(
state
=
DISABLED
,
relief
=
SUNKEN
)
self
.
enterroot
()
self
.
__running
=
0
self
.
dosomething
()
def
go
(
self
):
if
self
.
__running
:
self
.
__parent
.
after_idle
(
self
.
dosomething
)
else
:
self
.
__checking
.
config
(
text
=
"Idle"
)
self
.
__start
.
config
(
state
=
NORMAL
,
relief
=
RAISED
)
self
.
__stop
.
config
(
state
=
DISABLED
,
relief
=
RAISED
)
self
.
__step
.
config
(
state
=
NORMAL
,
relief
=
RAISED
)
if
self
.
__running
:
self
.
__parent
.
after_idle
(
self
.
dosomething
)
else
:
self
.
__checking
.
config
(
text
=
"Idle"
)
self
.
__start
.
config
(
state
=
NORMAL
,
relief
=
RAISED
)
self
.
__stop
.
config
(
state
=
DISABLED
,
relief
=
RAISED
)
self
.
__step
.
config
(
state
=
NORMAL
,
relief
=
RAISED
)
__busy
=
0
def
dosomething
(
self
):
if
self
.
__busy
:
return
self
.
__busy
=
1
if
self
.
todo
:
l
=
self
.
__todo
.
selectedindices
()
if
l
:
i
=
l
[
0
]
else
:
i
=
0
self
.
__todo
.
list
.
select_set
(
i
)
self
.
__todo
.
list
.
yview
(
i
)
url
=
self
.
__todo
.
items
[
i
]
self
.
__checking
.
config
(
text
=
"Checking "
+
url
)
self
.
__parent
.
update
()
self
.
dopage
(
url
)
else
:
self
.
stop
()
self
.
__busy
=
0
self
.
go
()
if
self
.
__busy
:
return
self
.
__busy
=
1
if
self
.
todo
:
l
=
self
.
__todo
.
selectedindices
()
if
l
:
i
=
l
[
0
]
else
:
i
=
0
self
.
__todo
.
list
.
select_set
(
i
)
self
.
__todo
.
list
.
yview
(
i
)
url
=
self
.
__todo
.
items
[
i
]
self
.
__checking
.
config
(
text
=
"Checking "
+
url
)
self
.
__parent
.
update
()
self
.
dopage
(
url
)
else
:
self
.
stop
()
self
.
__busy
=
0
self
.
go
()
def
showinfo
(
self
,
url
):
d
=
self
.
__details
d
.
clear
()
d
.
put
(
"URL:
%
s
\n
"
%
url
)
if
self
.
bad
.
has_key
(
url
):
d
.
put
(
"Error:
%
s
\n
"
%
str
(
self
.
bad
[
url
]))
if
url
in
self
.
roots
:
d
.
put
(
"Note: This is a root URL
\n
"
)
if
self
.
done
.
has_key
(
url
):
d
.
put
(
"Status: checked
\n
"
)
o
=
self
.
done
[
url
]
elif
self
.
todo
.
has_key
(
url
):
d
.
put
(
"Status: to check
\n
"
)
o
=
self
.
todo
[
url
]
else
:
d
.
put
(
"Status: unknown (!)
\n
"
)
o
=
[]
if
self
.
errors
.
has_key
(
url
):
d
.
put
(
"Bad links from this page:
\n
"
)
for
triple
in
self
.
errors
[
url
]:
link
,
rawlink
,
msg
=
triple
d
.
put
(
" HREF
%
s"
%
link
)
if
link
!=
rawlink
:
d
.
put
(
" (
%
s)"
%
rawlink
)
d
.
put
(
"
\n
"
)
d
.
put
(
" error
%
s
\n
"
%
str
(
msg
))
self
.
__mp
.
showpanel
(
"Details"
)
for
source
,
rawlink
in
o
:
d
.
put
(
"Origin:
%
s"
%
source
)
if
rawlink
!=
url
:
d
.
put
(
" (
%
s)"
%
rawlink
)
d
.
put
(
"
\n
"
)
d
.
text
.
yview
(
"1.0"
)
d
=
self
.
__details
d
.
clear
()
d
.
put
(
"URL:
%
s
\n
"
%
url
)
if
self
.
bad
.
has_key
(
url
):
d
.
put
(
"Error:
%
s
\n
"
%
str
(
self
.
bad
[
url
]))
if
url
in
self
.
roots
:
d
.
put
(
"Note: This is a root URL
\n
"
)
if
self
.
done
.
has_key
(
url
):
d
.
put
(
"Status: checked
\n
"
)
o
=
self
.
done
[
url
]
elif
self
.
todo
.
has_key
(
url
):
d
.
put
(
"Status: to check
\n
"
)
o
=
self
.
todo
[
url
]
else
:
d
.
put
(
"Status: unknown (!)
\n
"
)
o
=
[]
if
self
.
errors
.
has_key
(
url
):
d
.
put
(
"Bad links from this page:
\n
"
)
for
triple
in
self
.
errors
[
url
]:
link
,
rawlink
,
msg
=
triple
d
.
put
(
" HREF
%
s"
%
link
)
if
link
!=
rawlink
:
d
.
put
(
" (
%
s)"
%
rawlink
)
d
.
put
(
"
\n
"
)
d
.
put
(
" error
%
s
\n
"
%
str
(
msg
))
self
.
__mp
.
showpanel
(
"Details"
)
for
source
,
rawlink
in
o
:
d
.
put
(
"Origin:
%
s"
%
source
)
if
rawlink
!=
url
:
d
.
put
(
" (
%
s)"
%
rawlink
)
d
.
put
(
"
\n
"
)
d
.
text
.
yview
(
"1.0"
)
def
setbad
(
self
,
url
,
msg
):
webchecker
.
Checker
.
setbad
(
self
,
url
,
msg
)
self
.
__bad
.
insert
(
url
)
self
.
newstatus
()
webchecker
.
Checker
.
setbad
(
self
,
url
,
msg
)
self
.
__bad
.
insert
(
url
)
self
.
newstatus
()
def
setgood
(
self
,
url
):
webchecker
.
Checker
.
setgood
(
self
,
url
)
self
.
__bad
.
remove
(
url
)
self
.
newstatus
()
webchecker
.
Checker
.
setgood
(
self
,
url
)
self
.
__bad
.
remove
(
url
)
self
.
newstatus
()
def
newlink
(
self
,
url
,
origin
):
webchecker
.
Checker
.
newlink
(
self
,
url
,
origin
)
if
self
.
done
.
has_key
(
url
):
self
.
__done
.
insert
(
url
)
elif
self
.
todo
.
has_key
(
url
):
self
.
__todo
.
insert
(
url
)
self
.
newstatus
()
webchecker
.
Checker
.
newlink
(
self
,
url
,
origin
)
if
self
.
done
.
has_key
(
url
):
self
.
__done
.
insert
(
url
)
elif
self
.
todo
.
has_key
(
url
):
self
.
__todo
.
insert
(
url
)
self
.
newstatus
()
def
markdone
(
self
,
url
):
webchecker
.
Checker
.
markdone
(
self
,
url
)
self
.
__done
.
insert
(
url
)
self
.
__todo
.
remove
(
url
)
self
.
newstatus
()
webchecker
.
Checker
.
markdone
(
self
,
url
)
self
.
__done
.
insert
(
url
)
self
.
__todo
.
remove
(
url
)
self
.
newstatus
()
def
seterror
(
self
,
url
,
triple
):
webchecker
.
Checker
.
seterror
(
self
,
url
,
triple
)
self
.
__errors
.
insert
(
url
)
self
.
newstatus
()
webchecker
.
Checker
.
seterror
(
self
,
url
,
triple
)
self
.
__errors
.
insert
(
url
)
self
.
newstatus
()
def
newstatus
(
self
):
self
.
__status
.
config
(
text
=
"Status: "
+
self
.
status
())
self
.
__parent
.
update
()
self
.
__status
.
config
(
text
=
"Status: "
+
self
.
status
())
self
.
__parent
.
update
()
def
update_checkext
(
self
):
self
.
checkext
=
self
.
__cv
.
get
()
self
.
checkext
=
self
.
__cv
.
get
()
class
ListPanel
:
def
__init__
(
self
,
mp
,
name
,
showinfo
=
None
):
self
.
mp
=
mp
self
.
name
=
name
self
.
showinfo
=
showinfo
self
.
panel
=
mp
.
addpanel
(
name
)
self
.
list
,
self
.
frame
=
tktools
.
make_list_box
(
self
.
panel
,
width
=
60
,
height
=
5
)
self
.
list
.
config
(
exportselection
=
0
)
if
showinfo
:
self
.
list
.
bind
(
'<Double-Button-1>'
,
self
.
doubleclick
)
self
.
items
=
[]
self
.
mp
=
mp
self
.
name
=
name
self
.
showinfo
=
showinfo
self
.
panel
=
mp
.
addpanel
(
name
)
self
.
list
,
self
.
frame
=
tktools
.
make_list_box
(
self
.
panel
,
width
=
60
,
height
=
5
)
self
.
list
.
config
(
exportselection
=
0
)
if
showinfo
:
self
.
list
.
bind
(
'<Double-Button-1>'
,
self
.
doubleclick
)
self
.
items
=
[]
def
clear
(
self
):
self
.
items
=
[]
self
.
list
.
delete
(
0
,
END
)
self
.
mp
.
hidepanel
(
self
.
name
)
self
.
items
=
[]
self
.
list
.
delete
(
0
,
END
)
self
.
mp
.
hidepanel
(
self
.
name
)
def
doubleclick
(
self
,
event
):
l
=
self
.
selectedindices
()
if
l
:
self
.
showinfo
(
self
.
list
.
get
(
l
[
0
]))
l
=
self
.
selectedindices
()
if
l
:
self
.
showinfo
(
self
.
list
.
get
(
l
[
0
]))
def
selectedindices
(
self
):
l
=
self
.
list
.
curselection
()
if
not
l
:
return
[]
return
map
(
string
.
atoi
,
l
)
l
=
self
.
list
.
curselection
()
if
not
l
:
return
[]
return
map
(
string
.
atoi
,
l
)
def
insert
(
self
,
url
):
if
url
not
in
self
.
items
:
if
not
self
.
items
:
self
.
mp
.
showpanel
(
self
.
name
)
# (I tried sorting alphabetically, but the display is too jumpy)
i
=
len
(
self
.
items
)
self
.
list
.
insert
(
i
,
url
)
self
.
list
.
yview
(
i
)
self
.
items
.
insert
(
i
,
url
)
if
url
not
in
self
.
items
:
if
not
self
.
items
:
self
.
mp
.
showpanel
(
self
.
name
)
# (I tried sorting alphabetically, but the display is too jumpy)
i
=
len
(
self
.
items
)
self
.
list
.
insert
(
i
,
url
)
self
.
list
.
yview
(
i
)
self
.
items
.
insert
(
i
,
url
)
def
remove
(
self
,
url
):
try
:
i
=
self
.
items
.
index
(
url
)
except
(
ValueError
,
IndexError
):
pass
else
:
was_selected
=
i
in
self
.
selectedindices
()
self
.
list
.
delete
(
i
)
del
self
.
items
[
i
]
if
not
self
.
items
:
self
.
mp
.
hidepanel
(
self
.
name
)
elif
was_selected
:
if
i
>=
len
(
self
.
items
):
i
=
len
(
self
.
items
)
-
1
self
.
list
.
select_set
(
i
)
try
:
i
=
self
.
items
.
index
(
url
)
except
(
ValueError
,
IndexError
):
pass
else
:
was_selected
=
i
in
self
.
selectedindices
()
self
.
list
.
delete
(
i
)
del
self
.
items
[
i
]
if
not
self
.
items
:
self
.
mp
.
hidepanel
(
self
.
name
)
elif
was_selected
:
if
i
>=
len
(
self
.
items
):
i
=
len
(
self
.
items
)
-
1
self
.
list
.
select_set
(
i
)
class
LogPanel
:
def
__init__
(
self
,
mp
,
name
):
self
.
mp
=
mp
self
.
name
=
name
self
.
panel
=
mp
.
addpanel
(
name
)
self
.
text
,
self
.
frame
=
tktools
.
make_text_box
(
self
.
panel
,
height
=
10
)
self
.
text
.
config
(
wrap
=
NONE
)
self
.
mp
=
mp
self
.
name
=
name
self
.
panel
=
mp
.
addpanel
(
name
)
self
.
text
,
self
.
frame
=
tktools
.
make_text_box
(
self
.
panel
,
height
=
10
)
self
.
text
.
config
(
wrap
=
NONE
)
def
clear
(
self
):
self
.
text
.
delete
(
"1.0"
,
END
)
self
.
text
.
yview
(
"1.0"
)
self
.
text
.
delete
(
"1.0"
,
END
)
self
.
text
.
yview
(
"1.0"
)
def
put
(
self
,
s
):
self
.
text
.
insert
(
END
,
s
)
if
'
\n
'
in
s
:
self
.
text
.
yview
(
END
)
self
.
text
.
insert
(
END
,
s
)
if
'
\n
'
in
s
:
self
.
text
.
yview
(
END
)
def
write
(
self
,
s
):
self
.
text
.
insert
(
END
,
s
)
if
'
\n
'
in
s
:
self
.
text
.
yview
(
END
)
self
.
panel
.
update
()
self
.
text
.
insert
(
END
,
s
)
if
'
\n
'
in
s
:
self
.
text
.
yview
(
END
)
self
.
panel
.
update
()
class
MultiPanel
:
def
__init__
(
self
,
parent
):
self
.
parent
=
parent
self
.
frame
=
Frame
(
self
.
parent
)
self
.
frame
.
pack
(
expand
=
1
,
fill
=
BOTH
)
self
.
topframe
=
Frame
(
self
.
frame
,
borderwidth
=
2
,
relief
=
RAISED
)
self
.
topframe
.
pack
(
fill
=
X
)
self
.
botframe
=
Frame
(
self
.
frame
)
self
.
botframe
.
pack
(
expand
=
1
,
fill
=
BOTH
)
self
.
panelnames
=
[]
self
.
panels
=
{}
self
.
parent
=
parent
self
.
frame
=
Frame
(
self
.
parent
)
self
.
frame
.
pack
(
expand
=
1
,
fill
=
BOTH
)
self
.
topframe
=
Frame
(
self
.
frame
,
borderwidth
=
2
,
relief
=
RAISED
)
self
.
topframe
.
pack
(
fill
=
X
)
self
.
botframe
=
Frame
(
self
.
frame
)
self
.
botframe
.
pack
(
expand
=
1
,
fill
=
BOTH
)
self
.
panelnames
=
[]
self
.
panels
=
{}
def
addpanel
(
self
,
name
,
on
=
0
):
v
=
StringVar
(
self
.
parent
)
if
on
:
v
.
set
(
name
)
else
:
v
.
set
(
""
)
check
=
Checkbutton
(
self
.
topframe
,
text
=
name
,
offvalue
=
""
,
onvalue
=
name
,
variable
=
v
,
command
=
self
.
checkpanel
)
check
.
pack
(
side
=
LEFT
)
panel
=
Frame
(
self
.
botframe
)
label
=
Label
(
panel
,
text
=
name
,
borderwidth
=
2
,
relief
=
RAISED
,
anchor
=
W
)
label
.
pack
(
side
=
TOP
,
fill
=
X
)
t
=
v
,
check
,
panel
self
.
panelnames
.
append
(
name
)
self
.
panels
[
name
]
=
t
if
on
:
panel
.
pack
(
expand
=
1
,
fill
=
BOTH
)
return
panel
v
=
StringVar
(
self
.
parent
)
if
on
:
v
.
set
(
name
)
else
:
v
.
set
(
""
)
check
=
Checkbutton
(
self
.
topframe
,
text
=
name
,
offvalue
=
""
,
onvalue
=
name
,
variable
=
v
,
command
=
self
.
checkpanel
)
check
.
pack
(
side
=
LEFT
)
panel
=
Frame
(
self
.
botframe
)
label
=
Label
(
panel
,
text
=
name
,
borderwidth
=
2
,
relief
=
RAISED
,
anchor
=
W
)
label
.
pack
(
side
=
TOP
,
fill
=
X
)
t
=
v
,
check
,
panel
self
.
panelnames
.
append
(
name
)
self
.
panels
[
name
]
=
t
if
on
:
panel
.
pack
(
expand
=
1
,
fill
=
BOTH
)
return
panel
def
showpanel
(
self
,
name
):
v
,
check
,
panel
=
self
.
panels
[
name
]
v
.
set
(
name
)
panel
.
pack
(
expand
=
1
,
fill
=
BOTH
)
v
,
check
,
panel
=
self
.
panels
[
name
]
v
.
set
(
name
)
panel
.
pack
(
expand
=
1
,
fill
=
BOTH
)
def
hidepanel
(
self
,
name
):
v
,
check
,
panel
=
self
.
panels
[
name
]
v
.
set
(
""
)
panel
.
pack_forget
()
v
,
check
,
panel
=
self
.
panels
[
name
]
v
.
set
(
""
)
panel
.
pack_forget
()
def
checkpanel
(
self
):
for
name
in
self
.
panelnames
:
v
,
check
,
panel
=
self
.
panels
[
name
]
panel
.
pack_forget
()
for
name
in
self
.
panelnames
:
v
,
check
,
panel
=
self
.
panels
[
name
]
if
v
.
get
():
panel
.
pack
(
expand
=
1
,
fill
=
BOTH
)
for
name
in
self
.
panelnames
:
v
,
check
,
panel
=
self
.
panels
[
name
]
panel
.
pack_forget
()
for
name
in
self
.
panelnames
:
v
,
check
,
panel
=
self
.
panels
[
name
]
if
v
.
get
():
panel
.
pack
(
expand
=
1
,
fill
=
BOTH
)
if
__name__
==
'__main__'
:
...
...
Tools/webchecker/webchecker.py
Dosyayı görüntüle @
986abac1
...
...
@@ -116,17 +116,17 @@ import robotparser
if
__version__
[
0
]
==
'$'
:
_v
=
string
.
split
(
__version__
)
if
len
(
_v
)
==
3
:
__version__
=
_v
[
1
]
__version__
=
_v
[
1
]
# Tunable parameters
DEFROOT
=
"file:/usr/local/etc/httpd/htdocs/"
# Default root URL
CHECKEXT
=
1
# Check external references (1 deep)
VERBOSE
=
1
# Verbosity level (0-3)
MAXPAGE
=
150000
# Ignore files bigger than this
ROUNDSIZE
=
50
# Number of links processed per round
DUMPFILE
=
"@webchecker.pickle"
# Pickled checkpoint
AGENTNAME
=
"webchecker"
# Agent name for robots.txt parser
DEFROOT
=
"file:/usr/local/etc/httpd/htdocs/"
# Default root URL
CHECKEXT
=
1
# Check external references (1 deep)
VERBOSE
=
1
# Verbosity level (0-3)
MAXPAGE
=
150000
# Ignore files bigger than this
ROUNDSIZE
=
50
# Number of links processed per round
DUMPFILE
=
"@webchecker.pickle"
# Pickled checkpoint
AGENTNAME
=
"webchecker"
# Agent name for robots.txt parser
# Global variables
...
...
@@ -142,76 +142,76 @@ def main():
norun
=
0
try
:
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
'Rd:m:nqr:vx'
)
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
'Rd:m:nqr:vx'
)
except
getopt
.
error
,
msg
:
sys
.
stdout
=
sys
.
stderr
print
msg
print
__doc__
%
globals
()
sys
.
exit
(
2
)
sys
.
stdout
=
sys
.
stderr
print
msg
print
__doc__
%
globals
()
sys
.
exit
(
2
)
for
o
,
a
in
opts
:
if
o
==
'-R'
:
restart
=
1
if
o
==
'-d'
:
dumpfile
=
a
if
o
==
'-m'
:
maxpage
=
string
.
atoi
(
a
)
if
o
==
'-n'
:
norun
=
1
if
o
==
'-q'
:
verbose
=
0
if
o
==
'-r'
:
roundsize
=
string
.
atoi
(
a
)
if
o
==
'-v'
:
verbose
=
verbose
+
1
if
o
==
'-x'
:
checkext
=
not
checkext
if
o
==
'-R'
:
restart
=
1
if
o
==
'-d'
:
dumpfile
=
a
if
o
==
'-m'
:
maxpage
=
string
.
atoi
(
a
)
if
o
==
'-n'
:
norun
=
1
if
o
==
'-q'
:
verbose
=
0
if
o
==
'-r'
:
roundsize
=
string
.
atoi
(
a
)
if
o
==
'-v'
:
verbose
=
verbose
+
1
if
o
==
'-x'
:
checkext
=
not
checkext
if
verbose
>
0
:
print
AGENTNAME
,
"version"
,
__version__
print
AGENTNAME
,
"version"
,
__version__
if
restart
:
c
=
load_pickle
(
dumpfile
=
dumpfile
,
verbose
=
verbose
)
c
=
load_pickle
(
dumpfile
=
dumpfile
,
verbose
=
verbose
)
else
:
c
=
Checker
()
c
=
Checker
()
c
.
setflags
(
checkext
=
checkext
,
verbose
=
verbose
,
maxpage
=
maxpage
,
roundsize
=
roundsize
)
maxpage
=
maxpage
,
roundsize
=
roundsize
)
if
not
restart
and
not
args
:
args
.
append
(
DEFROOT
)
args
.
append
(
DEFROOT
)
for
arg
in
args
:
c
.
addroot
(
arg
)
c
.
addroot
(
arg
)
if
not
norun
:
try
:
c
.
run
()
except
KeyboardInterrupt
:
if
verbose
>
0
:
print
"[run interrupted]"
try
:
c
.
run
()
except
KeyboardInterrupt
:
if
verbose
>
0
:
print
"[run interrupted]"
try
:
c
.
report
()
c
.
report
()
except
KeyboardInterrupt
:
if
verbose
>
0
:
print
"[report interrupted]"
if
verbose
>
0
:
print
"[report interrupted]"
if
c
.
save_pickle
(
dumpfile
):
if
dumpfile
==
DUMPFILE
:
print
"Use ``
%
s -R'' to restart."
%
sys
.
argv
[
0
]
else
:
print
"Use ``
%
s -R -d
%
s'' to restart."
%
(
sys
.
argv
[
0
],
dumpfile
)
if
dumpfile
==
DUMPFILE
:
print
"Use ``
%
s -R'' to restart."
%
sys
.
argv
[
0
]
else
:
print
"Use ``
%
s -R -d
%
s'' to restart."
%
(
sys
.
argv
[
0
],
dumpfile
)
def
load_pickle
(
dumpfile
=
DUMPFILE
,
verbose
=
VERBOSE
):
if
verbose
>
0
:
print
"Loading checkpoint from
%
s ..."
%
dumpfile
print
"Loading checkpoint from
%
s ..."
%
dumpfile
f
=
open
(
dumpfile
,
"rb"
)
c
=
pickle
.
load
(
f
)
f
.
close
()
if
verbose
>
0
:
print
"Done."
print
"Root:"
,
string
.
join
(
c
.
roots
,
"
\n
"
)
print
"Done."
print
"Root:"
,
string
.
join
(
c
.
roots
,
"
\n
"
)
return
c
...
...
@@ -225,364 +225,364 @@ class Checker:
validflags
=
tuple
(
dir
())
def
__init__
(
self
):
self
.
reset
()
self
.
reset
()
def
setflags
(
self
,
**
kw
):
for
key
in
kw
.
keys
():
if
key
not
in
self
.
validflags
:
raise
NameError
,
"invalid keyword argument:
%
s"
%
str
(
key
)
for
key
,
value
in
kw
.
items
():
setattr
(
self
,
key
,
value
)
for
key
in
kw
.
keys
():
if
key
not
in
self
.
validflags
:
raise
NameError
,
"invalid keyword argument:
%
s"
%
str
(
key
)
for
key
,
value
in
kw
.
items
():
setattr
(
self
,
key
,
value
)
def
reset
(
self
):
self
.
roots
=
[]
self
.
todo
=
{}
self
.
done
=
{}
self
.
bad
=
{}
self
.
round
=
0
# The following are not pickled:
self
.
robots
=
{}
self
.
errors
=
{}
self
.
urlopener
=
MyURLopener
()
self
.
changed
=
0
self
.
roots
=
[]
self
.
todo
=
{}
self
.
done
=
{}
self
.
bad
=
{}
self
.
round
=
0
# The following are not pickled:
self
.
robots
=
{}
self
.
errors
=
{}
self
.
urlopener
=
MyURLopener
()
self
.
changed
=
0
def
__getstate__
(
self
):
return
(
self
.
roots
,
self
.
todo
,
self
.
done
,
self
.
bad
,
self
.
round
)
return
(
self
.
roots
,
self
.
todo
,
self
.
done
,
self
.
bad
,
self
.
round
)
def
__setstate__
(
self
,
state
):
self
.
reset
()
(
self
.
roots
,
self
.
todo
,
self
.
done
,
self
.
bad
,
self
.
round
)
=
state
for
root
in
self
.
roots
:
self
.
addrobot
(
root
)
for
url
in
self
.
bad
.
keys
():
self
.
markerror
(
url
)
self
.
reset
()
(
self
.
roots
,
self
.
todo
,
self
.
done
,
self
.
bad
,
self
.
round
)
=
state
for
root
in
self
.
roots
:
self
.
addrobot
(
root
)
for
url
in
self
.
bad
.
keys
():
self
.
markerror
(
url
)
def
addroot
(
self
,
root
):
if
root
not
in
self
.
roots
:
troot
=
root
scheme
,
netloc
,
path
,
params
,
query
,
fragment
=
\
urlparse
.
urlparse
(
root
)
i
=
string
.
rfind
(
path
,
"/"
)
+
1
if
0
<
i
<
len
(
path
):
path
=
path
[:
i
]
troot
=
urlparse
.
urlunparse
((
scheme
,
netloc
,
path
,
params
,
query
,
fragment
))
self
.
roots
.
append
(
troot
)
self
.
addrobot
(
root
)
self
.
newlink
(
root
,
(
"<root>"
,
root
))
if
root
not
in
self
.
roots
:
troot
=
root
scheme
,
netloc
,
path
,
params
,
query
,
fragment
=
\
urlparse
.
urlparse
(
root
)
i
=
string
.
rfind
(
path
,
"/"
)
+
1
if
0
<
i
<
len
(
path
):
path
=
path
[:
i
]
troot
=
urlparse
.
urlunparse
((
scheme
,
netloc
,
path
,
params
,
query
,
fragment
))
self
.
roots
.
append
(
troot
)
self
.
addrobot
(
root
)
self
.
newlink
(
root
,
(
"<root>"
,
root
))
def
addrobot
(
self
,
root
):
root
=
urlparse
.
urljoin
(
root
,
"/"
)
if
self
.
robots
.
has_key
(
root
):
return
url
=
urlparse
.
urljoin
(
root
,
"/robots.txt"
)
self
.
robots
[
root
]
=
rp
=
robotparser
.
RobotFileParser
()
if
self
.
verbose
>
2
:
print
"Parsing"
,
url
rp
.
debug
=
self
.
verbose
>
3
rp
.
set_url
(
url
)
try
:
rp
.
read
()
except
IOError
,
msg
:
if
self
.
verbose
>
1
:
print
"I/O error parsing"
,
url
,
":"
,
msg
root
=
urlparse
.
urljoin
(
root
,
"/"
)
if
self
.
robots
.
has_key
(
root
):
return
url
=
urlparse
.
urljoin
(
root
,
"/robots.txt"
)
self
.
robots
[
root
]
=
rp
=
robotparser
.
RobotFileParser
()
if
self
.
verbose
>
2
:
print
"Parsing"
,
url
rp
.
debug
=
self
.
verbose
>
3
rp
.
set_url
(
url
)
try
:
rp
.
read
()
except
IOError
,
msg
:
if
self
.
verbose
>
1
:
print
"I/O error parsing"
,
url
,
":"
,
msg
def
run
(
self
):
while
self
.
todo
:
self
.
round
=
self
.
round
+
1
if
self
.
verbose
>
0
:
print
print
"Round
%
d (
%
s)"
%
(
self
.
round
,
self
.
status
())
print
urls
=
self
.
todo
.
keys
()[:
self
.
roundsize
]
for
url
in
urls
:
self
.
dopage
(
url
)
while
self
.
todo
:
self
.
round
=
self
.
round
+
1
if
self
.
verbose
>
0
:
print
print
"Round
%
d (
%
s)"
%
(
self
.
round
,
self
.
status
())
print
urls
=
self
.
todo
.
keys
()[:
self
.
roundsize
]
for
url
in
urls
:
self
.
dopage
(
url
)
def
status
(
self
):
return
"
%
d total,
%
d to do,
%
d done,
%
d bad"
%
(
len
(
self
.
todo
)
+
len
(
self
.
done
),
len
(
self
.
todo
),
len
(
self
.
done
),
len
(
self
.
bad
))
return
"
%
d total,
%
d to do,
%
d done,
%
d bad"
%
(
len
(
self
.
todo
)
+
len
(
self
.
done
),
len
(
self
.
todo
),
len
(
self
.
done
),
len
(
self
.
bad
))
def
report
(
self
):
print
if
not
self
.
todo
:
print
"Final"
,
else
:
print
"Interim"
,
print
"Report (
%
s)"
%
self
.
status
()
self
.
report_errors
()
print
if
not
self
.
todo
:
print
"Final"
,
else
:
print
"Interim"
,
print
"Report (
%
s)"
%
self
.
status
()
self
.
report_errors
()
def
report_errors
(
self
):
if
not
self
.
bad
:
print
print
"No errors"
return
print
print
"Error Report:"
sources
=
self
.
errors
.
keys
()
sources
.
sort
()
for
source
in
sources
:
triples
=
self
.
errors
[
source
]
print
if
len
(
triples
)
>
1
:
print
len
(
triples
),
"Errors in"
,
source
else
:
print
"Error in"
,
source
for
url
,
rawlink
,
msg
in
triples
:
print
" HREF"
,
url
,
if
rawlink
!=
url
:
print
"(
%
s)"
%
rawlink
,
print
print
" msg"
,
msg
if
not
self
.
bad
:
print
print
"No errors"
return
print
print
"Error Report:"
sources
=
self
.
errors
.
keys
()
sources
.
sort
()
for
source
in
sources
:
triples
=
self
.
errors
[
source
]
print
if
len
(
triples
)
>
1
:
print
len
(
triples
),
"Errors in"
,
source
else
:
print
"Error in"
,
source
for
url
,
rawlink
,
msg
in
triples
:
print
" HREF"
,
url
,
if
rawlink
!=
url
:
print
"(
%
s)"
%
rawlink
,
print
print
" msg"
,
msg
def
dopage
(
self
,
url
):
if
self
.
verbose
>
1
:
if
self
.
verbose
>
2
:
self
.
show
(
"Check "
,
url
,
" from"
,
self
.
todo
[
url
])
else
:
print
"Check "
,
url
page
=
self
.
getpage
(
url
)
if
page
:
for
info
in
page
.
getlinkinfos
():
link
,
rawlink
=
info
origin
=
url
,
rawlink
self
.
newlink
(
link
,
origin
)
self
.
markdone
(
url
)
if
self
.
verbose
>
1
:
if
self
.
verbose
>
2
:
self
.
show
(
"Check "
,
url
,
" from"
,
self
.
todo
[
url
])
else
:
print
"Check "
,
url
page
=
self
.
getpage
(
url
)
if
page
:
for
info
in
page
.
getlinkinfos
():
link
,
rawlink
=
info
origin
=
url
,
rawlink
self
.
newlink
(
link
,
origin
)
self
.
markdone
(
url
)
def
newlink
(
self
,
url
,
origin
):
if
self
.
done
.
has_key
(
url
):
self
.
newdonelink
(
url
,
origin
)
else
:
self
.
newtodolink
(
url
,
origin
)
if
self
.
done
.
has_key
(
url
):
self
.
newdonelink
(
url
,
origin
)
else
:
self
.
newtodolink
(
url
,
origin
)
def
newdonelink
(
self
,
url
,
origin
):
self
.
done
[
url
]
.
append
(
origin
)
if
self
.
verbose
>
3
:
print
" Done link"
,
url
self
.
done
[
url
]
.
append
(
origin
)
if
self
.
verbose
>
3
:
print
" Done link"
,
url
def
newtodolink
(
self
,
url
,
origin
):
if
self
.
todo
.
has_key
(
url
):
self
.
todo
[
url
]
.
append
(
origin
)
if
self
.
verbose
>
3
:
print
" Seen todo link"
,
url
else
:
self
.
todo
[
url
]
=
[
origin
]
if
self
.
verbose
>
3
:
print
" New todo link"
,
url
if
self
.
todo
.
has_key
(
url
):
self
.
todo
[
url
]
.
append
(
origin
)
if
self
.
verbose
>
3
:
print
" Seen todo link"
,
url
else
:
self
.
todo
[
url
]
=
[
origin
]
if
self
.
verbose
>
3
:
print
" New todo link"
,
url
def
markdone
(
self
,
url
):
self
.
done
[
url
]
=
self
.
todo
[
url
]
del
self
.
todo
[
url
]
self
.
changed
=
1
self
.
done
[
url
]
=
self
.
todo
[
url
]
del
self
.
todo
[
url
]
self
.
changed
=
1
def
inroots
(
self
,
url
):
for
root
in
self
.
roots
:
if
url
[:
len
(
root
)]
==
root
:
root
=
urlparse
.
urljoin
(
root
,
"/"
)
return
self
.
robots
[
root
]
.
can_fetch
(
AGENTNAME
,
url
)
return
0
for
root
in
self
.
roots
:
if
url
[:
len
(
root
)]
==
root
:
root
=
urlparse
.
urljoin
(
root
,
"/"
)
return
self
.
robots
[
root
]
.
can_fetch
(
AGENTNAME
,
url
)
return
0
def
getpage
(
self
,
url
):
if
url
[:
7
]
==
'mailto:'
or
url
[:
5
]
==
'news:'
:
if
self
.
verbose
>
1
:
print
" Not checking mailto/news URL"
return
None
isint
=
self
.
inroots
(
url
)
if
not
isint
:
if
not
self
.
checkext
:
if
self
.
verbose
>
1
:
print
" Not checking ext link"
return
None
f
=
self
.
openpage
(
url
)
if
f
:
self
.
safeclose
(
f
)
return
None
text
,
nurl
=
self
.
readhtml
(
url
)
if
nurl
!=
url
:
if
self
.
verbose
>
1
:
print
" Redirected to"
,
nurl
url
=
nurl
if
text
:
return
Page
(
text
,
url
,
verbose
=
self
.
verbose
,
maxpage
=
self
.
maxpage
)
if
url
[:
7
]
==
'mailto:'
or
url
[:
5
]
==
'news:'
:
if
self
.
verbose
>
1
:
print
" Not checking mailto/news URL"
return
None
isint
=
self
.
inroots
(
url
)
if
not
isint
:
if
not
self
.
checkext
:
if
self
.
verbose
>
1
:
print
" Not checking ext link"
return
None
f
=
self
.
openpage
(
url
)
if
f
:
self
.
safeclose
(
f
)
return
None
text
,
nurl
=
self
.
readhtml
(
url
)
if
nurl
!=
url
:
if
self
.
verbose
>
1
:
print
" Redirected to"
,
nurl
url
=
nurl
if
text
:
return
Page
(
text
,
url
,
verbose
=
self
.
verbose
,
maxpage
=
self
.
maxpage
)
def
readhtml
(
self
,
url
):
text
=
None
f
,
url
=
self
.
openhtml
(
url
)
if
f
:
text
=
f
.
read
()
f
.
close
()
return
text
,
url
text
=
None
f
,
url
=
self
.
openhtml
(
url
)
if
f
:
text
=
f
.
read
()
f
.
close
()
return
text
,
url
def
openhtml
(
self
,
url
):
f
=
self
.
openpage
(
url
)
if
f
:
url
=
f
.
geturl
()
info
=
f
.
info
()
if
not
self
.
checkforhtml
(
info
,
url
):
self
.
safeclose
(
f
)
f
=
None
return
f
,
url
f
=
self
.
openpage
(
url
)
if
f
:
url
=
f
.
geturl
()
info
=
f
.
info
()
if
not
self
.
checkforhtml
(
info
,
url
):
self
.
safeclose
(
f
)
f
=
None
return
f
,
url
def
openpage
(
self
,
url
):
try
:
return
self
.
urlopener
.
open
(
url
)
except
IOError
,
msg
:
msg
=
self
.
sanitize
(
msg
)
if
self
.
verbose
>
0
:
print
"Error "
,
msg
if
self
.
verbose
>
0
:
self
.
show
(
" HREF "
,
url
,
" from"
,
self
.
todo
[
url
])
self
.
setbad
(
url
,
msg
)
return
None
try
:
return
self
.
urlopener
.
open
(
url
)
except
IOError
,
msg
:
msg
=
self
.
sanitize
(
msg
)
if
self
.
verbose
>
0
:
print
"Error "
,
msg
if
self
.
verbose
>
0
:
self
.
show
(
" HREF "
,
url
,
" from"
,
self
.
todo
[
url
])
self
.
setbad
(
url
,
msg
)
return
None
def
checkforhtml
(
self
,
info
,
url
):
if
info
.
has_key
(
'content-type'
):
ctype
=
string
.
lower
(
info
[
'content-type'
])
else
:
if
url
[
-
1
:]
==
"/"
:
return
1
ctype
,
encoding
=
mimetypes
.
guess_type
(
url
)
if
ctype
==
'text/html'
:
return
1
else
:
if
self
.
verbose
>
1
:
print
" Not HTML, mime type"
,
ctype
return
0
if
info
.
has_key
(
'content-type'
):
ctype
=
string
.
lower
(
info
[
'content-type'
])
else
:
if
url
[
-
1
:]
==
"/"
:
return
1
ctype
,
encoding
=
mimetypes
.
guess_type
(
url
)
if
ctype
==
'text/html'
:
return
1
else
:
if
self
.
verbose
>
1
:
print
" Not HTML, mime type"
,
ctype
return
0
def
setgood
(
self
,
url
):
if
self
.
bad
.
has_key
(
url
):
del
self
.
bad
[
url
]
self
.
changed
=
1
if
self
.
verbose
>
0
:
print
"(Clear previously seen error)"
if
self
.
bad
.
has_key
(
url
):
del
self
.
bad
[
url
]
self
.
changed
=
1
if
self
.
verbose
>
0
:
print
"(Clear previously seen error)"
def
setbad
(
self
,
url
,
msg
):
if
self
.
bad
.
has_key
(
url
)
and
self
.
bad
[
url
]
==
msg
:
if
self
.
verbose
>
0
:
print
"(Seen this error before)"
return
self
.
bad
[
url
]
=
msg
self
.
changed
=
1
self
.
markerror
(
url
)
if
self
.
bad
.
has_key
(
url
)
and
self
.
bad
[
url
]
==
msg
:
if
self
.
verbose
>
0
:
print
"(Seen this error before)"
return
self
.
bad
[
url
]
=
msg
self
.
changed
=
1
self
.
markerror
(
url
)
def
markerror
(
self
,
url
):
try
:
origins
=
self
.
todo
[
url
]
except
KeyError
:
origins
=
self
.
done
[
url
]
for
source
,
rawlink
in
origins
:
triple
=
url
,
rawlink
,
self
.
bad
[
url
]
self
.
seterror
(
source
,
triple
)
try
:
origins
=
self
.
todo
[
url
]
except
KeyError
:
origins
=
self
.
done
[
url
]
for
source
,
rawlink
in
origins
:
triple
=
url
,
rawlink
,
self
.
bad
[
url
]
self
.
seterror
(
source
,
triple
)
def
seterror
(
self
,
url
,
triple
):
try
:
self
.
errors
[
url
]
.
append
(
triple
)
except
KeyError
:
self
.
errors
[
url
]
=
[
triple
]
try
:
self
.
errors
[
url
]
.
append
(
triple
)
except
KeyError
:
self
.
errors
[
url
]
=
[
triple
]
# The following used to be toplevel functions; they have been
# changed into methods so they can be overridden in subclasses.
def
show
(
self
,
p1
,
link
,
p2
,
origins
):
print
p1
,
link
i
=
0
for
source
,
rawlink
in
origins
:
i
=
i
+
1
if
i
==
2
:
p2
=
' '
*
len
(
p2
)
print
p2
,
source
,
if
rawlink
!=
link
:
print
"(
%
s)"
%
rawlink
,
print
print
p1
,
link
i
=
0
for
source
,
rawlink
in
origins
:
i
=
i
+
1
if
i
==
2
:
p2
=
' '
*
len
(
p2
)
print
p2
,
source
,
if
rawlink
!=
link
:
print
"(
%
s)"
%
rawlink
,
print
def
sanitize
(
self
,
msg
):
if
isinstance
(
IOError
,
ClassType
)
and
isinstance
(
msg
,
IOError
):
# Do the other branch recursively
msg
.
args
=
self
.
sanitize
(
msg
.
args
)
elif
isinstance
(
msg
,
TupleType
):
if
len
(
msg
)
>=
4
and
msg
[
0
]
==
'http error'
and
\
isinstance
(
msg
[
3
],
InstanceType
):
# Remove the Message instance -- it may contain
# a file object which prevents pickling.
msg
=
msg
[:
3
]
+
msg
[
4
:]
return
msg
if
isinstance
(
IOError
,
ClassType
)
and
isinstance
(
msg
,
IOError
):
# Do the other branch recursively
msg
.
args
=
self
.
sanitize
(
msg
.
args
)
elif
isinstance
(
msg
,
TupleType
):
if
len
(
msg
)
>=
4
and
msg
[
0
]
==
'http error'
and
\
isinstance
(
msg
[
3
],
InstanceType
):
# Remove the Message instance -- it may contain
# a file object which prevents pickling.
msg
=
msg
[:
3
]
+
msg
[
4
:]
return
msg
def
safeclose
(
self
,
f
):
try
:
url
=
f
.
geturl
()
except
AttributeError
:
pass
else
:
if
url
[:
4
]
==
'ftp:'
or
url
[:
7
]
==
'file://'
:
# Apparently ftp connections don't like to be closed
# prematurely...
text
=
f
.
read
()
f
.
close
()
try
:
url
=
f
.
geturl
()
except
AttributeError
:
pass
else
:
if
url
[:
4
]
==
'ftp:'
or
url
[:
7
]
==
'file://'
:
# Apparently ftp connections don't like to be closed
# prematurely...
text
=
f
.
read
()
f
.
close
()
def
save_pickle
(
self
,
dumpfile
=
DUMPFILE
):
if
not
self
.
changed
:
if
self
.
verbose
>
0
:
print
print
"No need to save checkpoint"
elif
not
dumpfile
:
if
self
.
verbose
>
0
:
print
"No dumpfile, won't save checkpoint"
else
:
if
self
.
verbose
>
0
:
print
print
"Saving checkpoint to
%
s ..."
%
dumpfile
newfile
=
dumpfile
+
".new"
f
=
open
(
newfile
,
"wb"
)
pickle
.
dump
(
self
,
f
)
f
.
close
()
try
:
os
.
unlink
(
dumpfile
)
except
os
.
error
:
pass
os
.
rename
(
newfile
,
dumpfile
)
if
self
.
verbose
>
0
:
print
"Done."
return
1
if
not
self
.
changed
:
if
self
.
verbose
>
0
:
print
print
"No need to save checkpoint"
elif
not
dumpfile
:
if
self
.
verbose
>
0
:
print
"No dumpfile, won't save checkpoint"
else
:
if
self
.
verbose
>
0
:
print
print
"Saving checkpoint to
%
s ..."
%
dumpfile
newfile
=
dumpfile
+
".new"
f
=
open
(
newfile
,
"wb"
)
pickle
.
dump
(
self
,
f
)
f
.
close
()
try
:
os
.
unlink
(
dumpfile
)
except
os
.
error
:
pass
os
.
rename
(
newfile
,
dumpfile
)
if
self
.
verbose
>
0
:
print
"Done."
return
1
class
Page
:
def
__init__
(
self
,
text
,
url
,
verbose
=
VERBOSE
,
maxpage
=
MAXPAGE
):
self
.
text
=
text
self
.
url
=
url
self
.
verbose
=
verbose
self
.
maxpage
=
maxpage
self
.
text
=
text
self
.
url
=
url
self
.
verbose
=
verbose
self
.
maxpage
=
maxpage
def
getlinkinfos
(
self
):
size
=
len
(
self
.
text
)
if
size
>
self
.
maxpage
:
if
self
.
verbose
>
0
:
print
"Skip huge file"
,
self
.
url
print
" (
%.0
f Kbytes)"
%
(
size
*
0.001
)
return
[]
if
self
.
verbose
>
2
:
print
" Parsing"
,
self
.
url
,
"(
%
d bytes)"
%
size
parser
=
MyHTMLParser
(
verbose
=
self
.
verbose
)
parser
.
feed
(
self
.
text
)
parser
.
close
()
rawlinks
=
parser
.
getlinks
()
base
=
urlparse
.
urljoin
(
self
.
url
,
parser
.
getbase
()
or
""
)
infos
=
[]
for
rawlink
in
rawlinks
:
t
=
urlparse
.
urlparse
(
rawlink
)
t
=
t
[:
-
1
]
+
(
''
,)
rawlink
=
urlparse
.
urlunparse
(
t
)
link
=
urlparse
.
urljoin
(
base
,
rawlink
)
infos
.
append
((
link
,
rawlink
))
return
infos
size
=
len
(
self
.
text
)
if
size
>
self
.
maxpage
:
if
self
.
verbose
>
0
:
print
"Skip huge file"
,
self
.
url
print
" (
%.0
f Kbytes)"
%
(
size
*
0.001
)
return
[]
if
self
.
verbose
>
2
:
print
" Parsing"
,
self
.
url
,
"(
%
d bytes)"
%
size
parser
=
MyHTMLParser
(
verbose
=
self
.
verbose
)
parser
.
feed
(
self
.
text
)
parser
.
close
()
rawlinks
=
parser
.
getlinks
()
base
=
urlparse
.
urljoin
(
self
.
url
,
parser
.
getbase
()
or
""
)
infos
=
[]
for
rawlink
in
rawlinks
:
t
=
urlparse
.
urlparse
(
rawlink
)
t
=
t
[:
-
1
]
+
(
''
,)
rawlink
=
urlparse
.
urlunparse
(
t
)
link
=
urlparse
.
urljoin
(
base
,
rawlink
)
infos
.
append
((
link
,
rawlink
))
return
infos
class
MyStringIO
(
StringIO
.
StringIO
):
def
__init__
(
self
,
url
,
info
):
self
.
__url
=
url
self
.
__info
=
info
StringIO
.
StringIO
.
__init__
(
self
)
self
.
__url
=
url
self
.
__info
=
info
StringIO
.
StringIO
.
__init__
(
self
)
def
info
(
self
):
return
self
.
__info
return
self
.
__info
def
geturl
(
self
):
return
self
.
__url
return
self
.
__url
class
MyURLopener
(
urllib
.
FancyURLopener
):
...
...
@@ -590,81 +590,81 @@ class MyURLopener(urllib.FancyURLopener):
http_error_default
=
urllib
.
URLopener
.
http_error_default
def
__init__
(
*
args
):
self
=
args
[
0
]
apply
(
urllib
.
FancyURLopener
.
__init__
,
args
)
self
.
addheaders
=
[
(
'User-agent'
,
'Python-webchecker/
%
s'
%
__version__
),
]
self
=
args
[
0
]
apply
(
urllib
.
FancyURLopener
.
__init__
,
args
)
self
.
addheaders
=
[
(
'User-agent'
,
'Python-webchecker/
%
s'
%
__version__
),
]
def
http_error_401
(
self
,
url
,
fp
,
errcode
,
errmsg
,
headers
):
return
None
def
open_file
(
self
,
url
):
path
=
urllib
.
url2pathname
(
urllib
.
unquote
(
url
))
if
path
[
-
1
]
!=
os
.
sep
:
url
=
url
+
'/'
if
os
.
path
.
isdir
(
path
):
indexpath
=
os
.
path
.
join
(
path
,
"index.html"
)
if
os
.
path
.
exists
(
indexpath
):
return
self
.
open_file
(
url
+
"index.html"
)
try
:
names
=
os
.
listdir
(
path
)
except
os
.
error
,
msg
:
raise
IOError
,
msg
,
sys
.
exc_traceback
names
.
sort
()
s
=
MyStringIO
(
"file:"
+
url
,
{
'content-type'
:
'text/html'
})
s
.
write
(
'<BASE HREF="file:
%
s">
\n
'
%
urllib
.
quote
(
os
.
path
.
join
(
path
,
""
)))
for
name
in
names
:
q
=
urllib
.
quote
(
name
)
s
.
write
(
'<A HREF="
%
s">
%
s</A>
\n
'
%
(
q
,
q
))
s
.
seek
(
0
)
return
s
return
urllib
.
FancyURLopener
.
open_file
(
self
,
path
)
path
=
urllib
.
url2pathname
(
urllib
.
unquote
(
url
))
if
path
[
-
1
]
!=
os
.
sep
:
url
=
url
+
'/'
if
os
.
path
.
isdir
(
path
):
indexpath
=
os
.
path
.
join
(
path
,
"index.html"
)
if
os
.
path
.
exists
(
indexpath
):
return
self
.
open_file
(
url
+
"index.html"
)
try
:
names
=
os
.
listdir
(
path
)
except
os
.
error
,
msg
:
raise
IOError
,
msg
,
sys
.
exc_traceback
names
.
sort
()
s
=
MyStringIO
(
"file:"
+
url
,
{
'content-type'
:
'text/html'
})
s
.
write
(
'<BASE HREF="file:
%
s">
\n
'
%
urllib
.
quote
(
os
.
path
.
join
(
path
,
""
)))
for
name
in
names
:
q
=
urllib
.
quote
(
name
)
s
.
write
(
'<A HREF="
%
s">
%
s</A>
\n
'
%
(
q
,
q
))
s
.
seek
(
0
)
return
s
return
urllib
.
FancyURLopener
.
open_file
(
self
,
path
)
class
MyHTMLParser
(
sgmllib
.
SGMLParser
):
def
__init__
(
self
,
verbose
=
VERBOSE
):
self
.
base
=
None
self
.
links
=
{}
self
.
myverbose
=
verbose
sgmllib
.
SGMLParser
.
__init__
(
self
)
self
.
base
=
None
self
.
links
=
{}
self
.
myverbose
=
verbose
sgmllib
.
SGMLParser
.
__init__
(
self
)
def
start_a
(
self
,
attributes
):
self
.
link_attr
(
attributes
,
'href'
)
self
.
link_attr
(
attributes
,
'href'
)
def
end_a
(
self
):
pass
def
do_area
(
self
,
attributes
):
self
.
link_attr
(
attributes
,
'href'
)
self
.
link_attr
(
attributes
,
'href'
)
def
do_img
(
self
,
attributes
):
self
.
link_attr
(
attributes
,
'src'
,
'lowsrc'
)
self
.
link_attr
(
attributes
,
'src'
,
'lowsrc'
)
def
do_frame
(
self
,
attributes
):
self
.
link_attr
(
attributes
,
'src'
)
self
.
link_attr
(
attributes
,
'src'
)
def
link_attr
(
self
,
attributes
,
*
args
):
for
name
,
value
in
attributes
:
if
name
in
args
:
if
value
:
value
=
string
.
strip
(
value
)
if
value
:
self
.
links
[
value
]
=
None
for
name
,
value
in
attributes
:
if
name
in
args
:
if
value
:
value
=
string
.
strip
(
value
)
if
value
:
self
.
links
[
value
]
=
None
def
do_base
(
self
,
attributes
):
for
name
,
value
in
attributes
:
if
name
==
'href'
:
if
value
:
value
=
string
.
strip
(
value
)
if
value
:
if
self
.
myverbose
>
1
:
print
" Base"
,
value
self
.
base
=
value
for
name
,
value
in
attributes
:
if
name
==
'href'
:
if
value
:
value
=
string
.
strip
(
value
)
if
value
:
if
self
.
myverbose
>
1
:
print
" Base"
,
value
self
.
base
=
value
def
getlinks
(
self
):
return
self
.
links
.
keys
()
return
self
.
links
.
keys
()
def
getbase
(
self
):
return
self
.
base
return
self
.
base
if
__name__
==
'__main__'
:
...
...
Tools/webchecker/websucker.py
Dosyayı görüntüle @
986abac1
...
...
@@ -16,29 +16,29 @@ import webchecker
if
__version__
[
0
]
==
'$'
:
_v
=
string
.
split
(
__version__
)
if
len
(
_v
)
==
3
:
__version__
=
_v
[
1
]
__version__
=
_v
[
1
]
def
main
():
verbose
=
webchecker
.
VERBOSE
try
:
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
"qv"
)
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
"qv"
)
except
getopt
.
error
,
msg
:
print
msg
print
"usage:"
,
sys
.
argv
[
0
],
"[-qv] ... [rooturl] ..."
return
2
print
msg
print
"usage:"
,
sys
.
argv
[
0
],
"[-qv] ... [rooturl] ..."
return
2
for
o
,
a
in
opts
:
if
o
==
"-q"
:
verbose
=
0
if
o
==
"-v"
:
verbose
=
verbose
+
1
if
o
==
"-q"
:
verbose
=
0
if
o
==
"-v"
:
verbose
=
verbose
+
1
c
=
Sucker
()
c
.
setflags
(
verbose
=
verbose
)
c
.
urlopener
.
addheaders
=
[
(
'User-agent'
,
'websucker/
%
s'
%
__version__
),
]
(
'User-agent'
,
'websucker/
%
s'
%
__version__
),
]
for
arg
in
args
:
print
"Adding root"
,
arg
c
.
addroot
(
arg
)
print
"Adding root"
,
arg
c
.
addroot
(
arg
)
print
"Run..."
c
.
run
()
...
...
@@ -47,57 +47,57 @@ class Sucker(webchecker.Checker):
checkext
=
0
def
readhtml
(
self
,
url
):
text
=
None
path
=
self
.
savefilename
(
url
)
try
:
f
=
open
(
path
,
"rb"
)
except
IOError
:
f
=
self
.
openpage
(
url
)
if
f
:
info
=
f
.
info
()
nurl
=
f
.
geturl
()
if
nurl
!=
url
:
url
=
nurl
path
=
self
.
savefilename
(
url
)
text
=
f
.
read
()
f
.
close
()
self
.
savefile
(
text
,
path
)
if
not
self
.
checkforhtml
(
info
,
url
):
text
=
None
else
:
if
self
.
checkforhtml
({},
url
):
text
=
f
.
read
()
f
.
close
()
return
text
,
url
text
=
None
path
=
self
.
savefilename
(
url
)
try
:
f
=
open
(
path
,
"rb"
)
except
IOError
:
f
=
self
.
openpage
(
url
)
if
f
:
info
=
f
.
info
()
nurl
=
f
.
geturl
()
if
nurl
!=
url
:
url
=
nurl
path
=
self
.
savefilename
(
url
)
text
=
f
.
read
()
f
.
close
()
self
.
savefile
(
text
,
path
)
if
not
self
.
checkforhtml
(
info
,
url
):
text
=
None
else
:
if
self
.
checkforhtml
({},
url
):
text
=
f
.
read
()
f
.
close
()
return
text
,
url
def
savefile
(
self
,
text
,
path
):
dir
,
base
=
os
.
path
.
split
(
path
)
makedirs
(
dir
)
f
=
open
(
path
,
"wb"
)
f
.
write
(
text
)
f
.
close
()
print
"saved"
,
path
dir
,
base
=
os
.
path
.
split
(
path
)
makedirs
(
dir
)
f
=
open
(
path
,
"wb"
)
f
.
write
(
text
)
f
.
close
()
print
"saved"
,
path
def
savefilename
(
self
,
url
):
type
,
rest
=
urllib
.
splittype
(
url
)
host
,
path
=
urllib
.
splithost
(
rest
)
while
path
[:
1
]
==
"/"
:
path
=
path
[
1
:]
user
,
host
=
urllib
.
splituser
(
host
)
host
,
port
=
urllib
.
splitnport
(
host
)
host
=
string
.
lower
(
host
)
path
=
os
.
path
.
join
(
host
,
path
)
if
path
[
-
1
]
==
"/"
:
path
=
path
+
"index.html"
if
os
.
sep
!=
"/"
:
path
=
string
.
join
(
string
.
split
(
path
,
"/"
),
os
.
sep
)
return
path
type
,
rest
=
urllib
.
splittype
(
url
)
host
,
path
=
urllib
.
splithost
(
rest
)
while
path
[:
1
]
==
"/"
:
path
=
path
[
1
:]
user
,
host
=
urllib
.
splituser
(
host
)
host
,
port
=
urllib
.
splitnport
(
host
)
host
=
string
.
lower
(
host
)
path
=
os
.
path
.
join
(
host
,
path
)
if
path
[
-
1
]
==
"/"
:
path
=
path
+
"index.html"
if
os
.
sep
!=
"/"
:
path
=
string
.
join
(
string
.
split
(
path
,
"/"
),
os
.
sep
)
return
path
def
makedirs
(
dir
):
if
not
dir
or
os
.
path
.
exists
(
dir
):
return
return
head
,
tail
=
os
.
path
.
split
(
dir
)
if
not
tail
:
print
"Huh? Don't know how to make dir"
,
dir
return
print
"Huh? Don't know how to make dir"
,
dir
return
makedirs
(
head
)
os
.
mkdir
(
dir
,
0777
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment