""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import urlparse,urllib

__all__ = ["RobotFileParser"]
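
# Typical usage, as a minimal sketch (the URL and agent name are
# illustrative placeholders, not values this module depends on):
#
#     rp = RobotFileParser()
#     rp.set_url('http://example.com/robots.txt')
#     rp.read()
#     if rp.can_fetch('MyCrawler/1.0', 'http://example.com/some/page'):
#         pass  # safe to fetch the page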

debug = 0

def _debug(msg):
    if debug: print msg


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()
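
    # A long-running spider might recheck the file periodically, e.g.
    # (a sketch; the one-hour interval is an arbitrary choice, not part
    # of this module):
    #
    #     if time.time() - rp.mtime() > 3600:
    #         rp.read()
    #         rp.modified()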

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = []
        line = f.readline()
        while line:
            lines.append(line.strip())
            line = f.readline()
        self.errcode = opener.errcode
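        # 401/403 mean access to robots.txt itself is restricted: be
        # conservative and disallow everything; any other 4xx/5xx means
        # no usable robots.txt was found, so everything is allowed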
        if self.errcode == 401 or self.errcode == 403:
            self.disallow_all = True
            _debug("disallow all")
        elif self.errcode >= 400:
            self.allow_all = True
            _debug("allow all")
        elif self.errcode == 200 and lines:
            _debug("parse lines")
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
           A user-agent: line need not be preceded by one or more
           blank lines."""
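        # states:
        #   0: start state
        #   1: saw a user-agent: line
        #   2: saw an allow: or disallow: line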
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber = linenumber + 1
            if not line:
                if state==1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state==2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i>=0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state==2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], True))
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                               line[0]))
            else:
                _debug("line %d: error: malformed line %s"%(linenumber, line))
        if state==2:
            self._add_entry(entry)
        _debug("Parsed rules:\n%s" % str(self))


    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        _debug("Checking robots.txt allowance for:\n  user agent: %s\n  url: %s" %
               (useragent, url))
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
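        # keep only the (re-quoted) path component of the URL;
        # an empty path is treated as "/"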
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True


    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
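    # e.g. RuleLine("/cgi-bin/", False) stands for "Disallow: /cgi-bin/"
    # (the path here is an illustrative example)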
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path=="*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow")+": "+self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = ""
        for agent in self.useragents:
            ret = ret + "User-agent: "+agent+"\n"
        for line in self.rulelines:
            ret = ret + str(line) + "\n"
        return ret

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent=='*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
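            # a plain substring match suffices: a robots.txt token such as
            # "CherryPicker" also applies to the agent name "CherryPickerSE"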
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                return line.allowance
        return True

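# FancyURLopener subclass that records the HTTP status code of the last
# fetch, so read() can tell a missing robots.txt from a forbidden one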
class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)

def _check(a,b):
    if not b:
        ac = "access denied"
    else:
        ac = "access allowed"
    if a!=b:
        print "failed"
    else:
        print "ok (%s)" % ac
    print

def _test():
    global debug
    rp = RobotFileParser()
    debug = 1

    # robots.txt that exists, reached via redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()

    # a literal '*' agent name must be handled safely (this originally
    # exercised re.escape in a regex-based implementation)
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers
    _check(rp.can_fetch('CherryPickerSE',
                       'http://www.musi-cal.com/cgi-bin/event-search'
                       '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.0',
                       'http://www.musi-cal.com/cgi-bin/event-search'
                       '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.5',
                       'http://www.musi-cal.com/cgi-bin/event-search'
                       '?city=San+Francisco'), 0)
    # case sensitivity
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)

    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)

if __name__ == '__main__':
    _test()