""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import time
import urlparse
import urllib

__all__ = ["RobotFileParser"]


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        self.last_checked = time.time()
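
    # A long-running crawler might pair mtime() with modified() to
    # refresh stale rules; a rough sketch (the one-hour window is an
    # arbitrary choice, not part of the protocol):
    #
    #   if time.time() - rp.mtime() > 3600:
    #       rp.read()
    #       rp.modified()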

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            # fetching robots.txt itself is forbidden:
            # treat every path as disallowed
            self.disallow_all = True
        elif self.errcode >= 400:
            # robots.txt is absent or unreadable:
            # treat every path as allowed
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            # flush the final entry; _add_entry routes a catch-all "*"
            # entry to default_entry instead of the entries list
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        # reduce the URL to its quoted path component; an empty path
        # means the site root
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is the URL-quoted path of the request (both the
          rule paths and the filename are compared in quoted form)"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True

class URLopener(urllib.FancyURLopener):
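    # Helper opener used by RobotFileParser.read(): instead of raising
    # on HTTP errors, it records the status code in self.errcode.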
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
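

if __name__ == '__main__':
    # Self-contained demo: feed parse() a robots.txt body directly
    # instead of fetching one over HTTP.  The rules and the agent name
    # are made up for illustration; note that rule order matters, since
    # the first matching line wins.
    rp = RobotFileParser()
    rp.parse([
        "User-agent: *",
        "Allow: /private/public.html",
        "Disallow: /private/",
    ])
    print rp.can_fetch("ExampleBot/1.0",
                       "http://www.example.com/private/page.html")    # False
    print rp.can_fetch("ExampleBot/1.0",
                       "http://www.example.com/private/public.html")  # True
    print rp.can_fetch("ExampleBot/1.0",
                       "http://www.example.com/index.html")           # True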