"""

Robots.txt file parser class.  Accepts a list of lines or a robots.txt URL
as input, builds a set of rules from that list, and then answers questions
about the fetchability of other URLs.

"""

class RobotFileParser:

    def __init__(self):
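        # rules maps a user-agent name to a list of compiled disallow
        # patterns; last_checked records when robots.txt was last read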
        self.rules = {}
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def mtime(self):
        return self.last_checked

    def modified(self):
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        self.url = url
##      import urlmisc
##      self.url = urlmisc.canonical_url(url)

    def read(self):
        import urllib
        self.parse(urllib.urlopen(self.url).readlines())

    def parse(self, lines):
        import regsub, string, regex
        active = []
        for line in lines:
            if self.debug: print '>', line,
            # blank line terminates current record
            if not line[:-1]:
                active = []
                continue
            # remove optional comment and strip line
            i = string.find(line, '#')
            if i >= 0:
                line = line[:i]
            line = string.strip(line)
            if not line:
                continue
            line = regsub.split(line, ' *: *')
            if len(line) == 2:
                line[0] = string.lower(line[0])
                if line[0] == 'user-agent':
                    # this record applies to this user agent
                    if self.debug: print '>> user-agent:', line[1]
                    active.append(line[1])
                    if not self.rules.has_key(line[1]):
                        self.rules[line[1]] = []
                elif line[0] == 'disallow':
                    if line[1]:
                        if self.debug: print '>> disallow:', line[1]
                        # the disallowed path is stored as a compiled
                        # pattern and matched against URL paths
                        for agent in active:
                            self.rules[agent].append(regex.compile(line[1]))
                    else:
                        # an empty disallow value means the active
                        # agents may fetch anything
                        for agent in active:
                            if self.debug: print '>> allow', agent
                            self.rules[agent] = []
                else:
                    if self.debug: print '>> unknown:', line

        self.modified()

    # returns true if agent is allowed to fetch url
    def can_fetch(self, agent, url):
        import urlparse
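        # fall back to the wildcard '*' record when there is no record
        # for this agent; with neither, everything is allowed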
        ag = agent
        if not self.rules.has_key(ag): ag = '*'
        if not self.rules.has_key(ag):
            if self.debug: print '>> allowing', url, 'fetch by', agent
            return 1
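        # match only the path component of the URL against the
        # compiled disallow patterns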
        path = urlparse.urlparse(url)[2]
        for rule in self.rules[ag]:
            if rule.match(path) != -1:
                if self.debug: print '>> disallowing', url, 'fetch by', agent
                return 0
        if self.debug: print '>> allowing', url, 'fetch by', agent
        return 1

def test():
    rp = RobotFileParser()
    rp.debug = 1
    rp.set_url('http://www.automatrix.com/robots.txt')
    rp.read()
    print rp.rules
    print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
    print rp.can_fetch('Musi-Cal-Robot',
                       'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')

    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')