# Copyright (C) 2002-2007 Python Software Foundation
# Contact: email-sig@python.org

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""

# Names exported by ``from <this module> import *``.
__all__ = ['mktime_tz', 'parsedate', 'parsedate_tz', 'quote']

import calendar
import time

# Separator strings used with str.join() when assembling parsed fragments.
EMPTYSTRING = ''
SPACE = ' '
COMMASPACE = ', '

# Parse a date field
# Month names: the twelve abbreviations followed by the twelve full names.
# parsedate_tz() takes the list index + 1 and subtracts 12 when it exceeds
# 12, so both spellings of a month map to the same 1-based month number.
_monthnames = (
    ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
     'jul', 'aug', 'sep', 'oct', 'nov', 'dec'] +
    ['january', 'february', 'march', 'april', 'may', 'june',
     'july', 'august', 'september', 'october', 'november', 'december'])

# Abbreviated day names, used only to recognize (and skip) a leading
# day-of-week token.
_daynames = 'mon tue wed thu fri sat sun'.split()

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.
#
# Values are signed offsets written as hours * 100 + minutes (-500 means
# five hours behind UTC), the same form as a numeric "-0500" zone.
_timezones = {
    'UT': 0,
    'UTC': 0,
    'GMT': 0,
    'Z': 0,
    # Atlantic (used in Canada)
    'AST': -400,
    'ADT': -300,
    # Eastern
    'EST': -500,
    'EDT': -400,
    # Central
    'CST': -600,
    'CDT': -500,
    # Mountain
    'MST': -700,
    'MDT': -600,
    # Pacific
    'PST': -800,
    'PDT': -700,
    }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple, or return None if unparsable.

    On success the result is

        (year, month, day, hour, minute, second, 0, 1, -1, tzoffset)

    The first nine items are laid out like a time tuple; the weekday,
    yearday and DST slots are always the constants 0, 1 and -1 (DST
    unknown).  `tzoffset' is the zone's offset from UTC in seconds, or None
    when no zone name or numeric zone could be recognized.

    Zone names are looked up in `_timezones', which of the military zones
    only contains 'Z'.  Empty, whitespace-only or None input yields None.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input leaves nothing to parse.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time ("12:00:00+0100" or
        # "12:00:00-0500"); split it off, keeping the sign so that int()
        # below yields the correctly signed offset.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    mm = mm.lower()
    if mm not in _monthnames:
        # Accept "month day" order as well as "day month".
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names occupy the second half of the table.
        mm -= 12
    # endswith()/slicing are used instead of indexing so that empty fields
    # (possible after the RFC 850 '-' split) fall through to the int()
    # conversion below and produce None rather than an IndexError.
    if dd.endswith(','):
        dd = dd[:-1]
    if yy.find(':') > 0:
        # The time came before the year; swap them.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy[:1].isdigit():
        # The zone came before the year; swap them.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            # Unknown zone: leave tzoffset as None.
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset


def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    This is parsedate_tz() with the trailing timezone offset dropped.  When
    parsedate_tz() does not return a tuple (it returns None for input it
    cannot parse), that value is passed through unchanged.
    """
    parsed = parsedate_tz(data)
    if not isinstance(parsed, tuple):
        return parsed
    return parsed[:9]


def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    `data[9]' is the zone offset in seconds east of UTC, or None.

    With no zone information the fields are interpreted as local time via
    time.mktime().  Otherwise they are converted with calendar.timegm(),
    which treats them as UTC, and the zone offset is then subtracted; the
    result therefore does not depend on the local timezone of the machine.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(tuple(data[:8]) + (-1,))
    # Going through time.mktime() and correcting with time.timezone gives
    # wrong answers whenever the local UTC offset on that date differs from
    # time.timezone, so do the arithmetic in UTC directly.
    return calendar.timegm(data) - data[9]


def quote(str):
    """Backslash-escape the backslashes and double quotes in a string.

    Backslashes are doubled first so that the backslashes inserted in front
    of double quotes are not themselves escaped again.  No surrounding quote
    characters are added.
    """
    escaped = str.replace('\\', '\\\\')
    return escaped.replace('"', '\\"')


class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser is a hand-written scanner: `self.pos' is the index of the next
    unread character of `self.field', and every get*() method consumes input
    by advancing it.  Comments encountered along the way are collected in
    `self.commentlist'.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        # Characters that terminate an atom.
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; any comment skipped over is
        appended to `self.commentlist'.
        """
        skippable = self.LWS + '\n\r'
        while self.pos < len(self.field):
            char = self.field[self.pos]
            if char in skippable:
                self.pos += 1
            elif char == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each a
        (name, address) pair.  A stretch of input that yields no address
        contributes an ('', '') placeholder.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) pairs: empty when nothing usable
        was found, one pair for a single mailbox, and possibly several for a
        group.  A trailing ',' separator is consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group: collect members up to the closing ';'
            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < fieldlen:
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None when the parser is not positioned at a '<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Discard the routing domain that followed an '@'.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addr-spec
                # (normally the closing '>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the local part alone when no '@' follows it.  Returns the
        empty string when an '@' is present but no valid domain follows
        (nothing after the '@', or a domain containing a second '@'), so
        that a malformed address is never reported as a truncated but
        plausible-looking one.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            char = self.field[self.pos]
            if char == '.':
                aslist.append('.')
                self.pos += 1
            elif char == '"':
                aslist.append('"%s"' % self.getquote())
            elif char in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string if an '@' is met while scanning, since a
        domain cannot contain one; the parser is left positioned on that
        '@'.
        """
        sdlist = []
        while self.pos < len(self.field):
            char = self.field[self.pos]
            if char in self.LWS:
                self.pos += 1
            elif char == '(':
                self.commentlist.append(self.getcomment())
            elif char == '[':
                sdlist.append(self.getdomainliteral())
            elif char == '.':
                self.pos += 1
                sdlist.append('.')
            elif char == '@':
                # Don't parse domains with two `@' like
                # `a@malicious.org@important.com'.
                return ''
            elif char in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' (including when
        the input is exhausted) then getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Backslash escapes are resolved: the backslash is dropped and the
        character after it is taken literally.
        """
        if self.pos >= len(self.field) or self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            char = self.field[self.pos]
            if escaped:
                slist.append(char)
                escaped = False
            elif char in endchars:
                self.pos += 1
                break
            elif allowcomments and char == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif char == '\\':
                escaped = True
            else:
                slist.append(char)
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments met between words go to
        `self.commentlist'.
        """
        plist = []

        while self.pos < len(self.field):
            char = self.field[self.pos]
            if char in self.FWS:
                self.pos += 1
            elif char == '"':
                plist.append(self.getquote())
            elif char == '(':
                self.commentlist.append(self.getcomment())
            elif char in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist

class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed addresses are kept in `self.addresslist'.  The +, -, += and
    -= operators combine two instances with set-like union and difference
    semantics, based on membership tests against the underlying lists.
    """

    def __init__(self, field):
        """Parse `field'; a false value (None, '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        if not field:
            self.addresslist = []
        else:
            self.addresslist = self.getaddrlist()

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union: everything in self, then whatever other has that self
        # lacks.
        union = AddressList(None)
        extras = [addr for addr in other.addresslist
                  if addr not in self.addresslist]
        union.addresslist = self.addresslist[:] + extras
        return union

    def __iadd__(self, other):
        # Set union, in-place
        for addr in other.addresslist:
            if addr not in self.addresslist:
                self.addresslist.append(addr)
        return self

    def __sub__(self, other):
        # Set difference: entries of self that other does not contain.
        difference = AddressList(None)
        difference.addresslist = [addr for addr in self.addresslist
                                  if addr not in other.addresslist]
        return difference

    def __isub__(self, other):
        # Set difference, in-place
        for addr in other.addresslist:
            if addr in self.addresslist:
                self.addresslist.remove(addr)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]