# Copyright (C) 2002-2006 Python Software Foundation
# Contact: email-sig@python.org

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""

# Names exported by a star-import of this module.  AddrlistClass and
# AddressList are defined below but are not listed here, so they have to be
# imported by name.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]

import calendar
import time

# Separator strings intended for use with str.join().
SPACE = " "
EMPTYSTRING = ""
COMMASPACE = ", "
# Parse a date field
#
# Month names, abbreviated first and then spelled out.  A name's position
# modulo 12 (plus one) is its month number.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.
#
# Values are written in +/-hhmm form (e.g. -500 means five hours behind UTC).

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple, or None if it cannot be parsed.

    The first nine items have the layout of a time tuple: year, month, day,
    hour, minute, second, followed by the fixed values 0 (weekday), 1 (day of
    year) and -1 (DST flag, unknown).  The tenth item is the timezone offset
    from UTC in seconds, or None when no usable zone was present.

    Zone names are looked up in _timezones; of the military zones only Z is
    recognized.  Numeric zones are accepted in +hhmm/-hhmm form, including
    when glued directly onto the time field.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be attached to the time with no whitespace in between.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            # Keep the sign with the zone so negative offsets survive.
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    if not (dd and mm and yy):
        # An RFC 850 style split can leave empty fields behind.
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Maybe the month and day are the other way round.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    if yy.find(':') > 0:
        # The year and time fields are swapped.
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
        if not yy:
            return None
    if not yy[0].isdigit():
        # The year and zone fields are swapped.
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            # Unknown zone: leave the offset as None.
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset


def parsedate(data):
    """Convert a date string to a 9-item time tuple.

    This is parsedate_tz() with the trailing timezone offset dropped.  Any
    non-tuple result from parsedate_tz() (None for an unparseable string) is
    returned unchanged.
    """
    t = parsedate_tz(data)
    if isinstance(t, tuple):
        return t[:9]
    return t


def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    data[9] is the zone offset from UTC in seconds, or None when the zone is
    unknown.  With an offset, the date fields are converted as UTC and the
    offset is subtracted, so the result does not depend on the host's
    timezone or DST rules.  Without one, the fields are interpreted as local
    time.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # calendar.timegm() treats the tuple as UTC; going through time.mktime()
    # and correcting by time.timezone is wrong whenever the local UTC offset
    # in effect for that date differs from time.timezone.
    return calendar.timegm(data) - data[9]


def quote(str):
    """Escape a string for use inside a quoted-string.

    Every backslash is doubled and every double quote gets a backslash in
    front of it.  No surrounding quote characters are added.
    """
    # Backslashes go first, so the ones inserted before double quotes are not
    # doubled in turn.
    backslash_escaped = str.replace('\\', '\\\\')
    return backslash_escaped.replace('"', '\\"')


class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser is a hand-written scanner: `self.pos' is the index of the next
    unread character of `self.field', and every get*() method consumes input
    by advancing it.  Comments met along the way are collected in
    `self.commentlist'.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        # Characters that terminate an atom.
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; any comment found on the way is
        appended to self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each a
        (name, address) pair.  A stretch of input that yields no address
        contributes an ('', '') placeholder.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) pairs: empty when nothing could be
        parsed, one pair for a single mailbox, or one pair per member for a
        group.  A trailing comma is consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot, not alias: getphraselist() appends to self.commentlist in
        # place, and the addr-spec branch below re-parses from oldpos.  With
        # an alias every comment in the re-parsed span would be recorded
        # twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the parser is not positioned on a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Domain of a source route; parsed and thrown away.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character after the addr-spec (normally `>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the local part alone when no `@' follows it, and the empty
        string when an `@' is present but no valid domain follows.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain: return an empty address rather than a bare
            # local part, so a failed parse cannot pass for a valid address.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string if a second `@' turns up inside the domain.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # Don't accept domains with two `@', such as
                # `a@malicious.org@important.com': the first domain would be
                # returned as if it were the whole address.
                return ''
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        A backslash is dropped and the character after it is taken literally.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        escaped = False     # True when the previous character was a backslash
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments are diverted to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist

class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (name, address) pairs are kept in `self.addresslist'.  The
    `+' and `-' operators (and their in-place forms) act as set union and
    set difference on those pairs, keeping list order.
    """

    def __init__(self, field):
        """Parse `field'; a false value (None or '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            # Test against the growing result, as __iadd__ does, so entries
            # repeated within `other' are added only once.
            if x not in newaddr.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place
        for x in other.addresslist:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]