rfc822.py 32.6 KB
Newer Older
1
"""RFC 2822 message manipulation.
2

3 4 5 6 7 8 9 10 11
Note: This is only a very rough sketch of a full RFC-822 parser; in particular
the tokenizing of addresses does not adhere to all the quoting rules.

Note: RFC 2822 is a long awaited update to RFC 822.  This module should
conform to RFC 2822, and is thus mis-named (it's not worth renaming it).  Some
efforts at RFC 2822 updates have been made, but a thorough audit has not been
performed.  Consider any RFC 2822 non-conformance to be a bug.

    RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
12
    RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)
13 14 15 16

Directions for use:

To create a Message object: first open a file, e.g.:
17

18
  fp = open(file, 'r')
19

20
You can use any other legal way of getting an open file object, e.g. use
21 22 23
sys.stdin or call os.popen().  Then pass the open file object to the Message()
constructor:

24 25
  m = Message(fp)

26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
This class can work with any input object that supports a readline method.  If
the input object has seek and tell capability, the rewindbody method will
work; also illegal lines will be pushed back onto the input stream.  If the
input object lacks seek but has an `unread' method that can push back a line
of input, Message will use that to push back illegal lines.  Thus this class
can be used to parse messages coming from a buffered stream.

The optional `seekable' argument is provided as a workaround for certain stdio
libraries in which tell() discards buffered data before discovering that the
lseek() system call doesn't work.  For maximum portability, you should set the
seekable argument to zero to prevent that initial tell() call when passing in
an unseekable object such as a file object created from a socket object.  If
it is 1 on entry -- which it is by default -- the tell() method of the open
file object is called once; if this raises an exception, seekable is reset to
0.  For other nonzero values of seekable, this test is not made.
41

42
To get the text of a particular header there are several methods:
43

44 45
  str = m.getheader(name)
  str = m.getrawheader(name)
46 47 48 49 50 51

where name is the name of the header, e.g. 'Subject'.  The difference is that
getheader() strips the leading and trailing whitespace, while getrawheader()
doesn't.  Both functions retain embedded whitespace (including newlines)
exactly as they are specified in the header, and leave the case of the text
unchanged.
52 53

For addresses and address lists there are functions
54 55

  realname, mailaddress = m.getaddr(name)
56
  list = m.getaddrlist(name)
57

58 59 60
where the latter returns a list of (realname, mailaddr) tuples.

There is also a method
61

62
  time = m.getdate(name)
63

64 65 66 67 68 69 70 71
which parses a Date-like field and returns a time-compatible tuple,
i.e. a tuple such as returned by time.localtime() or accepted by
time.mktime().

See the class definition for lower level access methods.

There are also some utility functions here.
"""
72
# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
73

74
import time
75

76
# Names exported by "from rfc822 import *".  Helper functions defined in this
# module such as parseaddr(), quote(), unquote() and dump_address_pair() are
# not listed here and must be referenced explicitly.
__all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]
77

78
# The exact lines that Message.islast() accepts as the blank line terminating
# the header section (CRLF and bare LF line endings).
_blanklines = ('\r\n', '\n')            # Optimization for islast()
79 80


81
class Message:
    """Represents a single RFC 2822-compliant message.

    The headers are read from a file-like object when the instance is
    created.  They are kept in two forms:

    self.headers -- the raw header lines, in file order, uninterpreted.
    self.dict    -- a mapping from lower-cased header name to the stripped
                    header value; when a header occurs several times only
                    the last occurrence is kept here.
    """

    def __init__(self, fp, seekable=1):
        """Initialize the class instance and read the headers.

        fp is any object with a readline() method.  If seekable is exactly
        1, fp.tell() is called once to probe whether positioning works and
        seekable is reset to 0 if that fails; any other value is taken at
        face value without probing.
        """
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too)
            try:
                fp.tell()
            except (AttributeError, IOError):
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None

        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0

        self.readheaders()

        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError if the underlying file is not seekable.
        """
        if not self.seekable:
            # Call syntax (instead of the statement form "raise E, msg") is
            # accepted by every Python version.
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that terminates them.
        The (normally blank) line that ends the headers is skipped, but not
        stored in self.headers.  If a non-header line ends the headers,
        (which is an error), an attempt is made to backspace over it; it is
        never stored in self.headers.

        The variable self.status is set to the empty string if all went well,
        otherwise it is an error message.  The variable self.headers is a
        completely uninterpreted list of lines contained in the header (so
        printing them will reproduce the header exactly as it appears in the
        file).  self.dict maps each lower-cased header name to its stripped
        value, and self.unixfrom collects any leading "From " lines.
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = lines = []
        self.status = ''
        headerseen = ""
        firstline = 1
        startofline = unread = tell = None
        # Prefer an explicit unread() hook for pushing back a bad line;
        # otherwise remember line offsets so we can seek() back.
        if hasattr(self.fp, 'unread'):
            unread = self.fp.unread
        elif self.seekable:
            tell = self.fp.tell
        while 1:
            if tell:
                try:
                    startofline = tell()
                except IOError:
                    startofline = tell = None
                    self.seekable = 0
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line.startswith('From '):
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                lines.append(line)
                x = (self.dict[headerseen] + "\n " + line.strip())
                self.dict[headerseen] = x.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.
                lines.append(line)
                self.dict[headerseen] = line[len(headerseen)+1:].strip()
                continue
            else:
                # It's not a header line; throw it back and stop here.
                if not self.dict:
                    self.status = 'No headers'
                else:
                    self.status = 'Non-header line where header expected'
                # Try to undo the read.
                if unread:
                    unread(line)
                elif tell:
                    self.fp.seek(startofline)
                else:
                    self.status = self.status + '; bad seek'
                break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized
        (here: the lower-cased text before the first colon), or None if the
        line is not a header.
        You may override this method in order to use Message parsing on tagged
        data in RFC 2822-like formats with special header formats.
        """
        i = line.find(':')
        if i > 0:
            return line[:i].lower()
        else:
            return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC 2822 headers.

        You may override this method if your application wants to bend the
        rules, e.g. to strip trailing whitespace, or to recognize MH template
        separators ('--------').  For convenience (e.g. for code reading from
        sockets) a line consisting of \\r\\n also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing on tagged
        data in RFC 2822-like formats that support embedded comments or
        free-text data.  This implementation never treats a line as a comment.
        """
        return False

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines matching a given
        header name (and their continuation lines).  A list of the lines is
        returned, without interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple times, all
        occurrences are returned.  Case is not important in the header name.
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                # A different header starts here; continuation lines (which
                # begin with whitespace) leave the current state untouched.
                hit = 0
            if hit:
                matches.append(line)
        return matches

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns only the
        first matching header (and its continuation lines).
        """
        name = name.lower() + ':'
        n = len(name)
        matches = []
        hit = 0
        for line in self.headers:
            if hit:
                if not line[:1].isspace():
                    break
            elif line[:n].lower() == name:
                hit = 1
            if hit:
                matches.append(line)
        return matches

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the header but with the
        keyword stripped.  All leading, trailing and embedded whitespace is
        kept in the string, however.  Return None if the header does not
        occur.
        """
        lines = self.getfirstmatchingheader(name)
        if not lines:
            return None
        # Drop the "Name:" prefix from the first line only.
        lines[0] = lines[0][len(name) + 1:]
        return ''.join(lines)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped version of the
        header value for a given header name, or `default' (None unless
        given) if it doesn't exist.
        This uses the dictionary version which finds the *last* such header.
        """
        try:
            return self.dict[name.lower()]
        except KeyError:
            return default
    get = getheader

    def getheaders(self, name):
        """Get all values for a header.

        This returns a list of values for headers given more than once; each
        value in the result list is stripped in the same way as the result of
        getheader().  If the header is not given, return an empty list.
        """
        result = []
        current = ''
        have_header = 0
        for s in self.getallmatchingheaders(name):
            if s[0].isspace():
                # Continuation line: fold it into the value being built.
                if current:
                    current = "%s\n %s" % (current, s.strip())
                else:
                    current = s.strip()
            else:
                if have_header:
                    result.append(current)
                current = s[s.find(":") + 1:].strip()
                have_header = 1
        if have_header:
            result.append(current)
        return result

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@cwi.nl')

        Returns (None, None) if the header yields no address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        else:
            return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each address is a
        tuple as returned by getaddr().  Scans all named headers, so it works
        properly with multiple To: or Cc: headers for example.
        """
        raw = []
        for h in self.getallmatchingheaders(name):
            if h[0] in ' \t':
                raw.append(h)
            else:
                i = h.find(':')
                if i <= 0:
                    # No usable "name:" prefix on this line; there is no
                    # value to contribute (previously this reused a stale
                    # or unbound variable).
                    continue
                if raw:
                    raw.append(', ')
                raw.append(h[i+1:])
        alladdrs = ''.join(raw)
        a = AddressList(alladdrs)
        return a.addresslist

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header, returning a tuple
        compatible with time.mktime().  Returns None if the header is absent.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        The first 9 elements make up a tuple compatible with time.mktime(),
        and the 10th is the offset of the poster's time zone from GMT/UTC.
        Returns None if the header is absent.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)

    # Access as a dictionary (only finds *last* header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary."""
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because any
        changed headers get stuck at the end of the raw-headers list rather
        than where the altered header was.
        """
        del self[name] # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        # A multi-line value becomes one raw header line per text line.
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        if name not in self.dict:
            return
        del self.dict[name]
        name = name + ':'
        n = len(name)
        doomed = []
        hit = 0
        for i, line in enumerate(self.headers):
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                hit = 0
            if hit:
                doomed.append(i)
        # Delete from the end so earlier indices stay valid.
        for i in reversed(doomed):
            del self.headers[i]

    def setdefault(self, name, default=""):
        """Return the value of header `name', adding it first if missing.

        When the header is absent it is appended to the raw header list and
        stored in the dictionary with the value `default', which is returned.
        """
        lowername = name.lower()
        if lowername in self.dict:
            return self.dict[lowername]
        else:
            text = name + ": " + default
            for line in text.split("\n"):
                self.headers.append(line + "\n")
            self.dict[lowername] = default
            return default

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __contains__(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __iter__(self):
        """Iterate over the (lower-cased) header field names."""
        return iter(self.dict)

    def keys(self):
        """Get all of a message's header field names."""
        return self.dict.keys()

    def values(self):
        """Get all of a message's header field values."""
        return self.dict.values()

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return self.dict.items()

    def __str__(self):
        """Return the raw header lines joined into a single string."""
        return ''.join(self.headers)
467 468 469 470 471


# Utility functions
# -----------------

472
# XXX Should fix unquote() and quote() to be really conformant.
473 474
# XXX The inverses of the parse functions may also be useful.

475 476

def unquote(str):
    """Strip one layer of enclosing quotes or angle brackets from a string.

    A string wrapped in double quotes has the quotes removed and the
    backslash escapes for backslash and double quote undone; a string
    wrapped in <...> just loses the brackets.  Anything else, including
    strings shorter than two characters, is returned unchanged.
    """
    if len(str) <= 1:
        return str
    if str.startswith('"') and str.endswith('"'):
        inner = str[1:-1]
        inner = inner.replace('\\\\', '\\')
        return inner.replace('\\"', '"')
    if str.startswith('<') and str.endswith('>'):
        return str[1:-1]
    return str
484

485 486

def quote(str):
    """Escape a string for use inside a double-quoted string.

    Backslashes and double quotes are each prefixed with a backslash.  The
    surrounding quote characters themselves are not added.
    """
    escaped = str.replace('\\', '\\\\')
    return escaped.replace('"', '\\"')
489

490

491
def parseaddr(address):
    """Parse an address into a (realname, mailaddr) tuple.

    Only the first address found in `address' is returned; (None, None) is
    returned when no address could be parsed.
    """
    parsed = AddressList(address).addresslist
    if parsed:
        return parsed[0]
    return (None, None)
499 500 501


class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC 2822 in front of you.

    http://www.faqs.org/rfcs/rfc2822.html

    The parser is a hand-written scanner over self.field; self.pos is the
    index of the next unread character and self.commentlist collects the
    text of the comments seen while parsing the current address.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing one or more
        addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; any comments encountered are
        appended to self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (realname, mailaddr) tuple.  Empty list elements (as in "a@b,, c@d")
        are skipped rather than ending the parse, so addresses following
        them are not lost.
        """
        result = []
        while self.pos < len(self.field):
            start = self.pos
            ad = self.getaddress()
            if ad:
                result += ad
            elif self.pos == start:
                # Nothing parsed and nothing consumed: stop instead of
                # spinning forever on input we cannot get past.
                break
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples: one element for a
        single address, several for a group, and an empty list when no
        address was found at the current position.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Keep a copy, not an alias: getphraselist() appends to
        # self.commentlist in place, and the addr-spec branch below needs to
        # restore the state from before the phrase was scanned.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos = self.pos + 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos = self.pos + 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Stray special character: step over it so parsing can
                # make progress.
                self.pos = self.pos + 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos = self.pos + 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the parser is not positioned at a '<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos = self.pos + 1
        self.gotonext()
        adlist = ""
        while self.pos < len(self.field):
            if expectroute:
                # The domain of an "@domain" route element is discarded.
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos = self.pos + 1
                break
            elif self.field[self.pos] == '@':
                self.pos = self.pos + 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                self.pos = self.pos + 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addr-spec
                # (normally the closing '>').
                self.pos = self.pos + 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the local part alone if no '@' follows it, otherwise
        local-part@domain.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos = self.pos + 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos = self.pos + 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.  If self is not
        looking at an instance of `beginchar' (which includes being at the
        end of the field) then getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        A backslash quotes the following character, which is then taken
        literally.  The delimiters themselves are not part of the result.
        """
        if self.pos >= len(self.field) or self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos = self.pos + 1
        while self.pos < len(self.field):
            if quote == 1:
                slist.append(self.field[self.pos])
                quote = 0
            elif self.field[self.pos] in endchars:
                self.pos = self.pos + 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                # getcomment() has already moved self.pos past the nested
                # comment; advancing again would drop the next character.
                continue
            elif self.field[self.pos] == '\\':
                quote = 1
            else:
                slist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', 0)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos = self.pos + 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Returns the list of words; comments found
        between them are appended to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos = self.pos + 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
775

776
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed entries live in self.addresslist.  The +, +=, - and -=
    operators treat two AddressLists as sets of entries (union and
    difference); len(), indexing, slicing and `in' are forwarded to the
    underlying list.
    """

    def __init__(self, field):
        """Parse field into self.addresslist.

        A false field (None or '') yields an empty list without parsing.
        """
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        """Return the number of entries."""
        return len(self.addresslist)

    def __str__(self):
        """Return the entries formatted by dump_address_pair, comma-separated."""
        return ", ".join(map(dump_address_pair, self.addresslist))

    def __add__(self, other):
        """Set union: return a new list with other's entries not yet present."""
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            # Test against the result, not against self, so that entries
            # repeated within other are added only once (as += does).
            if x not in newaddr.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        """Set union, in-place."""
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        """Set difference: return a new list of entries not found in other."""
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        """Set difference, in-place.

        Every occurrence of an entry found in other is dropped, matching
        what the - operator produces.
        """
        # Build the surviving entries first and only then update the list in
        # place: removing while iterating would skip entries when other is
        # self, and list.remove() would only drop the first occurrence.
        self.addresslist[:] = [x for x in self.addresslist
                               if x not in other.addresslist]
        return self

    def __getitem__(self, index):
        """Make indexing, slices, and 'in' work."""
        return self.addresslist[index]
825

826 827 828 829 830 831
def dump_address_pair(pair):
    """Dump a (name, address) pair in a canonicalized form.

    With a non-empty name the result is '"name" <address>'; otherwise it
    is the bare address.
    """
    name = pair[0]
    if not name:
        return pair[1]
    return ''.join(['"', name, '" <', pair[1], '>'])
832 833 834

# Parse a date field

Guido van Rossum's avatar
Guido van Rossum committed
835 836
# Abbreviated month names first, full names second; parsedate_tz() maps an
# index past 12 back onto 1..12.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Values are in +-HHMM form; parsedate_tz() converts them to seconds.
_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a time tuple.

    Returns a 10-tuple (year, month, day, hour, minute, second, 0, 1, 0,
    tzoffset), or None if the string cannot be parsed.  tzoffset is the
    zone's offset from UTC in seconds (-0500 -> -18000); it is None when the
    zone is missing or is neither numeric nor listed in _timezones.  Of the
    military zones only Z is recognized.  The year is returned exactly as
    written, so a two-digit year is not expanded.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input: there is no first token to inspect.
        return None
    if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time, e.g. "14:58:26+0800" or
        # "14:58:26-0500".  Keep the sign with the zone so that a negative
        # offset survives the split.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    mm = mm.lower()
    if mm not in _monthnames:
        # Maybe the month comes first ("Apr 3"); try the other order.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names occupy the second half of the table.
        mm = mm - 12
    # endswith() and the [:1] slice below tolerate empty tokens (possible
    # after the RFC 850 split or after stripping a lone comma), which then
    # fail the int() conversion and yield None instead of an IndexError.
    if dd.endswith(','):
        dd = dd[:-1]
    if yy.find(':') > 0:
        # Year and time are transposed, as in asctime()-style dates.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy[:1].isdigit():
        # The year slot holds the zone; swap them.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            # Unknown zone name (or none at all): leave the offset as None.
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    return (yy, mm, dd, thh, tmm, tss, 0, 1, 0, tzoffset)

936

937
def parsedate(data):
    """Convert a time string to a time tuple.

    Parses data with parsedate_tz() and returns the first nine items of
    its result, i.e. the tuple without the trailing timezone offset.  Any
    non-tuple result of parsedate_tz() is returned unchanged.
    """
    t = parsedate_tz(data)
    if isinstance(t, tuple):
        return t[:9]
    return t
943

944

945
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    data[9] is the zone offset in seconds, or None when the zone is unknown.
    With an offset, the date fields are interpreted in that zone; without
    one, they are interpreted as local time.  Returns a float number of
    seconds since the epoch.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # Imported locally: nothing else in this function's scope needs it.
    import calendar
    # calendar.timegm() reads the fields as UTC without consulting the local
    # timezone or its DST rules, so the result does not depend on where the
    # code runs.  Subtracting the zone offset then gives the UTC instant.
    # float() keeps the return type the same as in the localtime branch.
    return float(calendar.timegm(data[:6]) - data[9])
953

954 955 956 957
def formatdate(timeval=None):
    """Returns time format preferred for Internet standards.

    Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123

    timeval is a number of seconds since the epoch; when it is None the
    current time is used.  The result is always expressed in GMT.

    According to RFC 1123, day and month names must always be in
    English.  If not for that, this code could use strftime().  It
    can't because strftime() honors the locale and could generate
    non-English names.
    """
    if timeval is None:
        timeval = time.time()
    year, month, day, hour, minute, second, weekday = time.gmtime(timeval)[:7]
    day_name = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[weekday]
    month_name = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[month - 1]
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        day_name, day, month_name, year, hour, minute, second)
973

974 975 976 977 978 979

# When used as script, run a small test program.
# The first command line argument must be a filename containing one
# message in RFC-822 format.

if __name__ == '__main__':
980 981 982 983 984 985 986 987 988 989
    import sys, os
    file = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
    if sys.argv[1:]: file = sys.argv[1]
    f = open(file, 'r')
    m = Message(f)
    print 'From:', m.getaddr('from')
    print 'To:', m.getaddrlist('to')
    print 'Subject:', m.getheader('subject')
    print 'Date:', m.getheader('date')
    date = m.getdate_tz('date')
990 991
    tz = date[-1]
    date = time.localtime(mktime_tz(date))
992
    if date:
993 994
        print 'ParsedDate:', time.asctime(date),
        hhmmss = tz
995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008
        hhmm, ss = divmod(hhmmss, 60)
        hh, mm = divmod(hhmm, 60)
        print "%+03d%02d" % (hh, mm),
        if ss: print ".%02d" % ss,
        print
    else:
        print 'ParsedDate:', None
    m.rewindbody()
    n = 0
    while f.readline():
        n = n + 1
    print 'Lines:', n
    print '-'*70
    print 'len =', len(m)
1009 1010
    if 'Date' in m: print 'Date =', m['Date']
    if 'X-Nonsense' in m: pass
1011 1012 1013
    print 'keys =', m.keys()
    print 'values =', m.values()
    print 'items =', m.items()