Kaydet (Commit) 97f43c01 authored tarafından R David Murray's avatar R David Murray

#15160: Extend the new email parser to handle MIME headers.

This code passes all the same tests that the existing RFC mime header
parser passes, plus a bunch of additional ones.

There are a couple of commented out tests where there are issues with the
folding.  The folding doesn't normally get invoked for headers parsed from
source, and the cases are marginal anyway (headers with invalid binary data)
so I'm not worried about them, but will fix them after the beta.

There are things that can be done to make this API even more convenient, but I
think this is a solid foundation worth having.  And the parser is a full RFC
parser, so it handles cases that the current parser doesn't.  (There are also
probably cases where it fails when the current parser doesn't, but I haven't
found them yet ;)

Oh, yeah, and there are some really ugly bits in the parser for handling some
'postel' cases that are unfortunately common.

I hope/plan to eventually refactor a lot of the code in the parser which
should reduce the line count...but there is no escaping the fact that the
error recovery is a welter of special cases.
üst 49c15d4a
......@@ -234,11 +234,80 @@ headers.
result in a :exc:`ValueError`.
Each of the above classes also has a ``Unique`` variant (for example,
Many of the above classes also have a ``Unique`` variant (for example,
``UniqueUnstructuredHeader``). The only difference is that in the ``Unique``
variant, :attr:`~.BaseHeader.max_count` is set to 1.
.. class:: MIMEVersionHeader
There is really only one valid value for the :mailheader:`MIME-Version`
header, and that is ``1.0``. For future proofing, this header class
supports other valid version numbers. If a version number has a valid value
per :rfc:`2045`, then the header object will have non-``None`` values for
the following attributes:
.. attribute:: version
The version number as a string, with any whitespace and/or comments
removed.
.. attribute:: major
The major version number as an integer
.. attribute:: minor
The minor version number as an integer
.. class:: ParameterizedMIMEHeader
MIME headers all start with the prefix 'Content-'. Each specific header has
a certain value, described under the class for that header. Some can
also take a list of supplemental parameters, which have a common format.
This class serves as a base for all the MIME headers that take parameters.
.. attribute:: params
A dictionary mapping parameter names to parameter values.
.. class:: ContentTypeHeader
A :class:`ParameterizedMIMEHeader` class that handles the
:mailheader:`Content-Type` header.
.. attribute:: content_type
The content type string, in the form ``maintype/subtype``.
.. attribute:: maintype
.. attribute:: subtype
.. class:: ContentDispositionHeader
A :class:`ParameterizedMIMEHeader` class that handles the
:mailheader:`Content-Disposition` header.
.. attribute:: content_disposition
``inline`` and ``attachment`` are the only valid values in common use.
.. class:: ContentTransferEncodingHeader
Handles the :mailheader:`Content-Transfer-Encoding` header.
.. attribute:: cte
Valid values are ``7bit``, ``8bit``, ``base64``, and
``quoted-printable``. See :rfc:`2045` for more information.
.. class:: HeaderRegistry(base_class=BaseHeader, \
default_class=UnstructuredHeader, \
use_default_map=True)
......
......@@ -391,24 +391,151 @@ class UniqueSingleAddressHeader(SingleAddressHeader):
max_count = 1
class MIMEVersionHeader:
    """Mixin implementing the MIME-Version header.

    ``parse`` feeds the raw value to ``parser.parse_mime_version`` and
    records the outcome in the keyword dict; ``init`` then moves the
    version fields onto the instance, where they are readable through the
    ``version``, ``major`` and ``minor`` properties.
    """

    max_count = 1

    value_parser = staticmethod(parser.parse_mime_version)

    @classmethod
    def parse(cls, value, kwds):
        """Parse *value* and store the results in *kwds*.

        Sets the ``parse_tree``, ``decoded``, ``major``, ``minor`` and
        ``version`` keys and appends the parse tree's defects to the
        existing ``kwds['defects']`` list.
        """
        tree = cls.value_parser(value)
        kwds['parse_tree'] = tree
        kwds['decoded'] = str(tree)
        kwds['defects'].extend(tree.all_defects)
        minor = tree.minor
        if minor is None:
            # No minor number: report neither a major number nor a
            # version string, whatever the tree holds for major.
            major = version = None
        else:
            major = tree.major
            version = '{}.{}'.format(major, minor)
        kwds['major'] = major
        kwds['minor'] = minor
        kwds['version'] = version

    def init(self, *args, **kw):
        """Take the version fields out of *kw*, then chain to the next ``init``."""
        for field in ('version', 'major', 'minor'):
            setattr(self, '_' + field, kw.pop(field))
        super().init(*args, **kw)

    @property
    def major(self):
        """The major version number stored by ``init``."""
        return self._major

    @property
    def minor(self):
        """The minor version number stored by ``init``."""
        return self._minor

    @property
    def version(self):
        """The version string stored by ``init``."""
        return self._version
class ParameterizedMIMEHeader:
    """Mixin that manages the ``params`` dict of a MIME header.

    It must be subclassed, and the subclass must supply a ``value_parser``
    attribute for the specific header.
    """

    max_count = 1

    @classmethod
    def parse(cls, value, kwds):
        """Parse *value* and store the results, including params, in *kwds*.

        Sets the ``parse_tree``, ``decoded`` and ``params`` keys and appends
        the parse tree's defects to the existing ``kwds['defects']`` list.
        """
        tree = cls.value_parser(value)
        kwds['parse_tree'] = tree
        kwds['decoded'] = str(tree)
        kwds['defects'].extend(tree.all_defects)
        pairs = tree.params
        if pairs is None:
            pairs = ()
        # The MIME RFCs specify that parameter ordering is arbitrary, so a
        # plain dict keyed by the lower-cased name is sufficient.
        kwds['params'] = {
            utils._sanitize(key).lower(): utils._sanitize(val)
            for key, val in pairs
        }

    def init(self, *args, **kw):
        """Take ``params`` out of *kw*, then chain to the next ``init``."""
        self._params = kw.pop('params')
        super().init(*args, **kw)

    @property
    def params(self):
        """A copy of the parameter dict, so callers cannot mutate the header."""
        return self._params.copy()
class ContentTypeHeader(ParameterizedMIMEHeader):
    """Content-Type header: parameters plus ``maintype``/``subtype``."""

    value_parser = staticmethod(parser.parse_content_type_header)

    def init(self, *args, **kw):
        """Run the inherited ``init``, then cache the sanitized type parts."""
        super().init(*args, **kw)
        tree = self._parse_tree
        self._maintype = utils._sanitize(tree.maintype)
        self._subtype = utils._sanitize(tree.subtype)

    @property
    def maintype(self):
        """The sanitized main type taken from the parse tree."""
        return self._maintype

    @property
    def subtype(self):
        """The sanitized subtype taken from the parse tree."""
        return self._subtype

    @property
    def content_type(self):
        """The content type string in the form ``maintype/subtype``."""
        return '/'.join((self.maintype, self.subtype))
class ContentDispositionHeader(ParameterizedMIMEHeader):
    """Content-Disposition header: parameters plus the disposition value."""

    value_parser = staticmethod(parser.parse_content_disposition_header)

    def init(self, *args, **kw):
        """Run the inherited ``init``, then cache the disposition value."""
        super().init(*args, **kw)
        disposition = self._parse_tree.content_disposition
        if disposition is not None:
            # Only an actual value is sanitized; None is stored unchanged.
            disposition = utils._sanitize(disposition)
        self._content_disposition = disposition

    @property
    def content_disposition(self):
        """The sanitized disposition value, or None if the tree had none."""
        return self._content_disposition
class ContentTransferEncodingHeader:
    """Mixin implementing the Content-Transfer-Encoding header."""

    max_count = 1

    value_parser = staticmethod(parser.parse_content_transfer_encoding_header)

    @classmethod
    def parse(cls, value, kwds):
        """Parse *value* and store the results in *kwds*.

        Sets the ``parse_tree`` and ``decoded`` keys and appends the parse
        tree's defects to the existing ``kwds['defects']`` list.
        """
        tree = cls.value_parser(value)
        kwds['parse_tree'] = tree
        kwds['decoded'] = str(tree)
        kwds['defects'].extend(tree.all_defects)

    def init(self, *args, **kw):
        """Chain to the next ``init``, then cache the sanitized encoding."""
        super().init(*args, **kw)
        tree = self._parse_tree
        self._cte = utils._sanitize(tree.cte)

    @property
    def cte(self):
        """The sanitized content transfer encoding from the parse tree."""
        return self._cte
# The header factory #
# Maps lower-cased header names to the class used to represent them.
# Each key is listed exactly once: a repeated key in a dict literal is
# silently overridden by the later entry, which hides copy/paste mistakes.
_default_header_map = {
    'subject': UniqueUnstructuredHeader,
    'date': UniqueDateHeader,
    'resent-date': DateHeader,
    'orig-date': UniqueDateHeader,
    'sender': UniqueSingleAddressHeader,
    'resent-sender': SingleAddressHeader,
    'to': UniqueAddressHeader,
    'resent-to': AddressHeader,
    'cc': UniqueAddressHeader,
    'resent-cc': AddressHeader,
    'bcc': UniqueAddressHeader,
    'resent-bcc': AddressHeader,
    'from': UniqueAddressHeader,
    'resent-from': AddressHeader,
    'reply-to': UniqueAddressHeader,
    # MIME headers.
    'mime-version': MIMEVersionHeader,
    'content-type': ContentTypeHeader,
    'content-disposition': ContentDispositionHeader,
    'content-transfer-encoding': ContentTransferEncodingHeader,
    }
class HeaderRegistry:
......
......@@ -3,7 +3,7 @@ import unittest
from email import _header_value_parser as parser
from email import errors
from email import policy
from test.test_email import TestEmailBase
from test.test_email import TestEmailBase, parameterize
class TestTokens(TestEmailBase):
......@@ -28,7 +28,32 @@ class TestTokens(TestEmailBase):
self.assertDefectsEqual(parts[2].all_defects, [errors.UndecodableBytesDefect])
class TestParser(TestEmailBase):
class TestParserMixin:
    """Assertion helpers shared by the header value parser test classes.

    The host class must provide ``assertEqual`` and ``assertDefectsEqual``.
    """

    def _assert_results(self, tl, rest, string, value, defects, remainder,
                        comments=None):
        """Check a parsed token list *tl* and the unparsed *rest*.

        Compares ``str(tl)`` with *string*, ``tl.value`` with *value*,
        ``tl.all_defects`` with *defects* and *rest* with *remainder*.
        ``tl.comments`` is compared only when *comments* is not None.
        """
        self.assertEqual(str(tl), string)
        self.assertEqual(tl.value, value)
        self.assertDefectsEqual(tl.all_defects, defects)
        self.assertEqual(rest, remainder)
        if comments is not None:
            self.assertEqual(tl.comments, comments)

    def _test_get_x(self, method, source, string, value, defects,
                    remainder, comments=None):
        """Run a ``get_*`` style *method* on *source* and check its result.

        *method* must return a ``(token_list, rest)`` pair.  Returns the
        token list so the caller can make further assertions on it.
        """
        tl, rest = method(source)
        # Forward the caller's expectation; passing a literal None here
        # would silently skip the comments check.
        self._assert_results(tl, rest, string, value, defects, remainder,
                             comments=comments)
        return tl

    def _test_parse_x(self, method, input, string, value, defects,
                      comments=None):
        """Run a ``parse_*`` style *method* on *input* and check its result.

        *method* returns only the token list, so the remainder is checked
        as the empty string.  Returns the token list.
        """
        tl = method(input)
        self._assert_results(tl, '', string, value, defects, '', comments)
        return tl
class TestParser(TestParserMixin, TestEmailBase):
# _wsp_splitter
......@@ -49,19 +74,6 @@ class TestParser(TestEmailBase):
['foo', ' \t ', 'def jik'])
# test harness
def _test_get_x(self, method, input, string, value, defects,
remainder, comments=None):
token, rest = method(input)
self.assertEqual(str(token), string)
self.assertEqual(token.value, value)
self.assertDefectsEqual(token.all_defects, defects)
self.assertEqual(rest, remainder)
if comments is not None:
self.assertEqual(token.comments, comments)
return token
# get_fws
def test_get_fws_only(self):
......@@ -2390,6 +2402,67 @@ class TestParser(TestEmailBase):
str(address_list.mailboxes[2]))
@parameterize
class Test_parse_mime_version(TestParserMixin, TestEmailBase):
    """Table-driven tests for ``parser.parse_mime_version``."""

    def mime_version_as_value(self, value, tl_str, tl_value, major, minor,
                              defects):
        """Parse *value* and check the token list and its version numbers."""
        parsed = self._test_parse_x(
            parser.parse_mime_version, value, tl_str, tl_value, defects)
        self.assertEqual(parsed.major, major)
        self.assertEqual(parsed.minor, minor)

    # Each entry: (value, tl_str, tl_value, major, minor, defects).
    mime_version_params = {

        'rfc_2045_1': (
            '1.0',
            '1.0',
            '1.0',
            1, 0,
            []),

        'RFC_2045_2': (
            '1.0 (produced by MetaSend Vx.x)',
            '1.0 (produced by MetaSend Vx.x)',
            '1.0 ',
            1, 0,
            []),

        'RFC_2045_3': (
            '(produced by MetaSend Vx.x) 1.0',
            '(produced by MetaSend Vx.x) 1.0',
            ' 1.0',
            1, 0,
            []),

        'RFC_2045_4': (
            '1.(produced by MetaSend Vx.x)0',
            '1.(produced by MetaSend Vx.x)0',
            '1. 0',
            1, 0,
            []),

        'empty': (
            '',
            '',
            '',
            None, None,
            [errors.HeaderMissingRequiredValue]),

        }
class TestFolding(TestEmailBase):
policy = policy.default
......
......@@ -259,6 +259,7 @@ class TestMessageAPI(TestEmailBase):
self.assertTrue(lines[0].startswith('From '))
eq(text, NL.join(lines[1:]))
# test_headerregistry.TestContentTypeHeader.bad_params
def test_bad_param(self):
msg = email.message_from_string("Content-Type: blarg; baz; boo\n")
self.assertEqual(msg.get_param('baz'), '')
......@@ -292,6 +293,7 @@ class TestMessageAPI(TestEmailBase):
eq(msg.get_params(header='x-header'),
[('foo', ''), ('bar', 'one'), ('baz', 'two')])
# test_headerregistry.TestContentTypeHeader.spaces_around_param_equals
def test_get_param_liberal(self):
msg = Message()
msg['Content-Type'] = 'Content-Type: Multipart/mixed; boundary = "CPIMSSMTPC06p5f3tG"'
......@@ -314,10 +316,12 @@ class TestMessageAPI(TestEmailBase):
# msg.get_param("weird")
# yet.
# test_headerregistry.TestContentTypeHeader.spaces_around_semis
def test_get_param_funky_continuation_lines(self):
msg = self._msgobj('msg_22.txt')
self.assertEqual(msg.get_payload(1).get_param('name'), 'wibble.JPG')
# test_headerregistry.TestContentTypeHeader.semis_inside_quotes
def test_get_param_with_semis_in_quotes(self):
msg = email.message_from_string(
'Content-Type: image/pjpeg; name="Jim&&Jill"\n')
......@@ -325,6 +329,7 @@ class TestMessageAPI(TestEmailBase):
self.assertEqual(msg.get_param('name', unquote=False),
'"Jim&&Jill"')
# test_headerregistry.TestContentTypeHeader.quotes_inside_rfc2231_value
def test_get_param_with_quotes(self):
msg = email.message_from_string(
'Content-Type: foo; bar*0="baz\\"foobar"; bar*1="\\"baz"')
......@@ -1885,6 +1890,7 @@ class TestNonConformant(TestEmailBase):
"\nContent-Transfer-Encoding: {}".format(cte)))
self.assertEqual(len(msg.defects), 0)
# test_headerregistry.TestContentTypeHeader invalid_1 and invalid_2.
def test_invalid_content_type(self):
eq = self.assertEqual
neq = self.ndiffAssertEqual
......@@ -3437,6 +3443,7 @@ class Test8BitBytesHandling(unittest.TestCase):
self.assertEqual(msg.get_content_maintype(), "text")
self.assertEqual(msg.get_content_subtype(), "pl\uFFFDin")
# test_headerregistry.TestContentTypeHeader.non_ascii_in_params
def test_get_params_with_8bit(self):
msg = email.message_from_bytes(
'X-Header: foo=\xa7ne; b\xa7r=two; baz=three\n'.encode('latin-1'))
......@@ -3446,6 +3453,7 @@ class Test8BitBytesHandling(unittest.TestCase):
# XXX: someday you might be able to get 'b\xa7r', for now you can't.
self.assertEqual(msg.get_param('b\xa7r', header='x-header'), None)
# test_headerregistry.TestContentTypeHeader.non_ascii_in_rfc2231_value
def test_get_rfc2231_params_with_8bit(self):
msg = email.message_from_bytes(textwrap.dedent("""\
Content-Type: text/plain; charset=us-ascii;
......@@ -4491,6 +4499,9 @@ A very long line that must get split to something other than at the
# Test RFC 2231 header parameters (en/de)coding
class TestRFC2231(TestEmailBase):
# test_headerregistry.TestContentTypeHeader.rfc2231_encoded_with_double_quotes
# test_headerregistry.TestContentTypeHeader.rfc2231_single_quote_inside_double_quotes
def test_get_param(self):
eq = self.assertEqual
msg = self._msgobj('msg_29.txt')
......@@ -4576,11 +4587,15 @@ Do you like this message?
-Me
""")
# test_headerregistry.TestContentTypeHeader.rfc2231_encoded_charset
# I changed the charset name, though, because the one in the file isn't
# a legal charset name. Should add a test for an illegal charset.
def test_rfc2231_get_content_charset(self):
eq = self.assertEqual
msg = self._msgobj('msg_32.txt')
eq(msg.get_content_charset(), 'us-ascii')
# test_headerregistry.TestContentTypeHeader.rfc2231_encoded_no_double_quotes
def test_rfc2231_parse_rfc_quoting(self):
m = textwrap.dedent('''\
Content-Disposition: inline;
......@@ -4594,6 +4609,7 @@ Do you like this message?
'This is even more ***fun*** is it not.pdf')
self.assertEqual(m, msg.as_string())
# test_headerregistry.TestContentTypeHeader.rfc2231_encoded_with_double_quotes
def test_rfc2231_parse_extra_quoting(self):
m = textwrap.dedent('''\
Content-Disposition: inline;
......@@ -4607,6 +4623,9 @@ Do you like this message?
'This is even more ***fun*** is it not.pdf')
self.assertEqual(m, msg.as_string())
# test_headerregistry.TestContentTypeHeader.rfc2231_no_language_or_charset
# but new test uses *0* because otherwise lang/charset is not valid.
# test_headerregistry.TestContentTypeHeader.rfc2231_segmented_normal_values
def test_rfc2231_no_language_or_charset(self):
m = '''\
Content-Transfer-Encoding: 8bit
......@@ -4621,6 +4640,7 @@ Content-Type: text/html; NAME*0=file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOC
param,
'file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm')
# test_headerregistry.TestContentTypeHeader.rfc2231_encoded_no_charset
def test_rfc2231_no_language_or_charset_in_filename(self):
m = '''\
Content-Disposition: inline;
......@@ -4633,6 +4653,7 @@ Content-Disposition: inline;
self.assertEqual(msg.get_filename(),
'This is even more ***fun*** is it not.pdf')
# Duplicate of previous test?
def test_rfc2231_no_language_or_charset_in_filename_encoded(self):
m = '''\
Content-Disposition: inline;
......@@ -4645,6 +4666,8 @@ Content-Disposition: inline;
self.assertEqual(msg.get_filename(),
'This is even more ***fun*** is it not.pdf')
# test_headerregistry.TestContentTypeHeader.rfc2231_partly_encoded,
# but the test below is wrong (the first part should be decoded).
def test_rfc2231_partly_encoded(self):
m = '''\
Content-Disposition: inline;
......@@ -4696,6 +4719,7 @@ Content-Type: text/plain;
self.assertEqual(msg.get_content_charset(),
'this is even more ***fun*** is it not.pdf')
# test_headerregistry.TestContentTypeHeader.rfc2231_unknown_charset_treated_as_ascii
def test_rfc2231_bad_encoding_in_filename(self):
m = '''\
Content-Disposition: inline;
......@@ -4762,6 +4786,7 @@ Content-Type: application/x-foo;
eq(language, None)
eq(s, "Frank's Document")
# test_headerregistry.TestContentTypeHeader.rfc2231_single_quote_inside_double_quotes
def test_rfc2231_single_tick_in_filename(self):
m = """\
Content-Type: application/x-foo; name*0=\"Frank's\"; name*1=\" Document\"
......@@ -4772,6 +4797,7 @@ Content-Type: application/x-foo; name*0=\"Frank's\"; name*1=\" Document\"
self.assertFalse(isinstance(param, tuple))
self.assertEqual(param, "Frank's Document")
# test_headerregistry.TestContentTypeHeader.rfc2231_single_quote_in_value_with_charset_and_lang
def test_rfc2231_tick_attack_extended(self):
eq = self.assertEqual
m = """\
......@@ -4785,6 +4811,7 @@ Content-Type: application/x-foo;
eq(language, 'en-us')
eq(s, "Frank's Document")
# test_headerregistry.TestContentTypeHeader.rfc2231_single_quote_in_non_encoded_value
def test_rfc2231_tick_attack(self):
m = """\
Content-Type: application/x-foo;
......@@ -4796,6 +4823,7 @@ Content-Type: application/x-foo;
self.assertFalse(isinstance(param, tuple))
self.assertEqual(param, "us-ascii'en-us'Frank's Document")
# test_headerregistry.TestContentTypeHeader.rfc2231_single_quotes_inside_quotes
def test_rfc2231_no_extended_values(self):
eq = self.assertEqual
m = """\
......@@ -4805,6 +4833,7 @@ Content-Type: application/x-foo; name=\"Frank's Document\"
msg = email.message_from_string(m)
eq(msg.get_param('name'), "Frank's Document")
# test_headerregistry.TestContentTypeHeader.rfc2231_encoded_then_unencoded_segments
def test_rfc2231_encoded_then_unencoded_segments(self):
eq = self.assertEqual
m = """\
......@@ -4820,6 +4849,8 @@ Content-Type: application/x-foo;
eq(language, 'en-us')
eq(s, 'My Document For You')
# test_headerregistry.TestContentTypeHeader.rfc2231_unencoded_then_encoded_segments
# test_headerregistry.TestContentTypeHeader.rfc2231_quoted_unencoded_then_encoded_segments
def test_rfc2231_unencoded_then_encoded_segments(self):
eq = self.assertEqual
m = """\
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment