Commit bc6f8de7, authored by Serhiy Storchaka

Issue #21448: Fixed FeedParser feed() to avoid O(N**2) behavior when parsing a long line.

Original patch by Raymond Hettinger.
parent 3fdffc9f
...@@ -49,8 +49,8 @@ class BufferedSubFile(object): ...@@ -49,8 +49,8 @@ class BufferedSubFile(object):
simple abstraction -- it parses until EOF closes the current message. simple abstraction -- it parses until EOF closes the current message.
""" """
def __init__(self): def __init__(self):
# The last partial line pushed into this object. # Chunks of the last partial line pushed into this object.
self._partial = '' self._partial = []
# The list of full, pushed lines, in reverse order # The list of full, pushed lines, in reverse order
self._lines = [] self._lines = []
# The stack of false-EOF checking predicates. # The stack of false-EOF checking predicates.
...@@ -66,8 +66,8 @@ class BufferedSubFile(object): ...@@ -66,8 +66,8 @@ class BufferedSubFile(object):
def close(self):
    """Flush any buffered partial line and mark this object as closed."""
    # Don't forget any trailing partial line.  The buffered chunks are
    # joined and split again because push() only treats '\n' as a definite
    # line end, so the leftover text may still contain a '\r' terminator.
    self.pushlines(''.join(self._partial).splitlines(True))
    self._partial = []
    self._closed = True
def readline(self): def readline(self):
...@@ -95,8 +95,29 @@ class BufferedSubFile(object): ...@@ -95,8 +95,29 @@ class BufferedSubFile(object):
def push(self, data):
    """Push some new data into this object.

    Complete lines found in *data* are handed to pushlines().  An
    unterminated tail is kept as a list of chunks in self._partial until
    later data completes it.
    """
    # Crack into lines, but preserve the linesep characters on the end of
    # each.
    parts = data.splitlines(True)

    if not parts or not parts[0].endswith(('\n', '\r')):
        # No new complete lines, so just accumulate partials.  The chunks
        # are only joined once a line ending arrives, which avoids
        # re-copying an ever-growing string on every call.
        self._partial += parts
        return

    if self._partial:
        # If there are previous leftovers, complete them now.  The joined
        # text is split again because a leftover ending in '\r' turns out
        # to be a full line when the new data does not start with '\n'.
        self._partial.append(parts[0])
        parts[0:1] = ''.join(self._partial).splitlines(True)
        del self._partial[:]

    # If the last element of the list does not end in a newline, then treat
    # it as a partial line.  We only check for '\n' here because a line
    # ending with '\r' might be a line that was split in the middle of a
    # '\r\n' sequence (see bugs 1555570 and 1721862).
    if not parts[-1].endswith('\n'):
        self._partial = [parts.pop()]
    self.pushlines(parts)
def pushlines(self, lines):
# Crack into lines, but preserve the newlines on the end of each # Crack into lines, but preserve the newlines on the end of each
parts = NLCRE_crack.split(data) parts = NLCRE_crack.split(data)
# The *ahem* interesting behaviour of re.split when supplied grouping # The *ahem* interesting behaviour of re.split when supplied grouping
......
...@@ -11,6 +11,7 @@ import unittest ...@@ -11,6 +11,7 @@ import unittest
import warnings import warnings
import textwrap import textwrap
from cStringIO import StringIO from cStringIO import StringIO
from random import choice
import email import email
...@@ -2578,16 +2579,63 @@ Do you like this message? ...@@ -2578,16 +2579,63 @@ Do you like this message?
bsf.push(il) bsf.push(il)
nt += n nt += n
n1 = 0 n1 = 0
while True: for ol in iter(bsf.readline, NeedMoreData):
ol = bsf.readline()
if ol == NeedMoreData:
break
om.append(ol) om.append(ol)
n1 += 1 n1 += 1
self.assertEqual(n, n1) self.assertEqual(n, n1)
self.assertEqual(len(om), nt) self.assertEqual(len(om), nt)
self.assertEqual(''.join([il for il, n in imt]), ''.join(om)) self.assertEqual(''.join([il for il, n in imt]), ''.join(om))
def test_push_random(self):
    from email.feedparser import BufferedSubFile, NeedMoreData

    # Feed random text containing both '\r' and '\n' in small fixed-size
    # chunks, so line endings regularly straddle chunk boundaries, and
    # check that the lines read back match a one-shot splitlines().
    n = 10000
    chunksize = 5
    chars = 'abcd \t\r\n'

    # The trailing '\n' ensures the final line is complete, so nothing is
    # left buffered as a partial line when the loop ends.
    s = ''.join(choice(chars) for i in range(n)) + '\n'
    target = s.splitlines(True)

    bsf = BufferedSubFile()
    lines = []
    for i in range(0, len(s), chunksize):
        chunk = s[i:i+chunksize]
        bsf.push(chunk)
        # Drain every line that is available after this chunk.
        lines.extend(iter(bsf.readline, NeedMoreData))
    self.assertEqual(lines, target)
class TestFeedParsers(TestEmailBase):

    def parse(self, chunks):
        # Feed each chunk, in order, to a fresh FeedParser and return the
        # message object produced by closing it.
        from email.feedparser import FeedParser
        feedparser = FeedParser()
        for chunk in chunks:
            feedparser.feed(chunk)
        return feedparser.close()

    def test_newlines(self):
        # '\n', '\r' and '\r\n' are all accepted as header line endings.
        m = self.parse(['a:\nb:\rc:\r\nd:\n'])
        self.assertEqual(m.keys(), ['a', 'b', 'c', 'd'])
        # A final header without a line ending is still parsed on close().
        m = self.parse(['a:\nb:\rc:\r\nd:'])
        self.assertEqual(m.keys(), ['a', 'b', 'c', 'd'])

        # A header name split across two chunks is reassembled.
        m = self.parse(['a:\rb', 'c:\n'])
        self.assertEqual(m.keys(), ['a', 'bc'])
        # A chunk ending in '\r' followed by ordinary text: '\r' ends the line.
        m = self.parse(['a:\r', 'b:\n'])
        self.assertEqual(m.keys(), ['a', 'b'])
        # A '\r\n' pair split across chunks must count as one line ending.
        m = self.parse(['a:\r', '\nb:\n'])
        self.assertEqual(m.keys(), ['a', 'b'])

    def test_long_lines(self):
        # One very long unterminated line delivered as N chunks of M chars.
        M, N = 1000, 100000
        # Long line as the body, after a '\n\n' header/body separator.
        m = self.parse(['a:b\n\n'] + ['x'*M] * N)
        self.assertEqual(m.items(), [('a', 'b')])
        self.assertEqual(m.get_payload(), 'x'*M*N)
        # Same, with '\r\r' as the header/body separator.
        m = self.parse(['a:b\r\r'] + ['x'*M] * N)
        self.assertEqual(m.items(), [('a', 'b')])
        self.assertEqual(m.get_payload(), 'x'*M*N)
        # Long line as a header value.
        m = self.parse(['a:\r', 'b: '] + ['x'*M] * N)
        self.assertEqual(m.items(), [('a', ''), ('b', 'x'*M*N)])
class TestParsers(TestEmailBase): class TestParsers(TestEmailBase):
...@@ -3180,7 +3228,6 @@ A very long line that must get split to something other than at the ...@@ -3180,7 +3228,6 @@ A very long line that must get split to something other than at the
self.assertEqual(res, '=?iso-8859-2?q?abc?=') self.assertEqual(res, '=?iso-8859-2?q?abc?=')
self.assertIsInstance(res, str) self.assertIsInstance(res, str)
# Test RFC 2231 header parameters (en/de)coding # Test RFC 2231 header parameters (en/de)coding
class TestRFC2231(TestEmailBase): class TestRFC2231(TestEmailBase):
def test_get_param(self): def test_get_param(self):
......
...@@ -19,6 +19,9 @@ Core and Builtins ...@@ -19,6 +19,9 @@ Core and Builtins
Library Library
------- -------
- Issue #21448: Changed FeedParser feed() to avoid O(N**2) behavior when
parsing a long line. Original patch by Raymond Hettinger.
- Issue #17923: glob() patterns ending with a slash no longer match non-dirs on - Issue #17923: glob() patterns ending with a slash no longer match non-dirs on
AIX. Based on patch by Delhallt. AIX. Based on patch by Delhallt.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment