Kaydet (Commit) 5b235d09 authored tarafından Antoine Pitrou's avatar Antoine Pitrou

Issue #17741: Add ElementTree.IncrementalParser, an event-driven parser for…

Issue #17741: Add ElementTree.IncrementalParser, an event-driven parser for non-blocking applications.
üst 323d2927
...@@ -397,6 +397,9 @@ Functions ...@@ -397,6 +397,9 @@ Functions
If you need a fully populated element, look for "end" events instead. If you need a fully populated element, look for "end" events instead.
.. note::
For real event-driven parsing, see :class:`IncrementalParser`.
.. function:: parse(source, parser=None) .. function:: parse(source, parser=None)
...@@ -833,6 +836,48 @@ QName Objects ...@@ -833,6 +836,48 @@ QName Objects
:class:`QName` instances are opaque. :class:`QName` instances are opaque.
IncrementalParser Objects
^^^^^^^^^^^^^^^^^^^^^^^^^
.. class:: IncrementalParser(events=None, parser=None)
An incremental, event-driven parser suitable for non-blocking applications.
*events* is a sequence of events to report back. The supported events are the
strings ``"start"``, ``"end"``, ``"start-ns"`` and ``"end-ns"`` (the "ns"
events are used to get detailed namespace information). If *events* is
omitted, only ``"end"`` events are reported. *parser* is an optional
parser instance. If not given, the standard :class:`XMLParser` parser is
used.
.. method:: data_received(data)
Feed the given bytes data to the incremental parser.
.. method:: eof_received()
Signal the incremental parser that the data stream is terminated.
.. method:: events()
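Iterate over the events which have been encountered in the data fed
to the parser. This method yields ``(event, elem)`` pairs, where
*event* is a string representing the type of event (e.g. ``"end"``)
and *elem* is the encountered :class:`Element` object. For
``"start-ns"`` events the second item is instead a ``(prefix, uri)``
tuple, and for ``"end-ns"`` events it is ``None``. Each event is
reported only once: events that have already been yielded are dropped
from the parser's internal queue.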
.. note::
:class:`IncrementalParser` only guarantees that it has seen the ">"
character of a starting tag when it emits a "start" event, so the
attributes are defined, but the contents of the text and tail attributes
are undefined at that point. The same applies to the element children;
they may or may not be present.
If you need a fully populated element, look for "end" events instead.
.. versionadded:: 3.4
.. _elementtree-treebuilder-objects: .. _elementtree-treebuilder-objects:
TreeBuilder Objects TreeBuilder Objects
......
...@@ -903,6 +903,134 @@ class ElementTreeTest(unittest.TestCase): ...@@ -903,6 +903,134 @@ class ElementTreeTest(unittest.TestCase):
self.assertEqual(serialized, expected) self.assertEqual(serialized, expected)
# Tests for ET.IncrementalParser: XML text is pushed in with data_received()
# and the events queued so far are read back with events().
class IncrementalParserTest(unittest.TestCase):
# Push *data* into *parser*: in a single call when chunk_size is None,
# otherwise as consecutive slices of chunk_size characters.
def _feed(self, parser, data, chunk_size=None):
if chunk_size is None:
parser.data_received(data)
else:
for i in range(0, len(data), chunk_size):
parser.data_received(data[i:i+chunk_size])
# Drain parser.events() and compare the resulting (action, tag) pairs with
# *expected*. Every queued payload must have a .tag attribute, so this
# helper is only usable with "start"/"end" events.
def assert_event_tags(self, parser, expected):
events = parser.events()
self.assertEqual([(action, elem.tag) for action, elem in events],
expected)
# Default configuration (no *events* argument): only "end" events are
# expected. The same scenario runs with the data fed whole and in chunks
# of 1 and 5 characters.
def test_simple_xml(self):
for chunk_size in (None, 1, 5):
with self.subTest(chunk_size=chunk_size):
parser = ET.IncrementalParser()
self.assert_event_tags(parser, [])
self._feed(parser, "<!-- comment -->\n", chunk_size)
self.assert_event_tags(parser, [])
self._feed(parser,
"<root>\n <element key='value'>text</element",
chunk_size)
# The closing tag is still missing its ">", so no "end" event yet.
self.assert_event_tags(parser, [])
self._feed(parser, ">\n", chunk_size)
self.assert_event_tags(parser, [('end', 'element')])
self._feed(parser, "<element>text</element>tail\n", chunk_size)
self._feed(parser, "<empty-element/>\n", chunk_size)
# Events from both feeds above are reported together, in document order.
self.assert_event_tags(parser, [
('end', 'element'),
('end', 'empty-element'),
])
self._feed(parser, "</root>\n", chunk_size)
self.assert_event_tags(parser, [('end', 'root')])
# Receiving EOF sets the `root` attribute
self.assertIs(parser.root, None)
parser.eof_received()
self.assertEqual(parser.root.tag, 'root')
# An events() iterator created before any data was fed must still see the
# events queued by later data_received() calls, and stop once none remain.
def test_data_received_while_iterating(self):
parser = ET.IncrementalParser()
it = parser.events()
self._feed(parser, "<root>\n <element key='value'>text</element>\n")
action, elem = next(it)
self.assertEqual((action, elem.tag), ('end', 'element'))
self._feed(parser, "</root>\n")
action, elem = next(it)
self.assertEqual((action, elem.tag), ('end', 'root'))
with self.assertRaises(StopIteration):
next(it)
# Same scenario as test_simple_xml (fed whole only), with a default
# namespace declared on the root: tags are expected as "{namespace}name".
def test_simple_xml_with_ns(self):
parser = ET.IncrementalParser()
self.assert_event_tags(parser, [])
self._feed(parser, "<!-- comment -->\n")
self.assert_event_tags(parser, [])
self._feed(parser, "<root xmlns='namespace'>\n")
self.assert_event_tags(parser, [])
self._feed(parser, "<element key='value'>text</element")
self.assert_event_tags(parser, [])
self._feed(parser, ">\n")
self.assert_event_tags(parser, [('end', '{namespace}element')])
self._feed(parser, "<element>text</element>tail\n")
self._feed(parser, "<empty-element/>\n")
self.assert_event_tags(parser, [
('end', '{namespace}element'),
('end', '{namespace}empty-element'),
])
self._feed(parser, "</root>\n")
self.assert_event_tags(parser, [('end', '{namespace}root')])
# Receiving EOF sets the `root` attribute
self.assertIs(parser.root, None)
parser.eof_received()
self.assertEqual(parser.root.tag, '{namespace}root')
# Exercises the *events* argument with three selections: none, both
# "start" and "end", and "start" only.
def test_events(self):
# An empty selection reports nothing at all.
parser = ET.IncrementalParser(events=())
self._feed(parser, "<root/>\n")
self.assert_event_tags(parser, [])
parser = ET.IncrementalParser(events=('start', 'end'))
self._feed(parser, "<!-- comment -->\n")
self.assert_event_tags(parser, [])
self._feed(parser, "<root>\n")
self.assert_event_tags(parser, [('start', 'root')])
self._feed(parser, "<element key='value'>text</element")
# "start" is reported as soon as the opening tag is complete; "end" only
# follows once the closing tag's ">" has been fed.
self.assert_event_tags(parser, [('start', 'element')])
self._feed(parser, ">\n")
self.assert_event_tags(parser, [('end', 'element')])
self._feed(parser,
"<element xmlns='foo'>text<empty-element/></element>tail\n")
self.assert_event_tags(parser, [
('start', '{foo}element'),
('start', '{foo}empty-element'),
('end', '{foo}empty-element'),
('end', '{foo}element'),
])
self._feed(parser, "</root>")
parser.eof_received()
# The "end" event for the root is still queued, so `root` stays None
# until the queue has been drained.
self.assertIs(parser.root, None)
self.assert_event_tags(parser, [('end', 'root')])
self.assertEqual(parser.root.tag, 'root')
parser = ET.IncrementalParser(events=('start',))
self._feed(parser, "<!-- comment -->\n")
self.assert_event_tags(parser, [])
self._feed(parser, "<root>\n")
self.assert_event_tags(parser, [('start', 'root')])
self._feed(parser, "<element key='value'>text</element")
self.assert_event_tags(parser, [('start', 'element')])
self._feed(parser, ">\n")
# Completing the closing tag produces nothing: "end" was not requested.
self.assert_event_tags(parser, [])
self._feed(parser,
"<element xmlns='foo'>text<empty-element/></element>tail\n")
self.assert_event_tags(parser, [
('start', '{foo}element'),
('start', '{foo}empty-element'),
])
self._feed(parser, "</root>")
# No event is pending here, so `root` is available right after EOF.
parser.eof_received()
self.assertEqual(parser.root.tag, 'root')
# An unsupported event name must be rejected at construction time.
def test_unknown_event(self):
with self.assertRaises(ValueError):
ET.IncrementalParser(events=('start', 'end', 'bogus'))
# #
# xinclude tests (samples from appendix C of the xinclude specification) # xinclude tests (samples from appendix C of the xinclude specification)
...@@ -1406,6 +1534,7 @@ class BugsTest(unittest.TestCase): ...@@ -1406,6 +1534,7 @@ class BugsTest(unittest.TestCase):
ET.register_namespace('test10777', 'http://myuri/') ET.register_namespace('test10777', 'http://myuri/')
ET.register_namespace('test10777', 'http://myuri/') ET.register_namespace('test10777', 'http://myuri/')
# -------------------------------------------------------------------- # --------------------------------------------------------------------
...@@ -2301,6 +2430,7 @@ def test_main(module=None): ...@@ -2301,6 +2430,7 @@ def test_main(module=None):
ElementSlicingTest, ElementSlicingTest,
BasicElementTest, BasicElementTest,
ElementTreeTest, ElementTreeTest,
IncrementalParserTest,
IOTest, IOTest,
ParseErrorTest, ParseErrorTest,
XIncludeTest, XIncludeTest,
......
...@@ -1216,84 +1216,85 @@ def iterparse(source, events=None, parser=None): ...@@ -1216,84 +1216,85 @@ def iterparse(source, events=None, parser=None):
if not hasattr(source, "read"): if not hasattr(source, "read"):
source = open(source, "rb") source = open(source, "rb")
close_source = True close_source = True
if not parser:
parser = XMLParser(target=TreeBuilder())
return _IterParseIterator(source, events, parser, close_source) return _IterParseIterator(source, events, parser, close_source)
class _IterParseIterator:
def __init__(self, source, events, parser, close_source=False): class IncrementalParser:
self._file = source
self._close_file = close_source def __init__(self, events=None, parser=None):
self._events = [] # _elementtree.c expects a list, not a deque
self._events_queue = []
self._index = 0 self._index = 0
self._error = None
self.root = self._root = None self.root = self._root = None
if not parser:
parser = XMLParser(target=TreeBuilder())
self._parser = parser self._parser = parser
# wire up the parser for event reporting # wire up the parser for event reporting
parser = self._parser._parser
append = self._events.append
if events is None: if events is None:
events = ["end"] events = ("end",)
for event in events: self._parser._setevents(self._events_queue, events)
if event == "start":
try: def data_received(self, data):
parser.ordered_attributes = 1 if self._parser is None:
parser.specified_attributes = 1 raise ValueError("data_received() called after end of stream")
def handler(tag, attrib_in, event=event, append=append, if data:
start=self._parser._start_list): try:
append((event, start(tag, attrib_in))) self._parser.feed(data)
parser.StartElementHandler = handler except SyntaxError as exc:
except AttributeError: self._events_queue.append(exc)
def handler(tag, attrib_in, event=event, append=append,
start=self._parser._start): def eof_received(self):
append((event, start(tag, attrib_in))) self._root = self._parser.close()
parser.StartElementHandler = handler self._parser = None
elif event == "end": if self._index >= len(self._events_queue):
def handler(tag, event=event, append=append, self.root = self._root
end=self._parser._end):
append((event, end(tag))) def events(self):
parser.EndElementHandler = handler events = self._events_queue
elif event == "start-ns": while True:
def handler(prefix, uri, event=event, append=append): index = self._index
append((event, (prefix or "", uri or ""))) try:
parser.StartNamespaceDeclHandler = handler event = events[self._index]
elif event == "end-ns": # Avoid retaining references to past events
def handler(prefix, event=event, append=append): events[self._index] = None
append((event, None)) except IndexError:
parser.EndNamespaceDeclHandler = handler break
index += 1
# Compact the list in a O(1) amortized fashion
if index * 2 >= len(events):
events[:index] = []
self._index = 0
else: else:
raise ValueError("unknown event %r" % event) self._index = index
if isinstance(event, Exception):
raise event
else:
yield event
if self._parser is None:
self.root = self._root
class _IterParseIterator(IncrementalParser):
def __init__(self, source, events, parser, close_source=False):
IncrementalParser.__init__(self, events, parser)
self._file = source
self._close_file = close_source
def __next__(self): def __next__(self):
while 1: while 1:
try: for event in self.events():
item = self._events[self._index] return event
self._index += 1
return item
except IndexError:
pass
if self._error:
e = self._error
self._error = None
raise e
if self._parser is None: if self._parser is None:
self.root = self._root
if self._close_file: if self._close_file:
self._file.close() self._file.close()
raise StopIteration raise StopIteration
# load event buffer # load event buffer
del self._events[:]
self._index = 0
data = self._file.read(16384) data = self._file.read(16384)
if data: if data:
try: self.data_received(data)
self._parser.feed(data)
except SyntaxError as exc:
self._error = exc
else: else:
self._root = self._parser.close() self.eof_received()
self._parser = None
def __iter__(self): def __iter__(self):
return self return self
...@@ -1498,6 +1499,40 @@ class XMLParser: ...@@ -1498,6 +1499,40 @@ class XMLParser:
except AttributeError: except AttributeError:
pass # unknown pass # unknown
def _setevents(self, event_list, events):
# Internal API for IncrementalParser
parser = self._parser
append = event_list.append
for event in events:
if event == "start":
try:
parser.ordered_attributes = 1
parser.specified_attributes = 1
def handler(tag, attrib_in, event=event, append=append,
start=self._start_list):
append((event, start(tag, attrib_in)))
parser.StartElementHandler = handler
except AttributeError:
def handler(tag, attrib_in, event=event, append=append,
start=self._start):
append((event, start(tag, attrib_in)))
parser.StartElementHandler = handler
elif event == "end":
def handler(tag, event=event, append=append,
end=self._end):
append((event, end(tag)))
parser.EndElementHandler = handler
elif event == "start-ns":
def handler(prefix, uri, event=event, append=append):
append((event, (prefix or "", uri or "")))
parser.StartNamespaceDeclHandler = handler
elif event == "end-ns":
def handler(prefix, event=event, append=append):
append((event, None))
parser.EndNamespaceDeclHandler = handler
else:
raise ValueError("unknown event %r" % event)
def _raiseerror(self, value): def _raiseerror(self, value):
err = ParseError(value) err = ParseError(value)
err.code = value.code err.code = value.code
...@@ -1635,7 +1670,7 @@ try: ...@@ -1635,7 +1670,7 @@ try:
except ImportError: except ImportError:
pass pass
else: else:
# Overwrite 'ElementTree.parse' and 'iterparse' to use the C XMLParser # Overwrite 'ElementTree.parse' to use the C XMLParser
class ElementTree(ElementTree): class ElementTree(ElementTree):
__doc__ = ElementTree.__doc__ __doc__ = ElementTree.__doc__
...@@ -1661,56 +1696,6 @@ else: ...@@ -1661,56 +1696,6 @@ else:
if close_source: if close_source:
source.close() source.close()
class iterparse:
    # Iterator variant of iterparse() built on the accelerated XMLParser:
    # it reads the source in 16384-byte chunks, lets the parser append
    # events to a shared list and hands them out one at a time.
    __doc__ = iterparse.__doc__
    root = None

    def __init__(self, source, events=None, parser=None):
        # Anything without a read() method is treated as a file name; a
        # file opened here is also closed here once iteration finishes.
        opened_here = not hasattr(source, 'read')
        if opened_here:
            source = open(source, 'rb')
        self._close_file = opened_here
        self._file = source
        self._events = []
        self._index = 0
        self._error = None
        self.root = self._root = None
        self._parser = (XMLParser(target=TreeBuilder())
                        if parser is None else parser)
        self._parser._setevents(self._events, events)

    def __next__(self):
        while True:
            # Hand out the next buffered event, if any is left.
            buffered = self._events
            position = self._index
            if position < len(buffered):
                self._index = position + 1
                return buffered[position]

            # A parse error is reported only after the events queued
            # before it have been delivered.
            failure = self._error
            if failure:
                self._error = None
                raise failure

            # Parser already closed: publish the root and stop.
            if self._parser is None:
                self.root = self._root
                if self._close_file:
                    self._file.close()
                raise StopIteration

            # load event buffer
            # (cleared in place: the parser appends to this very list)
            del buffered[:]
            self._index = 0
            chunk = self._file.read(16384)
            if not chunk:
                self._root = self._parser.close()
                self._parser = None
                continue
            try:
                self._parser.feed(chunk)
            except SyntaxError as exc:
                self._error = exc

    def __iter__(self):
        return self
# compatibility # compatibility
XMLTreeBuilder = XMLParser XMLTreeBuilder = XMLParser
......
...@@ -45,6 +45,9 @@ Core and Builtins ...@@ -45,6 +45,9 @@ Core and Builtins
Library Library
------- -------
- Issue #17741: Add ElementTree.IncrementalParser, an event-driven parser
for non-blocking applications.
- Issue #17555: Fix ForkAwareThreadLock so that size of after fork - Issue #17555: Fix ForkAwareThreadLock so that size of after fork
registry does not grow exponentially with generation of process. registry does not grow exponentially with generation of process.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment