import xml.sax
import xml.sax.handler
import types

try:
    _StringTypes = [types.StringType, types.UnicodeType]
except AttributeError:
    _StringTypes = [types.StringType]
# Event tokens.  Each queued event is a (token, payload) tuple whose first
# item is one of the strings below.

# Document boundaries.
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"

# Element boundaries.
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"

# Leaf content.
CHARACTERS = "CHARACTERS"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
COMMENT = "COMMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that records parser callbacks as DOM events.

    Every callback builds the matching DOM node and queues a
    ``(token, node)`` tuple.  The queue is a singly linked list of two-item
    cells ``[event, next_cell]``: ``firstEvent`` is a sentinel head cell
    whose second slot points at the oldest queued event, and ``lastEvent``
    is the cell new events are linked after.
    """

    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Set up an empty event queue and element stack.

        documentFactory -- object whose ``createDocument(uri, qname,
            doctype)`` builds the document.  When None, startDocument()
            substitutes minidom's DOM implementation.
        """
        self.documentFactory = documentFactory
        # Sentinel head cell; it never carries an event itself.
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        # Stack of nodes that are currently open (document, then elements).
        self.elementStack = []
        self.push = self.elementStack.append
        try:
            # Bind the list's own pop for speed; it shadows the method below.
            self.pop = self.elementStack.pop
        except AttributeError:
            # use class' pop instead
            pass
        self._ns_contexts = [{}]  # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        # Comments / PIs seen before the document node exists; flushed by
        # buildDocument().
        self.pending_events = []

    def pop(self):
        """Remove and return the top of the element stack (fallback)."""
        result = self.elementStack[-1]
        del self.elementStack[-1]
        return result

    def _append_event(self, event):
        """Link *event* into the queue as the new tail cell."""
        cell = [event, None]
        self.lastEvent[1] = cell
        self.lastEvent = cell

    def _qualified_name(self, uri, localname):
        """Return ``prefix:localname`` using the prefix mapped to *uri*.

        Raises KeyError when *uri* is not in the current namespace context.
        An empty prefix (default namespace) yields the bare local name.
        """
        prefix = self._current_context[uri]
        if prefix:
            return prefix + ":" + localname
        return localname

    def setDocumentLocator(self, locator):
        """Remember the locator supplied by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration that is coming into scope."""
        if not hasattr(self, '_xmlns_attrs'):
            self._xmlns_attrs = []
        # Kept until the next startElementNS(), which turns the pairs into
        # xmlns attributes on the element.
        self._xmlns_attrs.append((prefix or 'xmlns', uri))
        # Save a snapshot to restore in endPrefixMapping(), then extend the
        # live context in place.
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or ''

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by startPrefixMapping()."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName, attrs):
        """Create a namespace-aware element and queue START_ELEMENT.

        name    -- ``(uri, localname)`` pair.
        tagName -- qualified name, or None when the reader did not supply it.
        attrs   -- attribute collection keyed by ``(uri, localname)``.

        The first element also creates the document via buildDocument().
        """
        # Retrieve xml namespace declaration attributes.
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        xmlns_attrs = getattr(self, '_xmlns_attrs', None)
        if xmlns_attrs is not None:
            for aname, value in xmlns_attrs:
                attrs._attrs[(xmlns_uri, aname)] = value
            self._xmlns_attrs = []

        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not
            # provide us with the original name. If not, create
            # *a* valid tagName from the current context.
            if tagName is None:
                tagName = self._qualified_name(uri, localname)
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)
        else:
            # When the tagname is not prefixed, it just appears as
            # localname
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)

        for aname, value in attrs.items():
            a_uri, a_localname = aname
            if a_uri == xmlns_uri:
                # Namespace declaration: "xmlns" or "xmlns:<prefix>".
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            elif a_uri:
                qname = self._qualified_name(a_uri, a_localname)
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = value

        self._append_event((START_ELEMENT, node))
        self.push(node)

    def endElementNS(self, name, tagName):
        """Queue END_ELEMENT for the element on top of the stack."""
        self._append_event((END_ELEMENT, self.pop()))

    def startElement(self, name, attrs):
        """Create a non-namespace element and queue START_ELEMENT.

        The first element also creates the document via buildDocument().
        """
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self._append_event((START_ELEMENT, node))
        self.push(node)

    def endElement(self, name):
        """Queue END_ELEMENT for the element on top of the stack."""
        self._append_event((END_ELEMENT, self.pop()))

    def comment(self, s):
        """Queue a COMMENT event, deferring it if no document exists yet."""
        if self.document:
            node = self.document.createComment(s)
            self._append_event((COMMENT, node))
        else:
            # No document to create the node with; keep the raw text until
            # buildDocument() runs.
            self.pending_events.append([(COMMENT, s), None])

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, deferring it if needed."""
        if self.document:
            node = self.document.createProcessingInstruction(target, data)
            self._append_event((PROCESSING_INSTRUCTION, node))
        else:
            # Stored as a three-item tuple; buildDocument() converts it to
            # (PROCESSING_INSTRUCTION, node).
            self.pending_events.append(
                [(PROCESSING_INSTRUCTION, target, data), None])

    def ignorableWhitespace(self, chars):
        """Queue IGNORABLE_WHITESPACE carrying a new text node."""
        node = self.document.createTextNode(chars)
        self._append_event((IGNORABLE_WHITESPACE, node))

    def characters(self, chars):
        """Queue CHARACTERS carrying a new text node."""
        node = self.document.createTextNode(chars)
        self._append_event((CHARACTERS, node))

    def startDocument(self):
        """Choose minidom's implementation if no factory was supplied."""
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document for the root element and return that element.

        Queues START_DOCUMENT, pushes the document on the element stack and
        then flushes the comments / processing instructions that arrived
        before the document existed.  Returns the document's first child.

        Raises AssertionError if a pending event has an unexpected token.
        """
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        node = self.documentFactory.createDocument(uri, tagname, None)
        self.document = node
        self._append_event((START_DOCUMENT, node))
        self.push(node)
        # Put everything we have seen so far into the document
        for e in self.pending_events:
            token = e[0][0]
            if token == PROCESSING_INSTRUCTION:
                _, target, data = e[0]
                n = self.document.createProcessingInstruction(target, data)
                e[0] = (PROCESSING_INSTRUCTION, n)
            elif token == COMMENT:
                n = self.document.createComment(e[0][1])
                e[0] = (COMMENT, n)
            else:
                raise AssertionError("Unknown pending event ",e[0][0])
            # The pending cell itself becomes the new tail of the queue.
            self.lastEvent[1] = e
            self.lastEvent = e
        self.pending_events = None
        return node.firstChild

    def endDocument(self):
        """Queue END_DOCUMENT and pop the document off the element stack."""
        # The tail pointer is deliberately left where it is, exactly as the
        # original code did; only the link is written.
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        "clear(): Explicitly release parsing structures"
        self.document = None

class ErrorHandler:
    """Simple SAX error handler: print warnings, raise errors."""

    def warning(self, exception):
        """Print *exception* to standard output and continue."""
        # Parenthesised form is valid as both a print statement and a
        # print() call, and produces the same output either way.
        print(exception)

    def error(self, exception):
        """Raise *exception* unchanged."""
        raise exception

    def fatalError(self, exception):
        """Raise *exception* unchanged."""
        raise exception

class DOMEventStream:
    """Pull-style source of DOM events read incrementally from a stream.

    Input is read in chunks of ``bufsize`` and fed to the parser only when
    the event queue is empty, so callers pull events one at a time via
    getEvent() or by indexing / iterating over the stream object.
    """

    def __init__(self, stream, parser, bufsize):
        """Store the inputs and install a fresh PullDOM on the parser.

        stream  -- object with a ``read(size)`` method.
        parser  -- SAX parser providing ``setFeature``, ``setContentHandler``,
                   ``feed`` and ``close``.
        bufsize -- number passed to each ``stream.read`` call.
        """
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        self.reset()

    def reset(self):
        """Create a new PullDOM handler and attach it to the parser."""
        self.pulldom = PullDOM()
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Return the next event; *pos* is ignored.

        Raises IndexError once getEvent() returns a false value, which ends
        a ``for`` loop over the stream.
        """
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def expandNode(self, node):
        """Consume events and attach them beneath *node* until it closes.

        Every event payload whose token is not END_ELEMENT is appended to the
        innermost open element.  Returns when an event carrying *node* itself
        is seen, or when the events run out.
        """
        event = self.getEvent()
        # Stack of currently open elements; *node* is the outermost.
        parents = [node]
        while event:
            token, cur_node = event
            if cur_node is node:
                return
            if token != END_ELEMENT:
                parents[-1].appendChild(cur_node)
            if token == START_ELEMENT:
                parents.append(cur_node)
            elif token == END_ELEMENT:
                del parents[-1]
            event = self.getEvent()

    def getEvent(self):
        """Return the oldest queued event, parsing more input as needed.

        Returns None (after closing the parser) when the stream is exhausted
        and no event is queued.
        """
        pulldom = self.pulldom
        head = pulldom.firstEvent
        if not head[1]:
            # Queue drained: new events must be linked right after the head.
            pulldom.lastEvent = head
        while not head[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                self.parser.close()
                return None
            self.parser.feed(buf)
        # Unlink the first cell and hand back the event it carries.
        cell = head[1]
        head[1] = cell[1]
        return cell[0]

    def clear(self):
        "clear(): Explicitly release parsing objects"
        self.pulldom.clear()
        del self.pulldom
        self.parser = None
        self.stream = None

class SAX2DOM(PullDOM):
    """PullDOM variant that also links each new node into the DOM tree.

    After the base class has created a node and queued its event, the node
    is appended to the element currently on top of the element stack.
    """

    def startElementNS(self, name, tagName, attrs):
        """Create the element as PullDOM does, then append it to its parent."""
        PullDOM.startElementNS(self, name, tagName, attrs)
        # The base class pushed the new element last; its parent sits
        # directly beneath it on the stack.
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def startElement(self, name, attrs):
        """Create the element as PullDOM does, then append it to its parent."""
        PullDOM.startElement(self, name, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def processingInstruction(self, target, data):
        """Queue the processing instruction and append it to the open node."""
        PullDOM.processingInstruction(self, target, data)
        # NOTE(review): when no document exists yet the base class only
        # stores the event in pending_events and leaves lastEvent alone, so
        # lastEvent[0] may still be None here -- confirm whether processing
        # instructions before the root element need handling.
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        """Queue the whitespace text node and append it to the open node."""
        PullDOM.ignorableWhitespace(self, chars)
        # The tail cell holds the (token, node) tuple just queued.
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def characters(self, chars):
        """Queue the text node and append it to the open node."""
        PullDOM.characters(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)
# Chunk size used by parse() when the caller gives no bufsize: 2**14 less 20.
# NOTE(review): the reason for the 20 subtracted is not stated in this file.
default_bufsize = (1 << 14) - 20

def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream over a file name or an open stream.

    stream_or_string -- a file name (any type listed in _StringTypes, or a
        subclass of one) or an object with a ``read(size)`` method.
    parser  -- SAX parser to use; a false value selects
        ``xml.sax.make_parser()``.
    bufsize -- chunk size for reads; None selects ``default_bufsize``.

    A file opened here from a name is not closed by this function.
    """
    if bufsize is None:
        bufsize = default_bufsize
    # isinstance() also accepts subclasses of the string types, which an
    # exact type() comparison would have mistaken for stream objects.
    if isinstance(stream_or_string, tuple(_StringTypes)):
        # Binary mode hands the raw bytes to the parser, so encoding
        # detection and line endings are left to the parser.
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
def parseString(string, parser=None):
    """Return a DOMEventStream over XML held in memory.

    string -- the document text.
    parser -- SAX parser to use; a false value selects
        ``xml.sax.make_parser()``.

    The buffer size is the length of *string*, so the stream's read call
    asks for the whole text at once.
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        try:
            from StringIO import StringIO
        except ImportError:
            # Neither legacy module is available; use the io implementation.
            from io import StringIO

    bufsize = len(string)
    buf = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)