#
# ElementTree
# $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
#
# limited xpath support for element trees
#
# history:
# 2003-05-23 fl   created
# 2003-05-28 fl   added support for // etc
# 2003-08-27 fl   fixed parsing of periods in element names
# 2007-09-10 fl   new selection engine
# 2007-09-12 fl   fixed parent selector
# 2007-09-13 fl   added iterfind; changed findall to return a list
# 2007-11-30 fl   added namespaces support
# 2009-10-30 fl   added child element value filter
#
# Copyright (c) 2003-2009 by Fredrik Lundh.  All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2009 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

# Licensed to PSF under a Contributor Agreement.
# See http://www.python.org/psf/license for licensing details.

Armin Rigo's avatar
Armin Rigo committed
53 54 55 56 57 58 59 60
##
# Implementation module for XPath support.  There's usually no reason
# to import this module directly; the <b>ElementTree</b> does this for
# you, if needed.
##

import re

61 62 63 64 65 66 67 68 69 70 71
# Tokenizer for the supported XPath subset.  Each findall() match is a
# 2-tuple (operator, name):
#   group 1 - quoted strings and operator/punctuation tokens,
#   group 2 - names, optionally qualified as "{uri}local",
# while a run of whitespace matches with both groups empty.
# The fragments are raw strings so that the regex escapes (\., \(, \[, \s)
# are not interpreted as (invalid) string-literal escapes.
xpath_tokenizer_re = re.compile(
    r"("
    r"'[^']*'|\"[^\"]*\"|"
    r"::|"
    r"//?|"
    r"\.\.|"
    r"\(\)|"
    r"[/.*:\[\]\(\)@=])|"
    r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
    r"\s+"
    )
Armin Rigo's avatar
Armin Rigo committed
72

73 74 75 76 77 78 79 80 81 82 83 84 85
def xpath_tokenizer(pattern, namespaces=None):
    """Yield (operator, name) token pairs for *pattern*.

    Tokens come from xpath_tokenizer_re.  A name written as "prefix:local"
    (and not already in "{uri}local" form) is rewritten to "{uri}local" by
    looking the prefix up in *namespaces*.

    Raises SyntaxError when a prefix is used but *namespaces* is empty/None
    or does not contain that prefix.
    """
    for token in xpath_tokenizer_re.findall(pattern):
        name = token[1]
        is_prefixed = name and name[0] != "{" and ":" in name
        if not is_prefixed:
            # operators, whitespace, plain and already-qualified names
            yield token
            continue
        try:
            prefix, local = name.split(":", 1)
            if not namespaces:
                # no prefix map at all: report it like an unknown prefix
                raise KeyError
            yield token[0], "{%s}%s" % (namespaces[prefix], local)
        except KeyError:
            raise SyntaxError("prefix %r not found in prefix map" % prefix)
Armin Rigo's avatar
Armin Rigo committed
86

87 88 89 90 91 92 93 94
def get_parent_map(context):
    """Return the child -> parent mapping for the tree at ``context.root``.

    The mapping is built on first use by walking ``context.root.iter()`` and
    is stored on ``context.parent_map`` so later calls reuse it.
    """
    cached = context.parent_map
    if cached is not None:
        return cached
    mapping = {}
    # publish the dict before filling it, then populate it in place
    context.parent_map = mapping
    for parent in context.root.iter():
        for child in parent:
            mapping[child] = parent
    return mapping
Armin Rigo's avatar
Armin Rigo committed
95

96 97 98 99 100 101 102 103
def prepare_child(next, token):
    """Build the selector for a plain tag step.

    *token* is the (operator, name) pair for the step; *next* is unused.
    The returned ``select(context, result)`` generator yields, for each
    element in *result*, its direct children whose tag equals the name.
    """
    wanted = token[1]

    def select(context, result):
        for parent in result:
            for child in parent:
                if child.tag != wanted:
                    continue
                yield child

    return select
Armin Rigo's avatar
Armin Rigo committed
104

105 106 107
def prepare_star(next, token):
    """Build the selector for the ``*`` step.

    *next* and *token* are unused.  The returned ``select(context, result)``
    generator yields every direct child of every element in *result*, in
    order.
    """
    def select(context, result):
        for elem in result:
            yield from elem
    return select
Armin Rigo's avatar
Armin Rigo committed
110

111 112
def prepare_self(next, token):
    """Build the selector for the ``.`` step.

    *next* and *token* are unused.  The returned ``select(context, result)``
    generator yields the elements of *result* unchanged.
    """
    def select(context, result):
        yield from result
    return select
Armin Rigo's avatar
Armin Rigo committed
115

116 117 118 119 120 121 122 123 124 125 126 127 128 129
def prepare_descendant(next, token):
    """Build the selector for a ``//`` step.

    The token following ``//`` is fetched with *next*; it must be ``*`` or a
    tag name, otherwise SyntaxError("invalid descendant") is raised.  The
    returned ``select(context, result)`` generator yields, for each element
    in *result*, the elements produced by ``elem.iter(tag)`` except the
    element itself.
    """
    following = next()
    op = following[0]
    if op == "*":
        wanted = "*"
    elif op:
        # any other operator right after "//" is not supported
        raise SyntaxError("invalid descendant")
    else:
        wanted = following[1]

    def select(context, result):
        for start in result:
            for node in start.iter(wanted):
                if node is start:
                    # iter() includes the starting element; skip it
                    continue
                yield node

    return select
Armin Rigo's avatar
Armin Rigo committed
130

131 132 133 134 135 136 137 138 139 140 141 142
def prepare_parent(next, token):
    """Build the selector for the ``..`` step.

    *next* and *token* are unused.  The returned ``select(context, result)``
    generator yields the parent of each element in *result*, looked up in
    get_parent_map(context).  Each parent is yielded once; elements without
    an entry in the parent map are skipped.
    """
    def select(context, result):
        # FIXME: raise error if .. is applied at toplevel?
        parents = get_parent_map(context)
        seen = set()
        for elem in result:
            if elem not in parents:
                continue
            parent = parents[elem]
            if parent in seen:
                continue
            seen.add(parent)
            yield parent

    return select
Armin Rigo's avatar
Armin Rigo committed
143

144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
def prepare_predicate(next, token):
    """Compile a bracketed predicate into a selector.

    Tokens are pulled from *next* up to the closing ``]`` and reduced to a
    "signature" string (operator tokens as-is, names as ``-``, quoted
    strings as ``'``) that identifies the predicate form:

      [@attribute]          attribute is present
      [@attribute='value']  attribute equals value
      [tag]                 has a child matching tag
      [tag='value']         has a child matching tag whose full text is value
      [index]               1-based position among same-tag siblings
      [last()]              last among same-tag siblings
      [last()-index]        offset back from the last same-tag sibling

    Whitespace tokens inside the brackets are ignored.

    Returns a ``select(context, result)`` generator function that yields
    the elements of *result* satisfying the predicate.

    Raises SyntaxError for an unsupported function, a non-numeric
    ``last()`` offset, or an unrecognised predicate form.  StopIteration
    from *next* (no closing ``]``) propagates to the caller.
    """
    # FIXME: replace with real parser!!! refs:
    # http://effbot.org/zone/simple-iterator-parser.htm
    # http://javascript.crockford.com/tdop/tdop.html
    signature = []
    predicate = []
    while 1:
        token = next()
        if token[0] == "]":
            break
        if token == ("", ""):
            # whitespace carries no meaning; keep it out of the signature
            continue
        if token[0] and token[0][:1] in "'\"":
            # quoted string: normalise the marker and strip the quotes
            token = "'", token[0][1:-1]
        signature.append(token[0] or "-")
        predicate.append(token[1])
    signature = "".join(signature)
    # use signature to determine predicate type
    if signature == "@-":
        # [@attribute] predicate
        key = predicate[1]
        def select(context, result):
            for elem in result:
                if elem.get(key) is not None:
                    yield elem
        return select
    if signature == "@-='":
        # [@attribute='value']
        key = predicate[1]
        value = predicate[-1]
        def select(context, result):
            for elem in result:
                if elem.get(key) == value:
                    yield elem
        return select
    if signature == "-" and not re.match(r"\d+$", predicate[0]):
        # [tag]
        tag = predicate[0]
        def select(context, result):
            for elem in result:
                if elem.find(tag) is not None:
                    yield elem
        return select
    if signature == "-='" and not re.match(r"\d+$", predicate[0]):
        # [tag='value']
        tag = predicate[0]
        value = predicate[-1]
        def select(context, result):
            for elem in result:
                for e in elem.findall(tag):
                    if "".join(e.itertext()) == value:
                        yield elem
                        break
        return select
    if signature == "-" or signature == "-()" or signature == "-()-":
        # [index] or [last()] or [last()-index]
        if signature == "-":
            # XPath positions are 1-based; list indexes are 0-based
            index = int(predicate[0]) - 1
        else:
            if predicate[0] != "last":
                raise SyntaxError("unsupported function")
            if signature == "-()-":
                # the offset is tokenized as one name such as "-1", so
                # int() gives a negative number and -1 more counts back
                # from the last item
                try:
                    index = int(predicate[2]) - 1
                except ValueError:
                    raise SyntaxError("unsupported expression")
            else:
                index = -1
        def select(context, result):
            parent_map = get_parent_map(context)
            for elem in result:
                try:
                    parent = parent_map[elem]
                    # FIXME: what if the selector is "*" ?
                    elems = list(parent.findall(elem.tag))
                    if elems[index] is elem:
                        yield elem
                except (IndexError, KeyError):
                    # no parent known, or fewer siblings than the index
                    pass
        return select
    raise SyntaxError("invalid predicate")

# Dispatch table used while compiling a path: maps the operator part of a
# token (token[0], which is "" for a plain tag name) to the prepare_*
# factory that builds the selector for that step.
ops = {
    "": prepare_child,          # tag
    "*": prepare_star,          # all children
    ".": prepare_self,          # current element(s)
    "..": prepare_parent,       # parent element(s)
    "//": prepare_descendant,   # descendants at any depth
    "[": prepare_predicate,     # [...] filter
    }
Armin Rigo's avatar
Armin Rigo committed
232 233 234

# Cache of compiled selector lists, filled by iterfind(), which also empties
# it once it holds more than 100 entries.
_cache = {}

235 236 237 238 239 240 241
class _SelectorContext:
    """Per-query state passed to every selector as its ``context`` argument."""
    # Child -> parent mapping for the tree under ``root``; stays None until
    # get_parent_map() builds it on first use.
    parent_map = None
    def __init__(self, root):
        # Element the search was started from.
        self.root = root

# --------------------------------------------------------------------

##
# Generate all matching objects.

def iterfind(elem, path, namespaces=None):
    """Return an iterable over the elements matching *path*, relative to *elem*.

    *path* is compiled into a list of selector functions (one per step) and
    the compiled list is cached.  *namespaces*, if given, maps prefixes used
    in *path* to namespace URIs.  A path ending in "/" is treated as if it
    ended in "/*".  An empty path matches nothing.

    Raises SyntaxError for an absolute path (leading "/"), for a path that
    ends in the middle of a step, and for whatever the tokenizer or the
    prepare_* functions reject.
    """
    # compile selector pattern
    if path[-1:] == "/":
        path = path + "*" # implicit all (FIXME: keep this?)
    # Prefixes are expanded while compiling, so the prefix map has to be part
    # of the key; keyed by path alone, a selector compiled for one map would
    # be silently reused for a different one.
    cache_key = (path, frozenset(namespaces.items()) if namespaces else None)
    try:
        selector = _cache[cache_key]
    except KeyError:
        if len(_cache) > 100:
            _cache.clear()
        if path[:1] == "/":
            raise SyntaxError("cannot use absolute path on element")
        next_token = iter(xpath_tokenizer(path, namespaces)).__next__
        try:
            token = next_token()
        except StopIteration:
            # Empty path: hand back an empty iterator rather than letting
            # StopIteration escape to callers such as findall().
            return iter(())
        selector = []
        while 1:
            try:
                selector.append(ops[token[0]](next_token, token))
            except StopIteration:
                # a prepare_* function ran out of tokens mid-step
                raise SyntaxError("invalid path") from None
            try:
                token = next_token()
                if token[0] == "/":
                    # step separator: move on to the token after it
                    token = next_token()
            except StopIteration:
                break
        _cache[cache_key] = selector
    # execute selector pattern
    result = [elem]
    context = _SelectorContext(elem)
    for select in selector:
        # each selector lazily consumes the previous step's output
        result = select(context, result)
    return result
Armin Rigo's avatar
Armin Rigo committed
277 278 279 280

##
# Find first matching object.

def find(elem, path, namespaces=None):
    """Return the first element matching *path* under *elem*, or None.

    Arguments are passed straight to iterfind().
    """
    try:
        matches = iterfind(elem, path, namespaces)
        first = next(matches)
    except StopIteration:
        first = None
    return first
Armin Rigo's avatar
Armin Rigo committed
286 287

##
# Find all matching objects.

def findall(elem, path, namespaces=None):
    """Return a list of all elements matching *path* under *elem*.

    Arguments are passed straight to iterfind(); the result is materialised
    in the order iterfind() produces it.
    """
    matches = iterfind(elem, path, namespaces)
    return list(matches)
Armin Rigo's avatar
Armin Rigo committed
292 293

##
# Find text for first matching object.

def findtext(elem, path, default=None, namespaces=None):
    """Return the text of the first element matching *path* under *elem*.

    Returns "" when the matched element's ``text`` is empty or None, and
    *default* when nothing matches.
    """
    try:
        first = next(iterfind(elem, path, namespaces))
        text = first.text or ""
    except StopIteration:
        return default
    return text