sre_parse.py 25.7 KB
Newer Older
1 2 3
#
# Secret Labs' Regular Expression Engine
#
4
# convert re-style regular expression to sre pattern
5
#
Fredrik Lundh's avatar
Fredrik Lundh committed
6
# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
7
#
8
# See the sre.py file for information on usage and redistribution.
9 10
#

11 12
"""Internal support module for sre"""

13 14
# XXX: show string offset and offending character for all errors

15 16
# this module works under 1.5.2 and later.  don't use string methods
import string, sys
17 18 19 20

from sre_constants import *

SPECIAL_CHARS = ".\\[{()*+?^$|"
21
REPEAT_CHARS = "*+?{"
22

23
DIGITS = tuple("0123456789")
24

25 26
OCTDIGITS = tuple("01234567")
HEXDIGITS = tuple("0123456789abcdefABCDEF")
27

28
WHITESPACE = tuple(" \t\n\r\v\f")
29

30
ESCAPES = {
31 32 33 34 35 36 37
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
38
    r"\\": (LITERAL, ord("\\"))
39 40 41
}

CATEGORIES = {
Fredrik Lundh's avatar
Fredrik Lundh committed
42
    r"\A": (AT, AT_BEGINNING_STRING), # start of string
43 44 45 46 47 48 49 50
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
Fredrik Lundh's avatar
Fredrik Lundh committed
51
    r"\Z": (AT, AT_END_STRING), # end of string
52 53
}

54
FLAGS = {
Fredrik Lundh's avatar
Fredrik Lundh committed
55
    # standard flags
56 57 58 59 60
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
Fredrik Lundh's avatar
Fredrik Lundh committed
61 62 63
    # extensions
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
64 65
}

66 67 68 69 70 71 72
# figure out best way to convert hex/octal numbers to integers
try:
    int("10", 8)
    atoi = int # 2.0 and later
except TypeError:
    atoi = string.atoi # 1.5.2

73 74
class Pattern:
    # master pattern object.  keeps track of global attributes
75
    def __init__(self):
76
        self.flags = 0
77
        self.open = []
78 79
        self.groups = 1
        self.groupdict = {}
80
    def opengroup(self, name=None):
81 82
        gid = self.groups
        self.groups = gid + 1
83
        if name is not None:
84 85
            ogid = self.groupdict.get(name, None)
            if ogid is not None:
86 87
                raise error, ("redefinition of group name %s as group %d; "
                              "was group %d" % (repr(name), gid,  ogid))
88
            self.groupdict[name] = gid
89
        self.open.append(gid)
90
        return gid
91 92 93 94
    def closegroup(self, gid):
        self.open.remove(gid)
    def checkgroup(self, gid):
        return gid < self.groups and gid not in self.open
95 96 97 98

class SubPattern:
    # a subpattern, in intermediate form
    def __init__(self, pattern, data=None):
99
        self.pattern = pattern
100
        if data is None:
101 102 103
            data = []
        self.data = data
        self.width = None
104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
    def dump(self, level=0):
        nl = 1
        for op, av in self.data:
            print level*"  " + op,; nl = 0
            if op == "in":
                # member sublanguage
                print; nl = 1
                for op, a in av:
                    print (level+1)*"  " + op, a
            elif op == "branch":
                print; nl = 1
                i = 0
                for a in av[1]:
                    if i > 0:
                        print level*"  " + "or"
                    a.dump(level+1); nl = 1
                    i = i + 1
            elif type(av) in (type(()), type([])):
                for a in av:
                    if isinstance(a, SubPattern):
                        if not nl: print
                        a.dump(level+1); nl = 1
                    else:
                        print a, ; nl = 0
            else:
                print av, ; nl = 0
            if not nl: print
131
    def __repr__(self):
132
        return repr(self.data)
133
    def __len__(self):
134
        return len(self.data)
135
    def __delitem__(self, index):
136
        del self.data[index]
137
    def __getitem__(self, index):
138
        return self.data[index]
139
    def __setitem__(self, index, code):
140
        self.data[index] = code
141
    def __getslice__(self, start, stop):
142
        return SubPattern(self.pattern, self.data[start:stop])
143
    def insert(self, index, code):
144
        self.data.insert(index, code)
145
    def append(self, code):
146
        self.data.append(code)
147
    def getwidth(self):
148 149 150 151 152 153
        # determine the width (min, max) for this subpattern
        if self.width:
            return self.width
        lo = hi = 0L
        for op, av in self.data:
            if op is BRANCH:
154 155
                i = sys.maxint
                j = 0
156
                for av in av[1]:
157 158
                    l, h = av.getwidth()
                    i = min(i, l)
Fredrik Lundh's avatar
Fredrik Lundh committed
159
                    j = max(j, h)
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
                lo = lo + i
                hi = hi + j
            elif op is CALL:
                i, j = av.getwidth()
                lo = lo + i
                hi = hi + j
            elif op is SUBPATTERN:
                i, j = av[1].getwidth()
                lo = lo + i
                hi = hi + j
            elif op in (MIN_REPEAT, MAX_REPEAT):
                i, j = av[2].getwidth()
                lo = lo + long(i) * av[0]
                hi = hi + long(j) * av[1]
            elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
                lo = lo + 1
                hi = hi + 1
            elif op == SUCCESS:
                break
        self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
        return self.width
181 182 183

class Tokenizer:
    def __init__(self, string):
184
        self.string = string
185 186
        self.index = 0
        self.__next()
187
    def __next(self):
188
        if self.index >= len(self.string):
189 190
            self.next = None
            return
191 192 193 194 195
        char = self.string[self.index]
        if char[0] == "\\":
            try:
                c = self.string[self.index + 1]
            except IndexError:
196
                raise error, "bogus escape (end of line)"
197 198
            char = char + c
        self.index = self.index + len(char)
199
        self.next = char
200
    def match(self, char, skip=1):
201
        if char == self.next:
202 203
            if skip:
                self.__next()
204 205
            return 1
        return 0
206
    def get(self):
207
        this = self.next
208
        self.__next()
209
        return this
210 211 212 213
    def tell(self):
        return self.index, self.next
    def seek(self, index):
        self.index, self.next = index
214

215 216 217 218 219 220 221 222 223
def isident(char):
    return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"

def isdigit(char):
    return "0" <= char <= "9"

def isname(name):
    # check that group name is a valid string
    if not isident(name[0]):
224
        return False
225
    for char in name:
226
        if not isident(char) and not isdigit(char):
227 228
            return False
    return True
229

230
def _group(escape, groups):
231 232
    # check if the escape string represents a valid group
    try:
233
        gid = atoi(escape[1:])
234 235
        if gid and gid < groups:
            return gid
236
    except ValueError:
237
        pass
238 239 240 241 242 243
    return None # not a valid group

def _class_escape(source, escape):
    # handle escape code inside character class
    code = ESCAPES.get(escape)
    if code:
244
        return code
245 246
    code = CATEGORIES.get(escape)
    if code:
247
        return code
248
    try:
249
        if escape[1:2] == "x":
250 251
            # hexadecimal escape (exactly two digits)
            while source.next in HEXDIGITS and len(escape) < 4:
252 253
                escape = escape + source.get()
            escape = escape[2:]
254 255
            if len(escape) != 2:
                raise error, "bogus escape: %s" % repr("\\" + escape)
256
            return LITERAL, atoi(escape, 16) & 0xff
257
        elif escape[1:2] in OCTDIGITS:
258 259
            # octal escape (up to three digits)
            while source.next in OCTDIGITS and len(escape) < 5:
260 261
                escape = escape + source.get()
            escape = escape[1:]
262
            return LITERAL, atoi(escape, 8) & 0xff
263
        if len(escape) == 2:
264
            return LITERAL, ord(escape[1])
265
    except ValueError:
266
        pass
Fredrik Lundh's avatar
Fredrik Lundh committed
267
    raise error, "bogus escape: %s" % repr(escape)
268 269 270 271 272

def _escape(source, escape, state):
    # handle escape code in expression
    code = CATEGORIES.get(escape)
    if code:
273
        return code
274
    code = ESCAPES.get(escape)
275
    if code:
276
        return code
277
    try:
278
        if escape[1:2] == "x":
279 280
            # hexadecimal escape
            while source.next in HEXDIGITS and len(escape) < 4:
281
                escape = escape + source.get()
282 283
            if len(escape) != 4:
                raise ValueError
284
            return LITERAL, atoi(escape[2:], 16) & 0xff
285 286
        elif escape[1:2] == "0":
            # octal escape
287
            while source.next in OCTDIGITS and len(escape) < 4:
288
                escape = escape + source.get()
289
            return LITERAL, atoi(escape[1:], 8) & 0xff
290
        elif escape[1:2] in DIGITS:
291 292 293
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape = escape + source.get()
294 295
                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
                    source.next in OCTDIGITS):
296
                    # got three octal digits; this is an octal escape
297
                    escape = escape + source.get()
298
                    return LITERAL, atoi(escape[1:], 8) & 0xff
299 300 301
            # got at least one decimal digit; this is a group reference
            group = _group(escape, state.groups)
            if group:
302 303
                if not state.checkgroup(group):
                    raise error, "cannot refer to open group"
304
                return GROUPREF, group
305
            raise ValueError
306
        if len(escape) == 2:
307
            return LITERAL, ord(escape[1])
308
    except ValueError:
309
        pass
Fredrik Lundh's avatar
Fredrik Lundh committed
310
    raise error, "bogus escape: %s" % repr(escape)
311

312 313
def _parse_sub(source, state, nested=1):
    # parse an alternation: a|b|c
314

315 316 317 318 319 320 321
    items = []
    while 1:
        items.append(_parse(source, state))
        if source.match("|"):
            continue
        if not nested:
            break
322
        if not source.next or source.match(")", 0):
323 324 325 326 327 328 329 330
            break
        else:
            raise error, "pattern not properly closed"

    if len(items) == 1:
        return items[0]

    subpattern = SubPattern(state)
331

332 333
    # check if all items share a common prefix
    while 1:
334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349
        prefix = None
        for item in items:
            if not item:
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                break
        else:
            # all subitems start with a common "prefix".
            # move it out of the branch
            for item in items:
                del item[0]
            subpattern.append(prefix)
            continue # check next one
        break
350 351 352

    # check if the branch can be replaced by a character set
    for item in items:
353 354
        if len(item) != 1 or item[0][0] != LITERAL:
            break
355
    else:
356
        # we can store this as a character set instead of a
357
        # branch (the compiler may optimize this even more)
358 359 360 361 362
        set = []
        for item in items:
            set.append(item[0])
        subpattern.append((IN, set))
        return subpattern
363 364

    subpattern.append((BRANCH, (None, items)))
365
    return subpattern
366

367 368 369 370 371 372 373 374 375 376 377 378 379 380
def _parse_sub_cond(source, state, condgroup):
    item_yes = _parse(source, state) 
    if source.match("|"):
        item_no = _parse(source, state) 
        if source.match("|"):
            raise error, "conditional backref with more than two branches"
    else:
        item_no = None
    if source.next and not source.match(")", 0):
        raise error, "pattern not properly closed"
    subpattern = SubPattern(state)
    subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    return subpattern

381
def _parse(source, state):
382
    # parse a simple pattern
383

384
    subpattern = SubPattern(state)
385 386 387

    while 1:

388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405
        if source.next in ("|", ")"):
            break # end of subpattern
        this = source.get()
        if this is None:
            break # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while 1:
                    this = source.get()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
406
            subpattern.append((LITERAL, ord(this)))
407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423

        elif this == "[":
            # character set
            set = []
##          if source.match(":"):
##              pass # handle character classes
            if source.match("^"):
                set.append((NEGATE, None))
            # check remaining characters
            start = set[:]
            while 1:
                this = source.get()
                if this == "]" and set != start:
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this)
                elif this:
424
                    code1 = LITERAL, ord(this)
425 426 427 428 429 430
                else:
                    raise error, "unexpected end of regular expression"
                if source.match("-"):
                    # potential range
                    this = source.get()
                    if this == "]":
431 432
                        if code1[0] is IN:
                            code1 = code1[1][0]
433
                        set.append(code1)
434
                        set.append((LITERAL, ord("-")))
435
                        break
436
                    elif this:
437 438 439
                        if this[0] == "\\":
                            code2 = _class_escape(source, this)
                        else:
440
                            code2 = LITERAL, ord(this)
441
                        if code1[0] != LITERAL or code2[0] != LITERAL:
442
                            raise error, "bad character range"
443 444 445
                        lo = code1[1]
                        hi = code2[1]
                        if hi < lo:
446
                            raise error, "bad character range"
447
                        set.append((RANGE, (lo, hi)))
448 449
                    else:
                        raise error, "unexpected end of regular expression"
450 451 452 453 454
                else:
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    set.append(code1)

Fredrik Lundh's avatar
Fredrik Lundh committed
455
            # XXX: <fl> should move set optimization to compiler!
456 457 458 459 460
            if len(set)==1 and set[0][0] is LITERAL:
                subpattern.append(set[0]) # optimization
            elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
                subpattern.append((NOT_LITERAL, set[1][1])) # optimization
            else:
Fredrik Lundh's avatar
Fredrik Lundh committed
461
                # XXX: <fl> should add charmap optimization here
462 463 464 465 466 467 468 469
                subpattern.append((IN, set))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                min, max = 0, 1
            elif this == "*":
                min, max = 0, MAXREPEAT
470

471 472 473
            elif this == "+":
                min, max = 1, MAXREPEAT
            elif this == "{":
474
                here = source.tell()
475 476 477 478 479 480 481 482 483 484
                min, max = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + source.get()
                if source.match(","):
                    while source.next in DIGITS:
                        hi = hi + source.get()
                else:
                    hi = lo
                if not source.match("}"):
485 486 487
                    subpattern.append((LITERAL, ord(this)))
                    source.seek(here)
                    continue
488
                if lo:
489
                    min = atoi(lo)
490
                if hi:
491
                    max = atoi(hi)
492 493
                if max < min:
                    raise error, "bad repeat interval"
494 495 496 497 498 499
            else:
                raise error, "not supported"
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
500 501
                item = None
            if not item or (len(item) == 1 and item[0][0] == AT):
502
                raise error, "nothing to repeat"
503 504
            if item[0][0] in (MIN_REPEAT, MAX_REPEAT):
                raise error, "multiple repeat"
505 506 507 508 509 510 511 512 513 514 515
            if source.match("?"):
                subpattern[-1] = (MIN_REPEAT, (min, max, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (min, max, item))

        elif this == ".":
            subpattern.append((ANY, None))

        elif this == "(":
            group = 1
            name = None
516
            condgroup = None
517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533
            if source.match("?"):
                group = 0
                # options
                if source.match("P"):
                    # python extensions
                    if source.match("<"):
                        # named group: skip forward to end of name
                        name = ""
                        while 1:
                            char = source.get()
                            if char is None:
                                raise error, "unterminated name"
                            if char == ">":
                                break
                            name = name + char
                        group = 1
                        if not isname(name):
534
                            raise error, "bad character in group name"
535 536
                    elif source.match("="):
                        # named backreference
537 538 539 540 541 542 543 544 545
                        name = ""
                        while 1:
                            char = source.get()
                            if char is None:
                                raise error, "unterminated name"
                            if char == ")":
                                break
                            name = name + char
                        if not isname(name):
546
                            raise error, "bad character in group name"
547 548 549
                        gid = state.groupdict.get(name)
                        if gid is None:
                            raise error, "unknown group name"
550
                        subpattern.append((GROUPREF, gid))
551
                        continue
552 553 554 555 556 557 558 559 560 561 562 563 564 565
                    else:
                        char = source.get()
                        if char is None:
                            raise error, "unexpected end of pattern"
                        raise error, "unknown specifier: ?P%s" % char
                elif source.match(":"):
                    # non-capturing group
                    group = 2
                elif source.match("#"):
                    # comment
                    while 1:
                        if source.next is None or source.next == ")":
                            break
                        source.get()
566 567 568
                    if not source.match(")"):
                        raise error, "unbalanced parenthesis"
                    continue
569
                elif source.next in ("=", "!", "<"):
570 571
                    # lookahead assertions
                    char = source.get()
572 573 574 575 576 577
                    dir = 1
                    if char == "<":
                        if source.next not in ("=", "!"):
                            raise error, "syntax error"
                        dir = -1 # lookbehind
                        char = source.get()
578
                    p = _parse_sub(source, state)
579 580
                    if not source.match(")"):
                        raise error, "unbalanced parenthesis"
581 582 583 584 585
                    if char == "=":
                        subpattern.append((ASSERT, (dir, p)))
                    else:
                        subpattern.append((ASSERT_NOT, (dir, p)))
                    continue
586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605
                elif source.match("("):
                    # conditional backreference group
                    condname = ""
                    while 1:
                        char = source.get()
                        if char is None:
                            raise error, "unterminated name"
                        if char == ")":
                            break
                        condname = condname + char
                    group = 2
                    if isname(condname):
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            raise error, "unknown group name"
                    else:
                        try:
                            condgroup = atoi(condname)
                        except ValueError:
                            raise error, "bad character in group name"
606 607
                else:
                    # flags
608
                    if not source.next in FLAGS:
609
                        raise error, "unexpected end of pattern"
610
                    while source.next in FLAGS:
611 612 613 614 615 616 617
                        state.flags = state.flags | FLAGS[source.get()]
            if group:
                # parse group contents
                if group == 2:
                    # anonymous group
                    group = None
                else:
618
                    group = state.opengroup(name)
619 620 621 622
                if condgroup:
                    p = _parse_sub_cond(source, state, condgroup)
                else:
                    p = _parse_sub(source, state)
623 624
                if not source.match(")"):
                    raise error, "unbalanced parenthesis"
625 626
                if group is not None:
                    state.closegroup(group)
627
                subpattern.append((SUBPATTERN, (group, p)))
628 629 630
            else:
                while 1:
                    char = source.get()
631 632 633
                    if char is None:
                        raise error, "unexpected end of pattern"
                    if char == ")":
634 635 636 637 638 639 640 641 642 643 644 645 646 647 648
                        break
                    raise error, "unknown extension"

        elif this == "^":
            subpattern.append((AT, AT_BEGINNING))

        elif this == "$":
            subpattern.append((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state)
            subpattern.append(code)

        else:
            raise error, "parser error"
649 650 651

    return subpattern

652
def parse(str, flags=0, pattern=None):
653
    # parse 're' pattern into list of (opcode, argument) tuples
654 655 656

    source = Tokenizer(str)

657 658
    if pattern is None:
        pattern = Pattern()
659
    pattern.flags = flags
660
    pattern.str = str
661 662 663 664 665 666 667 668 669

    p = _parse_sub(source, pattern, 0)

    tail = source.get()
    if tail == ")":
        raise error, "unbalanced parenthesis"
    elif tail:
        raise error, "bogus characters at end of regular expression"

Fredrik Lundh's avatar
Fredrik Lundh committed
670 671
    if flags & SRE_FLAG_DEBUG:
        p.dump()
672

673 674 675 676 677
    if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
        # the VERBOSE flag was switched on inside the pattern.  to be
        # on the safe side, we'll parse the whole thing again...
        return parse(str, p.pattern.flags)

678 679
    return p

Fredrik Lundh's avatar
Fredrik Lundh committed
680
def parse_template(source, pattern):
681 682 683 684 685
    # parse 're' replacement string into list of literals and
    # group references
    s = Tokenizer(source)
    p = []
    a = p.append
Fredrik Lundh's avatar
Fredrik Lundh committed
686 687 688 689 690 691 692
    def literal(literal, p=p):
        if p and p[-1][0] is LITERAL:
            p[-1] = LITERAL, p[-1][1] + literal
        else:
            p.append((LITERAL, literal))
    sep = source[:0]
    if type(sep) is type(""):
693
        makechar = chr
Fredrik Lundh's avatar
Fredrik Lundh committed
694
    else:
695
        makechar = unichr
696
    while 1:
697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714
        this = s.get()
        if this is None:
            break # end of replacement string
        if this and this[0] == "\\":
            # group
            if this == "\\g":
                name = ""
                if s.match("<"):
                    while 1:
                        char = s.get()
                        if char is None:
                            raise error, "unterminated group name"
                        if char == ">":
                            break
                        name = name + char
                if not name:
                    raise error, "bad group name"
                try:
715
                    index = atoi(name)
716 717
                except ValueError:
                    if not isname(name):
718
                        raise error, "bad character in group name"
719 720 721 722 723 724 725 726 727 728
                    try:
                        index = pattern.groupindex[name]
                    except KeyError:
                        raise IndexError, "unknown group name"
                a((MARK, index))
            elif len(this) > 1 and this[1] in DIGITS:
                code = None
                while 1:
                    group = _group(this, pattern.groups+1)
                    if group:
729
                        if (s.next not in DIGITS or
730
                            not _group(this + s.next, pattern.groups+1)):
731
                            code = MARK, group
732 733 734 735 736 737 738
                            break
                    elif s.next in OCTDIGITS:
                        this = this + s.get()
                    else:
                        break
                if not code:
                    this = this[1:]
739
                    code = LITERAL, makechar(atoi(this[-6:], 8) & 0xff)
Fredrik Lundh's avatar
Fredrik Lundh committed
740 741 742 743
                if code[0] is LITERAL:
                    literal(code[1])
                else:
                    a(code)
744 745
            else:
                try:
746
                    this = makechar(ESCAPES[this][1])
747
                except KeyError:
Fredrik Lundh's avatar
Fredrik Lundh committed
748 749
                    pass
                literal(this)
750
        else:
Fredrik Lundh's avatar
Fredrik Lundh committed
751 752 753 754 755 756 757 758 759 760 761 762 763
            literal(this)
    # convert template to groups and literals lists
    i = 0
    groups = []
    literals = []
    for c, s in p:
        if c is MARK:
            groups.append((i, s))
            literals.append(None)
        else:
            literals.append(s)
        i = i + 1
    return groups, literals
764

Fredrik Lundh's avatar
Fredrik Lundh committed
765
def expand_template(template, match):
Fredrik Lundh's avatar
Fredrik Lundh committed
766
    g = match.group
767
    sep = match.string[:0]
Fredrik Lundh's avatar
Fredrik Lundh committed
768 769 770 771 772
    groups, literals = template
    literals = literals[:]
    try:
        for index, group in groups:
            literals[index] = s = g(group)
773
            if s is None:
Fredrik Lundh's avatar
Fredrik Lundh committed
774 775 776 777
                raise IndexError
    except IndexError:
        raise error, "empty group"
    return string.join(literals, sep)