sre_compile.py 13.9 KB
Newer Older
1 2 3 4 5
#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
Fredrik Lundh's avatar
Fredrik Lundh committed
6
# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
7
#
8
# See the sre.py file for information on usage and redistribution.
9 10
#

11 12
"""Internal support module for sre"""

13
import _sre, sys
14 15 16

from sre_constants import *

17 18
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

19 20
MAXCODE = 65535

Fredrik Lundh's avatar
Fredrik Lundh committed
21
def _compile(code, pattern, flags):
22
    # internal: compile a (sub)pattern
Fredrik Lundh's avatar
Fredrik Lundh committed
23
    emit = code.append
24
    for op, av in pattern:
25
        if op in (LITERAL, NOT_LITERAL):
26 27
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
28
                emit(_sre.getlower(av, flags))
29 30
            else:
                emit(OPCODES[op])
31
                emit(av)
32 33 34 35
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
36
                    return _sre.getlower(literal, flags)
37 38
            else:
                emit(OPCODES[op])
39
                fixup = lambda x: x
40
            skip = len(code); emit(0)
41
            _compile_charset(av, flags, code, fixup)
42
            code[skip] = len(code) - skip
43 44
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
Fredrik Lundh's avatar
Fredrik Lundh committed
45
                emit(OPCODES[ANY_ALL])
46
            else:
Fredrik Lundh's avatar
Fredrik Lundh committed
47
                emit(OPCODES[ANY])
48 49
        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
            if flags & SRE_FLAG_TEMPLATE:
50
                raise error, "internal: unsupported template operator"
51 52 53 54 55 56 57
                emit(OPCODES[REPEAT])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
Fredrik Lundh's avatar
Fredrik Lundh committed
58 59 60 61 62 63 64 65
            elif _simple(av) and op == MAX_REPEAT:
                emit(OPCODES[REPEAT_ONE])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
66
            else:
Fredrik Lundh's avatar
Fredrik Lundh committed
67 68 69 70 71 72 73 74
                emit(OPCODES[REPEAT])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = len(code) - skip
                if op == MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
75
                else:
Fredrik Lundh's avatar
Fredrik Lundh committed
76
                    emit(OPCODES[MIN_UNTIL])
77
        elif op is SUBPATTERN:
78
            if av[0]:
79
                emit(OPCODES[MARK])
80
                emit((av[0]-1)*2)
81
            # _compile_info(code, av[1], flags)
82
            _compile(code, av[1], flags)
83
            if av[0]:
84
                emit(OPCODES[MARK])
85
                emit((av[0]-1)*2+1)
86 87
        elif op in (SUCCESS, FAILURE):
            emit(OPCODES[op])
88 89 90 91 92 93 94 95 96 97 98 99 100 101
        elif op in (ASSERT, ASSERT_NOT):
            emit(OPCODES[op])
            skip = len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is CALL:
102 103 104 105 106 107 108 109
            emit(OPCODES[op])
            skip = len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
Fredrik Lundh's avatar
Fredrik Lundh committed
110 111 112 113 114 115
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
116
        elif op is BRANCH:
117
            emit(OPCODES[op])
118 119 120
            tail = []
            for av in av[1]:
                skip = len(code); emit(0)
121
                # _compile_info(code, av, flags)
122 123 124 125 126 127 128 129 130 131
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tail.append(len(code)); emit(0)
                code[skip] = len(code) - skip
            emit(0) # end of branch
            for tail in tail:
                code[tail] = len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
Fredrik Lundh's avatar
Fredrik Lundh committed
132
                av = CH_LOCALE[av]
133
            elif flags & SRE_FLAG_UNICODE:
Fredrik Lundh's avatar
Fredrik Lundh committed
134 135
                av = CH_UNICODE[av]
            emit(CHCODES[av])
136
        elif op is GROUPREF:
137 138 139 140 141
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
142 143
        else:
            raise ValueError, ("unsupported operand type", op)
144

145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram
    emit = code.append
    if not fixup:
        fixup = lambda x: x
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)
161 162
        elif op is BIGCHARSET:
            code.extend(av)
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error, "internal: unsupported set operator"
    emit(OPCODES[FAILURE])

def _optimize_charset(charset, fixup):
    # internal: optimize character set
    out = []
    charmap = [0]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                out.append((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
Fredrik Lundh's avatar
Fredrik Lundh committed
188
                # XXX: could append to charmap tail
189 190
                return charset # cannot compress
    except IndexError:
191 192 193
        if sys.maxunicode != 65535:
            # XXX: big charsets don't work in UCS-4 builds
            return charset
194
        # character set contains unicode characters
195
        return _optimize_unicode(charset, fixup)
196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
    # compress character map
    i = p = n = 0
    runs = []
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runs.append((p, n))
            n = 0
        i = i + 1
    if n:
        runs.append((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
221
        data = _mk_bitmap(charmap)
222 223 224 225
        out.append((CHARSET, data))
        return out
    return charset

226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283
def _mk_bitmap(bits):
    data = []
    m = 1; v = 0
    for c in bits:
        if c:
            v = v + m
        m = m << 1
        if m > MAXCODE:
            data.append(v)
            m = 1; v = 0
    return data

# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminitated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).

def _optimize_unicode(charset, fixup):
    charmap = [0]*65536
    negate = 0
    for op, av in charset:
        if op is NEGATE:
            negate = 1
        elif op is LITERAL:
            charmap[fixup(av)] = 1
        elif op is RANGE:
            for i in range(fixup(av[0]), fixup(av[1])+1):
                charmap[i] = 1
        elif op is CATEGORY:
            # XXX: could expand category
            return charset # cannot compress
    if negate:
        for i in range(65536):
            charmap[i] = not charmap[i]
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in range(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
284 285
            block = block + 1
            data = data + _mk_bitmap(chunk)
286 287 288
    header = [block]
    assert MAXCODE == 65535
    for i in range(128):
289 290 291 292
        if sys.byteorder == 'big':
            header.append(256*mapping[2*i]+mapping[2*i+1])
        else:
            header.append(mapping[2*i]+256*mapping[2*i+1])
293
    data[0:0] = header
294
    return [(BIGCHARSET, data)]
295

296 297 298
def _simple(av):
    # check if av is a "simple" operator
    lo, hi = av[2].getwidth()
299
    if lo == 0 and hi == MAXREPEAT:
300 301 302
        raise error, "nothing to repeat"
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN

303 304
def _compile_info(code, pattern, flags):
    # internal: compile an info block.  in the current version,
305 306
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
307 308
    lo, hi = pattern.getwidth()
    if lo == 0:
309
        return # not worth it
310 311
    # look for a literal prefix
    prefix = []
312
    prefix_skip = 0
313
    charset = [] # not used
314
    if not (flags & SRE_FLAG_IGNORECASE):
315
        # look for literal prefix
316 317
        for op, av in pattern.data:
            if op is LITERAL:
318 319
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
320
                prefix.append(av)
321 322 323 324 325 326
            elif op is SUBPATTERN and len(av[1]) == 1:
                op, av = av[1][0]
                if op is LITERAL:
                    prefix.append(av)
                else:
                    break
327 328
            else:
                break
329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charset.append((op, av))
                elif op is BRANCH:
                    c = []
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            c.append((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                c = []
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        c.append((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
366 367 368 369 370 371
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
372 373
    if prefix:
        mask = SRE_INFO_PREFIX
374
        if len(prefix) == prefix_skip == len(pattern.data):
375 376 377
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
378 379
    emit(mask)
    # pattern length
380 381 382 383 384 385
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
386
        emit(hi)
387
    else:
388
        emit(0)
389 390
    # add literal prefix
    if prefix:
391 392 393 394 395 396 397 398 399 400
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        table = [-1] + ([0]*len(prefix))
        for i in range(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
401
    elif charset:
402
        _compile_charset(charset, 0, code)
403 404
    code[skip] = len(code) - skip

405 406 407 408 409 410 411
STRING_TYPES = [type("")]

try:
    STRING_TYPES.append(type(unicode("")))
except NameError:
    pass

412
def _code(p, flags):
413

414
    flags = p.pattern.flags | flags
Fredrik Lundh's avatar
Fredrik Lundh committed
415
    code = []
416 417 418 419 420

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
421
    _compile(code, p.data, flags)
422

423
    code.append(OPCODES[SUCCESS])
424

425 426 427 428 429 430 431 432 433 434 435 436
    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format

    if type(p) in STRING_TYPES:
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

437
    code = _code(p, flags)
438

439 440
    # print code

Fredrik Lundh's avatar
Fredrik Lundh committed
441
    # XXX: <fl> get rid of this limitation!
Fredrik Lundh's avatar
Fredrik Lundh committed
442
    assert p.pattern.groups <= 100,\
443
           "sorry, but this version only supports 100 named groups"
444

445 446 447 448 449 450
    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

451
    return _sre.compile(
452 453 454
        pattern, flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
455
        )