#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre
import sre_parse
from sre_constants import *

assert _sre.MAGIC == MAGIC, "SRE module mismatch"

_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}

# Sets of lowercase characters which have the same uppercase.
_equivalences = (
    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131), # iı
    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
    (0x73, 0x17f), # sſ
    # MICRO SIGN, GREEK SMALL LETTER MU
    (0xb5, 0x3bc), # µμ
    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
    (0x390, 0x1fd3), # ΐΐ
    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
    (0x3b0, 0x1fe3), # ΰΰ
    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
    (0x3b2, 0x3d0), # βϐ
    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5), # εϵ
    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
    (0x3b8, 0x3d1), # θϑ
    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0), # κϰ
    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
    (0x3c0, 0x3d6), # πϖ
    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
    (0x3c1, 0x3f1), # ρϱ
    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3), # ςσ
    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
    (0x3c6, 0x3d5), # φϕ
    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b), # ṡẛ
    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
    (0xfb05, 0xfb06), # ſtst
)

# Maps the lowercase code to lowercase codes which have the same uppercase.
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
                     for t in _equivalences for i in t}
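# For example, this maps 0x73 ("s") to (0x17f,) and 0x17f (LATIN SMALL
# LETTER LONG S) back to (0x73,), so that under full Unicode
# case-insensitive matching either character matches the other.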

def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    if (flags & SRE_FLAG_IGNORECASE and
            not (flags & SRE_FLAG_LOCALE) and
            flags & SRE_FLAG_UNICODE and
            not (flags & SRE_FLAG_ASCII)):
        fixes = _ignorecase_fixes
    else:
        fixes = None
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                lo = _sre.getlower(av, flags)
                if fixes and lo in fixes:
                    emit(IN_IGNORE)
                    skip = _len(code); emit(0)
                    if op is NOT_LITERAL:
                        emit(NEGATE)
                    for k in (lo,) + fixes[lo]:
                        emit(LITERAL)
                        emit(k)
                    emit(FAILURE)
                    code[skip] = _len(code) - skip
                else:
                    emit(OP_IGNORE[op])
                    emit(lo)
            else:
                emit(op)
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OP_IGNORE[op])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(op)
                fixup = None
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup, fixes)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(ANY_ALL)
            else:
                emit(ANY)
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator %r" % (op,))
            elif _simple(av) and op is not REPEAT:
                if op is MAX_REPEAT:
                    emit(REPEAT_ONE)
                else:
                    emit(MIN_REPEAT_ONE)
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(SUCCESS)
                code[skip] = _len(code) - skip
            else:
                emit(REPEAT)
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(MAX_UNTIL)
                else:
                    emit(MIN_UNTIL)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if group:
                emit(MARK)
                emit((group-1)*2)
            # _compile_info(code, p, (flags | add_flags) & ~del_flags)
            _compile(code, p, (flags | add_flags) & ~del_flags)
            if group:
                emit(MARK)
                emit((group-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(op)
        elif op in ASSERT_CODES:
            emit(op)
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(op)
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(op)
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = AT_UNICODE.get(av, av)
            emit(av)
        elif op is BRANCH:
            emit(op)
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(JUMP)
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(FAILURE) # end of branch
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(op)
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = CH_UNICODE[av]
            emit(av)
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OP_IGNORE[op])
            else:
                emit(op)
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(op)
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(JUMP)
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise error("internal: unsupported operand type %r" % (op,))

def _compile_charset(charset, flags, code, fixup=None, fixes=None):
    # compile charset subprogram
    emit = code.append
    for op, av in _optimize_charset(charset, fixup, fixes):
        emit(op)
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(av)
        elif op is RANGE or op is RANGE_IGNORE:
            emit(av[0])
            emit(av[1])
        elif op is CHARSET:
            code.extend(av)
        elif op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CH_LOCALE[av])
            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                emit(CH_UNICODE[av])
            else:
                emit(av)
        else:
            raise error("internal: unsupported set operator %r" % (op,))
    emit(FAILURE)

def _optimize_charset(charset, fixup, fixes):
    # internal: optimize character set
    out = []
    tail = []
    charmap = bytearray(256)
    for op, av in charset:
        while True:
            try:
                if op is LITERAL:
                    if fixup:
                        lo = fixup(av)
                        charmap[lo] = 1
                        if fixes and lo in fixes:
                            for k in fixes[lo]:
                                charmap[k] = 1
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
                        r = map(fixup, r)
                    if fixup and fixes:
                        for i in r:
                            charmap[i] = 1
                            if i in fixes:
                                for k in fixes[i]:
                                    charmap[k] = 1
                    else:
                        for i in r:
                            charmap[i] = 1
                elif op is NEGATE:
                    out.append((op, av))
                else:
                    tail.append((op, av))
            except IndexError:
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes
                    charmap += b'\0' * 0xff00
                    continue
                # Character set contains non-BMP character codes.
                # There are only two ranges of cased non-BMP characters:
                # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
                # and for both ranges RANGE_IGNORE works.
                if fixup and op is RANGE:
                    op = RANGE_IGNORE
                tail.append((op, av))
            break

    # compress character map
    runs = []
    q = 0
    while True:
        p = charmap.find(1, q)
        if p < 0:
            break
        if len(runs) >= 2:
            runs = None
            break
        q = charmap.find(0, p)
        if q < 0:
            runs.append((p, len(charmap)))
            break
        runs.append((p, q))
    if runs is not None:
        # use literal/range
        for p, q in runs:
            if q - p == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
        if fixup or len(out) < len(charset):
            return out
        # else original character set is good enough
        return charset

    # use bitmap
    if len(charmap) == 256:
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
        return out

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number. In the compiled expression, the charset is
    # represented by a 32-bit word sequence, consisting of one word for
    # the number of different chunks, a sequence of 256 bytes (64 words)
    # of chunk numbers indexed by their original chunk position, and a
    # sequence of 256-bit chunks (8 words each).
    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match). These ranges will be represented by
    # chunks of all one-bits or all zero-bits.
    # Matching can be also done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).
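
    # In terms of code words (assuming 4-byte codes, i.e. _sre.CODESIZE == 4):
    # one word holds the number of distinct chunks, the 256-byte mapping
    # occupies 256/4 == 64 words, and each 256-bit chunk occupies
    # 256/32 == 8 words, so a big charset with N distinct chunks costs
    # 1 + 64 + 8*N words in the compiled code.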

    charmap = bytes(charmap) # should be hashable
    comps = {}
    mapping = bytearray(256)
    block = 0
    data = bytearray()
    for i in range(0, 65536, 256):
        chunk = charmap[i: i + 256]
        if chunk in comps:
            mapping[i // 256] = comps[chunk]
        else:
            mapping[i // 256] = comps[chunk] = block
            block += 1
            data += chunk
    data = _mk_bitmap(data)
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out

_CODEBITS = _sre.CODESIZE * 8
MAXCODE = (1 << _CODEBITS) - 1
_BITS_TRANS = b'0' + b'1' * 255
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
    s = bits.translate(_BITS_TRANS)[::-1]
    return [_int(s[i - _CODEBITS: i], 2)
            for i in range(len(s), 0, -_CODEBITS)]
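
# A small worked example (assuming 4-byte code words, so _CODEBITS == 32):
# for a 256-byte charmap in which only characters 0 and 1 are set,
# _mk_bitmap returns [3, 0, 0, 0, 0, 0, 0, 0] -- bit k of the bitmap
# corresponds to character code k, packed little-endian into code words.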

def _bytes_to_codes(b):
    # Convert block indices to word array
    a = memoryview(b).cast('I')
    assert a.itemsize == _sre.CODESIZE
    assert len(a) * a.itemsize == len(b)
    return a.tolist()
def _simple(av):
    # check if av is a "simple" operator
    lo, hi = av[2].getwidth()
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
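
# For example, the repeat target in "a*" is "simple" (a single LITERAL of
# width 1), so _compile emits REPEAT_ONE / MIN_REPEAT_ONE for it, while the
# target in "(a)*" is not, because it is a SUBPATTERN and needs the generic
# REPEAT machinery.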

def _generate_overlap_table(prefix):
    """
    Generate an overlap table for the following prefix.
    An overlap table is a table of the same size as the prefix which
    informs about the potential self-overlap for each index in the prefix:
    - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
      prefix[0:k]
    """
    table = [0] * len(prefix)
    for i in range(1, len(prefix)):
        idx = table[i - 1]
        while prefix[i] != prefix[idx]:
            if idx == 0:
                table[i] = 0
                break
            idx = table[idx - 1]
        else:
            table[i] = idx + 1
    return table

def _get_literal_prefix(pattern):
    # look for literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = None
    for op, av in pattern.data:
        if op is LITERAL:
            prefixappend(av)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if add_flags & SRE_FLAG_IGNORECASE:
                break
            prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
            if prefix_skip is None:
                if group is not None:
                    prefix_skip = len(prefix)
                elif prefix_skip1 is not None:
                    prefix_skip = len(prefix) + prefix_skip1
            prefix.extend(prefix1)
            if not got_all:
                break
        else:
            break
    else:
        return prefix, prefix_skip, True
    return prefix, prefix_skip, False

def _get_charset_prefix(pattern):
    charset = [] # not used
    charsetappend = charset.append
    if pattern.data:
        op, av = pattern.data[0]
        if op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if p and not (add_flags & SRE_FLAG_IGNORECASE):
                op, av = p[0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
        elif op is BRANCH:
            c = []
            cappend = c.append
            for p in av[1]:
                if not p:
                    break
                op, av = p[0]
                if op is LITERAL:
                    cappend((op, av))
                else:
                    break
            else:
                charset = c
        elif op is IN:
            charset = av
    return charset

def _compile_info(code, pattern, flags):
    # internal: compile an info block.  in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
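    # For example, for the parsed pattern "hello" the prefix computed below
    # is the five character codes of "hello" and the mask gets both
    # SRE_INFO_PREFIX and SRE_INFO_LITERAL; for "[ab]cd" there is no literal
    # prefix, so the leading [ab] character set is emitted instead and the
    # mask gets SRE_INFO_CHARSET.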
    lo, hi = pattern.getwidth()
    if hi > MAXCODE:
        hi = MAXCODE
    if lo == 0:
        code.extend([INFO, 4, 0, lo, hi])
        return
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
    charset = [] # not used
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
        # if no prefix, look for charset prefix
        if not prefix:
            charset = _get_charset_prefix(pattern)
##     if prefix:
##         print("*** PREFIX", prefix, prefix_skip)
##     if charset:
##         print("*** CHARSET", charset)
    # add an info block
    emit = code.append
    emit(INFO)
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if prefix_skip is None and got_all:
            mask = mask | SRE_INFO_LITERAL
    elif charset:
        mask = mask | SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    emit(min(hi, MAXCODE))
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        if prefix_skip is None:
            prefix_skip = len(prefix)
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        code.extend(_generate_overlap_table(prefix))
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip

def isstring(obj):
    return isinstance(obj, (str, bytes))

def _code(p, flags):

    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(SUCCESS)

    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format

    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # print(code)

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )
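
# A minimal usage sketch (this module is normally driven by re.compile,
# which forwards the pattern string and flags here):
#
#     import sre_compile
#     p = sre_compile.compile("a|b")
#     p.match("b")    # match object, same as re.compile("a|b").match("b")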