Issue #19327: Fixed the handling of regular expressions with very large character sets.

be80fc9a · Serhiy Storchaka · b82a3dc2 · be80fc9a · be80fc9a · be80fc9a
Commit be80fc9a authored Oct 24, 2013 by Serhiy Storchaka
Hide whitespace changes
Inline Side-by-side

Showing 4 changed files with 8 additions and 3 deletions

sre_compile.py Lib/sre_compile.py +1 -1

test_re.py Lib/test/test_re.py +3 -0

NEWS Misc/NEWS +2 -0

_sre.c Modules/_sre.c +2 -2

No files found.
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -345,7 +345,7 @@ def _optimize_unicode(charset, fixup):
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes
-    mapping = array.array('b', mapping).tobytes()
+    mapping = array.array('B', mapping).tobytes()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE

--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -428,6 +428,9 @@ class ReTests(unittest.TestCase):
                                  "\u2222").group(1), "\u2222")
        self.assertEqual(re.match("([\u2222\u2223])",
                                  "\u2222", re.UNICODE).group(1), "\u2222")
+        r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
+        self.assertEqual(re.match(r,
+                                  "\uff01", re.UNICODE).group(), "\uff01")
    def test_big_codesize(self):
        # Issue #1160

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -81,6 +81,8 @@ Core and Builtins
 Library
 -------
+- Issue #19327: Fixed the working of regular expressions with too big charset.
 - Issue #19350: Increasing the test coverage of macurl2path. Patch by Colin
  Williams.

--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -451,7 +451,7 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
            count = *(set++);
            if (sizeof(SRE_CODE) == 2) {
-                block = ((char*)set)[ch >> 8];
+                block = ((unsigned char*)set)[ch >> 8];
                set += 128;
                if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
                    return ok;
@@ -461,7 +461,7 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
                /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
                 * warnings when c's type supports only numbers < N+1 */
                if (!(ch & ~65535))
-                    block = ((char*)set)[ch >> 8];
+                    block = ((unsigned char*)set)[ch >> 8];
                else
                    block = -1;
                set += 64;