use full unicode mappings for upper/lower/title case (#12736)

Also broaden the category of characters that count as lowercase/uppercase.

use full unicode mappings for upper/lower/title case (#12736)
Also broaden the category of characters that count as lowercase/uppercase.
b2bf01d8 · Benjamin Peterson · 9007f72d · b2bf01d8 · b2bf01d8 · b2bf01d8
Commit b2bf01d8, authored Jan 11, 2012 by Benjamin Peterson
11 changed files
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -318,16 +318,25 @@ These APIs can be used for fast direct character conversions:
   Return the character *ch* converted to lower case.
+   .. deprecated:: 3.3
+      This function uses simple case mappings.
 .. c:function:: Py_UNICODE Py_UNICODE_TOUPPER(Py_UNICODE ch)
   Return the character *ch* converted to upper case.
+   .. deprecated:: 3.3
+      This function uses simple case mappings.
 .. c:function:: Py_UNICODE Py_UNICODE_TOTITLE(Py_UNICODE ch)
   Return the character *ch* converted to title case.
+   .. deprecated:: 3.3
+      This function uses simple case mappings.
 .. c:function:: int Py_UNICODE_TODECIMAL(Py_UNICODE ch)

--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@@ -1360,7 +1360,8 @@ functions based on regular expressions.
 .. method:: str.swapcase()
   Return a copy of the string with uppercase characters converted to lowercase and
-   vice versa.
+   vice versa. Note that it is not necessarily true that
+   ``s.swapcase().swapcase() == s``.
 .. method:: str.title()

--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -2008,6 +2008,29 @@ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
    Py_UCS4 ch       /* Unicode character */
    );
+PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
+    Py_UCS4 ch,       /* Unicode character */
+    Py_UCS4 *res
+    );
+PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
+    Py_UCS4 ch,       /* Unicode character */
+    Py_UCS4 *res
+    );
+PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
+    Py_UCS4 ch,       /* Unicode character */
+    Py_UCS4 *res
+    );
+PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
+    const Py_UCS4 ch         /* Unicode character */
+    );
+PyAPI_FUNC(int) _PyUnicode_IsCased(
+    const Py_UCS4 ch         /* Unicode character */
+    );
 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
    Py_UCS4 ch       /* Unicode character */
    );

--- a/Lib/test/string_tests.py
+++ b/Lib/test/string_tests.py
@@ -669,7 +669,7 @@ class CommonTest(BaseTest):
        # check that titlecased chars are lowered correctly
        # \u1ffc is the titlecased char
-        self.checkequal('\u1ffc\u1ff3\u1ff3\u1ff3',
+        self.checkequal('\u03a9\u0399\u1ff3\u1ff3\u1ff3',
                        '\u1ff3\u1ff3\u1ffc\u1ffc', 'capitalize')
        # check with cased non-letter chars
        self.checkequal('\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',

--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -369,6 +369,8 @@ class UnicodeTest(string_tests.CommonTest,
    def test_islower(self):
        string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
        self.checkequalnofix(False, '\u1FFc', 'islower')
+        self.assertFalse('\u2167'.islower())
+        self.assertTrue('\u2177'.islower())
        # non-BMP, uppercase
        self.assertFalse('\U00010401'.islower())
        self.assertFalse('\U00010427'.islower())
@@ -383,6 +385,8 @@ class UnicodeTest(string_tests.CommonTest,
        string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
        if not sys.platform.startswith('java'):
            self.checkequalnofix(False, '\u1FFc', 'isupper')
+        self.assertTrue('\u2167'.isupper())
+        self.assertFalse('\u2177'.isupper())
        # non-BMP, uppercase
        self.assertTrue('\U00010401'.isupper())
        self.assertTrue('\U00010427'.isupper())
@@ -548,6 +552,18 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U0001044F\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F'.lower(),
                         'x\U0001044Fx\U0001044F')
+        self.assertEqual('ﬁ'.lower(), 'ﬁ')
+        self.assertEqual('\u0130'.lower(), '\u0069\u0307')
+        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
+        self.assertEqual('\u03a3'.lower(), '\u03c3')
+        self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
+        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
+        self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
+        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
+        self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
+        self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
+        self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
+        self.assertEqual('\u2177'.lower(), '\u2177')
    def test_upper(self):
        string_tests.CommonTest.test_upper(self)
@@ -558,6 +574,13 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U00010427\U00010427')
        self.assertEqual('X\U00010427x\U0001044F'.upper(),
                         'X\U00010427X\U00010427')
+        self.assertEqual('ﬁ'.upper(), 'FI')
+        self.assertEqual('\u0130'.upper(), '\u0130')
+        self.assertEqual('\u03a3'.upper(), '\u03a3')
+        self.assertEqual('ß'.upper(), 'SS')
+        self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
+        self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
+        self.assertEqual('\u2177'.upper(), '\u2167')
    def test_capitalize(self):
        string_tests.CommonTest.test_capitalize(self)
@@ -570,6 +593,11 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U00010427\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
                         'X\U0001044Fx\U0001044F')
+        self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
+        exp = '\u0399\u0308\u0300\u0069\u0307'
+        self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
+        self.assertEqual('ﬁnnish'.capitalize(), 'FInnish')
+        self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
    def test_title(self):
        string_tests.MixinStrUnicodeUserStringTest.test_title(self)
@@ -584,6 +612,9 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U00010427\U0001044F \U00010427\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
                         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
+        self.assertEqual('ﬁNNISH'.title(), 'Finnish')
+        self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
+        self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
    def test_swapcase(self):
        string_tests.CommonTest.test_swapcase(self)
@@ -597,6 +628,19 @@ class UnicodeTest(string_tests.CommonTest,
                         '\U00010427\U0001044F')
        self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
                         'x\U0001044FX\U00010427')
+        self.assertEqual('ﬁ'.swapcase(), 'FI')
+        self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
+        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
+        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
+        self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
+        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
+        self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
+        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
+        self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
+        self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
+        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
+        self.assertEqual('ß'.swapcase(), 'SS')
+        self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
    def test_contains(self):
        # Testing Unicode contains method

--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -21,7 +21,7 @@ errors = 'surrogatepass'
 class UnicodeMethodsTest(unittest.TestCase):
    # update this, if the database changes
-    expectedchecksum = '21b90f1aed00081b81ca7942b22196af090015a0'
+    expectedchecksum = 'df0b3ca6785a070b21f837b227dbdbdff3c2e921'
    def test_method_checksum(self):
        h = hashlib.sha1()

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
 Core and Builtins
 -----------------
+- Issue #12736: Use full unicode case mappings for upper, lower, and title case.
 - Issue #12760: Add a create mode to open(). Patch by David Townshend.
 - Issue #13738: Simplify implementation of bytes.lower() and bytes.upper().

--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@@ -21,8 +21,10 @@
 #define XID_START_MASK 0x100
 #define XID_CONTINUE_MASK 0x200
 #define PRINTABLE_MASK 0x400
-#define NODELTA_MASK 0x800
+#define NUMERIC_MASK 0x800
-#define NUMERIC_MASK 0x1000
+#define CASE_IGNORABLE_MASK 0x1000
+#define CASED_MASK 0x2000
+#define EXTENDED_CASE_MASK 0x4000
 typedef struct {
    const Py_UCS4 upper;
@@ -57,15 +59,8 @@ gettyperecord(Py_UCS4 code)
 Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
-    int delta = ctype->title;
-    if (ctype->flags & NODELTA_MASK)
-        return delta;
-    if (delta >= 32768)
-            delta -= 65536;
-    return ch + delta;
+    /* Return the titlecase code point for ch, or ch itself when the
+       record stores 0 (no mapping).  When EXTENDED_CASE_MASK is set,
+       ->title is not a code point: makeunicodedata.py packs it as
+       index | (length << 24) into _PyUnicode_ExtendedCase.  In that case
+       return the first code point of the stored sequence, exactly as
+       _PyUnicode_ToUppercase and _PyUnicode_ToLowercase do, instead of
+       leaking the packed word to the caller. */
+    if (ctype->flags & EXTENDED_CASE_MASK)
+        return _PyUnicode_ExtendedCase[ctype->title & 0xFFFFFF];
+    return ctype->title ? ctype->title : ch;
 }
 /* Returns 1 for Unicode characters having the category 'Lt', 0
@@ -188,12 +183,10 @@ int _PyUnicode_IsUppercase(Py_UCS4 ch)
 Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
-    int delta = ctype->upper;
-    if (ctype->flags & NODELTA_MASK)
+    if (ctype->flags & EXTENDED_CASE_MASK)
-        return delta;
+        return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFFFF];
-    if (delta >= 32768)
+    return ctype->upper ? ctype->upper : ch;
-            delta -= 65536;
-    return ch + delta;
 }
 /* Returns the lowercase Unicode characters corresponding to ch or just
@@ -202,12 +195,72 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
 Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
-    int delta = ctype->lower;
-    if (ctype->flags & NODELTA_MASK)
+    if (ctype->flags & EXTENDED_CASE_MASK)
-        return delta;
+        return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFFFF];
-    if (delta >= 32768)
+    return ctype->lower ? ctype->lower : ch;
-            delta -= 65536;
+}
-    return ch + delta;
+/* Shared worker for _PyUnicode_ToLowerFull, _PyUnicode_ToTitleFull and
+   _PyUnicode_ToUpperFull.
+
+   ctype  -- type record of ch
+   field  -- the record's lower, title or upper member
+   ch     -- the code point being converted
+   res    -- output buffer; receives the mapped code point(s)
+
+   When the record has EXTENDED_CASE_MASK set, field is a packed value:
+   the low 24 bits index _PyUnicode_ExtendedCase and the high bits hold
+   the number of code points in the mapping.  Otherwise field is the
+   mapped code point itself, with 0 meaning "maps to itself".
+
+   Returns the number of code points written to res.  The caller must
+   supply a large enough buffer; no bound is checked here. */
+static int
+full_case_mapping(const _PyUnicode_TypeRecord *ctype, Py_UCS4 field,
+                  Py_UCS4 ch, Py_UCS4 *res)
+{
+    if (ctype->flags & EXTENDED_CASE_MASK) {
+        int index = field & 0xFFFFFF;
+        int n = field >> 24;
+        int i;
+        for (i = 0; i < n; i++)
+            res[i] = _PyUnicode_ExtendedCase[index + i];
+        return n;
+    }
+    res[0] = field ? field : ch;
+    return 1;
+}
+/* Store the full lowercase mapping of ch in res; return its length. */
+int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+    return full_case_mapping(ctype, ctype->lower, ch, res);
+}
+/* Store the full titlecase mapping of ch in res; return its length. */
+int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+    return full_case_mapping(ctype, ctype->title, ch, res);
+}
+/* Store the full uppercase mapping of ch in res; return its length. */
+int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+    return full_case_mapping(ctype, ctype->upper, ch, res);
+}
+/* Return 1 if the type record of ch has CASED_MASK set, 0 otherwise. */
+int _PyUnicode_IsCased(Py_UCS4 ch)
+{
+    if (gettyperecord(ch)->flags & CASED_MASK)
+        return 1;
+    return 0;
+}
+/* Return 1 if the type record of ch has CASE_IGNORABLE_MASK set,
+   0 otherwise. */
+int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
+{
+    if (gettyperecord(ch)->flags & CASE_IGNORABLE_MASK)
+        return 1;
+    return 0;
 }
 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -22,6 +22,7 @@
 # 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
 # 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
 # 2011-10-21 ezio add support for name aliases and named sequences
+# 2012-01    benjamin add full case mappings
 #
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
@@ -47,6 +48,7 @@ DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 LINE_BREAK = "LineBreak%s.txt"
 NAME_ALIASES = "NameAliases%s.txt"
 NAMED_SEQUENCES = "NamedSequences%s.txt"
+SPECIAL_CASING = "SpecialCasing%s.txt"
 # Private Use Areas -- in planes 1, 15, 16
 PUA_1 = range(0xE000, 0xF900)
@@ -84,8 +86,10 @@ UPPER_MASK = 0x80
 XID_START_MASK = 0x100
 XID_CONTINUE_MASK = 0x200
 PRINTABLE_MASK = 0x400
-NODELTA_MASK = 0x800
+NUMERIC_MASK = 0x800
-NUMERIC_MASK = 0x1000
+CASE_IGNORABLE_MASK = 0x1000
+CASED_MASK = 0x2000
+EXTENDED_CASE_MASK = 0x4000
 # these ranges need to match unicodedata.c:is_unified_ideograph
 cjk_ranges = [
@@ -384,6 +388,7 @@ def makeunicodetype(unicode, trace):
    numeric = {}
    spaces = []
    linebreaks = []
+    extra_casing = []
    for char in unicode.chars:
        record = unicode.table[char]
@@ -396,7 +401,7 @@ def makeunicodetype(unicode, trace):
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
-            if category == "Ll":
+            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
@@ -406,7 +411,7 @@ def makeunicodetype(unicode, trace):
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
-            if category == "Lu":
+            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
@@ -414,35 +419,41 @@ def makeunicodetype(unicode, trace):
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
-            # use delta predictor for upper/lower/title if it fits
+            if "Cased" in properties:
-            if record[12]:
+                flags |= CASED_MASK
-                upper = int(record[12], 16)
+            if "Case_Ignorable" in properties:
-            else:
+                flags |= CASE_IGNORABLE_MASK
-                upper = char
+            sc = unicode.special_casing.get(char)
-            if record[13]:
+            if sc is None:
-                lower = int(record[13], 16)
+                if record[12]:
-            else:
+                    upper = int(record[12], 16)
-                lower = char
+                else:
-            if record[14]:
+                    upper = char
-                title = int(record[14], 16)
+                if record[13]:
-            else:
+                    lower = int(record[13], 16)
-                # UCD.html says that a missing title char means that
+                else:
-                # it defaults to the uppercase character, not to the
+                    lower = char
-                # character itself. Apparently, in the current UCD (5.x)
+                if record[14]:
-                # this feature is never used
+                    title = int(record[14], 16)
-                title = upper
+                else:
-            upper_d = upper - char
+                    title = upper
-            lower_d = lower - char
+                if upper == lower == title:
-            title_d = title - char
+                    upper = lower = title = 0
-            if -32768 <= upper_d <= 32767 and \
-               -32768 <= lower_d <= 32767 and \
-               -32768 <= title_d <= 32767:
-                # use deltas
-                upper = upper_d & 0xffff
-                lower = lower_d & 0xffff
-                title = title_d & 0xffff
            else:
-                flags |= NODELTA_MASK
+                # This happens when some character maps to more than one
+                # character in uppercase, lowercase, or titlecase. The extra
+                # characters are stored in a different array.
+                flags |= EXTENDED_CASE_MASK
+                lower = len(extra_casing) | (len(sc[0]) << 24)
+                extra_casing.extend(sc[0])
+                upper = len(extra_casing) | (len(sc[2]) << 24)
+                extra_casing.extend(sc[2])
+                # Title is probably equal to upper.
+                if sc[1] == sc[2]:
+                    title = upper
+                else:
+                    title = len(extra_casing) | (len(sc[1]) << 24)
+                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
@@ -469,6 +480,7 @@ def makeunicodetype(unicode, trace):
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
+    print(len(extra_casing), "extended case array")
    print("--- Writing", FILE, "...")
@@ -482,6 +494,14 @@ def makeunicodetype(unicode, trace):
    print("};", file=fp)
    print(file=fp)
+    print("/* extended case mappings */", file=fp)
+    print(file=fp)
+    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
+    for c in extra_casing:
+        print("    %d," % c, file=fp)
+    print("};", file=fp)
+    print(file=fp)
    # split decomposition index table
    index1, index2, shift = splitbins(index, trace)
@@ -1070,6 +1090,23 @@ class UnicodeData:
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value
+        sc = self.special_casing = {}
+        with open_data(SPECIAL_CASING, version) as file:
+            for s in file:
+                s = s[:-1].split('#', 1)[0]
+                if not s:
+                    continue
+                data = s.split("; ")
+                if data[4]:
+                    # We ignore all conditionals (since they depend on
+                    # languages) except for one, which is hardcoded. See
+                    # handle_capital_sigma in unicodeobject.c.
+                    continue
+                c = int(data[0], 16)
+                lower = [int(char, 16) for char in data[1].split()]
+                title = [int(char, 16) for char in data[2].split()]
+                upper = [int(char, 16) for char in data[3].split()]
+                sc[c] = (lower, title, upper)
    def uselatin1(self):
        # restrict character range to ISO Latin 1