Commit b2bf01d8, authored by Benjamin Peterson

use full unicode mappings for upper/lower/title case (#12736)

Also broaden the category of characters that count as lowercase/uppercase.
parent 9007f72d
...@@ -318,16 +318,25 @@ These APIs can be used for fast direct character conversions: ...@@ -318,16 +318,25 @@ These APIs can be used for fast direct character conversions:
Return the character *ch* converted to lower case. Return the character *ch* converted to lower case.
.. deprecated:: 3.3
This function uses simple case mappings.
.. c:function:: Py_UNICODE Py_UNICODE_TOUPPER(Py_UNICODE ch) .. c:function:: Py_UNICODE Py_UNICODE_TOUPPER(Py_UNICODE ch)
Return the character *ch* converted to upper case. Return the character *ch* converted to upper case.
.. deprecated:: 3.3
This function uses simple case mappings.
.. c:function:: Py_UNICODE Py_UNICODE_TOTITLE(Py_UNICODE ch) .. c:function:: Py_UNICODE Py_UNICODE_TOTITLE(Py_UNICODE ch)
Return the character *ch* converted to title case. Return the character *ch* converted to title case.
.. deprecated:: 3.3
This function uses simple case mappings.
.. c:function:: int Py_UNICODE_TODECIMAL(Py_UNICODE ch) .. c:function:: int Py_UNICODE_TODECIMAL(Py_UNICODE ch)
......
...@@ -1360,7 +1360,8 @@ functions based on regular expressions. ...@@ -1360,7 +1360,8 @@ functions based on regular expressions.
.. method:: str.swapcase() .. method:: str.swapcase()
Return a copy of the string with uppercase characters converted to lowercase and Return a copy of the string with uppercase characters converted to lowercase and
vice versa. vice versa. Note that it is not necessarily true that
``s.swapcase().swapcase() == s``.
.. method:: str.title() .. method:: str.title()
......
...@@ -2008,6 +2008,29 @@ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( ...@@ -2008,6 +2008,29 @@ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
Py_UCS4 ch /* Unicode character */ Py_UCS4 ch /* Unicode character */
); );
PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
Py_UCS4 ch, /* Unicode character */
Py_UCS4 *res
);
PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
Py_UCS4 ch, /* Unicode character */
Py_UCS4 *res
);
PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
Py_UCS4 ch, /* Unicode character */
Py_UCS4 *res
);
PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
const Py_UCS4 ch /* Unicode character */
);
PyAPI_FUNC(int) _PyUnicode_IsCased(
const Py_UCS4 ch /* Unicode character */
);
PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
Py_UCS4 ch /* Unicode character */ Py_UCS4 ch /* Unicode character */
); );
......
...@@ -669,7 +669,7 @@ class CommonTest(BaseTest): ...@@ -669,7 +669,7 @@ class CommonTest(BaseTest):
# check that titlecased chars are lowered correctly # check that titlecased chars are lowered correctly
# \u1ffc is the titlecased char # \u1ffc is the titlecased char
self.checkequal('\u1ffc\u1ff3\u1ff3\u1ff3', self.checkequal('\u03a9\u0399\u1ff3\u1ff3\u1ff3',
'\u1ff3\u1ff3\u1ffc\u1ffc', 'capitalize') '\u1ff3\u1ff3\u1ffc\u1ffc', 'capitalize')
# check with cased non-letter chars # check with cased non-letter chars
self.checkequal('\u24c5\u24e8\u24e3\u24d7\u24de\u24dd', self.checkequal('\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
......
...@@ -369,6 +369,8 @@ class UnicodeTest(string_tests.CommonTest, ...@@ -369,6 +369,8 @@ class UnicodeTest(string_tests.CommonTest,
def test_islower(self): def test_islower(self):
string_tests.MixinStrUnicodeUserStringTest.test_islower(self) string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
self.checkequalnofix(False, '\u1FFc', 'islower') self.checkequalnofix(False, '\u1FFc', 'islower')
self.assertFalse('\u2167'.islower())
self.assertTrue('\u2177'.islower())
# non-BMP, uppercase # non-BMP, uppercase
self.assertFalse('\U00010401'.islower()) self.assertFalse('\U00010401'.islower())
self.assertFalse('\U00010427'.islower()) self.assertFalse('\U00010427'.islower())
...@@ -383,6 +385,8 @@ class UnicodeTest(string_tests.CommonTest, ...@@ -383,6 +385,8 @@ class UnicodeTest(string_tests.CommonTest,
string_tests.MixinStrUnicodeUserStringTest.test_isupper(self) string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
if not sys.platform.startswith('java'): if not sys.platform.startswith('java'):
self.checkequalnofix(False, '\u1FFc', 'isupper') self.checkequalnofix(False, '\u1FFc', 'isupper')
self.assertTrue('\u2167'.isupper())
self.assertFalse('\u2177'.isupper())
# non-BMP, uppercase # non-BMP, uppercase
self.assertTrue('\U00010401'.isupper()) self.assertTrue('\U00010401'.isupper())
self.assertTrue('\U00010427'.isupper()) self.assertTrue('\U00010427'.isupper())
...@@ -548,6 +552,18 @@ class UnicodeTest(string_tests.CommonTest, ...@@ -548,6 +552,18 @@ class UnicodeTest(string_tests.CommonTest,
'\U0001044F\U0001044F') '\U0001044F\U0001044F')
self.assertEqual('X\U00010427x\U0001044F'.lower(), self.assertEqual('X\U00010427x\U0001044F'.lower(),
'x\U0001044Fx\U0001044F') 'x\U0001044Fx\U0001044F')
self.assertEqual('\ufb01'.lower(), '\ufb01')
self.assertEqual('\u0130'.lower(), '\u0069\u0307')
# Special case for GREEK CAPITAL LETTER SIGMA U+03A3
self.assertEqual('\u03a3'.lower(), '\u03c3')
self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
self.assertEqual('\u2177'.lower(), '\u2177')
def test_upper(self): def test_upper(self):
string_tests.CommonTest.test_upper(self) string_tests.CommonTest.test_upper(self)
...@@ -558,6 +574,13 @@ class UnicodeTest(string_tests.CommonTest, ...@@ -558,6 +574,13 @@ class UnicodeTest(string_tests.CommonTest,
'\U00010427\U00010427') '\U00010427\U00010427')
self.assertEqual('X\U00010427x\U0001044F'.upper(), self.assertEqual('X\U00010427x\U0001044F'.upper(),
'X\U00010427X\U00010427') 'X\U00010427X\U00010427')
self.assertEqual('\ufb01'.upper(), 'FI')
self.assertEqual('\u0130'.upper(), '\u0130')
self.assertEqual('\u03a3'.upper(), '\u03a3')
self.assertEqual('ß'.upper(), 'SS')
self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
self.assertEqual('\u2177'.upper(), '\u2167')
def test_capitalize(self): def test_capitalize(self):
string_tests.CommonTest.test_capitalize(self) string_tests.CommonTest.test_capitalize(self)
...@@ -570,6 +593,11 @@ class UnicodeTest(string_tests.CommonTest, ...@@ -570,6 +593,11 @@ class UnicodeTest(string_tests.CommonTest,
'\U00010427\U0001044F') '\U00010427\U0001044F')
self.assertEqual('X\U00010427x\U0001044F'.capitalize(), self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
'X\U0001044Fx\U0001044F') 'X\U0001044Fx\U0001044F')
self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
exp = '\u0399\u0308\u0300\u0069\u0307'
self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
self.assertEqual('\ufb01nnish'.capitalize(), 'FInnish')
self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
def test_title(self): def test_title(self):
string_tests.MixinStrUnicodeUserStringTest.test_title(self) string_tests.MixinStrUnicodeUserStringTest.test_title(self)
...@@ -584,6 +612,9 @@ class UnicodeTest(string_tests.CommonTest, ...@@ -584,6 +612,9 @@ class UnicodeTest(string_tests.CommonTest,
'\U00010427\U0001044F \U00010427\U0001044F') '\U00010427\U0001044F \U00010427\U0001044F')
self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(), self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F') 'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
self.assertEqual('\ufb01NNISH'.title(), 'Finnish')
self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
def test_swapcase(self): def test_swapcase(self):
string_tests.CommonTest.test_swapcase(self) string_tests.CommonTest.test_swapcase(self)
...@@ -597,6 +628,19 @@ class UnicodeTest(string_tests.CommonTest, ...@@ -597,6 +628,19 @@ class UnicodeTest(string_tests.CommonTest,
'\U00010427\U0001044F') '\U00010427\U0001044F')
self.assertEqual('X\U00010427x\U0001044F'.swapcase(), self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
'x\U0001044FX\U00010427') 'x\U0001044FX\U00010427')
self.assertEqual('\ufb01'.swapcase(), 'FI')
self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
# Special case for GREEK CAPITAL LETTER SIGMA U+03A3
self.assertEqual('\u03a3'.swapcase(), '\u03c3')
self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
self.assertEqual('\u03a3'.swapcase(), '\u03c3')
self.assertEqual('ß'.swapcase(), 'SS')
self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
def test_contains(self): def test_contains(self):
# Testing Unicode contains method # Testing Unicode contains method
......
...@@ -21,7 +21,7 @@ errors = 'surrogatepass' ...@@ -21,7 +21,7 @@ errors = 'surrogatepass'
class UnicodeMethodsTest(unittest.TestCase): class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes # update this, if the database changes
expectedchecksum = '21b90f1aed00081b81ca7942b22196af090015a0' expectedchecksum = 'df0b3ca6785a070b21f837b227dbdbdff3c2e921'
def test_method_checksum(self): def test_method_checksum(self):
h = hashlib.sha1() h = hashlib.sha1()
......
...@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1? ...@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #12736: Use full unicode case mappings for upper, lower, and title case.
- Issue #12760: Add a create mode to open(). Patch by David Townshend. - Issue #12760: Add a create mode to open(). Patch by David Townshend.
- Issue #13738: Simplify implementation of bytes.lower() and bytes.upper(). - Issue #13738: Simplify implementation of bytes.lower() and bytes.upper().
......
...@@ -21,8 +21,10 @@ ...@@ -21,8 +21,10 @@
#define XID_START_MASK 0x100 #define XID_START_MASK 0x100
#define XID_CONTINUE_MASK 0x200 #define XID_CONTINUE_MASK 0x200
#define PRINTABLE_MASK 0x400 #define PRINTABLE_MASK 0x400
#define NODELTA_MASK 0x800 #define NUMERIC_MASK 0x800
#define NUMERIC_MASK 0x1000 #define CASE_IGNORABLE_MASK 0x1000
#define CASED_MASK 0x2000
#define EXTENDED_CASE_MASK 0x4000
typedef struct { typedef struct {
const Py_UCS4 upper; const Py_UCS4 upper;
...@@ -57,15 +59,8 @@ gettyperecord(Py_UCS4 code) ...@@ -57,15 +59,8 @@ gettyperecord(Py_UCS4 code)
Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch) Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
{ {
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
int delta = ctype->title;
if (ctype->flags & NODELTA_MASK) return ctype->title ? ctype->title : ch;
return delta;
if (delta >= 32768)
delta -= 65536;
return ch + delta;
} }
/* Returns 1 for Unicode characters having the category 'Lt', 0 /* Returns 1 for Unicode characters having the category 'Lt', 0
...@@ -188,12 +183,10 @@ int _PyUnicode_IsUppercase(Py_UCS4 ch) ...@@ -188,12 +183,10 @@ int _PyUnicode_IsUppercase(Py_UCS4 ch)
Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
{ {
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
int delta = ctype->upper;
if (ctype->flags & NODELTA_MASK) if (ctype->flags & EXTENDED_CASE_MASK)
return delta; return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFFFF];
if (delta >= 32768) return ctype->upper ? ctype->upper : ch;
delta -= 65536;
return ch + delta;
} }
/* Returns the lowercase Unicode characters corresponding to ch or just /* Returns the lowercase Unicode characters corresponding to ch or just
...@@ -202,12 +195,72 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) ...@@ -202,12 +195,72 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
{ {
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
int delta = ctype->lower;
if (ctype->flags & NODELTA_MASK) if (ctype->flags & EXTENDED_CASE_MASK)
return delta; return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFFFF];
if (delta >= 32768) return ctype->lower ? ctype->lower : ch;
delta -= 65536; }
return ch + delta;
/* Shared worker for _PyUnicode_ToLowerFull, _PyUnicode_ToTitleFull and
   _PyUnicode_ToUpperFull.

   `mapping` is the lower/title/upper field of ch's type record and
   `extended` is non-zero when that record has EXTENDED_CASE_MASK set.

   In the extended form the low 24 bits of `mapping` are an index into
   _PyUnicode_ExtendedCase and the bits above them hold the number of code
   points stored there.  Otherwise `mapping` is the single mapped code
   point, with 0 meaning that ch maps to itself.

   The mapped code points are written to res and their count is returned.
   No bound is checked here: res must be large enough for the whole mapping
   (NOTE(review): presumably at most 3 code points -- confirm against the
   callers' buffer size). */
static int
full_case_mapping(Py_UCS4 ch, Py_UCS4 mapping, int extended, Py_UCS4 *res)
{
    if (extended) {
        int index = mapping & 0xFFFFFF;
        int n = mapping >> 24;
        int i;
        for (i = 0; i < n; i++)
            res[i] = _PyUnicode_ExtendedCase[index + i];
        return n;
    }
    res[0] = mapping ? mapping : ch;
    return 1;
}

/* Stores the full lowercase mapping of ch in res and returns the number of
   code points written. */
int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return full_case_mapping(ch, ctype->lower,
                             ctype->flags & EXTENDED_CASE_MASK, res);
}

/* Stores the full titlecase mapping of ch in res and returns the number of
   code points written. */
int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return full_case_mapping(ch, ctype->title,
                             ctype->flags & EXTENDED_CASE_MASK, res);
}

/* Stores the full uppercase mapping of ch in res and returns the number of
   code points written. */
int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return full_case_mapping(ch, ctype->upper,
                             ctype->flags & EXTENDED_CASE_MASK, res);
}
/* Returns 1 if the type record for ch has CASED_MASK set, 0 otherwise. */
int _PyUnicode_IsCased(Py_UCS4 ch)
{
    if (gettyperecord(ch)->flags & CASED_MASK)
        return 1;
    return 0;
}
int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
return (ctype->flags & CASE_IGNORABLE_MASK) != 0;
} }
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
......
This diff is collapsed.
This diff is collapsed.
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta # 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch # 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences # 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01 benjamin add full case mappings
# #
# written by Fredrik Lundh (fredrik@pythonware.com) # written by Fredrik Lundh (fredrik@pythonware.com)
# #
...@@ -47,6 +48,7 @@ DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt" ...@@ -47,6 +48,7 @@ DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt" LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt" NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt" NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
# Private Use Areas -- in planes 1, 15, 16 # Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900) PUA_1 = range(0xE000, 0xF900)
...@@ -84,8 +86,10 @@ UPPER_MASK = 0x80 ...@@ -84,8 +86,10 @@ UPPER_MASK = 0x80
XID_START_MASK = 0x100 XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200 XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400 PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800 NUMERIC_MASK = 0x800
NUMERIC_MASK = 0x1000 CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000
# these ranges need to match unicodedata.c:is_unified_ideograph # these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [ cjk_ranges = [
...@@ -384,6 +388,7 @@ def makeunicodetype(unicode, trace): ...@@ -384,6 +388,7 @@ def makeunicodetype(unicode, trace):
numeric = {} numeric = {}
spaces = [] spaces = []
linebreaks = [] linebreaks = []
extra_casing = []
for char in unicode.chars: for char in unicode.chars:
record = unicode.table[char] record = unicode.table[char]
...@@ -396,7 +401,7 @@ def makeunicodetype(unicode, trace): ...@@ -396,7 +401,7 @@ def makeunicodetype(unicode, trace):
delta = True delta = True
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]: if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
flags |= ALPHA_MASK flags |= ALPHA_MASK
if category == "Ll": if "Lowercase" in properties:
flags |= LOWER_MASK flags |= LOWER_MASK
if 'Line_Break' in properties or bidirectional == "B": if 'Line_Break' in properties or bidirectional == "B":
flags |= LINEBREAK_MASK flags |= LINEBREAK_MASK
...@@ -406,7 +411,7 @@ def makeunicodetype(unicode, trace): ...@@ -406,7 +411,7 @@ def makeunicodetype(unicode, trace):
spaces.append(char) spaces.append(char)
if category == "Lt": if category == "Lt":
flags |= TITLE_MASK flags |= TITLE_MASK
if category == "Lu": if "Uppercase" in properties:
flags |= UPPER_MASK flags |= UPPER_MASK
if char == ord(" ") or category[0] not in ("C", "Z"): if char == ord(" ") or category[0] not in ("C", "Z"):
flags |= PRINTABLE_MASK flags |= PRINTABLE_MASK
...@@ -414,35 +419,41 @@ def makeunicodetype(unicode, trace): ...@@ -414,35 +419,41 @@ def makeunicodetype(unicode, trace):
flags |= XID_START_MASK flags |= XID_START_MASK
if "XID_Continue" in properties: if "XID_Continue" in properties:
flags |= XID_CONTINUE_MASK flags |= XID_CONTINUE_MASK
# use delta predictor for upper/lower/title if it fits if "Cased" in properties:
if record[12]: flags |= CASED_MASK
upper = int(record[12], 16) if "Case_Ignorable" in properties:
else: flags |= CASE_IGNORABLE_MASK
upper = char sc = unicode.special_casing.get(char)
if record[13]: if sc is None:
lower = int(record[13], 16) if record[12]:
else: upper = int(record[12], 16)
lower = char else:
if record[14]: upper = char
title = int(record[14], 16) if record[13]:
else: lower = int(record[13], 16)
# UCD.html says that a missing title char means that else:
# it defaults to the uppercase character, not to the lower = char
# character itself. Apparently, in the current UCD (5.x) if record[14]:
# this feature is never used title = int(record[14], 16)
title = upper else:
upper_d = upper - char title = upper
lower_d = lower - char if upper == lower == title:
title_d = title - char upper = lower = title = 0
if -32768 <= upper_d <= 32767 and \
-32768 <= lower_d <= 32767 and \
-32768 <= title_d <= 32767:
# use deltas
upper = upper_d & 0xffff
lower = lower_d & 0xffff
title = title_d & 0xffff
else: else:
flags |= NODELTA_MASK # This happens when some character maps to more than one
# character in uppercase, lowercase, or titlecase. The extra
# characters are stored in a different array.
flags |= EXTENDED_CASE_MASK
lower = len(extra_casing) | (len(sc[0]) << 24)
extra_casing.extend(sc[0])
upper = len(extra_casing) | (len(sc[2]) << 24)
extra_casing.extend(sc[2])
# Title is probably equal to upper.
if sc[1] == sc[2]:
title = upper
else:
title = len(extra_casing) | (len(sc[1]) << 24)
extra_casing.extend(sc[1])
# decimal digit, integer digit # decimal digit, integer digit
decimal = 0 decimal = 0
if record[6]: if record[6]:
...@@ -469,6 +480,7 @@ def makeunicodetype(unicode, trace): ...@@ -469,6 +480,7 @@ def makeunicodetype(unicode, trace):
print(sum(map(len, numeric.values())), "numeric code points") print(sum(map(len, numeric.values())), "numeric code points")
print(len(spaces), "whitespace code points") print(len(spaces), "whitespace code points")
print(len(linebreaks), "linebreak code points") print(len(linebreaks), "linebreak code points")
print(len(extra_casing), "extended case array")
print("--- Writing", FILE, "...") print("--- Writing", FILE, "...")
...@@ -482,6 +494,14 @@ def makeunicodetype(unicode, trace): ...@@ -482,6 +494,14 @@ def makeunicodetype(unicode, trace):
print("};", file=fp) print("};", file=fp)
print(file=fp) print(file=fp)
print("/* extended case mappings */", file=fp)
print(file=fp)
print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
for c in extra_casing:
print(" %d," % c, file=fp)
print("};", file=fp)
print(file=fp)
# split decomposition index table # split decomposition index table
index1, index2, shift = splitbins(index, trace) index1, index2, shift = splitbins(index, trace)
...@@ -1070,6 +1090,23 @@ class UnicodeData: ...@@ -1070,6 +1090,23 @@ class UnicodeData:
# Patch the numeric field # Patch the numeric field
if table[i] is not None: if table[i] is not None:
table[i][8] = value table[i][8] = value
sc = self.special_casing = {}
with open_data(SPECIAL_CASING, version) as file:
for s in file:
s = s[:-1].split('#', 1)[0]
if not s:
continue
data = s.split("; ")
if data[4]:
# We ignore all conditionals (since they depend on
# languages) except for one, which is hardcoded. See
# handle_capital_sigma in unicodeobject.c.
continue
c = int(data[0], 16)
lower = [int(char, 16) for char in data[1].split()]
title = [int(char, 16) for char in data[2].split()]
upper = [int(char, 16) for char in data[3].split()]
sc[c] = (lower, title, upper)
def uselatin1(self): def uselatin1(self):
# restrict character range to ISO Latin 1 # restrict character range to ISO Latin 1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment