Commit 5cbc71e5, authored by Martin v. Löwis

Issue #10459: Update CJK character names to Unicode 6.0.

Parent: 249d7e3c
...@@ -88,9 +88,13 @@ class UnicodeNamesTest(unittest.TestCase): ...@@ -88,9 +88,13 @@ class UnicodeNamesTest(unittest.TestCase):
self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400") self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5") self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00") self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", "\u9fa5") self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000") self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6") self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
def test_bmp_characters(self): def test_bmp_characters(self):
import unicodedata import unicodedata
......
...@@ -32,6 +32,8 @@ Core and Builtins ...@@ -32,6 +32,8 @@ Core and Builtins
Library Library
------- -------
- Issue #10459: Update CJK character names to Unicode 6.0.
- Issue #4493: urllib.request adds '/' in front of path components which does not - Issue #4493: urllib.request adds '/' in front of path components which does not
start with '/. Common behavior exhibited by browsers and other clients. start with '/. Common behavior exhibited by browsers and other clients.
......
...@@ -866,13 +866,16 @@ static char *hangul_syllables[][3] = { ...@@ -866,13 +866,16 @@ static char *hangul_syllables[][3] = {
{ 0, 0, "H" } { 0, 0, "H" }
}; };
/* These ranges need to match makeunicodedata.py:cjk_ranges. */
static int static int
is_unified_ideograph(Py_UCS4 code) is_unified_ideograph(Py_UCS4 code)
{ {
return ( return
(0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */ (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
(0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */ (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */
(0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */ (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
(0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
(0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
} }
static int static int
......
...@@ -70,6 +70,15 @@ PRINTABLE_MASK = 0x400 ...@@ -70,6 +70,15 @@ PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800 NODELTA_MASK = 0x800
NUMERIC_MASK = 0x1000 NUMERIC_MASK = 0x1000
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
('3400', '4DB5'),
('4E00', '9FCB'),
('20000', '2A6D6'),
('2A700', '2B734'),
('2B740', '2B81D')
]
def maketables(trace=0): def maketables(trace=0):
print("--- Reading", UNICODE_DATA % "", "...") print("--- Reading", UNICODE_DATA % "", "...")
...@@ -81,7 +90,7 @@ def maketables(trace=0): ...@@ -81,7 +90,7 @@ def maketables(trace=0):
for version in old_versions: for version in old_versions:
print("--- Reading", UNICODE_DATA % ("-"+version), "...") print("--- Reading", UNICODE_DATA % ("-"+version), "...")
old_unicode = UnicodeData(version) old_unicode = UnicodeData(version, cjk_check=False)
print(len(list(filter(None, old_unicode.table))), "characters") print(len(list(filter(None, old_unicode.table))), "characters")
merge_old_version(version, unicode, old_unicode) merge_old_version(version, unicode, old_unicode)
...@@ -804,7 +813,8 @@ class UnicodeData: ...@@ -804,7 +813,8 @@ class UnicodeData:
def __init__(self, version, def __init__(self, version,
linebreakprops=False, linebreakprops=False,
expand=1): expand=1,
cjk_check=True):
self.changed = [] self.changed = []
file = open_data(UNICODE_DATA, version) file = open_data(UNICODE_DATA, version)
table = [None] * 0x110000 table = [None] * 0x110000
...@@ -816,6 +826,8 @@ class UnicodeData: ...@@ -816,6 +826,8 @@ class UnicodeData:
char = int(s[0], 16) char = int(s[0], 16)
table[char] = s table[char] = s
cjk_ranges_found = []
# expand first-last ranges # expand first-last ranges
if expand: if expand:
field = None field = None
...@@ -826,12 +838,17 @@ class UnicodeData: ...@@ -826,12 +838,17 @@ class UnicodeData:
s[1] = "" s[1] = ""
field = s field = s
elif s[1][-5:] == "Last>": elif s[1][-5:] == "Last>":
if s[1].startswith("<CJK Ideograph"):
cjk_ranges_found.append((field[0],
s[0]))
s[1] = "" s[1] = ""
field = None field = None
elif field: elif field:
f2 = field[:] f2 = field[:]
f2[0] = "%X" % i f2[0] = "%X" % i
table[i] = f2 table[i] = f2
if cjk_check and cjk_ranges != cjk_ranges_found:
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
# public attributes # public attributes
self.filename = UNICODE_DATA % '' self.filename = UNICODE_DATA % ''
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment