Commit d0052d17 authored by Amaury Forgeot d'Arc

#1571184: makeunicodedata.py now generates the functions _PyUnicode_ToNumeric,
_PyUnicode_IsLinebreak and _PyUnicode_IsWhitespace. It also parses Unihan.txt
for numeric values.
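For context, a minimal usage sketch (not part of the commit) of what the regenerated tables expose through the unicodedata module on a Python 2.7 built with them. The first three expected values are taken from the test changes below; the CJK ideograph U+4E5D is an assumed illustration of a character whose numeric value would now come from the Unihan kPrimaryNumeric tag rather than UnicodeData.txt.

    # Minimal sketch, assuming an interpreter built with the regenerated database.
    import unicodedata

    assert unicodedata.numeric(u'\u215b') == 0.125  # VULGAR FRACTION ONE EIGHTH
    assert unicodedata.numeric(u'\u2468') == 9.0    # CIRCLED DIGIT NINE
    assert unicodedata.numeric(u'\ua627') == 7.0    # assertion added by this commit's test
    # Assumed example: U+4E5D (CJK ideograph "nine") is expected to get its value
    # from the Unihan kPrimaryNumeric field parsed by the updated makeunicodedata.py.
    assert unicodedata.numeric(u'\u4e5d') == 9.0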
Parent 85ea4bf7
@@ -20,7 +20,7 @@ encoding = 'utf-8'
 class UnicodeMethodsTest(unittest.TestCase):
 
     # update this, if the database changes
-    expectedchecksum = '6ec65b65835614ec00634c674bba0e50cd32c189'
+    expectedchecksum = '0b915116051f3ed029a98542c2b7df63c9646272'
 
     def test_method_checksum(self):
         h = hashlib.sha1()
@@ -79,7 +79,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
 class UnicodeFunctionsTest(UnicodeDatabaseTest):
 
     # update this, if the database changes
-    expectedchecksum = '3136d5afd787dc2bcb1bdcac95e385349fbebbca'
+    expectedchecksum = 'd4169ccff998ebbd1ec007a0b3fbd66e5ccf0229'
 
     def test_function_checksum(self):
         data = []
@@ -118,6 +118,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
         self.assertEqual(self.db.numeric(u'9'), 9)
         self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
         self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
+        self.assertEqual(self.db.numeric(u'\ua627'), 7.0)
         self.assertEqual(self.db.numeric(u'\U00020000', None), None)
         self.assertRaises(TypeError, self.db.numeric)
...
@@ -12,6 +12,11 @@ What's New in Python 2.7 alpha 1
 Core and Builtins
 -----------------
 
+- Issue #1571184: The Unicode database contains properties for more characters.
+  The tables for code points representing numeric values, white spaces or line
+  breaks are now generated from the official Unicode Character Database files,
+  and include information from the Unihan.txt file.
+
 - Issue #7050: Fix a SystemError when trying to use unpacking and augmented
   assignment.
...
@@ -36,7 +36,7 @@ typedef struct change_record {
     const unsigned char category_changed;
     const unsigned char decimal_changed;
     const unsigned char mirrored_changed;
-    const int numeric_changed;
+    const double numeric_changed;
 } change_record;
 
 /* data file generated by Tools/unicode/makeunicodedata.py */
...
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
@@ -34,6 +34,7 @@ UNIDATA_VERSION = "5.1.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
+UNIHAN = "Unihan%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 
 old_versions = ["3.2.0"]
@@ -59,6 +60,7 @@ SPACE_MASK = 0x20
 TITLE_MASK = 0x40
 UPPER_MASK = 0x80
 NODELTA_MASK = 0x100
+NUMERIC_MASK = 0x200
 
 def maketables(trace=0):
@@ -68,6 +70,7 @@ def maketables(trace=0):
     unicode = UnicodeData(UNICODE_DATA % version,
                           COMPOSITION_EXCLUSIONS % version,
                           EASTASIAN_WIDTH % version,
+                          UNIHAN % version,
                           DERIVEDNORMALIZATION_PROPS % version)
 
     print len(filter(None, unicode.table)), "characters"
@@ -76,7 +79,8 @@
         print "--- Reading", UNICODE_DATA % ("-"+version), "..."
         old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                   COMPOSITION_EXCLUSIONS % ("-"+version),
-                                  EASTASIAN_WIDTH % ("-"+version))
+                                  EASTASIAN_WIDTH % ("-"+version),
+                                  UNIHAN % ("-"+version))
         print len(filter(None, old_unicode.table)), "characters"
         merge_old_version(version, unicode, old_unicode)
@@ -352,6 +356,9 @@ def makeunicodetype(unicode, trace):
     table = [dummy]
     cache = {0: dummy}
     index = [0] * len(unicode.chars)
+    numeric = {}
+    spaces = []
+    linebreaks = []
 
     for char in unicode.chars:
         record = unicode.table[char]
@@ -367,8 +374,10 @@ def makeunicodetype(unicode, trace):
                 flags |= LOWER_MASK
             if category == "Zl" or bidirectional == "B":
                 flags |= LINEBREAK_MASK
+                linebreaks.append(char)
             if category == "Zs" or bidirectional in ("WS", "B", "S"):
                 flags |= SPACE_MASK
+                spaces.append(char)
             if category == "Lt":
                 flags |= TITLE_MASK
             if category == "Lu":
@@ -411,6 +420,9 @@ def makeunicodetype(unicode, trace):
             if record[7]:
                 flags |= DIGIT_MASK
                 digit = int(record[7])
+            if record[8]:
+                flags |= NUMERIC_MASK
+                numeric.setdefault(record[8], []).append(char)
             item = (
                 upper, lower, title, decimal, digit, flags
                 )
@@ -422,6 +434,9 @@
             index[char] = i
 
     print len(table), "unique character type entries"
+    print sum(map(len, numeric.values())), "numeric code points"
+    print len(spaces), "whitespace code points"
+    print len(linebreaks), "linebreak code points"
 
     print "--- Writing", FILE, "..."
@@ -443,6 +458,97 @@
     Array("index1", index1).dump(fp, trace)
     Array("index2", index2).dump(fp, trace)
 
+    # Generate code for _PyUnicode_ToNumeric()
+    numeric_items = numeric.items()
+    numeric_items.sort()
+    print >>fp, '/* Returns the numeric value as double for Unicode characters'
+    print >>fp, ' * having this property, -1.0 otherwise.'
+    print >>fp, ' */'
+    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
+    print >>fp, '{'
+    print >>fp, '    switch (ch) {'
+    for value, codepoints in numeric_items:
+        haswide = False
+        hasnonewide = False
+        codepoints.sort()
+        for codepoint in codepoints:
+            if codepoint < 0x10000:
+                hasnonewide = True
+            if codepoint >= 0x10000 and not haswide:
+                print >>fp, '#ifdef Py_UNICODE_WIDE'
+                haswide = True
+            print >>fp, '    case 0x%04X:' % (codepoint,)
+        if haswide and hasnonewide:
+            print >>fp, '#endif'
+        print >>fp, '        return (double) %s;' % (value,)
+        if haswide and not hasnonewide:
+            print >>fp, '#endif'
+    print >>fp,'    }'
+    print >>fp,'    return -1.0;'
+    print >>fp,'}'
+    print >>fp
+
+    # Generate code for _PyUnicode_IsWhitespace()
+    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
+    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
+    print >>fp, " */"
+    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
+    print >>fp, '{'
+    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
+    print >>fp, '    return iswspace(ch);'
+    print >>fp, '#else'
+    print >>fp, '    switch (ch) {'
+    haswide = False
+    hasnonewide = False
+    spaces.sort()
+    for codepoint in spaces:
+        if codepoint < 0x10000:
+            hasnonewide = True
+        if codepoint >= 0x10000 and not haswide:
+            print >>fp, '#ifdef Py_UNICODE_WIDE'
+            haswide = True
+        print >>fp, '    case 0x%04X:' % (codepoint,)
+    if haswide and hasnonewide:
+        print >>fp, '#endif'
+    print >>fp, '        return 1;'
+    if haswide and not hasnonewide:
+        print >>fp, '#endif'
+    print >>fp,'    }'
+    print >>fp,'    return 0;'
+    print >>fp, '#endif'
+    print >>fp,'}'
+    print >>fp
+
+    # Generate code for _PyUnicode_IsLinebreak()
+    print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl',"
+    print >>fp, " * 'Zp' or type 'B', 0 otherwise."
+    print >>fp, " */"
+    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
+    print >>fp, '{'
+    print >>fp, '    switch (ch) {'
+    haswide = False
+    hasnonewide = False
+    linebreaks.sort()
+    for codepoint in linebreaks:
+        if codepoint < 0x10000:
+            hasnonewide = True
+        if codepoint >= 0x10000 and not haswide:
+            print >>fp, '#ifdef Py_UNICODE_WIDE'
+            haswide = True
+        print >>fp, '    case 0x%04X:' % (codepoint,)
+    if haswide and hasnonewide:
+        print >>fp, '#endif'
+    print >>fp, '        return 1;'
+    if haswide and not hasnonewide:
+        print >>fp, '#endif'
+    print >>fp,'    }'
+    print >>fp,'    return 0;'
+    print >>fp,'}'
+    print >>fp
+
     fp.close()
 
 # --------------------------------------------------------------------
@@ -660,12 +766,11 @@ def merge_old_version(version, new, old):
                     elif k == 8:
                         # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                         # Since 0 encodes "no change", the old value is better not 0
-                        assert value != "0" and value != "-1"
                         if not value:
                             numeric_changes[i] = -1
                         else:
-                            assert re.match("^[0-9]+$", value)
-                            numeric_changes[i] = int(value)
+                            numeric_changes[i] = float(value)
+                            assert numeric_changes[i] not in (0, -1)
                     elif k == 9:
                         if value == 'Y':
                             mirrored_changes[i] = '1'
@@ -698,11 +803,9 @@
 # load a unicode-data file from disk
 
-import sys
-
 class UnicodeData:
 
-    def __init__(self, filename, exclusions, eastasianwidth,
+    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                  derivednormalizationprops=None, expand=1):
         self.changed = []
         file = open(filename)
@@ -789,6 +892,19 @@ class UnicodeData:
                 if table[i] is not None:
                     table[i].append(quickchecks[i])
 
+        for line in open(unihan):
+            if not line.startswith('U+'):
+                continue
+            code, tag, value = line.split(None, 3)[:3]
+            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
+                           'kOtherNumeric'):
+                continue
+            value = value.strip().replace(',', '')
+            i = int(code[2:], 16)
+            # Patch the numeric field
+            if table[i] is not None:
+                table[i][8] = value
+
     def uselatin1(self):
         # restrict character range to ISO Latin 1
         self.chars = range(256)
@@ -938,7 +1054,6 @@ def splitbins(t, trace=0):
     you'll get.
     """
 
-    import sys
 
     if trace:
         def dump(t1, t2, shift, bytes):
             print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
...