Commit d0052d17, authored by Amaury Forgeot d'Arc

#1571184: makeunicodedata.py now generates the functions _PyUnicode_ToNumeric,
_PyUnicode_IsLinebreak and _PyUnicode_IsWhitespace.

It now also parses the Unihan.txt file for numeric values.
parent 85ea4bf7
......@@ -20,7 +20,7 @@ encoding = 'utf-8'
class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes
expectedchecksum = '6ec65b65835614ec00634c674bba0e50cd32c189'
expectedchecksum = '0b915116051f3ed029a98542c2b7df63c9646272'
def test_method_checksum(self):
h = hashlib.sha1()
......@@ -79,7 +79,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
class UnicodeFunctionsTest(UnicodeDatabaseTest):
# update this, if the database changes
expectedchecksum = '3136d5afd787dc2bcb1bdcac95e385349fbebbca'
expectedchecksum = 'd4169ccff998ebbd1ec007a0b3fbd66e5ccf0229'
def test_function_checksum(self):
data = []
......@@ -118,6 +118,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
self.assertEqual(self.db.numeric(u'9'), 9)
self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
self.assertEqual(self.db.numeric(u'\ua627'), 7.0)
self.assertEqual(self.db.numeric(u'\U00020000', None), None)
self.assertRaises(TypeError, self.db.numeric)
......
......@@ -12,6 +12,11 @@ What's New in Python 2.7 alpha 1
Core and Builtins
-----------------
- Issue #1571184: The Unicode database contains properties for more characters.
The tables for code points representing numeric values, white spaces or line
breaks are now generated from the official Unicode Character Database files,
and include information from the Unihan.txt file.
- Issue #7050: Fix a SystemError when trying to use unpacking and augmented
assignment.
......
......@@ -36,7 +36,7 @@ typedef struct change_record {
const unsigned char category_changed;
const unsigned char decimal_changed;
const unsigned char mirrored_changed;
const int numeric_changed;
const double numeric_changed;
} change_record;
/* data file generated by Tools/unicode/makeunicodedata.py */
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -34,6 +34,7 @@ UNIDATA_VERSION = "5.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
old_versions = ["3.2.0"]
......@@ -59,6 +60,7 @@ SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200
def maketables(trace=0):
......@@ -68,6 +70,7 @@ def maketables(trace=0):
unicode = UnicodeData(UNICODE_DATA % version,
COMPOSITION_EXCLUSIONS % version,
EASTASIAN_WIDTH % version,
UNIHAN % version,
DERIVEDNORMALIZATION_PROPS % version)
print len(filter(None, unicode.table)), "characters"
......@@ -76,7 +79,8 @@ def maketables(trace=0):
print "--- Reading", UNICODE_DATA % ("-"+version), "..."
old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
COMPOSITION_EXCLUSIONS % ("-"+version),
EASTASIAN_WIDTH % ("-"+version))
EASTASIAN_WIDTH % ("-"+version),
UNIHAN % ("-"+version))
print len(filter(None, old_unicode.table)), "characters"
merge_old_version(version, unicode, old_unicode)
......@@ -352,6 +356,9 @@ def makeunicodetype(unicode, trace):
table = [dummy]
cache = {0: dummy}
index = [0] * len(unicode.chars)
numeric = {}
spaces = []
linebreaks = []
for char in unicode.chars:
record = unicode.table[char]
......@@ -367,8 +374,10 @@ def makeunicodetype(unicode, trace):
flags |= LOWER_MASK
if category == "Zl" or bidirectional == "B":
flags |= LINEBREAK_MASK
linebreaks.append(char)
if category == "Zs" or bidirectional in ("WS", "B", "S"):
flags |= SPACE_MASK
spaces.append(char)
if category == "Lt":
flags |= TITLE_MASK
if category == "Lu":
......@@ -411,6 +420,9 @@ def makeunicodetype(unicode, trace):
if record[7]:
flags |= DIGIT_MASK
digit = int(record[7])
if record[8]:
flags |= NUMERIC_MASK
numeric.setdefault(record[8], []).append(char)
item = (
upper, lower, title, decimal, digit, flags
)
......@@ -422,6 +434,9 @@ def makeunicodetype(unicode, trace):
index[char] = i
print len(table), "unique character type entries"
print sum(map(len, numeric.values())), "numeric code points"
print len(spaces), "whitespace code points"
print len(linebreaks), "linebreak code points"
print "--- Writing", FILE, "..."
......@@ -443,6 +458,97 @@ def makeunicodetype(unicode, trace):
Array("index1", index1).dump(fp, trace)
Array("index2", index2).dump(fp, trace)
# Generate code for _PyUnicode_ToNumeric()
numeric_items = numeric.items()
numeric_items.sort()
print >>fp, '/* Returns the numeric value as double for Unicode characters'
print >>fp, ' * having this property, -1.0 otherwise.'
print >>fp, ' */'
print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
print >>fp, '{'
print >>fp, ' switch (ch) {'
for value, codepoints in numeric_items:
haswide = False
hasnonewide = False
codepoints.sort()
for codepoint in codepoints:
if codepoint < 0x10000:
hasnonewide = True
if codepoint >= 0x10000 and not haswide:
print >>fp, '#ifdef Py_UNICODE_WIDE'
haswide = True
print >>fp, ' case 0x%04X:' % (codepoint,)
if haswide and hasnonewide:
print >>fp, '#endif'
print >>fp, ' return (double) %s;' % (value,)
if haswide and not hasnonewide:
print >>fp, '#endif'
print >>fp,' }'
print >>fp,' return -1.0;'
print >>fp,'}'
print >>fp
# Generate code for _PyUnicode_IsWhitespace()
print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
print >>fp, " */"
print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
print >>fp, '{'
print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
print >>fp, ' return iswspace(ch);'
print >>fp, '#else'
print >>fp, ' switch (ch) {'
haswide = False
hasnonewide = False
spaces.sort()
for codepoint in spaces:
if codepoint < 0x10000:
hasnonewide = True
if codepoint >= 0x10000 and not haswide:
print >>fp, '#ifdef Py_UNICODE_WIDE'
haswide = True
print >>fp, ' case 0x%04X:' % (codepoint,)
if haswide and hasnonewide:
print >>fp, '#endif'
print >>fp, ' return 1;'
if haswide and not hasnonewide:
print >>fp, '#endif'
print >>fp,' }'
print >>fp,' return 0;'
print >>fp, '#endif'
print >>fp,'}'
print >>fp
# Generate code for _PyUnicode_IsLinebreak()
print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl',"
print >>fp, " * 'Zp' or type 'B', 0 otherwise."
print >>fp, " */"
print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
print >>fp, '{'
print >>fp, ' switch (ch) {'
haswide = False
hasnonewide = False
linebreaks.sort()
for codepoint in linebreaks:
if codepoint < 0x10000:
hasnonewide = True
if codepoint >= 0x10000 and not haswide:
print >>fp, '#ifdef Py_UNICODE_WIDE'
haswide = True
print >>fp, ' case 0x%04X:' % (codepoint,)
if haswide and hasnonewide:
print >>fp, '#endif'
print >>fp, ' return 1;'
if haswide and not hasnonewide:
print >>fp, '#endif'
print >>fp,' }'
print >>fp,' return 0;'
print >>fp,'}'
print >>fp
fp.close()
# --------------------------------------------------------------------
......@@ -660,12 +766,11 @@ def merge_old_version(version, new, old):
elif k == 8:
# print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
# Since 0 encodes "no change", the old value is better not 0
assert value != "0" and value != "-1"
if not value:
numeric_changes[i] = -1
else:
assert re.match("^[0-9]+$", value)
numeric_changes[i] = int(value)
numeric_changes[i] = float(value)
assert numeric_changes[i] not in (0, -1)
elif k == 9:
if value == 'Y':
mirrored_changes[i] = '1'
......@@ -698,11 +803,9 @@ def merge_old_version(version, new, old):
# load a unicode-data file from disk
import sys
class UnicodeData:
def __init__(self, filename, exclusions, eastasianwidth,
def __init__(self, filename, exclusions, eastasianwidth, unihan,
derivednormalizationprops=None, expand=1):
self.changed = []
file = open(filename)
......@@ -789,6 +892,19 @@ class UnicodeData:
if table[i] is not None:
table[i].append(quickchecks[i])
for line in open(unihan):
if not line.startswith('U+'):
continue
code, tag, value = line.split(None, 3)[:3]
if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
'kOtherNumeric'):
continue
value = value.strip().replace(',', '')
i = int(code[2:], 16)
# Patch the numeric field
if table[i] is not None:
table[i][8] = value
def uselatin1(self):
# restrict character range to ISO Latin 1
self.chars = range(256)
......@@ -938,7 +1054,6 @@ def splitbins(t, trace=0):
you'll get.
"""
import sys
if trace:
def dump(t1, t2, shift, bytes):
print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment