Commit d0052d17 authored by Amaury Forgeot d'Arc

#1571184: makeunicodedata.py now generates the functions _PyUnicode_ToNumeric,
_PyUnicode_IsLinebreak and _PyUnicode_IsWhitespace. It also parses Unihan.txt
for numeric values.
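For context, a minimal usage sketch (not part of the commit) of what the regenerated tables expose through the unicodedata module on a Python 2.7 built with them. The first three expected values are taken from the test changes below; the CJK ideograph U+4E5D is an assumed illustration of a character whose numeric value would now come from the Unihan kPrimaryNumeric tag rather than UnicodeData.txt.

    # Minimal sketch, assuming an interpreter built with the regenerated database.
    import unicodedata

    assert unicodedata.numeric(u'\u215b') == 0.125  # VULGAR FRACTION ONE EIGHTH
    assert unicodedata.numeric(u'\u2468') == 9.0    # CIRCLED DIGIT NINE
    assert unicodedata.numeric(u'\ua627') == 7.0    # assertion added by this commit's test
    # Assumed example: U+4E5D (CJK ideograph "nine") is expected to get its value
    # from the Unihan kPrimaryNumeric field parsed by the updated makeunicodedata.py.
    assert unicodedata.numeric(u'\u4e5d') == 9.0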
Parent 85ea4bf7
@@ -20,7 +20,7 @@ encoding = 'utf-8'
 class UnicodeMethodsTest(unittest.TestCase):
 
     # update this, if the database changes
-    expectedchecksum = '6ec65b65835614ec00634c674bba0e50cd32c189'
+    expectedchecksum = '0b915116051f3ed029a98542c2b7df63c9646272'
 
     def test_method_checksum(self):
         h = hashlib.sha1()
@@ -79,7 +79,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
 class UnicodeFunctionsTest(UnicodeDatabaseTest):
 
     # update this, if the database changes
-    expectedchecksum = '3136d5afd787dc2bcb1bdcac95e385349fbebbca'
+    expectedchecksum = 'd4169ccff998ebbd1ec007a0b3fbd66e5ccf0229'
 
     def test_function_checksum(self):
         data = []
@@ -118,6 +118,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
         self.assertEqual(self.db.numeric(u'9'), 9)
         self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
         self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
+        self.assertEqual(self.db.numeric(u'\ua627'), 7.0)
         self.assertEqual(self.db.numeric(u'\U00020000', None), None)
         self.assertRaises(TypeError, self.db.numeric)
...
@@ -12,6 +12,11 @@ What's New in Python 2.7 alpha 1
 Core and Builtins
 -----------------
 
+- Issue #1571184: The Unicode database contains properties for more characters.
+  The tables for code points representing numeric values, white spaces or line
+  breaks are now generated from the official Unicode Character Database files,
+  and include information from the Unihan.txt file.
+
 - Issue #7050: Fix a SystemError when trying to use unpacking and augmented
   assignment.
...
@@ -36,7 +36,7 @@ typedef struct change_record {
     const unsigned char category_changed;
     const unsigned char decimal_changed;
     const unsigned char mirrored_changed;
-    const int numeric_changed;
+    const double numeric_changed;
 } change_record;
 
 /* data file generated by Tools/unicode/makeunicodedata.py */
...
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
@@ -34,6 +34,7 @@ UNIDATA_VERSION = "5.1.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
+UNIHAN = "Unihan%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 
 old_versions = ["3.2.0"]
@@ -59,6 +60,7 @@ SPACE_MASK = 0x20
 TITLE_MASK = 0x40
 UPPER_MASK = 0x80
 NODELTA_MASK = 0x100
+NUMERIC_MASK = 0x200
 
 def maketables(trace=0):
@@ -68,6 +70,7 @@ def maketables(trace=0):
     unicode = UnicodeData(UNICODE_DATA % version,
                           COMPOSITION_EXCLUSIONS % version,
                           EASTASIAN_WIDTH % version,
+                          UNIHAN % version,
                           DERIVEDNORMALIZATION_PROPS % version)
 
     print len(filter(None, unicode.table)), "characters"
@@ -76,7 +79,8 @@
         print "--- Reading", UNICODE_DATA % ("-"+version), "..."
         old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                   COMPOSITION_EXCLUSIONS % ("-"+version),
-                                  EASTASIAN_WIDTH % ("-"+version))
+                                  EASTASIAN_WIDTH % ("-"+version),
+                                  UNIHAN % ("-"+version))
         print len(filter(None, old_unicode.table)), "characters"
         merge_old_version(version, unicode, old_unicode)
@@ -352,6 +356,9 @@ def makeunicodetype(unicode, trace):
     table = [dummy]
     cache = {0: dummy}
     index = [0] * len(unicode.chars)
+    numeric = {}
+    spaces = []
+    linebreaks = []
 
     for char in unicode.chars:
         record = unicode.table[char]
@@ -367,8 +374,10 @@ def makeunicodetype(unicode, trace):
                 flags |= LOWER_MASK
             if category == "Zl" or bidirectional == "B":
                 flags |= LINEBREAK_MASK
+                linebreaks.append(char)
             if category == "Zs" or bidirectional in ("WS", "B", "S"):
                 flags |= SPACE_MASK
+                spaces.append(char)
             if category == "Lt":
                 flags |= TITLE_MASK
             if category == "Lu":
@@ -411,6 +420,9 @@ def makeunicodetype(unicode, trace):
             if record[7]:
                 flags |= DIGIT_MASK
                 digit = int(record[7])
+            if record[8]:
+                flags |= NUMERIC_MASK
+                numeric.setdefault(record[8], []).append(char)
             item = (
                 upper, lower, title, decimal, digit, flags
                 )
@@ -422,6 +434,9 @@
             index[char] = i
 
     print len(table), "unique character type entries"
+    print sum(map(len, numeric.values())), "numeric code points"
+    print len(spaces), "whitespace code points"
+    print len(linebreaks), "linebreak code points"
 
     print "--- Writing", FILE, "..."
@@ -443,6 +458,97 @@
     Array("index1", index1).dump(fp, trace)
     Array("index2", index2).dump(fp, trace)
 
+    # Generate code for _PyUnicode_ToNumeric()
+    numeric_items = numeric.items()
+    numeric_items.sort()
+    print >>fp, '/* Returns the numeric value as double for Unicode characters'
+    print >>fp, ' * having this property, -1.0 otherwise.'
+    print >>fp, ' */'
+    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
+    print >>fp, '{'
+    print >>fp, '    switch (ch) {'
+    for value, codepoints in numeric_items:
+        haswide = False
+        hasnonewide = False
+        codepoints.sort()
+        for codepoint in codepoints:
+            if codepoint < 0x10000:
+                hasnonewide = True
+            if codepoint >= 0x10000 and not haswide:
+                print >>fp, '#ifdef Py_UNICODE_WIDE'
+                haswide = True
+            print >>fp, '    case 0x%04X:' % (codepoint,)
+        if haswide and hasnonewide:
+            print >>fp, '#endif'
+        print >>fp, '        return (double) %s;' % (value,)
+        if haswide and not hasnonewide:
+            print >>fp, '#endif'
+    print >>fp,'    }'
+    print >>fp,'    return -1.0;'
+    print >>fp,'}'
+    print >>fp
+
+    # Generate code for _PyUnicode_IsWhitespace()
+    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
+    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
+    print >>fp, " */"
+    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
+    print >>fp, '{'
+    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
+    print >>fp, '    return iswspace(ch);'
+    print >>fp, '#else'
+    print >>fp, '    switch (ch) {'
+    haswide = False
+    hasnonewide = False
+    spaces.sort()
+    for codepoint in spaces:
+        if codepoint < 0x10000:
+            hasnonewide = True
+        if codepoint >= 0x10000 and not haswide:
+            print >>fp, '#ifdef Py_UNICODE_WIDE'
+            haswide = True
+        print >>fp, '    case 0x%04X:' % (codepoint,)
+    if haswide and hasnonewide:
+        print >>fp, '#endif'
+    print >>fp, '        return 1;'
+    if haswide and not hasnonewide:
+        print >>fp, '#endif'
+    print >>fp,'    }'
+    print >>fp,'    return 0;'
+    print >>fp, '#endif'
+    print >>fp,'}'
+    print >>fp
+
+    # Generate code for _PyUnicode_IsLinebreak()
+    print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl',"
+    print >>fp, " * 'Zp' or type 'B', 0 otherwise."
+    print >>fp, " */"
+    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
+    print >>fp, '{'
+    print >>fp, '    switch (ch) {'
+    haswide = False
+    hasnonewide = False
+    linebreaks.sort()
+    for codepoint in linebreaks:
+        if codepoint < 0x10000:
+            hasnonewide = True
+        if codepoint >= 0x10000 and not haswide:
+            print >>fp, '#ifdef Py_UNICODE_WIDE'
+            haswide = True
+        print >>fp, '    case 0x%04X:' % (codepoint,)
+    if haswide and hasnonewide:
+        print >>fp, '#endif'
+    print >>fp, '        return 1;'
+    if haswide and not hasnonewide:
+        print >>fp, '#endif'
+    print >>fp,'    }'
+    print >>fp,'    return 0;'
+    print >>fp,'}'
+    print >>fp
+
     fp.close()
 
 # --------------------------------------------------------------------
@@ -660,12 +766,11 @@ def merge_old_version(version, new, old):
                     elif k == 8:
                         # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                         # Since 0 encodes "no change", the old value is better not 0
-                        assert value != "0" and value != "-1"
                         if not value:
                             numeric_changes[i] = -1
                         else:
-                            assert re.match("^[0-9]+$", value)
-                            numeric_changes[i] = int(value)
+                            numeric_changes[i] = float(value)
+                            assert numeric_changes[i] not in (0, -1)
                     elif k == 9:
                         if value == 'Y':
                             mirrored_changes[i] = '1'
@@ -698,11 +803,9 @@
 # load a unicode-data file from disk
 
-import sys
-
 class UnicodeData:
 
-    def __init__(self, filename, exclusions, eastasianwidth,
+    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                  derivednormalizationprops=None, expand=1):
         self.changed = []
         file = open(filename)
@@ -789,6 +892,19 @@ class UnicodeData:
                 if table[i] is not None:
                     table[i].append(quickchecks[i])
 
+        for line in open(unihan):
+            if not line.startswith('U+'):
+                continue
+            code, tag, value = line.split(None, 3)[:3]
+            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
+                           'kOtherNumeric'):
+                continue
+            value = value.strip().replace(',', '')
+            i = int(code[2:], 16)
+            # Patch the numeric field
+            if table[i] is not None:
+                table[i][8] = value
+
     def uselatin1(self):
         # restrict character range to ISO Latin 1
         self.chars = range(256)
@@ -938,7 +1054,6 @@ def splitbins(t, trace=0):
     you'll get.
     """
 
-    import sys
 
     if trace:
         def dump(t1, t2, shift, bytes):
             print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
...