Implement names for CJK unified ideographs. Add name to KeyError output.

Verify that the lookup for an existing name succeeds.

Implement names for CJK unified ideographs. Add name to KeyError output.
Verify that the lookup for an existing name succeeds.
ef7fe2e8 · Martin v. Löwis · 8579efc8 · ef7fe2e8 · ef7fe2e8 · ef7fe2e8
Commit ef7fe2e8 authored Nov 23, 2002 by Martin v. Löwis
Hide whitespace changes
Inline Side-by-side

Showing with 59 additions and 8 deletions

test_ucn Lib/test/output/test_ucn +3 -2

test_ucn.py Lib/test/test_ucn.py +16 -4

NEWS Misc/NEWS +1 -1

unicodedata.c Modules/unicodedata.c +39 -1

No files found.
--- a/Lib/test/output/test_ucn
+++ b/Lib/test/output/test_ucn
@@ -2,7 +2,8 @@ test_ucn
 Testing General Unicode Character Name, and case insensitivity... done.
 Testing name to code mapping.... done.
 Testing hangul syllable names.... done.
-Testing code to name mapping for all characters.... done.
-Found 22728 characters in the unicode name database
+Testing names of CJK unified ideographs.... done.
+Testing code to name mapping for all BMP characters.... done.
+Found 50212 characters in the unicode name database
 Testing misc. symbols for unicode character name expansion.... done.
 Testing unicode character name expansion strict error handling.... done.
--- a/Lib/test/test_ucn.py
+++ b/Lib/test/test_ucn.py
@@ -80,16 +80,28 @@ else:
    raise AssertionError, "Found name for U+D7A4"
 print "done."

-print "Testing code to name mapping for all characters....",
+print "Testing names of CJK unified ideographs....",
+exec r"""
+verify(u"\N{CJK UNIFIED IDEOGRAPH-3400}" == u"\u3400")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-4DB5}" == u"\u4db5")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-4E00}" == u"\u4e00")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-9FA5}" == u"\u9fa5")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-20000}" == u"\U00020000")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-2A6D6}" == u"\U0002a6d6")
+"""
+print "done."
+
+print "Testing code to name mapping for all BMP characters....",
 count = 0
-for code in range(65536):
+for code in range(0x10000):
    try:
        char = unichr(code)
        name = unicodedata.name(char)
-        verify(unicodedata.lookup(name) == char)
-        count += 1
    except (KeyError, ValueError):
        pass
+    else:
+        verify(unicodedata.lookup(name) == char)
+        count += 1
 print "done."

 print "Found", count, "characters in the unicode name database"

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -318,7 +318,7 @@ Extension modules
  is now named bsddb185.

 - unicodedata was updated to Unicode 3.2. In now also supports names
-  for Hangul syllables.
+  for Hangul syllables and CJK unified ideographs.

 - resource.getrlimit() now returns longs instead of ints.


--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -348,6 +348,16 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
 	return 1;
    }

+    if ((0x3400 <= code && code <= 0x4DB5) ||  /* CJK Ideograph Extension A */
+        (0x4E00 <= code && code <= 0x9FA5) ||  /* CJK Ideograph */
+        (0x20000 <= code && code <= 0x2A6D6)) {/* CJK Ideograph Extension B */
+        if (buflen < 28)
+            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
+            return 0;
+        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
+        return 1;
+    }
+
    if (code >= 0x110000)
        return 0;

@@ -449,6 +459,30 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
 	    *code = SBase + (L*VCount+V)*TCount + T;
 	    return 1;
 	}
+        /* Otherwise, it's an illegal syllable name. */
+        return 0;
+    }
+
+    /* Check for unified ideographs. */
+    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
+        /* Four or five hexdigits must follow. */
+        v = 0;
+        name += 22;
+        namelen -= 22;
+        if (namelen != 4 && namelen != 5)
+            return 0;
+        while (namelen--) {
+            v *= 16;
+            if (*name >= '0' && *name <= '9')
+                v += *name - '0';
+            else if (*name >= 'A' && *name <= 'F')
+                v += *name - 'A' + 10;
+            else
+                return 0;
+            name++;
+        }
+        *code = v;
+        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
@@ -535,7 +569,11 @@ unicodedata_lookup(PyObject* self, PyObject* args)
        return NULL;

    if (!_getcode(name, namelen, &code)) {
-        PyErr_SetString(PyExc_KeyError, "undefined character name");
+        char fmt[] = "undefined character name '%s'";
+        char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
+        sprintf(buf, fmt, name);
+        PyErr_SetString(PyExc_KeyError, buf);
+        PyMem_FREE(buf);
        return NULL;
    }