""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import unicodedata
import unittest

from test import support

class UnicodeNamesTest(unittest.TestCase):
    """Tests for Unicode character names.

    Covers named-character string escapes (evaluated through
    ``checkletter``), ``unicodedata.name`` / ``unicodedata.lookup`` and
    strict-mode failures of the ``unicode-escape`` codec.
    """

    def checkletter(self, name, code):
        """Assert that the named escape for *name* evaluates to *code*.

        Returns the evaluated string so callers can assemble longer
        strings from individual characters.
        """
        # Helper that puts all \N escapes inside eval'd raw strings,
        # to make sure this script runs even if the compiler
        # chokes on \N escapes
        res = eval(r'"\N{%s}"' % name)
        self.assertEqual(res, code)
        return res

    def test_general(self):
        """Spell a sentence from character names, mixing name casing."""
        # General and case insensitivity test:
        chars = [
            "LATIN CAPITAL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER R",
            "LATIN CAPITAL LETTER E",
            "LATIN SMALL LETTER D",
            "SPACE",
            "LATIN SMALL LETTER f",
            "LATIN CAPITAL LeTtEr o",
            "LATIN SMaLl LETTER x",
            "SPACE",
            "LATIN SMALL LETTER A",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SpAcE",
            "LATIN SMALL LETTER S",
            "LATIN SMALL LETTER H",
            "LATIN small LETTER e",
            "LATIN small LETTER e",
            "LATIN SMALL LETTER P",
            "FULL STOP",
        ]
        string = "The rEd fOx ate the sheep."

        # zip() stops at the shorter input, so a mismatch between the two
        # fixtures would otherwise go unnoticed.
        self.assertEqual(len(chars), len(string))
        self.assertEqual(
            "".join(self.checkletter(*args) for args in zip(chars, string)),
            string
        )

    def test_ascii_letters(self):
        """Round-trip every lowercase ASCII letter through lookup() and name()."""
        # The upper bound is ord("z") + 1 so that "z" itself is covered.
        for code_point in range(ord("a"), ord("z") + 1):
            char = chr(code_point)
            name = "LATIN SMALL LETTER %s" % char.upper()
            code = unicodedata.lookup(name)
            self.assertEqual(code, char)
            self.assertEqual(unicodedata.name(code), name)

    def test_hangul_syllables(self):
        """Check a sample of Hangul syllable names and the first code past them."""
        self.checkletter("HANGUL SYLLABLE GA", "\uac00")
        self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
        self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
        self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
        self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
        self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
        self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
        self.checkletter("HANGUL SYLLABLE YI", "\uc758")
        self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
        self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
        self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
        self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
        self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")

        # U+D7A4 directly follows the last syllable checked above and is
        # expected to have no name.
        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")

    def test_cjk_unified_ideographs(self):
        """Check CJK unified ideograph names in and beyond the BMP."""
        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
        self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", "\u9fa5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")

    def test_bmp_characters(self):
        """Every named BMP character must map back to itself via lookup()."""
        count = 0
        for code in range(0x10000):
            char = chr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)
                count += 1
        # Guard against a vacuous pass if no character reported a name.
        self.assertGreater(count, 0)

    def test_misc_symbols(self):
        """Check a few symbol and half/full-width character names."""
        self.checkletter("PILCROW SIGN", "\u00b6")
        self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")

    def test_errors(self):
        """name() and lookup() must reject missing or invalid arguments."""
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, 'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')

    def test_strict_error_handling(self):
        """Malformed named escapes must raise under strict decoding."""
        malformed = [
            # bogus character name
            b"\\N{blah}",
            # long bogus character name
            bytes("\\N{%s}" % ("x" * 100000), "ascii"),
            # missing closing brace
            b"\\N{SPACE",
            # missing opening brace
            b"\\NSPACE",
        ]
        for data in malformed:
            # Only a prefix is reported so the 100000-character case does
            # not flood the failure output.
            with self.subTest(data=data[:20]):
                self.assertRaises(
                    UnicodeError,
                    str, data, 'unicode-escape', 'strict'
                )

def test_main():
    """Run the UnicodeNamesTest cases through ``support.run_unittest``."""
    support.run_unittest(UnicodeNamesTest)

# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()