test_ucn.py 9.35 KB
Newer Older
1 2 3
""" Test script for the Unicode implementation.

Written by Bill Tutt.
4
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
5 6 7 8

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
9 10

import unittest
11
import unicodedata
12

13
from test import support
14 15
from http.client import HTTPException
from test.test_normalization import check_version
16

17 18 19 20 21
try:
    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
except ImportError:
    INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1

22 23 24 25
class UnicodeNamesTest(unittest.TestCase):

    def checkletter(self, name, code):
        # Helper that put all \N escapes inside eval'd raw strings,
Tim Peters's avatar
Tim Peters committed
26
        # to make sure this script runs even if the compiler
27
        # chokes on \N escapes
28
        res = eval(r'"\N{%s}"' % name)
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
        self.assertEqual(res, code)
        return res

    def test_general(self):
        # General and case insensitivity test:
        chars = [
            "LATIN CAPITAL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER R",
            "LATIN CAPITAL LETTER E",
            "LATIN SMALL LETTER D",
            "SPACE",
            "LATIN SMALL LETTER f",
            "LATIN CAPITAL LeTtEr o",
            "LATIN SMaLl LETTER x",
            "SPACE",
            "LATIN SMALL LETTER A",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SpAcE",
            "LATIN SMALL LETTER S",
            "LATIN SMALL LETTER H",
            "LATIN small LETTER e",
            "LATIN small LETTER e",
            "LATIN SMALL LETTER P",
            "FULL STOP"
        ]
62
        string = "The rEd fOx ate the sheep."
63 64

        self.assertEqual(
65
            "".join([self.checkletter(*args) for args in zip(chars, string)]),
66 67 68 69
            string
        )

    def test_ascii_letters(self):
70
        for char in "".join(map(chr, range(ord("a"), ord("z")))):
71 72 73 74 75
            name = "LATIN SMALL LETTER %s" % char.upper()
            code = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(code), name)

    def test_hangul_syllables(self):
76 77 78 79 80 81 82 83 84 85 86 87 88
        self.checkletter("HANGUL SYLLABLE GA", "\uac00")
        self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
        self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
        self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
        self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
        self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
        self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
        self.checkletter("HANGUL SYLLABLE YI", "\uc758")
        self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
        self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
        self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
        self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
        self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
89

90
        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
91 92

    def test_cjk_unified_ideographs(self):
93 94 95
        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
96
        self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
97 98
        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
99 100 101 102
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
103 104

    def test_bmp_characters(self):
105
        for code in range(0x10000):
106
            char = chr(code)
107 108 109 110 111
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)

    def test_misc_symbols(self):
112 113 114 115
        self.checkletter("PILCROW SIGN", "\u00b6")
        self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
116

117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
    def test_aliases(self):
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        aliases = [
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
        ]
        for alias, codepoint in aliases:
            self.checkletter(alias, chr(codepoint))
            name = unicodedata.name(chr(codepoint))
            self.assertNotEqual(name, alias)
            self.assertEqual(unicodedata.lookup(alias),
                             unicodedata.lookup(name))
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(alias)

    def test_aliases_names_in_pua_range(self):
        # We are storing aliases in the PUA 15, but their names shouldn't leak
        for cp in range(0xf0000, 0xf0100):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_names_in_pua_range(self):
        # We are storing named seq in the PUA 15, but their names shouldn't leak
        for cp in range(0xf0100, 0xf0fff):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_sample(self):
        # Check a few named sequences.  See #12753.
        sequences = [
            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
        ]
        for seqname, codepoints in sequences:
            self.assertEqual(unicodedata.lookup(seqname), codepoints)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)

    def test_named_sequences_full(self):
        # Check all the named sequences
175
        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
176 177 178 179
               unicodedata.unidata_version)
        try:
            testdata = support.open_urlresource(url, encoding="utf-8",
                                                check=check_version)
180
        except (OSError, HTTPException):
181 182 183 184 185 186 187 188 189 190 191 192 193 194
            self.skipTest("Could not retrieve " + url)
        self.addCleanup(testdata.close)
        for line in testdata:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            seqname, codepoints = line.split(';')
            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
            self.assertEqual(unicodedata.lookup(seqname), codepoints)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)

195 196
    def test_errors(self):
        self.assertRaises(TypeError, unicodedata.name)
197
        self.assertRaises(TypeError, unicodedata.name, 'xx')
198
        self.assertRaises(TypeError, unicodedata.lookup)
199
        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')
200

201
    def test_strict_error_handling(self):
202 203 204
        # bogus character name
        self.assertRaises(
            UnicodeError,
205
            str, b"\\N{blah}", 'unicode-escape', 'strict'
206 207 208 209
        )
        # long bogus character name
        self.assertRaises(
            UnicodeError,
210
            str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
211 212 213 214
        )
        # missing closing brace
        self.assertRaises(
            UnicodeError,
215
            str, b"\\N{SPACE", 'unicode-escape', 'strict'
216 217 218 219
        )
        # missing opening brace
        self.assertRaises(
            UnicodeError,
220
            str, b"\\NSPACE", 'unicode-escape', 'strict'
221 222
        )

223 224 225
    @support.cpython_only
    @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
    @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)
226
    def test_issue16335(self, size):
227
        # very very long bogus character name
228 229
        x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'
        self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
230 231 232 233
        self.assertRaisesRegex(UnicodeError,
            'unknown Unicode character name',
            x.decode, 'unicode-escape'
        )
234 235


236
if __name__ == "__main__":
237
    unittest.main()