""" Test script for the unicodedata module.

    Written by Marc-Andre Lemburg (mal@lemburg.com).

    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""

import hashlib
import subprocess
import sys
import test.support
import unittest

# Codec settings used by UnicodeMethodsTest when it encodes the per-character
# results before hashing them.  That test walks every code point below
# 0x10000, which includes the lone surrogates; a strict UTF-8 encoder rejects
# those, so 'surrogatepass' is used to let them through.
encoding = 'utf-8'
errors = 'surrogatepass'

### Run tests

class UnicodeMethodsTest(unittest.TestCase):
    """Checksum test for the Unicode-aware predicates and mappings of str."""

    # update this, if the database changes
    expectedchecksum = 'e74e878de71b6e780ffac271785c3cb58f6251f3'

    def test_method_checksum(self):
        # Feed the results of the str predicates and case mappings for every
        # code point below 0x10000 into one SHA-1 hash and compare the digest
        # with the recorded value, so any change in behaviour is detected.
        h = hashlib.sha1()
        for i in range(0x10000):
            char = chr(i)
            data = [
                # Predicates (single char)
                "01"[char.isalnum()],
                "01"[char.isalpha()],
                "01"[char.isdecimal()],
                "01"[char.isdigit()],
                "01"[char.islower()],
                "01"[char.isnumeric()],
                "01"[char.isspace()],
                "01"[char.istitle()],
                "01"[char.isupper()],

                # Predicates (multiple chars)
                "01"[(char + 'abc').isalnum()],
                "01"[(char + 'abc').isalpha()],
                "01"[(char + '123').isdecimal()],
                "01"[(char + '123').isdigit()],
                "01"[(char + 'abc').islower()],
                "01"[(char + '123').isnumeric()],
                "01"[(char + ' \t').isspace()],
                "01"[(char + 'abc').istitle()],
                "01"[(char + 'ABC').isupper()],

                # Mappings (single char)
                char.lower(),
                char.upper(),
                char.title(),

                # Mappings (multiple chars)
                (char + 'abc').lower(),
                (char + 'ABC').upper(),
                (char + 'abc').title(),
                (char + 'ABC').title(),

                ]
            # The module-level codec settings allow lone surrogates to be
            # encoded instead of raising UnicodeEncodeError.
            h.update(''.join(data).encode(encoding, errors))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

class UnicodeDatabaseTest(unittest.TestCase):
    """Base fixture that exposes the unicodedata module as ``self.db``."""

    def setUp(self):
        # The import is deferred to fixture setup: if unicodedata is not
        # available the ImportError is raised here, per test, and the other
        # test cases will still be run.
        import unicodedata as database
        self.db = database

    def tearDown(self):
        # Drop the per-test reference to the module again.
        delattr(self, 'db')

class UnicodeFunctionsTest(UnicodeDatabaseTest):
    """Tests for the lookup functions of the unicodedata module (self.db)."""

    # update this, if the database changes
    expectedchecksum = 'f0b74d26776331cc7bdc3a4698f037d73f2cee2b'

    def test_function_checksum(self):
        # Hash the database properties of every code point below 0x10000 and
        # compare the digest with the recorded value.
        h = hashlib.sha1()

        for i in range(0x10000):
            char = chr(i)
            data = [
                # Properties
                format(self.db.digit(char, -1), '.12g'),
                format(self.db.numeric(char, -1), '.12g'),
                format(self.db.decimal(char, -1), '.12g'),
                self.db.category(char),
                self.db.bidirectional(char),
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
            ]
            h.update(''.join(data).encode("ascii"))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

    def test_digit(self):
        self.assertEqual(self.db.digit('A', None), None)
        self.assertEqual(self.db.digit('9'), 9)
        self.assertEqual(self.db.digit('\u215b', None), None)
        self.assertEqual(self.db.digit('\u2468'), 9)
        self.assertEqual(self.db.digit('\U00020000', None), None)
        self.assertEqual(self.db.digit('\U0001D7FD'), 7)

        # No argument, a multi-character string, and a character that has
        # no digit value when no default is supplied.
        self.assertRaises(TypeError, self.db.digit)
        self.assertRaises(TypeError, self.db.digit, 'xx')
        self.assertRaises(ValueError, self.db.digit, 'x')

    def test_numeric(self):
        self.assertEqual(self.db.numeric('A',None), None)
        self.assertEqual(self.db.numeric('9'), 9)
        self.assertEqual(self.db.numeric('\u215b'), 0.125)
        self.assertEqual(self.db.numeric('\u2468'), 9.0)
        self.assertEqual(self.db.numeric('\ua627'), 7.0)
        self.assertEqual(self.db.numeric('\U00020000', None), None)
        self.assertEqual(self.db.numeric('\U0001012A'), 9000)

        self.assertRaises(TypeError, self.db.numeric)
        self.assertRaises(TypeError, self.db.numeric, 'xx')
        self.assertRaises(ValueError, self.db.numeric, 'x')

    def test_decimal(self):
        self.assertEqual(self.db.decimal('A',None), None)
        self.assertEqual(self.db.decimal('9'), 9)
        self.assertEqual(self.db.decimal('\u215b', None), None)
        self.assertEqual(self.db.decimal('\u2468', None), None)
        self.assertEqual(self.db.decimal('\U00020000', None), None)
        self.assertEqual(self.db.decimal('\U0001D7FD'), 7)

        self.assertRaises(TypeError, self.db.decimal)
        self.assertRaises(TypeError, self.db.decimal, 'xx')
        self.assertRaises(ValueError, self.db.decimal, 'x')

    def test_category(self):
        self.assertEqual(self.db.category('\uFFFE'), 'Cn')
        self.assertEqual(self.db.category('a'), 'Ll')
        self.assertEqual(self.db.category('A'), 'Lu')
        self.assertEqual(self.db.category('\U00020000'), 'Lo')
        self.assertEqual(self.db.category('\U0001012A'), 'No')

        self.assertRaises(TypeError, self.db.category)
        self.assertRaises(TypeError, self.db.category, 'xx')

    def test_bidirectional(self):
        self.assertEqual(self.db.bidirectional('\uFFFE'), '')
        self.assertEqual(self.db.bidirectional(' '), 'WS')
        self.assertEqual(self.db.bidirectional('A'), 'L')
        self.assertEqual(self.db.bidirectional('\U00020000'), 'L')

        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, 'xx')

    def test_decomposition(self):
        self.assertEqual(self.db.decomposition('\uFFFE'),'')
        self.assertEqual(self.db.decomposition('\u00bc'), '<fraction> 0031 2044 0034')

        self.assertRaises(TypeError, self.db.decomposition)
        self.assertRaises(TypeError, self.db.decomposition, 'xx')

    def test_mirrored(self):
        self.assertEqual(self.db.mirrored('\uFFFE'), 0)
        self.assertEqual(self.db.mirrored('a'), 0)
        self.assertEqual(self.db.mirrored('\u2201'), 1)
        self.assertEqual(self.db.mirrored('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.mirrored)
        self.assertRaises(TypeError, self.db.mirrored, 'xx')

    def test_combining(self):
        self.assertEqual(self.db.combining('\uFFFE'), 0)
        self.assertEqual(self.db.combining('a'), 0)
        self.assertEqual(self.db.combining('\u20e1'), 230)
        self.assertEqual(self.db.combining('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.combining)
        self.assertRaises(TypeError, self.db.combining, 'xx')

    def test_normalize(self):
        self.assertRaises(TypeError, self.db.normalize)
        self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx')
        self.assertEqual(self.db.normalize('NFKC', ''), '')
        # The rest can be found in test_normalization.py
        # which requires an external file.

    def test_pr29(self):
        # http://www.unicode.org/review/pr-29.html
        # See issues #1054943 and #10254.
        # Each of these strings must come back unchanged from NFC.
        composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161",
                    'Li\u030dt-s\u1e73\u0301',
                    '\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
                    + '\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
                    '\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
                    + '\u0938\u094d\u0924\u093e\u0928')
        for text in composed:
            self.assertEqual(self.db.normalize('NFC', text), text)

    def test_issue10254(self):
        # Crash reported in #10254
        a = 'C\u0338' * 20  + 'C\u0327'
        b = 'C\u0338' * 20  + '\xC7'
        self.assertEqual(self.db.normalize('NFC', a), b)

    def test_east_asian_width(self):
        eaw = self.db.east_asian_width
        # Only a str of length one is accepted.
        self.assertRaises(TypeError, eaw, b'a')
        self.assertRaises(TypeError, eaw, bytearray())
        self.assertRaises(TypeError, eaw, '')
        self.assertRaises(TypeError, eaw, 'ra')
        self.assertEqual(eaw('\x1e'), 'N')
        self.assertEqual(eaw('\x20'), 'Na')
        self.assertEqual(eaw('\uC894'), 'W')
        self.assertEqual(eaw('\uFF66'), 'H')
        self.assertEqual(eaw('\uFF1F'), 'F')
        self.assertEqual(eaw('\u2010'), 'A')
        self.assertEqual(eaw('\U00020000'), 'W')

class UnicodeMiscTest(UnicodeDatabaseTest):
    """Regression tests for assorted unicodedata / str issues."""

    def test_failed_import_during_compiling(self):
        # Issue 4367
        # Decoding \N escapes requires the unicodedata module. If it can't be
        # imported, we shouldn't segfault.

        # This program should raise a SyntaxError in the eval.
        code = "import sys;" \
            "sys.modules['unicodedata'] = None;" \
            """eval("'\\\\N{SOFT HYPHEN}'")"""
        args = [sys.executable, "-c", code]
        # We use a subprocess because the unicodedata module may already have
        # been loaded in this process.
        popen = subprocess.Popen(args, stderr=subprocess.PIPE)
        # communicate() drains stderr while waiting for the child, so a full
        # pipe cannot block it, and it closes the pipe even when one of the
        # assertions below fails.
        stderr = popen.communicate()[1]
        self.assertEqual(popen.returncode, 1)
        error = "SyntaxError: (unicode error) \\N escapes not supported " \
            "(can't load unicodedata module)"
        self.assertIn(error, stderr.decode("ascii"))

    def test_decimal_numeric_consistent(self):
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        count = 0
        for i in range(0x10000):
            c = chr(i)
            dec = self.db.decimal(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_digit_numeric_consistent(self):
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        count = 0
        for i in range(0x10000):
            c = chr(i)
            dec = self.db.digit(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_bug_1704793(self):
        self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')

    def test_ucd_510(self):
        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
        self.assertTrue(self.db.mirrored("\u0f3a"))
        self.assertFalse(self.db.ucd_3_2_0.mirrored("\u0f3a"))
        # Also, we now have two ways of representing
        # the upper-case mapping: as delta, or as absolute value
        self.assertEqual("a".upper(), 'A')
        self.assertEqual("\u1d79".upper(), '\ua77d')
        self.assertEqual(".".upper(), '.')

    def test_bug_5828(self):
        self.assertEqual("\u1d79".lower(), "\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
        self.assertEqual(
            [
                c for c in range(sys.maxunicode+1)
                if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title()
            ],
            [0]
        )

    def test_bug_4971(self):
        # LETTER DZ WITH CARON: DZ, Dz, dz
        self.assertEqual("\u01c4".title(), "\u01c5")
        self.assertEqual("\u01c5".title(), "\u01c5")
        self.assertEqual("\u01c6".title(), "\u01c5")

    def test_linebreak_7643(self):
        # The only code points below 0x10000 that str.splitlines() is
        # expected to split on; built once, outside the loop.
        linebreaks = frozenset((0x0a, 0x0b, 0x0c, 0x0d, 0x85,
                                0x1c, 0x1d, 0x1e, 0x2028, 0x2029))
        for i in range(0x10000):
            lines = (chr(i) + 'A').splitlines()
            if i in linebreaks:
                self.assertEqual(len(lines), 2,
                                 r"\u%.4x should be a linebreak" % i)
            else:
                self.assertEqual(len(lines), 1,
                                 r"\u%.4x should not be a linebreak" % i)

def test_main():
    # Entry point: run the three concrete test case classes of this module
    # through test.support.run_unittest.
    test.support.run_unittest(
        UnicodeMiscTest,
        UnicodeMethodsTest,
        UnicodeFunctionsTest
    )

if __name__ == "__main__":
    test_main()