""" Test script for the unicodedata module.

    Written by Marc-Andre Lemburg (mal@lemburg.com).

    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""

import sys
import unittest
import hashlib
import subprocess
import test.support

# Codec settings used when the str-method results are fed into the checksum.
# 'surrogatepass' lets lone surrogates (U+D800..U+DFFF), which the checksum
# loop produces via chr(), be encoded to UTF-8 instead of raising.
encoding = 'utf-8'
errors = 'surrogatepass'


### Run tests

class UnicodeMethodsTest(unittest.TestCase):
    """Checksum test over the Unicode-aware ``str`` methods."""

    # update this, if the database changes
    expectedchecksum = '6ec65b65835614ec00634c674bba0e50cd32c189'

    def test_method_checksum(self):
        """Hash str predicate and mapping results for U+0000..U+FFFF.

        For each code point the predicate results (as '0'/'1') and the
        case-mapping results are joined, encoded with the module-level
        ``encoding``/``errors`` settings and fed into a SHA-1 hash.  The
        final hex digest must equal ``expectedchecksum``.
        """
        h = hashlib.sha1()
        for i in range(65536):
            char = chr(i)
            data = [
                # Predicates (single char)
                # "01"[flag] maps False/True to the characters '0'/'1'.
                "01"[char.isalnum()],
                "01"[char.isalpha()],
                "01"[char.isdecimal()],
                "01"[char.isdigit()],
                "01"[char.islower()],
                "01"[char.isnumeric()],
                "01"[char.isspace()],
                "01"[char.istitle()],
                "01"[char.isupper()],

                # Predicates (multiple chars)
                "01"[(char + 'abc').isalnum()],
                "01"[(char + 'abc').isalpha()],
                "01"[(char + '123').isdecimal()],
                "01"[(char + '123').isdigit()],
                "01"[(char + 'abc').islower()],
                "01"[(char + '123').isnumeric()],
                "01"[(char + ' \t').isspace()],
                "01"[(char + 'abc').istitle()],
                "01"[(char + 'ABC').isupper()],

                # Mappings (single char)
                char.lower(),
                char.upper(),
                char.title(),

                # Mappings (multiple chars)
                (char + 'abc').lower(),
                (char + 'ABC').upper(),
                (char + 'abc').title(),
                (char + 'ABC').title(),

                ]
            h.update(''.join(data).encode(encoding, errors))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

class UnicodeDatabaseTest(unittest.TestCase):
    """Base fixture that exposes the unicodedata module as ``self.db``."""

    def setUp(self):
        # The import is done per test rather than at module level: if
        # unicodedata is missing, the ImportError is raised here and the
        # test cases that do not need the module are still run.
        import unicodedata as database
        self.db = database

    def tearDown(self):
        # Drop the per-test reference installed by setUp().
        delattr(self, 'db')

class UnicodeFunctionsTest(UnicodeDatabaseTest):
    """Tests for the property lookup functions reachable through ``self.db``."""

    # update this, if the database changes
    expectedchecksum = '3136d5afd787dc2bcb1bdcac95e385349fbebbca'

    def test_function_checksum(self):
        """Hash the database properties of U+0000..U+FFFF.

        The string forms of all queried properties are joined per code
        point, encoded as ASCII and fed into a SHA-1 hash whose hex digest
        must equal ``expectedchecksum``.
        """
        h = hashlib.sha1()

        for i in range(0x10000):
            char = chr(i)
            data = [
                # Properties
                # -1 is passed as the default so that characters without
                # the property contribute "-1" instead of raising.
                str(self.db.digit(char, -1)),
                str(self.db.numeric(char, -1)),
                str(self.db.decimal(char, -1)),
                self.db.category(char),
                self.db.bidirectional(char),
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
            ]
            h.update(''.join(data).encode("ascii"))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

    def test_digit(self):
        """Check digit() values, defaults and argument errors."""
        self.assertEqual(self.db.digit('A', None), None)
        self.assertEqual(self.db.digit('9'), 9)
        self.assertEqual(self.db.digit('\u215b', None), None)
        self.assertEqual(self.db.digit('\u2468'), 9)
        self.assertEqual(self.db.digit('\U00020000', None), None)

        self.assertRaises(TypeError, self.db.digit)
        self.assertRaises(TypeError, self.db.digit, 'xx')
        self.assertRaises(ValueError, self.db.digit, 'x')

    def test_numeric(self):
        """Check numeric() values, defaults and argument errors."""
        self.assertEqual(self.db.numeric('A', None), None)
        self.assertEqual(self.db.numeric('9'), 9)
        self.assertEqual(self.db.numeric('\u215b'), 0.125)
        self.assertEqual(self.db.numeric('\u2468'), 9.0)
        self.assertEqual(self.db.numeric('\U00020000', None), None)

        self.assertRaises(TypeError, self.db.numeric)
        self.assertRaises(TypeError, self.db.numeric, 'xx')
        self.assertRaises(ValueError, self.db.numeric, 'x')

    def test_decimal(self):
        """Check decimal() values, defaults and argument errors."""
        self.assertEqual(self.db.decimal('A', None), None)
        self.assertEqual(self.db.decimal('9'), 9)
        self.assertEqual(self.db.decimal('\u215b', None), None)
        self.assertEqual(self.db.decimal('\u2468', None), None)
        self.assertEqual(self.db.decimal('\U00020000', None), None)

        self.assertRaises(TypeError, self.db.decimal)
        self.assertRaises(TypeError, self.db.decimal, 'xx')
        self.assertRaises(ValueError, self.db.decimal, 'x')

    def test_category(self):
        """Check category() results and argument errors."""
        self.assertEqual(self.db.category('\uFFFE'), 'Cn')
        self.assertEqual(self.db.category('a'), 'Ll')
        self.assertEqual(self.db.category('A'), 'Lu')
        self.assertEqual(self.db.category('\U00020000'), 'Lo')

        self.assertRaises(TypeError, self.db.category)
        self.assertRaises(TypeError, self.db.category, 'xx')

    def test_bidirectional(self):
        """Check bidirectional() results and argument errors."""
        self.assertEqual(self.db.bidirectional('\uFFFE'), '')
        self.assertEqual(self.db.bidirectional(' '), 'WS')
        self.assertEqual(self.db.bidirectional('A'), 'L')
        self.assertEqual(self.db.bidirectional('\U00020000'), 'L')

        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, 'xx')

    def test_decomposition(self):
        """Check decomposition() results and argument errors."""
        self.assertEqual(self.db.decomposition('\uFFFE'), '')
        self.assertEqual(self.db.decomposition('\u00bc'), '<fraction> 0031 2044 0034')

        self.assertRaises(TypeError, self.db.decomposition)
        self.assertRaises(TypeError, self.db.decomposition, 'xx')

    def test_mirrored(self):
        """Check mirrored() results and argument errors."""
        self.assertEqual(self.db.mirrored('\uFFFE'), 0)
        self.assertEqual(self.db.mirrored('a'), 0)
        self.assertEqual(self.db.mirrored('\u2201'), 1)
        self.assertEqual(self.db.mirrored('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.mirrored)
        self.assertRaises(TypeError, self.db.mirrored, 'xx')

    def test_combining(self):
        """Check combining() results and argument errors."""
        self.assertEqual(self.db.combining('\uFFFE'), 0)
        self.assertEqual(self.db.combining('a'), 0)
        self.assertEqual(self.db.combining('\u20e1'), 230)
        self.assertEqual(self.db.combining('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.combining)
        self.assertRaises(TypeError, self.db.combining, 'xx')

    def test_normalize(self):
        """Check normalize() argument validation and the empty string."""
        self.assertRaises(TypeError, self.db.normalize)
        self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx')
        self.assertEqual(self.db.normalize('NFKC', ''), '')
        # The rest can be found in test_normalization.py
        # which requires an external file.

    def test_pr29(self):
        """Check that the listed strings are left unchanged by NFC."""
        # http://www.unicode.org/review/pr-29.html
        # See issues #1054943 and #10254.
        composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161",
                    'Li\u030dt-s\u1e73\u0301',
                    '\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
                    + '\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
                    '\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
                    + '\u0938\u094d\u0924\u093e\u0928')
        for text in composed:
            self.assertEqual(self.db.normalize('NFC', text), text)

    def test_issue10254(self):
        """Check NFC composition after a long run of combining sequences."""
        # Crash reported in #10254
        a = 'C\u0338' * 20 + 'C\u0327'
        b = 'C\u0338' * 20 + '\xC7'
        self.assertEqual(self.db.normalize('NFC', a), b)

    def test_east_asian_width(self):
        """Check east_asian_width() results and argument errors."""
        eaw = self.db.east_asian_width
        self.assertRaises(TypeError, eaw, b'a')
        self.assertRaises(TypeError, eaw, bytearray())
        self.assertRaises(TypeError, eaw, '')
        self.assertRaises(TypeError, eaw, 'ra')
        self.assertEqual(eaw('\x1e'), 'N')
        self.assertEqual(eaw('\x20'), 'Na')
        self.assertEqual(eaw('\uC894'), 'W')
        self.assertEqual(eaw('\uFF66'), 'H')
        self.assertEqual(eaw('\uFF1F'), 'F')
        self.assertEqual(eaw('\u2010'), 'A')
        self.assertEqual(eaw('\U00020000'), 'W')


class UnicodeMiscTest(UnicodeDatabaseTest):
    """Regression and consistency tests around the Unicode database."""

    def test_failed_import_during_compiling(self):
        """Check that a missing unicodedata module gives a SyntaxError."""
        # Issue 4367
        # Decoding \N escapes requires the unicodedata module. If it can't be
        # imported, we shouldn't segfault.

        # This program should raise a SyntaxError in the eval.
        code = "import sys;" \
            "sys.modules['unicodedata'] = None;" \
            """eval("'\\\\N{SOFT HYPHEN}'")"""
        args = [sys.executable, "-c", code]
        # We use a subprocess because the unicodedata module may already have
        # been loaded in this process.
        popen = subprocess.Popen(args, stderr=subprocess.PIPE)
        # communicate() drains and closes the pipe while waiting for the
        # child; a bare wait() with stderr=PIPE can block once the pipe
        # buffer fills up and would leave the pipe open.
        stderr_data = popen.communicate()[1]
        self.assertEqual(popen.returncode, 1)
        error = "SyntaxError: (unicode error) \\N escapes not supported " \
            "(can't load unicodedata module)"
        self.assertIn(error, stderr_data.decode("ascii"))

    def test_decimal_numeric_consistent(self):
        """Check that decimal() and numeric() agree on U+0000..U+FFFF."""
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        count = 0
        for i in range(0x10000):
            c = chr(i)
            dec = self.db.decimal(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_digit_numeric_consistent(self):
        """Check that digit() and numeric() agree on U+0000..U+FFFF."""
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        count = 0
        for i in range(0x10000):
            c = chr(i)
            digit = self.db.digit(c, -1)
            if digit != -1:
                self.assertEqual(digit, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_bug_1704793(self):
        """Check name lookup of a character outside the BMP."""
        self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')

    def test_ucd_510(self):
        """Check properties that differ between UCD 5.1.0 and UCD 3.2.0."""
        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
        self.assertTrue(self.db.mirrored("\u0f3a"))
        self.assertFalse(self.db.ucd_3_2_0.mirrored("\u0f3a"))
        # Also, we now have two ways of representing
        # the upper-case mapping: as delta, or as absolute value
        self.assertEqual("a".upper(), 'A')
        self.assertEqual("\u1d79".upper(), '\ua77d')
        self.assertEqual(".".upper(), '.')

    def test_bug_5828(self):
        """Check that only U+0000 case-maps to a string containing U+0000."""
        self.assertEqual("\u1d79".lower(), "\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
        self.assertEqual(
            [
                c for c in range(sys.maxunicode+1)
                if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title()
            ],
            [0]
        )

    def test_bug_4971(self):
        """Check title-casing of the three DZ WITH CARON forms."""
        # LETTER DZ WITH CARON: DZ, Dz, dz
        self.assertEqual("\u01c4".title(), "\u01c5")
        self.assertEqual("\u01c5".title(), "\u01c5")
        self.assertEqual("\u01c6".title(), "\u01c5")

def test_main():
    """Run the test case classes of this module via test.support."""
    test.support.run_unittest(
        UnicodeMiscTest,
        UnicodeMethodsTest,
        UnicodeFunctionsTest
    )

if __name__ == "__main__":
    test_main()