#
# test_codecencodings_cn.py
#   Codec encoding tests for PRC encodings.
#

from test import multibytecodec_support
import unittest

class Test_GB2312(multibytecodec_support.TestBase, unittest.TestCase):
    """Test data for the ``gb2312`` codec.

    The class defines no test methods of its own; it only supplies the
    codec name, the test string returned by ``load_teststring`` and a table
    of error-handling cases to the inherited
    ``multibytecodec_support.TestBase``.
    """

    encoding = 'gb2312'
    tstring = multibytecodec_support.load_teststring('gb2312')
    # Each row is (input, error handler name, expected result).
    # NOTE(review): a None expectation presumably means the operation must
    # fail -- confirm against TestBase.
    codectests = (
        # invalid bytes
        (b"abc\x81\x81\xc1\xc4", "strict",  None),
        (b"abc\xc8", "strict",  None),
        (b"abc\x81\x81\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
        (b"abc\x81\x81\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
        (b"abc\x81\x81\xc1\xc4", "ignore",  "abc\u804a"),
        (b"\xc1\x64", "strict", None),
    )

class Test_GBK(multibytecodec_support.TestBase, unittest.TestCase):
    """Test data for the ``gbk`` codec.

    The class defines no test methods of its own; it only supplies the
    codec name, the test string returned by ``load_teststring`` and a table
    of error-handling cases to the inherited
    ``multibytecodec_support.TestBase``.
    """

    encoding = 'gbk'
    tstring = multibytecodec_support.load_teststring('gbk')
    # Each row is (input, error handler name, expected result).  Inputs are
    # bytes except for the last row, which is a str.
    # NOTE(review): a None expectation presumably means the operation must
    # fail, and a str input presumably exercises encoding rather than
    # decoding -- confirm against TestBase.
    codectests = (
        # invalid bytes
        (b"abc\x80\x80\xc1\xc4", "strict",  None),
        (b"abc\xc8", "strict",  None),
        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
        (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u804a"),
        (b"\x83\x34\x83\x31", "strict", None),
        ("\u30fb", "strict", None),
    )

class Test_GB18030(multibytecodec_support.TestBase, unittest.TestCase):
    """Test data for the ``gb18030`` codec.

    The class defines no test methods of its own; it only supplies the
    codec name, the test string returned by ``load_teststring``, a table of
    error-handling cases and the ``has_iso10646`` flag to the inherited
    ``multibytecodec_support.TestBase``.
    """

    encoding = 'gb18030'
    tstring = multibytecodec_support.load_teststring('gb18030')
    # Each row is (input, error handler name, expected result).  Inputs are
    # bytes except for the "\u30fb" row, which is a str with a bytes
    # expectation.
    # NOTE(review): a None expectation presumably means the operation must
    # fail -- confirm against TestBase.
    codectests = (
        # invalid bytes
        (b"abc\x80\x80\xc1\xc4", "strict",  None),
        (b"abc\xc8", "strict",  None),
        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
        (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u804a"),
        (b"abc\x84\x39\x84\x39\xc1\xc4", "replace", "abc\ufffd9\ufffd9\u804a"),
        ("\u30fb", "strict", b"\x819\xa79"),
        (b"abc\x84\x32\x80\x80def", "replace", 'abc\ufffd2\ufffd\ufffddef'),
        (b"abc\x81\x30\x81\x30def", "strict", 'abc\x80def'),
        (b"abc\x86\x30\x81\x30def", "replace", 'abc\ufffd0\ufffd0def'),
        # issue29990
        (b"\xff\x30\x81\x30", "strict", None),
        (b"\x81\x30\xff\x30", "strict", None),
        (b"abc\x81\x39\xff\x39\xc1\xc4", "replace", "abc\ufffd\x39\ufffd\x39\u804a"),
        (b"abc\xab\x36\xff\x30def", "replace", 'abc\ufffd\x36\ufffd\x30def'),
        (b"abc\xbf\x38\xff\x32\xc1\xc4", "ignore",  "abc\x38\x32\u804a"),
    )
    # NOTE(review): flag read by the inherited test machinery; presumably
    # signals that this codec covers the ISO 10646 repertoire -- confirm
    # against TestBase.
    has_iso10646 = True

class Test_HZ(multibytecodec_support.TestBase, unittest.TestCase):
    """Test data for the ``hz`` codec.

    The class defines no test methods of its own; it only supplies the
    codec name, the test string returned by ``load_teststring`` and a table
    of decoding/encoding cases to the inherited
    ``multibytecodec_support.TestBase``.
    """

    encoding = 'hz'
    tstring = multibytecodec_support.load_teststring('hz')
    # Each row is (input, error handler name, expected result).  Inputs are
    # bytes except for the 'ab~cd' row, which is a str with a bytes
    # expectation.
    # NOTE(review): a None expectation presumably means the operation must
    # fail -- confirm against TestBase.
    codectests = (
        # test '~\n' (3 lines)
        (b'This sentence is in ASCII.\n'
         b'The next sentence is in GB.~{<:Ky2;S{#,~}~\n'
         b'~{NpJ)l6HK!#~}Bye.\n',
         'strict',
         'This sentence is in ASCII.\n'
         'The next sentence is in GB.'
         '\u5df1\u6240\u4e0d\u6b32\uff0c\u52ff\u65bd\u65bc\u4eba\u3002'
         'Bye.\n'),
        # test '~\n' (4 lines)
        (b'This sentence is in ASCII.\n'
         b'The next sentence is in GB.~\n'
         b'~{<:Ky2;S{#,NpJ)l6HK!#~}~\n'
         b'Bye.\n',
         'strict',
         'This sentence is in ASCII.\n'
         'The next sentence is in GB.'
         '\u5df1\u6240\u4e0d\u6b32\uff0c\u52ff\u65bd\u65bc\u4eba\u3002'
         'Bye.\n'),
        # invalid bytes
        (b'ab~cd', 'replace', 'ab\uFFFDcd'),
        (b'ab\xffcd', 'replace', 'ab\uFFFDcd'),
        (b'ab~{\x81\x81\x41\x44~}cd', 'replace', 'ab\uFFFD\uFFFD\u804Acd'),
        (b'ab~{\x41\x44~}cd', 'replace', 'ab\u804Acd'),
        (b"ab~{\x79\x79\x41\x44~}cd", "replace", "ab\ufffd\ufffd\u804acd"),
        # issue 30003
        ('ab~cd', 'strict',  b'ab~~cd'),  # escape ~
        (b'~{Dc~~:C~}', 'strict', None),  # ~~ only in ASCII mode
        (b'~{Dc~\n:C~}', 'strict', None),  # ~\n only in ASCII mode
    )

# Run every test case defined in this module when executed as a script.
if __name__ == "__main__":
    unittest.main()