#
# test_multibytecodec.py
#   Unit test for multibytecodec itself
#

from test import support
from test.support import TESTFN
import unittest, io, codecs, sys, os
import _multibytecodec

# Codec names iterated by the per-codec test loops in this module, listed
# under the `_codecs_*` heading each group was originally filed under.
ALL_CJKENCODINGS = [
    # _codecs_cn
    'gb2312',
    'gbk',
    'gb18030',
    'hz',
    # _codecs_hk
    'big5hkscs',
    # _codecs_jp
    'cp932',
    'shift_jis',
    'euc_jp',
    'euc_jisx0213',
    'shift_jisx0213',
    'euc_jis_2004',
    'shift_jis_2004',
    # _codecs_kr
    'cp949',
    'euc_kr',
    'johab',
    # _codecs_tw
    'big5',
    'cp950',
    # _codecs_iso2022
    'iso2022_jp',
    'iso2022_jp_1',
    'iso2022_jp_2',
    'iso2022_jp_2004',
    'iso2022_jp_3',
    'iso2022_jp_ext',
    'iso2022_kr',
]

class Test_MultibyteCodec(unittest.TestCase):
    # Checks applied to every codec in ALL_CJKENCODINGS, plus regression
    # tests that poke at the _multibytecodec module directly.

    def test_nullcoding(self):
        # Empty input maps to empty output in both directions.
        for enc in ALL_CJKENCODINGS:
            self.assertEqual(b''.decode(enc), '')
            self.assertEqual(str(b'', enc), '')
            self.assertEqual(''.encode(enc), b'')

    def test_str_decode(self):
        # Plain ASCII text encodes to the identical ASCII bytes.
        for enc in ALL_CJKENCODINGS:
            self.assertEqual('abcd'.encode(enc), b'abcd')

    def test_errorcallback_longindex(self):
        # An error handler that returns a resume position of
        # sys.maxsize + 1 must make the decoder raise IndexError.
        dec = codecs.getdecoder('euc-kr')

        def myreplace(exc):
            return ('', sys.maxsize + 1)

        codecs.register_error('test.cjktest', myreplace)
        self.assertRaises(IndexError, dec,
                          b'apple\x92ham\x93spam', 'test.cjktest')

    def test_codingspec(self):
        # Executing source that consists only of a coding declaration
        # must not raise for any of the codecs.
        try:
            for enc in ALL_CJKENCODINGS:
                code = '# coding: {}\n'.format(enc)
                exec(code)
        finally:
            # Nothing in this test creates TESTFN; the unlink is kept as
            # a defensive cleanup.
            support.unlink(TESTFN)

    def test_init_segfault(self):
        # bug #3305: this used to segfault
        self.assertRaises(AttributeError,
                          _multibytecodec.MultibyteStreamReader, None)
        self.assertRaises(AttributeError,
                          _multibytecodec.MultibyteStreamWriter, None)

    def test_decode_unicode(self):
        # Trying to decode a unicode string should raise a TypeError
        for enc in ALL_CJKENCODINGS:
            self.assertRaises(TypeError, codecs.getdecoder(enc), "")
class Test_IncrementalEncoder(unittest.TestCase):
    # Exercises incremental encoders obtained through
    # codecs.getincrementalencoder(), covering a stateless codec (cp949)
    # and one that buffers input between calls (jisx0213).

    def test_stateless(self):
        # cp949 encoder isn't stateful at all.
        encoder = codecs.getincrementalencoder('cp949')()
        self.assertEqual(encoder.encode('\ud30c\uc774\uc36c \ub9c8\uc744'),
                         b'\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode('\u2606\u223c\u2606', True),
                         b'\xa1\xd9\xa1\xad\xa1\xd9')
        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode('', True), b'')
        self.assertEqual(encoder.encode('', False), b'')
        self.assertIsNone(encoder.reset())

    def test_stateful(self):
        # jisx0213 encoder is stateful for a few codepoints. eg)
        #   U+00E6 => A9DC
        #   U+00E6 U+0300 => ABC4
        #   U+0300 => ABDC

        encoder = codecs.getincrementalencoder('jisx0213')()
        self.assertEqual(encoder.encode('\u00e6\u0300'), b'\xab\xc4')
        # U+00E6 alone is held back until the next call decides whether
        # it combines with a following U+0300.
        self.assertEqual(encoder.encode('\u00e6'), b'')
        self.assertEqual(encoder.encode('\u0300'), b'\xab\xc4')
        self.assertEqual(encoder.encode('\u00e6', True), b'\xa9\xdc')

        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode('\u0300'), b'\xab\xdc')

        self.assertEqual(encoder.encode('\u00e6'), b'')
        # A final call flushes the pending character exactly once.
        self.assertEqual(encoder.encode('', True), b'\xa9\xdc')
        self.assertEqual(encoder.encode('', True), b'')

    def test_stateful_keep_buffer(self):
        # A failed encode() call must leave the pending U+00E6 in place
        # so that later calls still see it.
        encoder = codecs.getincrementalencoder('jisx0213')()
        self.assertEqual(encoder.encode('\u00e6'), b'')
        self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123')
        self.assertEqual(encoder.encode('\u0300\u00e6'), b'\xab\xc4')
        self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123')
        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode('\u0300'), b'\xab\xdc')
        self.assertEqual(encoder.encode('\u00e6'), b'')
        self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123')
        self.assertEqual(encoder.encode('', True), b'\xa9\xdc')

    def test_issue5640(self):
        # With 'backslashreplace', an unencodable character becomes an
        # escape sequence and the following call encodes normally.
        encoder = codecs.getincrementalencoder('shift-jis')('backslashreplace')
        self.assertEqual(encoder.encode('\xff'), b'\\xff')
        self.assertEqual(encoder.encode('\n'), b'\n')

class Test_IncrementalDecoder(unittest.TestCase):
    # Exercises incremental decoders obtained through
    # codecs.getincrementaldecoder(), feeding input that is split in the
    # middle of multi-byte sequences.

    def test_dbcs(self):
        # cp949 decoder is simple with only 1 or 2 bytes sequences.
        decoder = codecs.getincrementaldecoder('cp949')()
        # The trailing lead byte \xbd is completed by \xe3 in the next call.
        self.assertEqual(decoder.decode(b'\xc6\xc4\xc0\xcc\xbd'),
                         '\ud30c\uc774')
        self.assertEqual(decoder.decode(b'\xe3 \xb8\xb6\xc0\xbb'),
                         '\uc36c \ub9c8\uc744')
        self.assertEqual(decoder.decode(b''), '')

    def test_dbcs_keep_buffer(self):
        # A failed final decode() must leave the pending lead byte
        # buffered so that the next call can still complete it.
        decoder = codecs.getincrementaldecoder('cp949')()
        self.assertEqual(decoder.decode(b'\xc6\xc4\xc0'), '\ud30c')
        self.assertRaises(UnicodeDecodeError, decoder.decode, b'', True)
        self.assertEqual(decoder.decode(b'\xcc'), '\uc774')

        self.assertEqual(decoder.decode(b'\xc6\xc4\xc0'), '\ud30c')
        self.assertRaises(UnicodeDecodeError, decoder.decode,
                          b'\xcc\xbd', True)
        self.assertEqual(decoder.decode(b'\xcc'), '\uc774')

    def test_iso2022(self):
        # Escape sequences and two-byte characters are split across calls.
        decoder = codecs.getincrementaldecoder('iso2022-jp')()
        ESC = b'\x1b'
        self.assertEqual(decoder.decode(ESC + b'('), '')
        self.assertEqual(decoder.decode(b'B', True), '')
        self.assertEqual(decoder.decode(ESC + b'$'), '')
        self.assertEqual(decoder.decode(b'B@$'), '\u4e16')
        self.assertEqual(decoder.decode(b'@$@'), '\u4e16')
        self.assertEqual(decoder.decode(b'$', True), '\u4e16')
        self.assertIsNone(decoder.reset())
        # After reset() the same bytes decode as plain ASCII again.
        self.assertEqual(decoder.decode(b'@$'), '@$')
        self.assertEqual(decoder.decode(ESC + b'$'), '')
        self.assertRaises(UnicodeDecodeError, decoder.decode, b'', True)
        self.assertEqual(decoder.decode(b'B@$'), '\u4e16')

    def test_decode_unicode(self):
        # Trying to decode a unicode string should raise a TypeError
        for enc in ALL_CJKENCODINGS:
            decoder = codecs.getincrementaldecoder(enc)()
            self.assertRaises(TypeError, decoder.decode, "")

class Test_StreamReader(unittest.TestCase):
    def test_bug1728403(self):
        # Reading a file that holds only the single byte \xa1 through a
        # cp949 stream reader must raise UnicodeDecodeError.
        try:
            with open(TESTFN, 'wb') as f:
                f.write(b'\xa1')

            with codecs.open(TESTFN, encoding='cp949') as f:
                self.assertRaises(UnicodeDecodeError, f.read, 2)
        finally:
            # Remove the scratch file whether or not the assertion passed.
            support.unlink(TESTFN)

class Test_StreamWriter(unittest.TestCase):
    # Each test wraps an in-memory BytesIO in a codec stream writer and
    # checks the accumulated bytes after every write.

    def test_gb18030(self):
        s = io.BytesIO()
        c = codecs.getwriter('gb18030')(s)
        c.write('123')
        self.assertEqual(s.getvalue(), b'123')
        # A character outside the Basic Multilingual Plane.
        c.write('\U00012345')
        self.assertEqual(s.getvalue(), b'123\x907\x959')
        c.write('\uac00\u00ac')
        self.assertEqual(s.getvalue(),
                b'123\x907\x959\x827\xcf5\x810\x851')

    def test_utf_8(self):
        s = io.BytesIO()
        c = codecs.getwriter('utf-8')(s)
        c.write('123')
        self.assertEqual(s.getvalue(), b'123')
        # A character outside the Basic Multilingual Plane.
        c.write('\U00012345')
        self.assertEqual(s.getvalue(), b'123\xf0\x92\x8d\x85')
        c.write('\uac00\u00ac')
        self.assertEqual(s.getvalue(),
            b'123\xf0\x92\x8d\x85'
            b'\xea\xb0\x80\xc2\xac')

    def test_streamwriter_strwrite(self):
        # Writing a plain ASCII str goes through unchanged.
        s = io.BytesIO()
        wr = codecs.getwriter('gb18030')(s)
        wr.write('abcd')
        self.assertEqual(s.getvalue(), b'abcd')
class Test_ISO2022(unittest.TestCase):
    def test_g2(self):
        # The input designates a set with ESC . A and then uses ESC N
        # before 'i'; the expected text has U+00E9 at that position.
        iso2022jp2 = b'\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
        uni = ':hu4:unit\xe9 de famille'
        self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)

    def test_iso2022_jp_g0(self):
        # The encoded output must not contain a Shift Out byte (0x0e).
        self.assertNotIn(b'\x0e', '\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
        for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
            e = '\u3406'.encode(encoding)
            # Every output byte must fit in 7 bits; 0x80 itself already
            # has the high bit set, so the bound is inclusive.
            self.assertFalse(any(x >= 0x80 for x in e))

    def test_bug1572832(self):
        # Passes as long as encoding every non-BMP code point completes
        # without crashing; the results themselves are discarded.
        for x in range(0x10000, 0x110000):
            # Any ISO 2022 codec will cause the segfault
            chr(x).encode('iso_2022_jp', 'ignore')
class TestStateful(unittest.TestCase):
    # Subclasses override the class attributes below to run the same
    # checks against another stateful encoding.
    text = '\u4E16\u4E16'
    encoding = 'iso-2022-jp'
    # Bytes produced for `text` before the encoder is finalised.
    expected = b'\x1b$B@$@$'
    # Bytes emitted when the encoder is finalised after `text`.
    reset = b'\x1b(B'
    expected_reset = expected + reset

    def test_encode(self):
        # One-shot encoding includes the trailing reset sequence.
        self.assertEqual(self.text.encode(self.encoding), self.expected_reset)

    def test_incrementalencoder(self):
        # Feed one character at a time without ever passing final=True.
        encoder = codecs.getincrementalencoder(self.encoding)()
        output = b''.join(
            encoder.encode(char)
            for char in self.text)
        self.assertEqual(output, self.expected)
        # The reset sequence is emitted by the first final call only.
        self.assertEqual(encoder.encode('', final=True), self.reset)
        self.assertEqual(encoder.encode('', final=True), b'')

    def test_incrementalencoder_final(self):
        # Feed one character at a time, marking only the last as final.
        encoder = codecs.getincrementalencoder(self.encoding)()
        last_index = len(self.text) - 1
        output = b''.join(
            encoder.encode(char, index == last_index)
            for index, char in enumerate(self.text))
        self.assertEqual(output, self.expected_reset)
        # Nothing is left to flush afterwards.
        self.assertEqual(encoder.encode('', final=True), b'')

class TestHZStateful(TestStateful):
    # Runs the inherited TestStateful checks against the 'hz' codec.
    text = '\u804a\u804a'
    encoding = 'hz'
    expected = b'~{ADAD'
    reset = b'~}'
    # Recomputed here because the parent's value was built from the
    # parent's own `expected` and `reset`.
    expected_reset = expected + reset
def test_main():
    # Entry point: passes this module's name to support.run_unittest().
    support.run_unittest(__name__)

# Call test_main() when this file is executed as a script.
if __name__ == "__main__":
    test_main()