#!/usr/bin/env python3
#
# test_multibytecodec.py
#   Unit test for multibytecodec itself
#

from test import support
from test.support import TESTFN
import unittest, io, codecs, sys, os
import _multibytecodec

# Names of every CJK codec exercised by the tests in this file.  Each group
# is labelled with the module name given in the accompanying comment.
ALL_CJKENCODINGS = [
    # _codecs_cn
    'gb2312',
    'gbk',
    'gb18030',
    'hz',
    # _codecs_hk
    'big5hkscs',
    # _codecs_jp
    'cp932',
    'shift_jis',
    'euc_jp',
    'euc_jisx0213',
    'shift_jisx0213',
    'euc_jis_2004',
    'shift_jis_2004',
    # _codecs_kr
    'cp949',
    'euc_kr',
    'johab',
    # _codecs_tw
    'big5',
    'cp950',
    # _codecs_iso2022
    'iso2022_jp',
    'iso2022_jp_1',
    'iso2022_jp_2',
    'iso2022_jp_2004',
    'iso2022_jp_3',
    'iso2022_jp_ext',
    'iso2022_kr',
]

class Test_MultibyteCodec(unittest.TestCase):
    """Basic checks run against the codecs listed in ALL_CJKENCODINGS."""

    def test_nullcoding(self):
        # Empty input must map to empty output in both directions.
        for enc in ALL_CJKENCODINGS:
            self.assertEqual(b''.decode(enc), '')
            self.assertEqual(str(b'', enc), '')
            self.assertEqual(''.encode(enc), b'')

    def test_str_decode(self):
        # Despite the method name, this checks the *encode* direction:
        # plain ASCII text must come out as the same ASCII bytes.
        for enc in ALL_CJKENCODINGS:
            self.assertEqual('abcd'.encode(enc), b'abcd')

    def test_errorcallback_longindex(self):
        # The error handler asks the decoder to resume at sys.maxsize+1;
        # the decoder is expected to reject that position with IndexError.
        dec = codecs.getdecoder('euc-kr')

        def myreplace(exc):
            return ('', sys.maxsize+1)

        codecs.register_error('test.cjktest', myreplace)
        self.assertRaises(IndexError, dec,
                          b'apple\x92ham\x93spam', 'test.cjktest')

    def test_codingspec(self):
        # Executing a source string that carries a coding declaration for
        # each codec must not raise.
        try:
            for enc in ALL_CJKENCODINGS:
                code = '# coding: {}\n'.format(enc)
                exec(code)
        finally:
            support.unlink(TESTFN)

    def test_init_segfault(self):
        # bug #3305: this used to segfault
        self.assertRaises(AttributeError,
                          _multibytecodec.MultibyteStreamReader, None)
        self.assertRaises(AttributeError,
                          _multibytecodec.MultibyteStreamWriter, None)

    def test_decode_unicode(self):
        # Trying to decode a unicode string should raise a TypeError
        for enc in ALL_CJKENCODINGS:
            self.assertRaises(TypeError, codecs.getdecoder(enc), "")
class Test_IncrementalEncoder(unittest.TestCase):
    """Checks of the incremental encoders returned by the codecs module."""

    def test_stateless(self):
        # cp949 encoder isn't stateful at all.
        encoder = codecs.getincrementalencoder('cp949')()
        self.assertEqual(encoder.encode('\ud30c\uc774\uc36c \ub9c8\uc744'),
                         b'\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode('\u2606\u223c\u2606', True),
                         b'\xa1\xd9\xa1\xad\xa1\xd9')
        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode('', True), b'')
        self.assertEqual(encoder.encode('', False), b'')
        self.assertIsNone(encoder.reset())

    def test_stateful(self):
        # jisx0213 encoder is stateful for a few codepoints. eg)
        #   U+00E6 => A9DC
        #   U+00E6 U+0300 => ABC4
        #   U+0300 => ABDC

        encoder = codecs.getincrementalencoder('jisx0213')()
        self.assertEqual(encoder.encode('\u00e6\u0300'), b'\xab\xc4')
        # A lone U+00E6 is held back until the next character is seen.
        self.assertEqual(encoder.encode('\u00e6'), b'')
        self.assertEqual(encoder.encode('\u0300'), b'\xab\xc4')
        # With final=True the pending U+00E6 is emitted on its own.
        self.assertEqual(encoder.encode('\u00e6', True), b'\xa9\xdc')

        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode('\u0300'), b'\xab\xdc')

        self.assertEqual(encoder.encode('\u00e6'), b'')
        self.assertEqual(encoder.encode('', True), b'\xa9\xdc')
        self.assertEqual(encoder.encode('', True), b'')

    def test_stateful_keep_buffer(self):
        # A failed encode() call must leave the pending U+00E6 in place so
        # that later calls can still combine or flush it.
        encoder = codecs.getincrementalencoder('jisx0213')()
        self.assertEqual(encoder.encode('\u00e6'), b'')
        self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123')
        self.assertEqual(encoder.encode('\u0300\u00e6'), b'\xab\xc4')
        self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123')
        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode('\u0300'), b'\xab\xdc')
        self.assertEqual(encoder.encode('\u00e6'), b'')
        self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123')
        self.assertEqual(encoder.encode('', True), b'\xa9\xdc')

    def test_issue5640(self):
        # After 'backslashreplace' handles an unencodable character, the
        # next call must encode normally.
        encoder = codecs.getincrementalencoder('shift-jis')('backslashreplace')
        self.assertEqual(encoder.encode('\xff'), b'\\xff')
        self.assertEqual(encoder.encode('\n'), b'\n')

class Test_IncrementalDecoder(unittest.TestCase):
    """Checks of the incremental decoders returned by the codecs module."""

    def test_dbcs(self):
        # cp949 decoder is simple with only 1 or 2 bytes sequences.
        decoder = codecs.getincrementaldecoder('cp949')()
        # The trailing b'\xbd' is incomplete and is carried over to the
        # next call, where b'\xe3' completes it.
        self.assertEqual(decoder.decode(b'\xc6\xc4\xc0\xcc\xbd'),
                         '\ud30c\uc774')
        self.assertEqual(decoder.decode(b'\xe3 \xb8\xb6\xc0\xbb'),
                         '\uc36c \ub9c8\uc744')
        self.assertEqual(decoder.decode(b''), '')

    def test_dbcs_keep_buffer(self):
        # A failed final decode() must not drop the pending lead byte:
        # supplying b'\xcc' afterwards still completes the character.
        decoder = codecs.getincrementaldecoder('cp949')()
        self.assertEqual(decoder.decode(b'\xc6\xc4\xc0'), '\ud30c')
        self.assertRaises(UnicodeDecodeError, decoder.decode, b'', True)
        self.assertEqual(decoder.decode(b'\xcc'), '\uc774')

        self.assertEqual(decoder.decode(b'\xc6\xc4\xc0'), '\ud30c')
        self.assertRaises(UnicodeDecodeError, decoder.decode,
                          b'\xcc\xbd', True)
        self.assertEqual(decoder.decode(b'\xcc'), '\uc774')

    def test_iso2022(self):
        # Escape sequences may be split across decode() calls.
        decoder = codecs.getincrementaldecoder('iso2022-jp')()
        ESC = b'\x1b'
        self.assertEqual(decoder.decode(ESC + b'('), '')
        self.assertEqual(decoder.decode(b'B', True), '')
        self.assertEqual(decoder.decode(ESC + b'$'), '')
        self.assertEqual(decoder.decode(b'B@$'), '\u4e16')
        self.assertEqual(decoder.decode(b'@$@'), '\u4e16')
        self.assertEqual(decoder.decode(b'$', True), '\u4e16')
        self.assertIsNone(decoder.reset())
        # After reset() the same bytes decode as plain ASCII again.
        self.assertEqual(decoder.decode(b'@$'), '@$')
        self.assertEqual(decoder.decode(ESC + b'$'), '')
        self.assertRaises(UnicodeDecodeError, decoder.decode, b'', True)
        self.assertEqual(decoder.decode(b'B@$'), '\u4e16')

    def test_decode_unicode(self):
        # Trying to decode a unicode string should raise a TypeError
        for enc in ALL_CJKENCODINGS:
            decoder = codecs.getincrementaldecoder(enc)()
            self.assertRaises(TypeError, decoder.decode, "")

class Test_StreamReader(unittest.TestCase):
    """Checks of codec stream readers opened through codecs.open()."""

    def test_bug1728403(self):
        # A file holding only the single byte b'\xa1' is opened as cp949;
        # read(2) is expected to raise UnicodeDecodeError.
        try:
            with open(TESTFN, 'wb') as f:
                f.write(b'\xa1')

            with codecs.open(TESTFN, encoding='cp949') as f:
                self.assertRaises(UnicodeDecodeError, f.read, 2)
        finally:
            # Remove the scratch file whether or not the assertions passed.
            support.unlink(TESTFN)

class Test_StreamWriter(unittest.TestCase):
    """Checks of codec stream writers wrapping an io.BytesIO buffer."""

    # The two tests below index into '\U00012345' and are only defined when
    # that literal has length 2 (UCS2); otherwise (UCS4) nothing is added.
    if len('\U00012345') == 2: # UCS2
        def test_gb18030(self):
            s = io.BytesIO()
            c = codecs.getwriter('gb18030')(s)
            c.write('123')
            self.assertEqual(s.getvalue(), b'123')
            c.write('\U00012345')
            self.assertEqual(s.getvalue(), b'123\x907\x959')
            # Writing only the first half produces no output yet.
            c.write('\U00012345'[0])
            self.assertEqual(s.getvalue(), b'123\x907\x959')
            c.write('\U00012345'[1] + '\U00012345' + '\uac00\u00ac')
            self.assertEqual(s.getvalue(),
                    b'123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
            c.write('\U00012345'[0])
            self.assertEqual(s.getvalue(),
                    b'123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
            # reset() with a dangling first half pending must fail and
            # must not write anything.
            self.assertRaises(UnicodeError, c.reset)
            self.assertEqual(s.getvalue(),
                    b'123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')

        def test_utf_8(self):
            s = io.BytesIO()
            c = codecs.getwriter('utf-8')(s)
            c.write('123')
            self.assertEqual(s.getvalue(), b'123')
            c.write('\U00012345')
            self.assertEqual(s.getvalue(), b'123\xf0\x92\x8d\x85')

            # Python utf-8 codec can't buffer surrogate pairs yet.
            if 0:
                c.write('\U00012345'[0])
                self.assertEqual(s.getvalue(), b'123\xf0\x92\x8d\x85')
                c.write('\U00012345'[1] + '\U00012345' + '\uac00\u00ac')
                self.assertEqual(s.getvalue(),
                    b'123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
                    b'\xea\xb0\x80\xc2\xac')
                c.write('\U00012345'[0])
                self.assertEqual(s.getvalue(),
                    b'123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
                    b'\xea\xb0\x80\xc2\xac')
                c.reset()
                self.assertEqual(s.getvalue(),
                    b'123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
                    b'\xea\xb0\x80\xc2\xac\xed\xa0\x88')
                c.write('\U00012345'[1])
                self.assertEqual(s.getvalue(),
                    b'123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
                    b'\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')

    def test_streamwriter_strwrite(self):
        # Writing a str through the gb18030 writer must reach the
        # underlying buffer as encoded bytes.
        s = io.BytesIO()
        wr = codecs.getwriter('gb18030')(s)
        wr.write('abcd')
        self.assertEqual(s.getvalue(), b'abcd')
236

class Test_ISO2022(unittest.TestCase):
    """Checks specific to the ISO 2022 family of codecs."""

    def test_g2(self):
        # The input uses the escape sequences ESC . A and ESC N before 'i';
        # the expected text has U+00E9 at that position.
        iso2022jp2 = b'\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
        uni = ':hu4:unit\xe9 de famille'
        self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)

    def test_iso2022_jp_g0(self):
        # The encoded output must contain no b'\x0e' byte, and for the
        # two codecs below no byte above 0x80.
        self.assertNotIn(b'\x0e', '\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
        for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
            e = '\u3406'.encode(encoding)
            self.assertFalse(any(x > 0x80 for x in e))

    def test_bug1572832(self):
        # Encode every code point from U+10000 to U+10FFFF; the test
        # passes as long as none of the calls crashes.
        for x in range(0x10000, 0x110000):
            # Any ISO 2022 codec will cause the segfault
            chr(x).encode('iso_2022_jp', 'ignore')
253

class TestStateful(unittest.TestCase):
    """Checks that a stateful encoder emits its reset sequence exactly once.

    Subclasses override the class attributes to run the same tests against
    another encoding.
    """

    text = '\u4E16\u4E16'           # input text
    encoding = 'iso-2022-jp'        # codec under test
    expected = b'\x1b$B@$@$'        # encoded text without the final reset
    reset = b'\x1b(B'               # sequence emitted when encoding ends
    expected_reset = expected + reset

    def test_encode(self):
        # One-shot encoding includes the trailing reset sequence.
        self.assertEqual(self.text.encode(self.encoding), self.expected_reset)

    def test_incrementalencoder(self):
        # Without final=True no reset is written; it appears on the first
        # final call and not again on the second.
        encoder = codecs.getincrementalencoder(self.encoding)()
        output = b''.join(
            encoder.encode(char)
            for char in self.text)
        self.assertEqual(output, self.expected)
        self.assertEqual(encoder.encode('', final=True), self.reset)
        self.assertEqual(encoder.encode('', final=True), b'')

    def test_incrementalencoder_final(self):
        # Passing final=True with the last character writes the reset
        # immediately, so a later final call yields nothing.
        encoder = codecs.getincrementalencoder(self.encoding)()
        last_index = len(self.text) - 1
        output = b''.join(
            encoder.encode(char, index == last_index)
            for index, char in enumerate(self.text))
        self.assertEqual(output, self.expected_reset)
        self.assertEqual(encoder.encode('', final=True), b'')

class TestHZStateful(TestStateful):
    """Runs the inherited TestStateful tests against the 'hz' codec."""

    text = '\u804a\u804a'
    encoding = 'hz'
    expected = b'~{ADAD'
    reset = b'~}'
    # Recomputed here because the inherited value was built from the base
    # class's own `expected` and `reset`.
    expected_reset = expected + reset
def test_main():
    """Run every test case defined in this module via test.support."""
    support.run_unittest(__name__)


# Allow the file to be executed directly as a script.
if __name__ == "__main__":
    test_main()