#
# test_multibytecodec.py
#   Unit test for multibytecodec itself
#

from test import support
from test.support import TESTFN
import unittest, io, codecs, sys, os
import _multibytecodec

# Codec names iterated by the per-encoding tests below.  Each group is
# labelled with the `_codecs_*` module name it belongs to.
ALL_CJKENCODINGS = [
    # _codecs_cn
    'gb2312',
    'gbk',
    'gb18030',
    'hz',
    # _codecs_hk
    'big5hkscs',
    # _codecs_jp
    'cp932',
    'shift_jis',
    'euc_jp',
    'euc_jisx0213',
    'shift_jisx0213',
    'euc_jis_2004',
    'shift_jis_2004',
    # _codecs_kr
    'cp949',
    'euc_kr',
    'johab',
    # _codecs_tw
    'big5',
    'cp950',
    # _codecs_iso2022
    'iso2022_jp',
    'iso2022_jp_1',
    'iso2022_jp_2',
    'iso2022_jp_2004',
    'iso2022_jp_3',
    'iso2022_jp_ext',
    'iso2022_kr',
]

class Test_MultibyteCodec(unittest.TestCase):
    """Checks run against the codecs listed in ALL_CJKENCODINGS."""

    def test_nullcoding(self):
        # Empty input must map to empty output in both directions.
        for enc in ALL_CJKENCODINGS:
            # subTest makes a failure report name the offending codec.
            with self.subTest(encoding=enc):
                self.assertEqual(b''.decode(enc), '')
                self.assertEqual(str(b'', enc), '')
                self.assertEqual(''.encode(enc), b'')

    def test_str_decode(self):
        # Despite the name, this checks that ASCII text *encodes* to the
        # same bytes under every codec.
        for enc in ALL_CJKENCODINGS:
            with self.subTest(encoding=enc):
                self.assertEqual('abcd'.encode(enc), b'abcd')

    def test_errorcallback_longindex(self):
        # An error handler that returns a resume position of
        # sys.maxsize + 1 must make the decoder raise IndexError.
        dec = codecs.getdecoder('euc-kr')

        def myreplace(exc):
            return ('', sys.maxsize + 1)

        codecs.register_error('test.cjktest', myreplace)
        self.assertRaises(IndexError, dec,
                          b'apple\x92ham\x93spam', 'test.cjktest')

    def test_errorcallback_custom_ignore(self):
        # Issue #23215: MemoryError with custom error handlers and multibyte codecs
        data = 100 * "\udc00"
        codecs.register_error("test.ignore", codecs.ignore_errors)
        for enc in ALL_CJKENCODINGS:
            with self.subTest(encoding=enc):
                self.assertEqual(data.encode(enc, "test.ignore"), b'')

    def test_codingspec(self):
        # Source text consisting only of a coding cookie must exec()
        # without error for every codec.
        try:
            for enc in ALL_CJKENCODINGS:
                with self.subTest(encoding=enc):
                    code = '# coding: {}\n'.format(enc)
                    exec(code)
        finally:
            # NOTE(review): nothing in this test creates TESTFN; the
            # unlink is kept from the original as cleanup.
            support.unlink(TESTFN)

    def test_init_segfault(self):
        # bug #3305: this used to segfault
        self.assertRaises(AttributeError,
                          _multibytecodec.MultibyteStreamReader, None)
        self.assertRaises(AttributeError,
                          _multibytecodec.MultibyteStreamWriter, None)

    def test_decode_unicode(self):
        # Trying to decode a unicode string should raise a TypeError
        for enc in ALL_CJKENCODINGS:
            with self.subTest(encoding=enc):
                self.assertRaises(TypeError, codecs.getdecoder(enc), "")
class Test_IncrementalEncoder(unittest.TestCase):
    """Incremental encoder behaviour for cp949, jisx0213 and shift-jis."""

    def test_stateless(self):
        # cp949 encoder isn't stateful at all.
        encoder = codecs.getincrementalencoder('cp949')()
        self.assertEqual(encoder.encode('\ud30c\uc774\uc36c \ub9c8\uc744'),
                         b'\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode('\u2606\u223c\u2606', True),
                         b'\xa1\xd9\xa1\xad\xa1\xd9')
        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode('', True), b'')
        self.assertEqual(encoder.encode('', False), b'')
        self.assertIsNone(encoder.reset())

    def test_stateful(self):
        # jisx0213 encoder is stateful for a few code points. eg)
        #   U+00E6 => A9DC
        #   U+00E6 U+0300 => ABC4
        #   U+0300 => ABDC

        encoder = codecs.getincrementalencoder('jisx0213')()
        self.assertEqual(encoder.encode('\u00e6\u0300'), b'\xab\xc4')
        # U+00E6 alone is held back until the next character is seen.
        self.assertEqual(encoder.encode('\u00e6'), b'')
        self.assertEqual(encoder.encode('\u0300'), b'\xab\xc4')
        self.assertEqual(encoder.encode('\u00e6', True), b'\xa9\xdc')

        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode('\u0300'), b'\xab\xdc')

        self.assertEqual(encoder.encode('\u00e6'), b'')
        # final=True flushes the pending U+00E6; a second flush is empty.
        self.assertEqual(encoder.encode('', True), b'\xa9\xdc')
        self.assertEqual(encoder.encode('', True), b'')

    def test_stateful_keep_buffer(self):
        # A failed encode() call must not discard the pending U+00E6.
        encoder = codecs.getincrementalencoder('jisx0213')()
        self.assertEqual(encoder.encode('\u00e6'), b'')
        self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123')
        self.assertEqual(encoder.encode('\u0300\u00e6'), b'\xab\xc4')
        self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123')
        self.assertIsNone(encoder.reset())
        self.assertEqual(encoder.encode('\u0300'), b'\xab\xdc')
        self.assertEqual(encoder.encode('\u00e6'), b'')
        self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123')
        self.assertEqual(encoder.encode('', True), b'\xa9\xdc')

    def test_issue5640(self):
        # With 'backslashreplace', an unencodable character is escaped and
        # the following encode() call still works.
        encoder = codecs.getincrementalencoder('shift-jis')('backslashreplace')
        self.assertEqual(encoder.encode('\xff'), b'\\xff')
        self.assertEqual(encoder.encode('\n'), b'\n')

class Test_IncrementalDecoder(unittest.TestCase):
    """Incremental decoder behaviour for cp949 and iso2022-jp."""

    def test_dbcs(self):
        # cp949 decoder is simple with only 1 or 2 bytes sequences.
        decoder = codecs.getincrementaldecoder('cp949')()
        # The first chunk ends in the middle of a two-byte sequence; the
        # dangling byte is completed by the start of the second chunk.
        self.assertEqual(decoder.decode(b'\xc6\xc4\xc0\xcc\xbd'),
                         '\ud30c\uc774')
        self.assertEqual(decoder.decode(b'\xe3 \xb8\xb6\xc0\xbb'),
                         '\uc36c \ub9c8\uc744')
        self.assertEqual(decoder.decode(b''), '')

    def test_dbcs_keep_buffer(self):
        # A failed final decode() must leave the buffered lead byte intact,
        # so the next call can still complete the character.
        decoder = codecs.getincrementaldecoder('cp949')()
        self.assertEqual(decoder.decode(b'\xc6\xc4\xc0'), '\ud30c')
        self.assertRaises(UnicodeDecodeError, decoder.decode, b'', True)
        self.assertEqual(decoder.decode(b'\xcc'), '\uc774')

        self.assertEqual(decoder.decode(b'\xc6\xc4\xc0'), '\ud30c')
        self.assertRaises(UnicodeDecodeError, decoder.decode,
                          b'\xcc\xbd', True)
        self.assertEqual(decoder.decode(b'\xcc'), '\uc774')

    def test_iso2022(self):
        # Escape sequences and two-byte characters may be split across
        # decode() calls.
        decoder = codecs.getincrementaldecoder('iso2022-jp')()
        ESC = b'\x1b'
        self.assertEqual(decoder.decode(ESC + b'('), '')
        self.assertEqual(decoder.decode(b'B', True), '')
        self.assertEqual(decoder.decode(ESC + b'$'), '')
        self.assertEqual(decoder.decode(b'B@$'), '\u4e16')
        self.assertEqual(decoder.decode(b'@$@'), '\u4e16')
        self.assertEqual(decoder.decode(b'$', True), '\u4e16')
        self.assertIsNone(decoder.reset())
        # After reset() the same bytes decode as plain ASCII again.
        self.assertEqual(decoder.decode(b'@$'), '@$')
        self.assertEqual(decoder.decode(ESC + b'$'), '')
        self.assertRaises(UnicodeDecodeError, decoder.decode, b'', True)
        self.assertEqual(decoder.decode(b'B@$'), '\u4e16')

    def test_decode_unicode(self):
        # Trying to decode a unicode string should raise a TypeError
        for enc in ALL_CJKENCODINGS:
            # subTest makes a failure report name the offending codec.
            with self.subTest(encoding=enc):
                decoder = codecs.getincrementaldecoder(enc)()
                self.assertRaises(TypeError, decoder.decode, "")

class Test_StreamReader(unittest.TestCase):
    """Stream reader regression test for cp949."""

    def test_bug1728403(self):
        # Reading a file that holds only the single byte 0xA1 through a
        # cp949 stream reader must raise UnicodeDecodeError.
        try:
            # Context managers close both handles even if an assertion
            # fails, replacing the manual try/finally close() pairs.
            with open(TESTFN, 'wb') as f:
                f.write(b'\xa1')
            with codecs.open(TESTFN, encoding='cp949') as f:
                self.assertRaises(UnicodeDecodeError, f.read, 2)
        finally:
            support.unlink(TESTFN)

class Test_StreamWriter(unittest.TestCase):
    """Stream writer output for gb18030 and utf-8 into a BytesIO."""

    def test_gb18030(self):
        # Each write() appends the encoded bytes to the underlying stream.
        s = io.BytesIO()
        c = codecs.getwriter('gb18030')(s)
        c.write('123')
        self.assertEqual(s.getvalue(), b'123')
        c.write('\U00012345')
        self.assertEqual(s.getvalue(), b'123\x907\x959')
        c.write('\uac00\u00ac')
        self.assertEqual(s.getvalue(),
                b'123\x907\x959\x827\xcf5\x810\x851')

    def test_utf_8(self):
        # Same write sequence as test_gb18030, with the utf-8 writer.
        s = io.BytesIO()
        c = codecs.getwriter('utf-8')(s)
        c.write('123')
        self.assertEqual(s.getvalue(), b'123')
        c.write('\U00012345')
        self.assertEqual(s.getvalue(), b'123\xf0\x92\x8d\x85')
        c.write('\uac00\u00ac')
        self.assertEqual(s.getvalue(),
            b'123\xf0\x92\x8d\x85'
            b'\xea\xb0\x80\xc2\xac')

    def test_streamwriter_strwrite(self):
        # Writing a str of ASCII text yields the same bytes.
        s = io.BytesIO()
        wr = codecs.getwriter('gb18030')(s)
        wr.write('abcd')
        self.assertEqual(s.getvalue(), b'abcd')
class Test_ISO2022(unittest.TestCase):
    """Checks specific to the ISO 2022 codecs."""

    def test_g2(self):
        # Decoding this iso2022-jp-2 byte string must yield U+00E9 in
        # 'unit\xe9'.
        iso2022jp2 = b'\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
        uni = ':hu4:unit\xe9 de famille'
        self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)

    def test_iso2022_jp_g0(self):
        # The encoded output must not contain byte 0x0E, and for the two
        # codecs below no byte may be greater than 0x80.
        self.assertNotIn(b'\x0e', '\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
        for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
            e = '\u3406'.encode(encoding)
            self.assertFalse(any(x > 0x80 for x in e))

    def test_bug1572832(self):
        for x in range(0x10000, 0x110000):
            # Any ISO 2022 codec will cause the segfault
            chr(x).encode('iso_2022_jp', 'ignore')
class TestStateful(unittest.TestCase):
    """Encoder tests for a codec whose output ends with a reset sequence.

    The class attributes describe one codec; a subclass can override them
    to run the same three tests against another codec.
    """

    text = '\u4E16\u4E16'
    encoding = 'iso-2022-jp'
    # Encoded form of `text` without the trailing reset sequence.
    expected = b'\x1b$B@$@$'
    # Bytes emitted when the encoder is finalised.
    reset = b'\x1b(B'
    expected_reset = expected + reset

    def test_encode(self):
        # One-shot encoding includes the reset sequence.
        self.assertEqual(self.text.encode(self.encoding), self.expected_reset)

    def test_incrementalencoder(self):
        # Character-by-character encoding emits no reset until final=True.
        encoder = codecs.getincrementalencoder(self.encoding)()
        output = b''.join(
            encoder.encode(char)
            for char in self.text)
        self.assertEqual(output, self.expected)
        self.assertEqual(encoder.encode('', final=True), self.reset)
        # A second final call has nothing left to emit.
        self.assertEqual(encoder.encode('', final=True), b'')

    def test_incrementalencoder_final(self):
        # Passing final=True with the last character emits the reset
        # sequence together with that character.
        encoder = codecs.getincrementalencoder(self.encoding)()
        last_index = len(self.text) - 1
        output = b''.join(
            encoder.encode(char, index == last_index)
            for index, char in enumerate(self.text))
        self.assertEqual(output, self.expected_reset)
        self.assertEqual(encoder.encode('', final=True), b'')

class TestHZStateful(TestStateful):
    """Runs the inherited TestStateful tests against the 'hz' codec."""

    text = '\u804a\u804a'
    encoding = 'hz'
    # Encoded form of `text` without the trailing reset sequence.
    expected = b'~{ADAD'
    # Bytes emitted when the encoder is finalised.
    reset = b'~}'
    expected_reset = expected + reset
def test_main():
    """Pass this module's name to support.run_unittest()."""
    support.run_unittest(__name__)

# Script entry point: call test_main() when this file is executed directly.
if __name__ == '__main__':
    test_main()