# -*- coding: koi8-r -*-

import importlib
import os
import subprocess
import sys
import tempfile
import unittest

from test.support import TESTFN, unlink, unload, rmtree, script_helper, captured_stdout


class MiscSourceEncodingTest(unittest.TestCase):
    """Source-encoding checks for compile(), exec(), eval() and import."""

    def test_pep263(self):
        # NOTE(review): in this copy the non-ASCII literals were missing (the
        # first was an empty string, the second an unterminated one).  They
        # are spelled with \u escapes that match the expected UTF-8 bytes;
        # presumably upstream stores them as raw KOI8-R bytes so that this
        # file's coding declaration is exercised -- confirm and restore if so.
        self.assertEqual(
            "\u041f\u0438\u0442\u043e\u043d".encode("utf-8"),
            b'\xd0\x9f\xd0\xb8\xd1\x82\xd0\xbe\xd0\xbd'
        )
        # A backslash followed by a non-ASCII character is not an escape
        # sequence, so the backslash must survive.
        self.assertEqual(
            "\\\u041f".encode("utf-8"),
            b'\\\xd0\x9f'
        )

    def test_compilestring(self):
        # see #1882
        c = compile(b"\n# coding: utf-8\nu = '\xc3\xb3'\n", "dummy", "exec")
        d = {}
        exec(c, d)
        self.assertEqual(d['u'], '\xf3')

    def test_issue2301(self):
        # The offending line reported by the SyntaxError must be decoded with
        # the declared cp932 encoding.
        with self.assertRaises(SyntaxError) as cm:
            compile(b"# coding: cp932\nprint '\x94\x4e'", "dummy", "exec")
        # Whether the reported text keeps its trailing newline depends on the
        # parser, so it is stripped before comparing.
        self.assertEqual(cm.exception.text.rstrip('\n'), "print '\u5e74'")

    def test_issue4626(self):
        c = compile("# coding=latin-1\n\u00c6 = '\u00c6'", "dummy", "exec")
        d = {}
        exec(c, d)
        self.assertEqual(d['\xc6'], '\xc6')

    def test_issue3297(self):
        # A literal non-BMP character and its \U escape must compile to the
        # same string.
        c = compile("a, b = '\U0001010F', '\\U0001010F'", "dummy", "exec")
        d = {}
        exec(c, d)
        self.assertEqual(d['a'], d['b'])
        self.assertEqual(len(d['a']), len(d['b']))
        self.assertEqual(ascii(d['a']), ascii(d['b']))

    def test_issue7820(self):
        # Ensure that check_bom() restores all bytes in the right order if
        # check_bom() fails in pydebug mode: a buffer starts with the first
        # byte of a valid BOM, but next bytes are different

        # one byte in common with the UTF-16-LE BOM
        self.assertRaises(SyntaxError, eval, b'\xff\x20')

        # two bytes in common with the UTF-8 BOM
        self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')

    def test_20731(self):
        # Run the helper script that lives next to this file in a child
        # interpreter; it must exit cleanly without reporting a SyntaxError.
        script = os.path.join(os.path.dirname(__file__), 'coding20731.py')
        with subprocess.Popen([sys.executable, script],
                              stderr=subprocess.PIPE) as sub:
            err = sub.communicate()[1]
        self.assertEqual(sub.returncode, 0)
        self.assertNotIn(b'SyntaxError', err)

    def test_error_message(self):
        # These three sources are valid and must compile without error.
        compile(b'# -*- coding: iso-8859-15 -*-\n', 'dummy', 'exec')
        compile(b'\xef\xbb\xbf\n', 'dummy', 'exec')
        compile(b'\xef\xbb\xbf# -*- coding: utf-8 -*-\n', 'dummy', 'exec')
        # An unknown encoding must be named in the error message.
        with self.assertRaisesRegex(SyntaxError, 'fake'):
            compile(b'# -*- coding: fake -*-\n', 'dummy', 'exec')
        # A UTF-8 BOM combined with a different declared encoding must fail,
        # and the message must mention both the encoding and the BOM.
        with self.assertRaisesRegex(SyntaxError, 'iso-8859-15'):
            compile(b'\xef\xbb\xbf# -*- coding: iso-8859-15 -*-\n',
                    'dummy', 'exec')
        with self.assertRaisesRegex(SyntaxError, 'BOM'):
            compile(b'\xef\xbb\xbf# -*- coding: iso-8859-15 -*-\n',
                    'dummy', 'exec')
        with self.assertRaisesRegex(SyntaxError, 'fake'):
            compile(b'\xef\xbb\xbf# -*- coding: fake -*-\n', 'dummy', 'exec')
        with self.assertRaisesRegex(SyntaxError, 'BOM'):
            compile(b'\xef\xbb\xbf# -*- coding: fake -*-\n', 'dummy', 'exec')

    def test_bad_coding(self):
        self.verify_bad_module('bad_coding')

    def test_bad_coding2(self):
        self.verify_bad_module('bad_coding2')

    def verify_bad_module(self, module_name):
        # Helper: the module test.<module_name> must raise SyntaxError both
        # when imported and when its raw bytes are compiled directly.
        self.assertRaises(SyntaxError, __import__, 'test.' + module_name)

        path = os.path.dirname(__file__)
        filename = os.path.join(path, module_name + '.py')
        with open(filename, "rb") as fp:
            source = fp.read()
        self.assertRaises(SyntaxError, compile, source, filename, 'exec')

    def test_exec_valid_coding(self):
        d = {}
        exec(b'# coding: cp949\na = "\xaa\xa7"\n', d)
        self.assertEqual(d['a'], '\u3047')

    def test_file_parse(self):
        # issue1134: all encodings outside latin-1 and utf-8 fail on
        # multiline strings and long lines (>512 columns)
        unload(TESTFN)
        filename = TESTFN + ".py"
        sys.path.insert(0, os.curdir)
        try:
            with open(filename, "w", encoding="cp1252") as f:
                f.write("# -*- coding: cp1252 -*-\n")
                f.write("'''A short string\n")
                f.write("'''\n")
                f.write("'A very long string %s'\n" % ("X" * 1000))

            importlib.invalidate_caches()
            __import__(TESTFN)
        finally:
            # Undo every side effect, whether or not the import succeeded.
            del sys.path[0]
            unlink(filename)
            unlink(filename + "c")
            unlink(filename + "o")
            unload(TESTFN)
            rmtree('__pycache__')

    def test_error_from_string(self):
        # See http://bugs.python.org/issue6289
        source = "# coding: ascii\n\N{SNOWMAN}".encode('utf-8')
        with self.assertRaises(SyntaxError) as c:
            compile(source, "<string>", "exec")
        expected = "'ascii' codec can't decode byte 0xe2 in position 16: " \
                   "ordinal not in range(128)"
        self.assertTrue(c.exception.args[0].startswith(expected),
                        msg=c.exception.args[0])


class AbstractSourceEncodingTest:
    """Shared source-encoding cases for different ways of running a script.

    Every case builds a small script as bytes and hands it, together with
    the bytes it is expected to print, to ``self.check_script_output(src,
    expected)``, which this class does not define and must be supplied by
    the class it is mixed into.

    In all scripts the string literal holds the two bytes C3 A4: that is
    U+00E4 when decoded as UTF-8, and U+00C3 U+20AC when decoded as
    ISO-8859-15.
    """

    def test_default_coding(self):
        # No declaration: the source is decoded as UTF-8.
        src = (b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xe4'")

    def test_first_coding_line(self):
        src = (b'#coding:iso8859-15\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_second_coding_line(self):
        src = (b'#\n'
               b'#coding:iso8859-15\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_third_coding_line(self):
        # Only first two lines are tested for a magic comment.
        src = (b'#\n'
               b'#\n'
               b'#coding:iso8859-15\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xe4'")

    def test_double_coding_line(self):
        # If the first line matches the second line is ignored.
        src = (b'#coding:iso8859-15\n'
               b'#coding:latin1\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_double_coding_same_line(self):
        # Two declarations on one line: the first one is used.
        src = (b'#coding:iso8859-15 coding:latin1\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_first_non_utf8_coding_line(self):
        # The declaration line itself contains a byte (A4) that is not
        # valid UTF-8; the declaration must still be honoured.
        src = (b'#coding:iso-8859-15 \xa4\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_second_non_utf8_coding_line(self):
        # Same as above, with the declaration on the second line.
        src = (b'\n'
               b'#coding:iso-8859-15 \xa4\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_utf8_bom(self):
        # A UTF-8 BOM alone selects UTF-8.
        src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xe4'")

    def test_utf8_bom_and_utf8_coding_line(self):
        # A UTF-8 BOM together with a matching utf-8 declaration is accepted.
        src = (b'\xef\xbb\xbf#coding:utf-8\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xe4'")


class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
    """Run the shared cases by passing the source bytes directly to exec()."""

    def check_script_output(self, src, expected):
        # Execute *src* in-process while capturing what it prints.
        with captured_stdout() as stdout:
            exec(src)
        # The captured text is encoded to bytes first and only then
        # stripped, so that it can be compared with the bytes in *expected*.
        self.assertEqual(stdout.getvalue().encode('latin1').rstrip(), expected)


class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
    """Run the shared cases by writing the source to a file and running it."""

    def check_script_output(self, src, expected):
        # The script lives in a throwaway directory that is removed as soon
        # as the run has finished.
        with tempfile.TemporaryDirectory() as tmpd:
            script_path = os.path.join(tmpd, 'test.py')
            with open(script_path, 'wb') as script_file:
                script_file.write(src)
            result = script_helper.assert_python_ok(script_path)
        actual = result.out.rstrip()
        self.assertEqual(actual, expected)


if __name__ == "__main__":
    # Run this module's tests when it is executed directly as a script.
    unittest.main()