Commit da780432, authored by Brett Cannon

Latin-1 source code was not being properly decoded when passed through
compile(). This was due to left-over special-casing from before UTF-8 became
the default source encoding.

Closes issue #3574. Thanks to Victor Stinner for help with the patch.
Parent commit: 9e9dcd6d
...@@ -23,8 +23,24 @@ class PEP3120Test(unittest.TestCase): ...@@ -23,8 +23,24 @@ class PEP3120Test(unittest.TestCase):
else: else:
self.fail("expected exception didn't occur") self.fail("expected exception didn't occur")
class BuiltinCompileTests(unittest.TestCase):
    """Regression tests for the built-in compile() and source encodings."""

    # Issue 3574.
    def test_latin1(self):
        """compile() must honour a ``coding: Latin-1`` declaration.

        The source is passed as bytes so that compile() itself has to decode
        it using the declared encoding.
        """
        # Allow compile() to read Latin-1 source.
        source_code = '# coding: Latin-1\nu = "Ç"\n'.encode("Latin-1")
        try:
            code = compile(source_code, '<dummy>', 'exec')
        except SyntaxError:
            self.fail("compile() cannot handle Latin-1 source")
        # Run the compiled module body and check that the non-ASCII literal
        # was decoded to the expected character.
        ns = {}
        exec(code, ns)
        self.assertEqual('Ç', ns['u'])
def test_main(): def test_main():
support.run_unittest(PEP3120Test) support.run_unittest(PEP3120Test, BuiltinCompileTests)
if __name__=="__main__": if __name__=="__main__":
test_main() test_main()
...@@ -15,6 +15,8 @@ What's New in Python 3.0 beta 5 ...@@ -15,6 +15,8 @@ What's New in Python 3.0 beta 5
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #3574: compile() incorrectly handled source code encoded as Latin-1.
- Issues #2384 and #3975: Tracebacks were not correctly printed when the - Issues #2384 and #3975: Tracebacks were not correctly printed when the
source file contains a ``coding:`` header: the wrong line was displayed, and source file contains a ``coding:`` header: the wrong line was displayed, and
the encoding was not respected. the encoding was not respected.
......
...@@ -135,6 +135,7 @@ tok_new(void) ...@@ -135,6 +135,7 @@ tok_new(void)
tok->decoding_state = STATE_INIT; tok->decoding_state = STATE_INIT;
tok->decoding_erred = 0; tok->decoding_erred = 0;
tok->read_coding_spec = 0; tok->read_coding_spec = 0;
tok->enc = NULL;
tok->encoding = NULL; tok->encoding = NULL;
tok->cont_line = 0; tok->cont_line = 0;
#ifndef PGEN #ifndef PGEN
...@@ -274,8 +275,7 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, ...@@ -274,8 +275,7 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
tok->read_coding_spec = 1; tok->read_coding_spec = 1;
if (tok->encoding == NULL) { if (tok->encoding == NULL) {
assert(tok->decoding_state == STATE_RAW); assert(tok->decoding_state == STATE_RAW);
if (strcmp(cs, "utf-8") == 0 || if (strcmp(cs, "utf-8") == 0) {
strcmp(cs, "iso-8859-1") == 0) {
tok->encoding = cs; tok->encoding = cs;
} else { } else {
r = set_readline(tok, cs); r = set_readline(tok, cs);
......
...@@ -49,14 +49,14 @@ struct tok_state { ...@@ -49,14 +49,14 @@ struct tok_state {
enum decoding_state decoding_state; enum decoding_state decoding_state;
int decoding_erred; /* whether erred in decoding */ int decoding_erred; /* whether erred in decoding */
int read_coding_spec; /* whether 'coding:...' has been read */ int read_coding_spec; /* whether 'coding:...' has been read */
char *encoding; char *encoding; /* Source encoding. */
int cont_line; /* whether we are in a continuation line. */ int cont_line; /* whether we are in a continuation line. */
const char* line_start; /* pointer to start of current line */ const char* line_start; /* pointer to start of current line */
#ifndef PGEN #ifndef PGEN
PyObject *decoding_readline; /* codecs.open(...).readline */ PyObject *decoding_readline; /* codecs.open(...).readline */
PyObject *decoding_buffer; PyObject *decoding_buffer;
#endif #endif
const char* enc; const char* enc; /* Encoding for the current str. */
const char* str; const char* str;
}; };
......
...@@ -3160,9 +3160,6 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons ...@@ -3160,9 +3160,6 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons
if (encoding == NULL) { if (encoding == NULL) {
buf = (char *)s; buf = (char *)s;
u = NULL; u = NULL;
} else if (strcmp(encoding, "iso-8859-1") == 0) {
buf = (char *)s;
u = NULL;
} else { } else {
/* check for integer overflow */ /* check for integer overflow */
if (len > PY_SIZE_MAX / 4) if (len > PY_SIZE_MAX / 4)
...@@ -3275,8 +3272,7 @@ parsestr(struct compiling *c, const node *n, int *bytesmode) ...@@ -3275,8 +3272,7 @@ parsestr(struct compiling *c, const node *n, int *bytesmode)
} }
} }
need_encoding = (!*bytesmode && c->c_encoding != NULL && need_encoding = (!*bytesmode && c->c_encoding != NULL &&
strcmp(c->c_encoding, "utf-8") != 0 && strcmp(c->c_encoding, "utf-8") != 0);
strcmp(c->c_encoding, "iso-8859-1") != 0);
if (rawmode || strchr(s, '\\') == NULL) { if (rawmode || strchr(s, '\\') == NULL) {
if (need_encoding) { if (need_encoding) {
PyObject *v, *u = PyUnicode_DecodeUTF8(s, len, NULL); PyObject *v, *u = PyUnicode_DecodeUTF8(s, len, NULL);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment