Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
d3afadaa
Kaydet (Commit)
d3afadaa
authored
Eki 09, 2009
tarafından
Benjamin Peterson
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
normalize latin-1 and utf-8 variant encodings like the builtin tokenizer does
üst
ffc08fca
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
44 additions
and
2 deletions
+44
-2
test_tokenize.py
Lib/test/test_tokenize.py
+29
-1
tokenize.py
Lib/tokenize.py
+12
-1
NEWS
Misc/NEWS
+3
-0
No files found.
Lib/test/test_tokenize.py
Dosyayı görüntüle @
d3afadaa
...
...
@@ -719,7 +719,7 @@ class TestDetectEncoding(TestCase):
b
'do_something(else)
\n
'
)
encoding
,
consumed_lines
=
detect_encoding
(
self
.
get_readline
(
lines
))
self
.
assertEquals
(
encoding
,
'
latin
-1'
)
self
.
assertEquals
(
encoding
,
'
iso-8859
-1'
)
self
.
assertEquals
(
consumed_lines
,
[
b
'# -*- coding: latin-1 -*-
\n
'
])
def
test_matched_bom_and_cookie_first_line
(
self
):
...
...
@@ -775,6 +775,34 @@ class TestDetectEncoding(TestCase):
readline
=
self
.
get_readline
(
lines
)
self
.
assertRaises
(
SyntaxError
,
detect_encoding
,
readline
)
def test_latin1_normalization(self):
    """Check that latin-1 spellings are all reported as "iso-8859-1".

    Every name in ``encodings`` is tried twice, once with dashes and once
    with underscores, as the coding cookie on the second line of a small
    source. detect_encoding() must report "iso-8859-1" for each of them.
    """
    # See get_normal_name() in tokenizer.c.
    encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                 "iso-8859-1-unix", "iso-latin-1-mac")
    for encoding in encodings:
        # Cover both the dashed and the underscored spelling of each name.
        for rep in ("-", "_"):
            enc = encoding.replace("-", rep)
            lines = (
                b"#!/usr/bin/python\n",
                b"# coding: " + enc.encode("ascii") + b"\n",
                b"print(things)\n",
                b"do_something += 4\n",
            )
            rl = self.get_readline(lines)
            # Only the detected name matters here; the consumed lines
            # are not checked by this test.
            found, _ = detect_encoding(rl)
            # assertEqual is the supported spelling; the assertEquals
            # alias is deprecated and absent from recent Python versions.
            self.assertEqual(found, "iso-8859-1")
def test_utf8_normalization(self):
    """Check that utf-8 spellings are all reported as "utf-8".

    Every name in ``encodings`` is tried twice, once with dashes and once
    with underscores, as the coding cookie on the second line of a small
    source. detect_encoding() must report "utf-8" for each of them.
    """
    # See get_normal_name() in tokenizer.c.
    encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
    for encoding in encodings:
        # Cover both the dashed and the underscored spelling of each name.
        for rep in ("-", "_"):
            enc = encoding.replace("-", rep)
            lines = (
                b"#!/usr/bin/python\n",
                b"# coding: " + enc.encode("ascii") + b"\n",
                b"1 + 3\n",
            )
            rl = self.get_readline(lines)
            # Only the detected name matters here; the consumed lines
            # are not checked by this test.
            found, _ = detect_encoding(rl)
            # assertEqual is the supported spelling; the assertEquals
            # alias is deprecated and absent from recent Python versions.
            self.assertEqual(found, "utf-8")
def
test_short_files
(
self
):
readline
=
self
.
get_readline
((
b
'print(something)
\n
'
,))
encoding
,
consumed_lines
=
detect_encoding
(
readline
)
...
...
Lib/tokenize.py
Dosyayı görüntüle @
d3afadaa
...
...
@@ -279,6 +279,17 @@ def untokenize(iterable):
return
out
def
_get_normal_name
(
orig_enc
):
"""Imitates get_normal_name in tokenizer.c."""
# Only care about the first 12 characters.
enc
=
orig_enc
[:
12
]
.
lower
()
.
replace
(
"_"
,
"-"
)
if
enc
==
"utf-8"
or
enc
.
startswith
(
"utf-8-"
):
return
"utf-8"
if
enc
in
(
"latin-1"
,
"iso-8859-1"
,
"iso-latin-1"
)
or
\
enc
.
startswith
((
"latin-1-"
,
"iso-8859-1-"
,
"iso-latin-1-"
)):
return
"iso-8859-1"
return
orig_enc
def
detect_encoding
(
readline
):
"""
The detect_encoding() function is used to detect the encoding that should
...
...
@@ -313,7 +324,7 @@ def detect_encoding(readline):
matches
=
cookie_re
.
findall
(
line_string
)
if
not
matches
:
return
None
encoding
=
matches
[
0
]
encoding
=
_get_normal_name
(
matches
[
0
])
try
:
codec
=
lookup
(
encoding
)
except
LookupError
:
...
...
Misc/NEWS
Dosyayı görüntüle @
d3afadaa
...
...
@@ -87,6 +87,9 @@ C-API
Library
-------
- Make tokenize.detect_encoding() normalize utf-8 and iso-8859-1 variants like the
builtin tokenizer.
- Issue #7048: Force Decimal.logb to round its result when that result
is too large to fit in the current precision.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment