Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
689a5580
Kaydet (Commit)
689a5580
authored
Mar 18, 2010
tarafından
Benjamin Peterson
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
in tokenize.detect_encoding(), return utf-8-sig when a BOM is found
üst
8c804273
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
22 additions
and
12 deletions
+22
-12
tokenize.rst
Doc/library/tokenize.rst
+2
-1
test_tokenize.py
Lib/test/test_tokenize.py
+5
-5
tokenize.py
Lib/tokenize.py
+12
-6
NEWS
Misc/NEWS
+3
-0
No files found.
Doc/library/tokenize.rst
Dosyayı görüntüle @
689a5580
...
...
@@ -95,7 +95,8 @@ function it uses to do this is available:
It detects the encoding from the presence of a UTF-8 BOM or an encoding
cookie as specified in :pep:`263`. If both a BOM and a cookie are present,
but disagree, a SyntaxError will be raised.
but disagree, a SyntaxError will be raised. Note that if the BOM is found,
``'utf-8-sig'`` will be returned as an encoding.
If no encoding is specified, then the default of ``'utf-8'`` will be returned.
...
...
Lib/test/test_tokenize.py
Dosyayı görüntüle @
689a5580
...
...
@@ -726,7 +726,7 @@ class TestDetectEncoding(TestCase):
b
'do_something(else)
\n
'
)
encoding
,
consumed_lines
=
detect_encoding
(
self
.
get_readline
(
lines
))
self
.
assertEquals
(
encoding
,
'utf-8'
)
self
.
assertEquals
(
encoding
,
'utf-8
-sig
'
)
self
.
assertEquals
(
consumed_lines
,
[
b
'# something
\n
'
,
b
'print(something)
\n
'
])
...
...
@@ -747,7 +747,7 @@ class TestDetectEncoding(TestCase):
b
'do_something(else)
\n
'
)
encoding
,
consumed_lines
=
detect_encoding
(
self
.
get_readline
(
lines
))
self
.
assertEquals
(
encoding
,
'utf-8'
)
self
.
assertEquals
(
encoding
,
'utf-8
-sig
'
)
self
.
assertEquals
(
consumed_lines
,
[
b
'# coding=utf-8
\n
'
])
def
test_mismatched_bom_and_cookie_first_line_raises_syntaxerror
(
self
):
...
...
@@ -779,7 +779,7 @@ class TestDetectEncoding(TestCase):
b
'do_something(else)
\n
'
)
encoding
,
consumed_lines
=
detect_encoding
(
self
.
get_readline
(
lines
))
self
.
assertEquals
(
encoding
,
'utf-8'
)
self
.
assertEquals
(
encoding
,
'utf-8
-sig
'
)
self
.
assertEquals
(
consumed_lines
,
[
b
'#! something
\n
'
,
b
'f# coding=utf-8
\n
'
])
...
...
@@ -833,12 +833,12 @@ class TestDetectEncoding(TestCase):
readline
=
self
.
get_readline
((
b
'
\xef\xbb\xbf
print(something)
\n
'
,))
encoding
,
consumed_lines
=
detect_encoding
(
readline
)
self
.
assertEquals
(
encoding
,
'utf-8'
)
self
.
assertEquals
(
encoding
,
'utf-8
-sig
'
)
self
.
assertEquals
(
consumed_lines
,
[
b
'print(something)
\n
'
])
readline
=
self
.
get_readline
((
b
'
\xef\xbb\xbf
'
,))
encoding
,
consumed_lines
=
detect_encoding
(
readline
)
self
.
assertEquals
(
encoding
,
'utf-8'
)
self
.
assertEquals
(
encoding
,
'utf-8
-sig
'
)
self
.
assertEquals
(
consumed_lines
,
[])
readline
=
self
.
get_readline
((
b
'# coding: bad
\n
'
,))
...
...
Lib/tokenize.py
Dosyayı görüntüle @
689a5580
...
...
@@ -301,14 +301,16 @@ def detect_encoding(readline):
in.
It detects the encoding from the presence of a utf-8 bom or an encoding
cookie as specified in pep-0263. If both a bom and a cookie are present,
but disagree, a SyntaxError will be raised. If the encoding cookie is an
invalid charset, raise a SyntaxError.
cookie as specified in pep-0263. If both a bom and a cookie are present, but
disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
charset, raise a SyntaxError. Note that if a utf-8 bom is found,
'utf-8-sig' is returned.
If no encoding is specified, then the default of 'utf-8' will be returned.
"""
bom_found
=
False
encoding
=
None
default
=
'utf-8'
def
read_or_stop
():
try
:
return
readline
()
...
...
@@ -340,8 +342,9 @@ def detect_encoding(readline):
if
first
.
startswith
(
BOM_UTF8
):
bom_found
=
True
first
=
first
[
3
:]
default
=
'utf-8-sig'
if
not
first
:
return
'utf-8'
,
[]
return
default
,
[]
encoding
=
find_cookie
(
first
)
if
encoding
:
...
...
@@ -349,13 +352,13 @@ def detect_encoding(readline):
second
=
read_or_stop
()
if
not
second
:
return
'utf-8'
,
[
first
]
return
default
,
[
first
]
encoding
=
find_cookie
(
second
)
if
encoding
:
return
encoding
,
[
first
,
second
]
return
'utf-8'
,
[
first
,
second
]
return
default
,
[
first
,
second
]
def
tokenize
(
readline
):
...
...
@@ -394,6 +397,9 @@ def _tokenize(readline, encoding):
indents
=
[
0
]
if
encoding
is
not
None
:
if
encoding
==
"utf-8-sig"
:
# BOM will already have been stripped.
encoding
=
"utf-8"
yield
TokenInfo
(
ENCODING
,
encoding
,
(
0
,
0
),
(
0
,
0
),
''
)
while
True
:
# loop over lines in stream
try
:
...
...
Misc/NEWS
Dosyayı görüntüle @
689a5580
...
...
@@ -283,6 +283,9 @@ C-API
Library
-------
- ``tokenize.detect_encoding`` now returns ``'utf-8-sig'`` when a UTF-8 BOM is
detected.
- Issue #8024: Update the Unicode database to 5.2.
- Issue #6716/2: Backslash-replace error output in compileall.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment