cpython — Commit 2548c730

Authored Apr 18, 2003 by Martin v. Löwis

Implement IDNA (Internationalized Domain Names in Applications).

parent 8d17a90b

Showing 12 changed files with 1671 additions and 9 deletions
Doc/lib/lib.tex                  +1    -0
Doc/lib/libcodecs.tex            +71   -1
Doc/lib/libstringprep.tex        +134  -0
Doc/whatsnew/whatsnew23.tex      +21   -0
Lib/encodings/idna.py            +187  -0
Lib/encodings/punycode.py        +222  -0
Lib/httplib.py                   +3    -3
Lib/stringprep.py                +273  -0
Lib/test/test_codecs.py          +296  -0
Misc/NEWS                        +5    -0
Modules/socketmodule.c           +25   -5
Tools/unicode/mkstringprep.py    +433  -0
Doc/lib/lib.tex

@@ -112,6 +112,7 @@ and how to embed it in other applications.
\input{libtextwrap}
\input{libcodecs}
\input{libunicodedata}
\input{libstringprep}
\input{libmisc}                 % Miscellaneous Services
\input{libpydoc}
Doc/lib/libcodecs.tex

@@ -5,7 +5,7 @@
\modulesynopsis{Encode and decode data and streams.}
\moduleauthor{Marc-Andre Lemburg}{mal@lemburg.com}
\sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com}
\sectionauthor{Martin v. L\"owis}{martin@v.loewis.de}

\index{Unicode}
\index{Codecs}

@@ -809,6 +809,11 @@ listed as operand type in the table.
        {byte string}
        {Convert operand to hexadecimal representation, with two digits per byte}

\lineiv{idna}
        {}
        {Unicode string}
        {Implements \rfc{3490}.
         \versionadded{2.3}. See also \module{encodings.idna}}

\lineiv{mbcs}
        {dbcs}
        {Unicode string}

@@ -819,6 +824,11 @@ listed as operand type in the table.
        {Unicode string}
        {Encoding of PalmOS 3.5}

\lineiv{punycode}
        {}
        {Unicode string}
        {Implements \rfc{3492}.
         \versionadded{2.3}}

\lineiv{quopri_codec}
        {quopri, quoted-printable, quotedprintable}
        {byte string}

@@ -865,3 +875,63 @@ listed as operand type in the table.
        {Compress the operand using gzip}
\end{tableiv}

\subsection{\module{encodings.idna} ---
            Internationalized Domain Names in Applications}

\declaremodule{standard}{encodings.idna}
\modulesynopsis{Internationalized Domain Names implementation}
\moduleauthor{Martin v. L\"owis}

This module implements \rfc{3490} (Internationalized Domain Names in
Applications) and \rfc{3491} (Nameprep: A Stringprep Profile for
Internationalized Domain Names (IDN)). It builds upon the
\code{punycode} encoding and \module{stringprep}.

\versionadded{2.3}

These RFCs together define a protocol to support non-ASCII characters
in domain names. A domain name containing non-ASCII characters (such
as ``www.Alliancefran\c{c}aise.nu'') is converted into an
ASCII-compatible encoding (ACE, such as
``www.xn--alliancefranaise-npb.nu''). The ACE form of the domain name
is then used in all places where arbitrary characters are not allowed
by the protocol, such as DNS queries, HTTP \code{Host:} fields, and so
on. This conversion is carried out in the application and, where
possible, is invisible to the user: the application should
transparently convert Unicode domain labels to IDNA on the wire, and
convert ACE labels back to Unicode before presenting them to the user.

Python supports this conversion in several ways: the \code{idna} codec
allows conversion between Unicode and the ACE. Furthermore, the
\module{socket} module transparently converts Unicode host names to
ACE, so that applications need not be concerned about converting host
names themselves when they pass them to the socket module. On top of
that, modules that have host names as function parameters, such as
\module{httplib} and \module{ftplib}, accept Unicode host names
(\module{httplib} then also transparently sends an IDNA hostname in
the \code{Host:} field if it sends that field at all).

When receiving host names from the wire (such as in reverse name
lookup), no automatic conversion to Unicode is performed: applications
wishing to present such host names to the user should decode them to
Unicode.

The module \module{encodings.idna} also implements the nameprep
procedure, which performs certain normalizations on host names, to
achieve case-insensitivity of international domain names, and to unify
similar characters. The nameprep functions can be used directly if
desired.

\begin{funcdesc}{nameprep}{label}
Return the nameprepped version of \var{label}. The implementation
currently assumes query strings, so \code{AllowUnassigned} is true.
\end{funcdesc}

\begin{funcdesc}{ToASCII}{label}
Convert a label to ASCII, as specified in \rfc{3490}.
\code{UseSTD3ASCIIRules} is assumed to be false.
\end{funcdesc}

\begin{funcdesc}{ToUnicode}{label}
Convert a label to Unicode, as specified in \rfc{3490}.
\end{funcdesc}
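A minimal interactive sketch (added for illustration, not part of the patch; assumes a Python 2.3 build with this commit applied) showing the idna codec and the helper functions documented above, using the same example domain as the text:

    >>> import encodings.idna
    >>> u"www.Alliancefran\xe7aise.nu".encode("idna")
    'www.xn--alliancefranaise-npb.nu'
    >>> encodings.idna.ToASCII(u"alliancefran\xe7aise")
    'xn--alliancefranaise-npb'
    >>> encodings.idna.nameprep(u"Alliancefran\xe7aise")
    u'alliancefran\xe7aise'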
Doc/lib/libstringprep.tex (new file mode 100644)

\section{\module{stringprep} ---
         Internet String Preparation}

\declaremodule{standard}{stringprep}
\modulesynopsis{String preparation, as per RFC 3454}
\moduleauthor{Martin v. L\"owis}{martin@v.loewis.de}
\sectionauthor{Martin v. L\"owis}{martin@v.loewis.de}

When identifying things (such as host names) in the internet, it is
often necessary to compare such identifications for ``equality''.
Exactly how this comparison is executed may depend on the application
domain, e.g. whether it should be case-insensitive or not. It may also
be necessary to restrict the possible identifications, to allow only
identifications consisting of ``printable'' characters.

\rfc{3454} defines a procedure for ``preparing'' Unicode strings in
internet protocols. Before passing strings onto the wire, they are
processed with the preparation procedure, after which they have a
certain normalized form. The RFC defines a set of tables, which can be
combined into profiles. Each profile must define which tables it uses,
and what other optional parts of the \code{stringprep} procedure are
part of the profile. One example of a \code{stringprep} profile is
\code{nameprep}, which is used for internationalized domain names.

The module \module{stringprep} only exposes the tables from RFC 3454.
As these tables would be very large to represent as dictionaries or
lists, the module uses the Unicode character database internally. The
module source code itself was generated using the
\code{mkstringprep.py} utility.

As a result, these tables are exposed as functions, not as data
structures. There are two kinds of tables in the RFC: sets and
mappings. For a set, \module{stringprep} provides the ``characteristic
function'', i.e. a function that returns true if the parameter is part
of the set. For mappings, it provides the mapping function: given the
key, it returns the associated value. Below is a list of all functions
available in the module.

\begin{funcdesc}{in_table_a1}{code}
Determine whether \var{code} is in table{A.1} (Unassigned code points
in Unicode 3.2).
\end{funcdesc}

\begin{funcdesc}{in_table_b1}{code}
Determine whether \var{code} is in table{B.1} (Commonly mapped to
nothing).
\end{funcdesc}

\begin{funcdesc}{map_table_b2}{code}
Return the mapped value for \var{code} according to table{B.2}
(Mapping for case-folding used with NFKC).
\end{funcdesc}

\begin{funcdesc}{map_table_b3}{code}
Return the mapped value for \var{code} according to table{B.3}
(Mapping for case-folding used with no normalization).
\end{funcdesc}

\begin{funcdesc}{in_table_c11}{code}
Determine whether \var{code} is in table{C.1.1} (ASCII space
characters).
\end{funcdesc}

\begin{funcdesc}{in_table_c12}{code}
Determine whether \var{code} is in table{C.1.2} (Non-ASCII space
characters).
\end{funcdesc}

\begin{funcdesc}{in_table_c11_c12}{code}
Determine whether \var{code} is in table{C.1} (Space characters, union
of C.1.1 and C.1.2).
\end{funcdesc}

\begin{funcdesc}{in_table_c21}{code}
Determine whether \var{code} is in table{C.2.1} (ASCII control
characters).
\end{funcdesc}

\begin{funcdesc}{in_table_c22}{code}
Determine whether \var{code} is in table{C.2.2} (Non-ASCII control
characters).
\end{funcdesc}

\begin{funcdesc}{in_table_c21_c22}{code}
Determine whether \var{code} is in table{C.2} (Control characters,
union of C.2.1 and C.2.2).
\end{funcdesc}

\begin{funcdesc}{in_table_c3}{code}
Determine whether \var{code} is in table{C.3} (Private use).
\end{funcdesc}

\begin{funcdesc}{in_table_c4}{code}
Determine whether \var{code} is in table{C.4} (Non-character code
points).
\end{funcdesc}

\begin{funcdesc}{in_table_c5}{code}
Determine whether \var{code} is in table{C.5} (Surrogate codes).
\end{funcdesc}

\begin{funcdesc}{in_table_c6}{code}
Determine whether \var{code} is in table{C.6} (Inappropriate for plain
text).
\end{funcdesc}

\begin{funcdesc}{in_table_c7}{code}
Determine whether \var{code} is in table{C.7} (Inappropriate for
canonical representation).
\end{funcdesc}

\begin{funcdesc}{in_table_c8}{code}
Determine whether \var{code} is in table{C.8} (Change display
properties or are deprecated).
\end{funcdesc}

\begin{funcdesc}{in_table_c9}{code}
Determine whether \var{code} is in table{C.9} (Tagging characters).
\end{funcdesc}

\begin{funcdesc}{in_table_d1}{code}
Determine whether \var{code} is in table{D.1} (Characters with
bidirectional property ``R'' or ``AL'').
\end{funcdesc}

\begin{funcdesc}{in_table_d2}{code}
Determine whether \var{code} is in table{D.2} (Characters with
bidirectional property ``L'').
\end{funcdesc}
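A small sketch of how the characteristic and mapping functions listed above are called (added for illustration; arguments are single-character Unicode strings, and the shown results assume the Unicode 3.2 database that the module requires):

    >>> import stringprep
    >>> stringprep.in_table_c11(u" ")       # ASCII space, table C.1.1
    True
    >>> stringprep.in_table_a1(u"\u0221")   # unassigned in Unicode 3.2
    True
    >>> stringprep.map_table_b2(u"A")       # case folding used with NFKC
    u'a'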
Doc/whatsnew/whatsnew23.tex

@@ -1791,6 +1791,27 @@ Tkinter.wantobjects = 0
Any breakage caused by this change should be reported as a bug.

\item Support for internationalized domain names (RFCs 3454, 3490,
3491, and 3492) has been added. The ``idna'' encoding can be used
to convert between a Unicode domain name and the ASCII-compatible
encoding (ACE).

\begin{verbatim}
>>> u"www.Alliancefrançaise.nu".encode("idna")
'www.xn--alliancefranaise-npb.nu'
\end{verbatim}

In addition, the \module{socket} module has been extended to
transparently convert Unicode hostnames to the ACE before passing them
to the C library. In turn, modules that pass hostnames ``through''
(such as \module{httplib}, \module{ftplib}) also support Unicode host
names (httplib also sends ACE Host: headers). \module{urllib} supports
Unicode URLs with non-ASCII host names as long as the \code{path} part
of the URL is ASCII only.

To implement this change, the module \module{stringprep}, the tool
\code{mkstringprep}, and the \code{punycode} encoding have been added.
\end{itemize}
Lib/encodings/idna.py (new file mode 100644)

# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)

import stringprep, unicodedata, re, codecs

# IDNA section 3.1
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
ace_prefix = "xn--"
uace_prefix = unicode(ace_prefix, "ascii")

# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    # Map
    newlabel = []
    for c in label:
        if stringprep.in_table_b1(c):
            # Map to nothing
            continue
        newlabel.append(stringprep.map_table_b2(c))
    label = u"".join(newlabel)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if stringprep.in_table_c12(c) or \
           stringprep.in_table_c22(c) or \
           stringprep.in_table_c3(c) or \
           stringprep.in_table_c4(c) or \
           stringprep.in_table_c5(c) or \
           stringprep.in_table_c6(c) or \
           stringprep.in_table_c7(c) or \
           stringprep.in_table_c8(c) or \
           stringprep.in_table_c9(c):
            raise UnicodeError, "Invalid character %s" % repr(c)

    # Check bidi
    RandAL = map(stringprep.in_table_d1, label)
    for c in RandAL:
        if c:
            # There is a RandAL char in the string. Must perform further
            # tests:
            # 1) The characters in section 5.8 MUST be prohibited.
            # This is table C.8, which was already checked
            # 2) If a string contains any RandALCat character, the string
            # MUST NOT contain any LCat character.
            if filter(stringprep.in_table_d2, label):
                raise UnicodeError, "Violation of BIDI requirement 2"
            # 3) If a string contains any RandALCat character, a
            # RandALCat character MUST be the first character of the
            # string, and a RandALCat character MUST be the last
            # character of the string.
            if not RandAL[0] or not RandAL[-1]:
                raise UnicodeError, "Violation of BIDI requirement 3"
    return label

def ToASCII(label):
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError, "label too long"

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError, "label too long"

    # Step 5: Check ACE prefix
    if label.startswith(uace_prefix):
        raise UnicodeError, "Label starts with ACE prefix"

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label = ace_prefix + label

    # Step 8: Check size
    if 0 < len(label) < 64:
        return label
    raise UnicodeError, "label too long"

def ToUnicode(label):
    # Step 1: Check for ASCII
    if isinstance(label, str):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError, "Invalid character in IDN label"
    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if label.lower() != label2:
        raise UnicodeError, ("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result

### Codec APIs

class Codec(codecs.Codec):
    def encode(self, input, errors='strict'):
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError, "unsupported error handling " + errors
        result = []
        for label in dots.split(input):
            result.append(ToASCII(label))
        # Join with U+002E
        return ".".join(result), len(input)

    def decode(self, input, errors='strict'):
        if errors != 'strict':
            raise UnicodeError, "Unsupported error handling " + errors
        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            unicode(input, "ascii")
            labels = input.split(".")
        result = []
        for label in labels:
            result.append(ToUnicode(label))
        return u".".join(result), len(input)

class StreamWriter(Codec, codecs.StreamWriter):
    pass

class StreamReader(Codec, codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return (Codec().encode, Codec().decode, StreamReader, StreamWriter)
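A quick round-trip sketch for the codec defined above (added for illustration; the ACE value is the RFC 3490 example quoted in the documentation, and the second call simply demonstrates the alternative label separators matched by the dots pattern):

    >>> 'www.xn--alliancefranaise-npb.nu'.decode("idna")
    u'www.alliancefran\xe7aise.nu'
    >>> u"www\u3002python\uff0eorg".encode("idna")
    'www.python.org'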
Lib/encodings/punycode.py (new file mode 100644)

# -*- coding: iso-8859-1 -*-
"""Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Löwis.
"""
import codecs

##################### Encoding #####################################

def segregate(str):
    """3.1 Basic code point segregation"""
    base = []
    extended = {}
    for c in str:
        if ord(c) < 128:
            base.append(c)
        else:
            extended[c] = 1
    extended = extended.keys()
    extended.sort()
    return "".join(base).encode("ascii"), extended

def selective_len(str, max):
    """Return the length of str, considering only characters below max."""
    res = 0
    for c in str:
        if ord(c) < max:
            res += 1
    return res

def selective_find(str, char, index, pos):
    """Return a pair (index, pos), indicating the next occurrence of
    char in str. index is the position of the character considering
    only ordinals up to and including char, and pos is the position in
    the full string. index/pos is the starting position in the full
    string."""

    l = len(str)
    while 1:
        pos += 1
        if pos == l:
            return (-1, -1)
        c = str[pos]
        if c == char:
            return index + 1, pos
        elif c < char:
            index += 1

def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding"""
    oldchar = 0x80
    result = []
    oldindex = -1
    for c in extended:
        index = pos = -1
        char = ord(c)
        curlen = selective_len(str, char)
        delta = (curlen + 1) * (char - oldchar)
        while 1:
            index, pos = selective_find(str, c, index, pos)
            if index == -1:
                break
            delta += index - oldindex
            result.append(delta - 1)
            oldindex = index
            delta = 0
        oldchar = char

    return result

def T(j, bias):
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    res = 36 * (j + 1) - bias
    if res < 1: return 1
    if res > 26: return 26
    return res

digits = "abcdefghijklmnopqrstuvwxyz0123456789"
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers"""
    result = []
    j = 0
    while 1:
        t = T(j, bias)
        if N < t:
            result.append(digits[N])
            return result
        result.append(digits[t + ((N - t) % (36 - t))])
        N = (N - t) // (36 - t)
        j += 1

def adapt(delta, first, numchars):
    if first:
        delta //= 700
    else:
        delta //= 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    divisions = 0
    while delta > 455:
        delta = delta // 35  # base - tmin
        divisions += 36
    bias = divisions + (36 * delta // (delta + 38))
    return bias

def generate_integers(baselen, deltas):
    """3.4 Bias adaptation"""
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    result = []
    bias = 72
    for points, delta in enumerate(deltas):
        s = generate_generalized_integer(delta, bias)
        result.extend(s)
        bias = adapt(delta, points == 0, baselen + points + 1)
    return "".join(result)

def punycode_encode(text):
    base, extended = segregate(text)
    base = base.encode("ascii")
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + "-" + extended
    return extended

##################### Decoding #####################################

def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers"""
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError, "incomplete punycode string"
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A:  # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22  # 0x30-26
        elif errors == "strict":
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            return extpos, result
        w = w * (36 - t)
        j += 1

def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding"""
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta + 1
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError, ("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + unichr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base

def punycode_decode(text, errors):
    pos = text.rfind("-")
    if pos == -1:
        base = ""
        extended = text
    else:
        base = text[:pos]
        extended = text[pos+1:]
    base = unicode(base, "ascii", errors)
    extended = extended.upper()
    return insertion_sort(base, extended, errors)

### Codec APIs

class Codec(codecs.Codec):
    def encode(self, input, errors='strict'):
        res = punycode_encode(input)
        return res, len(input)

    def decode(self, input, errors='strict'):
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError, "Unsupported error handling " + errors
        res = punycode_decode(input, errors)
        return res, len(input)

class StreamWriter(Codec, codecs.StreamWriter):
    pass

class StreamReader(Codec, codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return (Codec().encode, Codec().decode, StreamReader, StreamWriter)
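A brief sketch of the codec this module registers (added for illustration), using two of the RFC 3492 sample strings that also appear in Lib/test/test_codecs.py further down in this commit:

    >>> u"\u3072\u3068\u3064\u5c4b\u6839\u306e\u4e0b\u0032".encode("punycode")
    '2-u9tlzr9756bt3uc0v'
    >>> "d9juau41awczczp".decode("punycode")
    u'\u305d\u306e\u30b9\u30d4\u30fc\u30c9\u3067'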
Lib/httplib.py

@@ -655,11 +655,11 @@ class HTTPConnection:
             nil, netloc, nil, nil, nil = urlsplit(url)
             if netloc:
-                self.putheader('Host', netloc)
+                self.putheader('Host', netloc.encode("idna"))
             elif self.port == HTTP_PORT:
-                self.putheader('Host', self.host)
+                self.putheader('Host', self.host.encode("idna"))
             else:
-                self.putheader('Host', "%s:%s" % (self.host, self.port))
+                self.putheader('Host', "%s:%s" % (self.host.encode("idna"), self.port))

         # note: we are assuming that clients will not attempt to set these
         # headers since *this* library must deal with the
Lib/stringprep.py (new file mode 100644)

# This file is generated by mkstringprep.py. DO NOT EDIT.
"""Library that exposes various tables found in the StringPrep RFC 3454.

There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
"""

import unicodedata, sets

assert unicodedata.unidata_version == '3.2.0'

def in_table_a1(code):
    if unicodedata.category(code) != 'Cn': return False
    c = ord(code)
    if 0xFDD0 <= c < 0xFDF0: return False
    return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)


b1_set = sets.Set([173, 847, 6150, 6155, 6156, 6157, 8203, 8204, 8205,
                   8288, 65279] + range(65024, 65040))
def in_table_b1(code):
    return ord(code) in b1_set
b3_exceptions
=
{
0xb5
:
u'
\u03bc
'
,
0xdf
:
u'ss'
,
0x130
:
u'i
\u0307
'
,
0x149
:
u'
\u02bc
n'
,
0x17f
:
u's'
,
0x1f0
:
u'j
\u030c
'
,
0x345
:
u'
\u03b9
'
,
0x37a
:
u'
\u03b9
'
,
0x390
:
u'
\u03b9\u0308\u0301
'
,
0x3b0
:
u'
\u03c5\u0308\u0301
'
,
0x3c2
:
u'
\u03c3
'
,
0x3d0
:
u'
\u03b2
'
,
0x3d1
:
u'
\u03b8
'
,
0x3d2
:
u'
\u03c5
'
,
0x3d3
:
u'
\u03cd
'
,
0x3d4
:
u'
\u03cb
'
,
0x3d5
:
u'
\u03c6
'
,
0x3d6
:
u'
\u03c0
'
,
0x3f0
:
u'
\u03ba
'
,
0x3f1
:
u'
\u03c1
'
,
0x3f2
:
u'
\u03c3
'
,
0x3f5
:
u'
\u03b5
'
,
0x587
:
u'
\u0565\u0582
'
,
0x1e96
:
u'h
\u0331
'
,
0x1e97
:
u't
\u0308
'
,
0x1e98
:
u'w
\u030a
'
,
0x1e99
:
u'y
\u030a
'
,
0x1e9a
:
u'a
\u02be
'
,
0x1e9b
:
u'
\u1e61
'
,
0x1f50
:
u'
\u03c5\u0313
'
,
0x1f52
:
u'
\u03c5\u0313\u0300
'
,
0x1f54
:
u'
\u03c5\u0313\u0301
'
,
0x1f56
:
u'
\u03c5\u0313\u0342
'
,
0x1f80
:
u'
\u1f00\u03b9
'
,
0x1f81
:
u'
\u1f01\u03b9
'
,
0x1f82
:
u'
\u1f02\u03b9
'
,
0x1f83
:
u'
\u1f03\u03b9
'
,
0x1f84
:
u'
\u1f04\u03b9
'
,
0x1f85
:
u'
\u1f05\u03b9
'
,
0x1f86
:
u'
\u1f06\u03b9
'
,
0x1f87
:
u'
\u1f07\u03b9
'
,
0x1f88
:
u'
\u1f00\u03b9
'
,
0x1f89
:
u'
\u1f01\u03b9
'
,
0x1f8a
:
u'
\u1f02\u03b9
'
,
0x1f8b
:
u'
\u1f03\u03b9
'
,
0x1f8c
:
u'
\u1f04\u03b9
'
,
0x1f8d
:
u'
\u1f05\u03b9
'
,
0x1f8e
:
u'
\u1f06\u03b9
'
,
0x1f8f
:
u'
\u1f07\u03b9
'
,
0x1f90
:
u'
\u1f20\u03b9
'
,
0x1f91
:
u'
\u1f21\u03b9
'
,
0x1f92
:
u'
\u1f22\u03b9
'
,
0x1f93
:
u'
\u1f23\u03b9
'
,
0x1f94
:
u'
\u1f24\u03b9
'
,
0x1f95
:
u'
\u1f25\u03b9
'
,
0x1f96
:
u'
\u1f26\u03b9
'
,
0x1f97
:
u'
\u1f27\u03b9
'
,
0x1f98
:
u'
\u1f20\u03b9
'
,
0x1f99
:
u'
\u1f21\u03b9
'
,
0x1f9a
:
u'
\u1f22\u03b9
'
,
0x1f9b
:
u'
\u1f23\u03b9
'
,
0x1f9c
:
u'
\u1f24\u03b9
'
,
0x1f9d
:
u'
\u1f25\u03b9
'
,
0x1f9e
:
u'
\u1f26\u03b9
'
,
0x1f9f
:
u'
\u1f27\u03b9
'
,
0x1fa0
:
u'
\u1f60\u03b9
'
,
0x1fa1
:
u'
\u1f61\u03b9
'
,
0x1fa2
:
u'
\u1f62\u03b9
'
,
0x1fa3
:
u'
\u1f63\u03b9
'
,
0x1fa4
:
u'
\u1f64\u03b9
'
,
0x1fa5
:
u'
\u1f65\u03b9
'
,
0x1fa6
:
u'
\u1f66\u03b9
'
,
0x1fa7
:
u'
\u1f67\u03b9
'
,
0x1fa8
:
u'
\u1f60\u03b9
'
,
0x1fa9
:
u'
\u1f61\u03b9
'
,
0x1faa
:
u'
\u1f62\u03b9
'
,
0x1fab
:
u'
\u1f63\u03b9
'
,
0x1fac
:
u'
\u1f64\u03b9
'
,
0x1fad
:
u'
\u1f65\u03b9
'
,
0x1fae
:
u'
\u1f66\u03b9
'
,
0x1faf
:
u'
\u1f67\u03b9
'
,
0x1fb2
:
u'
\u1f70\u03b9
'
,
0x1fb3
:
u'
\u03b1\u03b9
'
,
0x1fb4
:
u'
\u03ac\u03b9
'
,
0x1fb6
:
u'
\u03b1\u0342
'
,
0x1fb7
:
u'
\u03b1\u0342\u03b9
'
,
0x1fbc
:
u'
\u03b1\u03b9
'
,
0x1fbe
:
u'
\u03b9
'
,
0x1fc2
:
u'
\u1f74\u03b9
'
,
0x1fc3
:
u'
\u03b7\u03b9
'
,
0x1fc4
:
u'
\u03ae\u03b9
'
,
0x1fc6
:
u'
\u03b7\u0342
'
,
0x1fc7
:
u'
\u03b7\u0342\u03b9
'
,
0x1fcc
:
u'
\u03b7\u03b9
'
,
0x1fd2
:
u'
\u03b9\u0308\u0300
'
,
0x1fd3
:
u'
\u03b9\u0308\u0301
'
,
0x1fd6
:
u'
\u03b9\u0342
'
,
0x1fd7
:
u'
\u03b9\u0308\u0342
'
,
0x1fe2
:
u'
\u03c5\u0308\u0300
'
,
0x1fe3
:
u'
\u03c5\u0308\u0301
'
,
0x1fe4
:
u'
\u03c1\u0313
'
,
0x1fe6
:
u'
\u03c5\u0342
'
,
0x1fe7
:
u'
\u03c5\u0308\u0342
'
,
0x1ff2
:
u'
\u1f7c\u03b9
'
,
0x1ff3
:
u'
\u03c9\u03b9
'
,
0x1ff4
:
u'
\u03ce\u03b9
'
,
0x1ff6
:
u'
\u03c9\u0342
'
,
0x1ff7
:
u'
\u03c9\u0342\u03b9
'
,
0x1ffc
:
u'
\u03c9\u03b9
'
,
0x20a8
:
u'rs'
,
0x2102
:
u'c'
,
0x2103
:
u'
\xb0
c'
,
0x2107
:
u'
\u025b
'
,
0x2109
:
u'
\xb0
f'
,
0x210b
:
u'h'
,
0x210c
:
u'h'
,
0x210d
:
u'h'
,
0x2110
:
u'i'
,
0x2111
:
u'i'
,
0x2112
:
u'l'
,
0x2115
:
u'n'
,
0x2116
:
u'no'
,
0x2119
:
u'p'
,
0x211a
:
u'q'
,
0x211b
:
u'r'
,
0x211c
:
u'r'
,
0x211d
:
u'r'
,
0x2120
:
u'sm'
,
0x2121
:
u'tel'
,
0x2122
:
u'tm'
,
0x2124
:
u'z'
,
0x2128
:
u'z'
,
0x212c
:
u'b'
,
0x212d
:
u'c'
,
0x2130
:
u'e'
,
0x2131
:
u'f'
,
0x2133
:
u'm'
,
0x213e
:
u'
\u03b3
'
,
0x213f
:
u'
\u03c0
'
,
0x2145
:
u'd'
,
0x3371
:
u'hpa'
,
0x3373
:
u'au'
,
0x3375
:
u'ov'
,
0x3380
:
u'pa'
,
0x3381
:
u'na'
,
0x3382
:
u'
\u03bc
a'
,
0x3383
:
u'ma'
,
0x3384
:
u'ka'
,
0x3385
:
u'kb'
,
0x3386
:
u'mb'
,
0x3387
:
u'gb'
,
0x338a
:
u'pf'
,
0x338b
:
u'nf'
,
0x338c
:
u'
\u03bc
f'
,
0x3390
:
u'hz'
,
0x3391
:
u'khz'
,
0x3392
:
u'mhz'
,
0x3393
:
u'ghz'
,
0x3394
:
u'thz'
,
0x33a9
:
u'pa'
,
0x33aa
:
u'kpa'
,
0x33ab
:
u'mpa'
,
0x33ac
:
u'gpa'
,
0x33b4
:
u'pv'
,
0x33b5
:
u'nv'
,
0x33b6
:
u'
\u03bc
v'
,
0x33b7
:
u'mv'
,
0x33b8
:
u'kv'
,
0x33b9
:
u'mv'
,
0x33ba
:
u'pw'
,
0x33bb
:
u'nw'
,
0x33bc
:
u'
\u03bc
w'
,
0x33bd
:
u'mw'
,
0x33be
:
u'kw'
,
0x33bf
:
u'mw'
,
0x33c0
:
u'k
\u03c9
'
,
0x33c1
:
u'm
\u03c9
'
,
0x33c3
:
u'bq'
,
0x33c6
:
u'c
\u2215
kg'
,
0x33c7
:
u'co.'
,
0x33c8
:
u'db'
,
0x33c9
:
u'gy'
,
0x33cb
:
u'hp'
,
0x33cd
:
u'kk'
,
0x33ce
:
u'km'
,
0x33d7
:
u'ph'
,
0x33d9
:
u'ppm'
,
0x33da
:
u'pr'
,
0x33dc
:
u'sv'
,
0x33dd
:
u'wb'
,
0xfb00
:
u'ff'
,
0xfb01
:
u'fi'
,
0xfb02
:
u'fl'
,
0xfb03
:
u'ffi'
,
0xfb04
:
u'ffl'
,
0xfb05
:
u'st'
,
0xfb06
:
u'st'
,
0xfb13
:
u'
\u0574\u0576
'
,
0xfb14
:
u'
\u0574\u0565
'
,
0xfb15
:
u'
\u0574\u056b
'
,
0xfb16
:
u'
\u057e\u0576
'
,
0xfb17
:
u'
\u0574\u056d
'
,
0x1d400
:
u'a'
,
0x1d401
:
u'b'
,
0x1d402
:
u'c'
,
0x1d403
:
u'd'
,
0x1d404
:
u'e'
,
0x1d405
:
u'f'
,
0x1d406
:
u'g'
,
0x1d407
:
u'h'
,
0x1d408
:
u'i'
,
0x1d409
:
u'j'
,
0x1d40a
:
u'k'
,
0x1d40b
:
u'l'
,
0x1d40c
:
u'm'
,
0x1d40d
:
u'n'
,
0x1d40e
:
u'o'
,
0x1d40f
:
u'p'
,
0x1d410
:
u'q'
,
0x1d411
:
u'r'
,
0x1d412
:
u's'
,
0x1d413
:
u't'
,
0x1d414
:
u'u'
,
0x1d415
:
u'v'
,
0x1d416
:
u'w'
,
0x1d417
:
u'x'
,
0x1d418
:
u'y'
,
0x1d419
:
u'z'
,
0x1d434
:
u'a'
,
0x1d435
:
u'b'
,
0x1d436
:
u'c'
,
0x1d437
:
u'd'
,
0x1d438
:
u'e'
,
0x1d439
:
u'f'
,
0x1d43a
:
u'g'
,
0x1d43b
:
u'h'
,
0x1d43c
:
u'i'
,
0x1d43d
:
u'j'
,
0x1d43e
:
u'k'
,
0x1d43f
:
u'l'
,
0x1d440
:
u'm'
,
0x1d441
:
u'n'
,
0x1d442
:
u'o'
,
0x1d443
:
u'p'
,
0x1d444
:
u'q'
,
0x1d445
:
u'r'
,
0x1d446
:
u's'
,
0x1d447
:
u't'
,
0x1d448
:
u'u'
,
0x1d449
:
u'v'
,
0x1d44a
:
u'w'
,
0x1d44b
:
u'x'
,
0x1d44c
:
u'y'
,
0x1d44d
:
u'z'
,
0x1d468
:
u'a'
,
0x1d469
:
u'b'
,
0x1d46a
:
u'c'
,
0x1d46b
:
u'd'
,
0x1d46c
:
u'e'
,
0x1d46d
:
u'f'
,
0x1d46e
:
u'g'
,
0x1d46f
:
u'h'
,
0x1d470
:
u'i'
,
0x1d471
:
u'j'
,
0x1d472
:
u'k'
,
0x1d473
:
u'l'
,
0x1d474
:
u'm'
,
0x1d475
:
u'n'
,
0x1d476
:
u'o'
,
0x1d477
:
u'p'
,
0x1d478
:
u'q'
,
0x1d479
:
u'r'
,
0x1d47a
:
u's'
,
0x1d47b
:
u't'
,
0x1d47c
:
u'u'
,
0x1d47d
:
u'v'
,
0x1d47e
:
u'w'
,
0x1d47f
:
u'x'
,
0x1d480
:
u'y'
,
0x1d481
:
u'z'
,
0x1d49c
:
u'a'
,
0x1d49e
:
u'c'
,
0x1d49f
:
u'd'
,
0x1d4a2
:
u'g'
,
0x1d4a5
:
u'j'
,
0x1d4a6
:
u'k'
,
0x1d4a9
:
u'n'
,
0x1d4aa
:
u'o'
,
0x1d4ab
:
u'p'
,
0x1d4ac
:
u'q'
,
0x1d4ae
:
u's'
,
0x1d4af
:
u't'
,
0x1d4b0
:
u'u'
,
0x1d4b1
:
u'v'
,
0x1d4b2
:
u'w'
,
0x1d4b3
:
u'x'
,
0x1d4b4
:
u'y'
,
0x1d4b5
:
u'z'
,
0x1d4d0
:
u'a'
,
0x1d4d1
:
u'b'
,
0x1d4d2
:
u'c'
,
0x1d4d3
:
u'd'
,
0x1d4d4
:
u'e'
,
0x1d4d5
:
u'f'
,
0x1d4d6
:
u'g'
,
0x1d4d7
:
u'h'
,
0x1d4d8
:
u'i'
,
0x1d4d9
:
u'j'
,
0x1d4da
:
u'k'
,
0x1d4db
:
u'l'
,
0x1d4dc
:
u'm'
,
0x1d4dd
:
u'n'
,
0x1d4de
:
u'o'
,
0x1d4df
:
u'p'
,
0x1d4e0
:
u'q'
,
0x1d4e1
:
u'r'
,
0x1d4e2
:
u's'
,
0x1d4e3
:
u't'
,
0x1d4e4
:
u'u'
,
0x1d4e5
:
u'v'
,
0x1d4e6
:
u'w'
,
0x1d4e7
:
u'x'
,
0x1d4e8
:
u'y'
,
0x1d4e9
:
u'z'
,
0x1d504
:
u'a'
,
0x1d505
:
u'b'
,
0x1d507
:
u'd'
,
0x1d508
:
u'e'
,
0x1d509
:
u'f'
,
0x1d50a
:
u'g'
,
0x1d50d
:
u'j'
,
0x1d50e
:
u'k'
,
0x1d50f
:
u'l'
,
0x1d510
:
u'm'
,
0x1d511
:
u'n'
,
0x1d512
:
u'o'
,
0x1d513
:
u'p'
,
0x1d514
:
u'q'
,
0x1d516
:
u's'
,
0x1d517
:
u't'
,
0x1d518
:
u'u'
,
0x1d519
:
u'v'
,
0x1d51a
:
u'w'
,
0x1d51b
:
u'x'
,
0x1d51c
:
u'y'
,
0x1d538
:
u'a'
,
0x1d539
:
u'b'
,
0x1d53b
:
u'd'
,
0x1d53c
:
u'e'
,
0x1d53d
:
u'f'
,
0x1d53e
:
u'g'
,
0x1d540
:
u'i'
,
0x1d541
:
u'j'
,
0x1d542
:
u'k'
,
0x1d543
:
u'l'
,
0x1d544
:
u'm'
,
0x1d546
:
u'o'
,
0x1d54a
:
u's'
,
0x1d54b
:
u't'
,
0x1d54c
:
u'u'
,
0x1d54d
:
u'v'
,
0x1d54e
:
u'w'
,
0x1d54f
:
u'x'
,
0x1d550
:
u'y'
,
0x1d56c
:
u'a'
,
0x1d56d
:
u'b'
,
0x1d56e
:
u'c'
,
0x1d56f
:
u'd'
,
0x1d570
:
u'e'
,
0x1d571
:
u'f'
,
0x1d572
:
u'g'
,
0x1d573
:
u'h'
,
0x1d574
:
u'i'
,
0x1d575
:
u'j'
,
0x1d576
:
u'k'
,
0x1d577
:
u'l'
,
0x1d578
:
u'm'
,
0x1d579
:
u'n'
,
0x1d57a
:
u'o'
,
0x1d57b
:
u'p'
,
0x1d57c
:
u'q'
,
0x1d57d
:
u'r'
,
0x1d57e
:
u's'
,
0x1d57f
:
u't'
,
0x1d580
:
u'u'
,
0x1d581
:
u'v'
,
0x1d582
:
u'w'
,
0x1d583
:
u'x'
,
0x1d584
:
u'y'
,
0x1d585
:
u'z'
,
0x1d5a0
:
u'a'
,
0x1d5a1
:
u'b'
,
0x1d5a2
:
u'c'
,
0x1d5a3
:
u'd'
,
0x1d5a4
:
u'e'
,
0x1d5a5
:
u'f'
,
0x1d5a6
:
u'g'
,
0x1d5a7
:
u'h'
,
0x1d5a8
:
u'i'
,
0x1d5a9
:
u'j'
,
0x1d5aa
:
u'k'
,
0x1d5ab
:
u'l'
,
0x1d5ac
:
u'm'
,
0x1d5ad
:
u'n'
,
0x1d5ae
:
u'o'
,
0x1d5af
:
u'p'
,
0x1d5b0
:
u'q'
,
0x1d5b1
:
u'r'
,
0x1d5b2
:
u's'
,
0x1d5b3
:
u't'
,
0x1d5b4
:
u'u'
,
0x1d5b5
:
u'v'
,
0x1d5b6
:
u'w'
,
0x1d5b7
:
u'x'
,
0x1d5b8
:
u'y'
,
0x1d5b9
:
u'z'
,
0x1d5d4
:
u'a'
,
0x1d5d5
:
u'b'
,
0x1d5d6
:
u'c'
,
0x1d5d7
:
u'd'
,
0x1d5d8
:
u'e'
,
0x1d5d9
:
u'f'
,
0x1d5da
:
u'g'
,
0x1d5db
:
u'h'
,
0x1d5dc
:
u'i'
,
0x1d5dd
:
u'j'
,
0x1d5de
:
u'k'
,
0x1d5df
:
u'l'
,
0x1d5e0
:
u'm'
,
0x1d5e1
:
u'n'
,
0x1d5e2
:
u'o'
,
0x1d5e3
:
u'p'
,
0x1d5e4
:
u'q'
,
0x1d5e5
:
u'r'
,
0x1d5e6
:
u's'
,
0x1d5e7
:
u't'
,
0x1d5e8
:
u'u'
,
0x1d5e9
:
u'v'
,
0x1d5ea
:
u'w'
,
0x1d5eb
:
u'x'
,
0x1d5ec
:
u'y'
,
0x1d5ed
:
u'z'
,
0x1d608
:
u'a'
,
0x1d609
:
u'b'
,
0x1d60a
:
u'c'
,
0x1d60b
:
u'd'
,
0x1d60c
:
u'e'
,
0x1d60d
:
u'f'
,
0x1d60e
:
u'g'
,
0x1d60f
:
u'h'
,
0x1d610
:
u'i'
,
0x1d611
:
u'j'
,
0x1d612
:
u'k'
,
0x1d613
:
u'l'
,
0x1d614
:
u'm'
,
0x1d615
:
u'n'
,
0x1d616
:
u'o'
,
0x1d617
:
u'p'
,
0x1d618
:
u'q'
,
0x1d619
:
u'r'
,
0x1d61a
:
u's'
,
0x1d61b
:
u't'
,
0x1d61c
:
u'u'
,
0x1d61d
:
u'v'
,
0x1d61e
:
u'w'
,
0x1d61f
:
u'x'
,
0x1d620
:
u'y'
,
0x1d621
:
u'z'
,
0x1d63c
:
u'a'
,
0x1d63d
:
u'b'
,
0x1d63e
:
u'c'
,
0x1d63f
:
u'd'
,
0x1d640
:
u'e'
,
0x1d641
:
u'f'
,
0x1d642
:
u'g'
,
0x1d643
:
u'h'
,
0x1d644
:
u'i'
,
0x1d645
:
u'j'
,
0x1d646
:
u'k'
,
0x1d647
:
u'l'
,
0x1d648
:
u'm'
,
0x1d649
:
u'n'
,
0x1d64a
:
u'o'
,
0x1d64b
:
u'p'
,
0x1d64c
:
u'q'
,
0x1d64d
:
u'r'
,
0x1d64e
:
u's'
,
0x1d64f
:
u't'
,
0x1d650
:
u'u'
,
0x1d651
:
u'v'
,
0x1d652
:
u'w'
,
0x1d653
:
u'x'
,
0x1d654
:
u'y'
,
0x1d655
:
u'z'
,
0x1d670
:
u'a'
,
0x1d671
:
u'b'
,
0x1d672
:
u'c'
,
0x1d673
:
u'd'
,
0x1d674
:
u'e'
,
0x1d675
:
u'f'
,
0x1d676
:
u'g'
,
0x1d677
:
u'h'
,
0x1d678
:
u'i'
,
0x1d679
:
u'j'
,
0x1d67a
:
u'k'
,
0x1d67b
:
u'l'
,
0x1d67c
:
u'm'
,
0x1d67d
:
u'n'
,
0x1d67e
:
u'o'
,
0x1d67f
:
u'p'
,
0x1d680
:
u'q'
,
0x1d681
:
u'r'
,
0x1d682
:
u's'
,
0x1d683
:
u't'
,
0x1d684
:
u'u'
,
0x1d685
:
u'v'
,
0x1d686
:
u'w'
,
0x1d687
:
u'x'
,
0x1d688
:
u'y'
,
0x1d689
:
u'z'
,
0x1d6a8
:
u'
\u03b1
'
,
0x1d6a9
:
u'
\u03b2
'
,
0x1d6aa
:
u'
\u03b3
'
,
0x1d6ab
:
u'
\u03b4
'
,
0x1d6ac
:
u'
\u03b5
'
,
0x1d6ad
:
u'
\u03b6
'
,
0x1d6ae
:
u'
\u03b7
'
,
0x1d6af
:
u'
\u03b8
'
,
0x1d6b0
:
u'
\u03b9
'
,
0x1d6b1
:
u'
\u03ba
'
,
0x1d6b2
:
u'
\u03bb
'
,
0x1d6b3
:
u'
\u03bc
'
,
0x1d6b4
:
u'
\u03bd
'
,
0x1d6b5
:
u'
\u03be
'
,
0x1d6b6
:
u'
\u03bf
'
,
0x1d6b7
:
u'
\u03c0
'
,
0x1d6b8
:
u'
\u03c1
'
,
0x1d6b9
:
u'
\u03b8
'
,
0x1d6ba
:
u'
\u03c3
'
,
0x1d6bb
:
u'
\u03c4
'
,
0x1d6bc
:
u'
\u03c5
'
,
0x1d6bd
:
u'
\u03c6
'
,
0x1d6be
:
u'
\u03c7
'
,
0x1d6bf
:
u'
\u03c8
'
,
0x1d6c0
:
u'
\u03c9
'
,
0x1d6d3
:
u'
\u03c3
'
,
0x1d6e2
:
u'
\u03b1
'
,
0x1d6e3
:
u'
\u03b2
'
,
0x1d6e4
:
u'
\u03b3
'
,
0x1d6e5
:
u'
\u03b4
'
,
0x1d6e6
:
u'
\u03b5
'
,
0x1d6e7
:
u'
\u03b6
'
,
0x1d6e8
:
u'
\u03b7
'
,
0x1d6e9
:
u'
\u03b8
'
,
0x1d6ea
:
u'
\u03b9
'
,
0x1d6eb
:
u'
\u03ba
'
,
0x1d6ec
:
u'
\u03bb
'
,
0x1d6ed
:
u'
\u03bc
'
,
0x1d6ee
:
u'
\u03bd
'
,
0x1d6ef
:
u'
\u03be
'
,
0x1d6f0
:
u'
\u03bf
'
,
0x1d6f1
:
u'
\u03c0
'
,
0x1d6f2
:
u'
\u03c1
'
,
0x1d6f3
:
u'
\u03b8
'
,
0x1d6f4
:
u'
\u03c3
'
,
0x1d6f5
:
u'
\u03c4
'
,
0x1d6f6
:
u'
\u03c5
'
,
0x1d6f7
:
u'
\u03c6
'
,
0x1d6f8
:
u'
\u03c7
'
,
0x1d6f9
:
u'
\u03c8
'
,
0x1d6fa
:
u'
\u03c9
'
,
0x1d70d
:
u'
\u03c3
'
,
0x1d71c
:
u'
\u03b1
'
,
0x1d71d
:
u'
\u03b2
'
,
0x1d71e
:
u'
\u03b3
'
,
0x1d71f
:
u'
\u03b4
'
,
0x1d720
:
u'
\u03b5
'
,
0x1d721
:
u'
\u03b6
'
,
0x1d722
:
u'
\u03b7
'
,
0x1d723
:
u'
\u03b8
'
,
0x1d724
:
u'
\u03b9
'
,
0x1d725
:
u'
\u03ba
'
,
0x1d726
:
u'
\u03bb
'
,
0x1d727
:
u'
\u03bc
'
,
0x1d728
:
u'
\u03bd
'
,
0x1d729
:
u'
\u03be
'
,
0x1d72a
:
u'
\u03bf
'
,
0x1d72b
:
u'
\u03c0
'
,
0x1d72c
:
u'
\u03c1
'
,
0x1d72d
:
u'
\u03b8
'
,
0x1d72e
:
u'
\u03c3
'
,
0x1d72f
:
u'
\u03c4
'
,
0x1d730
:
u'
\u03c5
'
,
0x1d731
:
u'
\u03c6
'
,
0x1d732
:
u'
\u03c7
'
,
0x1d733
:
u'
\u03c8
'
,
0x1d734
:
u'
\u03c9
'
,
0x1d747
:
u'
\u03c3
'
,
0x1d756
:
u'
\u03b1
'
,
0x1d757
:
u'
\u03b2
'
,
0x1d758
:
u'
\u03b3
'
,
0x1d759
:
u'
\u03b4
'
,
0x1d75a
:
u'
\u03b5
'
,
0x1d75b
:
u'
\u03b6
'
,
0x1d75c
:
u'
\u03b7
'
,
0x1d75d
:
u'
\u03b8
'
,
0x1d75e
:
u'
\u03b9
'
,
0x1d75f
:
u'
\u03ba
'
,
0x1d760
:
u'
\u03bb
'
,
0x1d761
:
u'
\u03bc
'
,
0x1d762
:
u'
\u03bd
'
,
0x1d763
:
u'
\u03be
'
,
0x1d764
:
u'
\u03bf
'
,
0x1d765
:
u'
\u03c0
'
,
0x1d766
:
u'
\u03c1
'
,
0x1d767
:
u'
\u03b8
'
,
0x1d768
:
u'
\u03c3
'
,
0x1d769
:
u'
\u03c4
'
,
0x1d76a
:
u'
\u03c5
'
,
0x1d76b
:
u'
\u03c6
'
,
0x1d76c
:
u'
\u03c7
'
,
0x1d76d
:
u'
\u03c8
'
,
0x1d76e
:
u'
\u03c9
'
,
0x1d781
:
u'
\u03c3
'
,
0x1d790
:
u'
\u03b1
'
,
0x1d791
:
u'
\u03b2
'
,
0x1d792
:
u'
\u03b3
'
,
0x1d793
:
u'
\u03b4
'
,
0x1d794
:
u'
\u03b5
'
,
0x1d795
:
u'
\u03b6
'
,
0x1d796
:
u'
\u03b7
'
,
0x1d797
:
u'
\u03b8
'
,
0x1d798
:
u'
\u03b9
'
,
0x1d799
:
u'
\u03ba
'
,
0x1d79a
:
u'
\u03bb
'
,
0x1d79b
:
u'
\u03bc
'
,
0x1d79c
:
u'
\u03bd
'
,
0x1d79d
:
u'
\u03be
'
,
0x1d79e
:
u'
\u03bf
'
,
0x1d79f
:
u'
\u03c0
'
,
0x1d7a0
:
u'
\u03c1
'
,
0x1d7a1
:
u'
\u03b8
'
,
0x1d7a2
:
u'
\u03c3
'
,
0x1d7a3
:
u'
\u03c4
'
,
0x1d7a4
:
u'
\u03c5
'
,
0x1d7a5
:
u'
\u03c6
'
,
0x1d7a6
:
u'
\u03c7
'
,
0x1d7a7
:
u'
\u03c8
'
,
0x1d7a8
:
u'
\u03c9
'
,
0x1d7bb
:
u'
\u03c3
'
,
}
def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()

def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = u"".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al

def in_table_c11(code):
    return code == u" "

def in_table_c12(code):
    return unicodedata.category(code) == "Zs" and code != u" "

def in_table_c11_c12(code):
    return unicodedata.category(code) == "Zs"

def in_table_c21(code):
    return ord(code) < 128 and unicodedata.category(code) == "Cc"

c22_specials = sets.Set([1757, 1807, 6158, 8204, 8205, 8232, 8233, 65279] +
                        range(8288, 8292) + range(8298, 8304) +
                        range(65529, 65533) + range(119155, 119163))
def in_table_c22(code):
    c = ord(code)
    if c < 128: return False
    if unicodedata.category(code) == "Cc": return True
    return c in c22_specials

def in_table_c21_c22(code):
    return unicodedata.category(code) == "Cc" or \
           ord(code) in c22_specials

def in_table_c3(code):
    return unicodedata.category(code) == "Co"

def in_table_c4(code):
    c = ord(code)
    if c < 0xFDD0: return False
    if c < 0xFDF0: return True
    return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)

def in_table_c5(code):
    return unicodedata.category(code) == "Cs"

c6_set = sets.Set(range(65529, 65534))
def in_table_c6(code):
    return ord(code) in c6_set

c7_set = sets.Set(range(12272, 12284))
def in_table_c7(code):
    return ord(code) in c7_set

c8_set = sets.Set([832, 833, 8206, 8207] + range(8234, 8239) +
                  range(8298, 8304))
def in_table_c8(code):
    return ord(code) in c8_set

c9_set = sets.Set([917505] + range(917536, 917632))
def in_table_c9(code):
    return ord(code) in c9_set

def in_table_d1(code):
    return unicodedata.bidirectional(code) in ("R", "AL")

def in_table_d2(code):
    return unicodedata.bidirectional(code) == "L"
Lib/test/test_codecs.py (new file mode 100644)

@@ -36,11 +36,307 @@ class RecodingTest(unittest.TestCase):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases
=
[
# A Arabic (Egyptian):
(
u"
\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644
"
u"
\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F
"
,
"egbpdaj6bu4bxfgehfvwxn"
),
# B Chinese (simplified):
(
u"
\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587
"
,
"ihqwcrb4cv8a8dqg056pqjye"
),
# C Chinese (traditional):
(
u"
\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587
"
,
"ihqwctvzc91f659drss3x8bo0yb"
),
# D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
(
u"
\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074
"
u"
\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D
"
u"
\u0065\u0073\u006B\u0079
"
,
"Proprostnemluvesky-uyb24dma41a"
),
# E Hebrew:
(
u"
\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8
"
u"
\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2
"
u"
\u05D1\u05E8\u05D9\u05EA
"
,
"4dbcagdahymbxekheh6e0a7fei0b"
),
# F Hindi (Devanagari):
(
u"
\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D
"
u"
\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939
"
u"
\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947
"
u"
\u0939\u0948\u0902
"
,
"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"
),
#(G) Japanese (kanji and hiragana):
(
u"
\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092
"
u"
\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B
"
,
"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"
),
# (H) Korean (Hangul syllables):
(
u"
\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774
"
u"
\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74
"
u"
\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C
"
,
"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
"psd879ccm6fea98c"
),
# (I) Russian (Cyrillic):
(
u"
\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E
"
u"
\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440
"
u"
\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A
"
u"
\u0438
"
,
"b1abfaaepdrnnbgefbaDotcwatmq2g4l"
),
# (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
(
u"
\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070
"
u"
\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070
"
u"
\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061
"
u"
\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070
"
u"
\u0061\u00F1\u006F\u006C
"
,
"PorqunopuedensimplementehablarenEspaol-fmd56a"
),
# (K) Vietnamese:
# T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
# <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
(
u"
\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B
"
u"
\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068
"
u"
\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067
"
u"
\u0056\u0069\u1EC7\u0074
"
,
"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"
),
#(L) 3<nen>B<gumi><kinpachi><sensei>
(
u"
\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F
"
,
"3B-ww4c5e180e575a65lsy2b"
),
# (M) <amuro><namie>-with-SUPER-MONKEYS
(
u"
\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074
"
u"
\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D
"
u"
\u004F\u004E\u004B\u0045\u0059\u0053
"
,
"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"
),
# (N) Hello-Another-Way-<sorezore><no><basho>
(
u"
\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F
"
u"
\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D
"
u"
\u305D\u308C\u305E\u308C\u306E\u5834\u6240
"
,
"Hello-Another-Way--fc4qua05auwb3674vfr0b"
),
# (O) <hitotsu><yane><no><shita>2
(
u"
\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032
"
,
"2-u9tlzr9756bt3uc0v"
),
# (P) Maji<de>Koi<suru>5<byou><mae>
(
u"
\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059
"
u"
\u308B\u0035\u79D2\u524D
"
,
"MajiKoi5-783gue6qz075azm5e"
),
# (Q) <pafii>de<runba>
(
u"
\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0
"
,
"de-jg4avhby1noc0d"
),
# (R) <sono><supiido><de>
(
u"
\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067
"
,
"d9juau41awczczp"
),
# (S) -> $1.00 <-
(
u"
\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020
"
u"
\u003C\u002D
"
,
"-> $1.00 <--"
)
]
for i in punycode_testcases:
    if len(i) != 2:
        print repr(i)

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
# 3.1 Map to nothing.
(
'foo
\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8b
bar'
'
\xe2\x80\x8b\xe2\x81\xa0
baz
\xef\xb8\x80\xef\xb8\x88\xef
'
'
\xb8\x8f\xef\xbb\xbf
'
,
'foobarbaz'
),
# 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
(
'CAFE'
,
'cafe'
),
# 3.3 Case folding 8bit U+00DF (german sharp s).
# The original test case is bogus; it says \xc3\xdf
(
'
\xc3\x9f
'
,
'ss'
),
# 3.4 Case folding U+0130 (turkish capital I with dot).
(
'
\xc4\xb0
'
,
'i
\xcc\x87
'
),
# 3.5 Case folding multibyte U+0143 U+037A.
(
'
\xc5\x83\xcd\xba
'
,
'
\xc5\x84
\xce\xb9
'
),
# 3.6 Case folding U+2121 U+33C6 U+1D7BB.
# XXX: skip this as it fails in UCS-2 mode
#('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
# 'telc\xe2\x88\x95kg\xcf\x83'),
(
None
,
None
),
# 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
(
'j
\xcc\x8c\xc2\xa0\xc2\xaa
'
,
'
\xc7\xb0
a'
),
# 3.8 Case folding U+1FB7 and normalization.
(
'
\xe1\xbe\xb7
'
,
'
\xe1\xbe\xb6\xce\xb9
'
),
# 3.9 Self-reverting case folding U+01F0 and normalization.
# The original test case is bogus, it says `\xc7\xf0'
(
'
\xc7\xb0
'
,
'
\xc7\xb0
'
),
# 3.10 Self-reverting case folding U+0390 and normalization.
(
'
\xce\x90
'
,
'
\xce\x90
'
),
# 3.11 Self-reverting case folding U+03B0 and normalization.
(
'
\xce\xb0
'
,
'
\xce\xb0
'
),
# 3.12 Self-reverting case folding U+1E96 and normalization.
(
'
\xe1\xba\x96
'
,
'
\xe1\xba\x96
'
),
# 3.13 Self-reverting case folding U+1F56 and normalization.
(
'
\xe1\xbd\x96
'
,
'
\xe1\xbd\x96
'
),
# 3.14 ASCII space character U+0020.
(
' '
,
' '
),
# 3.15 Non-ASCII 8bit space character U+00A0.
(
'
\xc2\xa0
'
,
' '
),
# 3.16 Non-ASCII multibyte space character U+1680.
(
'
\xe1\x9a\x80
'
,
None
),
# 3.17 Non-ASCII multibyte space character U+2000.
(
'
\xe2\x80\x80
'
,
' '
),
# 3.18 Zero Width Space U+200b.
(
'
\xe2\x80\x8b
'
,
''
),
# 3.19 Non-ASCII multibyte space character U+3000.
(
'
\xe3\x80\x80
'
,
' '
),
# 3.20 ASCII control characters U+0010 U+007F.
(
'
\x10\x7f
'
,
'
\x10\x7f
'
),
# 3.21 Non-ASCII 8bit control character U+0085.
(
'
\xc2\x85
'
,
None
),
# 3.22 Non-ASCII multibyte control character U+180E.
(
'
\xe1\xa0\x8e
'
,
None
),
# 3.23 Zero Width No-Break Space U+FEFF.
(
'
\xef\xbb\xbf
'
,
''
),
# 3.24 Non-ASCII control character U+1D175.
(
'
\xf0\x9d\x85\xb5
'
,
None
),
# 3.25 Plane 0 private use character U+F123.
(
'
\xef\x84\xa3
'
,
None
),
# 3.26 Plane 15 private use character U+F1234.
(
'
\xf3\xb1\x88\xb4
'
,
None
),
# 3.27 Plane 16 private use character U+10F234.
(
'
\xf4\x8f\x88\xb4
'
,
None
),
# 3.28 Non-character code point U+8FFFE.
(
'
\xf2\x8f\xbf\xbe
'
,
None
),
# 3.29 Non-character code point U+10FFFF.
(
'
\xf4\x8f\xbf\xbf
'
,
None
),
# 3.30 Surrogate code U+DF42.
(
'
\xed\xbd\x82
'
,
None
),
# 3.31 Non-plain text character U+FFFD.
(
'
\xef\xbf\xbd
'
,
None
),
# 3.32 Ideographic description character U+2FF5.
(
'
\xe2\xbf\xb5
'
,
None
),
# 3.33 Display property character U+0341.
(
'
\xcd\x81
'
,
'
\xcc\x81
'
),
# 3.34 Left-to-right mark U+200E.
(
'
\xe2\x80\x8e
'
,
None
),
# 3.35 Deprecated U+202A.
(
'
\xe2\x80\xaa
'
,
None
),
# 3.36 Language tagging character U+E0001.
(
'
\xf3\xa0\x80\x81
'
,
None
),
# 3.37 Language tagging character U+E0042.
(
'
\xf3\xa0\x81\x82
'
,
None
),
# 3.38 Bidi: RandALCat character U+05BE and LCat characters.
(
'foo
\xd6\xbe
bar'
,
None
),
# 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
(
'foo
\xef\xb5\x90
bar'
,
None
),
# 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
(
'foo
\xef\xb9\xb6
bar'
,
'foo
\xd9\x8e
bar'
),
# 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
(
'
\xd8\xa7
1'
,
None
),
# 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
(
'
\xd8\xa7
1
\xd8\xa8
'
,
'
\xd8\xa7
1
\xd8\xa8
'
),
# 3.43 Unassigned code point U+E0002.
(
'
\xf3\xa0\x80\x82
'
,
None
),
# 3.44 Larger test (shrinking).
# Original test case reads \xc3\xdf
(
'X
\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1
j
\xcc\x8c\xc2\xa0\xc2
'
'
\xaa\xce\xb0\xe2\x80\x80
'
,
'xssi
\xcc\x87
tel
\xc7\xb0
a
\xce\xb0
'
),
# 3.45 Larger test (expanding).
# Original test case reads \xc3\x9f
(
'X
\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c
'
'
\x80
'
,
'xss
\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3
'
'
\x83\x88\xe3\x83\xab
i
\xcc\x87
tel
\x28
d
\x29\xe3\x82
'
'
\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88
'
)
]
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception, e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

def test_main():
    suite = unittest.TestSuite()
    suite.addTest(unittest.makeSuite(UTF16Test))
    suite.addTest(unittest.makeSuite(EscapeDecodeTest))
    suite.addTest(unittest.makeSuite(RecodingTest))
    suite.addTest(unittest.makeSuite(PunycodeTest))
    suite.addTest(unittest.makeSuite(NameprepTest))
    test_support.run_suite(suite)
Misc/NEWS

@@ -118,6 +118,11 @@ Extension modules

Library
-------

- Support for internationalized domain names has been added through the
  'idna' and 'punycode' encodings, the 'stringprep' module, the
  'mkstringprep' tool, and enhancements to the socket and httplib modules.

- htmlentitydefs has two new dictionaries: name2codepoint maps HTML entity
  names to Unicode codepoints (as integers). codepoint2name is the reverse
  mapping. See SF patch #722017.
Modules/socketmodule.c

@@ -874,7 +874,8 @@ getsockaddrarg(PySocketSockObject *s, PyObject *args,
             args->ob_type->tp_name);
         return 0;
     }
-    if (!PyArg_ParseTuple(args, "si:getsockaddrarg", &host, &port))
+    if (!PyArg_ParseTuple(args, "eti:getsockaddrarg",
+                          "idna", &host, &port))
         return 0;
     if (setipaddr(host, (struct sockaddr *)addr, sizeof(*addr), AF_INET) < 0)
         return 0;

@@ -893,7 +894,8 @@ getsockaddrarg(PySocketSockObject *s, PyObject *args,
         int port, flowinfo, scope_id;
         addr = (struct sockaddr_in6*)&(s->sock_addr).in6;
         flowinfo = scope_id = 0;
-        if (!PyArg_ParseTuple(args, "si|ii", &host, &port, &flowinfo,
+        if (!PyArg_ParseTuple(args, "eti|ii", "idna", &host, &port,
+                              &flowinfo,
             &scope_id)) {
             return 0;
         }

@@ -2782,6 +2784,7 @@ socket_getaddrinfo(PyObject *self, PyObject *args)
 {
     struct addrinfo hints, *res;
     struct addrinfo *res0 = NULL;
+    PyObject *hobj = NULL;
     PyObject *pobj = (PyObject *)NULL;
     char pbuf[30];
     char *hptr, *pptr;

@@ -2789,12 +2792,27 @@ socket_getaddrinfo(PyObject *self, PyObject *args)
     int error;
     PyObject *all = (PyObject *)NULL;
     PyObject *single = (PyObject *)NULL;
+    PyObject *idna = NULL;

     family = socktype = protocol = flags = 0;
     family = AF_UNSPEC;
-    if (!PyArg_ParseTuple(args, "zO|iiii:getaddrinfo",
-                          &hptr, &pobj, &family, &socktype,
-                          &protocol, &flags)) {
+    if (!PyArg_ParseTuple(args, "OO|iiii:getaddrinfo",
+                          &hobj, &pobj, &family, &socktype,
+                          &protocol, &flags)) {
         return NULL;
     }
+    if (hobj == Py_None) {
+        hptr = NULL;
+    } else if (PyUnicode_Check(hobj)) {
+        idna = PyObject_CallMethod(hobj, "encode", "s", "idna");
+        if (!idna)
+            return NULL;
+        hptr = PyString_AsString(idna);
+    } else if (PyString_Check(hobj)) {
+        hptr = PyString_AsString(hobj);
+    } else {
+        PyErr_SetString(PyExc_TypeError,
+                        "getaddrinfo() argument 1 must be string or None");
+        return NULL;
+    }
     if (PyInt_Check(pobj)) {

@@ -2838,12 +2856,14 @@ socket_getaddrinfo(PyObject *self, PyObject *args)
             goto err;
         Py_XDECREF(single);
     }
+    Py_XDECREF(idna);
     if (res0)
         freeaddrinfo(res0);
     return all;
 err:
     Py_XDECREF(single);
     Py_XDECREF(all);
+    Py_XDECREF(idna);
     if (res0)
         freeaddrinfo(res0);
     return (PyObject *)NULL;
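The practical effect of the socket and httplib changes above, sketched as a hypothetical interactive session (the host name is illustrative and must actually resolve for the calls to return, so outputs are omitted):

    >>> import socket
    >>> # A Unicode host argument is now encoded with the "idna" codec before
    >>> # being handed to the C resolver, so these two calls look up the same name:
    >>> socket.getaddrinfo(u"www.alliancefran\xe7aise.nu", 80)
    >>> socket.getaddrinfo("www.xn--alliancefranaise-npb.nu", 80)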
Tools/unicode/mkstringprep.py (new file mode 100644)

import re, unicodedata, sys, sets
from sets import Set

if sys.maxunicode == 65535:
    raise RuntimeError, "need UCS-4 Python"

def gen_category(cats):
    for i in range(0, 0x110000):
        if unicodedata.category(unichr(i)) in cats:
            yield(i)

def gen_bidirectional(cats):
    for i in range(0, 0x110000):
        if unicodedata.bidirectional(unichr(i)) in cats:
            yield(i)

def compact_set(l):
    single = []
    tuple = []
    prev = None
    span = 0
    for e in l:
        if prev is None:
            prev = e
            span = 0
            continue
        if prev + span + 1 != e:
            if span > 2:
                tuple.append((prev, prev + span + 1))
            else:
                for i in range(prev, prev + span + 1):
                    single.append(i)
            prev = e
            span = 0
        else:
            span += 1
    if span:
        tuple.append((prev, prev + span + 1))
    else:
        single.append(prev)
    tuple = " + ".join(["range(%d,%d)" % t for t in tuple])
    if not single:
        return "sets.Set(%s)" % tuple
    if not tuple:
        return "sets.Set(%s)" % repr(single)
    return "sets.Set(%s + %s)" % (repr(single), tuple)
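# Editor's illustration (not part of the original file): compact_set turns a
# sorted list of code points into Python source for a sets.Set, folding runs
# of four or more consecutive values into range() calls. For example:
#
#   compact_set([1, 2, 3, 10])        -> "sets.Set([1, 2, 3, 10])"
#   compact_set([1, 2, 3, 4, 5, 20])  -> "sets.Set([20] + range(1,6))"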
############## Read the tables in the RFC #######################

data = open("rfc3454.txt").readlines()

tables = []
curname = None
for l in data:
    l = l.strip()
    if not l:
        continue
    # Skip RFC page breaks
    if l.startswith("Hoffman & Blanchet") or \
       l.startswith("RFC 3454"):
        continue
    # Find start/end lines
    m = re.match("----- (Start|End) Table ([A-Z](.[0-9])+) -----", l)
    if m:
        if m.group(1) == "Start":
            if curname:
                raise "Double Start", (curname, l)
            curname = m.group(2)
            table = {}
            tables.append((curname, table))
            continue
        else:
            if not curname:
                raise "End without start", l
            curname = None
            continue
    if not curname:
        continue
    # Now we are in a table
    fields = l.split(";")
    if len(fields) > 1:
        # Drop comment field
        fields = fields[:-1]
    if len(fields) == 1:
        fields = fields[0].split("-")
        if len(fields) > 1:
            # range
            try:
                start, end = fields
            except ValueError:
                raise "Unpacking problem", l
        else:
            start = end = fields[0]
        start = int(start, 16)
        end = int(end, 16)
        for i in range(start, end+1):
            table[i] = i
    else:
        code, value = fields
        value = value.strip()
        if value:
            value = [int(v, 16) for v in value.split(" ")]
        else:
            # table B.1
            value = None
        table[int(code, 16)] = value
########### Generate compact Python versions of the tables #############
print
"""# This file is generated by mkstringprep.py. DO NOT EDIT.
\"\"\"
Library that exposes various tables found in the StringPrep RFC 3454.
There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
\"\"\"
import unicodedata, sets
"""
print
"assert unicodedata.unidata_version ==
%
s"
%
repr
(
unicodedata
.
unidata_version
)
# A.1 is the table of unassigned characters
# XXX Plane 15 PUA is listed as unassigned in Python.
name
,
table
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"A.1"
table
=
Set
(
table
.
keys
())
Cn
=
Set
(
gen_category
([
"Cn"
]))
# FDD0..FDEF are process internal codes
Cn
-=
Set
(
range
(
0xFDD0
,
0xFDF0
))
# not a character
Cn
-=
Set
(
range
(
0xFFFE
,
0x110000
,
0x10000
))
Cn
-=
Set
(
range
(
0xFFFF
,
0x110000
,
0x10000
))
# assert table == Cn
print
"""
def in_table_a1(code):
if unicodedata.category(code) != 'Cn': return False
c = ord(code)
if 0xFDD0 <= c < 0xFDF0: return False
return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
"""
# B.1 cannot easily be derived
name
,
table
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"B.1"
table
=
table
.
keys
()
table
.
sort
()
print
"""
b1_set = """
+
compact_set
(
table
)
+
"""
def in_table_b1(code):
return ord(code) in b1_set
"""
# B.2 and B.3 is case folding.
# It takes CaseFolding.txt into account, which is
# not available in the Python database. Since
# B.2 is derived from B.3, we process B.3 first.
# B.3 supposedly *is* CaseFolding-3.2.0.txt.
name
,
table_b2
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"B.2"
name
,
table_b3
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"B.3"
# B.3 is mostly Python's .lower, except for a number
# of special cases, e.g. considering canonical forms.
b3_exceptions
=
{}
for
k
,
v
in
table_b2
.
items
():
if
map
(
ord
,
unichr
(
k
)
.
lower
())
!=
v
:
b3_exceptions
[
k
]
=
u""
.
join
(
map
(
unichr
,
v
))
b3
=
b3_exceptions
.
items
()
b3
.
sort
()
print
"""
b3_exceptions = {"""
for
i
,(
k
,
v
)
in
enumerate
(
b3
):
print
"0x
%
x:
%
s,"
%
(
k
,
repr
(
v
)),
if
i
%
4
==
3
:
print
print
"}"
print
"""
def map_table_b3(code):
r = b3_exceptions.get(ord(code))
if r is not None: return r
return code.lower()
"""
def
map_table_b3
(
code
):
r
=
b3_exceptions
.
get
(
ord
(
code
))
if
r
is
not
None
:
return
r
return
code
.
lower
()
# B.2 is case folding for NFKC. This is the same as B.3,
# except where NormalizeWithKC(Fold(a)) !=
# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))
def
map_table_b2
(
a
):
al
=
map_table_b3
(
a
)
b
=
unicodedata
.
normalize
(
"NFKC"
,
al
)
bl
=
u""
.
join
([
map_table_b3
(
ch
)
for
ch
in
b
])
c
=
unicodedata
.
normalize
(
"NFKC"
,
bl
)
if
b
!=
c
:
return
c
else
:
return
al
specials
=
{}
for
k
,
v
in
table_b2
.
items
():
if
map
(
ord
,
map_table_b2
(
unichr
(
k
)))
!=
v
:
specials
[
k
]
=
v
# B.3 should not add any additional special cases
assert
specials
==
{}
print
"""
def map_table_b2(a):
al = map_table_b3(a)
b = unicodedata.normalize("NFKC", al)
bl = u"".join([map_table_b3(ch) for ch in b])
c = unicodedata.normalize("NFKC", bl)
if b != c:
return c
else:
return al
"""
# C.1.1 is a table with a single character
name
,
table
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"C.1.1"
assert
table
==
{
0x20
:
0x20
}
print
"""
def in_table_c11(code):
return code == u" "
"""
# C.1.2 is the rest of all space characters
name
,
table
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"C.1.2"
# table = Set(table.keys())
# Zs = Set(gen_category(["Zs"])) - Set([0x20])
# assert Zs == table
print
"""
def in_table_c12(code):
return unicodedata.category(code) == "Zs" and code != u" "
def in_table_c11_c12(code):
return unicodedata.category(code) == "Zs"
"""
# C.2.1 ASCII control characters
name
,
table_c21
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"C.2.1"
Cc
=
Set
(
gen_category
([
"Cc"
]))
Cc_ascii
=
Cc
&
Set
(
range
(
128
))
table_c21
=
Set
(
table_c21
.
keys
())
assert
Cc_ascii
==
table_c21
print
"""
def in_table_c21(code):
return ord(code) < 128 and unicodedata.category(code) == "Cc"
"""
# C.2.2 Non-ASCII control characters. It also includes
# a number of characters in category Cf.
name
,
table_c22
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"C.2.2"
Cc_nonascii
=
Cc
-
Cc_ascii
table_c22
=
Set
(
table_c22
.
keys
())
assert
len
(
Cc_nonascii
-
table_c22
)
==
0
specials
=
list
(
table_c22
-
Cc_nonascii
)
specials
.
sort
()
print
"""c22_specials = """
+
compact_set
(
specials
)
+
"""
def in_table_c22(code):
c = ord(code)
if c < 128: return False
if unicodedata.category(code) == "Cc": return True
return c in c22_specials
def in_table_c21_c22(code):
return unicodedata.category(code) == "Cc" or
\\
ord(code) in c22_specials
"""
# C.3 Private use
name
,
table
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"C.3"
Co
=
Set
(
gen_category
([
"Co"
]))
assert
Set
(
table
.
keys
())
==
Co
print
"""
def in_table_c3(code):
return unicodedata.category(code) == "Co"
"""
# C.4 Non-character code points, xFFFE, xFFFF
# plus process internal codes
name
,
table
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"C.4"
nonchar
=
Set
(
range
(
0xFDD0
,
0xFDF0
)
+
range
(
0xFFFE
,
0x110000
,
0x10000
)
+
range
(
0xFFFF
,
0x110000
,
0x10000
))
table
=
Set
(
table
.
keys
())
assert
table
==
nonchar
print
"""
def in_table_c4(code):
c = ord(code)
if c < 0xFDD0: return False
if c < 0xFDF0: return True
return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
"""
# C.5 Surrogate codes
name
,
table
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"C.5"
Cs
=
Set
(
gen_category
([
"Cs"
]))
assert
Set
(
table
.
keys
())
==
Cs
print
"""
def in_table_c5(code):
return unicodedata.category(code) == "Cs"
"""
# C.6 Inappropriate for plain text
name
,
table
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"C.6"
table
=
table
.
keys
()
table
.
sort
()
print
"""
c6_set = """
+
compact_set
(
table
)
+
"""
def in_table_c6(code):
return ord(code) in c6_set
"""
# C.7 Inappropriate for canonical representation
name
,
table
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"C.7"
table
=
table
.
keys
()
table
.
sort
()
print
"""
c7_set = """
+
compact_set
(
table
)
+
"""
def in_table_c7(code):
return ord(code) in c7_set
"""
# C.8 Change display properties or are deprecated
name
,
table
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"C.8"
table
=
table
.
keys
()
table
.
sort
()
print
"""
c8_set = """
+
compact_set
(
table
)
+
"""
def in_table_c8(code):
return ord(code) in c8_set
"""
# C.9 Tagging characters
name
,
table
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"C.9"
table
=
table
.
keys
()
table
.
sort
()
print
"""
c9_set = """
+
compact_set
(
table
)
+
"""
def in_table_c9(code):
return ord(code) in c9_set
"""
# D.1 Characters with bidirectional property "R" or "AL"
name
,
table
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"D.1"
RandAL
=
Set
(
gen_bidirectional
([
"R"
,
"AL"
]))
assert
Set
(
table
.
keys
())
==
RandAL
print
"""
def in_table_d1(code):
return unicodedata.bidirectional(code) in ("R","AL")
"""
# D.2 Characters with bidirectional property "L"
name
,
table
=
tables
[
0
]
del
tables
[
0
]
assert
name
==
"D.2"
L
=
Set
(
gen_bidirectional
([
"L"
]))
assert
Set
(
table
.
keys
())
==
L
print
"""
def in_table_d2(code):
return unicodedata.bidirectional(code) == "L"
"""