Commit 0ac30f82, authored by Walter Dörwald

Enhance the punycode decoder so that it can decode unicode objects.

Fix the idna codec and the tests.
parent 1f05a3b7
......@@ -7,7 +7,8 @@ from unicodedata import ucd_3_2_0 as unicodedata
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")
# IDNA section 5
ace_prefix = "xn--"
ace_prefix = b"xn--"
sace_prefix = "xn--"
# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
......@@ -87,7 +88,7 @@ def ToASCII(label):
raise UnicodeError("label empty or too long")
# Step 5: Check ACE prefix
if label.startswith(ace_prefix):
if label.startswith(sace_prefix):
raise UnicodeError("Label starts with ACE prefix")
# Step 6: Encode with PUNYCODE
......@@ -134,7 +135,7 @@ def ToUnicode(label):
# Step 7: Compare the result of step 6 with the one of step 3
# label2 will already be in lower case.
if label.lower() != label2:
if str(label, "ascii").lower() != str(label2, "ascii"):
raise UnicodeError("IDNA does not round-trip", label, label2)
# Step 8: return the result of step 5
......@@ -143,7 +144,7 @@ def ToUnicode(label):
### Codec APIs
class Codec(codecs.Codec):
def encode(self,input,errors='strict'):
def encode(self, input, errors='strict'):
if errors != 'strict':
# IDNA is quite clear that implementations must be strict
......@@ -152,19 +153,21 @@ class Codec(codecs.Codec):
if not input:
return b"", 0
result = []
result = b""
labels = dots.split(input)
if labels and len(labels[-1])==0:
if labels and not labels[-1]:
trailing_dot = b'.'
del labels[-1]
else:
trailing_dot = b''
for label in labels:
result.append(ToASCII(label))
if result:
# Join with U+002E
return b".".join(result)+trailing_dot, len(input)
result.extend(b'.')
result.extend(ToASCII(label))
return result+trailing_dot, len(input)
def decode(self,input,errors='strict'):
def decode(self, input, errors='strict'):
if errors != 'strict':
raise UnicodeError("Unsupported error handling "+errors)
......@@ -199,30 +202,31 @@ class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
raise UnicodeError("unsupported error handling "+errors)
if not input:
return ("", 0)
return (b'', 0)
labels = dots.split(input)
trailing_dot = ''
trailing_dot = b''
if labels:
if not labels[-1]:
trailing_dot = '.'
trailing_dot = b'.'
del labels[-1]
elif not final:
# Keep potentially unfinished label until the next call
del labels[-1]
if labels:
trailing_dot = '.'
trailing_dot = b'.'
result = []
result = b""
size = 0
for label in labels:
result.append(ToASCII(label))
if size:
# Join with U+002E
result.extend(b'.')
size += 1
result.extend(ToASCII(label))
size += len(label)
# Join with U+002E
result = ".".join(result) + trailing_dot
result += trailing_dot
size += len(trailing_dot)
return (result, size)
......@@ -239,8 +243,7 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
labels = dots.split(input)
else:
# Must be ASCII string
input = str(input)
str(input, "ascii")
input = str(input, "ascii")
labels = input.split(".")
trailing_dot = ''
......
......@@ -181,6 +181,8 @@ def insertion_sort(base, extended, errors):
return base
def punycode_decode(text, errors):
if isinstance(text, str):
text = text.encode("ascii")
pos = text.rfind(b"-")
if pos == -1:
base = ""
......@@ -194,11 +196,11 @@ def punycode_decode(text, errors):
class Codec(codecs.Codec):
def encode(self,input,errors='strict'):
def encode(self, input, errors='strict'):
res = punycode_encode(input)
return res, len(input)
def decode(self,input,errors='strict'):
def decode(self, input, errors='strict'):
if errors not in ('strict', 'replace', 'ignore'):
raise UnicodeError, "Unsupported error handling "+errors
res = punycode_decode(input, errors)
......
......@@ -624,6 +624,7 @@ class PunycodeTest(unittest.TestCase):
def test_decode(self):
for uni, puny in punycode_testcases:
self.assertEquals(uni, puny.decode("punycode"))
self.assertEquals(uni, puny.decode("ascii").decode("punycode"))
class UnicodeInternalTest(unittest.TestCase):
def test_bug1251300(self):
......@@ -676,154 +677,154 @@ class UnicodeInternalTest(unittest.TestCase):
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
# 3.1 Map to nothing.
('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
'\xb8\x8f\xef\xbb\xbf',
'foobarbaz'),
(b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
b'\xb8\x8f\xef\xbb\xbf',
b'foobarbaz'),
# 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
('CAFE',
'cafe'),
(b'CAFE',
b'cafe'),
# 3.3 Case folding 8bit U+00DF (german sharp s).
# The original test case is bogus; it says \xc3\xdf
('\xc3\x9f',
'ss'),
(b'\xc3\x9f',
b'ss'),
# 3.4 Case folding U+0130 (turkish capital I with dot).
('\xc4\xb0',
'i\xcc\x87'),
(b'\xc4\xb0',
b'i\xcc\x87'),
# 3.5 Case folding multibyte U+0143 U+037A.
('\xc5\x83\xcd\xba',
'\xc5\x84 \xce\xb9'),
(b'\xc5\x83\xcd\xba',
b'\xc5\x84 \xce\xb9'),
# 3.6 Case folding U+2121 U+33C6 U+1D7BB.
# XXX: skip this as it fails in UCS-2 mode
#('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
# 'telc\xe2\x88\x95kg\xcf\x83'),
(None, None),
# 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
('j\xcc\x8c\xc2\xa0\xc2\xaa',
'\xc7\xb0 a'),
(b'j\xcc\x8c\xc2\xa0\xc2\xaa',
b'\xc7\xb0 a'),
# 3.8 Case folding U+1FB7 and normalization.
('\xe1\xbe\xb7',
'\xe1\xbe\xb6\xce\xb9'),
(b'\xe1\xbe\xb7',
b'\xe1\xbe\xb6\xce\xb9'),
# 3.9 Self-reverting case folding U+01F0 and normalization.
# The original test case is bogus, it says `\xc7\xf0'
('\xc7\xb0',
'\xc7\xb0'),
(b'\xc7\xb0',
b'\xc7\xb0'),
# 3.10 Self-reverting case folding U+0390 and normalization.
('\xce\x90',
'\xce\x90'),
(b'\xce\x90',
b'\xce\x90'),
# 3.11 Self-reverting case folding U+03B0 and normalization.
('\xce\xb0',
'\xce\xb0'),
(b'\xce\xb0',
b'\xce\xb0'),
# 3.12 Self-reverting case folding U+1E96 and normalization.
('\xe1\xba\x96',
'\xe1\xba\x96'),
(b'\xe1\xba\x96',
b'\xe1\xba\x96'),
# 3.13 Self-reverting case folding U+1F56 and normalization.
('\xe1\xbd\x96',
'\xe1\xbd\x96'),
(b'\xe1\xbd\x96',
b'\xe1\xbd\x96'),
# 3.14 ASCII space character U+0020.
(' ',
' '),
(b' ',
b' '),
# 3.15 Non-ASCII 8bit space character U+00A0.
('\xc2\xa0',
' '),
(b'\xc2\xa0',
b' '),
# 3.16 Non-ASCII multibyte space character U+1680.
('\xe1\x9a\x80',
(b'\xe1\x9a\x80',
None),
# 3.17 Non-ASCII multibyte space character U+2000.
('\xe2\x80\x80',
' '),
(b'\xe2\x80\x80',
b' '),
# 3.18 Zero Width Space U+200b.
('\xe2\x80\x8b',
''),
(b'\xe2\x80\x8b',
b''),
# 3.19 Non-ASCII multibyte space character U+3000.
('\xe3\x80\x80',
' '),
(b'\xe3\x80\x80',
b' '),
# 3.20 ASCII control characters U+0010 U+007F.
('\x10\x7f',
'\x10\x7f'),
(b'\x10\x7f',
b'\x10\x7f'),
# 3.21 Non-ASCII 8bit control character U+0085.
('\xc2\x85',
(b'\xc2\x85',
None),
# 3.22 Non-ASCII multibyte control character U+180E.
('\xe1\xa0\x8e',
(b'\xe1\xa0\x8e',
None),
# 3.23 Zero Width No-Break Space U+FEFF.
('\xef\xbb\xbf',
''),
(b'\xef\xbb\xbf',
b''),
# 3.24 Non-ASCII control character U+1D175.
('\xf0\x9d\x85\xb5',
(b'\xf0\x9d\x85\xb5',
None),
# 3.25 Plane 0 private use character U+F123.
('\xef\x84\xa3',
(b'\xef\x84\xa3',
None),
# 3.26 Plane 15 private use character U+F1234.
('\xf3\xb1\x88\xb4',
(b'\xf3\xb1\x88\xb4',
None),
# 3.27 Plane 16 private use character U+10F234.
('\xf4\x8f\x88\xb4',
(b'\xf4\x8f\x88\xb4',
None),
# 3.28 Non-character code point U+8FFFE.
('\xf2\x8f\xbf\xbe',
(b'\xf2\x8f\xbf\xbe',
None),
# 3.29 Non-character code point U+10FFFF.
('\xf4\x8f\xbf\xbf',
(b'\xf4\x8f\xbf\xbf',
None),
# 3.30 Surrogate code U+DF42.
('\xed\xbd\x82',
(b'\xed\xbd\x82',
None),
# 3.31 Non-plain text character U+FFFD.
('\xef\xbf\xbd',
(b'\xef\xbf\xbd',
None),
# 3.32 Ideographic description character U+2FF5.
('\xe2\xbf\xb5',
(b'\xe2\xbf\xb5',
None),
# 3.33 Display property character U+0341.
('\xcd\x81',
'\xcc\x81'),
(b'\xcd\x81',
b'\xcc\x81'),
# 3.34 Left-to-right mark U+200E.
('\xe2\x80\x8e',
(b'\xe2\x80\x8e',
None),
# 3.35 Deprecated U+202A.
('\xe2\x80\xaa',
(b'\xe2\x80\xaa',
None),
# 3.36 Language tagging character U+E0001.
('\xf3\xa0\x80\x81',
(b'\xf3\xa0\x80\x81',
None),
# 3.37 Language tagging character U+E0042.
('\xf3\xa0\x81\x82',
(b'\xf3\xa0\x81\x82',
None),
# 3.38 Bidi: RandALCat character U+05BE and LCat characters.
('foo\xd6\xbebar',
(b'foo\xd6\xbebar',
None),
# 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
('foo\xef\xb5\x90bar',
(b'foo\xef\xb5\x90bar',
None),
# 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
('foo\xef\xb9\xb6bar',
'foo \xd9\x8ebar'),
(b'foo\xef\xb9\xb6bar',
b'foo \xd9\x8ebar'),
# 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
('\xd8\xa71',
(b'\xd8\xa71',
None),
# 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
('\xd8\xa71\xd8\xa8',
'\xd8\xa71\xd8\xa8'),
(b'\xd8\xa71\xd8\xa8',
b'\xd8\xa71\xd8\xa8'),
# 3.43 Unassigned code point U+E0002.
# Skip this test as we allow unassigned
#('\xf3\xa0\x80\x82',
#(b'\xf3\xa0\x80\x82',
# None),
(None, None),
# 3.44 Larger test (shrinking).
# Original test case reads \xc3\xdf
('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
'\xaa\xce\xb0\xe2\x80\x80',
'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
(b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
b'\xaa\xce\xb0\xe2\x80\x80',
b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
# 3.45 Larger test (expanding).
# Original test case reads \xc3\x9f
('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
'\x80',
'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
(b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
b'\x80',
b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
]
......@@ -848,16 +849,16 @@ class NameprepTest(unittest.TestCase):
class IDNACodecTest(unittest.TestCase):
def test_builtin_decode(self):
self.assertEquals(str("python.org", "idna"), "python.org")
self.assertEquals(str("python.org.", "idna"), "python.org.")
self.assertEquals(str("xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
self.assertEquals(str("xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")
self.assertEquals(str(b"python.org", "idna"), "python.org")
self.assertEquals(str(b"python.org.", "idna"), "python.org.")
self.assertEquals(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
self.assertEquals(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")
def test_builtin_encode(self):
self.assertEquals("python.org".encode("idna"), "python.org")
self.assertEquals("python.org.".encode("idna"), "python.org.")
self.assertEquals("pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
self.assertEquals("pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
self.assertEquals("python.org".encode("idna"), b"python.org")
self.assertEquals("python.org.".encode("idna"), b"python.org.")
self.assertEquals("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
self.assertEquals("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")
def test_stream(self):
r = codecs.getreader("idna")(io.BytesIO(b"abc"))
......@@ -866,61 +867,61 @@ class IDNACodecTest(unittest.TestCase):
def test_incremental_decode(self):
self.assertEquals(
"".join(codecs.iterdecode("python.org", "idna")),
"".join(codecs.iterdecode((bytes(chr(c)) for c in b"python.org"), "idna")),
"python.org"
)
self.assertEquals(
"".join(codecs.iterdecode("python.org.", "idna")),
"".join(codecs.iterdecode((bytes(chr(c)) for c in b"python.org."), "idna")),
"python.org."
)
self.assertEquals(
"".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
"".join(codecs.iterdecode((bytes(chr(c)) for c in b"xn--pythn-mua.org."), "idna")),
"pyth\xf6n.org."
)
self.assertEquals(
"".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
"".join(codecs.iterdecode((bytes(chr(c)) for c in b"xn--pythn-mua.org."), "idna")),
"pyth\xf6n.org."
)
decoder = codecs.getincrementaldecoder("idna")()
self.assertEquals(decoder.decode("xn--xam", ), "")
self.assertEquals(decoder.decode("ple-9ta.o", ), "\xe4xample.")
self.assertEquals(decoder.decode("rg"), "")
self.assertEquals(decoder.decode("", True), "org")
self.assertEquals(decoder.decode(b"xn--xam", ), "")
self.assertEquals(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
self.assertEquals(decoder.decode(b"rg"), "")
self.assertEquals(decoder.decode(b"", True), "org")
decoder.reset()
self.assertEquals(decoder.decode("xn--xam", ), "")
self.assertEquals(decoder.decode("ple-9ta.o", ), "\xe4xample.")
self.assertEquals(decoder.decode("rg."), "org.")
self.assertEquals(decoder.decode("", True), "")
self.assertEquals(decoder.decode(b"xn--xam", ), "")
self.assertEquals(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
self.assertEquals(decoder.decode(b"rg."), "org.")
self.assertEquals(decoder.decode(b"", True), "")
def test_incremental_encode(self):
self.assertEquals(
"".join(codecs.iterencode("python.org", "idna")),
"python.org"
b"".join(codecs.iterencode("python.org", "idna")),
b"python.org"
)
self.assertEquals(
"".join(codecs.iterencode("python.org.", "idna")),
"python.org."
b"".join(codecs.iterencode("python.org.", "idna")),
b"python.org."
)
self.assertEquals(
"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
"xn--pythn-mua.org."
b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
b"xn--pythn-mua.org."
)
self.assertEquals(
"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
"xn--pythn-mua.org."
b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
b"xn--pythn-mua.org."
)
encoder = codecs.getincrementalencoder("idna")()
self.assertEquals(encoder.encode("\xe4x"), "")
self.assertEquals(encoder.encode("ample.org"), "xn--xample-9ta.")
self.assertEquals(encoder.encode("", True), "org")
self.assertEquals(encoder.encode("\xe4x"), b"")
self.assertEquals(encoder.encode("ample.org"), b"xn--xample-9ta.")
self.assertEquals(encoder.encode("", True), b"org")
encoder.reset()
self.assertEquals(encoder.encode("\xe4x"), "")
self.assertEquals(encoder.encode("ample.org."), "xn--xample-9ta.org.")
self.assertEquals(encoder.encode("", True), "")
self.assertEquals(encoder.encode("\xe4x"), b"")
self.assertEquals(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
self.assertEquals(encoder.encode("", True), b"")
class CodecsModuleTest(unittest.TestCase):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment