Kaydet (Commit) d6d06ade authored tarafından Marc-André Lemburg's avatar Marc-André Lemburg

Tests for new surrogate support in the UTF-8 codec. By Bill Tutt.

üst fa1309fd
...@@ -168,6 +168,57 @@ assert 'abc' < u'abcd' ...@@ -168,6 +168,57 @@ assert 'abc' < u'abcd'
assert u'abc' < u'abcd' assert u'abc' < u'abcd'
print 'done.' print 'done.'
print 'Testing UTF-16 code point order comparisons...',
#No surrogates, no fixup required.
assert u'\u0061' < u'\u20ac'
# Non surrogate below surrogate value, no fixup required
assert u'\u0061' < u'\ud800\udc02'
# Non surrogate above surrogate value, fixup required
def test_lecmp(s, s2):
assert s < s2 , "comparison failed on %s < %s" % (s, s2)
def test_fixup(s):
s2 = u'\ud800\udc01'
test_lecmp(s, s2)
s2 = u'\ud900\udc01'
test_lecmp(s, s2)
s2 = u'\uda00\udc01'
test_lecmp(s, s2)
s2 = u'\udb00\udc01'
test_lecmp(s, s2)
s2 = u'\ud800\udd01'
test_lecmp(s, s2)
s2 = u'\ud900\udd01'
test_lecmp(s, s2)
s2 = u'\uda00\udd01'
test_lecmp(s, s2)
s2 = u'\udb00\udd01'
test_lecmp(s, s2)
s2 = u'\ud800\ude01'
test_lecmp(s, s2)
s2 = u'\ud900\ude01'
test_lecmp(s, s2)
s2 = u'\uda00\ude01'
test_lecmp(s, s2)
s2 = u'\udb00\ude01'
test_lecmp(s, s2)
s2 = u'\ud800\udfff'
test_lecmp(s, s2)
s2 = u'\ud900\udfff'
test_lecmp(s, s2)
s2 = u'\uda00\udfff'
test_lecmp(s, s2)
s2 = u'\udb00\udfff'
test_lecmp(s, s2)
test_fixup(u'\ue000')
test_fixup(u'\uff61')
# Surrogates on both sides, no fixup required
assert u'\ud800\udc02' < u'\ud84d\udc56'
print 'done.'
test('ljust', u'abc', u'abc ', 10) test('ljust', u'abc', u'abc ', 10)
test('rjust', u'abc', u' abc', 10) test('rjust', u'abc', u' abc', 10)
test('center', u'abc', u' abc ', 10) test('center', u'abc', u' abc ', 10)
...@@ -293,6 +344,27 @@ print 'done.' ...@@ -293,6 +344,27 @@ print 'done.'
# Test builtin codecs # Test builtin codecs
print 'Testing builtin codecs...', print 'Testing builtin codecs...',
# UTF-8 specific encoding tests:
assert u'\u20ac'.encode('utf-8') == \
''.join((chr(0xe2), chr(0x82), chr(0xac)))
assert u'\ud800\udc02'.encode('utf-8') == \
''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82)))
assert u'\ud84d\udc56'.encode('utf-8') == \
''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96)))
# UTF-8 specific decoding tests
assert unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
'utf-8') == u'\ud84d\udc56'
assert unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
'utf-8') == u'\ud800\udc02'
assert unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
'utf-8') == u'\u20ac'
# Other possible utf-8 test cases:
# * strict decoding testing for all of the
# UTF8_ERROR cases in PyUnicode_DecodeUTF8
assert unicode('hello','ascii') == u'hello' assert unicode('hello','ascii') == u'hello'
assert unicode('hello','utf-8') == u'hello' assert unicode('hello','utf-8') == u'hello'
assert unicode('hello','utf8') == u'hello' assert unicode('hello','utf8') == u'hello'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment