Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
c60e6f77
Kaydet (Commit)
c60e6f77
authored
Eyl 20, 2001
tarafından
Marc-André Lemburg
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
Patch #435971: UTF-7 codec by Brian Quinlan.
üst
26e3b681
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
392 additions
and
1 deletion
+392
-1
unicodeobject.h
Include/unicodeobject.h
+18
-0
aliases.py
Lib/encodings/aliases.py
+4
-0
test_unicode.py
Lib/test/test_unicode.py
+28
-1
_codecsmodule.c
Modules/_codecsmodule.c
+42
-0
unicodeobject.c
Objects/unicodeobject.c
+300
-0
No files found.
Include/unicodeobject.h
Dosyayı görüntüle @
c60e6f77
...
@@ -607,6 +607,24 @@ extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
...
@@ -607,6 +607,24 @@ extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
const
char
*
errors
/* error handling */
const
char
*
errors
/* error handling */
);
);
/* --- UTF-7 Codecs ------------------------------------------------------- */
/* Decode `length` bytes of UTF-7 encoded data starting at `string` and
   return the result as a Unicode object.  `errors` names the error
   handling mode; NULL is accepted. */
extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF7(
    const char *string,         /* UTF-7 encoded string */
    int length,                 /* size of string */
    const char *errors          /* error handling */
    );
/* Encode `length` Py_UNICODE characters starting at `data` as UTF-7.
   The two flags select whether the optional character classes of
   RFC2152 are base64-encoded as well instead of being written directly. */
extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF7(
    const Py_UNICODE *data,     /* Unicode char buffer */
    int length,                 /* number of Py_UNICODE chars to encode */
    int encodeSetO,             /* force the encoder to encode characters in
                                   Set O, as described in RFC2152 */
    int encodeWhiteSpace,       /* force the encoder to encode space, tab,
                                   carriage return and linefeed characters */
    const char *errors          /* error handling */
    );
/* --- UTF-8 Codecs ------------------------------------------------------- */
/* --- UTF-8 Codecs ------------------------------------------------------- */
extern
DL_IMPORT
(
PyObject
*
)
PyUnicode_DecodeUTF8
(
extern
DL_IMPORT
(
PyObject
*
)
PyUnicode_DecodeUTF8
(
...
...
Lib/encodings/aliases.py
Dosyayı görüntüle @
c60e6f77
...
@@ -14,6 +14,10 @@ aliases = {
...
@@ -14,6 +14,10 @@ aliases = {
'latin'
:
'latin_1'
,
'latin'
:
'latin_1'
,
'latin1'
:
'latin_1'
,
'latin1'
:
'latin_1'
,
# UTF-7
'utf7'
:
'utf_7'
,
'u7'
:
'utf_7'
,
# UTF-8
# UTF-8
'utf'
:
'utf_8'
,
'utf'
:
'utf_8'
,
'utf8'
:
'utf_8'
,
'utf8'
:
'utf_8'
,
...
...
Lib/test/test_unicode.py
Dosyayı görüntüle @
c60e6f77
...
@@ -377,6 +377,32 @@ print 'done.'
...
@@ -377,6 +377,32 @@ print 'done.'
# Test builtin codecs
# Test builtin codecs
print
'Testing builtin codecs...'
,
print
'Testing builtin codecs...'
,
# UTF-7 specific encoding tests:
utfTests
=
[(
u'A
\u2262\u0391
.'
,
'A+ImIDkQ.'
),
# RFC2152 example
(
u'Hi Mom -
\u263a
-!'
,
'Hi Mom -+Jjo--!'
),
# RFC2152 example
(
u'
\u65E5\u672C\u8A9E
'
,
'+ZeVnLIqe-'
),
# RFC2152 example
(
u'Item 3 is
\u00a3
1.'
,
'Item 3 is +AKM-1.'
),
# RFC2152 example
(
u'+'
,
'+-'
),
(
u'+-'
,
'+--'
),
(
u'+?'
,
'+-?'
),
(
u'
\
?'
,
'+AFw?'
),
(
u'+?'
,
'+-?'
),
(
ur'\\?'
,
'+AFwAXA?'
),
(
ur'\\\?'
,
'+AFwAXABc?'
),
(
ur'++--'
,
'+-+---'
)]
for
x
,
y
in
utfTests
:
verify
(
x
.
encode
(
'utf-7'
)
==
y
)
try
:
unicode
(
'+3ADYAA-'
,
'utf-7'
)
# surrogates not supported
except
UnicodeError
:
pass
else
:
raise
TestFailed
,
"unicode('+3ADYAA-', 'utf-7') failed to raise an exception"
verify
(
unicode
(
'+3ADYAA-'
,
'utf-7'
,
'replace'
)
==
u'
\ufffd
'
)
# UTF-8 specific encoding tests:
# UTF-8 specific encoding tests:
verify
(
u'
\u20ac
'
.
encode
(
'utf-8'
)
==
\
verify
(
u'
\u20ac
'
.
encode
(
'utf-8'
)
==
\
''
.
join
((
chr
(
0xe2
),
chr
(
0x82
),
chr
(
0xac
)))
)
''
.
join
((
chr
(
0xe2
),
chr
(
0x82
),
chr
(
0xac
)))
)
...
@@ -439,6 +465,7 @@ verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
...
@@ -439,6 +465,7 @@ verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
verify
(
unicode
(
'Andr
\202
x'
,
'ascii'
,
'replace'
)
==
u'Andr
\uFFFD
x'
)
verify
(
unicode
(
'Andr
\202
x'
,
'ascii'
,
'replace'
)
==
u'Andr
\uFFFD
x'
)
verify
(
u'hello'
.
encode
(
'ascii'
)
==
'hello'
)
verify
(
u'hello'
.
encode
(
'ascii'
)
==
'hello'
)
verify
(
u'hello'
.
encode
(
'utf-7'
)
==
'hello'
)
verify
(
u'hello'
.
encode
(
'utf-8'
)
==
'hello'
)
verify
(
u'hello'
.
encode
(
'utf-8'
)
==
'hello'
)
verify
(
u'hello'
.
encode
(
'utf8'
)
==
'hello'
)
verify
(
u'hello'
.
encode
(
'utf8'
)
==
'hello'
)
verify
(
u'hello'
.
encode
(
'utf-16-le'
)
==
'h
\000
e
\000
l
\000
l
\000
o
\000
'
)
verify
(
u'hello'
.
encode
(
'utf-16-le'
)
==
'h
\000
e
\000
l
\000
l
\000
o
\000
'
)
...
@@ -447,7 +474,7 @@ verify(u'hello'.encode('latin-1') == 'hello')
...
@@ -447,7 +474,7 @@ verify(u'hello'.encode('latin-1') == 'hello')
# Roundtrip safety for BMP (just the first 1024 chars)
# Roundtrip safety for BMP (just the first 1024 chars)
u
=
u''
.
join
(
map
(
unichr
,
range
(
1024
)))
u
=
u''
.
join
(
map
(
unichr
,
range
(
1024
)))
for
encoding
in
(
'utf-8'
,
'utf-16'
,
'utf-16-le'
,
'utf-16-be'
,
for
encoding
in
(
'utf-
7'
,
'utf-
8'
,
'utf-16'
,
'utf-16-le'
,
'utf-16-be'
,
'raw_unicode_escape'
,
'unicode_escape'
,
'unicode_internal'
):
'raw_unicode_escape'
,
'unicode_escape'
,
'unicode_internal'
):
verify
(
unicode
(
u
.
encode
(
encoding
),
encoding
)
==
u
)
verify
(
unicode
(
u
.
encode
(
encoding
),
encoding
)
==
u
)
...
...
Modules/_codecsmodule.c
Dosyayı görüntüle @
c60e6f77
...
@@ -123,6 +123,22 @@ unicode_internal_decode(PyObject *self,
...
@@ -123,6 +123,22 @@ unicode_internal_decode(PyObject *self,
}
}
}
}
/* Codec entry point: utf_7_decode(data[, errors]).

   Parses a buffer plus an optional error-mode string from `args`, runs
   the UTF-7 decoder over the whole buffer and hands the decoded object,
   together with the input length, to codec_tuple().  The decoder result
   is passed on unchecked; codec_tuple() is defined elsewhere in this
   file. */
static PyObject *
utf_7_decode(PyObject *self,
             PyObject *args)
{
    const char *buf;
    int nbytes;
    const char *errmode = NULL;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "t#|z:utf_7_decode",
                          &buf, &nbytes, &errmode))
        return NULL;

    decoded = PyUnicode_DecodeUTF7(buf, nbytes, errmode);
    return codec_tuple(decoded, nbytes);
}
static
PyObject
*
static
PyObject
*
utf_8_decode
(
PyObject
*
self
,
utf_8_decode
(
PyObject
*
self
,
PyObject
*
args
)
PyObject
*
args
)
...
@@ -381,6 +397,30 @@ unicode_internal_encode(PyObject *self,
...
@@ -381,6 +397,30 @@ unicode_internal_encode(PyObject *self,
}
}
}
}
/* Codec entry point: utf_7_encode(obj[, errors]).

   Coerces the first argument to a Unicode object, encodes it as UTF-7
   with both optional encoder flags (encodeSetO, encodeWhiteSpace) set
   to 0, and hands the encoded object, together with the number of
   Unicode characters in the input, to codec_tuple().  The encoder
   result is passed on unchecked; codec_tuple() is defined elsewhere in
   this file. */
static PyObject *
utf_7_encode(PyObject *self,
             PyObject *args)
{
    PyObject *input;
    PyObject *result;
    PyObject *encoded;
    const char *errmode = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_7_encode",
                          &input, &errmode))
        return NULL;

    /* From here on `input` is a reference owned by this function. */
    input = PyUnicode_FromObject(input);
    if (input == NULL)
        return NULL;

    encoded = PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(input),
                                   PyUnicode_GET_SIZE(input),
                                   0,
                                   0,
                                   errmode);
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(input));
    Py_DECREF(input);
    return result;
}
static
PyObject
*
static
PyObject
*
utf_8_encode
(
PyObject
*
self
,
utf_8_encode
(
PyObject
*
self
,
PyObject
*
args
)
PyObject
*
args
)
...
@@ -632,6 +672,8 @@ static PyMethodDef _codecs_functions[] = {
...
@@ -632,6 +672,8 @@ static PyMethodDef _codecs_functions[] = {
#ifdef Py_USING_UNICODE
#ifdef Py_USING_UNICODE
{
"utf_8_encode"
,
utf_8_encode
,
1
},
{
"utf_8_encode"
,
utf_8_encode
,
1
},
{
"utf_8_decode"
,
utf_8_decode
,
1
},
{
"utf_8_decode"
,
utf_8_decode
,
1
},
{
"utf_7_encode"
,
utf_7_encode
,
1
},
{
"utf_7_decode"
,
utf_7_decode
,
1
},
{
"utf_16_encode"
,
utf_16_encode
,
1
},
{
"utf_16_encode"
,
utf_16_encode
,
1
},
{
"utf_16_le_encode"
,
utf_16_le_encode
,
1
},
{
"utf_16_le_encode"
,
utf_16_le_encode
,
1
},
{
"utf_16_be_encode"
,
utf_16_be_encode
,
1
},
{
"utf_16_be_encode"
,
utf_16_be_encode
,
1
},
...
...
Objects/unicodeobject.c
Dosyayı görüntüle @
c60e6f77
...
@@ -635,6 +635,306 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)
...
@@ -635,6 +635,306 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)
return
-
1
;
return
-
1
;
}
}
/* --- UTF-7 Codec -------------------------------------------------------- */
/* see RFC2152 for details */
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   Indicates whether a character is special, i.e. cannot be directly
   encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   The table is only ever read, so it is declared const. */
static const char utf7_special[128] = {
    /* 0x00-0x0f: control characters; TAB, LF and CR are class 2 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1f: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2f: space and punctuation up to the slash */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3f: digits, then colon through question mark */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4f: at sign, A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5f: P-Z, then bracket, backslash, bracket, caret, underscore */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6f: backquote, a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7f: p-z, then braces and bar, tilde, DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
/* SPECIAL(c, encodeO, encodeWS): non-zero when c may not be written
   directly: c is above 127, or utf7_special[] marks it as class 1, or
   as class 2 while encodeWS is set, or as class 3 while encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c) > 127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): non-zero when c is one of the 64 base64 digits.
   Written as explicit ASCII ranges rather than isalnum(): callers pass
   Py_UNICODE values, which may lie outside the range isalnum() is
   defined for, and the result must not depend on the current locale.
   UB64() below already assumes ASCII. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of base64 digit c.  Only meaningful when
   B64CHAR(c) holds. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   write the top six pending bits to *out as a base64 digit and advance
   out.  Updates out and bits; ch itself is not modified. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, take the top sixteen pending bits as one code unit.

   - If surrogate is set, an error was already reported for the previous
     unit, so this unit is dropped without further checks and the flag
     is cleared.
   - If the unit lies in 0xDC00..0xDFFF it is treated as part of a
     surrogate pair, which cannot be represented in a 16-bit character:
     surrogate is set, errmsg is assigned and control jumps to the
     utf7Error label.
   - Otherwise the unit is stored through out.

   The expansion site must provide `errmsg` and the `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
/* Apply the error handling mode `errors` to a UTF-7 decoding problem
   described by `details`.

   - NULL or "strict": raise UnicodeError carrying `details`; return -1.
   - "ignore":         do nothing; return 0.
   - "replace":        if `dest` is non-NULL, store
                       Py_UNICODE_REPLACEMENT_CHARACTER at *dest and
                       advance *dest by one; return 0.
   - anything else:    raise ValueError naming the unknown mode;
                       return -1. */
static int
utf7_decoding_error(Py_UNICODE **dest,
                    const char *errors,
                    const char *details)
{
    const int strict = (errors == NULL) || (strcmp(errors, "strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-7 decoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        if (dest != NULL) {
            Py_UNICODE *slot = *dest;

            *slot = Py_UNICODE_REPLACEMENT_CHARACTER;
            *dest = slot + 1;
        }
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-7 decoding error; unknown error handling code: %.400s",
                 errors);
    return -1;
}
/* Decode `size` bytes of UTF-7 data (see RFC2152) starting at `s`.

   Returns a new Unicode object, or NULL with an exception set when
   utf7_decoding_error() reports failure for the given `errors` mode or
   when allocating/resizing the result fails.

   The output buffer is allocated with `size` characters up front and
   shrunk to the number of characters actually produced at the end.

   State carried across input bytes:
     inShift   - non-zero while inside a '+'-introduced base64 run
     charsleft - accumulator of base64 bits; only the low `bitsleft`
                 bits are still pending
     surrogate - set by DECODE after it reported a surrogate error */
PyObject *PyUnicode_DecodeUTF7(const char *s,
                               int size,
                               const char *errors)
{
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;
    unsigned int bitsleft = 0;
    unsigned long charsleft = 0;
    int surrogate = 0;

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0)
        return (PyObject *)unicode;

    p = unicode->str;
    e = s + size;

    while (s < e) {
        /* Read the byte as unsigned: with a signed plain char, bytes
           >= 0x80 would otherwise sign-extend into a large Py_UNICODE
           value. */
        Py_UNICODE ch = (unsigned char) *s;

        if (inShift) {
            /* Non-ASCII bytes are tested before B64CHAR() so that the
               macro is only ever evaluated for values 0..127. */
            if ((ch == '-') || (ch > 127) || !B64CHAR(ch)) {
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */
                DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */
                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here so indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft &&
                    charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* The terminating '-' is absorbed.  If another '-'
                       follows, emit a literal '-' now and re-enter shift
                       mode without consuming it; the next iteration then
                       consumes that second '-' as a terminator. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                }
                else if (SPECIAL(ch, 0, 0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                }
                else {
                    /* Any other character ends the run implicitly and is
                       itself part of the output. */
                    *p++ = ch;
                }
            }
            else {
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */
                DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if (ch == '+') {
            s++;
            if (s < e && *s == '-') {
                /* "+-" stands for a literal '+'. */
                s++;
                *p++ = '+';
            }
            else {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch, 0, 0)) {
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            *p++ = ch;
            s++;
        }
        continue;

    utf7Error:
        /* A zero return means the error mode handled the problem and
           decoding continues with the next input byte. */
        if (utf7_decoding_error(&p, errors, errmsg))
            goto onError;
    }

    if (inShift) {
        if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
            goto onError;
    }

    if (_PyUnicode_Resize(&unicode, p - unicode->str))
        goto onError;

    return (PyObject *)unicode;

onError:
    Py_DECREF(unicode);
    return NULL;
}
/* Encode `size` Py_UNICODE characters starting at `s` as UTF-7 (see
   RFC2152) and return the result as a new string object, or NULL when
   allocating or resizing the result fails.

   encodeSetO and encodeWhiteSpace are passed to SPECIAL() and decide
   whether the optional character classes are base64-encoded as well.
   `errors` is accepted for interface symmetry but not used here.

   State carried across characters:
     inShift   - non-zero while a '+'-introduced base64 run is open
     charsleft - accumulator of bits to emit; only the low `bitsleft`
                 bits are still pending */
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                               int size,
                               int encodeSetO,
                               int encodeWhiteSpace,
                               const char *errors)
{
    PyObject *v;
    /* It might be possible to tighten this worst case */
    /* NOTE(review): 5 * size is evaluated in int and can overflow for
       very large inputs, which would under-allocate the buffer written
       below.  A range check that raises MemoryError belongs here; it
       is not added in this change because it needs an API call this
       function does not otherwise use. */
    unsigned int cbAllocated = 5 * size;
    int inShift = 0;
    int i = 0;
    unsigned int bitsleft = 0;
    unsigned long charsleft = 0;
    char *out;
    char *start;

    if (size == 0)
        return PyString_FromStringAndSize(NULL, 0);

    v = PyString_FromStringAndSize(NULL, cbAllocated);
    if (v == NULL)
        return NULL;

    start = out = PyString_AS_STRING(v);
    for (; i < size; ++i) {
        Py_UNICODE ch = s[i];

        if (!inShift) {
            if (ch == '+') {
                /* A literal '+' is written as "+-". */
                *out++ = '+';
                *out++ = '-';
            }
            else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* Open a base64 run with this character. */
                charsleft = ch;
                bitsleft = 16;
                *out++ = '+';
                /* out, charsleft, bitsleft = */
                ENCODE(out, charsleft, bitsleft);
                inShift = bitsleft > 0;
            }
            else {
                *out++ = (char) ch;
            }
        }
        else {
            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* Flush the pending bits, left-aligned in a final
                   base64 digit, then leave the run. */
                *out++ = B64(charsleft << (6 - bitsleft));
                charsleft = 0;
                bitsleft = 0;
                /* Characters not in the BASE64 set implicitly unshift the
                   sequence so no '-' is required, except if the character
                   is itself a '-' */
                if (B64CHAR(ch) || ch == '-') {
                    *out++ = '-';
                }
                inShift = 0;
                *out++ = (char) ch;
            }
            else {
                bitsleft += 16;
                charsleft = (charsleft << 16) | ch;
                /* out, charsleft, bitsleft = */
                ENCODE(out, charsleft, bitsleft);

                /* If the next character is special then we don't need to
                   terminate the shift sequence. If the next character is
                   not a BASE64 character or '-' then the shift sequence
                   will be terminated implicitly and we don't have to
                   insert a '-'. */
                if (bitsleft == 0) {
                    if (i + 1 < size) {
                        Py_UNICODE ch2 = s[i + 1];

                        if (!SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
                            if (B64CHAR(ch2) || ch2 == '-') {
                                *out++ = '-';
                            }
                            inShift = 0;
                        }
                    }
                    else {
                        /* End of input: close the run explicitly. */
                        *out++ = '-';
                        inShift = 0;
                    }
                }
            }
        }
    }

    /* Input ended inside a run with bits still pending: flush them and
       close the run. */
    if (bitsleft) {
        *out++ = B64(charsleft << (6 - bitsleft));
        *out++ = '-';
    }

    /* On failure _PyString_Resize() has already released the string and
       set v to NULL, so there is nothing left to DECREF here. */
    if (_PyString_Resize(&v, out - start))
        return NULL;
    return v;
}
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
/* --- UTF-8 Codec -------------------------------------------------------- */
/* --- UTF-8 Codec -------------------------------------------------------- */
static
static
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment