Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
22970667
Kaydet (Commit)
22970667
authored
Eyl 29, 2011
tarafından
Martin v. Löwis
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
Port normalization to new API.
üst
f0ddadcf
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
119 additions
and
71 deletions
+119
-71
unicodedata.c
Modules/unicodedata.c
+119
-71
No files found.
Modules/unicodedata.c
Dosyayı görüntüle @
22970667
...
...
@@ -494,36 +494,44 @@ static PyObject*
nfd_nfkd
(
PyObject
*
self
,
PyObject
*
input
,
int
k
)
{
PyObject
*
result
;
Py_UNICODE
*
i
,
*
end
,
*
o
;
Py_UCS4
*
output
;
Py_ssize_t
i
,
o
,
osize
;
int
kind
;
void
*
data
;
/* Longest decomposition in Unicode 3.2: U+FDFA */
Py_U
NICODE
stack
[
20
];
Py_U
CS4
stack
[
20
];
Py_ssize_t
space
,
isize
;
int
index
,
prefix
,
count
,
stackptr
;
unsigned
char
prev
,
cur
;
stackptr
=
0
;
isize
=
PyUnicode_GET_
SIZE
(
input
);
isize
=
PyUnicode_GET_
LENGTH
(
input
);
/* Overallocate at most 10 characters. */
space
=
(
isize
>
10
?
10
:
isize
)
+
isize
;
result
=
PyUnicode_FromUnicode
(
NULL
,
space
);
if
(
!
result
)
osize
=
space
;
output
=
PyMem_Malloc
(
space
*
sizeof
(
Py_UCS4
));
if
(
!
output
)
{
PyErr_NoMemory
();
return
NULL
;
i
=
PyUnicode_AS_UNICODE
(
input
);
end
=
i
+
isize
;
o
=
PyUnicode_AS_UNICODE
(
result
);
}
i
=
o
=
0
;
kind
=
PyUnicode_KIND
(
input
);
data
=
PyUnicode_DATA
(
input
);
while
(
i
<
end
)
{
stack
[
stackptr
++
]
=
*
i
++
;
while
(
i
<
isize
)
{
stack
[
stackptr
++
]
=
PyUnicode_READ
(
kind
,
data
,
i
++
)
;
while
(
stackptr
)
{
Py_U
NICODE
code
=
stack
[
--
stackptr
];
Py_U
CS4
code
=
stack
[
--
stackptr
];
/* Hangul Decomposition adds three characters in
a single step, so we need at least that much room. */
if
(
space
<
3
)
{
Py_ssize_t
newsize
=
PyUnicode_GET_SIZE
(
result
)
+
10
;
osize
+=
10
;
space
+=
10
;
if
(
PyUnicode_Resize
(
&
result
,
newsize
)
==
-
1
)
output
=
PyMem_Realloc
(
output
,
osize
*
sizeof
(
Py_UCS4
));
if
(
output
==
NULL
)
{
PyErr_NoMemory
();
return
NULL
;
o
=
PyUnicode_AS_UNICODE
(
result
)
+
newsize
-
space
;
}
}
/* Hangul Decomposition. */
if
(
SBase
<=
code
&&
code
<
(
SBase
+
SCount
))
{
...
...
@@ -531,11 +539,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
int
L
=
LBase
+
SIndex
/
NCount
;
int
V
=
VBase
+
(
SIndex
%
NCount
)
/
TCount
;
int
T
=
TBase
+
SIndex
%
TCount
;
*
o
++
=
L
;
*
o
++
=
V
;
output
[
o
++
]
=
L
;
output
[
o
++
]
=
V
;
space
-=
2
;
if
(
T
!=
TBase
)
{
*
o
++
=
T
;
output
[
o
++
]
=
T
;
space
--
;
}
continue
;
...
...
@@ -555,7 +563,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
/* Copy character if it is not decomposable, or has a
compatibility decomposition, but we do NFD. */
if
(
!
count
||
(
prefix
&&
!
k
))
{
*
o
++
=
code
;
output
[
o
++
]
=
code
;
space
--
;
continue
;
}
...
...
@@ -568,15 +576,20 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
}
}
/* Drop overallocation. Cannot fail. */
PyUnicode_Resize
(
&
result
,
PyUnicode_GET_SIZE
(
result
)
-
space
);
result
=
PyUnicode_FromKindAndData
(
PyUnicode_4BYTE_KIND
,
output
,
o
);
PyMem_Free
(
output
);
if
(
!
result
)
return
NULL
;
/* result is guaranteed to be ready, as it is compact. */
kind
=
PyUnicode_KIND
(
result
);
data
=
PyUnicode_DATA
(
result
);
/* Sort canonically. */
i
=
PyUnicode_AS_UNICODE
(
result
);
prev
=
_getrecord_ex
(
*
i
)
->
combining
;
end
=
i
+
PyUnicode_GET_SIZE
(
result
);
for
(
i
++
;
i
<
end
;
i
++
)
{
cur
=
_getrecord_ex
(
*
i
)
->
combining
;
i
=
0
;
prev
=
_getrecord_ex
(
PyUnicode_READ
(
kind
,
data
,
i
))
->
combining
;
for
(
i
++
;
i
<
PyUnicode_GET_LENGTH
(
result
);
i
++
)
{
cur
=
_getrecord_ex
(
PyUnicode_READ
(
kind
,
data
,
i
))
->
combining
;
if
(
prev
==
0
||
cur
==
0
||
prev
<=
cur
)
{
prev
=
cur
;
continue
;
...
...
@@ -584,23 +597,24 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
/* Non-canonical order. Need to switch *i with previous. */
o
=
i
-
1
;
while
(
1
)
{
Py_UNICODE
tmp
=
o
[
1
];
o
[
1
]
=
o
[
0
];
o
[
0
]
=
tmp
;
Py_UCS4
tmp
=
PyUnicode_READ
(
kind
,
data
,
o
+
1
);
PyUnicode_WRITE
(
kind
,
data
,
o
+
1
,
PyUnicode_READ
(
kind
,
data
,
o
));
PyUnicode_WRITE
(
kind
,
data
,
o
,
tmp
);
o
--
;
if
(
o
<
PyUnicode_AS_UNICODE
(
result
)
)
if
(
o
<
0
)
break
;
prev
=
_getrecord_ex
(
*
o
)
->
combining
;
prev
=
_getrecord_ex
(
PyUnicode_READ
(
kind
,
data
,
o
)
)
->
combining
;
if
(
prev
==
0
||
prev
<=
cur
)
break
;
}
prev
=
_getrecord_ex
(
*
i
)
->
combining
;
prev
=
_getrecord_ex
(
PyUnicode_READ
(
kind
,
data
,
i
)
)
->
combining
;
}
return
result
;
}
static
int
find_nfc_index
(
PyObject
*
self
,
struct
reindex
*
nfc
,
Py_U
NICODE
code
)
find_nfc_index
(
PyObject
*
self
,
struct
reindex
*
nfc
,
Py_U
CS4
code
)
{
int
index
;
for
(
index
=
0
;
nfc
[
index
].
start
;
index
++
)
{
...
...
@@ -619,27 +633,36 @@ static PyObject*
nfc_nfkc
(
PyObject
*
self
,
PyObject
*
input
,
int
k
)
{
PyObject
*
result
;
Py_UNICODE
*
i
,
*
i1
,
*
o
,
*
end
;
int
kind
;
void
*
data
;
Py_UCS4
*
output
;
Py_ssize_t
i
,
i1
,
o
,
len
;
int
f
,
l
,
index
,
index1
,
comb
;
Py_U
NICODE
code
;
Py_
UNICODE
*
skipped
[
20
];
Py_U
CS4
code
;
Py_
ssize_t
skipped
[
20
];
int
cskipped
=
0
;
result
=
nfd_nfkd
(
self
,
input
,
k
);
if
(
!
result
)
return
NULL
;
/* We are going to modify result in-place.
If nfd_nfkd is changed to sometimes return the input,
this code needs to be reviewed. */
assert
(
result
!=
input
);
i
=
PyUnicode_AS_UNICODE
(
result
);
end
=
i
+
PyUnicode_GET_SIZE
(
result
);
o
=
PyUnicode_AS_UNICODE
(
result
);
/* result will be "ready". */
kind
=
PyUnicode_KIND
(
result
);
data
=
PyUnicode_DATA
(
result
);
len
=
PyUnicode_GET_LENGTH
(
result
);
/* We allocate a buffer for the output.
If we find that we made no changes, we still return
the NFD result. */
output
=
PyMem_Malloc
(
len
*
sizeof
(
Py_UCS4
));
if
(
!
output
)
{
PyErr_NoMemory
();
Py_DECREF
(
result
);
return
0
;
}
i
=
o
=
0
;
again:
while
(
i
<
end
)
{
while
(
i
<
len
)
{
for
(
index
=
0
;
index
<
cskipped
;
index
++
)
{
if
(
skipped
[
index
]
==
i
)
{
/* *i character is skipped.
...
...
@@ -652,33 +675,41 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
}
/* Hangul Composition. We don't need to check for <LV,T>
pairs, since we always have decomposed data. */
if
(
LBase
<=
*
i
&&
*
i
<
(
LBase
+
LCount
)
&&
i
+
1
<
end
&&
VBase
<=
i
[
1
]
&&
i
[
1
]
<=
(
VBase
+
VCount
))
{
code
=
PyUnicode_READ
(
kind
,
data
,
i
);
if
(
LBase
<=
code
&&
code
<
(
LBase
+
LCount
)
&&
i
+
1
<
len
&&
VBase
<=
PyUnicode_READ
(
kind
,
data
,
i
+
1
)
&&
PyUnicode_READ
(
kind
,
data
,
i
+
1
)
<=
(
VBase
+
VCount
))
{
int
LIndex
,
VIndex
;
LIndex
=
i
[
0
]
-
LBase
;
VIndex
=
i
[
1
]
-
VBase
;
LIndex
=
code
-
LBase
;
VIndex
=
PyUnicode_READ
(
kind
,
data
,
i
+
1
)
-
VBase
;
code
=
SBase
+
(
LIndex
*
VCount
+
VIndex
)
*
TCount
;
i
+=
2
;
if
(
i
<
end
&&
TBase
<=
*
i
&&
*
i
<=
(
TBase
+
TCount
))
{
code
+=
*
i
-
TBase
;
if
(
i
<
len
&&
TBase
<=
PyUnicode_READ
(
kind
,
data
,
i
)
&&
PyUnicode_READ
(
kind
,
data
,
i
)
<=
(
TBase
+
TCount
))
{
code
+=
PyUnicode_READ
(
kind
,
data
,
i
)
-
TBase
;
i
++
;
}
*
o
++
=
code
;
output
[
o
++
]
=
code
;
continue
;
}
f
=
find_nfc_index
(
self
,
nfc_first
,
*
i
);
/* code is still input[i] here */
f
=
find_nfc_index
(
self
,
nfc_first
,
code
);
if
(
f
==
-
1
)
{
*
o
++
=
*
i
++
;
output
[
o
++
]
=
code
;
i
++
;
continue
;
}
/* Find next unblocked character. */
i1
=
i
+
1
;
comb
=
0
;
while
(
i1
<
end
)
{
int
comb1
=
_getrecord_ex
(
*
i1
)
->
combining
;
/* output base character for now; might be updated later. */
output
[
o
]
=
PyUnicode_READ
(
kind
,
data
,
i
);
while
(
i1
<
len
)
{
Py_UCS4
code1
=
PyUnicode_READ
(
kind
,
data
,
i1
);
int
comb1
=
_getrecord_ex
(
code1
)
->
combining
;
if
(
comb
)
{
if
(
comb1
==
0
)
break
;
...
...
@@ -688,8 +719,8 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
continue
;
}
}
l
=
find_nfc_index
(
self
,
nfc_last
,
*
i
1
);
/*
*i1 cannot be combined with *i. If *
i1
l
=
find_nfc_index
(
self
,
nfc_last
,
code
1
);
/*
i1 cannot be combined with i. If
i1
is a starter, we don't need to look further.
Otherwise, record the combining class. */
if
(
l
==
-
1
)
{
...
...
@@ -708,19 +739,28 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
goto
not_combinable
;
/* Replace the original character. */
*
i
=
code
;
output
[
o
]
=
code
;
/* Mark the second character unused. */
assert
(
cskipped
<
20
);
skipped
[
cskipped
++
]
=
i1
;
i1
++
;
f
=
find_nfc_index
(
self
,
nfc_first
,
*
i
);
f
=
find_nfc_index
(
self
,
nfc_first
,
output
[
o
]
);
if
(
f
==
-
1
)
break
;
}
*
o
++
=
*
i
++
;
/* Output character was already written.
Just advance the indices. */
o
++
;
i
++
;
}
if
(
o
==
len
)
{
/* No changes. Return original string. */
PyMem_Free
(
output
);
return
result
;
}
if
(
o
!=
end
)
PyUnicode_Resize
(
&
result
,
o
-
PyUnicode_AS_UNICODE
(
result
));
Py_DECREF
(
result
);
result
=
PyUnicode_FromKindAndData
(
PyUnicode_4BYTE_KIND
,
output
,
o
);
PyMem_Free
(
output
);
return
result
;
}
...
...
@@ -728,7 +768,9 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
static
int
is_normalized
(
PyObject
*
self
,
PyObject
*
input
,
int
nfc
,
int
k
)
{
Py_UNICODE
*
i
,
*
end
;
Py_ssize_t
i
,
len
;
int
kind
;
void
*
data
;
unsigned
char
prev_combining
=
0
,
quickcheck_mask
;
/* An older version of the database is requested, quickchecks must be
...
...
@@ -740,10 +782,13 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
as described in http://unicode.org/reports/tr15/#Annex8. */
quickcheck_mask
=
3
<<
((
nfc
?
4
:
0
)
+
(
k
?
2
:
0
));
i
=
PyUnicode_AS_UNICODE
(
input
);
end
=
i
+
PyUnicode_GET_SIZE
(
input
);
while
(
i
<
end
)
{
const
_PyUnicode_DatabaseRecord
*
record
=
_getrecord_ex
(
*
i
++
);
i
=
0
;
kind
=
PyUnicode_KIND
(
input
);
data
=
PyUnicode_DATA
(
input
);
len
=
PyUnicode_GET_LENGTH
(
input
);
while
(
i
<
len
)
{
Py_UCS4
ch
=
PyUnicode_READ
(
kind
,
data
,
i
++
);
const
_PyUnicode_DatabaseRecord
*
record
=
_getrecord_ex
(
ch
);
unsigned
char
combining
=
record
->
combining
;
unsigned
char
quickcheck
=
record
->
normalization_quick_check
;
...
...
@@ -772,7 +817,10 @@ unicodedata_normalize(PyObject *self, PyObject *args)
&
form
,
&
PyUnicode_Type
,
&
input
))
return
NULL
;
if
(
PyUnicode_GetSize
(
input
)
==
0
)
{
if
(
PyUnicode_READY
(
input
)
==
-
1
)
return
NULL
;
if
(
PyUnicode_GET_LENGTH
(
input
)
==
0
)
{
/* Special case empty input strings, since resizing
them later would cause internal errors. */
Py_INCREF
(
input
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment