Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
22970667
Kaydet (Commit)
22970667
authored
Eyl 29, 2011
tarafından
Martin v. Löwis
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
Port normalization to new API.
üst
f0ddadcf
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
119 additions
and
71 deletions
+119
-71
unicodedata.c
Modules/unicodedata.c
+119
-71
No files found.
Modules/unicodedata.c
Dosyayı görüntüle @
22970667
...
@@ -494,36 +494,44 @@ static PyObject*
...
@@ -494,36 +494,44 @@ static PyObject*
nfd_nfkd
(
PyObject
*
self
,
PyObject
*
input
,
int
k
)
nfd_nfkd
(
PyObject
*
self
,
PyObject
*
input
,
int
k
)
{
{
PyObject
*
result
;
PyObject
*
result
;
Py_UNICODE
*
i
,
*
end
,
*
o
;
Py_UCS4
*
output
;
Py_ssize_t
i
,
o
,
osize
;
int
kind
;
void
*
data
;
/* Longest decomposition in Unicode 3.2: U+FDFA */
/* Longest decomposition in Unicode 3.2: U+FDFA */
Py_U
NICODE
stack
[
20
];
Py_U
CS4
stack
[
20
];
Py_ssize_t
space
,
isize
;
Py_ssize_t
space
,
isize
;
int
index
,
prefix
,
count
,
stackptr
;
int
index
,
prefix
,
count
,
stackptr
;
unsigned
char
prev
,
cur
;
unsigned
char
prev
,
cur
;
stackptr
=
0
;
stackptr
=
0
;
isize
=
PyUnicode_GET_
SIZE
(
input
);
isize
=
PyUnicode_GET_
LENGTH
(
input
);
/* Overallocate atmost 10 characters. */
/* Overallocate atmost 10 characters. */
space
=
(
isize
>
10
?
10
:
isize
)
+
isize
;
space
=
(
isize
>
10
?
10
:
isize
)
+
isize
;
result
=
PyUnicode_FromUnicode
(
NULL
,
space
);
osize
=
space
;
if
(
!
result
)
output
=
PyMem_Malloc
(
space
*
sizeof
(
Py_UCS4
));
if
(
!
output
)
{
PyErr_NoMemory
();
return
NULL
;
return
NULL
;
i
=
PyUnicode_AS_UNICODE
(
input
);
}
end
=
i
+
isize
;
i
=
o
=
0
;
o
=
PyUnicode_AS_UNICODE
(
result
);
kind
=
PyUnicode_KIND
(
input
);
data
=
PyUnicode_DATA
(
input
);
while
(
i
<
end
)
{
while
(
i
<
isize
)
{
stack
[
stackptr
++
]
=
*
i
++
;
stack
[
stackptr
++
]
=
PyUnicode_READ
(
kind
,
data
,
i
++
)
;
while
(
stackptr
)
{
while
(
stackptr
)
{
Py_U
NICODE
code
=
stack
[
--
stackptr
];
Py_U
CS4
code
=
stack
[
--
stackptr
];
/* Hangul Decomposition adds three characters in
/* Hangul Decomposition adds three characters in
a single step, so we need atleast that much room. */
a single step, so we need atleast that much room. */
if
(
space
<
3
)
{
if
(
space
<
3
)
{
Py_ssize_t
newsize
=
PyUnicode_GET_SIZE
(
result
)
+
10
;
osize
+=
10
;
space
+=
10
;
space
+=
10
;
if
(
PyUnicode_Resize
(
&
result
,
newsize
)
==
-
1
)
output
=
PyMem_Realloc
(
output
,
osize
*
sizeof
(
Py_UCS4
));
if
(
output
==
NULL
)
{
PyErr_NoMemory
();
return
NULL
;
return
NULL
;
o
=
PyUnicode_AS_UNICODE
(
result
)
+
newsize
-
space
;
}
}
}
/* Hangul Decomposition. */
/* Hangul Decomposition. */
if
(
SBase
<=
code
&&
code
<
(
SBase
+
SCount
))
{
if
(
SBase
<=
code
&&
code
<
(
SBase
+
SCount
))
{
...
@@ -531,11 +539,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
...
@@ -531,11 +539,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
int
L
=
LBase
+
SIndex
/
NCount
;
int
L
=
LBase
+
SIndex
/
NCount
;
int
V
=
VBase
+
(
SIndex
%
NCount
)
/
TCount
;
int
V
=
VBase
+
(
SIndex
%
NCount
)
/
TCount
;
int
T
=
TBase
+
SIndex
%
TCount
;
int
T
=
TBase
+
SIndex
%
TCount
;
*
o
++
=
L
;
output
[
o
++
]
=
L
;
*
o
++
=
V
;
output
[
o
++
]
=
V
;
space
-=
2
;
space
-=
2
;
if
(
T
!=
TBase
)
{
if
(
T
!=
TBase
)
{
*
o
++
=
T
;
output
[
o
++
]
=
T
;
space
--
;
space
--
;
}
}
continue
;
continue
;
...
@@ -555,7 +563,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
...
@@ -555,7 +563,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
/* Copy character if it is not decomposable, or has a
/* Copy character if it is not decomposable, or has a
compatibility decomposition, but we do NFD. */
compatibility decomposition, but we do NFD. */
if
(
!
count
||
(
prefix
&&
!
k
))
{
if
(
!
count
||
(
prefix
&&
!
k
))
{
*
o
++
=
code
;
output
[
o
++
]
=
code
;
space
--
;
space
--
;
continue
;
continue
;
}
}
...
@@ -568,15 +576,20 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
...
@@ -568,15 +576,20 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
}
}
}
}
/* Drop overallocation. Cannot fail. */
result
=
PyUnicode_FromKindAndData
(
PyUnicode_4BYTE_KIND
,
PyUnicode_Resize
(
&
result
,
PyUnicode_GET_SIZE
(
result
)
-
space
);
output
,
o
);
PyMem_Free
(
output
);
if
(
!
result
)
return
NULL
;
/* result is guaranteed to be ready, as it is compact. */
kind
=
PyUnicode_KIND
(
result
);
data
=
PyUnicode_DATA
(
result
);
/* Sort canonically. */
/* Sort canonically. */
i
=
PyUnicode_AS_UNICODE
(
result
);
i
=
0
;
prev
=
_getrecord_ex
(
*
i
)
->
combining
;
prev
=
_getrecord_ex
(
PyUnicode_READ
(
kind
,
data
,
i
))
->
combining
;
end
=
i
+
PyUnicode_GET_SIZE
(
result
);
for
(
i
++
;
i
<
PyUnicode_GET_LENGTH
(
result
);
i
++
)
{
for
(
i
++
;
i
<
end
;
i
++
)
{
cur
=
_getrecord_ex
(
PyUnicode_READ
(
kind
,
data
,
i
))
->
combining
;
cur
=
_getrecord_ex
(
*
i
)
->
combining
;
if
(
prev
==
0
||
cur
==
0
||
prev
<=
cur
)
{
if
(
prev
==
0
||
cur
==
0
||
prev
<=
cur
)
{
prev
=
cur
;
prev
=
cur
;
continue
;
continue
;
...
@@ -584,23 +597,24 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
...
@@ -584,23 +597,24 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
/* Non-canonical order. Need to switch *i with previous. */
/* Non-canonical order. Need to switch *i with previous. */
o
=
i
-
1
;
o
=
i
-
1
;
while
(
1
)
{
while
(
1
)
{
Py_UNICODE
tmp
=
o
[
1
];
Py_UCS4
tmp
=
PyUnicode_READ
(
kind
,
data
,
o
+
1
);
o
[
1
]
=
o
[
0
];
PyUnicode_WRITE
(
kind
,
data
,
o
+
1
,
o
[
0
]
=
tmp
;
PyUnicode_READ
(
kind
,
data
,
o
));
PyUnicode_WRITE
(
kind
,
data
,
o
,
tmp
);
o
--
;
o
--
;
if
(
o
<
PyUnicode_AS_UNICODE
(
result
)
)
if
(
o
<
0
)
break
;
break
;
prev
=
_getrecord_ex
(
*
o
)
->
combining
;
prev
=
_getrecord_ex
(
PyUnicode_READ
(
kind
,
data
,
o
)
)
->
combining
;
if
(
prev
==
0
||
prev
<=
cur
)
if
(
prev
==
0
||
prev
<=
cur
)
break
;
break
;
}
}
prev
=
_getrecord_ex
(
*
i
)
->
combining
;
prev
=
_getrecord_ex
(
PyUnicode_READ
(
kind
,
data
,
i
)
)
->
combining
;
}
}
return
result
;
return
result
;
}
}
static
int
static
int
find_nfc_index
(
PyObject
*
self
,
struct
reindex
*
nfc
,
Py_U
NICODE
code
)
find_nfc_index
(
PyObject
*
self
,
struct
reindex
*
nfc
,
Py_U
CS4
code
)
{
{
int
index
;
int
index
;
for
(
index
=
0
;
nfc
[
index
].
start
;
index
++
)
{
for
(
index
=
0
;
nfc
[
index
].
start
;
index
++
)
{
...
@@ -619,27 +633,36 @@ static PyObject*
...
@@ -619,27 +633,36 @@ static PyObject*
nfc_nfkc
(
PyObject
*
self
,
PyObject
*
input
,
int
k
)
nfc_nfkc
(
PyObject
*
self
,
PyObject
*
input
,
int
k
)
{
{
PyObject
*
result
;
PyObject
*
result
;
Py_UNICODE
*
i
,
*
i1
,
*
o
,
*
end
;
int
kind
;
void
*
data
;
Py_UCS4
*
output
;
Py_ssize_t
i
,
i1
,
o
,
len
;
int
f
,
l
,
index
,
index1
,
comb
;
int
f
,
l
,
index
,
index1
,
comb
;
Py_U
NICODE
code
;
Py_U
CS4
code
;
Py_
UNICODE
*
skipped
[
20
];
Py_
ssize_t
skipped
[
20
];
int
cskipped
=
0
;
int
cskipped
=
0
;
result
=
nfd_nfkd
(
self
,
input
,
k
);
result
=
nfd_nfkd
(
self
,
input
,
k
);
if
(
!
result
)
if
(
!
result
)
return
NULL
;
return
NULL
;
/* result will be "ready". */
/* We are going to modify result in-place.
kind
=
PyUnicode_KIND
(
result
);
If nfd_nfkd is changed to sometimes return the input,
data
=
PyUnicode_DATA
(
result
);
this code needs to be reviewed. */
len
=
PyUnicode_GET_LENGTH
(
result
);
assert
(
result
!=
input
);
/* We allocate a buffer for the output.
i
=
PyUnicode_AS_UNICODE
(
result
);
If we find that we made no changes, we still return
end
=
i
+
PyUnicode_GET_SIZE
(
result
);
the NFD result. */
o
=
PyUnicode_AS_UNICODE
(
result
);
output
=
PyMem_Malloc
(
len
*
sizeof
(
Py_UCS4
));
if
(
!
output
)
{
PyErr_NoMemory
();
Py_DECREF
(
result
);
return
0
;
}
i
=
o
=
0
;
again:
again:
while
(
i
<
end
)
{
while
(
i
<
len
)
{
for
(
index
=
0
;
index
<
cskipped
;
index
++
)
{
for
(
index
=
0
;
index
<
cskipped
;
index
++
)
{
if
(
skipped
[
index
]
==
i
)
{
if
(
skipped
[
index
]
==
i
)
{
/* *i character is skipped.
/* *i character is skipped.
...
@@ -652,33 +675,41 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
...
@@ -652,33 +675,41 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
}
}
/* Hangul Composition. We don't need to check for <LV,T>
/* Hangul Composition. We don't need to check for <LV,T>
pairs, since we always have decomposed data. */
pairs, since we always have decomposed data. */
if
(
LBase
<=
*
i
&&
*
i
<
(
LBase
+
LCount
)
&&
code
=
PyUnicode_READ
(
kind
,
data
,
i
);
i
+
1
<
end
&&
if
(
LBase
<=
code
&&
code
<
(
LBase
+
LCount
)
&&
VBase
<=
i
[
1
]
&&
i
[
1
]
<=
(
VBase
+
VCount
))
{
i
+
1
<
len
&&
VBase
<=
PyUnicode_READ
(
kind
,
data
,
i
+
1
)
&&
PyUnicode_READ
(
kind
,
data
,
i
+
1
)
<=
(
VBase
+
VCount
))
{
int
LIndex
,
VIndex
;
int
LIndex
,
VIndex
;
LIndex
=
i
[
0
]
-
LBase
;
LIndex
=
code
-
LBase
;
VIndex
=
i
[
1
]
-
VBase
;
VIndex
=
PyUnicode_READ
(
kind
,
data
,
i
+
1
)
-
VBase
;
code
=
SBase
+
(
LIndex
*
VCount
+
VIndex
)
*
TCount
;
code
=
SBase
+
(
LIndex
*
VCount
+
VIndex
)
*
TCount
;
i
+=
2
;
i
+=
2
;
if
(
i
<
end
&&
if
(
i
<
len
&&
TBase
<=
*
i
&&
*
i
<=
(
TBase
+
TCount
))
{
TBase
<=
PyUnicode_READ
(
kind
,
data
,
i
)
&&
code
+=
*
i
-
TBase
;
PyUnicode_READ
(
kind
,
data
,
i
)
<=
(
TBase
+
TCount
))
{
code
+=
PyUnicode_READ
(
kind
,
data
,
i
)
-
TBase
;
i
++
;
i
++
;
}
}
*
o
++
=
code
;
output
[
o
++
]
=
code
;
continue
;
continue
;
}
}
f
=
find_nfc_index
(
self
,
nfc_first
,
*
i
);
/* code is still input[i] here */
f
=
find_nfc_index
(
self
,
nfc_first
,
code
);
if
(
f
==
-
1
)
{
if
(
f
==
-
1
)
{
*
o
++
=
*
i
++
;
output
[
o
++
]
=
code
;
i
++
;
continue
;
continue
;
}
}
/* Find next unblocked character. */
/* Find next unblocked character. */
i1
=
i
+
1
;
i1
=
i
+
1
;
comb
=
0
;
comb
=
0
;
while
(
i1
<
end
)
{
/* output base character for now; might be updated later. */
int
comb1
=
_getrecord_ex
(
*
i1
)
->
combining
;
output
[
o
]
=
PyUnicode_READ
(
kind
,
data
,
i
);
while
(
i1
<
len
)
{
Py_UCS4
code1
=
PyUnicode_READ
(
kind
,
data
,
i1
);
int
comb1
=
_getrecord_ex
(
code1
)
->
combining
;
if
(
comb
)
{
if
(
comb
)
{
if
(
comb1
==
0
)
if
(
comb1
==
0
)
break
;
break
;
...
@@ -688,8 +719,8 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
...
@@ -688,8 +719,8 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
continue
;
continue
;
}
}
}
}
l
=
find_nfc_index
(
self
,
nfc_last
,
*
i
1
);
l
=
find_nfc_index
(
self
,
nfc_last
,
code
1
);
/*
*i1 cannot be combined with *i. If *
i1
/*
i1 cannot be combined with i. If
i1
is a starter, we don't need to look further.
is a starter, we don't need to look further.
Otherwise, record the combining class. */
Otherwise, record the combining class. */
if
(
l
==
-
1
)
{
if
(
l
==
-
1
)
{
...
@@ -708,19 +739,28 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
...
@@ -708,19 +739,28 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
goto
not_combinable
;
goto
not_combinable
;
/* Replace the original character. */
/* Replace the original character. */
*
i
=
code
;
output
[
o
]
=
code
;
/* Mark the second character unused. */
/* Mark the second character unused. */
assert
(
cskipped
<
20
);
assert
(
cskipped
<
20
);
skipped
[
cskipped
++
]
=
i1
;
skipped
[
cskipped
++
]
=
i1
;
i1
++
;
i1
++
;
f
=
find_nfc_index
(
self
,
nfc_first
,
*
i
);
f
=
find_nfc_index
(
self
,
nfc_first
,
output
[
o
]
);
if
(
f
==
-
1
)
if
(
f
==
-
1
)
break
;
break
;
}
}
*
o
++
=
*
i
++
;
/* Output character was already written.
Just advance the indices. */
o
++
;
i
++
;
}
if
(
o
==
len
)
{
/* No changes. Return original string. */
PyMem_Free
(
output
);
return
result
;
}
}
if
(
o
!=
end
)
Py_DECREF
(
result
);
PyUnicode_Resize
(
&
result
,
o
-
PyUnicode_AS_UNICODE
(
result
));
result
=
PyUnicode_FromKindAndData
(
PyUnicode_4BYTE_KIND
,
output
,
o
);
PyMem_Free
(
output
);
return
result
;
return
result
;
}
}
...
@@ -728,7 +768,9 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
...
@@ -728,7 +768,9 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
static
int
static
int
is_normalized
(
PyObject
*
self
,
PyObject
*
input
,
int
nfc
,
int
k
)
is_normalized
(
PyObject
*
self
,
PyObject
*
input
,
int
nfc
,
int
k
)
{
{
Py_UNICODE
*
i
,
*
end
;
Py_ssize_t
i
,
len
;
int
kind
;
void
*
data
;
unsigned
char
prev_combining
=
0
,
quickcheck_mask
;
unsigned
char
prev_combining
=
0
,
quickcheck_mask
;
/* An older version of the database is requested, quickchecks must be
/* An older version of the database is requested, quickchecks must be
...
@@ -740,10 +782,13 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
...
@@ -740,10 +782,13 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
as described in http://unicode.org/reports/tr15/#Annex8. */
as described in http://unicode.org/reports/tr15/#Annex8. */
quickcheck_mask
=
3
<<
((
nfc
?
4
:
0
)
+
(
k
?
2
:
0
));
quickcheck_mask
=
3
<<
((
nfc
?
4
:
0
)
+
(
k
?
2
:
0
));
i
=
PyUnicode_AS_UNICODE
(
input
);
i
=
0
;
end
=
i
+
PyUnicode_GET_SIZE
(
input
);
kind
=
PyUnicode_KIND
(
input
);
while
(
i
<
end
)
{
data
=
PyUnicode_DATA
(
input
);
const
_PyUnicode_DatabaseRecord
*
record
=
_getrecord_ex
(
*
i
++
);
len
=
PyUnicode_GET_LENGTH
(
input
);
while
(
i
<
len
)
{
Py_UCS4
ch
=
PyUnicode_READ
(
kind
,
data
,
i
++
);
const
_PyUnicode_DatabaseRecord
*
record
=
_getrecord_ex
(
ch
);
unsigned
char
combining
=
record
->
combining
;
unsigned
char
combining
=
record
->
combining
;
unsigned
char
quickcheck
=
record
->
normalization_quick_check
;
unsigned
char
quickcheck
=
record
->
normalization_quick_check
;
...
@@ -772,7 +817,10 @@ unicodedata_normalize(PyObject *self, PyObject *args)
...
@@ -772,7 +817,10 @@ unicodedata_normalize(PyObject *self, PyObject *args)
&
form
,
&
PyUnicode_Type
,
&
input
))
&
form
,
&
PyUnicode_Type
,
&
input
))
return
NULL
;
return
NULL
;
if
(
PyUnicode_GetSize
(
input
)
==
0
)
{
if
(
PyUnicode_READY
(
input
)
==
-
1
)
return
NULL
;
if
(
PyUnicode_GET_LENGTH
(
input
)
==
0
)
{
/* Special case empty input strings, since resizing
/* Special case empty input strings, since resizing
them later would cause internal errors. */
them later would cause internal errors. */
Py_INCREF
(
input
);
Py_INCREF
(
input
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment