Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
2810dd7b
Kaydet (Commit)
2810dd7b
authored
Kas 04, 2018
tarafından
Max Bélanger
Kaydeden (commit)
Benjamin Peterson
Kas 04, 2018
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
closes bpo-32285: Add unicodedata.is_normalized. (GH-4806)
üst
5d236caf
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
160 additions
and
22 deletions
+160
-22
unicodedata.rst
Doc/library/unicodedata.rst
+7
-0
3.8.rst
Doc/whatsnew/3.8.rst
+7
-0
test_normalization.py
Lib/test/test_normalization.py
+10
-1
2017-12-12-13-43-13.bpo-32285.LzKSwz.rst
...ore and Builtins/2017-12-12-13-43-13.bpo-32285.LzKSwz.rst
+2
-0
unicodedata.c.h
Modules/clinic/unicodedata.c.h
+36
-4
unicodedata.c
Modules/unicodedata.c
+98
-17
No files found.
Doc/library/unicodedata.rst
Dosyayı görüntüle @
2810dd7b
...
@@ -133,6 +133,13 @@ following functions:
...
@@ -133,6 +133,13 @@ following functions:
a human reader, if one has combining characters and the other
a human reader, if one has combining characters and the other
doesn't, they may not compare equal.
doesn't, they may not compare equal.
.. function:: is_normalized(form, unistr)
Return whether the Unicode string *unistr* is in the normal form *form*. Valid
values for *form* are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
.. versionadded:: 3.8
In addition, the module exposes the following constant:
In addition, the module exposes the following constant:
...
...
Doc/whatsnew/3.8.rst
Dosyayı görüntüle @
2810dd7b
...
@@ -204,6 +204,13 @@ Added method :meth:`~tkinter.Canvas.moveto`
...
@@ -204,6 +204,13 @@ Added method :meth:`~tkinter.Canvas.moveto`
in the :class:`tkinter.Canvas` class.
in the :class:`tkinter.Canvas` class.
(Contributed by Juliette Monsel in :issue:`23831`.)
(Contributed by Juliette Monsel in :issue:`23831`.)
unicodedata
-----------
* New function :func:`~unicodedata.is_normalized` can be used to verify a string
is in a specific normal form. (Contributed by Max Bélanger and David Euresti in
:issue:`32285`.)
venv
venv
----
----
...
...
Lib/test/test_normalization.py
Dosyayı görüntüle @
2810dd7b
...
@@ -3,7 +3,7 @@ import unittest
...
@@ -3,7 +3,7 @@ import unittest
from
http.client
import
HTTPException
from
http.client
import
HTTPException
import
sys
import
sys
from
unicodedata
import
normalize
,
unidata_version
from
unicodedata
import
normalize
,
is_normalized
,
unidata_version
TESTDATAFILE
=
"NormalizationTest.txt"
TESTDATAFILE
=
"NormalizationTest.txt"
TESTDATAURL
=
"http://www.pythontest.net/unicode/"
+
unidata_version
+
"/"
+
TESTDATAFILE
TESTDATAURL
=
"http://www.pythontest.net/unicode/"
+
unidata_version
+
"/"
+
TESTDATAFILE
...
@@ -88,6 +88,15 @@ class NormalizationTest(unittest.TestCase):
...
@@ -88,6 +88,15 @@ class NormalizationTest(unittest.TestCase):
NFKD
(
c3
)
==
NFKD
(
c4
)
==
NFKD
(
c5
),
NFKD
(
c3
)
==
NFKD
(
c4
)
==
NFKD
(
c5
),
line
)
line
)
self
.
assertTrue
(
is_normalized
(
"NFC"
,
c2
))
self
.
assertTrue
(
is_normalized
(
"NFC"
,
c4
))
self
.
assertTrue
(
is_normalized
(
"NFD"
,
c3
))
self
.
assertTrue
(
is_normalized
(
"NFD"
,
c5
))
self
.
assertTrue
(
is_normalized
(
"NFKC"
,
c4
))
self
.
assertTrue
(
is_normalized
(
"NFKD"
,
c5
))
# Record part 1 data
# Record part 1 data
if
part
==
"@Part1"
:
if
part
==
"@Part1"
:
part1_data
[
c1
]
=
1
part1_data
[
c1
]
=
1
...
...
Misc/NEWS.d/next/Core and Builtins/2017-12-12-13-43-13.bpo-32285.LzKSwz.rst
0 → 100644
Dosyayı görüntüle @
2810dd7b
New function unicodedata.is_normalized, which can check whether a string is
in a specific normal form.
Modules/clinic/unicodedata.c.h
Dosyayı görüntüle @
2810dd7b
...
@@ -284,6 +284,38 @@ exit:
...
@@ -284,6 +284,38 @@ exit:
return
return_value
;
return
return_value
;
}
}
/* Docstring for unicodedata.UCD.is_normalized.  This lives in the Argument
   Clinic output header (Modules/clinic/unicodedata.c.h); the text mirrors the
   clinic input block that precedes unicodedata_UCD_is_normalized_impl. */
PyDoc_STRVAR
(
unicodedata_UCD_is_normalized__doc__
,
"is_normalized($self, form, unistr, /)
\n
"
"--
\n
"
"
\n
"
"Return whether the Unicode string unistr is in the normal form
\'
form
\'
.
\n
"
"
\n
"
"Valid values for form are
\'
NFC
\'
,
\'
NFKC
\'
,
\'
NFD
\'
, and
\'
NFKD
\'
."
);
/* Method-table entry: exposes the METH_FASTCALL wrapper under the Python name
   "is_normalized" together with the docstring above. */
#define UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF \
{"is_normalized", (PyCFunction)unicodedata_UCD_is_normalized, METH_FASTCALL, unicodedata_UCD_is_normalized__doc__},
/* Forward declaration of the implementation function that the wrapper below
   delegates to once the arguments have been unpacked. */
static
PyObject
*
unicodedata_UCD_is_normalized_impl
(
PyObject
*
self
,
PyObject
*
form
,
PyObject
*
input
);
/* Entry point registered in the method table (METH_FASTCALL): receives the
   positional arguments as a C array `args` of length `nargs`, unpacks them
   into `form` and `input`, and forwards them to the implementation.
   Returns whatever the implementation returns, or NULL if argument parsing
   fails. */
static
PyObject
*
unicodedata_UCD_is_normalized
(
PyObject
*
self
,
PyObject
*
const
*
args
,
Py_ssize_t
nargs
)
{
PyObject
*
return_value
=
NULL
;
PyObject
*
form
;
PyObject
*
input
;
/* "UU" requests two Unicode (str) objects; the text after ':' is the function
   name used in error messages.  On failure return_value stays NULL. */
if
(
!
_PyArg_ParseStack
(
args
,
nargs
,
"UU:is_normalized"
,
&
form
,
&
input
))
{
goto
exit
;
}
return_value
=
unicodedata_UCD_is_normalized_impl
(
self
,
form
,
input
);
/* Single exit point shared by the success and the parse-failure paths. */
exit:
return
return_value
;
}
PyDoc_STRVAR
(
unicodedata_UCD_normalize__doc__
,
PyDoc_STRVAR
(
unicodedata_UCD_normalize__doc__
,
"normalize($self, form, unistr, /)
\n
"
"normalize($self, form, unistr, /)
\n
"
"--
\n
"
"--
\n
"
...
@@ -296,17 +328,17 @@ PyDoc_STRVAR(unicodedata_UCD_normalize__doc__,
...
@@ -296,17 +328,17 @@ PyDoc_STRVAR(unicodedata_UCD_normalize__doc__,
{"normalize", (PyCFunction)unicodedata_UCD_normalize, METH_FASTCALL, unicodedata_UCD_normalize__doc__},
{"normalize", (PyCFunction)unicodedata_UCD_normalize, METH_FASTCALL, unicodedata_UCD_normalize__doc__},
static
PyObject
*
static
PyObject
*
unicodedata_UCD_normalize_impl
(
PyObject
*
self
,
const
char
*
form
,
unicodedata_UCD_normalize_impl
(
PyObject
*
self
,
PyObject
*
form
,
PyObject
*
input
);
PyObject
*
input
);
static
PyObject
*
static
PyObject
*
unicodedata_UCD_normalize
(
PyObject
*
self
,
PyObject
*
const
*
args
,
Py_ssize_t
nargs
)
unicodedata_UCD_normalize
(
PyObject
*
self
,
PyObject
*
const
*
args
,
Py_ssize_t
nargs
)
{
{
PyObject
*
return_value
=
NULL
;
PyObject
*
return_value
=
NULL
;
const
char
*
form
;
PyObject
*
form
;
PyObject
*
input
;
PyObject
*
input
;
if
(
!
_PyArg_ParseStack
(
args
,
nargs
,
"
s
U:normalize"
,
if
(
!
_PyArg_ParseStack
(
args
,
nargs
,
"
U
U:normalize"
,
&
form
,
&
input
))
{
&
form
,
&
input
))
{
goto
exit
;
goto
exit
;
}
}
...
@@ -379,4 +411,4 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg)
...
@@ -379,4 +411,4 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg)
exit:
exit:
return
return_value
;
return
return_value
;
}
}
/*[clinic end generated code: output=
dc899bff0ecd14c1
input=a9049054013a1b77]*/
/*[clinic end generated code: output=
2c5fbf597c18f6b8
input=a9049054013a1b77]*/
Modules/unicodedata.c
Dosyayı görüntüle @
2810dd7b
...
@@ -19,6 +19,11 @@
...
@@ -19,6 +19,11 @@
#include "ucnhash.h"
#include "ucnhash.h"
#include "structmember.h"
#include "structmember.h"
/* Identifiers for the four supported normalization form names.  They are
   referenced as PyId_NFC, PyId_NFD, PyId_NFKC and PyId_NFKD when the `form`
   argument is matched with _PyUnicode_EqualToASCIIId(). */
_Py_IDENTIFIER
(
NFC
);
_Py_IDENTIFIER
(
NFD
);
_Py_IDENTIFIER
(
NFKC
);
_Py_IDENTIFIER
(
NFKD
);
/*[clinic input]
/*[clinic input]
module unicodedata
module unicodedata
class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
...
@@ -770,8 +775,10 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
...
@@ -770,8 +775,10 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
return
result
;
return
result
;
}
}
/* Return 1 if the input is certainly normalized, 0 if it might not be. */
/* Three-way result of the normalization quick check done by is_normalized().
   Note that YES is the first enumerator and therefore has the value 0: a
   result must be compared explicitly (e.g. `== YES`), never tested for
   truthiness. */
typedef
enum
{
/* YES: the input is certainly normalized. */
YES
,
/* NO: a non-canonical combining-class order was found; also returned when an
   older database version is requested and quick checks are disabled. */
NO
,
/* MAYBE: a quick-check bit was set, so the string might need normalization. */
MAYBE
}
NormalMode
;
static
int
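/* Return YES if the input is certainly normalized, NO if it is not normalized
   (also returned when quick checks are disabled for an older database
   version), and MAYBE if the string might need normalization. */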
static
NormalMode
is_normalized
(
PyObject
*
self
,
PyObject
*
input
,
int
nfc
,
int
k
)
is_normalized
(
PyObject
*
self
,
PyObject
*
input
,
int
nfc
,
int
k
)
{
{
Py_ssize_t
i
,
len
;
Py_ssize_t
i
,
len
;
...
@@ -782,7 +789,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
...
@@ -782,7 +789,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
/* An older version of the database is requested, quickchecks must be
/* An older version of the database is requested, quickchecks must be
disabled. */
disabled. */
if
(
self
&&
UCD_Check
(
self
))
if
(
self
&&
UCD_Check
(
self
))
return
0
;
return
NO
;
/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
as described in http://unicode.org/reports/tr15/#Annex8. */
as described in http://unicode.org/reports/tr15/#Annex8. */
...
@@ -799,19 +806,92 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
...
@@ -799,19 +806,92 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
unsigned
char
quickcheck
=
record
->
normalization_quick_check
;
unsigned
char
quickcheck
=
record
->
normalization_quick_check
;
if
(
quickcheck
&
quickcheck_mask
)
if
(
quickcheck
&
quickcheck_mask
)
return
0
;
/* this string might need normalization */
return
MAYBE
;
/* this string might need normalization */
if
(
combining
&&
prev_combining
>
combining
)
if
(
combining
&&
prev_combining
>
combining
)
return
0
;
/* non-canonical sort order, not normalized */
return
NO
;
/* non-canonical sort order, not normalized */
prev_combining
=
combining
;
prev_combining
=
combining
;
}
}
return
1
;
/* certainly normalized */
return
YES
;
/* certainly normalized */
}
/*[clinic input]
unicodedata.UCD.is_normalized
self: self
form: unicode
unistr as input: unicode
/
Return whether the Unicode string unistr is in the normal form 'form'.
Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
[clinic start generated code]*/
static PyObject *
unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
                                   PyObject *input)
/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
{
    /* Report whether `input` is already in the normal form named by `form`.

       self:  the module, or a UCD object bound to an older database version.
       form:  str naming the form: "NFC", "NFKC", "NFD" or "NFKD".
       input: the str to examine.

       Returns a new reference to Py_True or Py_False.  Returns NULL with an
       exception set if `input` cannot be made ready, if `form` is not one of
       the four names (ValueError), or if normalizing/comparing fails.

       The quick check is tried first; the string is only normalized and
       compared against itself when the quick check cannot decide. */
    int nfc = 0;
    int k = 0;
    int quickcheck_disabled;
    NormalMode m;
    PyObject *cmp;
    int match;

    if (PyUnicode_READY(input) == -1) {
        return NULL;
    }

    if (PyUnicode_GET_LENGTH(input) == 0) {
        /* special case empty input strings. */
        Py_RETURN_TRUE;
    }

    if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
        nfc = 1;
    }
    else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
        nfc = 1;
        k = 1;
    }
    else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
        /* matches default values for `nfc` and `k` */
    }
    else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
        k = 1;
    }
    else {
        PyErr_SetString(PyExc_ValueError, "invalid normalization form");
        return NULL;
    }

    m = is_normalized(self, input, nfc, k);
    if (m == YES) {
        Py_RETURN_TRUE;
    }

    /* is_normalized() answers NO without inspecting the string when an older
       database version is requested (quick checks are disabled there).  In
       that case NO carries no information, so it must not be reported as
       "not normalized"; fall through to the full comparison instead. */
    quickcheck_disabled = (self && UCD_Check(self));
    if (m == NO && !quickcheck_disabled) {
        Py_RETURN_FALSE;
    }

    /* Undecided: normalize and check whether anything changed. */
    cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
    if (cmp == NULL) {
        return NULL;
    }
    match = PyUnicode_Compare(input, cmp);
    Py_DECREF(cmp);
    if (match == -1 && PyErr_Occurred()) {
        /* -1 is both "less than" and the error return; disambiguate. */
        return NULL;
    }

    if (match == 0) {
        Py_RETURN_TRUE;
    }
    Py_RETURN_FALSE;
}
}
/*[clinic input]
/*[clinic input]
unicodedata.UCD.normalize
unicodedata.UCD.normalize
self: self
self: self
form:
str
form:
unicode
unistr as input: unicode
unistr as input: unicode
/
/
...
@@ -821,9 +901,9 @@ Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
...
@@ -821,9 +901,9 @@ Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
[clinic start generated code]*/
[clinic start generated code]*/
static
PyObject
*
static
PyObject
*
unicodedata_UCD_normalize_impl
(
PyObject
*
self
,
const
char
*
form
,
unicodedata_UCD_normalize_impl
(
PyObject
*
self
,
PyObject
*
form
,
PyObject
*
input
)
PyObject
*
input
)
/*[clinic end generated code: output=
62d1f8870027efdc input=1744c55f4ab79bf0
]*/
/*[clinic end generated code: output=
05ca4385a2ad6983 input=3a5206c0ad2833fb
]*/
{
{
if
(
PyUnicode_GET_LENGTH
(
input
)
==
0
)
{
if
(
PyUnicode_GET_LENGTH
(
input
)
==
0
)
{
/* Special case empty input strings, since resizing
/* Special case empty input strings, since resizing
...
@@ -832,29 +912,29 @@ unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
...
@@ -832,29 +912,29 @@ unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
return
input
;
return
input
;
}
}
if
(
strcmp
(
form
,
"NFC"
)
==
0
)
{
if
(
_PyUnicode_EqualToASCIIId
(
form
,
&
PyId_NFC
)
)
{
if
(
is_normalized
(
self
,
input
,
1
,
0
))
{
if
(
is_normalized
(
self
,
input
,
1
,
0
)
==
YES
)
{
Py_INCREF
(
input
);
Py_INCREF
(
input
);
return
input
;
return
input
;
}
}
return
nfc_nfkc
(
self
,
input
,
0
);
return
nfc_nfkc
(
self
,
input
,
0
);
}
}
if
(
strcmp
(
form
,
"NFKC"
)
==
0
)
{
if
(
_PyUnicode_EqualToASCIIId
(
form
,
&
PyId_NFKC
)
)
{
if
(
is_normalized
(
self
,
input
,
1
,
1
))
{
if
(
is_normalized
(
self
,
input
,
1
,
1
)
==
YES
)
{
Py_INCREF
(
input
);
Py_INCREF
(
input
);
return
input
;
return
input
;
}
}
return
nfc_nfkc
(
self
,
input
,
1
);
return
nfc_nfkc
(
self
,
input
,
1
);
}
}
if
(
strcmp
(
form
,
"NFD"
)
==
0
)
{
if
(
_PyUnicode_EqualToASCIIId
(
form
,
&
PyId_NFD
)
)
{
if
(
is_normalized
(
self
,
input
,
0
,
0
))
{
if
(
is_normalized
(
self
,
input
,
0
,
0
)
==
YES
)
{
Py_INCREF
(
input
);
Py_INCREF
(
input
);
return
input
;
return
input
;
}
}
return
nfd_nfkd
(
self
,
input
,
0
);
return
nfd_nfkd
(
self
,
input
,
0
);
}
}
if
(
strcmp
(
form
,
"NFKD"
)
==
0
)
{
if
(
_PyUnicode_EqualToASCIIId
(
form
,
&
PyId_NFKD
)
)
{
if
(
is_normalized
(
self
,
input
,
0
,
1
))
{
if
(
is_normalized
(
self
,
input
,
0
,
1
)
==
YES
)
{
Py_INCREF
(
input
);
Py_INCREF
(
input
);
return
input
;
return
input
;
}
}
...
@@ -1271,6 +1351,7 @@ static PyMethodDef unicodedata_functions[] = {
...
@@ -1271,6 +1351,7 @@ static PyMethodDef unicodedata_functions[] = {
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
UNICODEDATA_UCD_NAME_METHODDEF
UNICODEDATA_UCD_NAME_METHODDEF
UNICODEDATA_UCD_LOOKUP_METHODDEF
UNICODEDATA_UCD_LOOKUP_METHODDEF
UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
UNICODEDATA_UCD_NORMALIZE_METHODDEF
UNICODEDATA_UCD_NORMALIZE_METHODDEF
{
NULL
,
NULL
}
/* sentinel */
{
NULL
,
NULL
}
/* sentinel */
};
};
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment