cpython, commit 9e9bcda5
authored Jan 21, 2001 by Fredrik Lundh

forgot to check in the new makeunicodedata.py script

parent d38855c3
Showing 5 changed files with 281 additions and 29 deletions:

Modules/ucnhash.c                   +8    -10
Modules/unicodedata_db.h            +1    -1
Modules/unicodename_db.h            +0    -0
Objects/unicodetype_db.h            +1    -1
Tools/unicode/makeunicodedata.py    +271  -17
Modules/ucnhash.c
@@ -11,16 +11,13 @@
 /* database code (cut and pasted from the unidb package) */

 static unsigned long
-gethash(const char *s, int len)
+gethash(const char *s, int len, int scale)
 {
     int i;
     unsigned long h = 0;
     unsigned long ix;
     for (i = 0; i < len; i++) {
-        /* magic value 47 was chosen to minimize the number
-           of collisions for the uninames dataset. see the
-           makeunicodedata script for more background */
-        h = (h * 47) + (unsigned char) toupper(s[i]);
+        h = (h * scale) + (unsigned char) toupper(s[i]);
         ix = h & 0xff000000;
         if (ix)
             h = (h ^ ((ix >> 24) & 0xff)) & 0x00ffffff;
@@ -40,8 +37,9 @@ getname(Py_UCS4 code, char* buffer, int buflen)
         return 0;

     /* get offset into phrasebook */
-    offset = phrasebook_offset1[(code >> SHIFT)];
-    offset = phrasebook_offset2[(offset << SHIFT) + (code & ((1 << SHIFT) - 1))];
+    offset = phrasebook_offset1[(code >> phrasebook_shift)];
+    offset = phrasebook_offset2[(offset << phrasebook_shift) +
+                                (code & ((1 << phrasebook_shift) - 1))];
     if (!offset)
         return 0;
@@ -99,14 +97,14 @@ static int
 getcode(const char* name, int namelen, Py_UCS4* code)
 {
     unsigned int h, v;
-    unsigned int mask = CODE_SIZE - 1;
+    unsigned int mask = code_size - 1;
     unsigned int i, incr;

     /* the following is the same as python's dictionary lookup, with
        only minor changes.  see the makeunicodedata script for more
        details */

-    h = (unsigned int) gethash(name, namelen);
+    h = (unsigned int) gethash(name, namelen, code_magic);
     i = (~h) & mask;
     v = code_hash[i];
     if (!v)
@@ -129,7 +127,7 @@ getcode(const char* name, int namelen, Py_UCS4* code)
         }
         incr = incr << 1;
         if (incr > mask)
-            incr = incr ^ CODE_POLY;
+            incr = incr ^ code_poly;
     }
 }
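The getname() change above reads the generated name table through a two-level (splitbins-style) index: the first-level table picks a block of the second-level table, and the low bits of the character code select an entry inside that block, so long runs of identical values are stored only once. A minimal sketch of the access pattern, written in Python with hypothetical table names, not code from this commit:

def two_level_get(index1, index2, shift, code):
    # high bits choose the first-level block; low bits index into the
    # second-level table (same arithmetic as the phrasebook lookup above)
    block = index1[code >> shift]
    return index2[(block << shift) + (code & ((1 << shift) - 1))]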
Modules/unicodedata_db.h
-/* this file was generated by tools\unicode\makeunicodedata.py 1.1 */
+/* this file was generated by tools\unicode\makeunicodedata.py 2.1 */

 /* a list of unique database records */
 const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {
Modules/unicodename_db.h

(source diff too large to display; view the blob instead)
Objects/unicodetype_db.h
-/* this file was generated by tools\unicode\makeunicodedata.py 1.1 */
+/* this file was generated by tools\unicode\makeunicodedata.py 2.1 */

 /* a list of unique character type descriptors */
 const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
Tools/unicode/makeunicodedata.py
@@ -2,14 +2,16 @@
 # (re)generate unicode property and type databases
 #
 # this script converts a unicode 3.0 database file to
-# Modules/unicodedata_db.h and Objects/unicodetype_db.h
+# Modules/unicodedata_db.h, Modules/unicodename_db.h,
+# and Objects/unicodetype_db.h
 #
 # history:
 # 2000-09-24 fl   created (based on bits and pieces from unidb)
 # 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
 # 2000-09-25 fl   added character type table
-# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields
+# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
 # 2000-11-03 fl   expand first/last ranges
+# 2001-01-19 fl   added character name tables (2.1)
 #
 # written by Fredrik Lundh (fredrik@pythonware.com), September 2000
 #
@@ -17,7 +19,7 @@
 import sys

 SCRIPT = sys.argv[0]
-VERSION = "1.1"
+VERSION = "2.1"

 UNICODE_DATA = "UnicodeData-Latest.txt"
@@ -42,18 +44,32 @@ UPPER_MASK = 0x80
 def maketables(trace=0):

     print "--- Reading", UNICODE_DATA, "..."

     unicode = UnicodeData(UNICODE_DATA)

-    print "--- Processing", UNICODE_DATA, "..."
     print len(filter(None, unicode.table)), "characters"

     # extract unicode properties
+    makeunicodedata(unicode, trace)
+    makeunicodetype(unicode, trace)
+    makeunicodename(unicode, trace)
+
+# --------------------------------------------------------------------
+# unicode character properties
+
+def makeunicodedata(unicode, trace):
+
     dummy = (0, 0, 0, 0)
     table = [dummy]
     cache = {0: dummy}
     index = [0] * len(unicode.chars)

+    FILE = "Modules/unicodedata_db.h"
+
+    print "--- Preparing", FILE, "..."
+
     # 1) database properties
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
@@ -93,13 +109,11 @@ def maketables(trace=0):
             i = 0
         decomp_index[char] = i

-    FILE = "Modules/unicodedata_db.h"
-
-    print "--- Writing", FILE, "..."
     print len(table), "unique properties"
     print len(decomp_data), "unique decomposition entries"

+    print "--- Writing", FILE, "..."
+
     fp = open(FILE, "w")
     print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
     print >>fp
@@ -111,7 +125,7 @@ def maketables(trace=0):
     print >>fp, "};"
     print >>fp

-    # FIXME: the following tables should be made static, and
+    # FIXME: <fl> the following tables could be made static, and
     # the support code moved into unicodedatabase.c

     print >>fp, "/* string literals */"
@@ -149,8 +163,16 @@ def maketables(trace=0):
     Array("decomp_index1", index1).dump(fp)
     Array("decomp_index2", index2).dump(fp)

-    #
-    # 3) unicode type data
+    fp.close()
+
+# --------------------------------------------------------------------
+# unicode character type tables
+
+def makeunicodetype(unicode, trace):
+
+    FILE = "Objects/unicodetype_db.h"
+
+    print "--- Preparing", FILE, "..."

     # extract unicode types
     dummy = (0, 0, 0, 0, 0, 0)
@@ -209,14 +231,11 @@ def maketables(trace=0):
         table.append(item)
         index[char] = i

-    FILE = "Objects/unicodetype_db.h"
-
-    fp = open(FILE, "w")
-
-    print len(table), "unique character type entries"
+    print "--- Writing", FILE, "..."
+
+    print len(table), "unique character type entries"
+
+    fp = open(FILE, "w")

     print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
     print >>fp
     print >>fp, "/* a list of unique character type descriptors */"
@@ -234,6 +253,155 @@ def maketables(trace=0):
     Array("index1", index1).dump(fp)
     Array("index2", index2).dump(fp)

+    fp.close()
+
+# --------------------------------------------------------------------
+# unicode name database
+
+def makeunicodename(unicode, trace):
+
+    FILE = "Modules/unicodename_db.h"
+
+    print "--- Preparing", FILE, "..."
+
+    # collect names
+    names = [None] * len(unicode.chars)
+
+    for char in unicode.chars:
+        record = unicode.table[char]
+        if record:
+            name = record[1].strip()
+            if name and name[0] != "<":
+                names[char] = name + chr(0)
+
+    print len(filter(lambda n: n is not None, names)), "distinct names"
+
+    # collect unique words from names (note that we differ between
+    # words inside a sentence, and words ending a sentence.  the
+    # latter includes the trailing null byte.
+
+    words = {}
+    n = b = 0
+    for char in unicode.chars:
+        name = names[char]
+        if name:
+            w = name.split()
+            b = b + len(name)
+            n = n + len(w)
+            for w in w:
+                l = words.get(w)
+                if l:
+                    l.append(None)
+                else:
+                    words[w] = [len(words)]
+
+    print n, "words in text;", b, "bytes"
+
+    wordlist = words.items()
+
+    # sort on falling frequency
+    wordlist.sort(lambda a, b: len(b[1]) - len(a[1]))
+
+    # statistics
+    n = 0
+    for i in range(128):
+        n = n + len(wordlist[i][1])
+    print n, "short words (7-bit indices)"
+
+    # pick the 128 most commonly used words, and sort the rest on
+    # falling length (to maximize overlap)
+
+    wordlist, wordtail = wordlist[:128], wordlist[128:]
+    wordtail.sort(lambda a, b: len(b[0]) - len(a[0]))
+    wordlist.extend(wordtail)
+
+    # generate lexicon from words
+
+    lexicon_offset = [0]
+    lexicon = ""
+    words = {}
+
+    # build a lexicon string
+    offset = 0
+    for w, x in wordlist:
+        # encoding: bit 7 indicates last character in word (chr(128)
+        # indicates the last character in an entire string)
+        ww = w[:-1] + chr(ord(w[-1]) + 128)
+        # reuse string tails, when possible
+        o = string.find(lexicon, ww)
+        if o < 0:
+            o = offset
+            lexicon = lexicon + ww
+            offset = offset + len(w)
+        words[w] = len(lexicon_offset)
+        lexicon_offset.append(offset)
+
+    print len(words), "words in lexicon;", len(lexicon), "bytes"
+
+    assert len(words) < 32768 # 15-bit word indices
+
+    lexicon = map(ord, lexicon)
+
+    # generate phrasebook from names and lexicon
+    phrasebook = [0]
+    phrasebook_offset = [0] * len(unicode.chars)
+    for char in unicode.chars:
+        name = names[char]
+        if name:
+            w = name.split()
+            phrasebook_offset[char] = len(phrasebook)
+            for w in w:
+                i = words[w]
+                if i < 128:
+                    phrasebook.append(128 + i)
+                else:
+                    phrasebook.append(i >> 8)
+                    phrasebook.append(i & 255)
+
+    #
+    # unicode name hash table
+
+    # extract names
+    data = []
+    for char in unicode.chars:
+        record = unicode.table[char]
+        if record:
+            name = record[1].strip()
+            if name and name[0] != "<":
+                data.append((name, char))
+
+    # the magic number 47 was chosen to minimize the number of
+    # collisions on the current data set.  if you like, change it
+    # and see what happens...
+
+    codehash = Hash("code", data, 47)
+
+    print "--- Writing", FILE, "..."
+
+    fp = open(FILE, "w")
+    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
+    print >>fp
+    print >>fp, "#define NAME_MAXLEN", 256
+    print >>fp
+    print >>fp, "/* lexicon */"
+    Array("lexicon", lexicon).dump(fp)
+    Array("lexicon_offset", lexicon_offset).dump(fp)
+
+    # split decomposition index table
+    offset1, offset2, shift = splitbins(phrasebook_offset, trace)
+
+    print >>fp, "/* code->name phrasebook */"
+    print >>fp, "#define phrasebook_shift", shift
+
+    Array("phrasebook", phrasebook).dump(fp)
+    Array("phrasebook_offset1", offset1).dump(fp)
+    Array("phrasebook_offset2", offset2).dump(fp)
+
+    print >>fp, "/* name->code dictionary */"
+    codehash.dump(fp)
+
+    fp.close()
+
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB
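The phrasebook built above is a flat byte stream of word indices: the 128 most frequent words are encoded in a single byte as 128+i (bit 7 set), and every other word takes two bytes, i>>8 followed by i&255. The assert on 32768 keeps the encoding unambiguous, since the high byte of a two-byte index can then never have bit 7 set. A small round-trip sketch of just that packing, written in modern Python as an illustration, not code from the commit:

def pack_indices(indices):
    # frequent words (index < 128) take one byte with bit 7 set;
    # the rest split into a 7-bit high byte and an 8-bit low byte
    out = []
    for i in indices:
        if i < 128:
            out.append(128 + i)
        else:
            out.append(i >> 8)
            out.append(i & 255)
    return out

def unpack_indices(data):
    indices, it = [], iter(data)
    for b in it:
        if b >= 128:
            indices.append(b - 128)              # single-byte form
        else:
            indices.append((b << 8) + next(it))  # two-byte form
    return indices

assert unpack_indices(pack_indices([5, 1000, 127, 300])) == [5, 1000, 127, 300]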
@@ -280,6 +448,92 @@ class UnicodeData:
         # restrict character range to ISO Latin 1
         self.chars = range(256)

+# hash table tools
+
+# this is a straight-forward reimplementation of Python's built-in
+# dictionary type, using a static data structure, and a custom string
+# hash algorithm.
+
+def myhash(s, magic):
+    h = 0
+    for c in map(ord, string.upper(s)):
+        h = (h * magic) + c
+        ix = h & 0xff000000
+        if ix:
+            h = (h ^ ((ix >> 24) & 0xff)) & 0x00ffffff
+    return h
+
+SIZES = [
+    (4, 3), (8, 3), (16, 3), (32, 5), (64, 3), (128, 3), (256, 29),
+    (512, 17), (1024, 9), (2048, 5), (4096, 83), (8192, 27),
+    (16384, 43), (32768, 3), (65536, 45), (131072, 9), (262144, 39),
+    (524288, 39), (1048576, 9), (2097152, 5), (4194304, 3),
+    (8388608, 33), (16777216, 27)
+]
+
+class Hash:
+    def __init__(self, name, data, magic):
+        # turn a (key, value) list into a static hash table structure
+
+        # determine table size
+        for size, poly in SIZES:
+            if size > len(data):
+                poly = size + poly
+                break
+        else:
+            raise AssertionError, "ran out of polynominals"
+
+        print size, "slots in hash table"
+
+        table = [None] * size
+
+        mask = size - 1
+
+        n = 0
+
+        hash = myhash
+
+        # initialize hash table
+        for key, value in data:
+            h = hash(key, magic)
+            i = (~h) & mask
+            v = table[i]
+            if v is None:
+                table[i] = value
+                continue
+            incr = (h ^ (h >> 3)) & mask;
+            if not incr:
+                incr = mask
+            while 1:
+                n = n + 1
+                i = (i + incr) & mask
+                v = table[i]
+                if v is None:
+                    table[i] = value
+                    break
+                incr = incr << 1
+                if incr > mask:
+                    incr = incr ^ poly
+
+        print n, "collisions"
+        self.collisions = n
+
+        for i in range(len(table)):
+            if table[i] is None:
+                table[i] = 0
+
+        self.data = Array(name + "_hash", table)
+        self.magic = magic
+        self.name = name
+        self.size = size
+        self.poly = poly
+
+    def dump(self, file):
+        # write data to file, as a C array
+        self.data.dump(file)
+        file.write("#define %s_magic %d\n" % (self.name, self.magic))
+        file.write("#define %s_size %d\n" % (self.name, self.size))
+        file.write("#define %s_poly %d\n" % (self.name, self.poly))
+
 # stuff to deal with arrays of unsigned integers

 class Array:
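Hash.__init__ above resolves collisions the way Python's dictionaries did at the time: the first probe lands at (~h) & mask, and each miss advances by an increment that is doubled after every step and folded back into range with the table's polynomial. getcode() in Modules/ucnhash.c replays the same sequence at lookup time. A sketch of the probe order, assuming a precomputed hash value h and the (size, poly) pair selected in __init__ (an illustration, not code from the commit):

def probe_sequence(h, mask, poly):
    # mirrors the collision handling in Hash.__init__ and getcode():
    # start at (~h) & mask, then step by a doubling increment
    i = (~h) & mask
    yield i
    incr = (h ^ (h >> 3)) & mask
    if not incr:
        incr = mask
    while True:
        i = (i + incr) & mask
        yield i
        incr = incr << 1
        if incr > mask:
            incr = incr ^ poly  # fold the overflow bit back into range

Because poly was adjusted to size + poly when the table size was picked, the xor clears the overflow bit while mixing in low bits, keeping incr nonzero and below the table size; the per-size polynomials in SIZES appear chosen to make this pseudo-random walk spread well. A caller would bound the generator, e.g. itertools.islice(probe_sequence(h, mask, poly), size).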