Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
7b7dd107
Kaydet (Commit)
7b7dd107
authored
24 years ago
tarafından
Fredrik Lundh
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
compress unicode decomposition tables (this saves another 55k)
üst
f75c9d94
No related merge requests found
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
179 additions
and
208 deletions
+179
-208
ucnhash.c
Modules/ucnhash.c
+7
-8
unicodedata.c
Modules/unicodedata.c
+94
-79
unicodedata_db.h
Modules/unicodedata_db.h
+0
-0
unicodedatabase.c
Modules/unicodedatabase.c
+1
-47
unicodedatabase.h
Modules/unicodedatabase.h
+1
-33
unicodename_db.h
Modules/unicodename_db.h
+0
-0
makeunicodedata.py
Tools/unicode/makeunicodedata.py
+76
-41
No files found.
Modules/ucnhash.c
Dosyayı görüntüle @
7b7dd107
...
...
@@ -38,7 +38,7 @@ getname(Py_UCS4 code, char* buffer, int buflen)
/* get offset into phrasebook */
offset
=
phrasebook_offset1
[(
code
>>
phrasebook_shift
)];
offset
=
phrasebook_offset2
[(
offset
<<
phrasebook_shift
)
+
offset
=
phrasebook_offset2
[(
offset
<<
phrasebook_shift
)
+
(
code
&
((
1
<<
phrasebook_shift
)
-
1
))];
if
(
!
offset
)
return
0
;
...
...
@@ -47,13 +47,12 @@ getname(Py_UCS4 code, char* buffer, int buflen)
for
(;;)
{
/* get word index */
if
(
phrasebook
[
offset
]
&
128
)
{
word
=
phrasebook
[
offset
]
&
127
;
offset
++
;
}
else
{
word
=
(
phrasebook
[
offset
]
<<
8
)
+
phrasebook
[
offset
+
1
];
offset
+=
2
;
}
word
=
phrasebook
[
offset
]
-
phrasebook_short
;
if
(
word
>=
0
)
{
word
=
(
word
<<
8
)
+
phrasebook
[
offset
+
1
];
offset
+=
2
;
}
else
word
=
phrasebook
[
offset
++
];
if
(
i
)
{
if
(
i
>
buflen
)
return
0
;
/* buffer overflow */
...
...
This diff is collapsed.
Click to expand it.
Modules/unicodedata.c
Dosyayı görüntüle @
7b7dd107
...
...
@@ -14,11 +14,40 @@
#include "Python.h"
#include "unicodedatabase.h"
/* One entry of the static Unicode character database.
   All fields are single bytes; see unicodedata_db.h for the tables. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
} _PyUnicode_DatabaseRecord;
/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"
/* Return the database record for the first character of v.
   Code points outside [0, 65536) fall back to record 0 (the
   default "unassigned" entry), so the lookup never fails. */
static const _PyUnicode_DatabaseRecord *
getrecord(PyUnicodeObject *v)
{
    int code = (int) *PyUnicode_AS_UNICODE(v);
    int index;

    if (code < 0 || code >= 65536)
        index = 0;
    else {
        /* two-level trie lookup; tables are generated by
           Tools/unicode/makeunicodedata.py */
        index = index1[(code >> SHIFT)];
        index = index2[(index << SHIFT) + (code & ((1 << SHIFT) - 1))];
    }

    return &_PyUnicode_Database_Records[index];
}
/* --- Module API --------------------------------------------------------- */
static
PyObject
*
unicodedata_decimal
(
PyObject
*
self
,
PyObject
*
args
)
unicodedata_decimal
(
PyObject
*
self
,
PyObject
*
args
)
{
PyUnicodeObject
*
v
;
PyObject
*
defobj
=
NULL
;
...
...
@@ -26,18 +55,18 @@ unicodedata_decimal(PyObject *self,
if
(
!
PyArg_ParseTuple
(
args
,
"O!|O:decimal"
,
&
PyUnicode_Type
,
&
v
,
&
defobj
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
rc
=
Py_UNICODE_TODECIMAL
(
*
PyUnicode_AS_UNICODE
(
v
));
if
(
rc
<
0
)
{
if
(
defobj
==
NULL
)
{
PyErr_SetString
(
PyExc_ValueError
,
"not a decimal"
);
goto
onError
;
return
NULL
;
}
else
{
Py_INCREF
(
defobj
);
...
...
@@ -45,14 +74,10 @@ unicodedata_decimal(PyObject *self,
}
}
return
PyInt_FromLong
(
rc
);
onError:
return
NULL
;
}
static
PyObject
*
unicodedata_digit
(
PyObject
*
self
,
PyObject
*
args
)
unicodedata_digit
(
PyObject
*
self
,
PyObject
*
args
)
{
PyUnicodeObject
*
v
;
PyObject
*
defobj
=
NULL
;
...
...
@@ -60,18 +85,18 @@ unicodedata_digit(PyObject *self,
if
(
!
PyArg_ParseTuple
(
args
,
"O!|O:digit"
,
&
PyUnicode_Type
,
&
v
,
&
defobj
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
rc
=
Py_UNICODE_TODIGIT
(
*
PyUnicode_AS_UNICODE
(
v
));
if
(
rc
<
0
)
{
if
(
defobj
==
NULL
)
{
PyErr_SetString
(
PyExc_ValueError
,
"not a digit"
);
goto
onError
;
return
NULL
;
}
else
{
Py_INCREF
(
defobj
);
...
...
@@ -79,14 +104,10 @@ unicodedata_digit(PyObject *self,
}
}
return
PyInt_FromLong
(
rc
);
onError:
return
NULL
;
}
static
PyObject
*
unicodedata_numeric
(
PyObject
*
self
,
PyObject
*
args
)
unicodedata_numeric
(
PyObject
*
self
,
PyObject
*
args
)
{
PyUnicodeObject
*
v
;
PyObject
*
defobj
=
NULL
;
...
...
@@ -94,18 +115,18 @@ unicodedata_numeric(PyObject *self,
if
(
!
PyArg_ParseTuple
(
args
,
"O!|O:numeric"
,
&
PyUnicode_Type
,
&
v
,
&
defobj
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
rc
=
Py_UNICODE_TONUMERIC
(
*
PyUnicode_AS_UNICODE
(
v
));
if
(
rc
<
0
)
{
if
(
defobj
==
NULL
)
{
PyErr_SetString
(
PyExc_ValueError
,
"not a numeric character"
);
goto
onError
;
return
NULL
;
}
else
{
Py_INCREF
(
defobj
);
...
...
@@ -113,129 +134,123 @@ unicodedata_numeric(PyObject *self,
}
}
return
PyFloat_FromDouble
(
rc
);
onError:
return
NULL
;
}
static
PyObject
*
unicodedata_category
(
PyObject
*
self
,
PyObject
*
args
)
unicodedata_category
(
PyObject
*
self
,
PyObject
*
args
)
{
PyUnicodeObject
*
v
;
int
index
;
if
(
!
PyArg_ParseTuple
(
args
,
"O!:category"
,
&
PyUnicode_Type
,
&
v
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
index
=
(
int
)
_PyUnicode_Database_GetRecord
(
(
int
)
*
PyUnicode_AS_UNICODE
(
v
)
)
->
category
;
index
=
(
int
)
getrecord
(
v
)
->
category
;
return
PyString_FromString
(
_PyUnicode_CategoryNames
[
index
]);
onError:
return
NULL
;
}
static
PyObject
*
unicodedata_bidirectional
(
PyObject
*
self
,
PyObject
*
args
)
unicodedata_bidirectional
(
PyObject
*
self
,
PyObject
*
args
)
{
PyUnicodeObject
*
v
;
int
index
;
if
(
!
PyArg_ParseTuple
(
args
,
"O!:bidirectional"
,
&
PyUnicode_Type
,
&
v
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
index
=
(
int
)
_PyUnicode_Database_GetRecord
(
(
int
)
*
PyUnicode_AS_UNICODE
(
v
)
)
->
bidirectional
;
index
=
(
int
)
getrecord
(
v
)
->
bidirectional
;
return
PyString_FromString
(
_PyUnicode_BidirectionalNames
[
index
]);
onError:
return
NULL
;
}
static
PyObject
*
unicodedata_combining
(
PyObject
*
self
,
PyObject
*
args
)
unicodedata_combining
(
PyObject
*
self
,
PyObject
*
args
)
{
PyUnicodeObject
*
v
;
int
value
;
if
(
!
PyArg_ParseTuple
(
args
,
"O!:combining"
,
&
PyUnicode_Type
,
&
v
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
value
=
(
int
)
_PyUnicode_Database_GetRecord
(
(
int
)
*
PyUnicode_AS_UNICODE
(
v
)
)
->
combining
;
return
PyInt_FromLong
(
value
);
onError:
return
NULL
;
return
PyInt_FromLong
((
int
)
getrecord
(
v
)
->
combining
);
}
static
PyObject
*
unicodedata_mirrored
(
PyObject
*
self
,
PyObject
*
args
)
unicodedata_mirrored
(
PyObject
*
self
,
PyObject
*
args
)
{
PyUnicodeObject
*
v
;
int
value
;
if
(
!
PyArg_ParseTuple
(
args
,
"O!:mirrored"
,
&
PyUnicode_Type
,
&
v
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
value
=
(
int
)
_PyUnicode_Database_GetRecord
(
(
int
)
*
PyUnicode_AS_UNICODE
(
v
)
)
->
mirrored
;
return
PyInt_FromLong
(
value
);
onError:
return
NULL
;
return
PyInt_FromLong
((
int
)
getrecord
(
v
)
->
mirrored
);
}
static
PyObject
*
unicodedata_decomposition
(
PyObject
*
self
,
PyObject
*
args
)
unicodedata_decomposition
(
PyObject
*
self
,
PyObject
*
args
)
{
PyUnicodeObject
*
v
;
const
char
*
value
;
char
decomp
[
256
];
int
code
,
index
,
count
,
i
;
if
(
!
PyArg_ParseTuple
(
args
,
"O!:decomposition"
,
&
PyUnicode_Type
,
&
v
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
code
=
(
int
)
*
PyUnicode_AS_UNICODE
(
v
);
if
(
code
<
0
||
code
>=
65536
)
index
=
0
;
else
{
index
=
decomp_index1
[(
code
>>
DECOMP_SHIFT
)];
index
=
decomp_index2
[(
index
<<
DECOMP_SHIFT
)
+
(
code
&
((
1
<<
DECOMP_SHIFT
)
-
1
))];
}
/* high byte is number of hex bytes (usually one or two), low byte
   is prefix code (from decomp_prefix) */
count
=
decomp_data
[
index
]
>>
8
;
/* XXX: could allocate the PyString up front instead
(strlen(prefix) + 5 * count + 1 bytes) */
/* copy prefix */
i
=
strlen
(
decomp_prefix
[
decomp_data
[
index
]
&
255
]);
memcpy
(
decomp
,
decomp_prefix
[
decomp_data
[
index
]
&
255
],
i
);
while
(
count
--
>
0
)
{
if
(
i
)
decomp
[
i
++
]
=
' '
;
sprintf
(
decomp
+
i
,
"%04X"
,
decomp_data
[
++
index
]);
i
+=
strlen
(
decomp
+
i
);
}
value
=
_PyUnicode_Database_GetDecomposition
(
(
int
)
*
PyUnicode_AS_UNICODE
(
v
)
);
return
PyString_FromString
(
value
);
onError:
return
NULL
;
decomp
[
i
]
=
'\0'
;
return
PyString_FromString
(
decomp
);
}
/* XXX Add doc strings. */
...
...
This diff is collapsed.
Click to expand it.
Modules/unicodedata_db.h
Dosyayı görüntüle @
7b7dd107
This diff is collapsed.
Click to expand it.
Modules/unicodedatabase.c
Dosyayı görüntüle @
7b7dd107
/* ------------------------------------------------------------------------
unicodedatabase -- The Unicode 3.0 data base.
Data was extracted from the Unicode 3.0 UnicodeData.txt file.
Written by Marc-Andre Lemburg (mal@lemburg.com).
Rewritten for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Copyright (c) Corporation for National Research Initiatives.
------------------------------------------------------------------------ */
#include "Python.h"
#include "unicodedatabase.h"
/* read the actual data from a separate file! */
#include "unicodedata_db.h"
/* Look up the database record for the given code point.
   Out-of-range code points (negative, or >= 65536) map to
   record 0, the default "unassigned" entry. */
const _PyUnicode_DatabaseRecord *
_PyUnicode_Database_GetRecord(int code)
{
    int index = 0;

    if (code >= 0 && code < 65536) {
        /* two-level trie lookup over the generated index tables */
        index = index1[(code >> SHIFT)];
        index = index2[(index << SHIFT) + (code & ((1 << SHIFT) - 1))];
    }

    return &_PyUnicode_Database_Records[index];
}
/* Look up the decomposition string for the given code point.
   Out-of-range code points map to entry 0 (the empty
   decomposition), so a valid string is always returned. */
const char *
_PyUnicode_Database_GetDecomposition(int code)
{
    int index = 0;

    if (code >= 0 && code < 65536) {
        /* two-level trie lookup over the generated index tables */
        index = decomp_index1[(code >> DECOMP_SHIFT)];
        index = decomp_index2[(index << DECOMP_SHIFT) +
                              (code & ((1 << DECOMP_SHIFT) - 1))];
    }

    return decomp_data[index];
}
/* remove this file! */
This diff is collapsed.
Click to expand it.
Modules/unicodedatabase.h
Dosyayı görüntüle @
7b7dd107
/* ------------------------------------------------------------------------
unicodedatabase -- The Unicode 3.0 data base.
Data was extracted from the Unicode 3.0 UnicodeData.txt file.
Written by Marc-Andre Lemburg (mal@lemburg.com).
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Copyright (c) Corporation for National Research Initiatives.
------------------------------------------------------------------------ */
/* --- Unicode database entry --------------------------------------------- */
/* One entry of the static Unicode character database.
   All fields are single bytes; the tables themselves live in
   the generated data file. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
} _PyUnicode_DatabaseRecord;
/* --- Unicode category names --------------------------------------------- */
extern
const
char
*
_PyUnicode_CategoryNames
[];
extern
const
char
*
_PyUnicode_BidirectionalNames
[];
/* --- Unicode Database --------------------------------------------------- */
extern
const
_PyUnicode_DatabaseRecord
*
_PyUnicode_Database_GetRecord
(
int
ch
);
extern
const
char
*
_PyUnicode_Database_GetDecomposition
(
int
ch
);
/* remove this file! */
This diff is collapsed.
Click to expand it.
Modules/unicodename_db.h
Dosyayı görüntüle @
7b7dd107
This diff is collapsed.
Click to expand it.
Tools/unicode/makeunicodedata.py
Dosyayı görüntüle @
7b7dd107
...
...
@@ -12,8 +12,9 @@
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
#
# written by Fredrik Lundh (fredrik@pythonware.com)
, September 2000
# written by Fredrik Lundh (fredrik@pythonware.com)
#
import
sys
...
...
@@ -50,9 +51,9 @@ def maketables(trace=0):
print
len
(
filter
(
None
,
unicode
.
table
)),
"characters"
# makeunicodename(unicode, trace)
makeunicodedata
(
unicode
,
trace
)
makeunicodetype
(
unicode
,
trace
)
makeunicodename
(
unicode
,
trace
)
# makeunicodetype(unicode, trace)
# --------------------------------------------------------------------
# unicode character properties
...
...
@@ -90,27 +91,45 @@ def makeunicodedata(unicode, trace):
# 2) decomposition data
# FIXME: <fl> using the encoding stuff from unidb would save
# another 50k or so, but I'll leave that for 2.1...
decomp_data
=
[
""
]
decomp_data
=
[
0
]
decomp_prefix
=
[
""
]
decomp_index
=
[
0
]
*
len
(
unicode
.
chars
)
decomp_size
=
0
for
char
in
unicode
.
chars
:
record
=
unicode
.
table
[
char
]
if
record
:
if
record
[
5
]:
decomp
=
string
.
split
(
record
[
5
])
# prefix
if
decomp
[
0
][
0
]
==
"<"
:
prefix
=
decomp
.
pop
(
0
)
else
:
prefix
=
""
try
:
i
=
decomp_prefix
.
index
(
prefix
)
except
ValueError
:
i
=
len
(
decomp_prefix
)
decomp_prefix
.
append
(
prefix
)
prefix
=
i
assert
prefix
<
256
# content
decomp
=
[
prefix
+
(
len
(
decomp
)
<<
8
)]
+
\
map
(
lambda
s
:
int
(
s
,
16
),
decomp
)
try
:
i
=
decomp_data
.
index
(
record
[
5
]
)
i
=
decomp_data
.
index
(
decomp
)
except
ValueError
:
i
=
len
(
decomp_data
)
decomp_data
.
append
(
record
[
5
])
decomp_data
.
extend
(
decomp
)
decomp_size
=
decomp_size
+
len
(
decomp
)
*
2
else
:
i
=
0
decomp_index
[
char
]
=
i
print
len
(
table
),
"unique properties"
print
len
(
decomp_data
),
"unique decomposition entries"
print
len
(
decomp_prefix
),
"unique decomposition prefixes"
print
len
(
decomp_data
),
"unique decomposition entries:"
,
print
decomp_size
,
"bytes"
print
"--- Writing"
,
FILE
,
"..."
...
...
@@ -141,8 +160,8 @@ def makeunicodedata(unicode, trace):
print
>>
fp
,
" NULL"
print
>>
fp
,
"};"
print
>>
fp
,
"static const char *decomp_
data
[] = {"
for
name
in
decomp_
data
:
print
>>
fp
,
"static const char *decomp_
prefix
[] = {"
for
name
in
decomp_
prefix
:
print
>>
fp
,
"
\"
%
s
\"
,"
%
name
print
>>
fp
,
" NULL"
print
>>
fp
,
"};"
...
...
@@ -152,16 +171,19 @@ def makeunicodedata(unicode, trace):
print
>>
fp
,
"/* index tables for the database records */"
print
>>
fp
,
"#define SHIFT"
,
shift
Array
(
"index1"
,
index1
)
.
dump
(
fp
)
Array
(
"index2"
,
index2
)
.
dump
(
fp
)
Array
(
"index1"
,
index1
)
.
dump
(
fp
,
trace
)
Array
(
"index2"
,
index2
)
.
dump
(
fp
,
trace
)
# split decomposition index table
index1
,
index2
,
shift
=
splitbins
(
decomp_index
,
trace
)
print
>>
fp
,
"/* decomposition data */"
Array
(
"decomp_data"
,
decomp_data
)
.
dump
(
fp
,
trace
)
print
>>
fp
,
"/* index tables for the decomposition data */"
print
>>
fp
,
"#define DECOMP_SHIFT"
,
shift
Array
(
"decomp_index1"
,
index1
)
.
dump
(
fp
)
Array
(
"decomp_index2"
,
index2
)
.
dump
(
fp
)
Array
(
"decomp_index1"
,
index1
)
.
dump
(
fp
,
trace
)
Array
(
"decomp_index2"
,
index2
)
.
dump
(
fp
,
trace
)
fp
.
close
()
...
...
@@ -250,8 +272,8 @@ def makeunicodetype(unicode, trace):
print
>>
fp
,
"/* type indexes */"
print
>>
fp
,
"#define SHIFT"
,
shift
Array
(
"index1"
,
index1
)
.
dump
(
fp
)
Array
(
"index2"
,
index2
)
.
dump
(
fp
)
Array
(
"index1"
,
index1
)
.
dump
(
fp
,
trace
)
Array
(
"index2"
,
index2
)
.
dump
(
fp
,
trace
)
fp
.
close
()
...
...
@@ -302,16 +324,28 @@ def makeunicodename(unicode, trace):
# sort on falling frequency
wordlist
.
sort
(
lambda
a
,
b
:
len
(
b
[
1
])
-
len
(
a
[
1
]))
# figure out how many phrasebook escapes we need
escapes
=
0
while
escapes
*
256
<
len
(
wordlist
):
escapes
=
escapes
+
1
print
escapes
,
"escapes"
short
=
256
-
escapes
assert
short
>
0
print
short
,
"short indexes in lexicon"
# statistics
n
=
0
for
i
in
range
(
128
):
for
i
in
range
(
short
):
n
=
n
+
len
(
wordlist
[
i
][
1
])
print
n
,
"short
words (7-bit indices)
"
print
n
,
"short
indexes in phrasebook
"
# pick the
128 most commonly used words, and sort the rest on
#
falling
length (to maximize overlap)
# pick the
most commonly used words, and sort the rest on falling
# length (to maximize overlap)
wordlist
,
wordtail
=
wordlist
[:
128
],
wordlist
[
128
:]
wordlist
,
wordtail
=
wordlist
[:
short
],
wordlist
[
short
:]
wordtail
.
sort
(
lambda
a
,
b
:
len
(
b
[
0
])
-
len
(
a
[
0
]))
wordlist
.
extend
(
wordtail
)
...
...
@@ -334,11 +368,7 @@ def makeunicodename(unicode, trace):
lexicon
=
lexicon
+
ww
offset
=
offset
+
len
(
w
)
words
[
w
]
=
len
(
lexicon_offset
)
lexicon_offset
.
append
(
offset
)
print
len
(
words
),
"words in lexicon;"
,
len
(
lexicon
),
"bytes"
assert
len
(
words
)
<
32768
# 15-bit word indices
lexicon_offset
.
append
(
o
)
lexicon
=
map
(
ord
,
lexicon
)
...
...
@@ -352,12 +382,15 @@ def makeunicodename(unicode, trace):
phrasebook_offset
[
char
]
=
len
(
phrasebook
)
for
w
in
w
:
i
=
words
[
w
]
if
i
<
128
:
phrasebook
.
append
(
128
+
i
)
if
i
<
short
:
phrasebook
.
append
(
i
)
else
:
phrasebook
.
append
(
i
>>
8
)
# store as two bytes
phrasebook
.
append
((
i
>>
8
)
+
short
)
phrasebook
.
append
(
i
&
255
)
assert
getsize
(
phrasebook
)
==
1
#
# unicode name hash table
...
...
@@ -384,21 +417,22 @@ def makeunicodename(unicode, trace):
print
>>
fp
,
"#define NAME_MAXLEN"
,
256
print
>>
fp
print
>>
fp
,
"/* lexicon */"
Array
(
"lexicon"
,
lexicon
)
.
dump
(
fp
)
Array
(
"lexicon_offset"
,
lexicon_offset
)
.
dump
(
fp
)
Array
(
"lexicon"
,
lexicon
)
.
dump
(
fp
,
trace
)
Array
(
"lexicon_offset"
,
lexicon_offset
)
.
dump
(
fp
,
trace
)
# split decomposition index table
offset1
,
offset2
,
shift
=
splitbins
(
phrasebook_offset
,
trace
)
print
>>
fp
,
"/* code->name phrasebook */"
print
>>
fp
,
"#define phrasebook_shift"
,
shift
print
>>
fp
,
"#define phrasebook_short"
,
short
Array
(
"phrasebook"
,
phrasebook
)
.
dump
(
fp
)
Array
(
"phrasebook_offset1"
,
offset1
)
.
dump
(
fp
)
Array
(
"phrasebook_offset2"
,
offset2
)
.
dump
(
fp
)
Array
(
"phrasebook"
,
phrasebook
)
.
dump
(
fp
,
trace
)
Array
(
"phrasebook_offset1"
,
offset1
)
.
dump
(
fp
,
trace
)
Array
(
"phrasebook_offset2"
,
offset2
)
.
dump
(
fp
,
trace
)
print
>>
fp
,
"/* name->code dictionary */"
codehash
.
dump
(
fp
)
codehash
.
dump
(
fp
,
trace
)
fp
.
close
()
...
...
@@ -527,9 +561,9 @@ class Hash:
self
.
size
=
size
self
.
poly
=
poly
def
dump
(
self
,
file
):
def
dump
(
self
,
file
,
trace
):
# write data to file, as a C array
self
.
data
.
dump
(
file
)
self
.
data
.
dump
(
file
,
trace
)
file
.
write
(
"#define
%
s_magic
%
d
\n
"
%
(
self
.
name
,
self
.
magic
))
file
.
write
(
"#define
%
s_size
%
d
\n
"
%
(
self
.
name
,
self
.
size
))
file
.
write
(
"#define
%
s_poly
%
d
\n
"
%
(
self
.
name
,
self
.
poly
))
...
...
@@ -542,10 +576,11 @@ class Array:
self
.
name
=
name
self
.
data
=
data
def
dump
(
self
,
file
):
def
dump
(
self
,
file
,
trace
=
0
):
# write data to file, as a C array
size
=
getsize
(
self
.
data
)
# print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
if
trace
:
print
>>
sys
.
stderr
,
self
.
name
+
":"
,
size
*
len
(
self
.
data
),
"bytes"
file
.
write
(
"static "
)
if
size
==
1
:
file
.
write
(
"unsigned char"
)
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment