Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
7b7dd107
Kaydet (Commit)
7b7dd107
authored
Ock 21, 2001
tarafından
Fredrik Lundh
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
compress unicode decomposition tables (this saves another 55k)
üst
f75c9d94
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
179 additions
and
208 deletions
+179
-208
ucnhash.c
Modules/ucnhash.c
+7
-8
unicodedata.c
Modules/unicodedata.c
+94
-79
unicodedata_db.h
Modules/unicodedata_db.h
+0
-0
unicodedatabase.c
Modules/unicodedatabase.c
+1
-47
unicodedatabase.h
Modules/unicodedatabase.h
+1
-33
unicodename_db.h
Modules/unicodename_db.h
+0
-0
makeunicodedata.py
Tools/unicode/makeunicodedata.py
+76
-41
No files found.
Modules/ucnhash.c
Dosyayı görüntüle @
7b7dd107
...
@@ -38,7 +38,7 @@ getname(Py_UCS4 code, char* buffer, int buflen)
...
@@ -38,7 +38,7 @@ getname(Py_UCS4 code, char* buffer, int buflen)
/* get offset into phrasebook */
/* get offset into phrasebook */
offset
=
phrasebook_offset1
[(
code
>>
phrasebook_shift
)];
offset
=
phrasebook_offset1
[(
code
>>
phrasebook_shift
)];
offset
=
phrasebook_offset2
[(
offset
<<
phrasebook_shift
)
+
offset
=
phrasebook_offset2
[(
offset
<<
phrasebook_shift
)
+
(
code
&
((
1
<<
phrasebook_shift
)
-
1
))];
(
code
&
((
1
<<
phrasebook_shift
)
-
1
))];
if
(
!
offset
)
if
(
!
offset
)
return
0
;
return
0
;
...
@@ -47,13 +47,12 @@ getname(Py_UCS4 code, char* buffer, int buflen)
...
@@ -47,13 +47,12 @@ getname(Py_UCS4 code, char* buffer, int buflen)
for
(;;)
{
for
(;;)
{
/* get word index */
/* get word index */
if
(
phrasebook
[
offset
]
&
128
)
{
word
=
phrasebook
[
offset
]
-
phrasebook_short
;
word
=
phrasebook
[
offset
]
&
127
;
if
(
word
>=
0
)
{
offset
++
;
word
=
(
word
<<
8
)
+
phrasebook
[
offset
+
1
];
}
else
{
offset
+=
2
;
word
=
(
phrasebook
[
offset
]
<<
8
)
+
phrasebook
[
offset
+
1
];
}
else
offset
+=
2
;
word
=
phrasebook
[
offset
++
];
}
if
(
i
)
{
if
(
i
)
{
if
(
i
>
buflen
)
if
(
i
>
buflen
)
return
0
;
/* buffer overflow */
return
0
;
/* buffer overflow */
...
...
Modules/unicodedata.c
Dosyayı görüntüle @
7b7dd107
...
@@ -14,11 +14,40 @@
...
@@ -14,11 +14,40 @@
#include "Python.h"
#include "Python.h"
#include "unicodedatabase.h"
#include "unicodedatabase.h"
typedef
struct
{
const
unsigned
char
category
;
/* index into
_PyUnicode_CategoryNames */
const
unsigned
char
combining
;
/* combining class value 0 - 255 */
const
unsigned
char
bidirectional
;
/* index into
_PyUnicode_BidirectionalNames */
const
unsigned
char
mirrored
;
/* true if mirrored in bidir mode */
}
_PyUnicode_DatabaseRecord
;
/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"
static
const
_PyUnicode_DatabaseRecord
*
getrecord
(
PyUnicodeObject
*
v
)
{
int
code
;
int
index
;
code
=
(
int
)
*
PyUnicode_AS_UNICODE
(
v
);
if
(
code
<
0
||
code
>=
65536
)
index
=
0
;
else
{
index
=
index1
[(
code
>>
SHIFT
)];
index
=
index2
[(
index
<<
SHIFT
)
+
(
code
&
((
1
<<
SHIFT
)
-
1
))];
}
return
&
_PyUnicode_Database_Records
[
index
];
}
/* --- Module API --------------------------------------------------------- */
/* --- Module API --------------------------------------------------------- */
static
PyObject
*
static
PyObject
*
unicodedata_decimal
(
PyObject
*
self
,
unicodedata_decimal
(
PyObject
*
self
,
PyObject
*
args
)
PyObject
*
args
)
{
{
PyUnicodeObject
*
v
;
PyUnicodeObject
*
v
;
PyObject
*
defobj
=
NULL
;
PyObject
*
defobj
=
NULL
;
...
@@ -26,18 +55,18 @@ unicodedata_decimal(PyObject *self,
...
@@ -26,18 +55,18 @@ unicodedata_decimal(PyObject *self,
if
(
!
PyArg_ParseTuple
(
args
,
"O!|O:decimal"
,
if
(
!
PyArg_ParseTuple
(
args
,
"O!|O:decimal"
,
&
PyUnicode_Type
,
&
v
,
&
defobj
))
&
PyUnicode_Type
,
&
v
,
&
defobj
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
}
rc
=
Py_UNICODE_TODECIMAL
(
*
PyUnicode_AS_UNICODE
(
v
));
rc
=
Py_UNICODE_TODECIMAL
(
*
PyUnicode_AS_UNICODE
(
v
));
if
(
rc
<
0
)
{
if
(
rc
<
0
)
{
if
(
defobj
==
NULL
)
{
if
(
defobj
==
NULL
)
{
PyErr_SetString
(
PyExc_ValueError
,
PyErr_SetString
(
PyExc_ValueError
,
"not a decimal"
);
"not a decimal"
);
goto
onError
;
return
NULL
;
}
}
else
{
else
{
Py_INCREF
(
defobj
);
Py_INCREF
(
defobj
);
...
@@ -45,14 +74,10 @@ unicodedata_decimal(PyObject *self,
...
@@ -45,14 +74,10 @@ unicodedata_decimal(PyObject *self,
}
}
}
}
return
PyInt_FromLong
(
rc
);
return
PyInt_FromLong
(
rc
);
onError:
return
NULL
;
}
}
static
PyObject
*
static
PyObject
*
unicodedata_digit
(
PyObject
*
self
,
unicodedata_digit
(
PyObject
*
self
,
PyObject
*
args
)
PyObject
*
args
)
{
{
PyUnicodeObject
*
v
;
PyUnicodeObject
*
v
;
PyObject
*
defobj
=
NULL
;
PyObject
*
defobj
=
NULL
;
...
@@ -60,18 +85,18 @@ unicodedata_digit(PyObject *self,
...
@@ -60,18 +85,18 @@ unicodedata_digit(PyObject *self,
if
(
!
PyArg_ParseTuple
(
args
,
"O!|O:digit"
,
if
(
!
PyArg_ParseTuple
(
args
,
"O!|O:digit"
,
&
PyUnicode_Type
,
&
v
,
&
defobj
))
&
PyUnicode_Type
,
&
v
,
&
defobj
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
}
rc
=
Py_UNICODE_TODIGIT
(
*
PyUnicode_AS_UNICODE
(
v
));
rc
=
Py_UNICODE_TODIGIT
(
*
PyUnicode_AS_UNICODE
(
v
));
if
(
rc
<
0
)
{
if
(
rc
<
0
)
{
if
(
defobj
==
NULL
)
{
if
(
defobj
==
NULL
)
{
PyErr_SetString
(
PyExc_ValueError
,
PyErr_SetString
(
PyExc_ValueError
,
"not a digit"
);
"not a digit"
);
goto
onError
;
return
NULL
;
}
}
else
{
else
{
Py_INCREF
(
defobj
);
Py_INCREF
(
defobj
);
...
@@ -79,14 +104,10 @@ unicodedata_digit(PyObject *self,
...
@@ -79,14 +104,10 @@ unicodedata_digit(PyObject *self,
}
}
}
}
return
PyInt_FromLong
(
rc
);
return
PyInt_FromLong
(
rc
);
onError:
return
NULL
;
}
}
static
PyObject
*
static
PyObject
*
unicodedata_numeric
(
PyObject
*
self
,
unicodedata_numeric
(
PyObject
*
self
,
PyObject
*
args
)
PyObject
*
args
)
{
{
PyUnicodeObject
*
v
;
PyUnicodeObject
*
v
;
PyObject
*
defobj
=
NULL
;
PyObject
*
defobj
=
NULL
;
...
@@ -94,18 +115,18 @@ unicodedata_numeric(PyObject *self,
...
@@ -94,18 +115,18 @@ unicodedata_numeric(PyObject *self,
if
(
!
PyArg_ParseTuple
(
args
,
"O!|O:numeric"
,
if
(
!
PyArg_ParseTuple
(
args
,
"O!|O:numeric"
,
&
PyUnicode_Type
,
&
v
,
&
defobj
))
&
PyUnicode_Type
,
&
v
,
&
defobj
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
}
rc
=
Py_UNICODE_TONUMERIC
(
*
PyUnicode_AS_UNICODE
(
v
));
rc
=
Py_UNICODE_TONUMERIC
(
*
PyUnicode_AS_UNICODE
(
v
));
if
(
rc
<
0
)
{
if
(
rc
<
0
)
{
if
(
defobj
==
NULL
)
{
if
(
defobj
==
NULL
)
{
PyErr_SetString
(
PyExc_ValueError
,
PyErr_SetString
(
PyExc_ValueError
,
"not a numeric character"
);
"not a numeric character"
);
goto
onError
;
return
NULL
;
}
}
else
{
else
{
Py_INCREF
(
defobj
);
Py_INCREF
(
defobj
);
...
@@ -113,129 +134,123 @@ unicodedata_numeric(PyObject *self,
...
@@ -113,129 +134,123 @@ unicodedata_numeric(PyObject *self,
}
}
}
}
return
PyFloat_FromDouble
(
rc
);
return
PyFloat_FromDouble
(
rc
);
onError:
return
NULL
;
}
}
static
PyObject
*
static
PyObject
*
unicodedata_category
(
PyObject
*
self
,
unicodedata_category
(
PyObject
*
self
,
PyObject
*
args
)
PyObject
*
args
)
{
{
PyUnicodeObject
*
v
;
PyUnicodeObject
*
v
;
int
index
;
int
index
;
if
(
!
PyArg_ParseTuple
(
args
,
"O!:category"
,
if
(
!
PyArg_ParseTuple
(
args
,
"O!:category"
,
&
PyUnicode_Type
,
&
v
))
&
PyUnicode_Type
,
&
v
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
}
index
=
(
int
)
_PyUnicode_Database_GetRecord
(
index
=
(
int
)
getrecord
(
v
)
->
category
;
(
int
)
*
PyUnicode_AS_UNICODE
(
v
)
)
->
category
;
return
PyString_FromString
(
_PyUnicode_CategoryNames
[
index
]);
return
PyString_FromString
(
_PyUnicode_CategoryNames
[
index
]);
onError:
return
NULL
;
}
}
static
PyObject
*
static
PyObject
*
unicodedata_bidirectional
(
PyObject
*
self
,
unicodedata_bidirectional
(
PyObject
*
self
,
PyObject
*
args
)
PyObject
*
args
)
{
{
PyUnicodeObject
*
v
;
PyUnicodeObject
*
v
;
int
index
;
int
index
;
if
(
!
PyArg_ParseTuple
(
args
,
"O!:bidirectional"
,
if
(
!
PyArg_ParseTuple
(
args
,
"O!:bidirectional"
,
&
PyUnicode_Type
,
&
v
))
&
PyUnicode_Type
,
&
v
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
}
index
=
(
int
)
_PyUnicode_Database_GetRecord
(
index
=
(
int
)
getrecord
(
v
)
->
bidirectional
;
(
int
)
*
PyUnicode_AS_UNICODE
(
v
)
)
->
bidirectional
;
return
PyString_FromString
(
_PyUnicode_BidirectionalNames
[
index
]);
return
PyString_FromString
(
_PyUnicode_BidirectionalNames
[
index
]);
onError:
return
NULL
;
}
}
static
PyObject
*
static
PyObject
*
unicodedata_combining
(
PyObject
*
self
,
unicodedata_combining
(
PyObject
*
self
,
PyObject
*
args
)
PyObject
*
args
)
{
{
PyUnicodeObject
*
v
;
PyUnicodeObject
*
v
;
int
value
;
if
(
!
PyArg_ParseTuple
(
args
,
"O!:combining"
,
if
(
!
PyArg_ParseTuple
(
args
,
"O!:combining"
,
&
PyUnicode_Type
,
&
v
))
&
PyUnicode_Type
,
&
v
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
}
value
=
(
int
)
_PyUnicode_Database_GetRecord
(
return
PyInt_FromLong
((
int
)
getrecord
(
v
)
->
combining
);
(
int
)
*
PyUnicode_AS_UNICODE
(
v
)
)
->
combining
;
return
PyInt_FromLong
(
value
);
onError:
return
NULL
;
}
}
static
PyObject
*
static
PyObject
*
unicodedata_mirrored
(
PyObject
*
self
,
unicodedata_mirrored
(
PyObject
*
self
,
PyObject
*
args
)
PyObject
*
args
)
{
{
PyUnicodeObject
*
v
;
PyUnicodeObject
*
v
;
int
value
;
if
(
!
PyArg_ParseTuple
(
args
,
"O!:mirrored"
,
if
(
!
PyArg_ParseTuple
(
args
,
"O!:mirrored"
,
&
PyUnicode_Type
,
&
v
))
&
PyUnicode_Type
,
&
v
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
}
value
=
(
int
)
_PyUnicode_Database_GetRecord
(
return
PyInt_FromLong
((
int
)
getrecord
(
v
)
->
mirrored
);
(
int
)
*
PyUnicode_AS_UNICODE
(
v
)
)
->
mirrored
;
return
PyInt_FromLong
(
value
);
onError:
return
NULL
;
}
}
static
PyObject
*
static
PyObject
*
unicodedata_decomposition
(
PyObject
*
self
,
unicodedata_decomposition
(
PyObject
*
self
,
PyObject
*
args
)
PyObject
*
args
)
{
{
PyUnicodeObject
*
v
;
PyUnicodeObject
*
v
;
const
char
*
value
;
char
decomp
[
256
];
int
code
,
index
,
count
,
i
;
if
(
!
PyArg_ParseTuple
(
args
,
"O!:decomposition"
,
if
(
!
PyArg_ParseTuple
(
args
,
"O!:decomposition"
,
&
PyUnicode_Type
,
&
v
))
&
PyUnicode_Type
,
&
v
))
goto
onError
;
return
NULL
;
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
if
(
PyUnicode_GET_SIZE
(
v
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
PyErr_SetString
(
PyExc_TypeError
,
"need a single Unicode character as parameter"
);
"need a single Unicode character as parameter"
);
goto
onError
;
return
NULL
;
}
code
=
(
int
)
*
PyUnicode_AS_UNICODE
(
v
);
if
(
code
<
0
||
code
>=
65536
)
index
=
0
;
else
{
index
=
decomp_index1
[(
code
>>
DECOMP_SHIFT
)];
index
=
decomp_index2
[(
index
<<
DECOMP_SHIFT
)
+
(
code
&
((
1
<<
DECOMP_SHIFT
)
-
1
))];
}
/* high byte is of hex bytes (usually one or two), low byte
is prefix code (from*/
count
=
decomp_data
[
index
]
>>
8
;
/* XXX: could allocate the PyString up front instead
(strlen(prefix) + 5 * count + 1 bytes) */
/* copy prefix */
i
=
strlen
(
decomp_prefix
[
decomp_data
[
index
]
&
255
]);
memcpy
(
decomp
,
decomp_prefix
[
decomp_data
[
index
]
&
255
],
i
);
while
(
count
--
>
0
)
{
if
(
i
)
decomp
[
i
++
]
=
' '
;
sprintf
(
decomp
+
i
,
"%04X"
,
decomp_data
[
++
index
]);
i
+=
strlen
(
decomp
+
i
);
}
}
value
=
_PyUnicode_Database_GetDecomposition
(
(
int
)
*
PyUnicode_AS_UNICODE
(
v
)
);
return
PyString_FromString
(
value
);
onError:
decomp
[
i
]
=
'\0'
;
return
NULL
;
return
PyString_FromString
(
decomp
);
}
}
/* XXX Add doc strings. */
/* XXX Add doc strings. */
...
...
Modules/unicodedata_db.h
Dosyayı görüntüle @
7b7dd107
This diff is collapsed.
Click to expand it.
Modules/unicodedatabase.c
Dosyayı görüntüle @
7b7dd107
/* ------------------------------------------------------------------------
/* remove this file! */
unicodedatabase -- The Unicode 3.0 data base.
Data was extracted from the Unicode 3.0 UnicodeData.txt file.
Written by Marc-Andre Lemburg (mal@lemburg.com).
Rewritten for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Copyright (c) Corporation for National Research Initiatives.
------------------------------------------------------------------------ */
#include "Python.h"
#include "unicodedatabase.h"
/* read the actual data from a separate file! */
#include "unicodedata_db.h"
const
_PyUnicode_DatabaseRecord
*
_PyUnicode_Database_GetRecord
(
int
code
)
{
int
index
;
if
(
code
<
0
||
code
>=
65536
)
index
=
0
;
else
{
index
=
index1
[(
code
>>
SHIFT
)];
index
=
index2
[(
index
<<
SHIFT
)
+
(
code
&
((
1
<<
SHIFT
)
-
1
))];
}
return
&
_PyUnicode_Database_Records
[
index
];
}
const
char
*
_PyUnicode_Database_GetDecomposition
(
int
code
)
{
int
index
;
if
(
code
<
0
||
code
>=
65536
)
index
=
0
;
else
{
index
=
decomp_index1
[(
code
>>
DECOMP_SHIFT
)];
index
=
decomp_index2
[(
index
<<
DECOMP_SHIFT
)
+
(
code
&
((
1
<<
DECOMP_SHIFT
)
-
1
))];
}
return
decomp_data
[
index
];
}
Modules/unicodedatabase.h
Dosyayı görüntüle @
7b7dd107
/* ------------------------------------------------------------------------
/* remove this file! */
unicodedatabase -- The Unicode 3.0 data base.
Data was extracted from the Unicode 3.0 UnicodeData.txt file.
Written by Marc-Andre Lemburg (mal@lemburg.com).
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Copyright (c) Corporation for National Research Initiatives.
------------------------------------------------------------------------ */
/* --- Unicode database entry --------------------------------------------- */
typedef
struct
{
const
unsigned
char
category
;
/* index into
_PyUnicode_CategoryNames */
const
unsigned
char
combining
;
/* combining class value 0 - 255 */
const
unsigned
char
bidirectional
;
/* index into
_PyUnicode_BidirectionalNames */
const
unsigned
char
mirrored
;
/* true if mirrored in bidir mode */
}
_PyUnicode_DatabaseRecord
;
/* --- Unicode category names --------------------------------------------- */
extern
const
char
*
_PyUnicode_CategoryNames
[];
extern
const
char
*
_PyUnicode_BidirectionalNames
[];
/* --- Unicode Database --------------------------------------------------- */
extern
const
_PyUnicode_DatabaseRecord
*
_PyUnicode_Database_GetRecord
(
int
ch
);
extern
const
char
*
_PyUnicode_Database_GetDecomposition
(
int
ch
);
Modules/unicodename_db.h
Dosyayı görüntüle @
7b7dd107
This diff is collapsed.
Click to expand it.
Tools/unicode/makeunicodedata.py
Dosyayı görüntüle @
7b7dd107
...
@@ -12,8 +12,9 @@
...
@@ -12,8 +12,9 @@
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
#
#
# written by Fredrik Lundh (fredrik@pythonware.com)
, September 2000
# written by Fredrik Lundh (fredrik@pythonware.com)
#
#
import
sys
import
sys
...
@@ -50,9 +51,9 @@ def maketables(trace=0):
...
@@ -50,9 +51,9 @@ def maketables(trace=0):
print
len
(
filter
(
None
,
unicode
.
table
)),
"characters"
print
len
(
filter
(
None
,
unicode
.
table
)),
"characters"
# makeunicodename(unicode, trace)
makeunicodedata
(
unicode
,
trace
)
makeunicodedata
(
unicode
,
trace
)
makeunicodetype
(
unicode
,
trace
)
# makeunicodetype(unicode, trace)
makeunicodename
(
unicode
,
trace
)
# --------------------------------------------------------------------
# --------------------------------------------------------------------
# unicode character properties
# unicode character properties
...
@@ -90,27 +91,45 @@ def makeunicodedata(unicode, trace):
...
@@ -90,27 +91,45 @@ def makeunicodedata(unicode, trace):
# 2) decomposition data
# 2) decomposition data
# FIXME: <fl> using the encoding stuff from unidb would save
decomp_data
=
[
0
]
# another 50k or so, but I'll leave that for 2.1...
decomp_prefix
=
[
""
]
decomp_data
=
[
""
]
decomp_index
=
[
0
]
*
len
(
unicode
.
chars
)
decomp_index
=
[
0
]
*
len
(
unicode
.
chars
)
decomp_size
=
0
for
char
in
unicode
.
chars
:
for
char
in
unicode
.
chars
:
record
=
unicode
.
table
[
char
]
record
=
unicode
.
table
[
char
]
if
record
:
if
record
:
if
record
[
5
]:
if
record
[
5
]:
decomp
=
string
.
split
(
record
[
5
])
# prefix
if
decomp
[
0
][
0
]
==
"<"
:
prefix
=
decomp
.
pop
(
0
)
else
:
prefix
=
""
try
:
i
=
decomp_prefix
.
index
(
prefix
)
except
ValueError
:
i
=
len
(
decomp_prefix
)
decomp_prefix
.
append
(
prefix
)
prefix
=
i
assert
prefix
<
256
# content
decomp
=
[
prefix
+
(
len
(
decomp
)
<<
8
)]
+
\
map
(
lambda
s
:
int
(
s
,
16
),
decomp
)
try
:
try
:
i
=
decomp_data
.
index
(
record
[
5
]
)
i
=
decomp_data
.
index
(
decomp
)
except
ValueError
:
except
ValueError
:
i
=
len
(
decomp_data
)
i
=
len
(
decomp_data
)
decomp_data
.
append
(
record
[
5
])
decomp_data
.
extend
(
decomp
)
decomp_size
=
decomp_size
+
len
(
decomp
)
*
2
else
:
else
:
i
=
0
i
=
0
decomp_index
[
char
]
=
i
decomp_index
[
char
]
=
i
print
len
(
table
),
"unique properties"
print
len
(
table
),
"unique properties"
print
len
(
decomp_data
),
"unique decomposition entries"
print
len
(
decomp_prefix
),
"unique decomposition prefixes"
print
len
(
decomp_data
),
"unique decomposition entries:"
,
print
decomp_size
,
"bytes"
print
"--- Writing"
,
FILE
,
"..."
print
"--- Writing"
,
FILE
,
"..."
...
@@ -141,8 +160,8 @@ def makeunicodedata(unicode, trace):
...
@@ -141,8 +160,8 @@ def makeunicodedata(unicode, trace):
print
>>
fp
,
" NULL"
print
>>
fp
,
" NULL"
print
>>
fp
,
"};"
print
>>
fp
,
"};"
print
>>
fp
,
"static const char *decomp_
data
[] = {"
print
>>
fp
,
"static const char *decomp_
prefix
[] = {"
for
name
in
decomp_
data
:
for
name
in
decomp_
prefix
:
print
>>
fp
,
"
\"
%
s
\"
,"
%
name
print
>>
fp
,
"
\"
%
s
\"
,"
%
name
print
>>
fp
,
" NULL"
print
>>
fp
,
" NULL"
print
>>
fp
,
"};"
print
>>
fp
,
"};"
...
@@ -152,16 +171,19 @@ def makeunicodedata(unicode, trace):
...
@@ -152,16 +171,19 @@ def makeunicodedata(unicode, trace):
print
>>
fp
,
"/* index tables for the database records */"
print
>>
fp
,
"/* index tables for the database records */"
print
>>
fp
,
"#define SHIFT"
,
shift
print
>>
fp
,
"#define SHIFT"
,
shift
Array
(
"index1"
,
index1
)
.
dump
(
fp
)
Array
(
"index1"
,
index1
)
.
dump
(
fp
,
trace
)
Array
(
"index2"
,
index2
)
.
dump
(
fp
)
Array
(
"index2"
,
index2
)
.
dump
(
fp
,
trace
)
# split decomposition index table
# split decomposition index table
index1
,
index2
,
shift
=
splitbins
(
decomp_index
,
trace
)
index1
,
index2
,
shift
=
splitbins
(
decomp_index
,
trace
)
print
>>
fp
,
"/* decomposition data */"
Array
(
"decomp_data"
,
decomp_data
)
.
dump
(
fp
,
trace
)
print
>>
fp
,
"/* index tables for the decomposition data */"
print
>>
fp
,
"/* index tables for the decomposition data */"
print
>>
fp
,
"#define DECOMP_SHIFT"
,
shift
print
>>
fp
,
"#define DECOMP_SHIFT"
,
shift
Array
(
"decomp_index1"
,
index1
)
.
dump
(
fp
)
Array
(
"decomp_index1"
,
index1
)
.
dump
(
fp
,
trace
)
Array
(
"decomp_index2"
,
index2
)
.
dump
(
fp
)
Array
(
"decomp_index2"
,
index2
)
.
dump
(
fp
,
trace
)
fp
.
close
()
fp
.
close
()
...
@@ -250,8 +272,8 @@ def makeunicodetype(unicode, trace):
...
@@ -250,8 +272,8 @@ def makeunicodetype(unicode, trace):
print
>>
fp
,
"/* type indexes */"
print
>>
fp
,
"/* type indexes */"
print
>>
fp
,
"#define SHIFT"
,
shift
print
>>
fp
,
"#define SHIFT"
,
shift
Array
(
"index1"
,
index1
)
.
dump
(
fp
)
Array
(
"index1"
,
index1
)
.
dump
(
fp
,
trace
)
Array
(
"index2"
,
index2
)
.
dump
(
fp
)
Array
(
"index2"
,
index2
)
.
dump
(
fp
,
trace
)
fp
.
close
()
fp
.
close
()
...
@@ -302,16 +324,28 @@ def makeunicodename(unicode, trace):
...
@@ -302,16 +324,28 @@ def makeunicodename(unicode, trace):
# sort on falling frequency
# sort on falling frequency
wordlist
.
sort
(
lambda
a
,
b
:
len
(
b
[
1
])
-
len
(
a
[
1
]))
wordlist
.
sort
(
lambda
a
,
b
:
len
(
b
[
1
])
-
len
(
a
[
1
]))
# figure out how many phrasebook escapes we need
escapes
=
0
while
escapes
*
256
<
len
(
wordlist
):
escapes
=
escapes
+
1
print
escapes
,
"escapes"
short
=
256
-
escapes
assert
short
>
0
print
short
,
"short indexes in lexicon"
# statistics
# statistics
n
=
0
n
=
0
for
i
in
range
(
128
):
for
i
in
range
(
short
):
n
=
n
+
len
(
wordlist
[
i
][
1
])
n
=
n
+
len
(
wordlist
[
i
][
1
])
print
n
,
"short
words (7-bit indices)
"
print
n
,
"short
indexes in phrasebook
"
# pick the
128 most commonly used words, and sort the rest on
# pick the
most commonly used words, and sort the rest on falling
#
falling
length (to maximize overlap)
# length (to maximize overlap)
wordlist
,
wordtail
=
wordlist
[:
128
],
wordlist
[
128
:]
wordlist
,
wordtail
=
wordlist
[:
short
],
wordlist
[
short
:]
wordtail
.
sort
(
lambda
a
,
b
:
len
(
b
[
0
])
-
len
(
a
[
0
]))
wordtail
.
sort
(
lambda
a
,
b
:
len
(
b
[
0
])
-
len
(
a
[
0
]))
wordlist
.
extend
(
wordtail
)
wordlist
.
extend
(
wordtail
)
...
@@ -334,11 +368,7 @@ def makeunicodename(unicode, trace):
...
@@ -334,11 +368,7 @@ def makeunicodename(unicode, trace):
lexicon
=
lexicon
+
ww
lexicon
=
lexicon
+
ww
offset
=
offset
+
len
(
w
)
offset
=
offset
+
len
(
w
)
words
[
w
]
=
len
(
lexicon_offset
)
words
[
w
]
=
len
(
lexicon_offset
)
lexicon_offset
.
append
(
offset
)
lexicon_offset
.
append
(
o
)
print
len
(
words
),
"words in lexicon;"
,
len
(
lexicon
),
"bytes"
assert
len
(
words
)
<
32768
# 15-bit word indices
lexicon
=
map
(
ord
,
lexicon
)
lexicon
=
map
(
ord
,
lexicon
)
...
@@ -352,12 +382,15 @@ def makeunicodename(unicode, trace):
...
@@ -352,12 +382,15 @@ def makeunicodename(unicode, trace):
phrasebook_offset
[
char
]
=
len
(
phrasebook
)
phrasebook_offset
[
char
]
=
len
(
phrasebook
)
for
w
in
w
:
for
w
in
w
:
i
=
words
[
w
]
i
=
words
[
w
]
if
i
<
128
:
if
i
<
short
:
phrasebook
.
append
(
128
+
i
)
phrasebook
.
append
(
i
)
else
:
else
:
phrasebook
.
append
(
i
>>
8
)
# store as two bytes
phrasebook
.
append
((
i
>>
8
)
+
short
)
phrasebook
.
append
(
i
&
255
)
phrasebook
.
append
(
i
&
255
)
assert
getsize
(
phrasebook
)
==
1
#
#
# unicode name hash table
# unicode name hash table
...
@@ -384,21 +417,22 @@ def makeunicodename(unicode, trace):
...
@@ -384,21 +417,22 @@ def makeunicodename(unicode, trace):
print
>>
fp
,
"#define NAME_MAXLEN"
,
256
print
>>
fp
,
"#define NAME_MAXLEN"
,
256
print
>>
fp
print
>>
fp
print
>>
fp
,
"/* lexicon */"
print
>>
fp
,
"/* lexicon */"
Array
(
"lexicon"
,
lexicon
)
.
dump
(
fp
)
Array
(
"lexicon"
,
lexicon
)
.
dump
(
fp
,
trace
)
Array
(
"lexicon_offset"
,
lexicon_offset
)
.
dump
(
fp
)
Array
(
"lexicon_offset"
,
lexicon_offset
)
.
dump
(
fp
,
trace
)
# split decomposition index table
# split decomposition index table
offset1
,
offset2
,
shift
=
splitbins
(
phrasebook_offset
,
trace
)
offset1
,
offset2
,
shift
=
splitbins
(
phrasebook_offset
,
trace
)
print
>>
fp
,
"/* code->name phrasebook */"
print
>>
fp
,
"/* code->name phrasebook */"
print
>>
fp
,
"#define phrasebook_shift"
,
shift
print
>>
fp
,
"#define phrasebook_shift"
,
shift
print
>>
fp
,
"#define phrasebook_short"
,
short
Array
(
"phrasebook"
,
phrasebook
)
.
dump
(
fp
)
Array
(
"phrasebook"
,
phrasebook
)
.
dump
(
fp
,
trace
)
Array
(
"phrasebook_offset1"
,
offset1
)
.
dump
(
fp
)
Array
(
"phrasebook_offset1"
,
offset1
)
.
dump
(
fp
,
trace
)
Array
(
"phrasebook_offset2"
,
offset2
)
.
dump
(
fp
)
Array
(
"phrasebook_offset2"
,
offset2
)
.
dump
(
fp
,
trace
)
print
>>
fp
,
"/* name->code dictionary */"
print
>>
fp
,
"/* name->code dictionary */"
codehash
.
dump
(
fp
)
codehash
.
dump
(
fp
,
trace
)
fp
.
close
()
fp
.
close
()
...
@@ -527,9 +561,9 @@ class Hash:
...
@@ -527,9 +561,9 @@ class Hash:
self
.
size
=
size
self
.
size
=
size
self
.
poly
=
poly
self
.
poly
=
poly
def
dump
(
self
,
file
):
def
dump
(
self
,
file
,
trace
):
# write data to file, as a C array
# write data to file, as a C array
self
.
data
.
dump
(
file
)
self
.
data
.
dump
(
file
,
trace
)
file
.
write
(
"#define
%
s_magic
%
d
\n
"
%
(
self
.
name
,
self
.
magic
))
file
.
write
(
"#define
%
s_magic
%
d
\n
"
%
(
self
.
name
,
self
.
magic
))
file
.
write
(
"#define
%
s_size
%
d
\n
"
%
(
self
.
name
,
self
.
size
))
file
.
write
(
"#define
%
s_size
%
d
\n
"
%
(
self
.
name
,
self
.
size
))
file
.
write
(
"#define
%
s_poly
%
d
\n
"
%
(
self
.
name
,
self
.
poly
))
file
.
write
(
"#define
%
s_poly
%
d
\n
"
%
(
self
.
name
,
self
.
poly
))
...
@@ -542,10 +576,11 @@ class Array:
...
@@ -542,10 +576,11 @@ class Array:
self
.
name
=
name
self
.
name
=
name
self
.
data
=
data
self
.
data
=
data
def
dump
(
self
,
file
):
def
dump
(
self
,
file
,
trace
=
0
):
# write data to file, as a C array
# write data to file, as a C array
size
=
getsize
(
self
.
data
)
size
=
getsize
(
self
.
data
)
# print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
if
trace
:
print
>>
sys
.
stderr
,
self
.
name
+
":"
,
size
*
len
(
self
.
data
),
"bytes"
file
.
write
(
"static "
)
file
.
write
(
"static "
)
if
size
==
1
:
if
size
==
1
:
file
.
write
(
"unsigned char"
)
file
.
write
(
"unsigned char"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment