/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.0 data base.

   Data was extracted from the Unicode 3.0 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
15 16 17
#include "ucnhash.h"

/* character properties */

/* One row of per-character properties; every field fits in a byte. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value, 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
} _PyUnicode_DatabaseRecord;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

static const _PyUnicode_DatabaseRecord*
32
_getrecord(PyUnicodeObject* v)
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
{
    int code;
    int index;

    code = (int) *PyUnicode_AS_UNICODE(v);

    if (code < 0 || code >= 65536)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}

/* --- Module API --------------------------------------------------------- */

/* decimal(unichr[, default])

   Return the decimal value of a one-character Unicode string as a Python
   int.  If the character has no decimal value, return a new reference to
   `default` when it was supplied, otherwise raise ValueError. */
static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch_obj;
    PyObject *fallback = NULL;
    long value;

    if (!PyArg_ParseTuple(args, "O!|O:decimal",
                          &PyUnicode_Type, &ch_obj, &fallback))
        return NULL;

    if (PyUnicode_GET_SIZE(ch_obj) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    value = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(ch_obj));
    if (value < 0) {
        /* negative result: the character has no decimal value */
        if (fallback != NULL) {
            Py_INCREF(fallback);
            return fallback;
        }
        PyErr_SetString(PyExc_ValueError,
                        "not a decimal");
        return NULL;
    }

    return PyInt_FromLong(value);
}

/* digit(unichr[, default])

   Return the digit value of a one-character Unicode string as a Python
   int.  If the character has no digit value, return a new reference to
   `default` when it was supplied, otherwise raise ValueError. */
static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch_obj;
    PyObject *fallback = NULL;
    long value;

    if (!PyArg_ParseTuple(args, "O!|O:digit",
                          &PyUnicode_Type, &ch_obj, &fallback))
        return NULL;

    if (PyUnicode_GET_SIZE(ch_obj) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    value = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(ch_obj));
    if (value < 0) {
        /* negative result: the character has no digit value */
        if (fallback != NULL) {
            Py_INCREF(fallback);
            return fallback;
        }
        PyErr_SetString(PyExc_ValueError, "not a digit");
        return NULL;
    }

    return PyInt_FromLong(value);
}

/* numeric(unichr[, default])

   Return the numeric value of a one-character Unicode string as a Python
   float.  If the conversion yields a negative value the character is
   treated as non-numeric: a new reference to `default` is returned when
   it was supplied, otherwise ValueError is raised. */
static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch_obj;
    PyObject *fallback = NULL;
    double value;

    if (!PyArg_ParseTuple(args, "O!|O:numeric",
                          &PyUnicode_Type, &ch_obj, &fallback))
        return NULL;

    if (PyUnicode_GET_SIZE(ch_obj) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    value = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(ch_obj));
    if (value < 0) {
        if (fallback != NULL) {
            Py_INCREF(fallback);
            return fallback;
        }
        PyErr_SetString(PyExc_ValueError, "not a numeric character");
        return NULL;
    }

    return PyFloat_FromDouble(value);
}

/* category(unichr)

   Return the category name of a one-character Unicode string as a Python
   string, taken from _PyUnicode_CategoryNames. */
static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch_obj;
    const _PyUnicode_DatabaseRecord *rec;

    if (!PyArg_ParseTuple(args, "O!:category", &PyUnicode_Type, &ch_obj))
        return NULL;

    if (PyUnicode_GET_SIZE(ch_obj) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    rec = _getrecord(ch_obj);
    return PyString_FromString(_PyUnicode_CategoryNames[(int) rec->category]);
}

/* bidirectional(unichr)

   Return the bidirectional class name of a one-character Unicode string
   as a Python string, taken from _PyUnicode_BidirectionalNames. */
static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch_obj;
    const _PyUnicode_DatabaseRecord *rec;

    if (!PyArg_ParseTuple(args, "O!:bidirectional", &PyUnicode_Type, &ch_obj))
        return NULL;

    if (PyUnicode_GET_SIZE(ch_obj) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    rec = _getrecord(ch_obj);
    return PyString_FromString(
        _PyUnicode_BidirectionalNames[(int) rec->bidirectional]);
}

/* combining(unichr)

   Return the combining class of a one-character Unicode string as a
   Python int. */
static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch_obj;
    const _PyUnicode_DatabaseRecord *rec;

    if (!PyArg_ParseTuple(args, "O!:combining", &PyUnicode_Type, &ch_obj))
        return NULL;

    if (PyUnicode_GET_SIZE(ch_obj) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    rec = _getrecord(ch_obj);
    return PyInt_FromLong((int) rec->combining);
}

/* mirrored(unichr)

   Return the "mirrored" flag of a one-character Unicode string as a
   Python int. */
static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch_obj;
    const _PyUnicode_DatabaseRecord *rec;

    if (!PyArg_ParseTuple(args, "O!:mirrored", &PyUnicode_Type, &ch_obj))
        return NULL;

    if (PyUnicode_GET_SIZE(ch_obj) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    rec = _getrecord(ch_obj);
    return PyInt_FromLong((int) rec->mirrored);
}

/* decomposition(unichr)

   Return the decomposition of a one-character Unicode string as a Python
   string: an optional prefix from decomp_prefix followed by the code
   points of the decomposition, each formatted as upper-case hex and
   separated by single spaces.

   Raises TypeError if the argument is not exactly one character, and
   SystemError if the formatted result would not fit the local buffer
   (which would indicate corrupt or unexpected table data). */
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    const char *prefix;
    int code, index, count, i;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    code = (int) *PyUnicode_AS_UNICODE(v);

    /* two-level table lookup; anything outside 0..65535 uses entry 0 */
    if (code < 0 || code >= 65536)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                             (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is the number of code points that follow in decomp_data,
       low byte is the index of the prefix string in decomp_prefix */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* copy prefix */
    prefix = decomp_prefix[decomp_data[index] & 255];
    i = (int) strlen(prefix);
    if (i >= (int) sizeof(decomp))
        goto too_long;
    memcpy(decomp, prefix, i);

    while (count-- > 0) {
        /* worst case per entry: one separator, up to 8 hex digits for a
           32-bit unsigned int, and the NUL written by sprintf */
        if (i + 10 > (int) sizeof(decomp))
            goto too_long;
        if (i)
            decomp[i++] = ' ';
        sprintf(decomp + i, "%04X", (unsigned int) decomp_data[++index]);
        i += (int) strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);

too_long:
    PyErr_SetString(PyExc_SystemError,
                    "decomposition string too long");
    return NULL;
}

/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

/* Case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased and folded in as h = h * scale + byte.
   Whenever bits 24..31 become set they are XOR-ed back into the low byte
   and the value is truncated to 24 bits.

   The byte is converted to unsigned char before calling toupper():
   passing a negative plain char to a <ctype.h> function is undefined
   behaviour. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}

static int
278
_getname(Py_UCS4 code, char* buffer, int buflen)
279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code < 0 || code >= 65536)
        return 0;

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                               (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon.  the last character in the
           word has bit 7 set.  the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}

static int
330
_cmpname(int code, const char* name, int namelen)
331 332 333 334
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
335
    if (!_getname(code, buffer, sizeof(buffer)))
336 337 338 339 340 341 342 343 344
        return 0;
    for (i = 0; i < namelen; i++) {
        if (toupper(name[i]) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}

static int
345
_getcode(const char* name, int namelen, Py_UCS4* code)
346 347 348 349 350 351 352 353 354
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

355
    h = (unsigned int) _gethash(name, namelen, code_magic);
356 357 358 359
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
360
    if (_cmpname(v, name, namelen)) {
361 362 363 364 365 366 367 368 369 370
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
371
            return 0;
372
        if (_cmpname(v, name, namelen)) {
373 374 375 376 377 378 379 380 381 382 383 384
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}

static const _PyUnicode_Name_CAPI hashAPI = 
{
    sizeof(_PyUnicode_Name_CAPI),
385 386
    _getname,
    _getcode
387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407
};

/* -------------------------------------------------------------------- */
/* Python bindings */

/* name(unichr[, default])

   Return the name of a one-character Unicode string as a Python string.
   If _getname finds no name, return a new reference to `default` when it
   was supplied, otherwise raise ValueError. */
static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char namebuf[NAME_MAXLEN];
    PyUnicodeObject* ch_obj;
    PyObject* fallback = NULL;

    if (!PyArg_ParseTuple(args, "O!|O:name",
                          &PyUnicode_Type, &ch_obj, &fallback))
        return NULL;

    if (PyUnicode_GET_SIZE(ch_obj) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    if (_getname((Py_UCS4) *PyUnicode_AS_UNICODE(ch_obj),
                 namebuf, sizeof(namebuf)))
        return Py_BuildValue("s", namebuf);

    /* no name available */
    if (fallback != NULL) {
        Py_INCREF(fallback);
        return fallback;
    }
    PyErr_SetString(PyExc_ValueError, "no such name");
    return NULL;
}

/* lookup(name)

   Return the one-character Unicode string whose name is `name`; the
   comparison is case-insensitive (see _getcode).  Raises KeyError if
   no such character exists. */
static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    char* name;
    int namelen;
    Py_UCS4 code;
    Py_UNICODE result[1];

    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (_getcode(name, namelen, &code)) {
        result[0] = (Py_UNICODE) code;
        return PyUnicode_FromUnicode(result, 1);
    }

    PyErr_SetString(PyExc_KeyError, "undefined character name");
    return NULL;
}

443 444 445
/* XXX Add doc strings. */

static PyMethodDef unicodedata_functions[] = {
446 447 448 449 450 451 452 453 454 455
    {"decimal", unicodedata_decimal, METH_VARARGS},
    {"digit", unicodedata_digit, METH_VARARGS},
    {"numeric", unicodedata_numeric, METH_VARARGS},
    {"category", unicodedata_category, METH_VARARGS},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
    {"combining", unicodedata_combining, METH_VARARGS},
    {"mirrored", unicodedata_mirrored, METH_VARARGS},
    {"decomposition",unicodedata_decomposition, METH_VARARGS},
    {"name", unicodedata_name, METH_VARARGS},
    {"lookup", unicodedata_lookup, METH_VARARGS},
456 457 458
    {NULL, NULL}		/* sentinel */
};

459 460
/* Module doc string, passed to Py_InitModule4() in initunicodedata(). */
static char *unicodedata_docstring = "unicode character database";

461
DL_EXPORT(void)
462
initunicodedata(void)
463
{
464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480
    PyObject *m, *d, *v;

    m = Py_InitModule4(
        "unicodedata", unicodedata_functions,
        unicodedata_docstring, NULL, PYTHON_API_VERSION);
    if (!m)
        return;

    d = PyModule_GetDict(m);
    if (!d)
        return;

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    PyDict_SetItemString(d, "ucnhash_CAPI", v);
    Py_XDECREF(v);

481
}