_codecsmodule.c 33.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12
/* ------------------------------------------------------------------------

   _codecs -- Provides access to the codec registry and the builtin
              codecs.

   This module should never be imported directly. The standard library
   module "codecs" wraps this builtin module for use within Python.

   The codec registry is accessible via:

     register(search_function) -> None

13
     lookup(encoding) -> CodecInfo object
14 15 16

   The builtin Unicode codecs use the following interface:

Walter Dörwald's avatar
Walter Dörwald committed
17
     <encoding>_encode(Unicode_object[,errors='strict']) ->
        (bytes object, characters consumed)
19

Walter Dörwald's avatar
Walter Dörwald committed
20
     <encoding>_decode(char_buffer_obj[,errors='strict']) ->
21 22
        (Unicode object, bytes consumed)

23 24 25 26
   <encoding>_encode() interfaces also accept non-Unicode object as
   input. The objects are then converted to Unicode using
   PyUnicode_FromObject() prior to applying the conversion.

27
   These <encoding>s are available: utf_8, unicode_escape,
28 29 30
   raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
   mbcs (on win32).

31 32 33

Written by Marc-Andre Lemburg (mal@lemburg.com).

34
Copyright (c) Corporation for National Research Initiatives.
35 36 37

   ------------------------------------------------------------------------ */

Martin v. Löwis's avatar
Martin v. Löwis committed
38
#define PY_SSIZE_T_CLEAN
39 40 41 42
#include "Python.h"

/* --- Registry ----------------------------------------------------------- */

43 44 45 46 47
PyDoc_STRVAR(register__doc__,
"register(search_function)\n\
\n\
Register a codec search function. Search functions are expected to take\n\
one argument, the encoding name in all lower case letters, and return\n\
48 49
a tuple of functions (encoder, decoder, stream_reader, stream_writer)\n\
(or a CodecInfo object).");
50

51
static
52
PyObject *codec_register(PyObject *self, PyObject *search_function)
53 54
{
    if (PyCodec_Register(search_function))
55
        return NULL;
56

57
    Py_RETURN_NONE;
58 59
}

60
PyDoc_STRVAR(lookup__doc__,
61
"lookup(encoding) -> CodecInfo\n\
62 63
\n\
Looks up a codec tuple in the Python codec registry and returns\n\
Benjamin Peterson's avatar
Benjamin Peterson committed
64
a CodecInfo object.");
65

66
static
67
PyObject *codec_lookup(PyObject *self, PyObject *args)
68 69 70 71
{
    char *encoding;

    if (!PyArg_ParseTuple(args, "s:lookup", &encoding))
72
        return NULL;
73 74 75 76

    return _PyCodec_Lookup(encoding);
}

77 78 79 80 81 82 83 84 85 86 87 88 89
PyDoc_STRVAR(encode__doc__,
"encode(obj, [encoding[,errors]]) -> object\n"
"\n"
"Encodes obj using the codec registered for encoding. encoding defaults\n"
"to the default encoding. errors may be given to set a different error\n"
"handling scheme. Default is 'strict' meaning that encoding errors raise\n"
"a ValueError. Other possible values are 'ignore', 'replace' and\n"
"'xmlcharrefreplace' as well as any other name registered with\n"
"codecs.register_error that can handle ValueErrors.");

/* Parse (obj[, encoding[, errors]]) and encode obj through the codec
   registry.  When no encoding is supplied the value of
   PyUnicode_GetDefaultEncoding() is used.  Returns the result of
   PyCodec_Encode(), or NULL if argument parsing fails. */
static PyObject *
codec_encode(PyObject *self, PyObject *args)
{
    PyObject *obj;
    const char *encoding = NULL;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|ss:encode", &obj, &encoding, &errors)) {
        return NULL;
    }

    /* Encode via the codec registry, falling back to the default
       encoding when the caller did not name one. */
    return PyCodec_Encode(
        obj,
        encoding != NULL ? encoding : PyUnicode_GetDefaultEncoding(),
        errors);
}

PyDoc_STRVAR(decode__doc__,
"decode(obj, [encoding[,errors]]) -> object\n\
\n\
Decodes obj using the codec registered for encoding. encoding defaults\n\
to the default encoding. errors may be given to set a different error\n\
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
a ValueError. Other possible values are 'ignore' and 'replace'\n\
Benjamin Peterson's avatar
Benjamin Peterson committed
111
as well as any other name registered with codecs.register_error that is\n\
112 113 114 115 116
able to handle ValueErrors.");

static PyObject *
codec_decode(PyObject *self, PyObject *args)
{
117 118
    const char *encoding = NULL;
    const char *errors = NULL;
119
    PyObject *v;
Walter Dörwald's avatar
Walter Dörwald committed
120

121 122 123 124
    if (!PyArg_ParseTuple(args, "O|ss:decode", &v, &encoding, &errors))
        return NULL;

    if (encoding == NULL)
125
        encoding = PyUnicode_GetDefaultEncoding();
126 127

    /* Decode via the codec registry */
128
    return PyCodec_Decode(v, encoding, errors);
129 130
}

131 132 133 134
/* --- Helpers ------------------------------------------------------------ */

static
PyObject *codec_tuple(PyObject *unicode,
135
                      Py_ssize_t len)
136
{
137
    PyObject *v;
138
    if (unicode == NULL)
139 140 141
        return NULL;
    v = Py_BuildValue("On", unicode, len);
    Py_DECREF(unicode);
142 143 144
    return v;
}

145 146 147
/* --- String codecs ------------------------------------------------------ */
/* escape_decode(data[, errors]) -> (object, len(data))

   Runs PyBytes_DecodeEscape() over the string/length pair obtained from
   the "s#" format unit and reports the full input length as consumed.
   Returns NULL if parsing or decoding fails. */
static PyObject *
escape_decode(PyObject *self, PyObject *args)
{
    const char *input;
    Py_ssize_t input_len;
    const char *errors = NULL;
    PyObject *unescaped;

    if (!PyArg_ParseTuple(args, "s#|z:escape_decode",
                          &input, &input_len, &errors)) {
        return NULL;
    }

    unescaped = PyBytes_DecodeEscape(input, input_len, errors, 0, NULL);
    return codec_tuple(unescaped, input_len);
}

/* escape_encode(data[, errors]) -> (bytes, len(data))

   Produce a backslash-escaped copy of the bytes object `data`:
     - "'" and "\" are prefixed with a backslash,
     - tab, newline and carriage return become \t, \n and \r,
     - any other byte below 0x20 or at/above 0x7f becomes \xNN
       (two lowercase hex digits),
     - all remaining bytes are copied unchanged.
   `errors` is accepted by the argument parser but not otherwise used.
   Returns NULL with an exception set on failure. */
static PyObject *
escape_encode(PyObject *self,
              PyObject *args)
{
    static const char *hexdigits = "0123456789abcdef";
    PyObject *str;
    Py_ssize_t size;
    Py_ssize_t newsize;
    Py_ssize_t outlen;
    Py_ssize_t i;
    const char *errors = NULL;
    const char *src;
    char *p;
    PyObject *v;

    if (!PyArg_ParseTuple(args, "O!|z:escape_encode",
                          &PyBytes_Type, &str, &errors))
        return NULL;

    size = PyBytes_GET_SIZE(str);
    /* Worst case is a four-byte \xNN escape per input byte.  Test the
       bound before multiplying: signed overflow is undefined behaviour,
       so inspecting the product afterwards cannot be relied upon. */
    if (size > PY_SSIZE_T_MAX / 4) {
        PyErr_SetString(PyExc_OverflowError,
            "string is too large to encode");
        return NULL;
    }
    newsize = 4*size;
    v = PyBytes_FromStringAndSize(NULL, newsize);
    if (v == NULL)
        return NULL;

    src = PyBytes_AS_STRING(str);
    p = PyBytes_AS_STRING(v);
    for (i = 0; i < size; i++) {
        /* Work on the unsigned value so the range tests below do not
           depend on whether plain char is signed. */
        unsigned char c = (unsigned char)src[i];

        /* There's at least enough room for a hex escape */
        assert(newsize - (p - PyBytes_AS_STRING(v)) >= 4);
        if (c == '\'' || c == '\\') {
            *p++ = '\\';
            *p++ = (char)c;
        }
        else if (c == '\t') {
            *p++ = '\\';
            *p++ = 't';
        }
        else if (c == '\n') {
            *p++ = '\\';
            *p++ = 'n';
        }
        else if (c == '\r') {
            *p++ = '\\';
            *p++ = 'r';
        }
        else if (c < ' ' || c >= 0x7f) {
            *p++ = '\\';
            *p++ = 'x';
            *p++ = hexdigits[c >> 4];
            *p++ = hexdigits[c & 0xf];
        }
        else {
            *p++ = (char)c;
        }
    }

    outlen = p - PyBytes_AS_STRING(v);
    /* Shrink only when some of the worst-case allocation is unused.
       NOTE(review): for empty input the allocation above may hand back a
       shared empty bytes object that _PyBytes_Resize() refuses to touch
       (cf. CPython issue 25270); skipping the no-op resize avoids that. */
    if (outlen != newsize) {
        *p = '\0';
        if (_PyBytes_Resize(&v, outlen)) {
            return NULL;
        }
    }

    return codec_tuple(v, size);
}

223 224 225 226
/* --- Decoder ------------------------------------------------------------ */

/* unicode_internal_decode(obj[, errors]) -> (str, consumed)

   A unicode object is returned as-is together with its
   PyUnicode_GET_SIZE().  Any other object is read through
   PyObject_AsReadBuffer() and decoded with
   _PyUnicode_DecodeUnicodeInternal(); the buffer length is then reported
   as consumed.  Returns NULL on failure. */
static PyObject *
unicode_internal_decode(PyObject *self, PyObject *args)
{
    PyObject *input;
    const char *errors = NULL;
    const char *raw;
    Py_ssize_t raw_len;

    if (!PyArg_ParseTuple(args, "O|z:unicode_internal_decode",
                          &input, &errors)) {
        return NULL;
    }

    if (!PyUnicode_Check(input)) {
        if (PyObject_AsReadBuffer(input, (const void **)&raw, &raw_len) != 0) {
            return NULL;
        }
        return codec_tuple(
            _PyUnicode_DecodeUnicodeInternal(raw, raw_len, errors),
            raw_len);
    }

    /* codec_tuple() consumes a reference, so take one for it. */
    Py_INCREF(input);
    return codec_tuple(input, PyUnicode_GET_SIZE(input));
}

251 252
/* utf_7_decode(data[, errors[, final]]) -> (str, bytes consumed)

   Decodes the buffer with PyUnicode_DecodeUTF7Stateful().  Unless `final`
   is true the decoder is given a place to store the consumed count;
   otherwise the full buffer length is reported.  Returns NULL on
   failure. */
static PyObject *
utf_7_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_out;
    PyObject *text;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_7_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    nbytes = view.len;
    nbytes_out = final ? NULL : &nbytes;

    text = PyUnicode_DecodeUTF7Stateful(view.buf, view.len, errors,
                                        nbytes_out);
    PyBuffer_Release(&view);

    /* codec_tuple() hands a NULL result straight back. */
    return codec_tuple(text, nbytes);
}

274 275
/* utf_8_decode(data[, errors[, final]]) -> (str, bytes consumed)

   Decodes the buffer with PyUnicode_DecodeUTF8Stateful().  Unless `final`
   is true the decoder is given a place to store the consumed count;
   otherwise the full buffer length is reported.  Returns NULL on
   failure. */
static PyObject *
utf_8_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_out;
    PyObject *text;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_8_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    nbytes = view.len;
    nbytes_out = final ? NULL : &nbytes;

    text = PyUnicode_DecodeUTF8Stateful(view.buf, view.len, errors,
                                        nbytes_out);
    PyBuffer_Release(&view);

    /* codec_tuple() hands a NULL result straight back. */
    return codec_tuple(text, nbytes);
}

/* utf_16_decode(data[, errors[, final]]) -> (str, bytes consumed)

   Decodes with PyUnicode_DecodeUTF16Stateful(), starting from
   byteorder 0.  The byte order the decoder ends up with is not
   returned.  Returns NULL on failure. */
static PyObject *
utf_16_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int order = 0;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_out;
    PyObject *text;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_16_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Stays at the full length when final is true, because the decoder
       is then given no place to store a count. */
    nbytes = view.len;
    nbytes_out = final ? NULL : &nbytes;

    text = PyUnicode_DecodeUTF16Stateful(view.buf, view.len, errors,
                                         &order, nbytes_out);
    PyBuffer_Release(&view);

    /* codec_tuple() hands a NULL result straight back. */
    return codec_tuple(text, nbytes);
}

/* utf_16_le_decode(data[, errors[, final]]) -> (str, bytes consumed)

   Same as utf_16_decode but the decoder is started with byteorder -1.
   Returns NULL on failure. */
static PyObject *
utf_16_le_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int order = -1;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_out;
    PyObject *text;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_16_le_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Stays at the full length when final is true, because the decoder
       is then given no place to store a count. */
    nbytes = view.len;
    nbytes_out = final ? NULL : &nbytes;

    text = PyUnicode_DecodeUTF16Stateful(view.buf, view.len, errors,
                                         &order, nbytes_out);
    PyBuffer_Release(&view);

    /* codec_tuple() hands a NULL result straight back. */
    return codec_tuple(text, nbytes);
}

/* utf_16_be_decode(data[, errors[, final]]) -> (str, bytes consumed)

   Same as utf_16_decode but the decoder is started with byteorder 1.
   Returns NULL on failure. */
static PyObject *
utf_16_be_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int order = 1;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_out;
    PyObject *text;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_16_be_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Stays at the full length when final is true, because the decoder
       is then given no place to store a count. */
    nbytes = view.len;
    nbytes_out = final ? NULL : &nbytes;

    text = PyUnicode_DecodeUTF16Stateful(view.buf, view.len, errors,
                                         &order, nbytes_out);
    PyBuffer_Release(&view);

    /* codec_tuple() hands a NULL result straight back. */
    return codec_tuple(text, nbytes);
}

/* Non-standard variant that exposes the byteorder parameter of the
   builtin UTF-16 codec.

   utf_16_ex_decode(data[, errors[, byteorder[, final]]])
       -> (unicode, bytesread, byteorder)

   The returned byteorder is the value in effect at the end of data.
   Returns NULL on failure.
*/

static PyObject *
utf_16_ex_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int order = 0;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_out;
    PyObject *text;
    PyObject *triple;

    if (!PyArg_ParseTuple(args, "y*|zii:utf_16_ex_decode",
                          &view, &errors, &order, &final)) {
        return NULL;
    }

    /* Stays at the full length when final is true, because the decoder
       is then given no place to store a count. */
    nbytes = view.len;
    nbytes_out = final ? NULL : &nbytes;

    text = PyUnicode_DecodeUTF16Stateful(view.buf, view.len, errors,
                                         &order, nbytes_out);
    PyBuffer_Release(&view);
    if (text == NULL) {
        return NULL;
    }

    /* "O" takes its own reference, so ours is dropped afterwards. */
    triple = Py_BuildValue("Oni", text, nbytes, order);
    Py_DECREF(text);
    return triple;
}

401 402
/* utf_32_decode(data[, errors[, final]]) -> (str, bytes consumed)

   Decodes with PyUnicode_DecodeUTF32Stateful(), starting from
   byteorder 0.  The byte order the decoder ends up with is not
   returned.  Returns NULL on failure. */
static PyObject *
utf_32_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int order = 0;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_out;
    PyObject *text;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_32_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Stays at the full length when final is true, because the decoder
       is then given no place to store a count. */
    nbytes = view.len;
    nbytes_out = final ? NULL : &nbytes;

    text = PyUnicode_DecodeUTF32Stateful(view.buf, view.len, errors,
                                         &order, nbytes_out);
    PyBuffer_Release(&view);

    /* codec_tuple() hands a NULL result straight back. */
    return codec_tuple(text, nbytes);
}

/* utf_32_le_decode(data[, errors[, final]]) -> (str, bytes consumed)

   Same as utf_32_decode but the decoder is started with byteorder -1.
   Returns NULL on failure. */
static PyObject *
utf_32_le_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int order = -1;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_out;
    PyObject *text;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_32_le_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Stays at the full length when final is true, because the decoder
       is then given no place to store a count. */
    nbytes = view.len;
    nbytes_out = final ? NULL : &nbytes;

    text = PyUnicode_DecodeUTF32Stateful(view.buf, view.len, errors,
                                         &order, nbytes_out);
    PyBuffer_Release(&view);

    /* codec_tuple() hands a NULL result straight back. */
    return codec_tuple(text, nbytes);
}

/* utf_32_be_decode(data[, errors[, final]]) -> (str, bytes consumed)

   Same as utf_32_decode but the decoder is started with byteorder 1.
   Returns NULL on failure. */
static PyObject *
utf_32_be_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int order = 1;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_out;
    PyObject *text;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_32_be_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Stays at the full length when final is true, because the decoder
       is then given no place to store a count. */
    nbytes = view.len;
    nbytes_out = final ? NULL : &nbytes;

    text = PyUnicode_DecodeUTF32Stateful(view.buf, view.len, errors,
                                         &order, nbytes_out);
    PyBuffer_Release(&view);

    /* codec_tuple() hands a NULL result straight back. */
    return codec_tuple(text, nbytes);
}

/* Non-standard variant that exposes the byteorder parameter of the
   builtin UTF-32 codec.

   utf_32_ex_decode(data[, errors[, byteorder[, final]]])
       -> (unicode, bytesread, byteorder)

   The returned byteorder is the value in effect at the end of data.
   Returns NULL on failure.
*/

static PyObject *
utf_32_ex_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int order = 0;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_out;
    PyObject *text;
    PyObject *triple;

    if (!PyArg_ParseTuple(args, "y*|zii:utf_32_ex_decode",
                          &view, &errors, &order, &final)) {
        return NULL;
    }

    /* Stays at the full length when final is true, because the decoder
       is then given no place to store a count. */
    nbytes = view.len;
    nbytes_out = final ? NULL : &nbytes;

    text = PyUnicode_DecodeUTF32Stateful(view.buf, view.len, errors,
                                         &order, nbytes_out);
    PyBuffer_Release(&view);
    if (text == NULL) {
        return NULL;
    }

    /* "O" takes its own reference, so ours is dropped afterwards. */
    triple = Py_BuildValue("Oni", text, nbytes, order);
    Py_DECREF(text);
    return triple;
}

503 504
/* unicode_escape_decode(data[, errors]) -> (str, len(data))

   Decodes the buffer with PyUnicode_DecodeUnicodeEscape() and reports
   the whole buffer length as consumed.  Returns NULL on failure. */
static PyObject *
unicode_escape_decode(PyObject *self,
                     PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    PyObject *unicode;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "s*|z:unicode_escape_decode",
                          &pbuf, &errors))
        return NULL;

    /* Take the length before releasing the view so that nothing is read
       from pbuf once it has been handed back. */
    size = pbuf.len;
    unicode = PyUnicode_DecodeUnicodeEscape(pbuf.buf, size, errors);
    PyBuffer_Release(&pbuf);
    return codec_tuple(unicode, size);
}

/* raw_unicode_escape_decode(data[, errors]) -> (str, len(data))

   Decodes the buffer with PyUnicode_DecodeRawUnicodeEscape() and reports
   the whole buffer length as consumed.  Returns NULL on failure. */
static PyObject *
raw_unicode_escape_decode(PyObject *self,
                        PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    PyObject *unicode;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "s*|z:raw_unicode_escape_decode",
                          &pbuf, &errors))
        return NULL;

    /* Take the length before releasing the view so that nothing is read
       from pbuf once it has been handed back. */
    size = pbuf.len;
    unicode = PyUnicode_DecodeRawUnicodeEscape(pbuf.buf, size, errors);
    PyBuffer_Release(&pbuf);
    return codec_tuple(unicode, size);
}

/* latin_1_decode(data[, errors]) -> (str, len(data))

   Decodes the buffer with PyUnicode_DecodeLatin1() and reports the whole
   buffer length as consumed.  Returns NULL on failure. */
static PyObject *
latin_1_decode(PyObject *self,
               PyObject *args)
{
    Py_buffer pbuf;
    PyObject *unicode;
    const char *errors = NULL;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "y*|z:latin_1_decode",
                          &pbuf, &errors))
        return NULL;

    /* Take the length before releasing the view so that nothing is read
       from pbuf once it has been handed back. */
    size = pbuf.len;
    unicode = PyUnicode_DecodeLatin1(pbuf.buf, size, errors);
    PyBuffer_Release(&pbuf);
    return codec_tuple(unicode, size);
}

/* ascii_decode(data[, errors]) -> (str, len(data))

   Decodes the buffer with PyUnicode_DecodeASCII() and reports the whole
   buffer length as consumed.  Returns NULL on failure. */
static PyObject *
ascii_decode(PyObject *self,
             PyObject *args)
{
    Py_buffer pbuf;
    PyObject *unicode;
    const char *errors = NULL;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "y*|z:ascii_decode",
                          &pbuf, &errors))
        return NULL;

    /* Take the length before releasing the view so that nothing is read
       from pbuf once it has been handed back. */
    size = pbuf.len;
    unicode = PyUnicode_DecodeASCII(pbuf.buf, size, errors);
    PyBuffer_Release(&pbuf);
    return codec_tuple(unicode, size);
}

/* charmap_decode(data[, errors[, mapping]]) -> (str, len(data))

   Decodes the buffer with PyUnicode_DecodeCharmap().  A mapping of None
   is treated the same as an omitted mapping (NULL is passed on).  The
   whole buffer length is reported as consumed.  Returns NULL on
   failure. */
static PyObject *
charmap_decode(PyObject *self,
               PyObject *args)
{
    Py_buffer pbuf;
    PyObject *unicode;
    const char *errors = NULL;
    PyObject *mapping = NULL;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "y*|zO:charmap_decode",
                          &pbuf, &errors, &mapping))
        return NULL;
    if (mapping == Py_None)
        mapping = NULL;

    /* Take the length before releasing the view so that nothing is read
       from pbuf once it has been handed back. */
    size = pbuf.len;
    unicode = PyUnicode_DecodeCharmap(pbuf.buf, size, mapping, errors);
    PyBuffer_Release(&pbuf);
    return codec_tuple(unicode, size);
}

591
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum's avatar
Guido van Rossum committed
592 593 594

/* mbcs_decode(data[, errors[, final]]) -> (str, bytes consumed)

   Decodes the buffer with PyUnicode_DecodeMBCSStateful().  Unless `final`
   is true the decoder is given a place to store the consumed count;
   otherwise the full buffer length is reported.  Returns NULL on
   failure. */
static PyObject *
mbcs_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_out;
    PyObject *text;

    if (!PyArg_ParseTuple(args, "y*|zi:mbcs_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    nbytes = view.len;
    nbytes_out = final ? NULL : &nbytes;

    text = PyUnicode_DecodeMBCSStateful(view.buf, view.len, errors,
                                        nbytes_out);
    PyBuffer_Release(&view);

    /* codec_tuple() hands a NULL result straight back. */
    return codec_tuple(text, nbytes);
}

616
#endif /* MS_WINDOWS */
Guido van Rossum's avatar
Guido van Rossum committed
617

618 619 620 621
/* --- Encoder ------------------------------------------------------------ */

/* readbuffer_encode(data[, errors]) -> (bytes, len(data))

   Copies the contents of the buffer obtained via the "s*" format unit
   into a new bytes object.  `errors` is parsed but not otherwise used.
   Returns NULL on failure. */
static PyObject *
readbuffer_encode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    const char *bytes_in;
    Py_ssize_t nbytes;
    PyObject *copy;

    if (!PyArg_ParseTuple(args, "s*|z:readbuffer_encode",
                          &view, &errors)) {
        return NULL;
    }

    bytes_in = view.buf;
    nbytes = view.len;
    copy = PyBytes_FromStringAndSize(bytes_in, nbytes);
    PyBuffer_Release(&view);

    return codec_tuple(copy, nbytes);
}

641 642
/* unicode_internal_encode(obj[, errors]) -> (bytes, consumed)

   For a unicode object the bytes come from PyUnicode_AS_DATA() /
   PyUnicode_GET_DATA_SIZE() and `consumed` is PyUnicode_GET_SIZE().
   For any other object the bytes are read through
   PyObject_AsReadBuffer() and `consumed` is the buffer length.
   `errors` is parsed but not otherwise used.  Returns NULL on failure. */
static PyObject *
unicode_internal_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    const char *errors = NULL;
    const char *raw;
    Py_ssize_t raw_len;
    Py_ssize_t consumed;

    if (!PyArg_ParseTuple(args, "O|z:unicode_internal_encode",
                          &input, &errors)) {
        return NULL;
    }

    if (PyUnicode_Check(input)) {
        raw = PyUnicode_AS_DATA(input);
        raw_len = PyUnicode_GET_DATA_SIZE(input);
        consumed = PyUnicode_GET_SIZE(input);
    }
    else {
        if (PyObject_AsReadBuffer(input, (const void **)&raw, &raw_len) != 0) {
            return NULL;
        }
        consumed = raw_len;
    }

    return codec_tuple(PyBytes_FromStringAndSize(raw, raw_len), consumed);
}

667 668
/* utf_7_encode(obj[, errors]) -> (encoded, PyUnicode_GET_SIZE(obj))

   Converts obj with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeUTF7(), passing 0 for both of that function's flag
   arguments.  Returns NULL on failure. */
static PyObject *
utf_7_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_7_encode", &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    encoded = PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(text),
                                   PyUnicode_GET_SIZE(text),
                                   0,
                                   0,
                                   errors);
    /* codec_tuple() copes with a NULL `encoded`. */
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(text));
    Py_DECREF(text);
    return result;
}

691 692
/* utf_8_encode(obj[, errors]) -> (encoded, PyUnicode_GET_SIZE(obj))

   Converts obj with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeUTF8().  Returns NULL on failure. */
static PyObject *
utf_8_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_8_encode", &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    encoded = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(text),
                                   PyUnicode_GET_SIZE(text),
                                   errors);
    /* codec_tuple() copes with a NULL `encoded`. */
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(text));
    Py_DECREF(text);
    return result;
}

/* Exposes the byteorder parameter of the builtin UTF-16 codecs as an
   optional third argument.  The default of 0 means: use the native byte
   order and prepend the data with a BOM mark.

   utf_16_encode(obj[, errors[, byteorder]])
       -> (encoded, PyUnicode_GET_SIZE(obj))

*/

static PyObject *
utf_16_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    int order = 0;

    if (!PyArg_ParseTuple(args, "O|zi:utf_16_encode",
                          &input, &errors, &order)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    encoded = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
                                    PyUnicode_GET_SIZE(text),
                                    errors,
                                    order);
    /* codec_tuple() copes with a NULL `encoded`. */
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(text));
    Py_DECREF(text);
    return result;
}

/* utf_16_le_encode(obj[, errors]) -> (encoded, PyUnicode_GET_SIZE(obj))

   Converts obj with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeUTF16() using byteorder -1.  Returns NULL on
   failure. */
static PyObject *
utf_16_le_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_16_le_encode", &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    encoded = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
                                    PyUnicode_GET_SIZE(text),
                                    errors,
                                    -1);
    /* codec_tuple() copes with a NULL `encoded`. */
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(text));
    Py_DECREF(text);
    return result;
}

/* utf_16_be_encode(obj[, errors]) -> (encoded, PyUnicode_GET_SIZE(obj))

   Converts obj with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeUTF16() using byteorder +1.  Returns NULL on
   failure. */
static PyObject *
utf_16_be_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_16_be_encode", &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    encoded = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
                                    PyUnicode_GET_SIZE(text),
                                    errors,
                                    +1);
    /* codec_tuple() copes with a NULL `encoded`. */
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(text));
    Py_DECREF(text);
    return result;
}

790 791 792 793 794 795 796 797 798
/* Exposes the byteorder parameter of the builtin UTF-32 codecs as an
   optional third argument.  The default of 0 means: use the native byte
   order and prepend the data with a BOM mark.

   utf_32_encode(obj[, errors[, byteorder]])
       -> (encoded, PyUnicode_GET_SIZE(obj))

*/

static PyObject *
utf_32_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    int order = 0;

    if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode",
                          &input, &errors, &order)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    encoded = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
                                    PyUnicode_GET_SIZE(text),
                                    errors,
                                    order);
    /* codec_tuple() copes with a NULL `encoded`. */
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(text));
    Py_DECREF(text);
    return result;
}

/* utf_32_le_encode(obj[, errors]) -> (encoded, PyUnicode_GET_SIZE(obj))

   Converts obj with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeUTF32() using byteorder -1.  Returns NULL on
   failure. */
static PyObject *
utf_32_le_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode", &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    encoded = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
                                    PyUnicode_GET_SIZE(text),
                                    errors,
                                    -1);
    /* codec_tuple() copes with a NULL `encoded`. */
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(text));
    Py_DECREF(text);
    return result;
}

/* utf_32_be_encode(obj[, errors]) -> (encoded, PyUnicode_GET_SIZE(obj))

   Converts obj with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeUTF32() using byteorder +1.  Returns NULL on
   failure. */
static PyObject *
utf_32_be_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode", &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    encoded = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
                                    PyUnicode_GET_SIZE(text),
                                    errors,
                                    +1);
    /* codec_tuple() copes with a NULL `encoded`. */
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(text));
    Py_DECREF(text);
    return result;
}

867 868
/* unicode_escape_encode(obj[, errors])
       -> (encoded, PyUnicode_GET_SIZE(obj))

   Converts obj with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeUnicodeEscape().  `errors` is parsed but is not passed
   on to the encoder.  Returns NULL on failure. */
static PyObject *
unicode_escape_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:unicode_escape_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    encoded = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(text),
                                            PyUnicode_GET_SIZE(text));
    /* codec_tuple() copes with a NULL `encoded`. */
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(text));
    Py_DECREF(text);
    return result;
}

/* raw_unicode_escape_encode(obj[, errors])
       -> (encoded, PyUnicode_GET_SIZE(obj))

   Converts obj with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeRawUnicodeEscape().  `errors` is parsed but is not
   passed on to the encoder.  Returns NULL on failure. */
static PyObject *
raw_unicode_escape_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:raw_unicode_escape_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    encoded = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(text),
                                               PyUnicode_GET_SIZE(text));
    /* codec_tuple() copes with a NULL `encoded`. */
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(text));
    Py_DECREF(text);
    return result;
}

/* latin_1_encode(obj[, errors]) -> (encoded, PyUnicode_GET_SIZE(obj))

   Converts obj with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeLatin1().  Returns NULL on failure. */
static PyObject *
latin_1_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:latin_1_encode", &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    encoded = PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(text),
                                     PyUnicode_GET_SIZE(text),
                                     errors);
    /* codec_tuple() copes with a NULL `encoded`. */
    result = codec_tuple(encoded, PyUnicode_GET_SIZE(text));
    Py_DECREF(text);
    return result;
}

/* ascii_encode(obj[, errors]) -> result of codec_tuple()
 *
 * Coerces the first argument with PyUnicode_FromObject() and encodes it
 * with PyUnicode_EncodeASCII(), forwarding the optional `errors` name
 * (a string or None; NULL when omitted).
 *
 * Returns NULL with an exception set when argument parsing or the
 * coercion fails.  The encoder's result -- possibly NULL -- is passed
 * unchanged to codec_tuple() together with the input length;
 * codec_tuple() lives elsewhere in this file and is presumably what
 * propagates an encoder failure (verify there).
 */
static PyObject *
ascii_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *unicode;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    Py_ssize_t length;

    if (!PyArg_ParseTuple(args, "O|z:ascii_encode",
                          &input, &errors)) {
        return NULL;
    }

    /* New reference; released below once the encoder is done with it. */
    unicode = PyUnicode_FromObject(input);
    if (unicode == NULL) {
        return NULL;
    }

    length = PyUnicode_GET_SIZE(unicode);
    encoded = PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
                                    length,
                                    errors);
    result = codec_tuple(encoded, length);

    Py_DECREF(unicode);
    return result;
}

/* charmap_encode(obj[, errors[, mapping]]) -> result of codec_tuple()
 *
 * Coerces the first argument with PyUnicode_FromObject() and encodes it
 * with PyUnicode_EncodeCharmap(), forwarding the optional `errors` name
 * and `mapping` object.  A mapping of None is treated exactly like an
 * omitted mapping: the encoder receives NULL in both cases.
 *
 * Returns NULL with an exception set when argument parsing or the
 * coercion fails.  The encoder's result -- possibly NULL -- is passed
 * unchanged to codec_tuple() together with the input length;
 * codec_tuple() lives elsewhere in this file and is presumably what
 * propagates an encoder failure (verify there).
 */
static PyObject *
charmap_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *unicode;
    PyObject *encoded;
    PyObject *result;
    PyObject *mapping = NULL;
    const char *errors = NULL;
    Py_ssize_t length;

    if (!PyArg_ParseTuple(args, "O|zO:charmap_encode",
                          &input, &errors, &mapping)) {
        return NULL;
    }

    /* Normalise an explicit None to "no mapping supplied". */
    if (mapping == Py_None) {
        mapping = NULL;
    }

    /* New reference; released below once the encoder is done with it. */
    unicode = PyUnicode_FromObject(input);
    if (unicode == NULL) {
        return NULL;
    }

    length = PyUnicode_GET_SIZE(unicode);
    encoded = PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
                                      length,
                                      mapping,
                                      errors);
    result = codec_tuple(encoded, length);

    Py_DECREF(unicode);
    return result;
}

983 984 985 986 987 988 989 990 991
/* charmap_build(map) -> result of PyUnicode_BuildEncodingMap(map)
 *
 * Accepts exactly one argument, parsed with the "U" format unit, and
 * passes it on unchanged.  Returns NULL with an exception set when the
 * argument does not parse.
 */
static PyObject *
charmap_build(PyObject *self, PyObject *args)
{
    PyObject *decoding_map = NULL;

    if (PyArg_ParseTuple(args, "U:charmap_build", &decoding_map)) {
        return PyUnicode_BuildEncodingMap(decoding_map);
    }
    return NULL;
}

992
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)

/* mbcs_encode(obj[, errors]) -> result of codec_tuple()
 *
 * Only compiled when both MS_WINDOWS and HAVE_USABLE_WCHAR_T are
 * defined.  Coerces the first argument with PyUnicode_FromObject() and
 * encodes it with PyUnicode_EncodeMBCS(), forwarding the optional
 * `errors` name (a string or None; NULL when omitted).
 *
 * Returns NULL with an exception set when argument parsing or the
 * coercion fails.  The encoder's result -- possibly NULL -- is passed
 * unchanged to codec_tuple() together with the input length;
 * codec_tuple() lives elsewhere in this file and is presumably what
 * propagates an encoder failure (verify there).
 */
static PyObject *
mbcs_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *unicode;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    Py_ssize_t length;

    if (!PyArg_ParseTuple(args, "O|z:mbcs_encode",
                          &input, &errors)) {
        return NULL;
    }

    /* New reference; released below once the encoder is done with it. */
    unicode = PyUnicode_FromObject(input);
    if (unicode == NULL) {
        return NULL;
    }

    length = PyUnicode_GET_SIZE(unicode);
    encoded = PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                   length,
                                   errors);
    result = codec_tuple(encoded, length);

    Py_DECREF(unicode);
    return result;
}

#endif /* MS_WINDOWS && HAVE_USABLE_WCHAR_T */

/* --- Error handler registry --------------------------------------------- */

PyDoc_STRVAR(register_error__doc__,
"register_error(errors, handler)\n\
\n\
Register the specified error handler under the name\n\
errors. handler must be a callable object, that\n\
will be called with an exception instance containing\n\
information about the location of the encoding/decoding\n\
error and must return a (replacement, new position) tuple.");

1030 1031 1032 1033 1034 1035
static PyObject *register_error(PyObject *self, PyObject *args)
{
    const char *name;
    PyObject *handler;

    if (!PyArg_ParseTuple(args, "sO:register_error",
1036 1037
                          &name, &handler))
        return NULL;
1038 1039
    if (PyCodec_RegisterError(name, handler))
        return NULL;
1040
    Py_RETURN_NONE;
1041 1042
}

1043 1044 1045 1046 1047 1048
PyDoc_STRVAR(lookup_error__doc__,
"lookup_error(errors) -> handler\n\
\n\
Return the error handler for the specified error handling name\n\
or raise a LookupError, if no handler exists under this name.");

1049 1050 1051 1052 1053
static PyObject *lookup_error(PyObject *self, PyObject *args)
{
    const char *name;

    if (!PyArg_ParseTuple(args, "s:lookup_error",
1054 1055
                          &name))
        return NULL;
1056 1057 1058
    return PyCodec_LookupError(name);
}

/* --- Module API --------------------------------------------------------- */

static PyMethodDef _codecs_functions[] = {
1062
    {"register",                codec_register,                 METH_O,
1063
        register__doc__},
1064
    {"lookup",                  codec_lookup,                   METH_VARARGS,
1065
        lookup__doc__},
1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103
    {"encode",                  codec_encode,                   METH_VARARGS,
        encode__doc__},
    {"decode",                  codec_decode,                   METH_VARARGS,
        decode__doc__},
    {"escape_encode",           escape_encode,                  METH_VARARGS},
    {"escape_decode",           escape_decode,                  METH_VARARGS},
    {"utf_8_encode",            utf_8_encode,                   METH_VARARGS},
    {"utf_8_decode",            utf_8_decode,                   METH_VARARGS},
    {"utf_7_encode",            utf_7_encode,                   METH_VARARGS},
    {"utf_7_decode",            utf_7_decode,                   METH_VARARGS},
    {"utf_16_encode",           utf_16_encode,                  METH_VARARGS},
    {"utf_16_le_encode",        utf_16_le_encode,               METH_VARARGS},
    {"utf_16_be_encode",        utf_16_be_encode,               METH_VARARGS},
    {"utf_16_decode",           utf_16_decode,                  METH_VARARGS},
    {"utf_16_le_decode",        utf_16_le_decode,               METH_VARARGS},
    {"utf_16_be_decode",        utf_16_be_decode,               METH_VARARGS},
    {"utf_16_ex_decode",        utf_16_ex_decode,               METH_VARARGS},
    {"utf_32_encode",           utf_32_encode,                  METH_VARARGS},
    {"utf_32_le_encode",        utf_32_le_encode,               METH_VARARGS},
    {"utf_32_be_encode",        utf_32_be_encode,               METH_VARARGS},
    {"utf_32_decode",           utf_32_decode,                  METH_VARARGS},
    {"utf_32_le_decode",        utf_32_le_decode,               METH_VARARGS},
    {"utf_32_be_decode",        utf_32_be_decode,               METH_VARARGS},
    {"utf_32_ex_decode",        utf_32_ex_decode,               METH_VARARGS},
    {"unicode_escape_encode",   unicode_escape_encode,          METH_VARARGS},
    {"unicode_escape_decode",   unicode_escape_decode,          METH_VARARGS},
    {"unicode_internal_encode", unicode_internal_encode,        METH_VARARGS},
    {"unicode_internal_decode", unicode_internal_decode,        METH_VARARGS},
    {"raw_unicode_escape_encode", raw_unicode_escape_encode,    METH_VARARGS},
    {"raw_unicode_escape_decode", raw_unicode_escape_decode,    METH_VARARGS},
    {"latin_1_encode",          latin_1_encode,                 METH_VARARGS},
    {"latin_1_decode",          latin_1_decode,                 METH_VARARGS},
    {"ascii_encode",            ascii_encode,                   METH_VARARGS},
    {"ascii_decode",            ascii_decode,                   METH_VARARGS},
    {"charmap_encode",          charmap_encode,                 METH_VARARGS},
    {"charmap_decode",          charmap_decode,                 METH_VARARGS},
    {"charmap_build",           charmap_build,                  METH_VARARGS},
    {"readbuffer_encode",       readbuffer_encode,              METH_VARARGS},
1104
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1105 1106
    {"mbcs_encode",             mbcs_encode,                    METH_VARARGS},
    {"mbcs_decode",             mbcs_decode,                    METH_VARARGS},
Guido van Rossum's avatar
Guido van Rossum committed
1107
#endif
1108
    {"register_error",          register_error,                 METH_VARARGS,
1109
        register_error__doc__},
1110
    {"lookup_error",            lookup_error,                   METH_VARARGS,
1111
        lookup_error__doc__},
1112
    {NULL, NULL}                /* sentinel */
1113 1114
};

1115
static struct PyModuleDef codecsmodule = {
1116 1117 1118 1119 1120 1121 1122 1123 1124
        PyModuleDef_HEAD_INIT,
        "_codecs",
        NULL,
        -1,
        _codecs_functions,
        NULL,
        NULL,
        NULL,
        NULL
1125 1126
};

1127
PyMODINIT_FUNC
1128
PyInit__codecs(void)
1129
{
1130
        return PyModule_Create(&codecsmodule);
1131
}