codecs.c 29.8 KB
Newer Older
1 2 3 4 5 6
/* ------------------------------------------------------------------------

   Python Codec Registry and support functions

Written by Marc-Andre Lemburg (mal@lemburg.com).

7
Copyright (c) Corporation for National Research Initiatives.
8 9 10 11 12 13

   ------------------------------------------------------------------------ */

#include "Python.h"
#include <ctype.h>

14 15
/* Lowercase hexadecimal digits indexed by nibble value (0-15); used by the
   backslashreplace handler in this file to emit \x, \u and \U escapes. */
const char *Py_hexdigits = "0123456789abcdef";

16 17 18
/* --- Codec Registry ----------------------------------------------------- */

/* Import the standard encodings package which will register the first
   codec search function.

   This is done in a lazy way so that the Unicode implementation does
   not slow down the startup of scripts not needing it.

   Only one try is made: once the registry containers exist, later calls
   return immediately. A failed import of the encodings package is
   reported to the caller (-1 is returned with the import error set).

*/

29
static int _PyCodecRegistry_Init(void); /* Forward */
30 31 32

/* Append search_function to the interpreter's list of codec search
   functions, creating the registry first if it does not exist yet.

   Returns 0 on success and -1 with an exception set on failure (registry
   initialization failed, search_function is NULL or not callable, or the
   list append failed). */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *state = PyThreadState_GET()->interp;

    /* Lazily create the registry on first use. */
    if (state->codec_search_path == NULL) {
        if (_PyCodecRegistry_Init() != 0)
            return -1;
    }

    if (!search_function) {
        PyErr_BadArgument();
        return -1;
    }

    if (PyCallable_Check(search_function) == 0) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    return PyList_Append(state->codec_search_path, search_function);
}

50 51 52
/* Convert a string to a normalized Python string: all characters are
   converted to lower case, spaces are replaced with underscores. */

53
static
54
PyObject *normalizestring(const char *string)
55
{
56
    register size_t i;
57
    size_t len = strlen(string);
58 59
    char *p;
    PyObject *v;
60

61
    if (len > PY_SSIZE_T_MAX) {
62 63
        PyErr_SetString(PyExc_OverflowError, "string is too large");
        return NULL;
64
    }
65 66 67 68

    p = PyMem_Malloc(len + 1);
    if (p == NULL)
        return NULL;
69 70 71 72 73
    for (i = 0; i < len; i++) {
        register char ch = string[i];
        if (ch == ' ')
            ch = '-';
        else
74
            ch = Py_TOLOWER(Py_CHARMASK(ch));
75
        p[i] = ch;
76
    }
77 78 79 80 81
    p[i] = '\0';
    v = PyUnicode_FromString(p);
    if (v == NULL)
        return NULL;
    PyMem_Free(p);
82 83 84 85 86 87 88 89 90 91
    return v;
}

/* Lookup the given encoding and return a tuple providing the codec
   facilities.

   The encoding string is looked up converted to all lower-case
   characters. This makes encodings looked up through this mechanism
   effectively case-insensitive.

92
   If no codec is found, a LookupError is set and NULL returned.
93 94 95 96 97 98

   As side effect, this tries to load the encodings package, if not
   yet done. This is part of the lazy load strategy for the encodings
   package.

*/
99 100 101

PyObject *_PyCodec_Lookup(const char *encoding)
{
102
    PyInterpreterState *interp;
Guido van Rossum's avatar
Guido van Rossum committed
103
    PyObject *result, *args = NULL, *v;
104
    Py_ssize_t i, len;
105

106
    if (encoding == NULL) {
107 108
        PyErr_BadArgument();
        goto onError;
109
    }
110

111
    interp = PyThreadState_GET()->interp;
112
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
113
        goto onError;
114

115
    /* Convert the encoding to a normalized Python string: all
116
       characters are converted to lower case, spaces and hyphens are
117 118
       replaced with underscores. */
    v = normalizestring(encoding);
119
    if (v == NULL)
120
        goto onError;
121
    PyUnicode_InternInPlace(&v);
122 123

    /* First, try to lookup the name in the registry dictionary */
124
    result = PyDict_GetItem(interp->codec_search_cache, v);
125
    if (result != NULL) {
126 127 128
        Py_INCREF(result);
        Py_DECREF(v);
        return result;
129
    }
130

131 132 133
    /* Next, scan the search functions in order of registration */
    args = PyTuple_New(1);
    if (args == NULL)
134
        goto onError;
135
    PyTuple_SET_ITEM(args,0,v);
Guido van Rossum's avatar
Guido van Rossum committed
136

137
    len = PyList_Size(interp->codec_search_path);
Guido van Rossum's avatar
Guido van Rossum committed
138
    if (len < 0)
139
        goto onError;
140
    if (len == 0) {
141 142 143 144
        PyErr_SetString(PyExc_LookupError,
                        "no codec search functions registered: "
                        "can't find encoding");
        goto onError;
145
    }
146 147

    for (i = 0; i < len; i++) {
148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
        PyObject *func;

        func = PyList_GetItem(interp->codec_search_path, i);
        if (func == NULL)
            goto onError;
        result = PyEval_CallObject(func, args);
        if (result == NULL)
            goto onError;
        if (result == Py_None) {
            Py_DECREF(result);
            continue;
        }
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
            PyErr_SetString(PyExc_TypeError,
                            "codec search functions must return 4-tuples");
            Py_DECREF(result);
            goto onError;
        }
        break;
167 168
    }
    if (i == len) {
169 170
        /* XXX Perhaps we should cache misses too ? */
        PyErr_Format(PyExc_LookupError,
171
                     "unknown encoding: %s", encoding);
172
        goto onError;
173 174 175
    }

    /* Cache and return the result */
Neal Norwitz's avatar
Neal Norwitz committed
176
    if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
177 178
        Py_DECREF(result);
        goto onError;
Neal Norwitz's avatar
Neal Norwitz committed
179
    }
180 181 182 183 184 185 186 187
    Py_DECREF(args);
    return result;

 onError:
    Py_XDECREF(args);
    return NULL;
}

188 189 190 191 192
/* Codec registry encoding check API. */

int PyCodec_KnownEncoding(const char *encoding)
{
    PyObject *codecs;
193

194 195
    codecs = _PyCodec_Lookup(encoding);
    if (!codecs) {
196 197
        PyErr_Clear();
        return 0;
198 199
    }
    else {
200 201
        Py_DECREF(codecs);
        return 1;
202 203 204
    }
}

205 206
/* Build the argument tuple for a codec call: (object,) when errors is
   NULL, otherwise (object, errors) with errors converted to a Python
   string. Returns a new reference or NULL with an exception set. */
static
PyObject *args_tuple(PyObject *object,
                     const char *errors)
{
    PyObject *tuple;
    PyObject *errstr;

    tuple = PyTuple_New(errors ? 2 : 1);
    if (!tuple)
        return NULL;

    /* The tuple needs its own reference to object. */
    Py_INCREF(object);
    PyTuple_SET_ITEM(tuple, 0, object);
    if (errors == NULL)
        return tuple;

    errstr = PyUnicode_FromString(errors);
    if (!errstr) {
        Py_DECREF(tuple);
        return NULL;
    }
    PyTuple_SET_ITEM(tuple, 1, errstr);
    return tuple;
}

229
/* Helper function to get a codec item */
230 231

static
232
PyObject *codec_getitem(const char *encoding, int index)
233 234 235 236 237 238
{
    PyObject *codecs;
    PyObject *v;

    codecs = _PyCodec_Lookup(encoding);
    if (codecs == NULL)
239
        return NULL;
240
    v = PyTuple_GET_ITEM(codecs, index);
241
    Py_DECREF(codecs);
242 243 244 245
    Py_INCREF(v);
    return v;
}

246 247 248 249
/* Helper function to create an incremental codec. */

static
PyObject *codec_getincrementalcodec(const char *encoding,
250 251
                                    const char *errors,
                                    const char *attrname)
252
{
253
    PyObject *codecs, *ret, *inccodec;
254 255 256

    codecs = _PyCodec_Lookup(encoding);
    if (codecs == NULL)
257
        return NULL;
258
    inccodec = PyObject_GetAttrString(codecs, attrname);
259
    Py_DECREF(codecs);
260
    if (inccodec == NULL)
261
        return NULL;
262
    if (errors)
263
        ret = PyObject_CallFunction(inccodec, "s", errors);
264
    else
265
        ret = PyObject_CallFunction(inccodec, NULL);
266 267
    Py_DECREF(inccodec);
    return ret;
268 269
}

270 271 272 273
/* Helper function to create a stream codec: calls entry number index of
   the codec tuple for encoding with (stream) or (stream, errors),
   depending on whether errors is NULL. Returns the call result (new
   reference) or NULL with an exception set. */

static
PyObject *codec_getstreamcodec(const char *encoding,
                               PyObject *stream,
                               const char *errors,
                               const int index)
{
    PyObject *info, *factory, *wrapped;

    info = _PyCodec_Lookup(encoding);
    if (!info)
        return NULL;

    /* Borrowed reference; info keeps the factory alive during the call. */
    factory = PyTuple_GET_ITEM(info, index);
    if (errors == NULL)
        wrapped = PyObject_CallFunction(factory, "O", stream);
    else
        wrapped = PyObject_CallFunction(factory, "Os", stream, errors);
    Py_DECREF(info);
    return wrapped;
}
292

293 294
/* Convenience APIs to query the Codec registry.

295
   All APIs return a codec object with incremented refcount.
296

297 298 299 300 301
 */

PyObject *PyCodec_Encoder(const char *encoding)
{
    /* Entry 0 of the codec tuple is the encoder function. */
    PyObject *encoder = codec_getitem(encoding, 0);

    return encoder;
}

304
PyObject *PyCodec_Decoder(const char *encoding)
{
    /* Entry 1 of the codec tuple is the decoder function. */
    PyObject *decoder = codec_getitem(encoding, 1);

    return decoder;
}
308

309
PyObject *PyCodec_IncrementalEncoder(const char *encoding,
                                     const char *errors)
{
    /* Instantiate the codec entry's "incrementalencoder" attribute. */
    PyObject *inc = codec_getincrementalcodec(encoding, errors,
                                              "incrementalencoder");

    return inc;
}
314

315
PyObject *PyCodec_IncrementalDecoder(const char *encoding,
                                     const char *errors)
{
    /* Instantiate the codec entry's "incrementaldecoder" attribute. */
    PyObject *inc = codec_getincrementalcodec(encoding, errors,
                                              "incrementaldecoder");

    return inc;
}

321
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    /* Entry 2 of the codec tuple is the stream reader factory. */
    PyObject *reader = codec_getstreamcodec(encoding, stream, errors, 2);

    return reader;
}

PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    /* Entry 3 of the codec tuple is the stream writer factory. */
    PyObject *writer = codec_getstreamcodec(encoding, stream, errors, 3);

    return writer;
}

/* Encode an object (e.g. a Unicode object) using the given encoding
   and return the resulting encoded object (usually a Python string).

   errors is passed to the encoder as second argument if non-NULL.
   Returns a new reference, or NULL with an exception set. */

PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *encoder;
    PyObject *call_args = NULL;
    PyObject *pair = NULL;
    PyObject *encoded;

    encoder = PyCodec_Encoder(encoding);
    if (encoder == NULL)
        return NULL;

    call_args = args_tuple(object, errors);
    if (call_args == NULL)
        goto failed;

    pair = PyEval_CallObject(encoder, call_args);
    if (pair == NULL)
        goto failed;

    if (!(PyTuple_Check(pair) && PyTuple_GET_SIZE(pair) == 2)) {
        PyErr_SetString(PyExc_TypeError,
                        "encoder must return a tuple (object, integer)");
        goto failed;
    }

    /* Only the first entry is used; the second (integer) entry is
       neither checked nor returned. */
    encoded = PyTuple_GET_ITEM(pair, 0);
    Py_INCREF(encoded);

    Py_DECREF(call_args);
    Py_DECREF(encoder);
    Py_DECREF(pair);
    return encoded;

 failed:
    Py_XDECREF(pair);
    Py_XDECREF(call_args);
    Py_XDECREF(encoder);
    return NULL;
}

/* Decode an object (usually a Python string) using the given encoding
   and return an equivalent object (e.g. a Unicode object).

   errors is passed to the decoder as second argument if non-NULL.
   Returns a new reference, or NULL with an exception set. */

PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *decoder;
    PyObject *call_args = NULL;
    PyObject *pair = NULL;
    PyObject *decoded;

    decoder = PyCodec_Decoder(encoding);
    if (decoder == NULL)
        return NULL;

    call_args = args_tuple(object, errors);
    if (call_args == NULL)
        goto failed;

    pair = PyEval_CallObject(decoder, call_args);
    if (pair == NULL)
        goto failed;

    if (!(PyTuple_Check(pair) && PyTuple_GET_SIZE(pair) == 2)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoder must return a tuple (object,integer)");
        goto failed;
    }

    /* Only the first entry is used; the second (integer) entry is
       neither checked nor returned. */
    decoded = PyTuple_GET_ITEM(pair, 0);
    Py_INCREF(decoded);

    Py_DECREF(call_args);
    Py_DECREF(decoder);
    Py_DECREF(pair);
    return decoded;

 failed:
    Py_XDECREF(call_args);
    Py_XDECREF(decoder);
    Py_XDECREF(pair);
    return NULL;
}

428 429 430 431 432 433 434 435
/* Register the error handling callback function error under the name
   name. This function will be called by the codec when it encounters
   unencodable characters/undecodable bytes and name is specified as the
   error parameter in the call to the encode/decode function.
   Return 0 on success, -1 on error (with an exception set). */
int PyCodec_RegisterError(const char *name, PyObject *error)
{
    PyInterpreterState *state = PyThreadState_GET()->interp;

    /* Lazily create the registry on first use. */
    if (state->codec_search_path == NULL) {
        if (_PyCodecRegistry_Init())
            return -1;
    }

    if (PyCallable_Check(error)) {
        return PyDict_SetItemString(state->codec_error_registry,
                                    (char *)name, error);
    }

    PyErr_SetString(PyExc_TypeError, "handler must be callable");
    return -1;
}

/* Lookup the error handling callback function registered under the
   name error. As a special case NULL can be passed, in which case
   the error handling callback for strict encoding will be returned. */
PyObject *PyCodec_LookupError(const char *name)
{
    PyObject *handler = NULL;

454
    PyInterpreterState *interp = PyThreadState_GET()->interp;
455
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
456
        return NULL;
457

458
    if (name==NULL)
459
        name = "strict";
460
    handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
461
    if (!handler)
462
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
463
    else
464
        Py_INCREF(handler);
465 466 467 468 469
    return handler;
}

/* Set a TypeError naming the class of exc, for error callbacks that were
   handed an exception type they cannot handle. If looking up __class__
   or __name__ fails, the exception raised by that lookup is left set. */
static void wrong_exception_type(PyObject *exc)
{
    _Py_IDENTIFIER(__class__);
    _Py_IDENTIFIER(__name__);
    PyObject *cls;
    PyObject *clsname;

    cls = _PyObject_GetAttrId(exc, &PyId___class__);
    if (cls == NULL)
        return;

    clsname = _PyObject_GetAttrId(cls, &PyId___name__);
    Py_DECREF(cls);
    if (clsname == NULL)
        return;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %S in error callback", clsname);
    Py_DECREF(clsname);
}

/* 'strict' error handler: re-raise exc. Always returns NULL; a TypeError
   is set instead when exc is not an exception instance. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}


/* 'ignore' error handler: returns the tuple (empty string, end), where
   end is the end position stored in the Unicode encode, decode or
   translate error exc. Any other exception type yields NULL with a
   TypeError set. */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError))
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError))
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError))
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed)
        return NULL;
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}


PyObject *PyCodec_ReplaceErrors(PyObject *exc)
{
Martin v. Löwis's avatar
Martin v. Löwis committed
519
    Py_ssize_t start, end, i, len;
520 521

    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
522
        PyObject *res;
Martin v. Löwis's avatar
Martin v. Löwis committed
523 524
        int kind;
        void *data;
525 526 527 528
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
Martin v. Löwis's avatar
Martin v. Löwis committed
529 530
        len = end - start;
        res = PyUnicode_New(len, '?');
531 532
        if (res == NULL)
            return NULL;
Martin v. Löwis's avatar
Martin v. Löwis committed
533 534 535 536
        kind = PyUnicode_KIND(res);
        data = PyUnicode_DATA(res);
        for (i = 0; i < len; ++i)
            PyUnicode_WRITE(kind, data, i, '?');
537
        assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwis's avatar
Martin v. Löwis committed
538
        return Py_BuildValue("(Nn)", res, end);
539 540
    }
    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
541 542
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
543 544 545
        return Py_BuildValue("(Cn)",
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                             end);
546 547
    }
    else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
548
        PyObject *res;
Martin v. Löwis's avatar
Martin v. Löwis committed
549 550
        int kind;
        void *data;
551 552 553 554
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
Martin v. Löwis's avatar
Martin v. Löwis committed
555 556
        len = end - start;
        res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
557 558
        if (res == NULL)
            return NULL;
Martin v. Löwis's avatar
Martin v. Löwis committed
559 560 561 562
        kind = PyUnicode_KIND(res);
        data = PyUnicode_DATA(res);
        for (i=0; i < len; i++)
            PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
563
        assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwis's avatar
Martin v. Löwis committed
564
        return Py_BuildValue("(Nn)", res, end);
565 566
    }
    else {
567 568
        wrong_exception_type(exc);
        return NULL;
569 570 571 572 573 574
    }
}

PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
{
    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
575 576
        PyObject *restuple;
        PyObject *object;
577
        Py_ssize_t i;
578 579 580
        Py_ssize_t start;
        Py_ssize_t end;
        PyObject *res;
581
        unsigned char *outp;
582
        int ressize;
583
        Py_UCS4 ch;
584 585 586 587 588 589
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
590 591 592 593
        for (i = start, ressize = 0; i < end; ++i) {
            /* object is guaranteed to be "ready" */
            ch = PyUnicode_READ_CHAR(object, i);
            if (ch<10)
594
                ressize += 2+1+1;
595
            else if (ch<100)
596
                ressize += 2+2+1;
597
            else if (ch<1000)
598
                ressize += 2+3+1;
599
            else if (ch<10000)
600
                ressize += 2+4+1;
601
            else if (ch<100000)
602
                ressize += 2+5+1;
603
            else if (ch<1000000)
604 605 606 607 608
                ressize += 2+6+1;
            else
                ressize += 2+7+1;
        }
        /* allocate replacement */
609
        res = PyUnicode_New(ressize, 127);
610 611 612 613
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
614
        outp = PyUnicode_1BYTE_DATA(res);
615
        /* generate replacement */
616
        for (i = start; i < end; ++i) {
617 618
            int digits;
            int base;
619
            ch = PyUnicode_READ_CHAR(object, i);
620 621
            *outp++ = '&';
            *outp++ = '#';
622
            if (ch<10) {
623 624 625
                digits = 1;
                base = 1;
            }
626
            else if (ch<100) {
627 628 629
                digits = 2;
                base = 10;
            }
630
            else if (ch<1000) {
631 632 633
                digits = 3;
                base = 100;
            }
634
            else if (ch<10000) {
635 636 637
                digits = 4;
                base = 1000;
            }
638
            else if (ch<100000) {
639 640 641
                digits = 5;
                base = 10000;
            }
642
            else if (ch<1000000) {
643 644 645 646 647 648 649 650
                digits = 6;
                base = 100000;
            }
            else {
                digits = 7;
                base = 1000000;
            }
            while (digits-->0) {
651 652
                *outp++ = '0' + ch/base;
                ch %= base;
653 654 655 656
                base /= 10;
            }
            *outp++ = ';';
        }
657 658
        assert(_PyUnicode_CheckConsistency(res, 1));
        restuple = Py_BuildValue("(Nn)", res, end);
659 660
        Py_DECREF(object);
        return restuple;
661 662
    }
    else {
663 664
        wrong_exception_type(exc);
        return NULL;
665 666 667 668 669 670
    }
}

/* 'backslashreplace' error handler: replaces every character in the
   range [start, end) of a UnicodeEncodeError with a backslash escape:
   \xNN below 0x100, \uNNNN below 0x10000 and \UNNNNNNNN otherwise.

   Returns the tuple (replacement, end), or NULL with an exception set.
   Exception types other than UnicodeEncodeError raise TypeError. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        PyObject *restuple;
        PyObject *object;
        Py_ssize_t i;
        Py_ssize_t start;
        Py_ssize_t end;
        PyObject *res;
        unsigned char *outp;
        Py_ssize_t ressize;
        Py_UCS4 c;
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        /* Each character expands to at most 1+1+8 output characters;
           shorten the handled range so the size computation below cannot
           overflow. The codec resumes at the returned end position. */
        if (end - start > PY_SSIZE_T_MAX / (1+1+8))
            end = start + PY_SSIZE_T_MAX / (1+1+8);
        for (i = start, ressize = 0; i < end; ++i) {
            /* object is guaranteed to be "ready" */
            c = PyUnicode_READ_CHAR(object, i);
            if (c >= 0x10000) {
                ressize += 1+1+8;
            }
            else if (c >= 0x100) {
                ressize += 1+1+4;
            }
            else
                ressize += 1+1+2;
        }
        /* the replacement is pure ASCII, hence max char 127 */
        res = PyUnicode_New(ressize, 127);
        if (res == NULL) {
            /* do not leak the reference returned by GetObject */
            Py_DECREF(object);
            return NULL;
        }
        for (i = start, outp = PyUnicode_1BYTE_DATA(res);
            i < end; ++i) {
            c = PyUnicode_READ_CHAR(object, i);
            *outp++ = '\\';
            if (c >= 0x00010000) {
                *outp++ = 'U';
                *outp++ = Py_hexdigits[(c>>28)&0xf];
                *outp++ = Py_hexdigits[(c>>24)&0xf];
                *outp++ = Py_hexdigits[(c>>20)&0xf];
                *outp++ = Py_hexdigits[(c>>16)&0xf];
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else if (c >= 0x100) {
                *outp++ = 'u';
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else
                *outp++ = 'x';
            /* the two low nibbles are common to all three escape forms */
            *outp++ = Py_hexdigits[(c>>4)&0xf];
            *outp++ = Py_hexdigits[c&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        restuple = Py_BuildValue("(Nn)", res, end);
        Py_DECREF(object);
        return restuple;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}

736 737 738
/* This handler is declared static until someone demonstrates
   a need to call it directly.

   'surrogatepass' error handler. When encoding, every character in
   [start, end) must be a surrogate (U+D800..U+DFFF) and is written as
   its three-byte UTF-8-style sequence. When decoding, a single
   three-byte sequence at start that encodes a surrogate is turned back
   into that code point. In every other case the original exception is
   re-raised. Returns (replacement, new position) or NULL with an
   exception set. */
static PyObject *
PyCodec_SurrogatePassErrors(PyObject *exc)
{
    PyObject *restuple;
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        char *outp;
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
        if (!res) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyBytes_AsString(res);
        for (i = start; i < end; i++) {
            /* object is guaranteed to be "ready" */
            Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
            if (ch < 0xd800 || ch > 0xdfff) {
                /* Not a surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                Py_DECREF(res);
                Py_DECREF(object);
                return NULL;
            }
            *outp++ = (char)(0xe0 | (ch >> 12));
            *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *outp++ = (char)(0x80 | (ch & 0x3f));
        }
        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
        Py_DECREF(object);
        return restuple;
    }
    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
        unsigned char *p;
        Py_ssize_t size;
        Py_UCS4 ch = 0;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
            Py_DECREF(object);
            return NULL;
        }
        size = PyBytes_GET_SIZE(object);
        /* Try decoding a single surrogate character. If
           there are more, let the codec call us again.
           All three bytes must be present and ALL of them must match
           the pattern 1110xxxx 10xxxxxx 10xxxxxx. */
        if (start >= 0 && size - start >= 3) {
            p += start;
            if ((p[0] & 0xf0) == 0xe0 &&
                (p[1] & 0xc0) == 0x80 &&
                (p[2] & 0xc0) == 0x80) {
                /* it's a three-byte code */
                ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
                if (ch < 0xd800 || ch > 0xdfff)
                    /* it's not a surrogate - fail */
                    ch = 0;
            }
        }
        Py_DECREF(object);
        if (ch == 0) {
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        res = PyUnicode_FromOrdinal(ch);
        if (res == NULL)
            return NULL;
        return Py_BuildValue("(Nn)", res, start+3);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}

819
/* 'surrogateescape' error handler. When encoding, every character in
   [start, end) must lie in U+DC80..U+DCFF and is written as the single
   byte (code point - 0xDC00). When decoding, up to four consecutive
   non-ASCII bytes starting at start are mapped to 0xDC00 + byte. In
   every other case the original exception is re-raised. Returns
   (replacement, new position) or NULL with an exception set. */
static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
    PyObject *source;
    PyObject *out;
    PyObject *answer;
    Py_ssize_t start;
    Py_ssize_t end;
    Py_ssize_t pos;

    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        char *dst;

        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        source = PyUnicodeEncodeError_GetObject(exc);
        if (source == NULL)
            return NULL;
        out = PyBytes_FromStringAndSize(NULL, end-start);
        if (out == NULL) {
            Py_DECREF(source);
            return NULL;
        }
        dst = PyBytes_AsString(out);
        for (pos = start; pos < end; pos++) {
            /* object is guaranteed to be "ready" */
            Py_UCS4 ch = PyUnicode_READ_CHAR(source, pos);
            if (ch < 0xdc80 || ch > 0xdcff) {
                /* Not a UTF-8b surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                Py_DECREF(out);
                Py_DECREF(source);
                return NULL;
            }
            *dst++ = ch - 0xdc00;
        }
        answer = Py_BuildValue("(On)", out, end);
        Py_DECREF(out);
        Py_DECREF(source);
        return answer;
    }

    if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
        unsigned char *bytes;
        Py_UCS2 escaped[4]; /* decode up to 4 bad bytes. */
        int consumed;

        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        source = PyUnicodeDecodeError_GetObject(exc);
        if (source == NULL)
            return NULL;
        bytes = (unsigned char*)PyBytes_AsString(source);
        if (bytes == NULL) {
            Py_DECREF(source);
            return NULL;
        }
        for (consumed = 0; consumed < 4 && consumed < end-start; consumed++) {
            unsigned char byte = bytes[start+consumed];
            /* Refuse to escape ASCII bytes. */
            if (byte < 128)
                break;
            escaped[consumed] = 0xdc00 + byte;
        }
        Py_DECREF(source);
        if (consumed == 0) {
            /* codec complained about ASCII byte. */
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        out = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, escaped, consumed);
        if (out == NULL)
            return NULL;
        return Py_BuildValue("(Nn)", out, start+consumed);
    }

    wrong_exception_type(exc);
    return NULL;
}

898

899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927
/* METH_O adapter for PyCodec_StrictErrors; self is unused. */
static PyObject *strict_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_StrictErrors(exc);
}


/* METH_O adapter for PyCodec_IgnoreErrors; self is unused. */
static PyObject *ignore_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_IgnoreErrors(exc);
}


/* METH_O adapter for PyCodec_ReplaceErrors; self is unused. */
static PyObject *replace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_ReplaceErrors(exc);
}


/* METH_O adapter for PyCodec_XMLCharRefReplaceErrors; self is unused. */
static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_XMLCharRefReplaceErrors(exc);
}


/* METH_O adapter for PyCodec_BackslashReplaceErrors; self is unused. */
static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_BackslashReplaceErrors(exc);
}

928
/* METH_O adapter for PyCodec_SurrogatePassErrors; self is unused. */
static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogatePassErrors(exc);
}

933
/* METH_O adapter for PyCodec_SurrogateEscapeErrors; self is unused. */
static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogateEscapeErrors(exc);
}

938
static int _PyCodecRegistry_Init(void)
939
{
940
    static struct {
941 942
        char *name;
        PyMethodDef def;
943 944
    } methods[] =
    {
945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012
        {
            "strict",
            {
                "strict_errors",
                strict_errors,
                METH_O,
                PyDoc_STR("Implements the 'strict' error handling, which "
                          "raises a UnicodeError on coding errors.")
            }
        },
        {
            "ignore",
            {
                "ignore_errors",
                ignore_errors,
                METH_O,
                PyDoc_STR("Implements the 'ignore' error handling, which "
                          "ignores malformed data and continues.")
            }
        },
        {
            "replace",
            {
                "replace_errors",
                replace_errors,
                METH_O,
                PyDoc_STR("Implements the 'replace' error handling, which "
                          "replaces malformed data with a replacement marker.")
            }
        },
        {
            "xmlcharrefreplace",
            {
                "xmlcharrefreplace_errors",
                xmlcharrefreplace_errors,
                METH_O,
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
                          "which replaces an unencodable character with the "
                          "appropriate XML character reference.")
            }
        },
        {
            "backslashreplace",
            {
                "backslashreplace_errors",
                backslashreplace_errors,
                METH_O,
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
                          "which replaces an unencodable character with a "
                          "backslashed escape sequence.")
            }
        },
        {
            "surrogatepass",
            {
                "surrogatepass",
                surrogatepass_errors,
                METH_O
            }
        },
        {
            "surrogateescape",
            {
                "surrogateescape",
                surrogateescape_errors,
                METH_O
            }
        }
1013
    };
1014

1015
    PyInterpreterState *interp = PyThreadState_GET()->interp;
1016
    PyObject *mod;
1017
    unsigned i;
1018 1019

    if (interp->codec_search_path != NULL)
1020
        return 0;
1021 1022 1023 1024 1025 1026

    interp->codec_search_path = PyList_New(0);
    interp->codec_search_cache = PyDict_New();
    interp->codec_error_registry = PyDict_New();

    if (interp->codec_error_registry) {
1027
        for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1028 1029 1030 1031 1032 1033 1034 1035 1036
            PyObject *func = PyCFunction_New(&methods[i].def, NULL);
            int res;
            if (!func)
                Py_FatalError("can't initialize codec error registry");
            res = PyCodec_RegisterError(methods[i].name, func);
            Py_DECREF(func);
            if (res)
                Py_FatalError("can't initialize codec error registry");
        }
1037
    }
1038 1039

    if (interp->codec_search_path == NULL ||
1040 1041 1042
        interp->codec_search_cache == NULL ||
        interp->codec_error_registry == NULL)
        Py_FatalError("can't initialize codec registry");
1043

1044
    mod = PyImport_ImportModuleNoBlock("encodings");
1045
    if (mod == NULL) {
1046
        return -1;
1047 1048
    }
    Py_DECREF(mod);
1049
    interp->codecs_initialized = 1;
1050
    return 0;
1051
}