codecs.c 30.6 KB
Newer Older
1 2 3 4 5 6
/* ------------------------------------------------------------------------

   Python Codec Registry and support functions

Written by Marc-Andre Lemburg (mal@lemburg.com).

Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include <ctype.h>

/* --- Codec Registry ----------------------------------------------------- */

/* Import the standard encodings package which will register the first
   codec search function.

   This is done in a lazy way so that the Unicode implementation does
   not degrade the startup time of scripts not needing it.

   ImportErrors are silently ignored by this function. Only one try is
   made.

*/

27
static int _PyCodecRegistry_Init(void); /* Forward */
28 29 30

/* Append search_function to the interpreter's list of codec search
   functions, initializing the codec registry first if that has not
   happened yet.

   search_function must be a non-NULL callable.  Returns the result of
   PyList_Append() on success and -1 on any failure. */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = PyThreadState_GET()->interp;

    /* Lazily set up the registry on first use. */
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
        return -1;

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    return PyList_Append(interp->codec_search_path, search_function);
}

48 49 50
/* Convert a string to a normalized Python string: all characters are
   converted to lower case, spaces are replaced with underscores. */

51
static
52
PyObject *normalizestring(const char *string)
53
{
54
    register size_t i;
55
    size_t len = strlen(string);
56 57
    char *p;
    PyObject *v;
58

59
    if (len > PY_SSIZE_T_MAX) {
60 61
        PyErr_SetString(PyExc_OverflowError, "string is too large");
        return NULL;
62
    }
63 64 65 66

    p = PyMem_Malloc(len + 1);
    if (p == NULL)
        return NULL;
67 68 69 70 71
    for (i = 0; i < len; i++) {
        register char ch = string[i];
        if (ch == ' ')
            ch = '-';
        else
72
            ch = tolower(Py_CHARMASK(ch));
73
        p[i] = ch;
74
    }
75 76 77 78 79
    p[i] = '\0';
    v = PyUnicode_FromString(p);
    if (v == NULL)
        return NULL;
    PyMem_Free(p);
80 81 82 83 84 85 86 87 88 89
    return v;
}

/* Lookup the given encoding and return a tuple providing the codec
   facilities.

   The encoding string is looked up converted to all lower-case
   characters. This makes encodings looked up through this mechanism
   effectively case-insensitive.

90
   If no codec is found, a LookupError is set and NULL returned.
91 92 93 94 95 96

   As side effect, this tries to load the encodings package, if not
   yet done. This is part of the lazy load strategy for the encodings
   package.

*/
97 98 99

PyObject *_PyCodec_Lookup(const char *encoding)
{
100
    PyInterpreterState *interp;
Guido van Rossum's avatar
Guido van Rossum committed
101
    PyObject *result, *args = NULL, *v;
102
    Py_ssize_t i, len;
103

104
    if (encoding == NULL) {
105 106
        PyErr_BadArgument();
        goto onError;
107
    }
108

109
    interp = PyThreadState_GET()->interp;
110
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
111
        goto onError;
112

113
    /* Convert the encoding to a normalized Python string: all
114
       characters are converted to lower case, spaces and hyphens are
115 116
       replaced with underscores. */
    v = normalizestring(encoding);
117
    if (v == NULL)
118
        goto onError;
119
    PyUnicode_InternInPlace(&v);
120 121

    /* First, try to lookup the name in the registry dictionary */
122
    result = PyDict_GetItem(interp->codec_search_cache, v);
123
    if (result != NULL) {
124 125 126
        Py_INCREF(result);
        Py_DECREF(v);
        return result;
127
    }
128

129 130 131
    /* Next, scan the search functions in order of registration */
    args = PyTuple_New(1);
    if (args == NULL)
132
        goto onError;
133
    PyTuple_SET_ITEM(args,0,v);
Guido van Rossum's avatar
Guido van Rossum committed
134

135
    len = PyList_Size(interp->codec_search_path);
Guido van Rossum's avatar
Guido van Rossum committed
136
    if (len < 0)
137
        goto onError;
138
    if (len == 0) {
139 140 141 142
        PyErr_SetString(PyExc_LookupError,
                        "no codec search functions registered: "
                        "can't find encoding");
        goto onError;
143
    }
144 145

    for (i = 0; i < len; i++) {
146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
        PyObject *func;

        func = PyList_GetItem(interp->codec_search_path, i);
        if (func == NULL)
            goto onError;
        result = PyEval_CallObject(func, args);
        if (result == NULL)
            goto onError;
        if (result == Py_None) {
            Py_DECREF(result);
            continue;
        }
        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
            PyErr_SetString(PyExc_TypeError,
                            "codec search functions must return 4-tuples");
            Py_DECREF(result);
            goto onError;
        }
        break;
165 166
    }
    if (i == len) {
167 168
        /* XXX Perhaps we should cache misses too ? */
        PyErr_Format(PyExc_LookupError,
169
                     "unknown encoding: %s", encoding);
170
        goto onError;
171 172 173
    }

    /* Cache and return the result */
Neal Norwitz's avatar
Neal Norwitz committed
174
    if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
175 176
        Py_DECREF(result);
        goto onError;
Neal Norwitz's avatar
Neal Norwitz committed
177
    }
178 179 180 181 182 183 184 185
    Py_DECREF(args);
    return result;

 onError:
    Py_XDECREF(args);
    return NULL;
}

186 187 188 189 190
/* Codec registry encoding check API. */

int PyCodec_KnownEncoding(const char *encoding)
{
    PyObject *codecs;
191

192 193
    codecs = _PyCodec_Lookup(encoding);
    if (!codecs) {
194 195
        PyErr_Clear();
        return 0;
196 197
    }
    else {
198 199
        Py_DECREF(codecs);
        return 1;
200 201 202
    }
}

203 204
/* Build the argument tuple for a codec call: (object,) when errors is
   NULL, otherwise (object, errors) with errors converted to a str.

   Returns a new reference, or NULL if an allocation fails. */
static
PyObject *args_tuple(PyObject *object,
                     const char *errors)
{
    PyObject *tuple;
    PyObject *errors_str;

    tuple = PyTuple_New(1 + (errors != NULL));
    if (tuple == NULL)
        return NULL;

    /* PyTuple_SET_ITEM steals a reference, so take one first. */
    Py_INCREF(object);
    PyTuple_SET_ITEM(tuple, 0, object);
    if (errors == NULL)
        return tuple;

    errors_str = PyUnicode_FromString(errors);
    if (errors_str == NULL) {
        Py_DECREF(tuple);
        return NULL;
    }
    PyTuple_SET_ITEM(tuple, 1, errors_str);
    return tuple;
}

227
/* Helper function to get a codec item */
228 229

static
230
PyObject *codec_getitem(const char *encoding, int index)
231 232 233 234 235 236
{
    PyObject *codecs;
    PyObject *v;

    codecs = _PyCodec_Lookup(encoding);
    if (codecs == NULL)
237
        return NULL;
238
    v = PyTuple_GET_ITEM(codecs, index);
239
    Py_DECREF(codecs);
240 241 242 243
    Py_INCREF(v);
    return v;
}

244 245 246 247
/* Helper function to create an incremental codec. */

static
PyObject *codec_getincrementalcodec(const char *encoding,
248 249
                                    const char *errors,
                                    const char *attrname)
250
{
251
    PyObject *codecs, *ret, *inccodec;
252 253 254

    codecs = _PyCodec_Lookup(encoding);
    if (codecs == NULL)
255
        return NULL;
256
    inccodec = PyObject_GetAttrString(codecs, attrname);
257
    Py_DECREF(codecs);
258
    if (inccodec == NULL)
259
        return NULL;
260
    if (errors)
261
        ret = PyObject_CallFunction(inccodec, "s", errors);
262
    else
263
        ret = PyObject_CallFunction(inccodec, NULL);
264 265
    Py_DECREF(inccodec);
    return ret;
266 267
}

268 269 270 271
/* Helper function to create a stream codec.

   Calls the entry at position index of the codec tuple found for
   encoding with (stream) or, when errors is non-NULL, (stream, errors).
   Returns the call's result (new reference) or NULL on failure. */

static
PyObject *codec_getstreamcodec(const char *encoding,
                               PyObject *stream,
                               const char *errors,
                               const int index)
{
    PyObject *info, *factory, *wrapped;

    info = _PyCodec_Lookup(encoding);
    if (info == NULL)
        return NULL;

    /* Borrowed reference; info is held until after the call. */
    factory = PyTuple_GET_ITEM(info, index);
    wrapped = (errors == NULL)
        ? PyObject_CallFunction(factory, "O", stream)
        : PyObject_CallFunction(factory, "Os", stream, errors);
    Py_DECREF(info);
    return wrapped;
}
290

291 292
/* Convenience APIs to query the Codec registry.

   All APIs return a codec object with incremented refcount.

 */

/* Return item 0 (the encoder) of the codec entry looked up for
   encoding as a new reference, or NULL if the lookup fails. */
PyObject *
PyCodec_Encoder(const char *encoding)
{
    return codec_getitem(encoding, 0);
}

302
/* Return item 1 (the decoder) of the codec entry looked up for
   encoding as a new reference, or NULL if the lookup fails. */
PyObject *
PyCodec_Decoder(const char *encoding)
{
    return codec_getitem(encoding, 1);
}
306

307
/* Create an incremental encoder by calling the "incrementalencoder"
   attribute of the codec entry for encoding; errors is passed to it
   when non-NULL.  Returns NULL on failure. */
PyObject *
PyCodec_IncrementalEncoder(const char *encoding,
                           const char *errors)
{
    return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
}
312

313
/* Create an incremental decoder by calling the "incrementaldecoder"
   attribute of the codec entry for encoding; errors is passed to it
   when non-NULL.  Returns NULL on failure. */
PyObject *
PyCodec_IncrementalDecoder(const char *encoding,
                           const char *errors)
{
    return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
}

319
/* Create a stream reader for stream by calling item 2 of the codec
   entry for encoding; errors is passed along when non-NULL.
   Returns NULL on failure. */
PyObject *
PyCodec_StreamReader(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    return codec_getstreamcodec(encoding, stream, errors, 2);
}

/* Create a stream writer for stream by calling item 3 of the codec
   entry for encoding; errors is passed along when non-NULL.
   Returns NULL on failure. */
PyObject *
PyCodec_StreamWriter(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    return codec_getstreamcodec(encoding, stream, errors, 3);
}

/* Encode an object (e.g. a Unicode object) using the given encoding
   and return the resulting encoded object (usually a Python string).

   errors is passed to the encoder as an additional argument if
   non-NULL.  The encoder must return a 2-tuple; its first item is
   returned as a new reference.  NULL is returned on failure. */

PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *encoder;
    PyObject *call_args = NULL, *outcome = NULL;
    PyObject *encoded;

    encoder = PyCodec_Encoder(encoding);
    if (encoder == NULL)
        return NULL;

    call_args = args_tuple(object, errors);
    if (call_args == NULL)
        goto fail;

    outcome = PyEval_CallObject(encoder, call_args);
    if (outcome == NULL)
        goto fail;

    if (!(PyTuple_Check(outcome) && PyTuple_GET_SIZE(outcome) == 2)) {
        PyErr_SetString(PyExc_TypeError,
                        "encoder must return a tuple (object, integer)");
        goto fail;
    }

    /* The second (integer) entry is neither checked nor used. */
    encoded = PyTuple_GET_ITEM(outcome, 0);
    Py_INCREF(encoded);

    Py_DECREF(call_args);
    Py_DECREF(encoder);
    Py_DECREF(outcome);
    return encoded;

 fail:
    Py_XDECREF(outcome);
    Py_XDECREF(call_args);
    Py_DECREF(encoder);
    return NULL;
}

/* Decode an object (usually a Python string) using the given encoding
   and return an equivalent object (e.g. a Unicode object).

   errors is passed to the decoder as an additional argument if
   non-NULL.  The decoder must return a 2-tuple; its first item is
   returned as a new reference.  NULL is returned on failure. */

PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *decoder;
    PyObject *call_args = NULL, *outcome = NULL;
    PyObject *decoded = NULL;

    decoder = PyCodec_Decoder(encoding);
    if (decoder == NULL)
        return NULL;

    call_args = args_tuple(object, errors);
    if (call_args == NULL)
        goto finish;

    outcome = PyEval_CallObject(decoder,call_args);
    if (outcome == NULL)
        goto finish;

    if (PyTuple_Check(outcome) && PyTuple_GET_SIZE(outcome) == 2) {
        /* The second (integer) entry is neither checked nor used. */
        decoded = PyTuple_GET_ITEM(outcome,0);
        Py_INCREF(decoded);
    }
    else {
        PyErr_SetString(PyExc_TypeError,
                        "decoder must return a tuple (object,integer)");
    }

 finish:
    /* decoded is still NULL here on every failure path. */
    Py_XDECREF(call_args);
    Py_DECREF(decoder);
    Py_XDECREF(outcome);
    return decoded;
}

426 427 428 429 430 431 432 433
/* Register the error handling callback function `error` under the
   name `name`.  A codec calls this function when it encounters
   unencodable characters or undecodable bytes and `name` was given as
   the error parameter in the call to the encode/decode function.
   Return 0 on success, -1 on error. */
int PyCodec_RegisterError(const char *name, PyObject *error)
{
    PyInterpreterState *interp = PyThreadState_GET()->interp;

    /* Lazily set up the registry on first use. */
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
        return -1;

    if (PyCallable_Check(error))
        return PyDict_SetItemString(interp->codec_error_registry,
                                    (char *)name, error);

    PyErr_SetString(PyExc_TypeError, "handler must be callable");
    return -1;
}

/* Lookup the error handling callback function registered under the
   name error. As a special case NULL can be passed, in which case
   the error handling callback for strict encoding will be returned. */
PyObject *PyCodec_LookupError(const char *name)
{
    PyObject *handler = NULL;

452
    PyInterpreterState *interp = PyThreadState_GET()->interp;
453
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
454
        return NULL;
455

456
    if (name==NULL)
457
        name = "strict";
458
    handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
459
    if (!handler)
460
        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
461
    else
462
        Py_INCREF(handler);
463 464 465 466 467 468 469
    return handler;
}

/* Set a TypeError naming the class of exc, for error callbacks that
   were handed an exception type they cannot process.  If either
   attribute lookup fails, the exception set by that lookup is left
   in place instead. */
static void wrong_exception_type(PyObject *exc)
{
    PyObject *cls, *cls_name;

    cls = PyObject_GetAttrString(exc, "__class__");
    if (cls == NULL)
        return;

    cls_name = PyObject_GetAttrString(cls, "__name__");
    Py_DECREF(cls);
    if (cls_name == NULL)
        return;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %S in error callback", cls_name);
    Py_DECREF(cls_name);
}

/* 'strict' error handler: raise exc itself.  If exc is not an
   exception instance a TypeError is set instead.  Always returns
   NULL. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}


/* 'ignore' error handler: skip the offending input.

   Accepts UnicodeEncodeError, UnicodeDecodeError and
   UnicodeTranslateError instances and returns a (replacement, end)
   tuple whose replacement has length 0.  Any other argument is
   reported through wrong_exception_type() and NULL is returned. */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError))
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError))
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError))
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    else {
        wrong_exception_type(exc);
        return NULL;
    }
    if (failed)
        return NULL;

    /* ouch: passing NULL, 0, pos gives None instead of u'', so any
       non-NULL pointer is handed over together with length 0 */
    return Py_BuildValue("(u#n)", &end, 0, end);
}


/* 'replace' error handler.

   - UnicodeEncodeError: one '?' per character in [start, end).
   - UnicodeDecodeError: a single Py_UNICODE_REPLACEMENT_CHARACTER.
   - UnicodeTranslateError: one Py_UNICODE_REPLACEMENT_CHARACTER per
     character in [start, end).

   Returns a (replacement, end) tuple, or NULL on failure.  Any other
   argument is reported through wrong_exception_type(). */
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
{
    Py_ssize_t start, end, i;
    PyObject *restuple;

    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        PyObject *res;
        Py_UNICODE *buf;
        Py_ssize_t count;

        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        count = end - start;
        res = PyUnicode_FromUnicode(NULL, count);
        if (res == NULL)
            return NULL;
        buf = PyUnicode_AS_UNICODE(res);
        for (i = 0; i < count; i++)
            buf[i] = '?';
        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
        return restuple;
    }

    if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
        Py_UNICODE marker = Py_UNICODE_REPLACEMENT_CHARACTER;

        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        return Py_BuildValue("(u#n)", &marker, 1, end);
    }

    if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
        PyObject *res;
        Py_UNICODE *buf;
        Py_ssize_t count;

        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        count = end - start;
        res = PyUnicode_FromUnicode(NULL, count);
        if (res == NULL)
            return NULL;
        buf = PyUnicode_AS_UNICODE(res);
        for (i = 0; i < count; i++)
            buf[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
        return restuple;
    }

    wrong_exception_type(exc);
    return NULL;
}

/* 'xmlcharrefreplace' error handler: replace every character in the
   [start, end) range of a UnicodeEncodeError with the XML numeric
   character reference "&#<decimal code>;".

   Returns a (replacement, end) tuple, or NULL on failure.  Any
   argument that is not a UnicodeEncodeError is reported through
   wrong_exception_type(). */
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
{
    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        PyObject *restuple;
        PyObject *object;
        Py_ssize_t start;
        Py_ssize_t end;
        PyObject *res;
        Py_UNICODE *p;
        Py_UNICODE *startp;
        Py_UNICODE *outp;
        /* Py_ssize_t, not int: up to 10 output units are added per
           input character, so an int could overflow for long ranges
           on platforms where Py_ssize_t is wider than int. */
        Py_ssize_t ressize;
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        startp = PyUnicode_AS_UNICODE(object);
        /* First pass: compute the size of the replacement, i.e.
           "&#" + decimal digits + ";" for every character. */
        for (p = startp+start, ressize = 0; p < startp+end; ++p) {
            if (*p<10)
                ressize += 2+1+1;
            else if (*p<100)
                ressize += 2+2+1;
            else if (*p<1000)
                ressize += 2+3+1;
            else if (*p<10000)
                ressize += 2+4+1;
#ifndef Py_UNICODE_WIDE
            else
                ressize += 2+5+1;
#else
            else if (*p<100000)
                ressize += 2+5+1;
            else if (*p<1000000)
                ressize += 2+6+1;
            else
                ressize += 2+7+1;
#endif
        }
        /* allocate replacement */
        res = PyUnicode_FromUnicode(NULL, ressize);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        /* generate replacement */
        for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
            p < startp+end; ++p) {
            Py_UNICODE c = *p;
            int digits;
            int base;
            *outp++ = '&';
            *outp++ = '#';
            /* base is the weight of the most significant digit. */
            if (*p<10) {
                digits = 1;
                base = 1;
            }
            else if (*p<100) {
                digits = 2;
                base = 10;
            }
            else if (*p<1000) {
                digits = 3;
                base = 100;
            }
            else if (*p<10000) {
                digits = 4;
                base = 1000;
            }
#ifndef Py_UNICODE_WIDE
            else {
                digits = 5;
                base = 10000;
            }
#else
            else if (*p<100000) {
                digits = 5;
                base = 10000;
            }
            else if (*p<1000000) {
                digits = 6;
                base = 100000;
            }
            else {
                digits = 7;
                base = 1000000;
            }
#endif
            /* Emit the digits from most to least significant. */
            while (digits-->0) {
                *outp++ = '0' + c/base;
                c %= base;
                base /= 10;
            }
            *outp++ = ';';
        }
        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
        Py_DECREF(object);
        return restuple;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}

/* Lower-case hexadecimal digits, indexed by nibble value (0-15). */
static Py_UNICODE hexdigits[] = {
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
};

/* 'backslashreplace' error handler: replace every character in the
   [start, end) range of a UnicodeEncodeError with a backslash escape:
   \xNN for values below 0x100, \uNNNN for values below 0x10000 and
   \UNNNNNNNN otherwise.  On builds without Py_UNICODE_WIDE a surrogate
   pair is combined into one \UNNNNNNNN escape.

   Returns a (replacement, end) tuple, or NULL on failure.  Any
   argument that is not a UnicodeEncodeError is reported through
   wrong_exception_type(). */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
#ifndef Py_UNICODE_WIDE
#define IS_SURROGATE_PAIR(p, end) \
    (*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
     *(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
#else
#define IS_SURROGATE_PAIR(p, end) 0
#endif
    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        PyObject *restuple;
        PyObject *object;
        Py_ssize_t start;
        Py_ssize_t end;
        PyObject *res;
        Py_UNICODE *p;
        Py_UNICODE *startp;
        Py_UNICODE *outp;
        /* Py_ssize_t, not int: up to 10 output units are added per
           input character, so an int could overflow for long ranges
           on platforms where Py_ssize_t is wider than int. */
        Py_ssize_t ressize;
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        startp = PyUnicode_AS_UNICODE(object);
        /* First pass: compute the size of the replacement. */
        for (p = startp+start, ressize = 0; p < startp+end; ++p) {
#ifdef Py_UNICODE_WIDE
            if (*p >= 0x00010000)
                ressize += 1+1+8;
            else
#endif
            if (*p >= 0x100) {
                if (IS_SURROGATE_PAIR(p, startp+end)) {
                    ressize += 1+1+8;
                    ++p;
                }
                else
                    ressize += 1+1+4;
            }
            else
                ressize += 1+1+2;
        }
        res = PyUnicode_FromUnicode(NULL, ressize);
        if (res==NULL) {
            /* object is an owned reference and must not leak here. */
            Py_DECREF(object);
            return NULL;
        }
        /* Second pass: write the escapes. */
        for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
            p < startp+end; ++p) {
            Py_UCS4 c = (Py_UCS4) *p;
            *outp++ = '\\';
            if (IS_SURROGATE_PAIR(p, startp+end)) {
                c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
                ++p;
            }
            if (c >= 0x00010000) {
                *outp++ = 'U';
                *outp++ = hexdigits[(c>>28)&0xf];
                *outp++ = hexdigits[(c>>24)&0xf];
                *outp++ = hexdigits[(c>>20)&0xf];
                *outp++ = hexdigits[(c>>16)&0xf];
                *outp++ = hexdigits[(c>>12)&0xf];
                *outp++ = hexdigits[(c>>8)&0xf];
            }
            else if (c >= 0x100) {
                *outp++ = 'u';
                *outp++ = hexdigits[(c>>12)&0xf];
                *outp++ = hexdigits[(c>>8)&0xf];
            }
            else
                *outp++ = 'x';
            /* The two low nibbles are common to all three forms. */
            *outp++ = hexdigits[(c>>4)&0xf];
            *outp++ = hexdigits[c&0xf];
        }

        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
        Py_DECREF(object);
        return restuple;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
#undef IS_SURROGATE_PAIR
}

765 766 767
/* This handler is declared static until someone demonstrates
   a need to call it directly.

   Encoding (UnicodeEncodeError): every character in [start, end) must
   be a surrogate (0xD800-0xDFFF) and is written as its three-byte
   UTF-8-style sequence; any other character re-raises exc.

   Decoding (UnicodeDecodeError): a single three-byte sequence at start
   is decoded if it denotes a surrogate; otherwise exc is re-raised.

   Returns a (replacement, new position) tuple, or NULL on failure. */
static PyObject *
PyCodec_SurrogatePassErrors(PyObject *exc)
{
    PyObject *restuple;
    PyObject *object;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        Py_UNICODE *p;
        Py_UNICODE *startp;
        char *outp;
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        startp = PyUnicode_AS_UNICODE(object);
        /* Each surrogate takes exactly three output bytes. */
        res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
        if (!res) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyBytes_AsString(res);
        for (p = startp+start; p < startp+end; p++) {
            Py_UNICODE ch = *p;
            if (ch < 0xd800 || ch > 0xdfff) {
                /* Not a surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                Py_DECREF(res);
                Py_DECREF(object);
                return NULL;
            }
            *outp++ = (char)(0xe0 | (ch >> 12));
            *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *outp++ = (char)(0x80 | (ch & 0x3f));
        }
        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
        Py_DECREF(object);
        return restuple;
    }
    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
        unsigned char *p;
        Py_UNICODE ch = 0;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
            Py_DECREF(object);
            return NULL;
        }
        /* Try decoding a single surrogate character. If
           there are more, let the codec call us again. */
        p += start;
        /* A three-byte code needs a 1110xxxx lead byte AND two
           10xxxxxx continuation bytes, and all three bytes must lie
           inside the input. */
        if (PyBytes_GET_SIZE(object) - start >= 3 &&
            (p[0] & 0xf0) == 0xe0 &&
            (p[1] & 0xc0) == 0x80 &&
            (p[2] & 0xc0) == 0x80) {
            /* it's a three-byte code */
            ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
            if (ch < 0xd800 || ch > 0xdfff)
                /* it's not a surrogate - fail */
                ch = 0;
        }
        Py_DECREF(object);
        if (ch == 0) {
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        return Py_BuildValue("(u#n)", &ch, 1, start+3);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}

846
/* Error handler mapping between raw bytes and the code points
   0xDC80-0xDCFF.

   Encoding (UnicodeEncodeError): every character in [start, end) must
   lie in 0xDC80-0xDCFF and is written as the byte (ch - 0xDC00); any
   other character re-raises exc.

   Decoding (UnicodeDecodeError): up to four bytes >= 128 starting at
   start are mapped to 0xDC00 + byte; if the first byte is ASCII, exc
   is re-raised.

   Returns a (replacement, new position) tuple, or NULL on failure. */
static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
    PyObject *restuple;
    PyObject *object;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;

    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        Py_UNICODE *text;
        char *out;
        Py_ssize_t pos;

        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        object = PyUnicodeEncodeError_GetObject(exc);
        if (object == NULL)
            return NULL;
        text = PyUnicode_AS_UNICODE(object);
        res = PyBytes_FromStringAndSize(NULL, end-start);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        out = PyBytes_AsString(res);
        for (pos = start; pos < end; pos++) {
            Py_UNICODE ch = text[pos];
            if (ch < 0xdc80 || ch > 0xdcff) {
                /* Not a UTF-8b surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                Py_DECREF(res);
                Py_DECREF(object);
                return NULL;
            }
            out[pos - start] = ch - 0xdc00;
        }
        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
        Py_DECREF(object);
        return restuple;
    }

    if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
        unsigned char *p;
        Py_UNICODE ch[4]; /* decode up to 4 bad bytes. */
        int consumed;

        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        object = PyUnicodeDecodeError_GetObject(exc);
        if (object == NULL)
            return NULL;
        p = (unsigned char*)PyBytes_AsString(object);
        if (p == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        for (consumed = 0; consumed < 4 && consumed < end-start; consumed++) {
            unsigned char byte = p[start+consumed];
            /* Refuse to escape ASCII bytes. */
            if (byte < 128)
                break;
            ch[consumed] = 0xdc00 + byte;
        }
        Py_DECREF(object);
        if (consumed == 0) {
            /* codec complained about ASCII byte. */
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        return Py_BuildValue("(u#n)", ch, consumed, start+consumed);
    }

    wrong_exception_type(exc);
    return NULL;
}

922

923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951
/* Method-table adapter (METH_O) for the "strict" handler: forwards
   exc to PyCodec_StrictErrors(); self is unused. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_StrictErrors(exc);
}


/* Method-table adapter (METH_O) for the "ignore" handler: forwards
   exc to PyCodec_IgnoreErrors(); self is unused. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_IgnoreErrors(exc);
}


/* Method-table adapter (METH_O) for the "replace" handler: forwards
   exc to PyCodec_ReplaceErrors(); self is unused. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_ReplaceErrors(exc);
}


/* Method-table adapter (METH_O) for the "xmlcharrefreplace" handler:
   forwards exc to PyCodec_XMLCharRefReplaceErrors(); self is unused. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_XMLCharRefReplaceErrors(exc);
}


/* Method-table adapter (METH_O) for the "backslashreplace" handler:
   forwards exc to PyCodec_BackslashReplaceErrors(); self is unused. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_BackslashReplaceErrors(exc);
}

952
/* Adapter that forwards exc to PyCodec_SurrogatePassErrors(); self is
   unused.  NOTE(review): presumably registered in the handler table
   like its siblings -- that entry is not visible from here. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_SurrogatePassErrors(exc);
}

957
/* Adapter that forwards exc to PyCodec_SurrogateEscapeErrors(); self
   is unused.  NOTE(review): presumably registered in the handler table
   like its siblings -- that entry is not visible from here. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_SurrogateEscapeErrors(exc);
}

962
static int _PyCodecRegistry_Init(void)
963
{
964
    static struct {
965 966
        char *name;
        PyMethodDef def;
967 968
    } methods[] =
    {
969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036
        {
            "strict",
            {
                "strict_errors",
                strict_errors,
                METH_O,
                PyDoc_STR("Implements the 'strict' error handling, which "
                          "raises a UnicodeError on coding errors.")
            }
        },
        {
            "ignore",
            {
                "ignore_errors",
                ignore_errors,
                METH_O,
                PyDoc_STR("Implements the 'ignore' error handling, which "
                          "ignores malformed data and continues.")
            }
        },
        {
            "replace",
            {
                "replace_errors",
                replace_errors,
                METH_O,
                PyDoc_STR("Implements the 'replace' error handling, which "
                          "replaces malformed data with a replacement marker.")
            }
        },
        {
            "xmlcharrefreplace",
            {
                "xmlcharrefreplace_errors",
                xmlcharrefreplace_errors,
                METH_O,
                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
                          "which replaces an unencodable character with the "
                          "appropriate XML character reference.")
            }
        },
        {
            "backslashreplace",
            {
                "backslashreplace_errors",
                backslashreplace_errors,
                METH_O,
                PyDoc_STR("Implements the 'backslashreplace' error handling, "
                          "which replaces an unencodable character with a "
                          "backslashed escape sequence.")
            }
        },
        {
            "surrogatepass",
            {
                "surrogatepass",
                surrogatepass_errors,
                METH_O
            }
        },
        {
            "surrogateescape",
            {
                "surrogateescape",
                surrogateescape_errors,
                METH_O
            }
        }
1037
    };
1038

1039
    PyInterpreterState *interp = PyThreadState_GET()->interp;
1040
    PyObject *mod;
1041
    unsigned i;
1042 1043

    if (interp->codec_search_path != NULL)
1044
        return 0;
1045 1046 1047 1048 1049 1050

    interp->codec_search_path = PyList_New(0);
    interp->codec_search_cache = PyDict_New();
    interp->codec_error_registry = PyDict_New();

    if (interp->codec_error_registry) {
1051 1052 1053 1054 1055 1056 1057 1058 1059 1060
        for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
            PyObject *func = PyCFunction_New(&methods[i].def, NULL);
            int res;
            if (!func)
                Py_FatalError("can't initialize codec error registry");
            res = PyCodec_RegisterError(methods[i].name, func);
            Py_DECREF(func);
            if (res)
                Py_FatalError("can't initialize codec error registry");
        }
1061
    }
1062 1063

    if (interp->codec_search_path == NULL ||
1064 1065 1066
        interp->codec_search_cache == NULL ||
        interp->codec_error_registry == NULL)
        Py_FatalError("can't initialize codec registry");
1067

1068
    mod = PyImport_ImportModuleNoBlock("encodings");
1069
    if (mod == NULL) {
1070 1071 1072 1073 1074 1075 1076 1077 1078 1079
        if (PyErr_ExceptionMatches(PyExc_ImportError)) {
            /* Ignore ImportErrors... this is done so that
               distributions can disable the encodings package. Note
               that other errors are not masked, e.g. SystemErrors
               raised to inform the user of an error in the Python
               configuration are still reported back to the user. */
            PyErr_Clear();
            return 0;
        }
        return -1;
1080 1081
    }
    Py_DECREF(mod);
1082
    interp->codecs_initialized = 1;
1083
    return 0;
1084
}