unicodeobject.h 35.3 KB
Newer Older
1 2 3
#ifndef Py_UNICODEOBJECT_H
#define Py_UNICODEOBJECT_H

4 5
#include <stdarg.h>

6 7 8 9
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10 11
Unicode Integration Proposal. (See
http://www.egenix.com/files/python/unicode-proposal.txt).
12

13
Copyright (c) Corporation for National Research Initiatives.
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31


 Original header:
 --------------------------------------------------------------------

 * Yet another Unicode string type for Python.  This type supports the
 * 16-bit Basic Multilingual Plane (BMP) only.
 *
 * Written by Fredrik Lundh, January 1999.
 *
 * Copyright (c) 1999 by Secret Labs AB.
 * Copyright (c) 1999 by Fredrik Lundh.
 *
 * fredrik@pythonware.com
 * http://www.pythonware.com
 *
 * --------------------------------------------------------------------
 * This Unicode String Type is
32
 *
33 34
 * Copyright (c) 1999 by Secret Labs AB
 * Copyright (c) 1999 by Fredrik Lundh
35
 *
36 37 38
 * By obtaining, using, and/or copying this software and/or its
 * associated documentation, you agree that you have read, understood,
 * and will comply with the following terms and conditions:
39
 *
40 41 42 43 44 45 46 47
 * Permission to use, copy, modify, and distribute this software and its
 * associated documentation for any purpose and without fee is hereby
 * granted, provided that the above copyright notice appears in all
 * copies, and that both that copyright notice and this permission notice
 * appear in supporting documentation, and that the name of Secret Labs
 * AB or the author not be used in advertising or publicity pertaining to
 * distribution of the software without specific, written prior
 * permission.
48
 *
49 50 51 52 53 54 55 56 57
 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 * -------------------------------------------------------------------- */

58
#include <ctype.h>
59 60 61 62 63

/* === Internal API ======================================================= */

/* --- Internal Unicode Format -------------------------------------------- */

64
/* Python 3.x requires unicode */
65
#define Py_USING_UNICODE
66

Martin v. Löwis's avatar
Martin v. Löwis committed
67 68
#ifndef SIZEOF_WCHAR_T
#error Must define SIZEOF_WCHAR_T
69 70
#endif

Martin v. Löwis's avatar
Martin v. Löwis committed
71 72 73 74 75
#define Py_UNICODE_SIZE SIZEOF_WCHAR_T

/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
   Otherwise, Unicode strings are stored as UCS-2 (with limited support
   for UTF-16) */
76 77 78

#if Py_UNICODE_SIZE >= 4
#define Py_UNICODE_WIDE
79
#endif
80

81
/* Set these flags if the platform has "wchar.h" and the
82 83 84 85 86
   wchar_t type is a 16-bit unsigned type */
/* #define HAVE_WCHAR_H */
/* #define HAVE_USABLE_WCHAR_T */

/* If the compiler provides a wchar_t type we try to support it
87 88
   through the interface functions PyUnicode_FromWideChar(),
   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
89 90

#ifdef HAVE_USABLE_WCHAR_T
91 92 93
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
94 95 96
#endif

#ifdef HAVE_WCHAR_H
97
#  include <wchar.h>
98 99
#endif

100
/* Py_UCS4 and Py_UCS2 are typedefs for the respective
Martin v. Löwis's avatar
Martin v. Löwis committed
101
   unicode representations. */
102 103 104
typedef uint32_t Py_UCS4;       /* unsigned 32-bit unit (UCS-4) */
typedef uint16_t Py_UCS2;       /* unsigned 16-bit unit (UCS-2) */
typedef uint8_t Py_UCS1;        /* unsigned 8-bit unit */
Martin v. Löwis's avatar
Martin v. Löwis committed
105

106 107 108 109
/* Give the declarations that follow C linkage when this header is
   compiled as C++. */
#ifdef __cplusplus
extern "C" {
#endif

110

111
PyAPI_DATA(PyTypeObject) PyUnicode_Type;
112
PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
113

114
#define PyUnicode_Check(op) \
115 116
                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
/* Exact-type test: true only when the type of op is PyUnicode_Type itself,
   so instances of subclasses do not match. */
#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
117 118 119 120 121 122 123 124

/* --- Constants ---------------------------------------------------------- */

/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

125
#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
126 127 128

/* === Public API ========================================================= */

129
/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
130
PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
131
    const char *u,             /* UTF-8 encoded string */
132
    Py_ssize_t size            /* size of buffer */
133 134
    );

135
/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Martin v. Löwis's avatar
Martin v. Löwis committed
136
   UTF-8 encoded bytes.  The size is determined with strlen(). */
137
PyAPI_FUNC(PyObject*) PyUnicode_FromString(
138
    const char *u              /* UTF-8 encoded string */
139 140
    );

141
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwis's avatar
Martin v. Löwis committed
142 143 144 145
PyAPI_FUNC(PyObject*) PyUnicode_Substring(
    PyObject *str,
    Py_ssize_t start,
    Py_ssize_t end);
146
#endif
Martin v. Löwis's avatar
Martin v. Löwis committed
147

148
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
149
/* Copy the string into a UCS4 buffer including the null character if copy_null
150
   is set. Return NULL and raise an exception on error. Raise a SystemError if
Martin v. Löwis's avatar
Martin v. Löwis committed
151 152 153 154 155 156 157 158 159 160 161 162 163
   the buffer is smaller than the string. Return buffer on success.

   buflen is the length of the buffer in (Py_UCS4) characters. */
PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
    PyObject *unicode,          /* Unicode object to copy from */
    Py_UCS4* buffer,            /* destination buffer; returned on success */
    Py_ssize_t buflen,          /* length of buffer in Py_UCS4 characters */
    int copy_null);             /* if set, also copy the null character */

/* Copy the string into a UCS4 buffer. A new buffer is allocated using
 * PyMem_Malloc; if this fails, NULL is returned with a memory error
   exception set. */
PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
164
#endif
Martin v. Löwis's avatar
Martin v. Löwis committed
165

166
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
167 168
/* Get the length of the Unicode object. */

Martin v. Löwis's avatar
Martin v. Löwis committed
169 170 171
PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
    PyObject *unicode           /* Unicode object */
);
172
#endif
Martin v. Löwis's avatar
Martin v. Löwis committed
173

174
/* Get the number of Py_UNICODE units in the
Martin v. Löwis's avatar
Martin v. Löwis committed
175 176
   string representation. */

Martin v. Löwis's avatar
Martin v. Löwis committed
177
PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
178
    PyObject *unicode           /* Unicode object */
179
    ) Py_DEPRECATED(3.3);
180

181
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwis's avatar
Martin v. Löwis committed
182 183 184 185 186 187 188 189
/* Read a character from the string. */

PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
    PyObject *unicode,          /* Unicode object to read from */
    Py_ssize_t index            /* index of the character to read */
    );

/* Write a character to the string. The string must have been created through
190 191 192
   PyUnicode_New, must not be shared, and must not have been hashed yet.

   Return 0 on success, -1 on error. */
Martin v. Löwis's avatar
Martin v. Löwis committed
193 194 195 196 197 198

PyAPI_FUNC(int) PyUnicode_WriteChar(
    PyObject *unicode,          /* Unicode object to modify */
    Py_ssize_t index,           /* index of the character to write */
    Py_UCS4 character           /* character to store */
    );
199
#endif
Martin v. Löwis's avatar
Martin v. Löwis committed
200

201
/* Resize a Unicode object. The length is the number of characters, except
202 203
   if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
   is the number of Py_UNICODE characters.
204 205 206 207

   *unicode is modified to point to the new (resized) object and 0
   returned on success.

208 209
   Try to resize the string in place (which is usually faster than allocating
   a new string and copy characters), or create a new string.
210 211

   Error handling is implemented as follows: an exception is set, -1
212 213 214 215
   is returned and *unicode left untouched.

   WARNING: The function doesn't check string content, the result may not be a
            string in canonical representation. */
216

217
PyAPI_FUNC(int) PyUnicode_Resize(
218 219
    PyObject **unicode,         /* Pointer to the Unicode object */
    Py_ssize_t length           /* New length */
220 221
    );

222
/* Decode obj to a Unicode object.
223

224 225 226
   bytes, bytearray and other bytes-like objects are decoded according to the
   given encoding and error handler. The encoding and error handler can be
   NULL to have the interface use UTF-8 and "strict".
227

228
   All other objects (including Unicode objects) raise an exception.
229 230 231 232 233 234

   The API returns NULL in case of an error. The caller is responsible
   for decref'ing the returned objects.

*/

235
PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
236
    PyObject *obj,              /* Object */
237 238 239 240
    const char *encoding,       /* encoding */
    const char *errors          /* error handling */
    );

241 242 243
/* Copy an instance of a Unicode subtype to a new true Unicode object if
   necessary. If obj is already a true Unicode object (not a subtype), return
   the reference with *incremented* refcount.
244 245 246 247 248 249

   The API returns NULL in case of an error. The caller is responsible
   for decref'ing the returned objects.

*/

250
PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
251
    PyObject *obj      /* Object */
252 253
    );

254 255 256 257 258 259 260 261
PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
    const char *format,   /* ASCII-encoded string  */
    va_list vargs
    );
PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
    const char *format,   /* ASCII-encoded string  */
    ...
    );
262

263 264
PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
265 266 267
PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
    const char *u              /* UTF-8 encoded string */
    );
268 269

/* Use only if you know it's a string */
Martin v. Löwis's avatar
Martin v. Löwis committed
270 271
/* Yields the `interned` field of the object's state. op is cast to
   PyASCIIObject * without any type check. */
#define PyUnicode_CHECK_INTERNED(op) \
    (((PyASCIIObject *)(op))->state.interned)
272

273 274 275 276
/* --- wchar_t support for platforms which support it --------------------- */

#ifdef HAVE_WCHAR_H

277
/* Create a Unicode Object from the wchar_t buffer w of the given
278 279 280 281
   size.

   The buffer is copied into the new object. */

282
PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
283
    const wchar_t *w,           /* wchar_t buffer */
Martin v. Löwis's avatar
Martin v. Löwis committed
284
    Py_ssize_t size             /* size of buffer */
285 286
    );

287
/* Copies the Unicode Object contents into the wchar_t buffer w.  At
288 289
   most size wchar_t characters are copied.

290 291 292 293 294 295 296
   Note that the resulting wchar_t string may or may not be
   0-terminated.  It is the responsibility of the caller to make sure
   that the wchar_t string is 0-terminated in case this is required by
   the application.

   Returns the number of wchar_t characters copied (excluding a
   possibly trailing 0-termination character) or -1 in case of an
297 298
   error. */

Martin v. Löwis's avatar
Martin v. Löwis committed
299
PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
300
    PyObject *unicode,          /* Unicode object */
301
    wchar_t *w,                 /* wchar_t buffer */
Martin v. Löwis's avatar
Martin v. Löwis committed
302
    Py_ssize_t size             /* size of buffer */
303 304
    );

305 306
/* Convert the Unicode object to a wide character string. The output string
   always ends with a nul character. If size is not NULL, write the number of
307
   wide characters (excluding the null character) into *size.
308

309
   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
310 311 312 313
   on success. On error, returns NULL and raises a MemoryError; *size is
   undefined in that case. */

PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
314
    PyObject *unicode,          /* Unicode object */
315 316 317
    Py_ssize_t *size            /* number of characters of the result */
    );

318 319
#endif

320 321
/* --- Unicode ordinals --------------------------------------------------- */

322 323
/* Create a Unicode Object from the given Unicode code point ordinal.

324
   The ordinal must be in range(0x110000). A ValueError is
325 326 327 328
   raised in case it is not.

*/

329
PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
330

331 332 333 334 335 336 337 338 339 340 341
/* --- Free-list management ----------------------------------------------- */

/* Clear the free list used by the Unicode implementation.

   This can be used to release memory used for objects on the free
   list back to the Python memory allocator.

*/

PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);

342
/* === Builtin Codecs =====================================================
343 344 345

   Many of these APIs take two arguments encoding and errors. These
   parameters encoding and errors have the same semantics as the ones
346
   of the builtin str() API.
347

348
   Setting encoding to NULL causes the default encoding (UTF-8) to be used.
349 350 351 352 353 354 355 356 357 358 359

   Error handling is set by errors which may also be set to NULL
   meaning to use the default handling defined for the codec. Default
   error handling for all builtin codecs is "strict" (ValueErrors are
   raised).

   The codecs all use a similar interface. Only deviations from the
   generic ones are documented.

*/

360 361
/* --- Manage the default encoding ---------------------------------------- */

362
/* Returns "utf-8".  */
363
PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
364

365 366 367 368 369
/* --- Generic Codecs ----------------------------------------------------- */

/* Create a Unicode object by decoding the encoded string s of the
   given size. */

370
PyAPI_FUNC(PyObject*) PyUnicode_Decode(
371
    const char *s,              /* encoded string */
Martin v. Löwis's avatar
Martin v. Löwis committed
372
    Py_ssize_t size,            /* size of buffer */
373 374 375 376
    const char *encoding,       /* encoding */
    const char *errors          /* error handling */
    );

377
/* Decode a Unicode object unicode and return the result as Python
378 379 380 381 382
   object.

   This API is DEPRECATED. The only supported standard encoding is rot13.
   Use PyCodec_Decode() to decode with rot13 and non-standard codecs
   that decode from str. */
383 384

PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
385 386 387
    PyObject *unicode,          /* Unicode object */
    const char *encoding,       /* encoding */
    const char *errors          /* error handling */
388
    ) Py_DEPRECATED(3.6);
389 390

/* Decode a Unicode object unicode and return the result as Unicode
391 392 393 394 395
   object.

   This API is DEPRECATED. The only supported standard encoding is rot13.
   Use PyCodec_Decode() to decode with rot13 and non-standard codecs
   that decode from str to str. */
396 397

PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
398 399 400
    PyObject *unicode,          /* Unicode object */
    const char *encoding,       /* encoding */
    const char *errors          /* error handling */
401
    ) Py_DEPRECATED(3.6);
402

403
/* Encodes a Unicode object and returns the result as Python
404 405
   object.

Ville Skyttä's avatar
Ville Skyttä committed
406
   This API is DEPRECATED.  It is superseded by PyUnicode_AsEncodedString()
407 408 409
   since all standard encodings (except rot13) encode str to bytes.
   Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
   that encode from str to non-bytes. */
410 411

PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
412 413 414
    PyObject *unicode,          /* Unicode object */
    const char *encoding,       /* encoding */
    const char *errors          /* error handling */
415
    ) Py_DEPRECATED(3.6);
416

417 418 419
/* Encodes a Unicode object and returns the result as Python bytes
   object. */

420
PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
421 422 423
    PyObject *unicode,          /* Unicode object */
    const char *encoding,       /* encoding */
    const char *errors          /* error handling */
424 425
    );

426
/* Encodes a Unicode object and returns the result as Unicode
427 428 429 430 431
   object.

   This API is DEPRECATED.  The only supported standard encoding is rot13.
   Use PyCodec_Encode() to encode with rot13 and non-standard codecs
   that encode from str to str. */
432 433

PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
434 435 436
    PyObject *unicode,          /* Unicode object */
    const char *encoding,       /* encoding */
    const char *errors          /* error handling */
437
    ) Py_DEPRECATED(3.6);
438 439 440

/* Build an encoding map. */

441 442 443 444
PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
    PyObject* string            /* 256 character map */
   );

445 446
/* --- UTF-7 Codecs ------------------------------------------------------- */

447
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
448 449 450
    const char *string,         /* UTF-7 encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors          /* error handling */
451 452
    );

453
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
454 455 456 457
    const char *string,         /* UTF-7 encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors,         /* error handling */
    Py_ssize_t *consumed        /* bytes consumed */
458 459
    );

460 461
/* --- UTF-8 Codecs ------------------------------------------------------- */

462
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
463 464 465
    const char *string,         /* UTF-8 encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors          /* error handling */
466 467
    );

468
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
469 470 471 472
    const char *string,         /* UTF-8 encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors,         /* error handling */
    Py_ssize_t *consumed        /* bytes consumed */
473 474
    );

475
PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
476
    PyObject *unicode           /* Unicode object */
477 478
    );

479 480 481 482 483 484
/* --- UTF-32 Codecs ------------------------------------------------------ */

/* Decodes length bytes from a UTF-32 encoded buffer string and returns
   the corresponding Unicode object.

   errors (if non-NULL) defines the error handling. It defaults
485
   to "strict".
486 487 488 489

   If byteorder is non-NULL, the decoder starts decoding using the
   given byte order:

490 491 492
    *byteorder == -1: little endian
    *byteorder == 0:  native order
    *byteorder == 1:  big endian
493 494 495 496 497 498 499 500 501 502 503 504

   In native mode, the first four bytes of the stream are checked for a
   BOM mark. If found, the BOM mark is analysed, the byte order
   adjusted and the BOM skipped.  In the other modes, no BOM mark
   interpretation is done. After completion, *byteorder is set to the
   current byte order at the end of input data.

   If byteorder is NULL, the codec starts in native order mode.

*/

PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
505 506 507 508 509 510
    const char *string,         /* UTF-32 encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors,         /* error handling */
    int *byteorder              /* pointer to byteorder to use
                                   0=native;-1=LE,1=BE; updated on
                                   exit */
511 512 513
    );

PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
514 515 516 517 518 519 520
    const char *string,         /* UTF-32 encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors,         /* error handling */
    int *byteorder,             /* pointer to byteorder to use
                                   0=native;-1=LE,1=BE; updated on
                                   exit */
    Py_ssize_t *consumed        /* bytes consumed */
521 522 523 524 525 526
    );

/* Returns a Python string using the UTF-32 encoding in native byte
   order. The string always starts with a BOM mark.  */

PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
527
    PyObject *unicode           /* Unicode object */
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545
    );

/* Returns a Python string object holding the UTF-32 encoded value of
   the Unicode data.

   If byteorder is not 0, output is written according to the following
   byte order:

   byteorder == -1: little endian
   byteorder == 0:  native byte order (writes a BOM mark)
   byteorder == 1:  big endian

   If byteorder is 0, the output string will always start with the
   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
   prepended.

*/

546 547
/* --- UTF-16 Codecs ------------------------------------------------------ */

548
/* Decodes length bytes from a UTF-16 encoded buffer string and returns
549 550 551
   the corresponding Unicode object.

   errors (if non-NULL) defines the error handling. It defaults
552
   to "strict".
553 554 555 556

   If byteorder is non-NULL, the decoder starts decoding using the
   given byte order:

557 558 559
    *byteorder == -1: little endian
    *byteorder == 0:  native order
    *byteorder == 1:  big endian
560

561 562 563 564 565
   In native mode, the first two bytes of the stream are checked for a
   BOM mark. If found, the BOM mark is analysed, the byte order
   adjusted and the BOM skipped.  In the other modes, no BOM mark
   interpretation is done. After completion, *byteorder is set to the
   current byte order at the end of input data.
566 567 568 569 570

   If byteorder is NULL, the codec starts in native order mode.

*/

571
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
572 573 574 575 576 577
    const char *string,         /* UTF-16 encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors,         /* error handling */
    int *byteorder              /* pointer to byteorder to use
                                   0=native;-1=LE,1=BE; updated on
                                   exit */
578 579
    );

580
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
581 582 583 584 585 586 587
    const char *string,         /* UTF-16 encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors,         /* error handling */
    int *byteorder,             /* pointer to byteorder to use
                                   0=native;-1=LE,1=BE; updated on
                                   exit */
    Py_ssize_t *consumed        /* bytes consumed */
588 589
    );

590 591 592
/* Returns a Python string using the UTF-16 encoding in native byte
   order. The string always starts with a BOM mark.  */

593
PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
594
    PyObject *unicode           /* Unicode object */
595 596 597 598
    );

/* --- Unicode-Escape Codecs ---------------------------------------------- */

599
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
600 601 602
    const char *string,         /* Unicode-Escape encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors          /* error handling */
603 604
    );

605
PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
606
    PyObject *unicode           /* Unicode object */
607 608 609 610
    );

/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */

611
PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
612 613 614
    const char *string,         /* Raw-Unicode-Escape encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors          /* error handling */
615 616
    );

617
PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
618
    PyObject *unicode           /* Unicode object */
619 620
    );

621
/* --- Latin-1 Codecs -----------------------------------------------------
622

623
   Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
624

625
PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
626 627 628
    const char *string,         /* Latin-1 encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors          /* error handling */
629 630
    );

631
PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
632
    PyObject *unicode           /* Unicode object */
633 634
    );

635
/* --- ASCII Codecs -------------------------------------------------------
636 637 638 639 640

   Only 7-bit ASCII data is accepted. All other codes generate errors.

*/

641
PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
642 643 644
    const char *string,         /* ASCII encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors          /* error handling */
645 646
    );

647
PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
648
    PyObject *unicode           /* Unicode object */
649 650
    );

651
/* --- Character Map Codecs -----------------------------------------------
652

653
   This codec uses mappings to encode and decode characters.
654

655 656 657 658 659 660 661 662 663 664
   Decoding mappings must map byte ordinals (integers in the range from 0 to
   255) to Unicode strings, integers (which are then interpreted as Unicode
   ordinals) or None.  Unmapped data bytes (ones which cause a LookupError)
   as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
   mapping" and cause an error.

   Encoding mappings must map Unicode ordinal integers to bytes objects,
   integers in the range from 0 to 255 or None.  Unmapped character
   ordinals (ones which cause a LookupError) as well as mapped to
   None are treated as "undefined mapping" and cause an error.
665 666 667

*/

668
PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
669 670
    const char *string,         /* Encoded string */
    Py_ssize_t length,          /* size of string */
671
    PyObject *mapping,          /* decoding mapping */
672
    const char *errors          /* error handling */
673 674
    );

675
PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
676
    PyObject *unicode,          /* Unicode object */
677
    PyObject *mapping           /* encoding mapping */
678 679
    );

680
/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum's avatar
Guido van Rossum committed
681

682
#ifdef MS_WINDOWS
683
PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
684
    const char *string,         /* MBCS encoded string */
685
    Py_ssize_t length,          /* size of string */
686 687 688
    const char *errors          /* error handling */
    );

689 690 691 692 693 694 695
PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
    const char *string,         /* MBCS encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors,         /* error handling */
    Py_ssize_t *consumed        /* bytes consumed */
    );

696
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
697 698 699 700 701 702 703
PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
    int code_page,              /* code page number */
    const char *string,         /* encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors,         /* error handling */
    Py_ssize_t *consumed        /* bytes consumed */
    );
704
#endif
705

706
PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
707 708 709
    PyObject *unicode           /* Unicode object */
    );

710
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
711 712 713 714 715
PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
    int code_page,              /* code page number */
    PyObject *unicode,          /* Unicode object */
    const char *errors          /* error handling */
    );
716
#endif
717

718
#endif /* MS_WINDOWS */
Guido van Rossum's avatar
Guido van Rossum committed
719

720 721
/* --- Locale encoding --------------------------------------------------- */

722
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
723 724 725 726 727 728
/* Decode a string from the current locale encoding. The *errors* argument
   selects the error handler: "strict" (also used when *errors* is NULL) or
   "surrogateescape" (PEP 383), which escapes undecodable bytes. If a byte
   sequence can be decoded as a surrogate character and the "surrogateescape"
   handler is in use, the byte sequence is escaped instead of being decoded.
   *str* must end with a null character but cannot
729
   contain embedded null characters. */
730 731 732 733

PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
    const char *str,
    Py_ssize_t len,
734
    const char *errors);
735 736 737 738 739 740

/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
   length using strlen(). */

PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
    const char *str,
741
    const char *errors);
742

743 744 745
/* Encode a Unicode object to the current locale encoding. The encoder is
   strict if *errors* is "strict" (or NULL); with "surrogateescape" the
   "surrogateescape" error handler is used. Return a bytes object. The string
746
   cannot contain embedded null characters. */
747 748 749

PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
    PyObject *unicode,
750
    const char *errors
751
    );
752
#endif
753

754 755
/* --- File system encoding ---------------------------------------------- */

756 757
/* ParseTuple converter: encode str objects to bytes using
   PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
758 759

PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
760 761 762 763 764

/* ParseTuple converter: decode bytes objects to unicode using
   PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */

PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
765

766 767
/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
   and the "surrogateescape" error handler.
768

769 770
   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
   encoding.
771

Benjamin Peterson's avatar
Benjamin Peterson committed
772
   Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
773 774 775 776 777 778
*/

PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
    const char *s               /* encoded string */
    );

779 780 781
/* Decode a string using Py_FileSystemDefaultEncoding
   and the "surrogateescape" error handler.

782 783
   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
   encoding.
784 785
*/

786 787 788 789 790
PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
    const char *s,               /* encoded string */
    Py_ssize_t size              /* size */
    );

791
/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
Benjamin Peterson's avatar
Benjamin Peterson committed
792
   "surrogateescape" error handler, and return bytes.
793

794 795
   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
   encoding.
796 797 798 799 800 801
*/

PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
    PyObject *unicode           /* Unicode object to encode */
    );

802 803 804 805
/* --- Methods & Slots ----------------------------------------------------

   These are capable of handling Unicode objects and strings on input
   (we refer to them as strings in the descriptions) and return
806
   Unicode objects or integers as appropriate. */
807 808 809

/* Concat two strings giving a new Unicode string. */

810
PyAPI_FUNC(PyObject*) PyUnicode_Concat(
811 812
    PyObject *left,             /* Left string */
    PyObject *right             /* Right string */
813 814
    );

815 816 817 818
/* Concat two strings and put the result in *pleft
   (sets *pleft to NULL on error) */

PyAPI_FUNC(void) PyUnicode_Append(
819 820
    PyObject **pleft,           /* Pointer to left string */
    PyObject *right             /* Right string */
821 822 823 824 825 826
    );

/* Concat two strings, put the result in *pleft and drop the right object
   (sets *pleft to NULL on error) */

PyAPI_FUNC(void) PyUnicode_AppendAndDel(
827 828
    PyObject **pleft,           /* Pointer to left string */
    PyObject *right             /* Right string */
829 830
    );

831 832 833 834 835 836 837 838 839 840 841
/* Split a string giving a list of Unicode strings.

   If sep is NULL, splitting will be done at all whitespace
   substrings. Otherwise, splits occur at the given separator.

   At most maxsplit splits will be done. If negative, no limit is set.

   Separators are not included in the resulting list.

*/

842
PyAPI_FUNC(PyObject*) PyUnicode_Split(
843 844 845 846
    PyObject *s,                /* String to split */
    PyObject *sep,              /* String separator */
    Py_ssize_t maxsplit         /* Maxsplit count */
    );
847 848 849 850 851

/* Ditto, but split at line breaks.

   CRLF is considered to be one line break. Line breaks are not
   included in the resulting list unless keepends is true. */
852

853
PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
854 855 856
    PyObject *s,                /* String to split */
    int keepends                /* If true, line end markers are included */
    );
857

858 859 860
/* Partition a string using a given separator. */

PyAPI_FUNC(PyObject*) PyUnicode_Partition(
861 862 863
    PyObject *s,                /* String to partition */
    PyObject *sep               /* String separator */
    );
864 865 866 867 868

/* Partition a string using a given separator, searching from the end of the
   string. */

PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
869 870 871
    PyObject *s,                /* String to partition */
    PyObject *sep               /* String separator */
    );
872

873 874 875 876 877 878 879 880 881 882 883 884 885 886
/* Split a string giving a list of Unicode strings.

   If sep is NULL, splitting will be done at all whitespace
   substrings. Otherwise, splits occur at the given separator.

   At most maxsplit splits will be done; if maxsplit is negative, no
   limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
   the end of the string.

   Separators are not included in the resulting list.

*/

PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
887 888 889 890
    PyObject *s,                /* String to split */
    PyObject *sep,              /* String separator */
    Py_ssize_t maxsplit         /* Maxsplit count */
    );
891

892 893 894
/* Translate a string by applying a character mapping table to it and
   return the resulting Unicode object.

895 896
   The mapping table must map Unicode ordinal integers to Unicode strings,
   Unicode ordinal integers or None (causing deletion of the character).
897 898 899 900 901 902 903

   Mapping tables may be dictionaries or sequences. Unmapped character
   ordinals (ones which cause a LookupError) are left untouched and
   are copied as-is.

*/

904
PyAPI_FUNC(PyObject *) PyUnicode_Translate(
905 906 907
    PyObject *str,              /* String */
    PyObject *table,            /* Translate table */
    const char *errors          /* error handling */
908 909 910 911
    );

/* Join a sequence of strings using the given separator and return
   the resulting Unicode string. */
912

913
PyAPI_FUNC(PyObject*) PyUnicode_Join(
914 915
    PyObject *separator,        /* Separator string */
    PyObject *seq               /* Sequence object */
916 917 918 919 920
    );

/* Return 1 if substr matches str[start:end] at the given tail end, 0
   otherwise. Return -1 if an error occurred. */

Martin v. Löwis's avatar
Martin v. Löwis committed
921
PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
922 923 924 925 926
    PyObject *str,              /* String */
    PyObject *substr,           /* Prefix or Suffix string */
    Py_ssize_t start,           /* Start index */
    Py_ssize_t end,             /* Stop index */
    int direction               /* Tail end: -1 prefix, +1 suffix */
927 928 929
    );

/* Return the first position of substr in str[start:end] using the
930 931
   given search direction or -1 if not found. -2 is returned in case
   an error occurred and an exception is set. */
932

Martin v. Löwis's avatar
Martin v. Löwis committed
933
PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
934 935 936 937 938
    PyObject *str,              /* String */
    PyObject *substr,           /* Substring to find */
    Py_ssize_t start,           /* Start index */
    Py_ssize_t end,             /* Stop index */
    int direction               /* Find direction: +1 forward, -1 backward */
939 940
    );

941
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
Martin v. Löwis's avatar
Martin v. Löwis committed
942 943 944 945 946 947 948 949
/* Like PyUnicode_Find, but search for a single character only. */
PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
    PyObject *str,              /* String */
    Py_UCS4 ch,                 /* Character to find */
    Py_ssize_t start,           /* Start index */
    Py_ssize_t end,             /* Stop index */
    int direction               /* Find direction: +1 forward, -1 backward */
    );
950
#endif
Martin v. Löwis's avatar
Martin v. Löwis committed
951

952
/* Count the number of occurrences of substr in str[start:end]. */
953

Martin v. Löwis's avatar
Martin v. Löwis committed
954
PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
955 956 957 958
    PyObject *str,              /* String */
    PyObject *substr,           /* Substring to count */
    Py_ssize_t start,           /* Start index */
    Py_ssize_t end              /* Stop index */
959 960
    );

961
/* Replace at most maxcount occurrences of substr in str with replstr
962 963
   and return the resulting Unicode object. */

964
PyAPI_FUNC(PyObject *) PyUnicode_Replace(
965 966 967 968 969
    PyObject *str,              /* String */
    PyObject *substr,           /* Substring to find */
    PyObject *replstr,          /* Substring to replace */
    Py_ssize_t maxcount         /* Max. number of replacements to apply;
                                   -1 = all */
970 971 972
    );

/* Compare two strings and return -1, 0, 1 for less than, equal,
973 974
   greater than, respectively.
   Raise an exception and return -1 on error. */
975

976
PyAPI_FUNC(int) PyUnicode_Compare(
977 978
    PyObject *left,             /* Left string */
    PyObject *right             /* Right string */
979 980
    );

981 982 983 984
/* Compare a Unicode object with C string and return -1, 0, 1 for less than,
   equal, and greater than, respectively.  It is best to pass only
   ASCII-encoded strings, but the function interprets the input string as
   ISO-8859-1 if it contains non-ASCII characters.
985
   This function does not raise exceptions. */
986

987 988
PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
    PyObject *left,
989
    const char *right           /* ASCII-encoded string */
990 991
    );

992 993 994
/* Rich compare two strings and return one of the following:

   - NULL in case an exception was raised
995
   - Py_True or Py_False for successful comparisons
996 997 998 999 1000 1001 1002 1003 1004
   - Py_NotImplemented in case the type combination is unknown

   Possible values for op:

     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE

*/

PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1005 1006 1007
    PyObject *left,             /* Left string */
    PyObject *right,            /* Right string */
    int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1008 1009
    );

1010
/* Apply an argument tuple or dictionary to a format string and return
1011 1012
   the resulting Unicode string. */

1013
PyAPI_FUNC(PyObject *) PyUnicode_Format(
1014 1015
    PyObject *format,           /* Format string */
    PyObject *args              /* Argument tuple or dictionary */
1016 1017
    );

1018 1019 1020
/* Check whether element is contained in container and return 1/0
   accordingly.

1021
   element has to coerce to a one element Unicode string. -1 is
1022 1023
   returned in case of an error. */

1024
PyAPI_FUNC(int) PyUnicode_Contains(
1025 1026
    PyObject *container,        /* Container string */
    PyObject *element           /* Element string */
1027 1028
    );

1029 1030 1031 1032
/* Checks whether argument is a valid identifier. */

PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);

1033 1034
/* === Characters Type APIs =============================================== */

1035 1036
#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
1037
    PyObject *op,
1038
    int check_content);
1039 1040 1041 1042
#elif !defined(NDEBUG)
/* For asserts that call _PyUnicode_CheckConsistency(), which would
 * otherwise be a problem when building with asserts but without Py_DEBUG.
 * This fallback expands to PyUnicode_Check(op) only; the check_content
 * argument is discarded and never evaluated. */
#define _PyUnicode_CheckConsistency(op, check_content) PyUnicode_Check(op)
1043 1044
#endif

1045
#ifndef Py_LIMITED_API
1046 1047 1048 1049
#  define Py_CPYTHON_UNICODEOBJECT_H
#  include  "cpython/unicodeobject.h"
#  undef Py_CPYTHON_UNICODEOBJECT_H
#endif
1050

1051 1052 1053 1054
#ifdef __cplusplus
}
#endif
#endif /* !Py_UNICODEOBJECT_H */