/*
    unicode_format.h -- implementation of str.format().
*/

/************************************************************************/
/***********   Global data structures and forward declarations  *********/
/************************************************************************/

/*
   A SubString consists of the characters between two string or
   unicode pointers.
*/
typedef struct {
Martin v. Löwis's avatar
Martin v. Löwis committed
14 15
    PyObject *str; /* borrowed reference */
    Py_ssize_t start, end;
16 17 18
} SubString;


19 20 21
/* Keep track if we're auto-numbering fields */
typedef enum {
    ANS_INIT,       /* no numerically indexed field has been seen yet */
    ANS_AUTO,       /* fields are numbered automatically (empty names) */
    ANS_MANUAL      /* fields carry explicit numeric indices */
} AutoNumberState;

/* Keeps track of our auto-numbering state, and which number field we're on.
   One AutoNumber is shared by every field of a format string so that the
   automatic indices keep counting up across fields. */
typedef struct {
    AutoNumberState an_state;   /* numbering mode currently in effect */
    int an_field_number;        /* index handed to the next auto-numbered field */
} AutoNumber;


32 33 34
/* forward declaration for recursion: output_markup() calls build_string()
   to expand a format_spec that itself contains replacement fields */
static PyObject *
build_string(SubString *input, PyObject *args, PyObject *kwargs,
             int recursion_depth, AutoNumber *auto_number);
36 37 38 39 40 41 42



/************************************************************************/
/**************************  Utility  functions  ************************/
/************************************************************************/

43 44 45 46 47 48 49
/* Put *auto_number into its starting state: no numbering mode has been
   chosen yet and the first automatically numbered field will be field 0. */
static void
AutoNumber_Init(AutoNumber *auto_number)
{
    auto_number->an_field_number = 0;
    auto_number->an_state = ANS_INIT;
}

50 51
/* fill in a SubString from a pointer and length */
Py_LOCAL_INLINE(void)
52
SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end)
53
{
Martin v. Löwis's avatar
Martin v. Löwis committed
54 55 56
    str->str = s;
    str->start = start;
    str->end = end;
57 58
}

Martin v. Löwis's avatar
Martin v. Löwis committed
59
/* return a new string.  if str->str is NULL, return None */
60 61 62
Py_LOCAL_INLINE(PyObject *)
SubString_new_object(SubString *str)
{
63 64
    if (str->str == NULL)
        Py_RETURN_NONE;
Martin v. Löwis's avatar
Martin v. Löwis committed
65
    return PyUnicode_Substring(str->str, str->start, str->end);
66 67
}

68
/* return a new string.  if str->str is NULL, return a new empty string */
69 70 71
Py_LOCAL_INLINE(PyObject *)
SubString_new_object_or_empty(SubString *str)
{
Martin v. Löwis's avatar
Martin v. Löwis committed
72
    if (str->str == NULL) {
73
        return PyUnicode_New(0, 0);
74
    }
Martin v. Löwis's avatar
Martin v. Löwis committed
75
    return SubString_new_object(str);
76 77
}

78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
/* Check that the way the current field is named agrees with the
   numbering mode already in effect.

   state is the mode in effect; field_name_is_empty is non-zero when the
   current field has no name.  Returns 0 when the two are compatible.
   Otherwise sets ValueError and returns 1. */
static int
autonumber_state_error(AutoNumberState state, int field_name_is_empty)
{
    const char *message = NULL;

    if (state == ANS_MANUAL) {
        /* explicit indices so far: an empty name would switch to auto */
        if (field_name_is_empty) {
            message = "cannot switch from "
                      "manual field specification to "
                      "automatic field numbering";
        }
    }
    else if (!field_name_is_empty) {
        /* not in manual mode: an explicit index would switch to manual */
        message = "cannot switch from "
                  "automatic field numbering to "
                  "manual field specification";
    }

    if (message == NULL) {
        return 0;
    }
    PyErr_SetString(PyExc_ValueError, message);
    return 1;
}


104 105 106 107
/************************************************************************/
/***********  Format string parsing -- integers and identifiers *********/
/************************************************************************/

108 109
static Py_ssize_t
get_integer(const SubString *str)
110
{
111 112
    Py_ssize_t accumulator = 0;
    Py_ssize_t digitval;
Martin v. Löwis's avatar
Martin v. Löwis committed
113
    Py_ssize_t i;
114

115
    /* empty string is an error */
Martin v. Löwis's avatar
Martin v. Löwis committed
116
    if (str->start >= str->end)
117
        return -1;
118

Martin v. Löwis's avatar
Martin v. Löwis committed
119 120
    for (i = str->start; i < str->end; i++) {
        digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i));
121
        if (digitval < 0)
122
            return -1;
123
        /*
124 125 126 127
           Detect possible overflow before it happens:

              accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if
              accumulator > (PY_SSIZE_T_MAX - digitval) / 10.
128
        */
129
        if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) {
130 131 132 133
            PyErr_Format(PyExc_ValueError,
                         "Too many decimal digits in format string");
            return -1;
        }
134
        accumulator = accumulator * 10 + digitval;
135
    }
136
    return accumulator;
137 138
}

139 140 141 142 143 144 145 146 147
/************************************************************************/
/******** Functions to get field objects and specification strings ******/
/************************************************************************/

/* do the equivalent of obj.name.  the attribute name is materialised as
   a temporary string object from *name; returns the result of
   PyObject_GetAttr(), or NULL if the name object cannot be created. */
static PyObject *
getattr(PyObject *obj, SubString *name)
{
    PyObject *newobj;
    PyObject *str = SubString_new_object(name);
    if (str == NULL) {
        return NULL;
    }
    newobj = PyObject_GetAttr(obj, str);
    Py_DECREF(str);
    return newobj;
}

/* obj[idx] for an object that supports the sequence protocol; simply
   forwards to PySequence_GetItem() and hands back whatever it returns */
static PyObject *
getitem_sequence(PyObject *obj, Py_ssize_t idx)
{
    PyObject *item = PySequence_GetItem(obj, idx);

    return item;
}

/* do the equivalent of obj[idx], where obj is not a sequence.  idx is
   boxed into a temporary int object and used as the key for
   PyObject_GetItem(); returns NULL if the boxing or the lookup fails. */
static PyObject *
getitem_idx(PyObject *obj, Py_ssize_t idx)
{
    PyObject *newobj;
    PyObject *idx_obj = PyLong_FromSsize_t(idx);
    if (idx_obj == NULL) {
        return NULL;
    }
    newobj = PyObject_GetItem(obj, idx_obj);
    Py_DECREF(idx_obj);
    return newobj;
}

/* do the equivalent of obj[name] */
177
static PyObject *
178
getitem_str(PyObject *obj, SubString *name)
179
{
180
    PyObject *newobj;
181
    PyObject *str = SubString_new_object(name);
182
    if (str == NULL)
183
        return NULL;
184 185 186 187 188 189 190 191 192 193 194
    newobj = PyObject_GetItem(obj, str);
    Py_DECREF(str);
    return newobj;
}

typedef struct {
    /* the entire string we're parsing.  we assume that someone else
       is managing its lifetime, and that it will exist for the
       lifetime of the iterator.  can be empty */
    SubString str;

Martin v. Löwis's avatar
Martin v. Löwis committed
195 196
    /* index to where we are inside field_name */
    Py_ssize_t index;
197 198 199 200
} FieldNameIterator;


static int
Martin v. Löwis's avatar
Martin v. Löwis committed
201 202
FieldNameIterator_init(FieldNameIterator *self, PyObject *s,
                       Py_ssize_t start, Py_ssize_t end)
203
{
Martin v. Löwis's avatar
Martin v. Löwis committed
204 205
    SubString_init(&self->str, s, start, end);
    self->index = start;
206 207 208 209 210 211
    return 1;
}

/* Scan an attribute name starting at self->index.  *name receives the
   characters up to, but not including, the next '.' or '[' (or up to
   the end of the input).  self->index is left on that terminator so
   the next call sees it.  Always returns 1. */
static int
_FieldNameIterator_attr(FieldNameIterator *self, SubString *name)
{
    Py_UCS4 c;

    name->str = self->str.str;
    name->start = self->index;

    /* return everything until '.' or '[' */
    while (self->index < self->str.end) {
        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
        switch (c) {
        case '[':
        case '.':
            /* back up so that this character will be seen next time */
            self->index--;
            break;
        default:
            continue;
        }
        break;
    }
    /* end of string is okay */
    name->end = self->index;
    return 1;
}

236 237 238
/* Scan the contents of an "[...]" accessor starting at self->index,
   which is just past the '['.  *name receives the characters before
   the closing ']' and self->index is left just past that ']'.
   Returns 1 on success; sets ValueError and returns 0 if the input
   ends before a ']' is found. */
static int
_FieldNameIterator_item(FieldNameIterator *self, SubString *name)
{
    int bracket_seen = 0;
    Py_UCS4 c;

    name->str = self->str.str;
    name->start = self->index;

    /* return everything until ']' */
    while (self->index < self->str.end) {
        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
        switch (c) {
        case ']':
            bracket_seen = 1;
            break;
        default:
            continue;
        }
        break;
    }
    /* make sure we ended with a ']' */
    if (!bracket_seen) {
        PyErr_SetString(PyExc_ValueError, "Missing ']' in format string");
        return 0;
    }

    /* self->index is one past the ']'; don't include the ']' itself */
    name->end = self->index-1;
    return 1;
}

/* Fetch the next ".attr" or "[item]" accessor.

   returns 0 on error (exception set), 1 on non-error termination, and
   2 if it returns a value.  When 2 is returned:
     *is_attribute is 1 for ".attr" and 0 for "[item]";
     *name spans the accessor text;
     *name_idx is the item converted to an integer, or -1 when the
               accessor is an attribute or the item is not an integer. */
static int
FieldNameIterator_next(FieldNameIterator *self, int *is_attribute,
                       Py_ssize_t *name_idx, SubString *name)
{
    /* check at end of input */
    if (self->index >= self->str.end) {
        return 1;
    }

    switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) {
    case '.':
        *is_attribute = 1;
        if (_FieldNameIterator_attr(self, name) == 0) {
            return 0;
        }
        *name_idx = -1;
        break;
    case '[':
        *is_attribute = 0;
        if (_FieldNameIterator_item(self, name) == 0) {
            return 0;
        }
        *name_idx = get_integer(name);
        /* -1 alone only means "not an integer"; it is an error only
           when get_integer() also set an exception */
        if (*name_idx == -1 && PyErr_Occurred()) {
            return 0;
        }
        break;
    default:
        /* Invalid character follows ']' */
        PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may "
                        "follow ']' in format field specifier");
        return 0;
    }

    /* empty string is an error */
    if (name->start == name->end) {
        PyErr_SetString(PyExc_ValueError, "Empty attribute in format string");
        return 0;
    }

    return 2;
}


/* input: field_name
   output: 'first' points to the part before the first '[' or '.'
           'first_idx' is -1 if 'first' is not an integer, otherwise
                       it's the value of first converted to an integer
           'rest' is an iterator to return the rest
315
*/
316
static int
Martin v. Löwis's avatar
Martin v. Löwis committed
317
field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first,
318 319
                 Py_ssize_t *first_idx, FieldNameIterator *rest,
                 AutoNumber *auto_number)
320
{
Martin v. Löwis's avatar
Martin v. Löwis committed
321 322
    Py_UCS4 c;
    Py_ssize_t i = start;
323 324
    int field_name_is_empty;
    int using_numeric_index;
325 326

    /* find the part up until the first '.' or '[' */
Martin v. Löwis's avatar
Martin v. Löwis committed
327 328
    while (i < end) {
        switch (c = PyUnicode_READ_CHAR(str, i++)) {
329 330 331 332
        case '[':
        case '.':
            /* backup so that we this character is available to the
               "rest" iterator */
Martin v. Löwis's avatar
Martin v. Löwis committed
333
            i--;
334 335 336 337 338 339 340 341
            break;
        default:
            continue;
        }
        break;
    }

    /* set up the return values */
Martin v. Löwis's avatar
Martin v. Löwis committed
342 343
    SubString_init(first, str, start, i);
    FieldNameIterator_init(rest, str, i, end);
344

345 346
    /* see if "first" is an integer, in which case it's used as an index */
    *first_idx = get_integer(first);
347 348
    if (*first_idx == -1 && PyErr_Occurred())
        return 0;
349

Martin v. Löwis's avatar
Martin v. Löwis committed
350
    field_name_is_empty = first->start >= first->end;
351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381

    /* If the field name is omitted or if we have a numeric index
       specified, then we're doing numeric indexing into args. */
    using_numeric_index = field_name_is_empty || *first_idx != -1;

    /* We always get here exactly one time for each field we're
       processing. And we get here in field order (counting by left
       braces). So this is the perfect place to handle automatic field
       numbering if the field name is omitted. */

    /* Check if we need to do the auto-numbering. It's not needed if
       we're called from string.Format routines, because it's handled
       in that class by itself. */
    if (auto_number) {
        /* Initialize our auto numbering state if this is the first
           time we're either auto-numbering or manually numbering. */
        if (auto_number->an_state == ANS_INIT && using_numeric_index)
            auto_number->an_state = field_name_is_empty ?
                ANS_AUTO : ANS_MANUAL;

        /* Make sure our state is consistent with what we're doing
           this time through. Only check if we're using a numeric
           index. */
        if (using_numeric_index)
            if (autonumber_state_error(auto_number->an_state,
                                       field_name_is_empty))
                return 0;
        /* Zero length field means we want to do auto-numbering of the
           fields. */
        if (field_name_is_empty)
            *first_idx = (auto_number->an_field_number)++;
382
    }
383 384

    return 1;
385 386
}

387

388 389 390 391 392 393
/*
    get_field_object returns the object inside {}, before the
    format_spec.  It handles getindex and getattr lookups and consumes
    the entire input string.
*/
static PyObject *
394 395
get_field_object(SubString *input, PyObject *args, PyObject *kwargs,
                 AutoNumber *auto_number)
396
{
397 398 399 400 401
    PyObject *obj = NULL;
    int ok;
    int is_attribute;
    SubString name;
    SubString first;
402
    Py_ssize_t index;
403
    FieldNameIterator rest;
404

Martin v. Löwis's avatar
Martin v. Löwis committed
405
    if (!field_name_split(input->str, input->start, input->end, &first,
406
                          &index, &rest, auto_number)) {
407 408
        goto error;
    }
409

410 411
    if (index == -1) {
        /* look up in kwargs */
412
        PyObject *key = SubString_new_object(&first);
413
        if (key == NULL) {
414
            goto error;
415 416
        }
        if (kwargs == NULL) {
417
            PyErr_SetObject(PyExc_KeyError, key);
418 419
            Py_DECREF(key);
            goto error;
420
        }
421 422 423 424
        /* Use PyObject_GetItem instead of PyDict_GetItem because this
           code is no longer just used with kwargs. It might be passed
           a non-dict when called through format_map. */
        obj = PyObject_GetItem(kwargs, key);
425
        Py_DECREF(key);
426 427 428
        if (obj == NULL) {
            goto error;
        }
429 430
    }
    else {
431 432 433 434 435 436 437 438 439 440
        /* If args is NULL, we have a format string with a positional field
           with only kwargs to retrieve it from. This can only happen when
           used with format_map(), where positional arguments are not
           allowed. */
        if (args == NULL) {
            PyErr_SetString(PyExc_ValueError, "Format string contains "
                            "positional fields");
            goto error;
        }

441 442
        /* look up in args */
        obj = PySequence_GetItem(args, index);
443
        if (obj == NULL)
444
            goto error;
445
    }
446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470

    /* iterate over the rest of the field_name */
    while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index,
                                        &name)) == 2) {
        PyObject *tmp;

        if (is_attribute)
            /* getattr lookup "." */
            tmp = getattr(obj, &name);
        else
            /* getitem lookup "[]" */
            if (index == -1)
                tmp = getitem_str(obj, &name);
            else
                if (PySequence_Check(obj))
                    tmp = getitem_sequence(obj, index);
                else
                    /* not a sequence */
                    tmp = getitem_idx(obj, index);
        if (tmp == NULL)
            goto error;

        /* assign to obj */
        Py_DECREF(obj);
        obj = tmp;
471
    }
472 473 474 475 476
    /* end of iterator, this is the non-error case */
    if (ok == 1)
        return obj;
error:
    Py_XDECREF(obj);
477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492
    return NULL;
}

/************************************************************************/
/*****************  Field rendering functions  **************************/
/************************************************************************/

/*
    render_field() is the main function in this section.  It takes the
    field object and field specification string generated by
    get_field_and_spec, and renders the field into the output string.

    render_field calls fieldobj.__format__(format_spec) method, and
    appends to the output.
*/
static int
493
render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer)
494 495
{
    int ok = 0;
496
    PyObject *result = NULL;
497
    PyObject *format_spec_object = NULL;
498 499
    int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL;
    int err;
500

501 502 503
    /* If we know the type exactly, skip the lookup of __format__ and just
       call the formatter directly. */
    if (PyUnicode_CheckExact(fieldobj))
504
        formatter = _PyUnicode_FormatAdvancedWriter;
505
    else if (PyLong_CheckExact(fieldobj))
506
        formatter = _PyLong_FormatAdvancedWriter;
507
    else if (PyFloat_CheckExact(fieldobj))
508 509 510
        formatter = _PyFloat_FormatAdvancedWriter;
    else if (PyComplex_CheckExact(fieldobj))
        formatter = _PyComplex_FormatAdvancedWriter;
511 512

    if (formatter) {
513 514
        /* we know exactly which formatter will be called when __format__ is
           looked up, so call it directly, instead. */
515 516 517
        err = formatter(writer, fieldobj, format_spec->str,
                        format_spec->start, format_spec->end);
        return (err == 0);
518
    }
519
    else {
520 521
        /* We need to create an object out of the pointers we have, because
           __format__ takes a string/unicode object for format_spec. */
Martin v. Löwis's avatar
Martin v. Löwis committed
522 523 524 525 526 527
        if (format_spec->str)
            format_spec_object = PyUnicode_Substring(format_spec->str,
                                                     format_spec->start,
                                                     format_spec->end);
        else
            format_spec_object = PyUnicode_New(0, 0);
528 529 530 531
        if (format_spec_object == NULL)
            goto done;

        result = PyObject_Format(fieldobj, format_spec_object);
532
    }
533 534
    if (result == NULL)
        goto done;
535

536
    if (_PyUnicodeWriter_WriteStr(writer, result) == -1)
537 538
        goto done;
    ok = 1;
539

540
done:
541
    Py_XDECREF(format_spec_object);
542 543 544 545 546 547
    Py_XDECREF(result);
    return ok;
}

/* Parse one replacement field.

   On entry str->start is just past the opening '{'.  On success
   str->start has been advanced just past the field's closing '}'.

   field_name receives the text up to the first ':', '!' or '}' that is
   not inside "[...]"; *conversion receives the character after '!'
   ('\0' if there is none); format_spec receives the text after ':' up
   to the matching '}' (its str member stays NULL when there is no
   format spec).  *format_spec_needs_expanding is set to 1 when the
   format spec contains a '{'; it is never cleared here, so the caller
   must initialise it.

   Returns 1 on success, 0 with ValueError set on malformed input. */
static int
parse_field(SubString *str, SubString *field_name, SubString *format_spec,
            int *format_spec_needs_expanding, Py_UCS4 *conversion)
{
    /* Note this function works if the field name is zero length,
       which is good.  Zero length field names are handled later, in
       field_name_split. */

    Py_UCS4 c = 0;

    /* initialize these, as they may be empty */
    *conversion = '\0';
    SubString_init(format_spec, NULL, 0, 0);

    /* Search for the field name.  it's terminated by the end of
       the string, or a ':' or '!' */
    field_name->str = str->str;
    field_name->start = str->start;
    while (str->start < str->end) {
        switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
        case '{':
            PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name");
            return 0;
        case '[':
            /* skip ahead to the ']' so that ':', '!' and '}' inside an
               item accessor do not terminate the field name */
            for (; str->start < str->end; str->start++) {
                if (PyUnicode_READ_CHAR(str->str, str->start) == ']') {
                    break;
                }
            }
            continue;
        case '}':
        case ':':
        case '!':
            break;
        default:
            continue;
        }
        break;
    }

    /* don't include the last character read in the field name */
    field_name->end = str->start - 1;
    if (c == '!' || c == ':') {
        Py_ssize_t count;
        /* we have a format specifier and/or a conversion */

        /* see if there's a conversion specifier */
        if (c == '!') {
            /* there must be another character present */
            if (str->start >= str->end) {
                PyErr_SetString(PyExc_ValueError,
                                "end of string while looking for conversion "
                                "specifier");
                return 0;
            }
            *conversion = PyUnicode_READ_CHAR(str->str, str->start++);

            if (str->start < str->end) {
                c = PyUnicode_READ_CHAR(str->str, str->start++);
                if (c == '}') {
                    return 1;
                }
                if (c != ':') {
                    PyErr_SetString(PyExc_ValueError,
                                    "expected ':' after conversion specifier");
                    return 0;
                }
            }
        }
        format_spec->str = str->str;
        format_spec->start = str->start;
        /* count is the brace nesting depth; it starts at 1 for the
           field's own opening '{' */
        count = 1;
        while (str->start < str->end) {
            switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
            case '{':
                *format_spec_needs_expanding = 1;
                count++;
                break;
            case '}':
                count--;
                if (count == 0) {
                    format_spec->end = str->start - 1;
                    return 1;
                }
                break;
            default:
                break;
            }
        }

        PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec");
        return 0;
    }
    else if (c != '}') {
        PyErr_SetString(PyExc_ValueError, "expected '}' before end of string");
        return 0;
    }

    return 1;
}

/************************************************************************/
/******* Output string allocation and escape-to-markup processing  ******/
/************************************************************************/

/* MarkupIterator breaks the string into pieces of either literal
   text, or things inside {} that need to be marked up.  It is
   designed to make it easy to wrap a Python iterator around it, for
   use with the Formatter class. */

typedef struct {
    /* the part of the format string not yet consumed; str.start is
       advanced as pieces are handed out */
    SubString str;
} MarkupIterator;

static int
658
MarkupIterator_init(MarkupIterator *self, PyObject *str,
Martin v. Löwis's avatar
Martin v. Löwis committed
659
                    Py_ssize_t start, Py_ssize_t end)
660
{
Martin v. Löwis's avatar
Martin v. Löwis committed
661
    SubString_init(&self->str, str, start, end);
662 663 664 665 666 667
    return 1;
}

/* returns 0 on error, 1 on non-error termination, and 2 if it got a
   string (or something to be expanded) */
static int
668
MarkupIterator_next(MarkupIterator *self, SubString *literal,
669
                    int *field_present, SubString *field_name,
Martin v. Löwis's avatar
Martin v. Löwis committed
670
                    SubString *format_spec, Py_UCS4 *conversion,
671 672
                    int *format_spec_needs_expanding)
{
673
    int at_end;
Martin v. Löwis's avatar
Martin v. Löwis committed
674 675
    Py_UCS4 c = 0;
    Py_ssize_t start;
676
    Py_ssize_t len;
677
    int markup_follows = 0;
678

679
    /* initialize all of the output variables */
Martin v. Löwis's avatar
Martin v. Löwis committed
680 681 682
    SubString_init(literal, NULL, 0, 0);
    SubString_init(field_name, NULL, 0, 0);
    SubString_init(format_spec, NULL, 0, 0);
683
    *conversion = '\0';
684
    *format_spec_needs_expanding = 0;
685
    *field_present = 0;
686

687 688
    /* No more input, end of iterator.  This is the normal exit
       path. */
Martin v. Löwis's avatar
Martin v. Löwis committed
689
    if (self->str.start >= self->str.end)
690 691
        return 1;

Martin v. Löwis's avatar
Martin v. Löwis committed
692
    start = self->str.start;
693

694 695 696 697 698 699 700
    /* First read any literal text. Read until the end of string, an
       escaped '{' or '}', or an unescaped '{'.  In order to never
       allocate memory and so I can just pass pointers around, if
       there's an escaped '{' or '}' then we'll return the literal
       including the brace, but no format object.  The next time
       through, we'll return the rest of the literal, skipping past
       the second consecutive brace. */
Martin v. Löwis's avatar
Martin v. Löwis committed
701 702
    while (self->str.start < self->str.end) {
        switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) {
703 704 705 706 707 708
        case '{':
        case '}':
            markup_follows = 1;
            break;
        default:
            continue;
709
        }
710 711
        break;
    }
712

Martin v. Löwis's avatar
Martin v. Löwis committed
713 714
    at_end = self->str.start >= self->str.end;
    len = self->str.start - start;
715

716 717
    if ((c == '}') && (at_end ||
                       (c != PyUnicode_READ_CHAR(self->str.str,
Martin v. Löwis's avatar
Martin v. Löwis committed
718
                                                 self->str.start)))) {
719 720 721
        PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
                        "in format string");
        return 0;
722
    }
723 724 725 726 727 728
    if (at_end && c == '{') {
        PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
                        "in format string");
        return 0;
    }
    if (!at_end) {
Martin v. Löwis's avatar
Martin v. Löwis committed
729
        if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) {
730 731
            /* escaped } or {, skip it in the input.  there is no
               markup object following us, just this literal text */
Martin v. Löwis's avatar
Martin v. Löwis committed
732
            self->str.start++;
733
            markup_follows = 0;
734
        }
735 736 737
        else
            len--;
    }
738

739
    /* record the literal text */
Martin v. Löwis's avatar
Martin v. Löwis committed
740 741
    literal->str = self->str.str;
    literal->start = start;
742
    literal->end = start + len;
743

744 745 746
    if (!markup_follows)
        return 2;

747
    /* this is markup; parse the field */
748
    *field_present = 1;
749 750 751 752
    if (!parse_field(&self->str, field_name, format_spec,
                     format_spec_needs_expanding, conversion))
        return 0;
    return 2;
753 754 755 756 757
}


/* do the !r or !s conversion on obj */
static PyObject *
Martin v. Löwis's avatar
Martin v. Löwis committed
758
do_conversion(PyObject *obj, Py_UCS4 conversion)
759 760 761 762 763 764 765
{
    /* XXX in pre-3.0, do we need to convert this to unicode, since it
       might have returned a string? */
    switch (conversion) {
    case 'r':
        return PyObject_Repr(obj);
    case 's':
Martin v. Löwis's avatar
Martin v. Löwis committed
766
        return PyObject_Str(obj);
767
    case 'a':
Martin v. Löwis's avatar
Martin v. Löwis committed
768
        return PyObject_ASCII(obj);
769
    default:
770 771 772 773 774
        if (conversion > 32 && conversion < 127) {
                /* It's the ASCII subrange; casting to char is safe
                   (assuming the execution character set is an ASCII
                   superset). */
                PyErr_Format(PyExc_ValueError,
775 776
                     "Unknown conversion specifier %c",
                     (char)conversion);
777 778 779 780
        } else
                PyErr_Format(PyExc_ValueError,
                     "Unknown conversion specifier \\x%x",
                     (unsigned int)conversion);
781 782 783 784 785 786 787 788 789 790 791
        return NULL;
    }
}

/* given:

   {field_name!conversion:format_spec}

   compute the result and write it to output.
   format_spec_needs_expanding is an optimization.  if it's false,
   just output the string directly, otherwise recursively expand the
792 793 794 795 796
   format_spec string.

   field_name is allowed to be zero length, in which case we
   are doing auto field numbering.
*/
797 798 799

static int
output_markup(SubString *field_name, SubString *format_spec,
Martin v. Löwis's avatar
Martin v. Löwis committed
800
              int format_spec_needs_expanding, Py_UCS4 conversion,
801
              _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs,
802
              int recursion_depth, AutoNumber *auto_number)
803 804 805 806 807 808 809 810
{
    PyObject *tmp = NULL;
    PyObject *fieldobj = NULL;
    SubString expanded_format_spec;
    SubString *actual_format_spec;
    int result = 0;

    /* convert field_name to an object */
811
    fieldobj = get_field_object(field_name, args, kwargs, auto_number);
812 813 814 815 816
    if (fieldobj == NULL)
        goto done;

    if (conversion != '\0') {
        tmp = do_conversion(fieldobj, conversion);
Martin v. Löwis's avatar
Martin v. Löwis committed
817
        if (tmp == NULL || PyUnicode_READY(tmp) == -1)
818 819 820 821 822 823 824 825 826 827
            goto done;

        /* do the assignment, transferring ownership: fieldobj = tmp */
        Py_DECREF(fieldobj);
        fieldobj = tmp;
        tmp = NULL;
    }

    /* if needed, recurively compute the format_spec */
    if (format_spec_needs_expanding) {
828 829
        tmp = build_string(format_spec, args, kwargs, recursion_depth-1,
                           auto_number);
Martin v. Löwis's avatar
Martin v. Löwis committed
830
        if (tmp == NULL || PyUnicode_READY(tmp) == -1)
831 832 833 834 835
            goto done;

        /* note that in the case we're expanding the format string,
           tmp must be kept around until after the call to
           render_field. */
Martin v. Löwis's avatar
Martin v. Löwis committed
836
        SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp));
837
        actual_format_spec = &expanded_format_spec;
838 839
    }
    else
840 841
        actual_format_spec = format_spec;

842
    if (render_field(fieldobj, actual_format_spec, writer) == 0)
843 844 845 846 847 848 849 850 851 852 853 854
        goto done;

    result = 1;

done:
    Py_XDECREF(fieldobj);
    Py_XDECREF(tmp);

    return result;
}

/*
855
    do_markup is the top-level loop for the format() method.  It
856 857 858 859 860 861
    searches through the format string for escapes to markup codes, and
    calls other functions to move non-markup text to the output,
    and to perform the markup to the output.
*/
static int
do_markup(SubString *input, PyObject *args, PyObject *kwargs,
862
          _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number)
863 864 865 866
{
    MarkupIterator iter;
    int format_spec_needs_expanding;
    int result;
867
    int field_present;
868
    SubString literal;
869 870
    SubString field_name;
    SubString format_spec;
871
    Py_UCS4 conversion;
872

Martin v. Löwis's avatar
Martin v. Löwis committed
873
    MarkupIterator_init(&iter, input->str, input->start, input->end);
874 875 876
    while ((result = MarkupIterator_next(&iter, &literal, &field_present,
                                         &field_name, &format_spec,
                                         &conversion,
877
                                         &format_spec_needs_expanding)) == 2) {
878 879 880 881 882
        if (literal.end != literal.start) {
            if (!field_present && iter.str.start == iter.str.end)
                writer->overallocate = 0;
            if (_PyUnicodeWriter_WriteSubstring(writer, literal.str,
                                                literal.start, literal.end) < 0)
883 884 885
                return 0;
        }

886 887
        if (field_present) {
            if (iter.str.start == iter.str.end)
888
                writer->overallocate = 0;
889
            if (!output_markup(&field_name, &format_spec,
890
                               format_spec_needs_expanding, conversion, writer,
891
                               args, kwargs, recursion_depth, auto_number))
892
                return 0;
893
        }
894 895 896 897 898 899 900 901 902 903 904
    }
    return result;
}


/*
    build_string sets up a _PyUnicodeWriter for the output and then
    calls do_markup to do the heavy lifting.

    Returns the finished string, or NULL with an exception set.  A
    recursion_depth of zero or less raises ValueError before any
    output is allocated.
*/
static PyObject *
build_string(SubString *input, PyObject *args, PyObject *kwargs,
             int recursion_depth, AutoNumber *auto_number)
{
    _PyUnicodeWriter writer;

    /* check the recursion level */
    if (recursion_depth <= 0) {
        PyErr_SetString(PyExc_ValueError,
                        "Max string recursion exceeded");
        return NULL;
    }

    _PyUnicodeWriter_Init(&writer);
    writer.overallocate = 1;
    /* initial size hint: length of the underlying string plus 100 */
    writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;

    if (do_markup(input, args, kwargs, &writer, recursion_depth,
                  auto_number))
        return _PyUnicodeWriter_Finish(&writer);

    /* do_markup failed: discard whatever was written so far */
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}

/************************************************************************/
/*********** main routine ***********************************************/
/************************************************************************/

/* this is the main entry point: formats the whole of `self` using
   args and kwargs.  returns the new string, or NULL on error */
static PyObject *
do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
{
    SubString whole;
    AutoNumber numbering;

    /* PEP 3101 says only 2 levels, so that
       "{0:{1}}".format('abc', 's')            # works
       "{0:{1:{2}}}".format('abc', 's', '')    # fails
    */
    int max_depth = 2;

    if (PyUnicode_READY(self) == -1)
        return NULL;

    /* the substring to format spans the entire string */
    SubString_init(&whole, self, 0, PyUnicode_GET_LENGTH(self));
    AutoNumber_Init(&numbering);

    return build_string(&whole, args, kwargs, max_depth, &numbering);
}

955 956 957 958 959
static PyObject *
do_string_format_map(PyObject *self, PyObject *obj)
{
    return do_string_format(self, NULL, obj);
}
960 961 962 963 964 965 966 967 968 969 970 971 972


/************************************************************************/
/*********** formatteriterator ******************************************/
/************************************************************************/

/* This is used to implement string.Formatter.vparse().  It exists so
   Formatter can share code with the built in unicode.format() method.
   It's really just a wrapper around MarkupIterator that is callable
   from Python. */

typedef struct {
    PyObject_HEAD
973
    PyObject *str;
974 975 976 977 978 979 980 981 982 983 984
    MarkupIterator it_markup;
} formatteriterobject;

/* destructor: drop the reference to the parsed string (if any), then
   free the iterator object itself */
static void
formatteriter_dealloc(formatteriterobject *it)
{
    PyObject *held = it->str;

    Py_XDECREF(held);
    PyObject_FREE(it);
}

/* returns a tuple:
   (literal, field_name, format_spec, conversion)

   literal is any literal text to output.  might be zero length
   field_name is the string before the ':'.  might be None
   format_spec is the string after the ':'.  might be None
   conversion is either None, or the string after the '!'

   returns NULL if MarkupIterator_next reports an error (0) or that
   the iterator is empty (1), or if building any tuple member fails
*/
static PyObject *
formatteriter_next(formatteriterobject *it)
{
    SubString literal;
    SubString field_name;
    SubString format_spec;
    Py_UCS4 conversion;
    int format_spec_needs_expanding;
    int field_present;
    int status;
    PyObject *literal_str = NULL;
    PyObject *field_name_str = NULL;
    PyObject *format_spec_str = NULL;
    PyObject *conversion_str = NULL;
    PyObject *tuple = NULL;

    status = MarkupIterator_next(&it->it_markup, &literal, &field_present,
                                 &field_name, &format_spec, &conversion,
                                 &format_spec_needs_expanding);

    /* all of the SubString objects point into it->str, so no
       memory management needs to be done on them */
    assert(0 <= status && status <= 2);
    if (status == 0 || status == 1)
        /* if 0, error has already been set, if 1, iterator is empty */
        return NULL;

    literal_str = SubString_new_object(&literal);
    if (literal_str == NULL)
        goto done;

    field_name_str = SubString_new_object(&field_name);
    if (field_name_str == NULL)
        goto done;

    /* when a field is present, format_spec is always returned as a
       string (even if zero length); otherwise the plain constructor
       is used */
    if (field_present)
        format_spec_str = SubString_new_object_or_empty(&format_spec);
    else
        format_spec_str = SubString_new_object(&format_spec);
    if (format_spec_str == NULL)
        goto done;

    /* if the conversion is not specified, return a None,
       otherwise create a one length string with the conversion
       character */
    if (conversion != '\0') {
        conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                                   &conversion, 1);
    }
    else {
        conversion_str = Py_None;
        Py_INCREF(conversion_str);
    }
    if (conversion_str == NULL)
        goto done;

    tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str,
                         conversion_str);

done:
    /* the tuple (if built) holds its own references to the members */
    Py_XDECREF(literal_str);
    Py_XDECREF(field_name_str);
    Py_XDECREF(format_spec_str);
    Py_XDECREF(conversion_str);
    return tuple;
}

static PyMethodDef formatteriter_methods[] = {
1059
    {NULL,              NULL}           /* sentinel */
1060 1061
};

1062
static PyTypeObject PyFormatterIter_Type = {
1063
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1064 1065 1066
    "formatteriterator",                /* tp_name */
    sizeof(formatteriterobject),        /* tp_basicsize */
    0,                                  /* tp_itemsize */
1067
    /* methods */
1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091
    (destructor)formatteriter_dealloc,  /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    0,                                  /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)formatteriter_next,   /* tp_iternext */
    formatteriter_methods,              /* tp_methods */
1092 1093 1094 1095 1096 1097 1098 1099
    0,
};

/* unicode_formatter_parser is used to implement
   string.Formatter.vformat.  it takes a str and returns a
   formatteriterator over it; each step of that iterator yields a
   tuple describing one parsed element.  It's a wrapper around
   stringlib/string_format.h's MarkupIterator.

   raises TypeError if `self` is not a str.  returns NULL on error. */
static PyObject *
formatter_parser(PyObject *ignored, PyObject *self)
{
    formatteriterobject *parser;

    if (!PyUnicode_Check(self)) {
        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
        return NULL;
    }
    if (PyUnicode_READY(self) == -1)
        return NULL;

    parser = PyObject_New(formatteriterobject, &PyFormatterIter_Type);
    if (parser == NULL)
        return NULL;

    /* the iterator holds its own reference to the string; it is
       released in formatteriter_dealloc */
    Py_INCREF(self);
    parser->str = self;

    /* initialize the contained MarkupIterator over the whole string */
    MarkupIterator_init(&parser->it_markup, self, 0,
                        PyUnicode_GET_LENGTH(self));

    return (PyObject *)parser;
}


/************************************************************************/
/*********** fieldnameiterator ******************************************/
/************************************************************************/


/* This is used to implement string.Formatter.vparse().  It parses the
   field name into attribute and item values.  It's a Python-callable
   wrapper around FieldNameIterator */

typedef struct {
    PyObject_HEAD
1137
    PyObject *str;
1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163
    FieldNameIterator it_field;
} fieldnameiterobject;

/* destructor: drop the reference to the field-name string (if any),
   then free the iterator object itself */
static void
fieldnameiter_dealloc(fieldnameiterobject *it)
{
    PyObject *held = it->str;

    Py_XDECREF(held);
    PyObject_FREE(it);
}

/* returns a tuple:
   (is_attr, value)
   is_attr is true if we used attribute syntax (e.g., '.foo')
              false if we used index syntax (e.g., '[foo]')
   value is an integer or string

   returns NULL if FieldNameIterator_next reports an error (0) or that
   the iterator is empty (1), or if building the tuple fails
*/
static PyObject *
fieldnameiter_next(fieldnameiterobject *it)
{
    SubString name;
    Py_ssize_t idx;
    int is_attr;
    int status;
    PyObject *is_attr_obj = NULL;
    PyObject *value = NULL;
    PyObject *pair = NULL;

    status = FieldNameIterator_next(&it->it_field, &is_attr,
                                    &idx, &name);
    if (status == 0 || status == 1)
        /* if 0, error has already been set, if 1, iterator is empty */
        return NULL;

    is_attr_obj = PyBool_FromLong(is_attr);
    if (is_attr_obj == NULL)
        goto done;

    /* either an integer or a string: idx == -1 selects the string */
    value = (idx != -1) ? PyLong_FromSsize_t(idx)
                        : SubString_new_object(&name);
    if (value == NULL)
        goto done;

    /* return a tuple of values */
    pair = PyTuple_Pack(2, is_attr_obj, value);

done:
    /* the tuple (if built) holds its own references to the members */
    Py_XDECREF(is_attr_obj);
    Py_XDECREF(value);
    return pair;
}

static PyMethodDef fieldnameiter_methods[] = {
1195
    {NULL,              NULL}           /* sentinel */
1196 1197 1198 1199
};

static PyTypeObject PyFieldNameIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1200 1201 1202
    "fieldnameiterator",                /* tp_name */
    sizeof(fieldnameiterobject),        /* tp_basicsize */
    0,                                  /* tp_itemsize */
1203
    /* methods */
1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227
    (destructor)fieldnameiter_dealloc,  /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    0,                                  /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)fieldnameiter_next,   /* tp_iternext */
    fieldnameiter_methods,              /* tp_methods */
1228 1229 1230
    0};

/* unicode_formatter_field_name_split is used to implement
   string.Formatter.vformat.  it takes a PEP 3101 "field name", and
   returns a tuple of (first, rest): "first", the part before the
   first '.' or '['; and "rest", an iterator for the rest of the field
   name.  it's a wrapper around stringlib/string_format.h's
   field_name_split.  The iterator it returns is a
   FieldNameIterator.

   raises TypeError if `self` is not a str.  returns NULL on error. */
static PyObject *
formatter_field_name_split(PyObject *ignored, PyObject *self)
{
    SubString first;
    Py_ssize_t first_idx;
    fieldnameiterobject *rest;
    PyObject *first_obj = NULL;
    PyObject *pair = NULL;

    if (!PyUnicode_Check(self)) {
        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
        return NULL;
    }
    if (PyUnicode_READY(self) == -1)
        return NULL;

    rest = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type);
    if (rest == NULL)
        return NULL;

    /* the iterator holds its own reference to the string.  this is
       just to keep the field_name alive */
    Py_INCREF(self);
    rest->str = self;

    /* Pass in auto_number = NULL. We'll return an empty string for
       first_obj in that case. */
    if (field_name_split(self, 0, PyUnicode_GET_LENGTH(self),
                         &first, &first_idx, &rest->it_field, NULL)) {
        /* first becomes an integer, if possible; else a string */
        if (first_idx != -1)
            first_obj = PyLong_FromSsize_t(first_idx);
        else
            first_obj = SubString_new_object(&first);

        /* return a tuple of values */
        if (first_obj != NULL)
            pair = PyTuple_Pack(2, first_obj, (PyObject *)rest);
    }

    /* on success the tuple holds its own references; on failure these
       are the only references and the objects are released here */
    Py_XDECREF(rest);
    Py_XDECREF(first_obj);
    return pair;
}