Issue #16147: Rewrite PyUnicode_FromFormatV() to use _PyUnicodeWriter API

* Simplify the code: replace 4 steps with a single step using the _PyUnicodeWriter API. PyUnicode_Format() has the same design. This avoids storing intermediate results, which required allocating an array of pointers on the heap. * Use the _PyUnicodeWriter API for speed (and for its convenient API): overallocate the buffer to reduce the number of "realloc()" calls. * Implement "width" and "precision" in Python, don't rely on sprintf(). This avoids the need for a temporary buffer allocated on the heap: only a small buffer allocated on the stack is used. * Add the _PyUnicodeWriter_WriteCstr() function. * Split PyUnicode_FromFormatV() into two functions: add unicode_fromformat_arg(). * Inline parse_format_flags(): the format of an argument is now parsed only once, so a subfunction is no longer needed. * Optimize PyUnicode_FromFormatV() for characters between two "%" arguments: search for the next "%" and copy the substring in one chunk, instead of copying character by character.

Issue #16147: Rewrite PyUnicode_FromFormatV() to use _PyUnicodeWriter API
* Simplify the code: replace 4 steps with a single step using the _PyUnicodeWriter API. PyUnicode_Format() has the same design. This avoids storing intermediate results, which required allocating an array of pointers on the heap. * Use the _PyUnicodeWriter API for speed (and for its convenient API): overallocate the buffer to reduce the number of "realloc()" calls. * Implement "width" and "precision" in Python, don't rely on sprintf(). This avoids the need for a temporary buffer allocated on the heap: only a small buffer allocated on the stack is used. * Add the _PyUnicodeWriter_WriteCstr() function. * Split PyUnicode_FromFormatV() into two functions: add unicode_fromformat_arg(). * Inline parse_format_flags(): the format of an argument is now parsed only once, so a subfunction is no longer needed. * Optimize PyUnicode_FromFormatV() for characters between two "%" arguments: search for the next "%" and copy the substring in one chunk, instead of copying character by character.
e215d960 · Victor Stinner · 2a09b6e8 · e215d960 · e215d960 · e215d960
Kaydet (Commit) e215d960 authored Eki 06, 2012 tarafından Victor Stinner
Expand all Hide whitespace changes
Inline Side-by-side

Showing with 33 additions and 1 deletion

unicodeobject.h Include/unicodeobject.h +17 -1

test_unicode.py Lib/test/test_unicode.py +16 -0

unicodeobject.c Objects/unicodeobject.c +0 -0

No files found.
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -933,12 +933,28 @@ PyAPI_FUNC(int)
 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar);

+/* Append a Unicode string.
+   Return 0 on success, raise an exception and return -1 on error. */
 PyAPI_FUNC(int)
-_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str);
+_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
+    PyObject *str               /* Unicode string */
+    );

+/* Append a latin1-encoded byte string.
+   Return 0 on success, raise an exception and return -1 on error. */
+PyAPI_FUNC(int)
+_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer,
+    const char *str,            /* latin1-encoded byte string */
+    Py_ssize_t len              /* length in bytes */
+    );
+
+/* Get the value of the writer as a Unicode string. Clear the
+   buffer of the writer. Raise an exception and return NULL
+   on error. */
 PyAPI_FUNC(PyObject *)
 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);

+/* Deallocate memory of a writer (clear its internal buffer). */
 PyAPI_FUNC(void)
 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
 #endif

--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1769,6 +1769,22 @@ class UnicodeTest(string_tests.CommonTest,
        self.assertEqual(PyUnicode_FromFormat(b'%llu', c_ulonglong(123)), '123')
        self.assertEqual(PyUnicode_FromFormat(b'%zu', c_size_t(123)), '123')

+        # test padding (width and/or precision)
+        self.assertEqual(PyUnicode_FromFormat(b'%010i', c_int(123)), '123'.rjust(10, '0'))
+        self.assertEqual(PyUnicode_FromFormat(b'%100i', c_int(123)), '123'.rjust(100))
+        self.assertEqual(PyUnicode_FromFormat(b'%.100i', c_int(123)), '123'.rjust(100, '0'))
+        self.assertEqual(PyUnicode_FromFormat(b'%100.80i', c_int(123)), '123'.rjust(80, '0').rjust(100))
+
+        self.assertEqual(PyUnicode_FromFormat(b'%010u', c_uint(123)), '123'.rjust(10, '0'))
+        self.assertEqual(PyUnicode_FromFormat(b'%100u', c_uint(123)), '123'.rjust(100))
+        self.assertEqual(PyUnicode_FromFormat(b'%.100u', c_uint(123)), '123'.rjust(100, '0'))
+        self.assertEqual(PyUnicode_FromFormat(b'%100.80u', c_uint(123)), '123'.rjust(80, '0').rjust(100))
+
+        self.assertEqual(PyUnicode_FromFormat(b'%010x', c_int(0x123)), '123'.rjust(10, '0'))
+        self.assertEqual(PyUnicode_FromFormat(b'%100x', c_int(0x123)), '123'.rjust(100))
+        self.assertEqual(PyUnicode_FromFormat(b'%.100x', c_int(0x123)), '123'.rjust(100, '0'))
+        self.assertEqual(PyUnicode_FromFormat(b'%100.80x', c_int(0x123)), '123'.rjust(80, '0').rjust(100))
+
        # test %A
        text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
        self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'")

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c