Patch #1359618: Speed up charmap encoder.

3f767795 · Martin v. Löwis · 67966bed · 3f767795 · 3f767795 · 3f767795
Commit 3f767795 authored Jun 04, 2006 by Martin v. Löwis
51 changed files
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -650,6 +650,11 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
    const char *errors		/* error handling */
    );
+PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
+    PyObject* string            /* 256 character map */
+   );
 /* --- UTF-7 Codecs ------------------------------------------------------- */
 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(

--- a/Lib/encodings/cp037.py
+++ b/Lib/encodings/cp037.py
--- a/Lib/encodings/cp1006.py
+++ b/Lib/encodings/cp1006.py
--- a/Lib/encodings/cp1026.py
+++ b/Lib/encodings/cp1026.py
--- a/Lib/encodings/cp1140.py
+++ b/Lib/encodings/cp1140.py
--- a/Lib/encodings/cp1250.py
+++ b/Lib/encodings/cp1250.py
--- a/Lib/encodings/cp1251.py
+++ b/Lib/encodings/cp1251.py
--- a/Lib/encodings/cp1252.py
+++ b/Lib/encodings/cp1252.py
--- a/Lib/encodings/cp1253.py
+++ b/Lib/encodings/cp1253.py
--- a/Lib/encodings/cp1254.py
+++ b/Lib/encodings/cp1254.py
--- a/Lib/encodings/cp1255.py
+++ b/Lib/encodings/cp1255.py
--- a/Lib/encodings/cp1256.py
+++ b/Lib/encodings/cp1256.py
--- a/Lib/encodings/cp1257.py
+++ b/Lib/encodings/cp1257.py
--- a/Lib/encodings/cp1258.py
+++ b/Lib/encodings/cp1258.py
--- a/Lib/encodings/cp424.py
+++ b/Lib/encodings/cp424.py
--- a/Lib/encodings/cp500.py
+++ b/Lib/encodings/cp500.py
--- a/Lib/encodings/cp856.py
+++ b/Lib/encodings/cp856.py
--- a/Lib/encodings/cp874.py
+++ b/Lib/encodings/cp874.py
--- a/Lib/encodings/cp875.py
+++ b/Lib/encodings/cp875.py
--- a/Lib/encodings/iso8859_1.py
+++ b/Lib/encodings/iso8859_1.py
--- a/Lib/encodings/iso8859_10.py
+++ b/Lib/encodings/iso8859_10.py
--- a/Lib/encodings/iso8859_11.py
+++ b/Lib/encodings/iso8859_11.py
--- a/Lib/encodings/iso8859_13.py
+++ b/Lib/encodings/iso8859_13.py
--- a/Lib/encodings/iso8859_14.py
+++ b/Lib/encodings/iso8859_14.py
--- a/Lib/encodings/iso8859_15.py
+++ b/Lib/encodings/iso8859_15.py
--- a/Lib/encodings/iso8859_16.py
+++ b/Lib/encodings/iso8859_16.py
--- a/Lib/encodings/iso8859_2.py
+++ b/Lib/encodings/iso8859_2.py
--- a/Lib/encodings/iso8859_3.py
+++ b/Lib/encodings/iso8859_3.py
--- a/Lib/encodings/iso8859_4.py
+++ b/Lib/encodings/iso8859_4.py
--- a/Lib/encodings/iso8859_5.py
+++ b/Lib/encodings/iso8859_5.py
--- a/Lib/encodings/iso8859_6.py
+++ b/Lib/encodings/iso8859_6.py
--- a/Lib/encodings/iso8859_7.py
+++ b/Lib/encodings/iso8859_7.py
--- a/Lib/encodings/iso8859_8.py
+++ b/Lib/encodings/iso8859_8.py
--- a/Lib/encodings/iso8859_9.py
+++ b/Lib/encodings/iso8859_9.py
--- a/Lib/encodings/koi8_r.py
+++ b/Lib/encodings/koi8_r.py
--- a/Lib/encodings/koi8_u.py
+++ b/Lib/encodings/koi8_u.py
--- a/Lib/encodings/mac_centeuro.py
+++ b/Lib/encodings/mac_centeuro.py
--- a/Lib/encodings/mac_croatian.py
+++ b/Lib/encodings/mac_croatian.py
--- a/Lib/encodings/mac_cyrillic.py
+++ b/Lib/encodings/mac_cyrillic.py
--- a/Lib/encodings/mac_farsi.py
+++ b/Lib/encodings/mac_farsi.py
--- a/Lib/encodings/mac_greek.py
+++ b/Lib/encodings/mac_greek.py
--- a/Lib/encodings/mac_iceland.py
+++ b/Lib/encodings/mac_iceland.py
--- a/Lib/encodings/mac_roman.py
+++ b/Lib/encodings/mac_roman.py
--- a/Lib/encodings/mac_romanian.py
+++ b/Lib/encodings/mac_romanian.py
--- a/Lib/encodings/mac_turkish.py
+++ b/Lib/encodings/mac_turkish.py
--- a/Lib/encodings/tis_620.py
+++ b/Lib/encodings/tis_620.py
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -104,6 +104,9 @@ Extension Modules
 Library
 -------
+- Patch #1359618: Speed up charmap encoder by using a trie structure
+  for lookup.
 - The functions in the ``pprint`` module now sort dictionaries by key
  before computing the display.  Before 2.5, ``pprint`` sorted a dictionary
  if and only if its display required more than one line, although that

--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -792,6 +792,15 @@ charmap_encode(PyObject *self,
    return v;
 }
+static PyObject*
+charmap_build(PyObject *self, PyObject *args)
+{
+    PyObject *map;
+    if (!PyArg_ParseTuple(args, "U:charmap_build", &map))
+        return NULL;
+    return PyUnicode_BuildEncodingMap(map);
+}
 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
 static PyObject *
@@ -897,6 +906,7 @@ static PyMethodDef _codecs_functions[] = {
    {"ascii_decode", 		ascii_decode,			METH_VARARGS},
    {"charmap_encode", 		charmap_encode,			METH_VARARGS},
    {"charmap_decode", 		charmap_decode,			METH_VARARGS},
+    {"charmap_build", 		charmap_build,			METH_VARARGS},
    {"readbuffer_encode",	readbuffer_encode,		METH_VARARGS},
    {"charbuffer_encode",	charbuffer_encode,		METH_VARARGS},
 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
--- a/Tools/unicode/Makefile
+++ b/Tools/unicode/Makefile
@@ -78,7 +78,7 @@ cjk:	build/
 ### Cleanup
 clean:
-	$(RM) build/*
+	$(RM) -f build/*
 distclean:	clean
 	$(RM) -rf MAPPINGS/
--- a/Tools/unicode/gencodec.py
+++ b/Tools/unicode/gencodec.py
@@ -270,6 +270,11 @@ def codegen(name, map, encodingname, comments=1):
        comments=comments,
        precisions=(4, 2))
+    if decoding_table_code:
+        suffix = 'table'
+    else:
+        suffix = 'map'
    l = [
        '''\
 """ Python Character Mapping Codec %s generated from '%s' with gencodec.py.
@@ -283,30 +288,20 @@ import codecs
 class Codec(codecs.Codec):
    def encode(self,input,errors='strict'):
-        return codecs.charmap_encode(input,errors,encoding_map)
+        return codecs.charmap_encode(input,errors,encoding_%s)
-    def decode(self,input,errors='strict'):''' % (encodingname, name)
-        ]
-    if decoding_table_code:
-        l.append('''\
-        return codecs.charmap_decode(input,errors,decoding_table)''')
-    else:
-        l.append('''\
-        return codecs.charmap_decode(input,errors,decoding_map)''')
-    l.append('''
+    def decode(self,input,errors='strict'):
+        return codecs.charmap_decode(input,errors,decoding_%s)
+''' % (encodingname, name, suffix, suffix)]
+    l.append('''\
 class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
-        return codecs.charmap_encode(input,self.errors,encoding_map)[0]
+        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]
 class IncrementalDecoder(codecs.IncrementalDecoder):
-    def decode(self, input, final=False):''')
+    def decode(self, input, final=False):
-    if decoding_table_code:
+        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' %
-        l.append('''\
+        (suffix, suffix))
-        return codecs.charmap_decode(input,self.errors,decoding_table)[0]''')
-    else:
-        l.append('''\
-        return codecs.charmap_decode(input,self.errors,decoding_map)[0]''')
    l.append('''
 class StreamWriter(Codec,codecs.StreamWriter):
@@ -319,13 +314,13 @@ class StreamReader(Codec,codecs.StreamReader):
 def getregentry():
    return codecs.CodecInfo(
-        Codec().encode,
-        Codec().decode,
        name=%r,
-        streamwriter=StreamWriter,
+        encode=Codec().encode,
-        streamreader=StreamReader,
+        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
+        streamreader=StreamReader,
+        streamwriter=StreamWriter,
    )
 ''' % encodingname.replace('_', '-'))
@@ -342,10 +337,16 @@ def getregentry():
        l.extend(decoding_table_code)
    # Add encoding map
-    l.append('''
+    if decoding_table_code:
+        l.append('''
+### Encoding table
+encoding_table=codecs.charmap_build(decoding_table)
+''')
+    else:
+        l.append('''
 ### Encoding Map
 ''')
-    l.extend(encoding_map_code)
+        l.extend(encoding_map_code)
    # Final new-line
    l.append('')