__init__.py 5.03 KB
Newer Older
1 2 3 4 5
""" Standard "encodings" Package

    Standard Python encoding modules are stored in this package
    directory.

    Codec modules must have names corresponding to normalized encoding
    names as defined in the normalize_encoding() function below, e.g.
    'utf-8' must be implemented by the module 'utf_8.py'.

    Each codec module must export the following interface:

    * getregentry() -> codecs.CodecInfo object
    The getregentry() API must return a CodecInfo object with encoder, decoder,
    incrementalencoder, incrementaldecoder, streamwriter and streamreader
    attributes which adhere to the Python Codec Interface Standard.

    In addition, a module may optionally also define the following
    APIs which are then used by the package's codec search function:

    * getaliases() -> sequence of encoding name strings to use as aliases

    Alias names returned by getaliases() must be normalized encoding
    names as defined by normalize_encoding().

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

31
import codecs
32
from . import aliases
33 34

# Maps the encoding name exactly as passed to search_function() to the
# resulting codecs.CodecInfo entry, or to None when the lookup failed.
_cache = {}
# Sentinel used as the default for _cache lookups so that a cached None
# (a remembered failure) can be told apart from "never looked up".
_unknown = '--unknown--'
# fromlist argument handed to __import__() in search_function().
_import_tail = ['*']
# Alias table taken from the aliases module; search_function() adds the
# aliases reported by codec modules to it.
_aliases = aliases.aliases
38

39
class CodecRegistryError(LookupError, SystemError):
    """Raised by search_function() when a codec module's getregentry()
    result cannot be turned into a valid registry entry.

    Derives from both LookupError and SystemError, so handlers written
    for either base class catch it.
    """

42 43 44 45 46 47 48
def normalize_encoding(encoding):

    """ Normalize an encoding name.

        Normalization works as follows: all non-alphanumeric
        characters except the dot used for Python package names are
        collapsed and replaced with a single underscore, e.g. '  -;#'
        becomes '_'. Leading and trailing underscores are removed.

        Note that encoding names should be ASCII only; if they do use
        non-ASCII characters, these must be Latin-1 compatible.

        encoding may be a str or a bytes object; bytes are decoded as
        ASCII first (non-ASCII bytes therefore raise UnicodeDecodeError).
        The normalized name is returned as a str.  Character case is
        left unchanged.

    """
    if isinstance(encoding, bytes):
        encoding = str(encoding, "ascii")

    chars = []
    # True while we are inside a run of separator characters that has
    # not yet been emitted as an underscore.
    punct = False
    for c in encoding:
        if c.isalnum() or c == '.':
            # Emit one underscore for the whole preceding separator run,
            # but only between kept characters: the "and chars" test
            # drops leading runs, and trailing runs are never flushed.
            if punct and chars:
                chars.append('_')
            chars.append(c)
            punct = False
        else:
            punct = True
    return ''.join(chars)
68

69
def search_function(encoding):
    """Codec search function of the "encodings" package.

    Looks up the codec module for *encoding* inside the ``encodings``
    package and returns its registry entry.

    The name is normalized with normalize_encoding() and resolved
    through the alias table; the aliased module name is tried first,
    then the normalized name itself.  Candidate names that are empty or
    contain a dot are skipped.

    Returns a codecs.CodecInfo object, or None when no candidate module
    can be imported or the imported module has no getregentry()
    attribute.  Both outcomes are cached under the *encoding* value
    exactly as passed in, so repeated lookups are answered from the
    cache.

    If getregentry() returns a plain sequence instead of a CodecInfo, it
    must have 4 to 7 items and is converted to a CodecInfo.

    Raises CodecRegistryError when such a sequence has the wrong length
    or holds non-callable codec entries.
    """

    # Cache lookup.  A cached None (remembered failure) is returned as-is;
    # only the _unknown sentinel means "not looked up yet".
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding
    # name and lookup the module using the aliased name, then try to
    # lookup the module using the normalized name itself.  Only the
    # encodings package is searched.
    #
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = _aliases.get(norm_encoding) or \
                       _aliases.get(norm_encoding.replace('.', '_'))
    if aliased_encoding is not None:
        modnames = [aliased_encoding,
                    norm_encoding]
    else:
        modnames = [norm_encoding]
    for modname in modnames:
        if not modname or '.' in modname:
            continue
        try:
            # Import is absolute to prevent the possibly malicious import of a
            # module with side-effects that is not in the 'encodings' package.
            mod = __import__('encodings.' + modname, fromlist=_import_tail,
                             level=0)
        except ImportError:
            pass
        else:
            break
    else:
        # No candidate could be imported.
        mod = None

    try:
        # Also raises AttributeError when mod is None, which funnels the
        # "nothing imported" case into the same miss handling below.
        getregentry = mod.getregentry
    except AttributeError:
        # Not a codec module
        mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = getregentry()
    if not isinstance(entry, codecs.CodecInfo):
        # Legacy sequence form: (encode, decode, streamreader, streamwriter
        # [, incrementalencoder [, incrementaldecoder [, name]]]).
        if not 4 <= len(entry) <= 7:
            raise CodecRegistryError('module "%s" (%s) failed to register'
                                     % (mod.__name__, mod.__file__))
        if not callable(entry[0]) or \
           not callable(entry[1]) or \
           (entry[2] is not None and not callable(entry[2])) or \
           (entry[3] is not None and not callable(entry[3])) or \
           (len(entry) > 4 and entry[4] is not None and not callable(entry[4])) or \
           (len(entry) > 5 and entry[5] is not None and not callable(entry[5])):
            raise CodecRegistryError('incompatible codecs in module "%s" (%s)'
                                     % (mod.__name__, mod.__file__))
        if len(entry) < 7 or entry[6] is None:
            # Pad the optional slots with None and use the module name
            # (without the package prefix) as codec name.  Slicing to the
            # first six items replaces an explicit None name instead of
            # appending an eighth item, which CodecInfo() would reject.
            entry = (tuple(entry[:6]) + (None,) * (6 - len(entry))
                     + (mod.__name__.split(".", 1)[1],))
        entry = codecs.CodecInfo(*entry)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases)
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                # modname is still the name of the module imported above.
                _aliases[alias] = modname

    # Return the registry entry
    return entry

# Register the search_function in the Python codec registry.  This is a
# module-level statement, so the registration happens as a side effect of
# importing this module.
codecs.register(search_function)