punycode.py 6.72 KB
Newer Older
1 2
""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Löwis.
"""

import codecs

##################### Encoding #####################################

def segregate(str):
    """3.1 Basic code point segregation

    Split *str* into its basic (ordinal < 128) and extended characters.

    Returns a pair ``(base, extended)``: *base* is a bytes object holding
    the basic characters in their original order, and *extended* is a
    sorted list of the distinct non-basic characters.
    """
    base = bytearray()
    extended = set()
    for c in str:
        if ord(c) < 128:
            base.append(ord(c))
        else:
            # A set keeps each extended character only once.
            extended.add(c)
    extended = sorted(extended)
    return bytes(base), extended
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66

def selective_len(str, max):
    """Count the characters of str whose ordinal is strictly below max."""
    return sum(1 for ch in str if ord(ch) < max)

def selective_find(str, char, index, pos):
    """Locate the next occurrence of char in str after position pos.

    Returns a pair (index, pos). pos is the position of the match in the
    full string; index is its position when counting only characters
    that compare less than or equal to char. The index/pos arguments
    give the state of the previous match (the search resumes at pos+1).
    Returns (-1, -1) when the end of the string is reached without a
    match.
    """
    end = len(str)
    pos += 1
    while pos != end:
        current = str[pos]
        if current == char:
            return index + 1, pos
        if current < char:
            # Smaller characters occupy a slot in the selective index.
            index += 1
        pos += 1
    return (-1, -1)

def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    For every character in *extended* (expected in ascending order),
    walk through all of its occurrences in *str* and produce the delta
    that encodes each insertion.

    Returns the list of deltas.
    """
    oldchar = 0x80
    result = []
    oldindex = -1
    for c in extended:
        index = pos = -1
        char = ord(c)
        # Number of characters already present that are smaller than c.
        curlen = selective_len(str, char)
        delta = (curlen+1) * (char - oldchar)
        while True:
            index, pos = selective_find(str, c, index, pos)
            if index == -1:
                break
            delta += index - oldindex
            result.append(delta-1)
            oldindex = index
            delta = 0
        oldchar = char

    return result

def T(j, bias):
    """Return the threshold for digit position j, clamped to [1, 26]."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    threshold = 36 * (j + 1) - bias
    return min(26, max(1, threshold))

# Digit alphabet: index 0-25 map to b"a"-b"z", index 26-35 to b"0"-b"9".
digits = b"abcdefghijklmnopqrstuvwxyz0123456789"
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Encode the non-negative integer *N* as a sequence of digits, using
    the thresholds given by T(j, bias) for each digit position j.

    Returns the encoded digits as a bytes object.
    """
    result = bytearray()
    j = 0
    while True:
        t = T(j, bias)
        if N < t:
            # A digit below the threshold terminates the number.
            result.append(digits[N])
            return bytes(result)
        result.append(digits[t + ((N - t) % (36 - t))])
        N = (N - t) // (36 - t)
        j += 1

def adapt(delta, first, numchars):
    """Compute the next bias from the delta that was just processed.

    delta is scaled down by 700 (damp) when first is true, otherwise
    by 2, then increased by delta // numchars before being reduced
    into the bias value.
    """
    delta //= 700 if first else 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    divisions = 0
    while delta > 455:
        delta //= 35  # base - tmin
        divisions += 36
    return divisions + (36 * delta // (delta + 38))
Tim Peters's avatar
Tim Peters committed
104

105 106 107 108

def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode every delta in *deltas* as a generalized variable-length
    integer, adapting the bias after each one. *baselen* is the number
    of basic characters that precede the encoded part.

    Returns the concatenated encoding as a bytes object.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    result = bytearray()
    bias = 72
    for points, delta in enumerate(deltas):
        s = generate_generalized_integer(delta, bias)
        result.extend(s)
        # Only the very first delta is damped (points == 0).
        bias = adapt(delta, points==0, baselen+points+1)
    return bytes(result)
116 117 118 119 120 121

def punycode_encode(text):
    """Encode the string *text* and return the result as bytes.

    When *text* contains basic (ASCII) characters, they come first and
    are separated from the encoded extended part by b"-"; otherwise only
    the encoded extended part is returned.
    """
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + b"-" + extended
    return extended

##################### Decoding #####################################

def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one generalized variable-length integer from the string
    *extended*, starting at index *extpos*.

    Only the digits "A"-"Z" (values 0-25) and "0"-"9" (values 26-35)
    are recognised, so the caller is expected to pass upper-cased input.

    Returns a pair ``(new_extpos, value)``. *value* is None when the
    input ends prematurely or contains an invalid digit and *errors* is
    not "strict".

    Raises UnicodeError in "strict" mode for truncated input or an
    invalid digit.
    """
    result = 0
    w = 1
    j = 0
    while True:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punicode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos already points past the offending character, so
            # step back by one to report the right code point (and to
            # avoid an IndexError when it is the last character).
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos-1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            return extpos, result
        w = w * (36 - t)
        j += 1
Tim Peters's avatar
Tim Peters committed
155

156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173

def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding

    Decode the deltas in the string *extended* one at a time and insert
    the resulting characters into the string *base*.

    Returns the resulting string. If a delta cannot be decoded and
    *errors* is not "strict", the string built so far is returned.

    Raises UnicodeError in "strict" mode when a decoded code point
    exceeds U+10FFFF (and for whatever decode_generalized_number raises).
    """
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta+1
        # Each full pass over the (len(base) + 1) insertion slots
        # advances the code point by one.
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + chr(char) + base[pos:]
        # The first delta (extpos == 0) is the only one that is damped.
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base

def punycode_decode(text, errors):
    """Decode punycode *text* and return the resulting string.

    *text* may be a str (encoded to ASCII first), a memoryview
    (converted to bytes) or a bytes-like object providing rfind().
    Everything before the last b"-" is the basic part; the remainder is
    the encoded extended part, which is upper-cased before decoding.
    *errors* is applied to the basic part and passed on to
    insertion_sort().
    """
    if isinstance(text, str):
        text = text.encode("ascii")
    if isinstance(text, memoryview):
        text = bytes(text)
    pos = text.rfind(b"-")
    if pos == -1:
        # No delimiter: the whole input is the extended part.
        base = ""
        extended = str(text, "ascii").upper()
    else:
        base = str(text[:pos], "ascii", errors)
        extended = str(text[pos+1:], "ascii").upper()
    return insertion_sort(base, extended, errors)
Tim Peters's avatar
Tim Peters committed
195

196 197 198 199
### Codec APIs

class Codec(codecs.Codec):
    """Stateless punycode encoder/decoder."""

    def encode(self, input, errors='strict'):
        """Encode *input*; return ``(encoded bytes, len(input))``.

        *errors* is accepted for interface compatibility but not used.
        """
        res = punycode_encode(input)
        return res, len(input)

    def decode(self, input, errors='strict'):
        """Decode *input*; return ``(decoded string, len(input))``.

        Raises UnicodeError when *errors* is not one of 'strict',
        'replace' or 'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+errors)
        res = punycode_decode(input, errors)
        return res, len(input)

210 211 212 213 214 215
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that encodes each input chunk on its own."""
    def encode(self, input, final=False):
        # No state is kept between calls and *final* is not consulted:
        # every chunk is passed straight to punycode_encode().
        return punycode_encode(input)

class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that decodes each input chunk on its own."""

    def decode(self, input, final=False):
        """Decode *input* using the error mode stored in ``self.errors``.

        *final* is not consulted. Raises UnicodeError when
        ``self.errors`` is not one of 'strict', 'replace' or 'ignore'.
        """
        if self.errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+self.errors)
        return punycode_decode(input, self.errors)
219

220 221 222 223 224 225 226 227 228
class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer built from Codec and codecs.StreamWriter; adds nothing of its own."""
    pass

class StreamReader(Codec,codecs.StreamReader):
    """Stream reader built from Codec and codecs.StreamReader; adds nothing of its own."""
    pass

### encodings module API

def getregentry():
    """Return the codecs.CodecInfo entry describing the 'punycode' codec."""
    return codecs.CodecInfo(
        name='punycode',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )