punycode.py 6.68 KB
Newer Older
# -*- coding: utf-8 -*-
""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Löwis.
"""

import codecs

##################### Encoding #####################################

def segregate(str):
    """3.1 Basic code point segregation.

    Split *str* into its basic and extended parts.

    Returns a pair ``(base, extended)``: ``base`` is a ``bytes`` object
    holding every code point below 128 in its original order, and
    ``extended`` is a sorted list of the distinct characters whose code
    point is 128 or above.
    """
    base = bytearray()
    extended = set()
    for c in str:
        code = ord(c)
        if code < 128:
            base.append(code)
        else:
            # A set keeps each extended character once; repeated
            # occurrences are located later by position.
            extended.add(c)
    return bytes(base), sorted(extended)
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67

def selective_len(str, max):
    """Count the characters of str whose code point is strictly below max."""
    return sum(1 for ch in str if ord(ch) < max)

def selective_find(str, char, index, pos):
    """Locate the next occurrence of char in str after position pos.

    Scanning starts at pos + 1. Every character that compares below
    char bumps the running index by one; when char itself is met, the
    pair (index + 1, position) is returned. If the end of str is
    reached first, (-1, -1) is returned.
    """
    end = len(str)
    pos += 1
    while pos != end:
        current = str[pos]
        if current == char:
            return index + 1, pos
        if current < char:
            # Smaller characters count towards the selective index.
            index += 1
        pos += 1
    return (-1, -1)

def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding.

    str is the full text and extended is the iterable of extended
    characters to process, visited in the order given.

    Returns a list of integer deltas, one per occurrence in str of each
    character from extended.
    """
    oldchar = 0x80
    result = []
    oldindex = -1
    for c in extended:
        index = pos = -1
        char = ord(c)
        # Number of characters in str with a code point below this one.
        curlen = selective_len(str, char)
        delta = (curlen+1) * (char - oldchar)
        while 1:
            index, pos = selective_find(str, c, index, pos)
            if index == -1:
                # No further occurrence of c in str.
                break
            delta += index - oldindex
            result.append(delta-1)
            oldindex = index
            delta = 0
        oldchar = char

    return result

def T(j, bias):
    """Return the threshold for digit position j, clamped to [1, 26]."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    threshold = 36 * (j + 1) - bias
    return max(min(threshold, 26), 1)

78
# Digit alphabet indexed by digit value: a-z for 0-25, 0-9 for 26-35.
digits = b"abcdefghijklmnopqrstuvwxyz0123456789"
79 80
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers.

    Encode the non-negative integer N as a ``bytes`` object of digits
    taken from the module-level ``digits`` table, using the threshold
    T(j, bias) for each successive digit position j.
    """
    result = bytearray()
    j = 0
    while 1:
        t = T(j, bias)
        if N < t:
            # A digit below the threshold terminates the number.
            result.append(digits[N])
            return bytes(result)
        N, remainder = divmod(N - t, 36 - t)
        result.append(digits[t + remainder])
        j += 1

def adapt(delta, first, numchars):
    """Compute the next bias from the delta that was just processed.

    delta is scaled down by 700 when first is true and by 2 otherwise,
    then increased by its own quotient with numchars before being
    reduced to a bias value.
    """
    delta //= 700 if first else 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    divisions = 0
    while delta > 455:
        delta //= 35  # base - tmin
        divisions += 36
    return divisions + (36 * delta // (delta + 38))
Tim Peters's avatar
Tim Peters committed
105

106 107 108 109

def generate_integers(baselen, deltas):
    """3.4 Bias adaptation.

    Encode each delta with generate_generalized_integer and concatenate
    the results into one ``bytes`` object, re-adapting the bias after
    every delta. baselen is the number of basic code points and is used
    when computing the character count passed to adapt.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    result = bytearray()
    bias = 72
    for points, delta in enumerate(deltas):
        result.extend(generate_generalized_integer(delta, bias))
        # Only the very first delta is adapted with first=True.
        bias = adapt(delta, points == 0, baselen + points + 1)
    return bytes(result)
117 118 119 120 121 122

def punycode_encode(text):
    """Encode the string text and return the result as ``bytes``.

    The basic characters come first, followed by a ``-`` delimiter and
    the encoded deltas. When text has no basic characters, only the
    encoded deltas are returned, without a delimiter.
    """
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + b"-" + extended
    return extended

##################### Decoding #####################################

def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers.

    Decode one number from the string extended, starting at index
    extpos. Accepted digits are A-Z (values 0-25) and 0-9 (values
    26-35).

    Returns a pair (newpos, value), where newpos is the index just past
    the consumed digits. When the input ends early or holds an invalid
    digit and errors is not "strict", value is None.

    Raises UnicodeError in "strict" mode for truncated input or an
    invalid digit.
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punicode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos was already advanced, so the offending character
            # is the one just before it.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos-1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            # A digit below the threshold ends the number.
            return extpos, result
        w = w * (36 - t)
        j += 1
Tim Peters's avatar
Tim Peters committed
156

157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174

def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding.

    Decode the numbers held in the string extended and insert the
    resulting characters into the string base.

    Returns the completed string. If a number cannot be decoded (and
    errors is not "strict"), the characters inserted so far are
    returned.

    Raises UnicodeError in "strict" mode when a decoded code point
    exceeds U+10FFFF; otherwise "?" is used for that character.
    """
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta+1
        # Each full pass over the len(base) + 1 insertion slots moves on
        # to the next code point.
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + chr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base

def punycode_decode(text, errors):
    """Decode text and return the resulting string.

    text may be a str (encoded to ASCII first), a memoryview (copied to
    bytes), or an object offering ``rfind`` and slicing such as bytes.
    Everything before the last ``-`` is the basic part; everything
    after it is upper-cased and decoded as the extended part. Without
    a ``-``, the whole input is the extended part.

    errors is applied when decoding the basic part and is passed on to
    insertion_sort.
    """
    if isinstance(text, str):
        text = text.encode("ascii")
    elif isinstance(text, memoryview):
        # memoryview has no rfind(); work on a bytes copy instead.
        text = bytes(text)
    pos = text.rfind(b"-")
    if pos == -1:
        base = ""
        extended = str(text, "ascii").upper()
    else:
        base = str(text[:pos], "ascii", errors)
        extended = str(text[pos+1:], "ascii").upper()
    return insertion_sort(base, extended, errors)
Tim Peters's avatar
Tim Peters committed
194

195 196 197 198
### Codec APIs

class Codec(codecs.Codec):
    """Stateless encoder/decoder built on punycode_encode/punycode_decode."""

    def encode(self, input, errors='strict'):
        """Encode input; return (encoded bytes, len(input)).

        The errors argument is accepted but not used.
        """
        res = punycode_encode(input)
        return res, len(input)

    def decode(self, input, errors='strict'):
        """Decode input; return (decoded string, len(input)).

        Raises UnicodeError if errors is not one of 'strict',
        'replace' or 'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+errors)
        res = punycode_decode(input, errors)
        return res, len(input)

209 210 211 212 213 214
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that encodes each input in a single step."""

    def encode(self, input, final=False):
        """Return punycode_encode(input); final is not consulted."""
        encoded = punycode_encode(input)
        return encoded

class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that decodes each input in a single step."""

    def decode(self, input, final=False):
        """Return punycode_decode(input, self.errors).

        Raises UnicodeError if self.errors is not one of 'strict',
        'replace' or 'ignore'. final is not consulted.
        """
        if self.errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+self.errors)
        return punycode_decode(input, self.errors)
218

219 220 221 222 223 224 225 226 227
class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer combining this module's Codec with codecs.StreamWriter."""

class StreamReader(Codec,codecs.StreamReader):
    """Stream reader combining this module's Codec with codecs.StreamReader."""

### encodings module API

def getregentry():
    """Return the codecs.CodecInfo entry describing the 'punycode' codec."""
    return codecs.CodecInfo(
        name='punycode',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )