base64.py 11 KB
Newer Older
Guido van Rossum's avatar
Guido van Rossum committed
1 2
#! /usr/bin/env python

3
"""RFC 3548: Base16, Base32, Base64 Data Encodings"""
4

5 6
# Modified 04-Oct-1995 by Jack Jansen to use binascii module
# Modified 30-Dec-2003 by Barry Warsaw to add full RFC 3548 support
7

8 9
import re
import struct
10
import binascii
11

12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223

__all__ = [
    # Legacy interface exports traditional RFC 1521 Base64 encodings
    'encode', 'decode', 'encodestring', 'decodestring',
    # Generalized interface for other encodings
    'b64encode', 'b64decode', 'b32encode', 'b32decode',
    'b16encode', 'b16decode',
    # Standard Base64 encoding
    'standard_b64encode', 'standard_b64decode',
    # Some common Base64 alternatives.  As referenced by RFC 3458, see thread
    # starting at:
    #
    # http://zgp.org/pipermail/p2p-hackers/2001-September/000316.html
    'urlsafe_b64encode', 'urlsafe_b64decode',
    ]

_translation = [chr(_x) for _x in range(256)]
EMPTYSTRING = ''


def _translate(s, altchars):
    translation = _translation[:]
    for k, v in altchars.items():
        translation[ord(k)] = v
    return s.translate(''.join(translation))



# Base64 encoding/decoding uses binascii

def b64encode(s, altchars=None):
    """Encode a string using Base64.

    s is the string to encode.  Optional altchars must be a string of at least
    length 2 (additional characters are ignored) which specifies an
    alternative alphabet for the '+' and '/' characters.  This allows an
    application to e.g. generate url or filesystem safe Base64 strings.

    The encoded string is returned.
    """
    # Strip off the trailing newline
    encoded = binascii.b2a_base64(s)[:-1]
    if altchars is not None:
        return _translate(encoded, {'+': altchars[0], '/': altchars[1]})
    return encoded


def b64decode(s, altchars=None):
    """Decode a Base64 encoded string.

    s is the string to decode.  Optional altchars must be a string of at least
    length 2 (additional characters are ignored) which specifies the
    alternative alphabet used instead of the '+' and '/' characters.

    The decoded string is returned.  A TypeError is raised if s were
    incorrectly padded or if there are non-alphabet characters present in the
    string.
    """
    if altchars is not None:
        s = _translate(s, {altchars[0]: '+', altchars[1]: '/'})
    try:
        return binascii.a2b_base64(s)
    except binascii.Error, msg:
        # Transform this exception for consistency
        raise TypeError(msg)


def standard_b64encode(s):
    """Encode a string using the standard Base64 alphabet.

    s is the string to encode.  The encoded string is returned.
    """
    return b64encode(s)

def standard_b64decode(s):
    """Decode a string encoded with the standard Base64 alphabet.

    s is the string to decode.  The decoded string is returned.  A TypeError
    is raised if the string is incorrectly padded or if there are non-alphabet
    characters present in the string.
    """
    return b64decode(s)

def urlsafe_b64encode(s):
    """Encode a string using a url-safe Base64 alphabet.

    s is the string to encode.  The encoded string is returned.  The alphabet
    uses '-' instead of '+' and '_' instead of '/'.
    """
    return b64encode(s, '-_')

def urlsafe_b64decode(s):
    """Decode a string encoded with the standard Base64 alphabet.

    s is the string to decode.  The decoded string is returned.  A TypeError
    is raised if the string is incorrectly padded or if there are non-alphabet
    characters present in the string.

    The alphabet uses '-' instead of '+' and '_' instead of '/'.
    """
    return b64decode(s, '-_')



# Base32 encoding/decoding must be done in Python
_b32alphabet = {
    0: 'A',  9: 'J', 18: 'S', 27: '3',
    1: 'B', 10: 'K', 19: 'T', 28: '4',
    2: 'C', 11: 'L', 20: 'U', 29: '5',
    3: 'D', 12: 'M', 21: 'V', 30: '6',
    4: 'E', 13: 'N', 22: 'W', 31: '7',
    5: 'F', 14: 'O', 23: 'X',
    6: 'G', 15: 'P', 24: 'Y',
    7: 'H', 16: 'Q', 25: 'Z',
    8: 'I', 17: 'R', 26: '2',
    }

_b32tab = [v for v in _b32alphabet.values()]
_b32rev = dict([(v, long(k)) for k, v in _b32alphabet.items()])


def b32encode(s):
    """Encode a string using Base32.

    s is the string to encode.  The encoded string is returned.
    """
    parts = []
    quanta, leftover = divmod(len(s), 5)
    # Pad the last quantum with zero bits if necessary
    if leftover:
        s += ('\0' * (5 - leftover))
        quanta += 1
    for i in range(quanta):
        # c1 and c2 are 16 bits wide, c3 is 8 bits wide.  The intent of this
        # code is to process the 40 bits in units of 5 bits.  So we take the 1
        # leftover bit of c1 and tack it onto c2.  Then we take the 2 leftover
        # bits of c2 and tack them onto c3.  The shifts and masks are intended
        # to give us values of exactly 5 bits in width.
        c1, c2, c3 = struct.unpack('!HHB', s[i*5:(i+1)*5])
        c2 += (c1 & 1) << 16 # 17 bits wide
        c3 += (c2 & 3) << 8  # 10 bits wide
        parts.extend([_b32tab[c1 >> 11],         # bits 1 - 5
                      _b32tab[(c1 >> 6) & 0x1f], # bits 6 - 10
                      _b32tab[(c1 >> 1) & 0x1f], # bits 11 - 15
                      _b32tab[c2 >> 12],         # bits 16 - 20 (1 - 5)
                      _b32tab[(c2 >> 7) & 0x1f], # bits 21 - 25 (6 - 10)
                      _b32tab[(c2 >> 2) & 0x1f], # bits 26 - 30 (11 - 15)
                      _b32tab[c3 >> 5],          # bits 31 - 35 (1 - 5)
                      _b32tab[c3 & 0x1f],        # bits 36 - 40 (1 - 5)
                      ])
    encoded = EMPTYSTRING.join(parts)
    # Adjust for any leftover partial quanta
    if leftover == 1:
        return encoded[:-6] + '======'
    elif leftover == 2:
        return encoded[:-4] + '===='
    elif leftover == 3:
        return encoded[:-3] + '==='
    elif leftover == 4:
        return encoded[:-1] + '='
    return encoded


def b32decode(s, casefold=False, map01=None):
    """Decode a Base32 encoded string.

    s is the string to decode.  Optional casefold is a flag specifying whether
    a lowercase alphabet is acceptable as input.  For security purposes, the
    default is False.

    RFC 3548 allows for optional mapping of the digit 0 (zero) to the letter O
    (oh), and for optional mapping of the digit 1 (one) to either the letter I
    (eye) or letter L (el).  The optional argument map01 when not None,
    specifies which letter the digit 1 should be mapped to (when map01 is not
    None, the digit 0 is always mapped to the letter O).  For security
    purposes the default is None, so that 0 and 1 are not allowed in the
    input.

    The decoded string is returned.  A TypeError is raised if s were
    incorrectly padded or if there are non-alphabet characters present in the
    string.
    """
    quanta, leftover = divmod(len(s), 8)
    if leftover:
        raise TypeError('Incorrect padding')
    # Handle section 2.4 zero and one mapping.  The flag map01 will be either
    # False, or the character to map the digit 1 (one) to.  It should be
    # either L (el) or I (eye).
    if map01:
        s = _translate(s, {'0': 'O', '1': map01})
    if casefold:
        s = s.upper()
    # Strip off pad characters from the right.  We need to count the pad
    # characters because this will tell us how many null bytes to remove from
    # the end of the decoded string.
    padchars = 0
    mo = re.search('(?P<pad>[=]*)$', s)
    if mo:
        padchars = len(mo.group('pad'))
        if padchars > 0:
            s = s[:-padchars]
    # Now decode the full quanta
    parts = []
    acc = 0
    shift = 35
    for c in s:
        val = _b32rev.get(c)
        if val is None:
            raise TypeError('Non-base32 digit found')
        acc += _b32rev[c] << shift
        shift -= 5
        if shift < 0:
224
            parts.append(binascii.unhexlify('%010x' % acc))
225 226 227
            acc = 0
            shift = 35
    # Process the last, partial quanta
228 229 230 231
    last = binascii.unhexlify('%010x' % acc)
    if padchars == 0:
        last = ''                       # No characters
    elif padchars == 1:
232 233 234 235 236 237 238
        last = last[:-1]
    elif padchars == 3:
        last = last[:-2]
    elif padchars == 4:
        last = last[:-3]
    elif padchars == 6:
        last = last[:-4]
239
    else:
240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
        raise TypeError('Incorrect padding')
    parts.append(last)
    return EMPTYSTRING.join(parts)



# RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns
# lowercase.  The RFC also recommends against accepting input case
# insensitively.
def b16encode(s):
    """Encode a string using Base16.

    s is the string to encode.  The encoded string is returned.
    """
    return binascii.hexlify(s).upper()


def b16decode(s, casefold=False):
    """Decode a Base16 encoded string.

    s is the string to decode.  Optional casefold is a flag specifying whether
    a lowercase alphabet is acceptable as input.  For security purposes, the
    default is False.

    The decoded string is returned.  A TypeError is raised if s were
    incorrectly padded or if there are non-alphabet characters present in the
    string.
    """
    if casefold:
        s = s.upper()
    if re.search('[^0-9A-F]', s):
        raise TypeError('Non-base16 digit found')
    return binascii.unhexlify(s)



# Legacy interface.  This code could be cleaned up since I don't believe
# binascii has any line length limitations.  It just doesn't seem worth it
# though.
279

280
MAXLINESIZE = 76 # Excluding the CRLF
281
MAXBINSIZE = (MAXLINESIZE//4)*3
282 283

def encode(input, output):
284
    """Encode a file."""
285
    while True:
286
        s = input.read(MAXBINSIZE)
287 288
        if not s:
            break
289 290
        while len(s) < MAXBINSIZE:
            ns = input.read(MAXBINSIZE-len(s))
291 292 293
            if not ns:
                break
            s += ns
294 295
        line = binascii.b2a_base64(s)
        output.write(line)
296

297

298
def decode(input, output):
299
    """Decode a file."""
300
    while True:
301
        line = input.readline()
302 303
        if not line:
            break
304 305
        s = binascii.a2b_base64(line)
        output.write(s)
306

307

308
def encodestring(s):
309
    """Encode a string."""
310 311 312 313 314
    pieces = []
    for i in range(0, len(s), MAXBINSIZE):
        chunk = s[i : i + MAXBINSIZE]
        pieces.append(binascii.b2a_base64(chunk))
    return "".join(pieces)
315

316

317
def decodestring(s):
318
    """Decode a string."""
319
    return binascii.a2b_base64(s)
320

321 322 323


# Useable as a script...
324
def test():
325 326 327 328 329 330 331
    """Small test program"""
    import sys, getopt
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'deut')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
332
        print """usage: %s [-d|-e|-u|-t] [file|-]
333 334
        -d, -u: decode
        -e: encode (default)
335
        -t: encode and decode string 'Aladdin:open sesame'"""%sys.argv[0]
336 337 338 339 340 341 342 343 344 345 346
        sys.exit(2)
    func = encode
    for o, a in opts:
        if o == '-e': func = encode
        if o == '-d': func = decode
        if o == '-u': func = decode
        if o == '-t': test1(); return
    if args and args[0] != '-':
        func(open(args[0], 'rb'), sys.stdout)
    else:
        func(sys.stdin, sys.stdout)
347

348

349
def test1():
350 351 352
    s0 = "Aladdin:open sesame"
    s1 = encodestring(s0)
    s2 = decodestring(s1)
353
    print s0, repr(s1), s2
354

355

356
if __name__ == '__main__':
357
    test()