"""Internationalization and localization support.

This module provides internationalization (I18N) and localization (L10N)
support for your Python programs by providing an interface to the GNU gettext
message catalog library.

I18N refers to the operation by which a program is made aware of multiple
languages.  L10N refers to the adaptation of your program, once
internationalized, to the local language and cultural habits.

"""

# This module represents the integration of work, contributions, feedback, and
# suggestions from the following people:
#
# Martin von Loewis, who wrote the initial implementation of the underlying
# C-based libintlmodule (later renamed _gettext), along with a skeletal
# gettext.py implementation.
#
# Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
# which also included a pure-Python implementation to read .mo files if
# intlmodule wasn't available.
#
# James Henstridge, who also wrote a gettext.py module, which has some
# interesting, but currently unsupported experimental features: the notion of
# a Catalog class and instances, and the ability to add to a catalog file via
# a Python API.
#
# Barry Warsaw integrated these modules, wrote the .install() API and code,
# and conformed all C and Python code to Python's coding standards.
#
# Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
# module.
#
# J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
#
# TODO:
# - Lazy loading of .mo files.  Currently the entire catalog is loaded into
#   memory, but that's probably bad for large translated programs.  Instead,
#   the lexical sort of original strings in GNU .mo files should be exploited
#   to do binary searches and lazy initializations.  Or you might want to use
#   the undocumented double-hash algorithm for .mo files with hash tables, but
#   you'll need to study the GNU gettext code to do this.
#
# - Support Solaris .mo file formats.  Unfortunately, we've been unable to
#   find this format documented anywhere.


import locale, copy, io, os, re, struct, sys
from errno import ENOENT


# Names exported by ``from gettext import *``.  The codeset-binding function
# and the l*gettext() family are public module-level functions too, so they
# are listed alongside their non-"l" counterparts.
__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
           'find', 'translation', 'install', 'textdomain', 'bindtextdomain',
           'bind_textdomain_codeset',
           'dgettext', 'dngettext', 'gettext', 'lgettext', 'ldgettext',
           'ldngettext', 'lngettext', 'ngettext',
           ]

# Directory searched for message catalogs when the caller gives no localedir.
_default_localedir = os.path.join(sys.base_prefix, 'share', 'locale')


def c2py(plural):
    """Gets a C expression as used in PO files for plural forms and returns a
    Python lambda function that implements an equivalent expression.

    The returned callable takes the count ``n`` and returns the plural-form
    index as an int.

    Raises ValueError if the expression uses any identifier other than
    ``n``, cannot be tokenized (e.g. an unclosed parenthesis), or contains an
    unmatched closing parenthesis.
    """
    # Security check, allow only the "n" identifier
    import token, tokenize
    tokens = tokenize.generate_tokens(io.StringIO(plural).readline)
    try:
        danger = [x for x in tokens if x[0] == token.NAME and x[1] != 'n']
    except tokenize.TokenError:
        raise ValueError('plural forms expression error, maybe unbalanced parenthesis')
    else:
        if danger:
            raise ValueError('plural forms expression could be dangerous')

    # Replace some C operators by their Python equivalents
    plural = plural.replace('&&', ' and ')
    plural = plural.replace('||', ' or ')
    # C division of integers truncates, whereas Python 3 "/" is true
    # division: "n%100/10==1" must compare 1 == 1 for n=15, not 1.5 == 1.
    plural = plural.replace('/', '//')

    # "!x" becomes " not x"; "!=" is left alone because the character after
    # the "!" must not be "=".
    expr = re.compile(r'\!([^=])')
    plural = expr.sub(' not \\1', plural)

    # Regular expression and replacement function used to transform
    # "a?b:c" to "b if a else c".
    expr = re.compile(r'(.*?)\?(.*?):(.*)')
    def repl(x):
        # The "else" part may itself contain a chained conditional, so it is
        # rewritten recursively.
        return "(%s if %s else %s)" % (x.group(2), x.group(1),
                                       expr.sub(repl, x.group(3)))

    # Code to transform the plural expression, taking care of parentheses:
    # each "(" opens a new accumulator on the stack, each ")" rewrites the
    # finished group and appends it to the enclosing accumulator.
    stack = ['']
    for c in plural:
        if c == '(':
            stack.append('')
        elif c == ')':
            if len(stack) == 1:
                # A ")" with no matching "(".
                raise ValueError('unbalanced parenthesis in plural form')
            s = expr.sub(repl, stack.pop())
            stack[-1] += '(%s)' % s
        else:
            stack[-1] += c
    plural = expr.sub(repl, stack.pop())

    # NOTE(review): this eval()s text taken from a message catalog.  The
    # NAME-token check at the top of the function is the only guard.
    return eval('lambda n: int(%s)' % plural)


def _expand_lang(loc):
    """Return the candidate locale names derived from *loc*.

    *loc* is first passed through locale.normalize() and then split into
    language, "_territory", ".codeset" and "@modifier" parts.  The result
    lists every combination of the parts that are actually present, always
    starting from the language, ordered from the most specific name down to
    the bare language.
    """
    loc = locale.normalize(loc)
    COMPONENT_CODESET   = 1 << 0
    COMPONENT_TERRITORY = 1 << 1
    COMPONENT_MODIFIER  = 1 << 2
    # split up the locale into its base components; each component keeps its
    # leading separator so the pieces can simply be concatenated again
    mask = 0
    pos = loc.find('@')
    if pos >= 0:
        modifier = loc[pos:]
        loc = loc[:pos]
        mask |= COMPONENT_MODIFIER
    else:
        modifier = ''
    pos = loc.find('.')
    if pos >= 0:
        codeset = loc[pos:]
        loc = loc[:pos]
        mask |= COMPONENT_CODESET
    else:
        codeset = ''
    pos = loc.find('_')
    if pos >= 0:
        territory = loc[pos:]
        loc = loc[:pos]
        mask |= COMPONENT_TERRITORY
    else:
        territory = ''
    language = loc
    ret = []
    for i in range(mask+1):
        if not (i & ~mask):  # if all components for this combo exist ...
            val = language
            if i & COMPONENT_TERRITORY: val += territory
            if i & COMPONENT_CODESET:   val += codeset
            if i & COMPONENT_MODIFIER:  val += modifier
            ret.append(val)
    # combinations were generated least-specific first
    ret.reverse()
    return ret


class NullTranslations:
    """Translation object that returns messages unchanged.

    Every lookup is delegated to the fallback translation object when one
    has been added with add_fallback(); otherwise the message itself (or the
    singular/plural msgid chosen by ``n == 1``) is returned.
    """

    def __init__(self, fp=None):
        """Initialise empty state and, if *fp* is given, pass it to _parse()."""
        self._info = {}
        self._charset = None
        self._output_charset = None
        self._fallback = None
        if fp is not None:
            self._parse(fp)

    def _parse(self, fp):
        """Read catalog data from *fp*; does nothing in this class."""
        pass

    def add_fallback(self, fallback):
        """Append *fallback* to the end of this object's fallback chain."""
        if self._fallback:
            self._fallback.add_fallback(fallback)
        else:
            self._fallback = fallback

    def gettext(self, message):
        """Return the fallback's gettext() result, or *message* itself."""
        if self._fallback:
            return self._fallback.gettext(message)
        return message

    def lgettext(self, message):
        """Return the fallback's lgettext() result, or *message* itself."""
        if self._fallback:
            return self._fallback.lgettext(message)
        return message

    def ngettext(self, msgid1, msgid2, n):
        """Return the fallback's ngettext() result, else *msgid1* when
        ``n == 1`` and *msgid2* otherwise."""
        if self._fallback:
            return self._fallback.ngettext(msgid1, msgid2, n)
        if n == 1:
            return msgid1
        else:
            return msgid2

    def lngettext(self, msgid1, msgid2, n):
        """Return the fallback's lngettext() result, else *msgid1* when
        ``n == 1`` and *msgid2* otherwise."""
        if self._fallback:
            return self._fallback.lngettext(msgid1, msgid2, n)
        if n == 1:
            return msgid1
        else:
            return msgid2

    def info(self):
        """Return the metadata dictionary (empty unless _parse() fills it)."""
        return self._info

    def charset(self):
        """Return the catalog charset, or None if none was recorded."""
        return self._charset

    def output_charset(self):
        """Return the charset set by set_output_charset(), or None."""
        return self._output_charset

    def set_output_charset(self, charset):
        """Record *charset* as the output charset."""
        self._output_charset = charset

    def install(self, names=None):
        """Bind self.gettext to ``_`` in the builtins namespace.

        If *names* supports ``in``, each of "gettext", "ngettext",
        "lgettext" and "lngettext" found in it is also bound in builtins to
        the corresponding method of this object.
        """
        import builtins
        builtins.__dict__['_'] = self.gettext
        if hasattr(names, "__contains__"):
            if "gettext" in names:
                builtins.__dict__['gettext'] = builtins.__dict__['_']
            if "ngettext" in names:
                builtins.__dict__['ngettext'] = self.ngettext
            if "lgettext" in names:
                builtins.__dict__['lgettext'] = self.lgettext
            if "lngettext" in names:
                builtins.__dict__['lngettext'] = self.lngettext


class GNUTranslations(NullTranslations):
    """Translation object backed by a GNU gettext .mo catalog.

    _parse() loads the whole catalog into ``self._catalog``.  Singular
    messages are keyed by msgid; plural messages are keyed by
    ``(msgid1, plural_index)``.
    """

    # Magic number of .mo files
    LE_MAGIC = 0x950412de
    BE_MAGIC = 0xde120495

    def _parse(self, fp):
        """Override this method to support alternative .mo formats.

        Reads all of *fp* (a binary file object), fills ``self._catalog``
        and ``self._info`` and, when the catalog header supplies them, sets
        ``self._charset`` and ``self.plural``.

        Raises IOError if the magic number is unknown or a string table
        entry points outside the file.
        """
        unpack = struct.unpack
        filename = getattr(fp, 'name', '')
        # Parse the .mo file header, which consists of 5 32-bit words: the
        # magic number followed by version, message count and the offsets of
        # the two string tables.
        self._catalog = catalog = {}
        self.plural = lambda n: int(n != 1) # germanic plural by default
        buf = fp.read()
        buflen = len(buf)
        # Are we big endian or little endian?
        magic = unpack('<I', buf[:4])[0]
        if magic == self.LE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
            ii = '<II'
        elif magic == self.BE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
            ii = '>II'
        else:
            raise IOError(0, 'Bad magic number', filename)
        # Now put all messages from the .mo file buffer into the catalog
        # dictionary.
        for _ in range(0, msgcount):
            mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
            mend = moff + mlen
            tlen, toff = unpack(ii, buf[transidx:transidx+8])
            tend = toff + tlen
            if mend < buflen and tend < buflen:
                msg = buf[moff:mend]
                tmsg = buf[toff:tend]
            else:
                raise IOError(0, 'File is corrupt', filename)
            # See if we're looking at GNU .mo conventions for metadata
            if mlen == 0:
                # Catalog description
                lastk = None
                for b_item in tmsg.split('\n'.encode("ascii")):
                    item = b_item.decode().strip()
                    if not item:
                        continue
                    # Reset per line so that a continuation line (one with
                    # no ":") is never mistaken for a repeat of the previous
                    # header and re-parsed with stale values.
                    k = v = None
                    if ':' in item:
                        k, v = item.split(':', 1)
                        k = k.strip().lower()
                        v = v.strip()
                        self._info[k] = v
                        lastk = k
                    elif lastk:
                        self._info[lastk] += '\n' + item
                    if k == 'content-type':
                        self._charset = v.split('charset=')[1]
                    elif k == 'plural-forms':
                        v = v.split(';')
                        plural = v[1].split('plural=')[1]
                        self.plural = c2py(plural)
            # Note: we unconditionally convert both msgids and msgstrs to
            # Unicode using the character encoding specified in the charset
            # parameter of the Content-Type header.  The gettext documentation
            # strongly encourages msgids to be us-ascii, but some applications
            # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
            # traditional gettext applications, the msgid conversion will
            # cause no problems since us-ascii should always be a subset of
            # the charset encoding.  We may want to fall back to 8-bit msgids
            # if the Unicode conversion fails.
            charset = self._charset or 'ascii'
            if b'\x00' in msg:
                # Plural forms: the msgid holds "singular NUL plural" and the
                # translation holds one NUL-separated string per plural index.
                msgid1, msgid2 = msg.split(b'\x00')
                tmsg = tmsg.split(b'\x00')
                msgid1 = str(msgid1, charset)
                for index, x in enumerate(tmsg):
                    catalog[(msgid1, index)] = str(x, charset)
            else:
                catalog[str(msg, charset)] = str(tmsg, charset)
            # advance to next entry in the seek tables
            masteridx += 8
            transidx += 8

    def lgettext(self, message):
        """Return the translation of *message* as bytes.

        The translation is encoded with the output charset if one is set,
        otherwise with locale.getpreferredencoding().  If *message* is not
        in the catalog, the fallback's lgettext() result or *message* itself
        is returned.
        """
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.lgettext(message)
            return message
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        return tmsg.encode(locale.getpreferredencoding())

    def lngettext(self, msgid1, msgid2, n):
        """Return the plural translation selected by ``self.plural(n)`` as
        bytes, encoded like lgettext().

        If no such entry exists, the fallback's lngettext() result is
        returned, else *msgid1* when ``n == 1`` and *msgid2* otherwise.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
            if self._output_charset:
                return tmsg.encode(self._output_charset)
            return tmsg.encode(locale.getpreferredencoding())
        except KeyError:
            if self._fallback:
                return self._fallback.lngettext(msgid1, msgid2, n)
            if n == 1:
                return msgid1
            else:
                return msgid2

    def gettext(self, message):
        """Return the catalog translation of *message*.

        If *message* is not in the catalog, the fallback's gettext() result
        or *message* itself is returned.
        """
        # A private sentinel distinguishes "not in the catalog" from any
        # stored value.
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.gettext(message)
            return message
        return tmsg

    def ngettext(self, msgid1, msgid2, n):
        """Return the plural translation selected by ``self.plural(n)``.

        If no such entry exists, the fallback's ngettext() result is
        returned, else *msgid1* when ``n == 1`` and *msgid2* otherwise.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.ngettext(msgid1, msgid2, n)
            if n == 1:
                tmsg = msgid1
            else:
                tmsg = msgid2
        return tmsg


# Locate a .mo file using the gettext strategy
def find(domain, localedir=None, languages=None, all=False):
    """Return the path of the .mo file for *domain*.

    Candidate files are ``<localedir>/<lang>/LC_MESSAGES/<domain>.mo`` for
    each expanded language name, in order.  *localedir* defaults to
    ``_default_localedir``.  When *languages* is None it is taken from the
    first set variable among LANGUAGE, LC_ALL, LC_MESSAGES and LANG (split
    on ":"), with "C" appended if absent.

    Returns the first existing path, or None if there is none.  With *all*
    true, returns the list of every existing path instead (possibly empty).
    """
    # Get some reasonable defaults for arguments that were not supplied
    if localedir is None:
        localedir = _default_localedir
    if languages is None:
        languages = []
        for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
            val = os.environ.get(envar)
            if val:
                languages = val.split(':')
                break
        if 'C' not in languages:
            languages.append('C')
    # now normalize and expand the languages
    nelangs = []
    for lang in languages:
        for nelang in _expand_lang(lang):
            if nelang not in nelangs:
                nelangs.append(nelang)
    # select a language
    if all:
        result = []
    else:
        result = None
    for lang in nelangs:
        # "C" ends the search: languages listed after it are not consulted
        if lang == 'C':
            break
        mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
        if os.path.exists(mofile):
            if all:
                result.append(mofile)
            else:
                return mofile
    return result


# a mapping between absolute .mo file path and Translation object
_translations = {}

def translation(domain, localedir=None, languages=None,
                class_=None, fallback=False, codeset=None):
    """Return a translation object for *domain*.

    Every .mo file located by find() is loaded with *class_* (default
    GNUTranslations).  The first file found becomes the result and each
    later one is added to it as a fallback.  If *codeset* is given it is set
    as the output charset of every object in the chain.

    If no file is found, a NullTranslations instance is returned when
    *fallback* is true; otherwise IOError with errno ENOENT is raised.
    """
    if class_ is None:
        class_ = GNUTranslations
    mofiles = find(domain, localedir, languages, all=True)
    if not mofiles:
        if fallback:
            return NullTranslations()
        raise IOError(ENOENT, 'No translation file found for domain', domain)
    # Avoid opening, reading, and parsing the .mo file after it's been done
    # once.
    result = None
    for mofile in mofiles:
        key = (class_, os.path.abspath(mofile))
        t = _translations.get(key)
        if t is None:
            with open(mofile, 'rb') as fp:
                t = _translations.setdefault(key, class_(fp))
        # Copy the translation object to allow setting fallbacks and
        # output charset. All other instance data is shared with the
        # cached object.
        t = copy.copy(t)
        if codeset:
            t.set_output_charset(codeset)
        if result is None:
            result = t
        else:
            result.add_fallback(t)
    return result


def install(domain, localedir=None, codeset=None, names=None):
    """Look up the translation for *domain* and call its install() method.

    The lookup uses ``fallback=True``, so a missing catalog does not raise
    here.  *names* is passed through to the translation object's install().
    """
    t = translation(domain, localedir, fallback=True, codeset=codeset)
    t.install(names)


# a mapping b/w domains and locale directories
_localedirs = {}
# a mapping b/w domains and codesets
_localecodesets = {}
# current global domain, `messages' used for compatibility w/ GNU gettext
_current_domain = 'messages'


def textdomain(domain=None):
    """Set the current global domain if *domain* is not None, then return
    the current global domain."""
    global _current_domain
    if domain is not None:
        _current_domain = domain
    return _current_domain


def bindtextdomain(domain, localedir=None):
    """Bind *domain* to *localedir* if one is given, then return the
    directory bound to *domain* (``_default_localedir`` if it has none)."""
    # _localedirs is only mutated, never rebound, so no "global" is needed
    if localedir is not None:
        _localedirs[domain] = localedir
    return _localedirs.get(domain, _default_localedir)


def bind_textdomain_codeset(domain, codeset=None):
    """Bind *domain* to *codeset* if one is given, then return the codeset
    bound to *domain* (None if it has none)."""
    # _localecodesets is only mutated, never rebound, so no "global" is needed
    if codeset is not None:
        _localecodesets[domain] = codeset
    return _localecodesets.get(domain)


def dgettext(domain, message):
    """Return the gettext() translation of *message* from *domain*.

    The catalog is looked up with the locale directory and codeset bound to
    *domain*.  If the lookup raises IOError, *message* is returned unchanged.
    """
    try:
        t = translation(domain, _localedirs.get(domain, None),
                        codeset=_localecodesets.get(domain))
    except IOError:
        return message
    return t.gettext(message)


def ldgettext(domain, message):
    """Return the lgettext() translation of *message* from *domain*.

    The catalog is looked up with the locale directory and codeset bound to
    *domain*.  If the lookup raises IOError, *message* is returned unchanged.
    """
    try:
        t = translation(domain, _localedirs.get(domain, None),
                        codeset=_localecodesets.get(domain))
    except IOError:
        return message
    return t.lgettext(message)


def dngettext(domain, msgid1, msgid2, n):
    """Return the ngettext() plural translation from *domain*.

    The catalog is looked up with the locale directory and codeset bound to
    *domain*.  If the lookup raises IOError, *msgid1* is returned when
    ``n == 1`` and *msgid2* otherwise.
    """
    try:
        t = translation(domain, _localedirs.get(domain, None),
                        codeset=_localecodesets.get(domain))
    except IOError:
        if n == 1:
            return msgid1
        else:
            return msgid2
    return t.ngettext(msgid1, msgid2, n)


def ldngettext(domain, msgid1, msgid2, n):
    """Return the lngettext() plural translation from *domain*.

    The catalog is looked up with the locale directory and codeset bound to
    *domain*.  If the lookup raises IOError, *msgid1* is returned when
    ``n == 1`` and *msgid2* otherwise.
    """
    try:
        t = translation(domain, _localedirs.get(domain, None),
                        codeset=_localecodesets.get(domain))
    except IOError:
        if n == 1:
            return msgid1
        else:
            return msgid2
    return t.lngettext(msgid1, msgid2, n)


def gettext(message):
    """Return dgettext() of *message* in the current global domain."""
    return dgettext(_current_domain, message)


def lgettext(message):
    """Return ldgettext() of *message* in the current global domain."""
    return ldgettext(_current_domain, message)


def ngettext(msgid1, msgid2, n):
    """Return dngettext() for the current global domain."""
    return dngettext(_current_domain, msgid1, msgid2, n)


def lngettext(msgid1, msgid2, n):
    """Return ldngettext() for the current global domain."""
    return ldngettext(_current_domain, msgid1, msgid2, n)


# dcgettext() has been deemed unnecessary and is not implemented.

# James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
# was:
#
#    import gettext
#    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
#    _ = cat.gettext
#    print(_('Hello World'))

# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.

Catalog = translation