"""Internationalization and localization support.

This module provides internationalization (I18N) and localization (L10N)
support for your Python programs by providing an interface to the GNU gettext
message catalog library.

I18N refers to the operation by which a program is made aware of multiple
languages.  L10N refers to the adaptation of your program, once
internationalized, to the local language and cultural habits.

"""

# This module represents the integration of work, contributions, feedback, and
# suggestions from the following people:
#
# Martin von Loewis, who wrote the initial implementation of the underlying
# C-based libintlmodule (later renamed _gettext), along with a skeletal
# gettext.py implementation.
#
# Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
# which also included a pure-Python implementation to read .mo files if
# intlmodule wasn't available.
#
# James Henstridge, who also wrote a gettext.py module, which has some
# interesting, but currently unsupported experimental features: the notion of
# a Catalog class and instances, and the ability to add to a catalog file via
# a Python API.
#
# Barry Warsaw integrated these modules, wrote the .install() API and code,
# and conformed all C and Python code to Python's coding standards.
#
# Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
# module.
#
# J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
#
# TODO:
# - Lazy loading of .mo files.  Currently the entire catalog is loaded into
#   memory, but that's probably bad for large translated programs.  Instead,
#   the lexical sort of original strings in GNU .mo files should be exploited
#   to do binary searches and lazy initializations.  Or you might want to use
#   the undocumented double-hash algorithm for .mo files with hash tables, but
#   you'll need to study the GNU gettext code to do this.
#
# - Support Solaris .mo file formats.  Unfortunately, we've been unable to
#   find this format documented anywhere.

import locale, copy, io, os, re, struct, sys
from errno import ENOENT
# Public API of this module (the names exported by ``from gettext import *``).
__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
           'find', 'translation', 'install', 'textdomain', 'bindtextdomain',
           'bind_textdomain_codeset',
           'dgettext', 'dngettext', 'gettext', 'lgettext', 'ldgettext',
           'ldngettext', 'lngettext', 'ngettext',
           ]

# Directory searched for catalogs when no explicit localedir is supplied.
_default_localedir = os.path.join(sys.base_prefix, 'share', 'locale')
def c2py(plural):
    """Gets a C expression as used in PO files for plural forms and returns a
    Python lambda function that implements an equivalent expression.

    *plural* is the text of the C expression (for example
    ``"n != 1"`` or ``"n>1 ? 1 : 0"``).  The returned callable takes the
    number ``n`` and returns the result of the expression converted with
    int().

    Raises ValueError if the expression cannot be tokenized, uses any
    identifier other than ``n``, or has unbalanced parentheses.
    """
    # Security check, allow only the "n" identifier.
    # NOTE(review): the expression is handed to eval() below and typically
    # originates from a catalog file.  This check only rejects foreign
    # identifiers; it does not bound the cost of evaluating the expression.
    import token, tokenize
    tokens = tokenize.generate_tokens(io.StringIO(plural).readline)
    try:
        danger = [tok for tok in tokens
                  if tok[0] == token.NAME and tok[1] != 'n']
    except tokenize.TokenError as exc:
        raise ValueError('plural forms expression error, maybe unbalanced '
                         'parenthesis') from exc
    if danger:
        raise ValueError('plural forms expression could be dangerous')

    # Replace some C operators by their Python equivalents
    plural = plural.replace('&&', ' and ')
    plural = plural.replace('||', ' or ')

    # "!x" becomes " not x"; "!=" is left alone because of the [^=] guard.
    negation = re.compile(r'\!([^=])')
    plural = negation.sub(' not \\1', plural)

    # Regular expression and replacement function used to transform
    # "a?b:c" to "b if a else c".  The else-branch is rewritten recursively
    # so that chained conditionals ("a?b:c?d:e") are handled.
    ternary = re.compile(r'(.*?)\?(.*?):(.*)')
    def repl(match):
        return "(%s if %s else %s)" % (match.group(2), match.group(1),
                                       ternary.sub(repl, match.group(3)))

    # Transform the plural expression one parenthesis level at a time: each
    # stack entry accumulates the text of one currently open level.
    stack = ['']
    for c in plural:
        if c == '(':
            stack.append('')
        elif c == ')':
            if len(stack) == 1:
                raise ValueError('unbalanced parenthesis in plural form')
            inner = ternary.sub(repl, stack.pop())
            stack[-1] += '(%s)' % inner
        else:
            stack[-1] += c
    if len(stack) != 1:
        # An opening parenthesis was never closed.  The tokenizer check above
        # normally reports this first; do not silently drop the outer levels
        # if it did not.
        raise ValueError('unbalanced parenthesis in plural form')
    plural = ternary.sub(repl, stack.pop())

    return eval('lambda n: int(%s)' % plural)


def _expand_lang(loc):
    loc = locale.normalize(loc)
115 116 117 118 119
    COMPONENT_CODESET   = 1 << 0
    COMPONENT_TERRITORY = 1 << 1
    COMPONENT_MODIFIER  = 1 << 2
    # split up the locale into its base components
    mask = 0
120
    pos = loc.find('@')
121
    if pos >= 0:
122 123
        modifier = loc[pos:]
        loc = loc[:pos]
124 125 126
        mask |= COMPONENT_MODIFIER
    else:
        modifier = ''
127
    pos = loc.find('.')
128
    if pos >= 0:
129 130
        codeset = loc[pos:]
        loc = loc[:pos]
131 132 133
        mask |= COMPONENT_CODESET
    else:
        codeset = ''
134
    pos = loc.find('_')
135
    if pos >= 0:
136 137
        territory = loc[pos:]
        loc = loc[:pos]
138 139 140
        mask |= COMPONENT_TERRITORY
    else:
        territory = ''
141
    language = loc
142 143 144 145 146 147 148 149 150 151 152 153
    ret = []
    for i in range(mask+1):
        if not (i & ~mask):  # if all components for this combo exist ...
            val = language
            if i & COMPONENT_TERRITORY: val += territory
            if i & COMPONENT_CODESET:   val += codeset
            if i & COMPONENT_MODIFIER:  val += modifier
            ret.append(val)
    ret.reverse()
    return ret


class NullTranslations:
    """Translation object that returns messages untranslated.

    Every lookup is delegated to the fallback translation when one has been
    added with add_fallback(); otherwise the original message (or the
    singular/plural msgid selected by ``n == 1``) is returned unchanged.
    """

    def __init__(self, fp=None):
        """Initialize empty state and, if *fp* is given, call _parse(fp)."""
        self._info = {}
        self._charset = None
        self._output_charset = None
        self._fallback = None
        if fp is not None:
            self._parse(fp)

    def _parse(self, fp):
        """Hook for subclasses to read a catalog from *fp*; does nothing."""
        pass

    def add_fallback(self, fallback):
        """Append *fallback* to the end of this object's fallback chain."""
        if self._fallback:
            self._fallback.add_fallback(fallback)
        else:
            self._fallback = fallback

    def gettext(self, message):
        """Return the fallback's translation of *message*, else *message*."""
        if self._fallback:
            return self._fallback.gettext(message)
        return message

    def lgettext(self, message):
        """Like gettext(), but delegates to the fallback's lgettext()."""
        if self._fallback:
            return self._fallback.lgettext(message)
        return message

    def ngettext(self, msgid1, msgid2, n):
        """Return the fallback's plural translation, else *msgid1* when
        ``n == 1`` and *msgid2* otherwise.
        """
        if self._fallback:
            return self._fallback.ngettext(msgid1, msgid2, n)
        if n == 1:
            return msgid1
        else:
            return msgid2

    def lngettext(self, msgid1, msgid2, n):
        """Like ngettext(), but delegates to the fallback's lngettext()."""
        if self._fallback:
            return self._fallback.lngettext(msgid1, msgid2, n)
        if n == 1:
            return msgid1
        else:
            return msgid2

    def info(self):
        """Return the catalog metadata dictionary."""
        return self._info

    def charset(self):
        """Return the charset recorded for the catalog (None if unset)."""
        return self._charset

    def output_charset(self):
        """Return the charset set with set_output_charset() (None if unset)."""
        return self._output_charset

    def set_output_charset(self, charset):
        """Set the output charset to *charset*."""
        self._output_charset = charset

    def install(self, names=None):
        """Bind self.gettext to ``_`` in the builtins namespace.

        If *names* supports ``in``, each of "gettext", "ngettext",
        "lgettext" and "lngettext" found in it is installed into builtins
        as well.
        """
        import builtins
        builtins.__dict__['_'] = self.gettext
        if hasattr(names, "__contains__"):
            if "gettext" in names:
                builtins.__dict__['gettext'] = builtins.__dict__['_']
            if "ngettext" in names:
                builtins.__dict__['ngettext'] = self.ngettext
            if "lgettext" in names:
                builtins.__dict__['lgettext'] = self.lgettext
            if "lngettext" in names:
                builtins.__dict__['lngettext'] = self.lngettext


class GNUTranslations(NullTranslations):
    """Translations read from a GNU gettext binary catalog (.mo file).

    The whole catalog is read into the ``_catalog`` dictionary by _parse().
    Singular messages are keyed by msgid; plural forms are keyed by
    ``(msgid1, plural_index)``.
    """

    # Magic number of .mo files
    LE_MAGIC = 0x950412de
    BE_MAGIC = 0xde120495

    # Acceptable .mo versions
    VERSIONS = (0, 1)

    def _get_versions(self, version):
        """Returns a tuple of major version, minor version"""
        return (version >> 16, version & 0xffff)

    def _parse(self, fp):
        """Override this method to support alternative .mo formats.

        Reads all of *fp* (which must yield bytes) and fills self._catalog,
        self._info, self._charset and self.plural.

        Raises OSError if the file is truncated, has an unknown magic number
        or major version, or has string offsets that point outside the file.
        """
        unpack = struct.unpack
        filename = getattr(fp, 'name', '')
        self._catalog = catalog = {}
        self.plural = lambda n: int(n != 1) # germanic plural by default
        buf = fp.read()
        buflen = len(buf)
        # Parse the .mo file header, which consists of 5 32-bit words: the
        # magic number followed by version, message count and the offsets of
        # the two index tables.  A file too short for these is reported as
        # corrupt instead of letting struct.error escape.
        if buflen < 4:
            raise OSError(0, 'File is corrupt', filename)
        # Are we big endian or little endian?
        magic = unpack('<I', buf[:4])[0]
        if magic == self.LE_MAGIC:
            header_fmt = '<4I'
            ii = '<II'
        elif magic == self.BE_MAGIC:
            header_fmt = '>4I'
            ii = '>II'
        else:
            raise OSError(0, 'Bad magic number', filename)
        if buflen < 20:
            raise OSError(0, 'File is corrupt', filename)
        version, msgcount, masteridx, transidx = unpack(header_fmt, buf[4:20])

        major_version, minor_version = self._get_versions(version)

        if major_version not in self.VERSIONS:
            raise OSError(0, 'Bad version number ' + str(major_version), filename)

        # Now put all messages from the .mo file buffer into the catalog
        # dictionary.
        for _ in range(msgcount):
            # Each index entry is a (length, offset) pair; make sure the
            # entry itself lies inside the buffer before unpacking it.
            if masteridx + 8 > buflen or transidx + 8 > buflen:
                raise OSError(0, 'File is corrupt', filename)
            mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
            mend = moff + mlen
            tlen, toff = unpack(ii, buf[transidx:transidx+8])
            tend = toff + tlen
            if mend < buflen and tend < buflen:
                msg = buf[moff:mend]
                tmsg = buf[toff:tend]
            else:
                raise OSError(0, 'File is corrupt', filename)
            # See if we're looking at GNU .mo conventions for metadata
            if mlen == 0:
                # Catalog description
                lastk = None
                for b_item in tmsg.split(b'\n'):
                    item = b_item.decode().strip()
                    if not item:
                        continue
                    k = v = None
                    if ':' in item:
                        k, v = item.split(':', 1)
                        k = k.strip().lower()
                        v = v.strip()
                        self._info[k] = v
                        lastk = k
                    elif lastk:
                        # continuation line of the previous header
                        self._info[lastk] += '\n' + item
                    if k == 'content-type':
                        # A Content-Type header without a charset parameter
                        # leaves the charset unset instead of raising
                        # IndexError.
                        if 'charset=' in v:
                            self._charset = v.split('charset=')[1]
                    elif k == 'plural-forms':
                        v = v.split(';')
                        plural = v[1].split('plural=')[1]
                        self.plural = c2py(plural)
            # Note: we unconditionally convert both msgids and msgstrs to
            # Unicode using the character encoding specified in the charset
            # parameter of the Content-Type header.  The gettext documentation
            # strongly encourages msgids to be us-ascii, but some applications
            # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
            # traditional gettext applications, the msgid conversion will
            # cause no problems since us-ascii should always be a subset of
            # the charset encoding.  We may want to fall back to 8-bit msgids
            # if the Unicode conversion fails.
            charset = self._charset or 'ascii'
            if b'\x00' in msg:
                # Plural forms: the msgid holds "singular NUL plural" and the
                # translation holds one NUL-separated string per plural form.
                msgid1, msgid2 = msg.split(b'\x00')
                tmsg = tmsg.split(b'\x00')
                msgid1 = str(msgid1, charset)
                for index, form in enumerate(tmsg):
                    catalog[(msgid1, index)] = str(form, charset)
            else:
                catalog[str(msg, charset)] = str(tmsg, charset)
            # advance to next entry in the seek tables
            masteridx += 8
            transidx += 8

    def lgettext(self, message):
        """Return the translation of *message* encoded to bytes.

        The output charset is used if set, otherwise
        locale.getpreferredencoding().  When *message* is not in the catalog
        the fallback's lgettext() result, or *message* itself, is returned.
        NOTE(review): on a miss without fallback *message* is returned as
        given (not encoded), unlike the hit path.
        """
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.lgettext(message)
            return message
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        return tmsg.encode(locale.getpreferredencoding())

    def lngettext(self, msgid1, msgid2, n):
        """Plural counterpart of lgettext().

        Looks up ``(msgid1, self.plural(n))`` and returns the translation
        encoded to bytes.  On a miss the fallback's lngettext() result is
        returned, or *msgid1* / *msgid2* depending on ``n == 1``.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.lngettext(msgid1, msgid2, n)
            if n == 1:
                return msgid1
            else:
                return msgid2
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        return tmsg.encode(locale.getpreferredencoding())

    def gettext(self, message):
        """Return the catalog translation of *message*.

        Falls back to the fallback translation, then to *message* itself.
        """
        # A private sentinel distinguishes "not in catalog" from any stored
        # value.
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.gettext(message)
            return message
        return tmsg

    def ngettext(self, msgid1, msgid2, n):
        """Return the plural-aware translation for count *n*.

        Looks up ``(msgid1, self.plural(n))``.  On a miss the fallback's
        ngettext() result is returned, or *msgid1* when ``n == 1`` and
        *msgid2* otherwise.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.ngettext(msgid1, msgid2, n)
            if n == 1:
                tmsg = msgid1
            else:
                tmsg = msgid2
        return tmsg


# Locate a .mo file using the gettext strategy
def find(domain, localedir=None, languages=None, all=False):
    """Locate the .mo file(s) for *domain*.

    Candidate paths have the form
    ``localedir/<lang>/LC_MESSAGES/<domain>.mo``.  *localedir* defaults to
    the module's default locale directory.  *languages* defaults to the
    colon-separated value of the first non-empty environment variable among
    LANGUAGE, LC_ALL, LC_MESSAGES and LANG; each entry is expanded with
    _expand_lang().  The search stops when the 'C' locale is reached.

    Returns the first existing path, or None if there is none.  If *all* is
    true, returns a list of every existing path (possibly empty) instead.
    """
    # Get some reasonable defaults for arguments that were not supplied
    if localedir is None:
        localedir = _default_localedir
    if languages is None:
        languages = []
        for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
            val = os.environ.get(envar)
            if val:
                languages = val.split(':')
                break
        if 'C' not in languages:
            languages.append('C')
    # now normalize and expand the languages, dropping duplicates while
    # keeping the order of first appearance
    nelangs = []
    for lang in languages:
        for nelang in _expand_lang(lang):
            if nelang not in nelangs:
                nelangs.append(nelang)
    # select a language
    result = [] if all else None
    for lang in nelangs:
        if lang == 'C':
            break
        mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
        if os.path.exists(mofile):
            if all:
                result.append(mofile)
            else:
                return mofile
    return result
# Cache of parsed catalogs: maps (translation class, absolute .mo file path)
# to the Translation object created from that file.
_translations = {}

def translation(domain, localedir=None, languages=None,
                class_=None, fallback=False, codeset=None):
    """Return a translation object for *domain*.

    The .mo files are located with find(domain, localedir, languages,
    all=True).  Each file is parsed by *class_* (GNUTranslations by
    default); the first one becomes the result and the others are chained
    to it as fallbacks.  If *codeset* is true it is set as the output
    charset of every object.

    If no file is found, a NullTranslations instance is returned when
    *fallback* is true; otherwise OSError(ENOENT, ...) is raised.
    """
    if class_ is None:
        class_ = GNUTranslations
    mofiles = find(domain, localedir, languages, all=True)
    if not mofiles:
        if fallback:
            return NullTranslations()
        raise OSError(ENOENT, 'No translation file found for domain', domain)
    # Avoid opening, reading, and parsing the .mo file after it's been done
    # once.
    result = None
    for mofile in mofiles:
        key = (class_, os.path.abspath(mofile))
        t = _translations.get(key)
        if t is None:
            with open(mofile, 'rb') as fp:
                t = _translations.setdefault(key, class_(fp))
        # Copy the translation object to allow setting fallbacks and
        # output charset. All other instance data is shared with the
        # cached object.
        t = copy.copy(t)
        if codeset:
            t.set_output_charset(codeset)
        if result is None:
            result = t
        else:
            result.add_fallback(t)
    return result
def install(domain, localedir=None, codeset=None, names=None):
    """Install the translation for *domain* into the builtins namespace.

    Obtains the translation object with translation(..., fallback=True), so
    a missing catalog does not raise, and calls its install(names) method.
    """
    t = translation(domain, localedir, fallback=True, codeset=codeset)
    t.install(names)
# a mapping b/w domains and locale directories
_localedirs = {}
# a mapping b/w domains and codesets
_localecodesets = {}
# current global domain, `messages' used for compatibility w/ GNU gettext
_current_domain = 'messages'


def textdomain(domain=None):
    """Set the current global domain to *domain* (if it is not None) and
    return the current global domain.
    """
    global _current_domain
    if domain is not None:
        _current_domain = domain
    return _current_domain
def bindtextdomain(domain, localedir=None):
    """Bind *domain* to *localedir* and return the directory now in effect.

    With *localedir* omitted (None) nothing is changed and the directory
    already bound to *domain* is returned, or the module default if the
    domain has never been bound.
    """
    if localedir is None:
        return _localedirs.get(domain, _default_localedir)
    _localedirs[domain] = localedir
    return _localedirs[domain]
def bind_textdomain_codeset(domain, codeset=None):
    """Bind *domain* to *codeset* and return the codeset now in effect.

    With *codeset* omitted (None) nothing is changed and the codeset already
    bound to *domain* is returned, or None if there is none.
    """
    if codeset is None:
        return _localecodesets.get(domain)
    _localecodesets[domain] = codeset
    return _localecodesets[domain]


def dgettext(domain, message):
    """Return the translation of *message* in *domain*.

    Uses the locale directory and codeset bound to *domain*, if any.
    Returns *message* unchanged when no catalog can be loaded (OSError).
    """
    try:
        t = translation(domain, _localedirs.get(domain, None),
                        codeset=_localecodesets.get(domain))
    except OSError:
        return message
    return t.gettext(message)
def ldgettext(domain, message):
    """Like dgettext(), but calls the translation object's lgettext().

    Returns *message* unchanged when no catalog can be loaded (OSError).
    """
    try:
        t = translation(domain, _localedirs.get(domain, None),
                        codeset=_localecodesets.get(domain))
    except OSError:
        return message
    return t.lgettext(message)
def dngettext(domain, msgid1, msgid2, n):
    """Return the plural-aware translation for count *n* in *domain*.

    When no catalog can be loaded (OSError), returns *msgid1* if ``n == 1``
    and *msgid2* otherwise.
    """
    try:
        t = translation(domain, _localedirs.get(domain, None),
                        codeset=_localecodesets.get(domain))
    except OSError:
        if n == 1:
            return msgid1
        else:
            return msgid2
    return t.ngettext(msgid1, msgid2, n)

def ldngettext(domain, msgid1, msgid2, n):
    """Like dngettext(), but calls the translation object's lngettext().

    When no catalog can be loaded (OSError), returns *msgid1* if ``n == 1``
    and *msgid2* otherwise.
    """
    try:
        t = translation(domain, _localedirs.get(domain, None),
                        codeset=_localecodesets.get(domain))
    except OSError:
        if n == 1:
            return msgid1
        else:
            return msgid2
    return t.lngettext(msgid1, msgid2, n)
def gettext(message):
    """Translate *message* in the current global domain via dgettext()."""
    domain = _current_domain
    return dgettext(domain, message)
def lgettext(message):
    """Translate *message* in the current global domain via ldgettext()."""
    domain = _current_domain
    return ldgettext(domain, message)
def ngettext(msgid1, msgid2, n):
    """Plural-aware translation in the current global domain via
    dngettext().
    """
    domain = _current_domain
    return dngettext(domain, msgid1, msgid2, n)

def lngettext(msgid1, msgid2, n):
    """Plural-aware translation in the current global domain via
    ldngettext().
    """
    domain = _current_domain
    return ldngettext(domain, msgid1, msgid2, n)

# dcgettext() has been deemed unnecessary and is not implemented.

# James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
# was:
#
#    import gettext
#    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
#    _ = cat.gettext
#    print _('Hello World')

# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.

# Alias: calling Catalog(...) is the same as calling translation(...).
Catalog = translation