test_c_locale_coercion.py 17.8 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415
# Tests the attempted automatic coercion of the C locale to a UTF-8 locale

import unittest
import locale
import os
import sys
import sysconfig
import shutil
from collections import namedtuple

import test.support
from test.support.script_helper import (
    run_python_until_end,
    interpreter_requires_environment,
)

# Set the list of ways we expect to be able to ask for the "C" locale
EXPECTED_C_LOCALE_EQUIVALENTS = ["C", "invalid.ascii"]

# Set our expectation for the default encoding used in the C locale
# for the filesystem encoding and the standard streams
EXPECTED_C_LOCALE_STREAM_ENCODING = "ascii"
EXPECTED_C_LOCALE_FS_ENCODING = "ascii"

# Set our expectation for the default locale used when none is specified
EXPECT_COERCION_IN_DEFAULT_LOCALE = True

# Apply some platform dependent overrides
if sys.platform.startswith("linux"):
    if test.support.is_android:
        # Android defaults to using UTF-8 for all system interfaces
        EXPECTED_C_LOCALE_STREAM_ENCODING = "utf-8"
        EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
    else:
        # Linux distros typically alias the POSIX locale directly to the C
        # locale.
        # TODO: Once https://bugs.python.org/issue30672 is addressed, we'll be
        #       able to check this case unconditionally
        EXPECTED_C_LOCALE_EQUIVALENTS.append("POSIX")
elif sys.platform.startswith("aix"):
    # AIX uses iso8859-1 in the C locale, other *nix platforms use ASCII
    EXPECTED_C_LOCALE_STREAM_ENCODING = "iso8859-1"
    EXPECTED_C_LOCALE_FS_ENCODING = "iso8859-1"
elif sys.platform == "darwin":
    # FS encoding is UTF-8 on macOS
    EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform == "cygwin":
    # Cygwin defaults to using C.UTF-8
    # TODO: Work out a robust dynamic test for this that doesn't rely on
    #       CPython's own locale handling machinery
    EXPECT_COERCION_IN_DEFAULT_LOCALE = False

# Note that the above expectations are still wrong in some cases, such as:
# * Windows when PYTHONLEGACYWINDOWSFSENCODING is set
# * Any platform other than AIX that uses latin-1 in the C locale
# * Any Linux distro where POSIX isn't a simple alias for the C locale
# * Any Linux distro where the default locale is something other than "C"
#
# Options for dealing with this:
# * Don't set the PY_COERCE_C_LOCALE preprocessor definition on
#   such platforms (e.g. it isn't set on Windows)
# * Fix the test expectations to match the actual platform behaviour

# In order to get the warning messages to match up as expected, the candidate
# order here must much the target locale order in Python/pylifecycle.c
_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8", "UTF-8")

# There's no reliable cross-platform way of checking locale alias
# lists, so the only way of knowing which of these locales will work
# is to try them with locale.setlocale(). We do that in a subprocess
# in setUpModule() below to avoid altering the locale of the test runner.
#
# If the relevant locale module attributes exist, and we're not on a platform
# where we expect it to always succeed, we also check that
# `locale.nl_langinfo(locale.CODESET)` works, as if it fails, the interpreter
# will skip locale coercion for that particular target locale
_check_nl_langinfo_CODESET = bool(
    sys.platform not in ("darwin", "linux") and
    hasattr(locale, "nl_langinfo") and
    hasattr(locale, "CODESET")
)

def _set_locale_in_subprocess(locale_name):
    cmd_fmt = "import locale; print(locale.setlocale(locale.LC_CTYPE, '{}'))"
    if _check_nl_langinfo_CODESET:
        # If there's no valid CODESET, we expect coercion to be skipped
        cmd_fmt += "; import sys; sys.exit(not locale.nl_langinfo(locale.CODESET))"
    cmd = cmd_fmt.format(locale_name)
    result, py_cmd = run_python_until_end("-c", cmd, PYTHONCOERCECLOCALE='')
    return result.rc == 0



_fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all"
_EncodingDetails = namedtuple("EncodingDetails", _fields)

class EncodingDetails(_EncodingDetails):
    # XXX (ncoghlan): Using JSON for child state reporting may be less fragile
    CHILD_PROCESS_SCRIPT = ";".join([
        "import sys, os",
        "print(sys.getfilesystemencoding())",
        "print(sys.stdin.encoding + ':' + sys.stdin.errors)",
        "print(sys.stdout.encoding + ':' + sys.stdout.errors)",
        "print(sys.stderr.encoding + ':' + sys.stderr.errors)",
        "print(os.environ.get('LANG', 'not set'))",
        "print(os.environ.get('LC_CTYPE', 'not set'))",
        "print(os.environ.get('LC_ALL', 'not set'))",
    ])

    @classmethod
    def get_expected_details(cls, coercion_expected, fs_encoding, stream_encoding, env_vars):
        """Returns expected child process details for a given encoding"""
        _stream = stream_encoding + ":{}"
        # stdin and stdout should use surrogateescape either because the
        # coercion triggered, or because the C locale was detected
        stream_info = 2*[_stream.format("surrogateescape")]
        # stderr should always use backslashreplace
        stream_info.append(_stream.format("backslashreplace"))
        expected_lang = env_vars.get("LANG", "not set").lower()
        if coercion_expected:
            expected_lc_ctype = CLI_COERCION_TARGET.lower()
        else:
            expected_lc_ctype = env_vars.get("LC_CTYPE", "not set").lower()
        expected_lc_all = env_vars.get("LC_ALL", "not set").lower()
        env_info = expected_lang, expected_lc_ctype, expected_lc_all
        return dict(cls(fs_encoding, *stream_info, *env_info)._asdict())

    @staticmethod
    def _handle_output_variations(data):
        """Adjust the output to handle platform specific idiosyncrasies

        * Some platforms report ASCII as ANSI_X3.4-1968
        * Some platforms report ASCII as US-ASCII
        * Some platforms report UTF-8 instead of utf-8
        """
        data = data.replace(b"ANSI_X3.4-1968", b"ascii")
        data = data.replace(b"US-ASCII", b"ascii")
        data = data.lower()
        return data

    @classmethod
    def get_child_details(cls, env_vars):
        """Retrieves fsencoding and standard stream details from a child process

        Returns (encoding_details, stderr_lines):

        - encoding_details: EncodingDetails for eager decoding
        - stderr_lines: result of calling splitlines() on the stderr output

        The child is run in isolated mode if the current interpreter supports
        that.
        """
        result, py_cmd = run_python_until_end(
            "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
            **env_vars
        )
        if not result.rc == 0:
            result.fail(py_cmd)
        # All subprocess outputs in this test case should be pure ASCII
        adjusted_output = cls._handle_output_variations(result.out)
        stdout_lines = adjusted_output.decode("ascii").splitlines()
        child_encoding_details = dict(cls(*stdout_lines)._asdict())
        stderr_lines = result.err.decode("ascii").rstrip().splitlines()
        return child_encoding_details, stderr_lines


# Details of the shared library warning emitted at runtime
LEGACY_LOCALE_WARNING = (
    "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
    "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
    "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
    "locales is recommended."
)

# Details of the CLI locale coercion warning emitted at runtime
CLI_COERCION_WARNING_FMT = (
    "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale "
    "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)."
)


AVAILABLE_TARGETS = None
CLI_COERCION_TARGET = None
CLI_COERCION_WARNING = None

def setUpModule():
    global AVAILABLE_TARGETS
    global CLI_COERCION_TARGET
    global CLI_COERCION_WARNING

    if AVAILABLE_TARGETS is not None:
        # initialization already done
        return
    AVAILABLE_TARGETS = []

    # Find the target locales available in the current system
    for target_locale in _C_UTF8_LOCALES:
        if _set_locale_in_subprocess(target_locale):
            AVAILABLE_TARGETS.append(target_locale)

    if AVAILABLE_TARGETS:
        # Coercion is expected to use the first available target locale
        CLI_COERCION_TARGET = AVAILABLE_TARGETS[0]
        CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET)


class _LocaleHandlingTestCase(unittest.TestCase):
    # Base class to check expected locale handling behaviour

    def _check_child_encoding_details(self,
                                      env_vars,
                                      expected_fs_encoding,
                                      expected_stream_encoding,
                                      expected_warnings,
                                      coercion_expected):
        """Check the C locale handling for the given process environment

        Parameters:
            expected_fs_encoding: expected sys.getfilesystemencoding() result
            expected_stream_encoding: expected encoding for standard streams
            expected_warning: stderr output to expect (if any)
        """
        result = EncodingDetails.get_child_details(env_vars)
        encoding_details, stderr_lines = result
        expected_details = EncodingDetails.get_expected_details(
            coercion_expected,
            expected_fs_encoding,
            expected_stream_encoding,
            env_vars
        )
        self.assertEqual(encoding_details, expected_details)
        if expected_warnings is None:
            expected_warnings = []
        self.assertEqual(stderr_lines, expected_warnings)


class LocaleConfigurationTests(_LocaleHandlingTestCase):
    # Test explicit external configuration via the process environment

    @classmethod
    def setUpClass(cls):
        # This relies on setUpModule() having been run, so it can't be
        # handled via the @unittest.skipUnless decorator
        if not AVAILABLE_TARGETS:
            raise unittest.SkipTest("No C-with-UTF-8 locale available")

    def test_external_target_locale_configuration(self):

        # Explicitly setting a target locale should give the same behaviour as
        # is seen when implicitly coercing to that target locale
        self.maxDiff = None

        expected_fs_encoding = "utf-8"
        expected_stream_encoding = "utf-8"

        base_var_dict = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
        }
        for env_var in ("LANG", "LC_CTYPE"):
            for locale_to_set in AVAILABLE_TARGETS:
                # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
                #                 expected, so skip that combination for now
                # See https://bugs.python.org/issue30672 for discussion
                if env_var == "LANG" and locale_to_set == "UTF-8":
                    continue

                with self.subTest(env_var=env_var,
                                  configured_locale=locale_to_set):
                    var_dict = base_var_dict.copy()
                    var_dict[env_var] = locale_to_set
                    self._check_child_encoding_details(var_dict,
                                                       expected_fs_encoding,
                                                       expected_stream_encoding,
                                                       expected_warnings=None,
                                                       coercion_expected=False)



@test.support.cpython_only
@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
                     "C locale coercion disabled at build time")
class LocaleCoercionTests(_LocaleHandlingTestCase):
    # Test implicit reconfiguration of the environment during CLI startup

    def _check_c_locale_coercion(self,
                                 fs_encoding, stream_encoding,
                                 coerce_c_locale,
                                 expected_warnings=None,
                                 coercion_expected=True,
                                 **extra_vars):
        """Check the C locale handling for various configurations

        Parameters:
            fs_encoding: expected sys.getfilesystemencoding() result
            stream_encoding: expected encoding for standard streams
            coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
              None: don't set the variable at all
              str: the value set in the child's environment
            expected_warnings: expected warning lines on stderr
            extra_vars: additional environment variables to set in subprocess
        """
        self.maxDiff = None

        if not AVAILABLE_TARGETS:
            # Locale coercion is disabled when there aren't any target locales
            fs_encoding = EXPECTED_C_LOCALE_FS_ENCODING
            stream_encoding = EXPECTED_C_LOCALE_STREAM_ENCODING
            coercion_expected = False
            if expected_warnings:
                expected_warnings = [LEGACY_LOCALE_WARNING]

        base_var_dict = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
        }
        base_var_dict.update(extra_vars)
        if coerce_c_locale is not None:
            base_var_dict["PYTHONCOERCECLOCALE"] = coerce_c_locale

        # Check behaviour for the default locale
        with self.subTest(default_locale=True,
                          PYTHONCOERCECLOCALE=coerce_c_locale):
            if EXPECT_COERCION_IN_DEFAULT_LOCALE:
                _expected_warnings = expected_warnings
                _coercion_expected = coercion_expected
            else:
                _expected_warnings = None
                _coercion_expected = False
            # On Android CLI_COERCION_WARNING is not printed when all the
            # locale environment variables are undefined or empty. When
            # this code path is run with environ['LC_ALL'] == 'C', then
            # LEGACY_LOCALE_WARNING is printed.
            if (test.support.is_android and
                    _expected_warnings == [CLI_COERCION_WARNING]):
                _expected_warnings = None
            self._check_child_encoding_details(base_var_dict,
                                               fs_encoding,
                                               stream_encoding,
                                               _expected_warnings,
                                               _coercion_expected)

        # Check behaviour for explicitly configured locales
        for locale_to_set in EXPECTED_C_LOCALE_EQUIVALENTS:
            for env_var in ("LANG", "LC_CTYPE"):
                with self.subTest(env_var=env_var,
                                  nominal_locale=locale_to_set,
                                  PYTHONCOERCECLOCALE=coerce_c_locale):
                    var_dict = base_var_dict.copy()
                    var_dict[env_var] = locale_to_set
                    # Check behaviour on successful coercion
                    self._check_child_encoding_details(var_dict,
                                                       fs_encoding,
                                                       stream_encoding,
                                                       expected_warnings,
                                                       coercion_expected)

    def test_PYTHONCOERCECLOCALE_not_set(self):
        # This should coerce to the first available target locale by default
        self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None)

    def test_PYTHONCOERCECLOCALE_not_zero(self):
        # *Any* string other than "0" is considered "set" for our purposes
        # and hence should result in the locale coercion being enabled
        for setting in ("", "1", "true", "false"):
            self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=setting)

    def test_PYTHONCOERCECLOCALE_set_to_warn(self):
        # PYTHONCOERCECLOCALE=warn enables runtime warnings for legacy locales
        self._check_c_locale_coercion("utf-8", "utf-8",
                                      coerce_c_locale="warn",
                                      expected_warnings=[CLI_COERCION_WARNING])


    def test_PYTHONCOERCECLOCALE_set_to_zero(self):
        # The setting "0" should result in the locale coercion being disabled
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale="0",
                                      coercion_expected=False)
        # Setting LC_ALL=C shouldn't make any difference to the behaviour
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale="0",
                                      LC_ALL="C",
                                      coercion_expected=False)

    def test_LC_ALL_set_to_C(self):
        # Setting LC_ALL should render the locale coercion ineffective
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale=None,
                                      LC_ALL="C",
                                      coercion_expected=False)
        # And result in a warning about a lack of locale compatibility
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale="warn",
                                      LC_ALL="C",
                                      expected_warnings=[LEGACY_LOCALE_WARNING],
                                      coercion_expected=False)

def test_main():
    test.support.run_unittest(
        LocaleConfigurationTests,
        LocaleCoercionTests
    )
    test.support.reap_children()

if __name__ == "__main__":
    test_main()