test_c_locale_coercion.py 19.3 KB
Newer Older
1 2
# Tests the attempted automatic coercion of the C locale to a UTF-8 locale

3
import locale
4
import os
5 6
import shutil
import subprocess
7 8
import sys
import sysconfig
9
import unittest
10 11
from collections import namedtuple

12
from test import support
13 14 15 16 17
from test.support.script_helper import (
    run_python_until_end,
    interpreter_requires_environment,
)

18 19 20
# Names through which we expect to be able to request the "C" locale
EXPECTED_C_LOCALE_EQUIVALENTS = ["C", "invalid.ascii"]

# Encoding we expect the C locale to use by default, both for the
# filesystem encoding and for the standard streams
EXPECTED_C_LOCALE_STREAM_ENCODING = EXPECTED_C_LOCALE_FS_ENCODING = "ascii"

# Whether coercion is expected when no locale is specified at all
EXPECT_COERCION_IN_DEFAULT_LOCALE = True

TARGET_LOCALES = ["C.UTF-8", "C.utf8", "UTF-8"]

# Apply some platform dependent overrides
if sys.platform.startswith("linux") and support.is_android:
    # Android defaults to using UTF-8 for all system interfaces
    EXPECTED_C_LOCALE_STREAM_ENCODING = "utf-8"
    EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform.startswith("linux"):
    # Linux distros typically alias the POSIX locale directly to the C
    # locale.
    # TODO: Once https://bugs.python.org/issue30672 is addressed, we'll be
    #       able to check this case unconditionally
    EXPECTED_C_LOCALE_EQUIVALENTS.append("POSIX")
elif sys.platform.startswith("aix"):
    # AIX uses iso8859-1 in the C locale, other *nix platforms use ASCII
    EXPECTED_C_LOCALE_STREAM_ENCODING = "iso8859-1"
    EXPECTED_C_LOCALE_FS_ENCODING = "iso8859-1"
elif sys.platform == "darwin":
    # FS encoding is UTF-8 on macOS
    EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform == "cygwin":
    # Cygwin defaults to using C.UTF-8
    # TODO: Work out a robust dynamic test for this that doesn't rely on
    #       CPython's own locale handling machinery
    EXPECT_COERCION_IN_DEFAULT_LOCALE = False

# Note that the above expectations are still wrong in some cases, such as:
# * Windows when PYTHONLEGACYWINDOWSFSENCODING is set
# * Any platform other than AIX that uses latin-1 in the C locale
# * Any Linux distro where POSIX isn't a simple alias for the C locale
# * Any Linux distro where the default locale is something other than "C"
#
# Options for dealing with this:
# * Don't set the PY_COERCE_C_LOCALE preprocessor definition on
#   such platforms (e.g. it isn't set on Windows)
# * Fix the test expectations to match the actual platform behaviour
66

67 68
# In order to get the warning messages to match up as expected, the candidate
# order here must match the target locale order in Python/pylifecycle.c
_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8", "UTF-8")

# There's no reliable cross-platform way of checking locale alias
# lists, so the only way of knowing which of these locales will work
# is to try them with locale.setlocale(). We do that in a subprocess
# in setUpModule() below to avoid altering the locale of the test runner.
#
# If the relevant locale module attributes exist, and we're not on a platform
# where we expect it to always succeed, we also check that
# `locale.nl_langinfo(locale.CODESET)` works, as if it fails, the interpreter
# will skip locale coercion for that particular target locale
_check_nl_langinfo_CODESET = (
    hasattr(locale, "nl_langinfo")
    and hasattr(locale, "CODESET")
    and sys.platform not in ("darwin", "linux")
)

86 87
def _set_locale_in_subprocess(locale_name):
    """Report whether a child interpreter can switch LC_CTYPE to *locale_name*.

    The child is started with PYTHONCOERCECLOCALE set to the empty string and
    calls locale.setlocale(locale.LC_CTYPE, locale_name). When
    _check_nl_langinfo_CODESET is true, the child additionally exits with a
    non-zero status if locale.nl_langinfo(locale.CODESET) is empty.

    Returns True when the child exits with status 0, False otherwise.
    """
    script_template = "import locale; print(locale.setlocale(locale.LC_CTYPE, '{}'))"
    if _check_nl_langinfo_CODESET:
        # If there's no valid CODESET, we expect coercion to be skipped
        script_template += "; import sys; sys.exit(not locale.nl_langinfo(locale.CODESET))"
    completed, _ = run_python_until_end(
        "-c", script_template.format(locale_name), PYTHONCOERCECLOCALE=''
    )
    return completed.rc == 0

95 96


97 98
_fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all"
_EncodingDetails = namedtuple("EncodingDetails", _fields)


class EncodingDetails(_EncodingDetails):
    """Encoding and locale settings reported by, or expected from, a child.

    Fields (all strings):
        fsencoding: sys.getfilesystemencoding() in the child
        stdin_info, stdout_info, stderr_info: "<encoding>:<errors>" for the
            corresponding standard stream
        lang, lc_ctype, lc_all: value of the matching environment variable
            in the child, or "not set"
    """

    # XXX (ncoghlan): Using JSON for child state reporting may be less fragile
    # One print() per namedtuple field, in field order
    CHILD_PROCESS_SCRIPT = ";".join([
        "import sys, os",
        "print(sys.getfilesystemencoding())",
        "print(sys.stdin.encoding + ':' + sys.stdin.errors)",
        "print(sys.stdout.encoding + ':' + sys.stdout.errors)",
        "print(sys.stderr.encoding + ':' + sys.stderr.errors)",
        "print(os.environ.get('LANG', 'not set'))",
        "print(os.environ.get('LC_CTYPE', 'not set'))",
        "print(os.environ.get('LC_ALL', 'not set'))",
    ])

    @classmethod
    def get_expected_details(cls, coercion_expected, fs_encoding, stream_encoding, env_vars):
        """Build the details a child process is expected to report.

        Parameters:
            coercion_expected: if true, LC_CTYPE is expected to have been
              replaced by CLI_COERCION_TARGET; otherwise it is taken from
              env_vars
            fs_encoding: expected sys.getfilesystemencoding() result
            stream_encoding: expected encoding of the standard streams
            env_vars: mapping of the environment variables given to the child

        Returns a dict keyed by the EncodingDetails field names, with the
        environment derived values lower-cased.
        """
        _stream = stream_encoding + ":{}"
        # stdin and stdout should use surrogateescape either because the
        # coercion triggered, or because the C locale was detected
        stream_info = 2*[_stream.format("surrogateescape")]
        # stderr should always use backslashreplace
        stream_info.append(_stream.format("backslashreplace"))
        expected_lang = env_vars.get("LANG", "not set").lower()
        if coercion_expected:
            expected_lc_ctype = CLI_COERCION_TARGET.lower()
        else:
            expected_lc_ctype = env_vars.get("LC_CTYPE", "not set").lower()
        expected_lc_all = env_vars.get("LC_ALL", "not set").lower()
        env_info = expected_lang, expected_lc_ctype, expected_lc_all
        return dict(cls(fs_encoding, *stream_info, *env_info)._asdict())

    @staticmethod
    def _handle_output_variations(data):
        """Adjust the output to handle platform specific idiosyncrasies

        * Some platforms report ASCII as ANSI_X3.4-1968
        * Some platforms report ASCII as US-ASCII
        * Some platforms report UTF-8 instead of utf-8

        *data* is the raw bytes output of the child; the lower-cased,
        normalised bytes are returned.
        """
        # Lower-case first so the alias replacement below is case-insensitive
        # (e.g. "us-ascii" is normalised just like "US-ASCII")
        data = data.lower()
        for ascii_alias in (b"ansi_x3.4-1968", b"us-ascii"):
            data = data.replace(ascii_alias, b"ascii")
        return data

    @classmethod
    def get_child_details(cls, env_vars):
        """Retrieves fsencoding and standard stream details from a child process

        Returns (encoding_details, stderr_lines):

        - encoding_details: dict keyed by the EncodingDetails field names,
          built from the normalised stdout of the child
        - stderr_lines: result of calling splitlines() on the stderr output

        The child runs CHILD_PROCESS_SCRIPT with "-X utf8=0" and with
        env_vars passed through as keyword arguments to
        run_python_until_end(). A non-zero exit status is reported via
        result.fail(py_cmd).
        """
        result, py_cmd = run_python_until_end(
            "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
            **env_vars
        )
        if result.rc != 0:
            result.fail(py_cmd)
        # All subprocess outputs in this test case should be pure ASCII
        adjusted_output = cls._handle_output_variations(result.out)
        stdout_lines = adjusted_output.decode("ascii").splitlines()
        child_encoding_details = dict(cls(*stdout_lines)._asdict())
        stderr_lines = result.err.decode("ascii").rstrip().splitlines()
        return child_encoding_details, stderr_lines


# Text of the warning the shared library emits at runtime when it is
# initialized in the legacy C locale
LEGACY_LOCALE_WARNING = (
    "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
    "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
    "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
    "locales is recommended."
)

# Template of the warning the CLI emits at runtime when it coerces the
# locale; the placeholder receives the name of the target locale
CLI_COERCION_WARNING_FMT = (
    "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale "
    "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)."
)


# Module state filled in by setUpModule(); None until it has run
AVAILABLE_TARGETS = CLI_COERCION_TARGET = CLI_COERCION_WARNING = None
188

189 190
def setUpModule():
    """Probe the available target locales and record the coercion target.

    Fills AVAILABLE_TARGETS with every entry of _C_UTF8_LOCALES that
    _set_locale_in_subprocess() accepts. When at least one is available, the
    first becomes CLI_COERCION_TARGET and CLI_COERCION_WARNING is formatted
    from it. Subsequent calls return immediately.
    """
    global AVAILABLE_TARGETS, CLI_COERCION_TARGET, CLI_COERCION_WARNING

    if AVAILABLE_TARGETS is not None:
        # initialization already done
        return

    # Find the target locales available in the current system
    AVAILABLE_TARGETS = []
    AVAILABLE_TARGETS.extend(
        candidate for candidate in _C_UTF8_LOCALES
        if _set_locale_in_subprocess(candidate)
    )

    if AVAILABLE_TARGETS:
        # Coercion is expected to use the first available target locale
        CLI_COERCION_TARGET = AVAILABLE_TARGETS[0]
        CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET)

    if not support.verbose:
        return

    # Dump the effective test configuration when running verbosely
    print(f"AVAILABLE_TARGETS = {AVAILABLE_TARGETS!r}")
    print(f"EXPECTED_C_LOCALE_EQUIVALENTS = {EXPECTED_C_LOCALE_EQUIVALENTS!r}")
    print(f"EXPECTED_C_LOCALE_STREAM_ENCODING = {EXPECTED_C_LOCALE_STREAM_ENCODING!r}")
    print(f"EXPECTED_C_LOCALE_FS_ENCODING = {EXPECTED_C_LOCALE_FS_ENCODING!r}")
    print(f"EXPECT_COERCION_IN_DEFAULT_LOCALE = {EXPECT_COERCION_IN_DEFAULT_LOCALE!r}")
    print(f"_C_UTF8_LOCALES = {_C_UTF8_LOCALES!r}")
    print(f"_check_nl_langinfo_CODESET = {_check_nl_langinfo_CODESET!r}")

218

219 220
class _LocaleHandlingTestCase(unittest.TestCase):
    # Base class to check expected locale handling behaviour

    def _check_child_encoding_details(self,
                                      env_vars,
                                      expected_fs_encoding,
                                      expected_stream_encoding,
                                      expected_warnings,
                                      coercion_expected):
        """Check the C locale handling for the given process environment

        Parameters:
            env_vars: environment variables to run the child process with
            expected_fs_encoding: expected sys.getfilesystemencoding() result
            expected_stream_encoding: expected encoding for standard streams
            expected_warnings: list of stderr lines to expect; None means
              that no output is expected on stderr
            coercion_expected: whether LC_CTYPE is expected to be coerced to
              the target locale in the child
        """
        actual_details, stderr_lines = EncodingDetails.get_child_details(env_vars)
        expected_details = EncodingDetails.get_expected_details(
            coercion_expected,
            expected_fs_encoding,
            expected_stream_encoding,
            env_vars
        )
        self.assertEqual(actual_details, expected_details)
        self.assertEqual(
            stderr_lines,
            [] if expected_warnings is None else expected_warnings,
        )
247

248 249

class LocaleConfigurationTests(_LocaleHandlingTestCase):
    # Test explicit external configuration via the process environment

    @classmethod
    def setUpClass(cls):
        # AVAILABLE_TARGETS is only populated once setUpModule() has run, so
        # this check can't be expressed with the @unittest.skipUnless
        # decorator
        if not AVAILABLE_TARGETS:
            raise unittest.SkipTest("No C-with-UTF-8 locale available")

    def test_external_target_locale_configuration(self):
        # Explicitly setting a target locale should give the same behaviour as
        # is seen when implicitly coercing to that target locale
        self.maxDiff = None

        utf8 = "utf-8"
        clean_env = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
        }
        for env_var in ("LANG", "LC_CTYPE"):
            for locale_to_set in AVAILABLE_TARGETS:
                # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
                #                 expected, so skip that combination for now
                # See https://bugs.python.org/issue30672 for discussion
                if (env_var, locale_to_set) == ("LANG", "UTF-8"):
                    continue

                with self.subTest(env_var=env_var,
                                  configured_locale=locale_to_set):
                    child_env = {**clean_env, env_var: locale_to_set}
                    self._check_child_encoding_details(
                        child_env,
                        utf8,
                        utf8,
                        expected_warnings=None,
                        coercion_expected=False,
                    )
291 292 293



294
@support.cpython_only
@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
                     "C locale coercion disabled at build time")
class LocaleCoercionTests(_LocaleHandlingTestCase):
    # Test implicit reconfiguration of the environment during CLI startup

    def _check_c_locale_coercion(self,
                                 fs_encoding, stream_encoding,
                                 coerce_c_locale,
                                 expected_warnings=None,
                                 coercion_expected=True,
                                 **extra_vars):
        """Check the C locale handling for various configurations

        Parameters:
            fs_encoding: expected sys.getfilesystemencoding() result
            stream_encoding: expected encoding for standard streams
            coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
              None: don't set the variable at all
              str: the value set in the child's environment
            expected_warnings: expected warning lines on stderr
            coercion_expected: whether the child is expected to report the
              coercion target as its LC_CTYPE
            extra_vars: additional environment variables to set in subprocess
        """
        self.maxDiff = None

        if not AVAILABLE_TARGETS:
            # Locale coercion is disabled when there aren't any target locales
            fs_encoding = EXPECTED_C_LOCALE_FS_ENCODING
            stream_encoding = EXPECTED_C_LOCALE_STREAM_ENCODING
            coercion_expected = False
            if expected_warnings:
                expected_warnings = [LEGACY_LOCALE_WARNING]

        child_env = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
        }
        child_env.update(extra_vars)
        if coerce_c_locale is not None:
            child_env["PYTHONCOERCECLOCALE"] = coerce_c_locale

        # Check behaviour for the default locale
        with self.subTest(default_locale=True,
                          PYTHONCOERCECLOCALE=coerce_c_locale):
            if EXPECT_COERCION_IN_DEFAULT_LOCALE:
                default_warnings = expected_warnings
                default_coercion = coercion_expected
            else:
                default_warnings = None
                default_coercion = False
            # On Android CLI_COERCION_WARNING is not printed when all the
            # locale environment variables are undefined or empty. When
            # this code path is run with environ['LC_ALL'] == 'C', then
            # LEGACY_LOCALE_WARNING is printed.
            if support.is_android and default_warnings == [CLI_COERCION_WARNING]:
                default_warnings = None
            self._check_child_encoding_details(child_env,
                                               fs_encoding,
                                               stream_encoding,
                                               default_warnings,
                                               default_coercion)

        # Check behaviour for explicitly configured locales
        for locale_to_set in EXPECTED_C_LOCALE_EQUIVALENTS:
            for env_var in ("LANG", "LC_CTYPE"):
                with self.subTest(env_var=env_var,
                                  nominal_locale=locale_to_set,
                                  PYTHONCOERCECLOCALE=coerce_c_locale):
                    configured_env = {**child_env, env_var: locale_to_set}
                    # Check behaviour on successful coercion
                    self._check_child_encoding_details(configured_env,
                                                       fs_encoding,
                                                       stream_encoding,
                                                       expected_warnings,
                                                       coercion_expected)

    def test_PYTHONCOERCECLOCALE_not_set(self):
        # This should coerce to the first available target locale by default
        self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None)

    def test_PYTHONCOERCECLOCALE_not_zero(self):
        # *Any* string other than "0" is considered "set" for our purposes
        # and hence should result in the locale coercion being enabled
        for setting in ("", "1", "true", "false"):
            self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=setting)

    def test_PYTHONCOERCECLOCALE_set_to_warn(self):
        # PYTHONCOERCECLOCALE=warn enables runtime warnings for legacy locales
        self._check_c_locale_coercion("utf-8", "utf-8",
                                      coerce_c_locale="warn",
                                      expected_warnings=[CLI_COERCION_WARNING])

    def test_PYTHONCOERCECLOCALE_set_to_zero(self):
        c_locale_encodings = (EXPECTED_C_LOCALE_FS_ENCODING,
                              EXPECTED_C_LOCALE_STREAM_ENCODING)
        # The setting "0" should result in the locale coercion being disabled
        self._check_c_locale_coercion(*c_locale_encodings,
                                      coerce_c_locale="0",
                                      coercion_expected=False)
        # Setting LC_ALL=C shouldn't make any difference to the behaviour
        self._check_c_locale_coercion(*c_locale_encodings,
                                      coerce_c_locale="0",
                                      LC_ALL="C",
                                      coercion_expected=False)

    def test_LC_ALL_set_to_C(self):
        c_locale_encodings = (EXPECTED_C_LOCALE_FS_ENCODING,
                              EXPECTED_C_LOCALE_STREAM_ENCODING)
        # Setting LC_ALL should render the locale coercion ineffective
        self._check_c_locale_coercion(*c_locale_encodings,
                                      coerce_c_locale=None,
                                      LC_ALL="C",
                                      coercion_expected=False)
        # And result in a warning about a lack of locale compatibility
        self._check_c_locale_coercion(*c_locale_encodings,
                                      coerce_c_locale="warn",
                                      LC_ALL="C",
                                      expected_warnings=[LEGACY_LOCALE_WARNING],
                                      coercion_expected=False)

    def test_PYTHONCOERCECLOCALE_set_to_one(self):
        # skip the test if the LC_CTYPE locale is C or coerced
        old_loc = locale.setlocale(locale.LC_CTYPE, None)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, old_loc)
        loc = locale.setlocale(locale.LC_CTYPE, "")
        if loc == "C":
            self.skipTest("test requires LC_CTYPE locale different than C")
        if loc in TARGET_LOCALES:
            self.skipTest("coerced LC_CTYPE locale: %s" % loc)

        # bpo-35336: PYTHONCOERCECLOCALE=1 must not coerce the LC_CTYPE locale
        # if it's not equal to "C"
        code = 'import locale; print(locale.setlocale(locale.LC_CTYPE, None))'
        child_env = dict(os.environ, PYTHONCOERCECLOCALE='1')
        completed = subprocess.run(
            [sys.executable, '-c', code],
            stdout=subprocess.PIPE,
            env=child_env,
            text=True,
        )
        self.assertEqual(completed.stdout.rstrip(), loc)


440
def test_main():
    """Run both test classes of this module, then reap child processes."""
    test_classes = (LocaleConfigurationTests, LocaleCoercionTests)
    support.run_unittest(*test_classes)
    support.reap_children()


if __name__ == "__main__":
    test_main()