#include "Python.h"
#ifdef MS_WINDOWS
#  include <windows.h>
#endif

#ifdef HAVE_STAT

/* Decode a byte string from the locale encoding with the
   surrogateescape error handler (undecodable bytes are decoded as characters
   in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
   character, escape the bytes using the surrogateescape error handler instead
   of decoding them.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory) and write the number of written wide
   characters excluding the null character into *size if size is not NULL, or
   NULL on error (conversion or memory allocation error). A short message is
   printed to stderr on every error path.

   Conversion errors should never happen, unless there is a bug in the C
   library. */
wchar_t*
_Py_char2wchar(const char* arg, size_t *size)
{
    /* Largest number of wchar_t elements whose total byte size still fits in
       a size_t; a larger request would wrap in the multiplications below and
       yield an undersized buffer. */
    const size_t max_wchars = (size_t)-1 / sizeof(wchar_t);
    wchar_t *res;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    size_t argsize = strlen(arg);
#else
    size_t argsize = mbstowcs(NULL, arg, 0);
#endif
    size_t count;
    unsigned char *in;
    wchar_t *out;
#ifdef HAVE_MBRTOWC
    mbstate_t mbs;
#endif
    if (argsize != (size_t)-1) {
        /* argsize + 1 cannot wrap: argsize != (size_t)-1 was just checked. */
        if (argsize + 1 > max_wchars)
            goto oom;
        res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize+1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
                ;
            if (*tmp == 0) {
                if (size != NULL)
                    *size = count;
                return res;
            }
        }
        PyMem_Free(res);
    }
    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    if (argsize > max_wchars)
        goto oom;
    res = (wchar_t*)PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    /* argsize counts the bytes left to decode, including the final null
       byte, so the loop normally ends through the converted == 0 branch. */
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            fprintf(stderr, "unexpected mbrtowc result -2\n");
            PyMem_Free(res);
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (*out >= 0xd800 && *out <= 0xdfff) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. The first escaped
               byte overwrites the surrogate just stored at *out. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
#else
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    argsize = strlen(arg) + 1;
    if (argsize > max_wchars)
        goto oom;
    res = PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res) goto oom;
    in = (unsigned char*)arg;
    out = res;
    while(*in)
        if(*in < 128)
            *out++ = *in++;
        else
            *out++ = 0xdc00 + *in++;
    *out = 0;
#endif
    /* out points at the terminating null character, so the difference is
       the number of wide characters written before it. */
    if (size != NULL)
        *size = out - res;
    return res;
oom:
    fprintf(stderr, "out of memory\n");
    return NULL;
}

/* Encode a wide character string to the locale encoding, applying the
   surrogateescape error handler: each character in U+DC80..U+DCFF is turned
   back into the single byte 0x80..0xFF it stands for.

   This is the inverse operation of _Py_char2wchar().

   On success, return a newly allocated, null-terminated byte string; the
   caller releases it with PyMem_Free(). Return NULL if a character cannot
   be converted or if the output buffer cannot be allocated.

   If error_pos is not NULL, *error_pos receives the index of the offending
   character when conversion fails, and (size_t)-1 in every other case. */
char*
_Py_wchar2char(const wchar_t *text, size_t *error_pos)
{
    const size_t nchars = wcslen(text);
    char *encoded = NULL;   /* output buffer; allocated between the passes */
    char *cursor = NULL;    /* write position; NULL during the sizing pass */
    size_t remaining = 0;   /* pass 1: bytes needed; pass 2: bytes left */
    size_t idx, nbytes;
    wchar_t one[2];         /* one-character string handed to wcstombs() */
    wchar_t ch;

    if (error_pos != NULL)
        *error_pos = (size_t)-1;

    one[1] = 0;

    /* The same loop body runs twice: first to measure the output, then,
       once the buffer exists, to fill it. */
    for (;;) {
        const int writing = (cursor != NULL);

        for (idx = 0; idx < nchars; idx++) {
            ch = text[idx];

            if (ch >= 0xdc80 && ch <= 0xdcff) {
                /* UTF-8b surrogate: stands for exactly one raw byte */
                if (writing) {
                    *cursor++ = ch - 0xdc00;
                    remaining--;
                }
                else {
                    remaining++;
                }
                continue;
            }

            one[0] = ch;
            if (writing)
                nbytes = wcstombs(cursor, one, remaining);
            else
                nbytes = wcstombs(NULL, one, 0);

            if (nbytes == (size_t)-1) {
                /* Character not representable in the locale encoding */
                if (encoded != NULL)
                    PyMem_Free(encoded);
                if (error_pos != NULL)
                    *error_pos = idx;
                return NULL;
            }

            if (writing) {
                cursor += nbytes;
                remaining -= nbytes;
            }
            else {
                remaining += nbytes;
            }
        }

        if (encoded != NULL) {
            /* Second pass finished: terminate the string and stop. */
            *cursor = 0;
            break;
        }

        /* First pass finished: reserve room for the nul byte and allocate. */
        remaining += 1;
        encoded = PyMem_Malloc(remaining);
        if (encoded == NULL)
            return NULL;
        cursor = encoded;
    }
    return encoded;
}

/* In principle, this should use HAVE__WSTAT, and _wstat
   should be detected by autoconf. However, no current
   POSIX system provides that function, so testing for
   it is pointless.
   Not sure whether the MS_WINDOWS guards are necessary:
   perhaps for cygwin/mingw builds?
*/
#if defined(HAVE_STAT) && !defined(MS_WINDOWS)

/* Get the status of the file named by the wide character string path.
   The path is encoded to the locale encoding with _Py_wchar2char() and
   passed to stat(), which fills *buf.

   Return the result of stat(). If the path cannot be encoded, return -1
   with errno set to EINVAL. */

int
_Py_wstat(const wchar_t* path, struct stat *buf)
{
    int status;
    char *encoded = _Py_wchar2char(path, NULL);

    if (encoded == NULL) {
        errno = EINVAL;
        return -1;
    }

    status = stat(encoded, buf);
    PyMem_Free(encoded);
    return status;
}
#endif
/* Call _wstat() on Windows, or encode the path to the filesystem encoding and
   call stat() otherwise. Only fill st_mode attribute on Windows.

   Return 0 on success, -1 on _wstat() / stat() error or (if PyErr_Occurred())
   unicode error. */
Victor Stinner's avatar
Victor Stinner committed
240 241

int
242
_Py_stat(PyObject *path, struct stat *statbuf)
Victor Stinner's avatar
Victor Stinner committed
243 244 245 246 247
{
#ifdef MS_WINDOWS
    int err;
    struct _stat wstatbuf;

248
    err = _wstat(PyUnicode_AS_UNICODE(path), &wstatbuf);
Victor Stinner's avatar
Victor Stinner committed
249 250 251 252 253
    if (!err)
        statbuf->st_mode = wstatbuf.st_mode;
    return err;
#else
    int ret;
254
    PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner's avatar
Victor Stinner committed
255 256 257 258 259 260 261 262
    if (bytes == NULL)
        return -1;
    ret = stat(PyBytes_AS_STRING(bytes), statbuf);
    Py_DECREF(bytes);
    return ret;
#endif
}

263 264 265
/* Open a file. Use _wfopen() on Windows, encode the path to the locale
   encoding and use fopen() otherwise.

   Return the stream returned by _wfopen() / fopen(). On non-Windows
   builds, return NULL with errno set to EINVAL if:
     - mode cannot be converted by wcstombs() or does not fit, with its
       terminating null byte, into the 10 byte mode buffer;
     - path cannot be encoded by _Py_wchar2char() (this also covers a memory
       allocation failure inside that function). */

FILE *
_Py_wfopen(const wchar_t *path, const wchar_t *mode)
{
#ifndef MS_WINDOWS
    FILE *f;
    char *cpath;
    char cmode[10];
    size_t r;

    r = wcstombs(cmode, mode, sizeof(cmode));
    /* r == sizeof(cmode) means the buffer was filled without room for the
       terminating null byte, so the mode string is unusable. */
    if (r == (size_t)-1 || r >= sizeof(cmode)) {
        errno = EINVAL;
        return NULL;
    }
    cpath = _Py_wchar2char(path, NULL);
    if (cpath == NULL) {
        /* Report the failure through errno like the mode check above and
           like the other wide character wrappers in this file; otherwise
           the caller would read a stale errno value. */
        errno = EINVAL;
        return NULL;
    }
    f = fopen(cpath, cmode);
    PyMem_Free(cpath);
    return f;
#else
    return _wfopen(path, mode);
#endif
}
/* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
   call fopen() otherwise.

   Return the new file object on success, or NULL if the file cannot be open or
   (if PyErr_Occurred()) on unicode error */
Victor Stinner's avatar
Victor Stinner committed
295 296

FILE*
297
_Py_fopen(PyObject *path, const char *mode)
Victor Stinner's avatar
Victor Stinner committed
298 299 300 301 302 303 304 305 306
{
#ifdef MS_WINDOWS
    wchar_t wmode[10];
    int usize;

    usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
    if (usize == 0)
        return NULL;

307
    return _wfopen(PyUnicode_AS_UNICODE(path), wmode);
Victor Stinner's avatar
Victor Stinner committed
308 309
#else
    FILE *f;
310
    PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner's avatar
Victor Stinner committed
311 312 313 314 315 316 317 318 319
    if (bytes == NULL)
        return NULL;
    f = fopen(PyBytes_AS_STRING(bytes), mode);
    Py_DECREF(bytes);
    return f;
#endif
}

#ifdef HAVE_READLINK

/* Read the value of the symbolic link path into buf. The path is encoded
   to the locale encoding with _Py_wchar2char(); the link target is decoded
   from the locale encoding with _Py_char2wchar().

   bufsiz is the capacity of buf in wide characters, including the null
   character.

   Return the number of wide characters of the target, excluding the null
   character, on success. Return -1 on failure. errno is set to EINVAL when
   the path cannot be encoded, when readlink() fills all PATH_MAX bytes of
   the intermediate buffer, when the target cannot be decoded, or when the
   decoded target does not fit in buf. This function does not assign errno
   when readlink() itself fails. */

int
_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
{
    char target[PATH_MAX];
    char *encoded_path;
    wchar_t *decoded;
    size_t decoded_len;
    int nread;

    encoded_path = _Py_wchar2char(path, NULL);
    if (encoded_path == NULL)
        goto invalid;

    nread = (int)readlink(encoded_path, target, PATH_MAX);
    PyMem_Free(encoded_path);
    if (nread == -1)
        return -1;
    if (nread == PATH_MAX)
        /* No room left for the null byte: treat as an error. */
        goto invalid;
    target[nread] = '\0'; /* readlink() does not terminate the string */

    decoded = _Py_char2wchar(target, &decoded_len);
    if (decoded == NULL)
        goto invalid;
    if (decoded_len >= bufsiz) {
        PyMem_Free(decoded);
        goto invalid;
    }

    /* decoded_len < bufsiz, so wcsncpy() also copies the null character
       (and pads the rest of buf with null characters). */
    wcsncpy(buf, decoded, bufsiz);
    PyMem_Free(decoded);
    return (int)decoded_len;

invalid:
    errno = EINVAL;
    return -1;
}
#endif

#ifdef HAVE_REALPATH

/* Return the canonicalized absolute pathname of path. The path is encoded
   to the locale encoding with _Py_wchar2char(), resolved with realpath(),
   and the result is decoded from the locale encoding with _Py_char2wchar().

   resolved_path_size is the capacity of resolved_path in wide characters,
   including the null character.

   Return resolved_path on success. Return NULL on failure. errno is set to
   EINVAL when the path cannot be encoded, when the result cannot be decoded,
   or when the decoded result does not fit in resolved_path. This function
   does not assign errno when realpath() itself fails. */

wchar_t*
_Py_wrealpath(const wchar_t *path,
              wchar_t *resolved_path, size_t resolved_path_size)
{
    char resolved_bytes[PATH_MAX];
    char *encoded_path;
    char *canonical;
    wchar_t *decoded;
    size_t decoded_len;

    encoded_path = _Py_wchar2char(path, NULL);
    if (encoded_path == NULL)
        goto invalid;

    canonical = realpath(encoded_path, resolved_bytes);
    PyMem_Free(encoded_path);
    if (canonical == NULL)
        return NULL;

    decoded = _Py_char2wchar(resolved_bytes, &decoded_len);
    if (decoded == NULL)
        goto invalid;
    if (decoded_len >= resolved_path_size) {
        PyMem_Free(decoded);
        goto invalid;
    }

    /* decoded_len < resolved_path_size, so wcsncpy() also copies the null
       character (and pads the rest of the buffer with null characters). */
    wcsncpy(resolved_path, decoded, resolved_path_size);
    PyMem_Free(decoded);
    return resolved_path;

invalid:
    errno = EINVAL;
    return NULL;
}
#endif
/* Get the current directory. size is the buffer size in wide characters
   including the null character. Decode the path from the locale encoding. */
405

Victor Stinner's avatar
Victor Stinner committed
406 407 408 409 410 411 412
wchar_t*
_Py_wgetcwd(wchar_t *buf, size_t size)
{
#ifdef MS_WINDOWS
    return _wgetcwd(buf, size);
#else
    char fname[PATH_MAX];
413
    wchar_t *wname;
414
    size_t len;
415

Victor Stinner's avatar
Victor Stinner committed
416 417
    if (getcwd(fname, PATH_MAX) == NULL)
        return NULL;
418
    wname = _Py_char2wchar(fname, &len);
419 420
    if (wname == NULL)
        return NULL;
421
    if (size <= len) {
422
        PyMem_Free(wname);
Victor Stinner's avatar
Victor Stinner committed
423 424
        return NULL;
    }
425 426
    wcsncpy(buf, wname, size);
    PyMem_Free(wname);
Victor Stinner's avatar
Victor Stinner committed
427 428 429 430 431
    return buf;
#endif
}

#endif