unicodectype.c 7.49 KB
Newer Older
1 2 3
/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/

#include "Python.h"

13 14 15 16 17 18 19 20
/* Bit flags stored in the `flags` field of a type record.  Each mask
   occupies a distinct bit; the highest one (0x4000) still fits in the
   16-bit flags field. */
#define ALPHA_MASK 0x01
#define DECIMAL_MASK 0x02
#define DIGIT_MASK 0x04
#define LOWER_MASK 0x08
#define LINEBREAK_MASK 0x10
#define SPACE_MASK 0x20
#define TITLE_MASK 0x40
#define UPPER_MASK 0x80
#define XID_START_MASK 0x100
#define XID_CONTINUE_MASK 0x200
#define PRINTABLE_MASK 0x400
#define NUMERIC_MASK 0x800
#define CASE_IGNORABLE_MASK 0x1000
#define CASED_MASK 0x2000
/* When set, the upper/lower/title fields are packed references into
   _PyUnicode_ExtendedCase instead of plain deltas. */
#define EXTENDED_CASE_MASK 0x4000
28 29

/* One entry of the character type database. */
typedef struct {
    /*
       These are either deltas to the character or offsets in
       _PyUnicode_ExtendedCase.
    */
    const int upper;
    const int lower;
    const int title;
    /* Note if more flag space is needed, decimal and digit could be unified. */
    /* Values reported by the ToDecimalDigit / ToDigit helpers; only
       meaningful when the matching DECIMAL_MASK / DIGIT_MASK flag is set. */
    const unsigned char decimal;
    const unsigned char digit;
    /* Bitwise OR of the *_MASK flags. */
    const unsigned short flags;
} _PyUnicode_TypeRecord;

#include "unicodetype_db.h"

static const _PyUnicode_TypeRecord *
46
gettyperecord(Py_UCS4 code)
47 48
{
    int index;
49

50
    if (code >= 0x110000)
51
        index = 0;
52 53
    else
    {
54 55 56
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }
57

58 59
    return &_PyUnicode_TypeRecords[index];
}
60

61 62 63
/* Returns the titlecase Unicode characters corresponding to ch or just
   ch if no titlecase mapping is known. */

64
Py_UCS4 _PyUnicode_ToTitlecase(Py_UCS4 ch)
65
{
66
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
67

68 69
    if (ctype->flags & EXTENDED_CASE_MASK)
        return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF];
70
    return ch + ctype->title;
71 72 73 74 75
}

/* Returns 1 for Unicode characters having the category 'Lt', 0
   otherwise. */

76
int _PyUnicode_IsTitlecase(Py_UCS4 ch)
77
{
78 79 80
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & TITLE_MASK) != 0;
81 82
}

83 84 85
/* Returns 1 for Unicode characters having the XID_Start property, 0
   otherwise. */

86
int _PyUnicode_IsXidStart(Py_UCS4 ch)
87 88 89 90 91 92 93 94 95
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & XID_START_MASK) != 0;
}

/* Returns 1 for Unicode characters having the XID_Continue property,
   0 otherwise. */

96
int _PyUnicode_IsXidContinue(Py_UCS4 ch)
97 98 99 100 101 102
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & XID_CONTINUE_MASK) != 0;
}

103 104 105
/* Returns the integer decimal (0-9) for Unicode characters having
   this property, -1 otherwise. */

106
int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
107
{
108 109 110
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
111 112
}

113
/* Returns 1 if ch has a decimal value (as reported by
   _PyUnicode_ToDecimalDigit), 0 otherwise. */
int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
{
    return _PyUnicode_ToDecimalDigit(ch) >= 0;
}

/* Returns the integer digit (0-9) for Unicode characters having
   this property, -1 otherwise. */

123
int _PyUnicode_ToDigit(Py_UCS4 ch)
124
{
125 126 127
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
128 129
}

130
/* Returns 1 if ch has a digit value (as reported by _PyUnicode_ToDigit),
   0 otherwise. */
int _PyUnicode_IsDigit(Py_UCS4 ch)
{
    return _PyUnicode_ToDigit(ch) >= 0;
}

/* Returns the numeric value as double for Unicode characters having
   this property, -1.0 otherwise. */

140
int _PyUnicode_IsNumeric(Py_UCS4 ch)
141
{
142 143 144
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & NUMERIC_MASK) != 0;
145 146
}

147 148 149 150 151 152 153 154 155 156 157 158 159
/* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
   0 otherwise.
   All characters except those characters defined in the Unicode character
   database as following categories are considered printable.
      * Cc (Other, Control)
      * Cf (Other, Format)
      * Cs (Other, Surrogate)
      * Co (Other, Private Use)
      * Cn (Other, Not Assigned)
      * Zl Separator, Line ('\u2028', LINE SEPARATOR)
      * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
      * Zs (Separator, Space) other than ASCII space('\x20').
*/
160
int _PyUnicode_IsPrintable(Py_UCS4 ch)
161 162 163
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

164
    return (ctype->flags & PRINTABLE_MASK) != 0;
165 166
}

167 168 169
/* Returns 1 for Unicode characters having the category 'Ll', 0
   otherwise. */

170
int _PyUnicode_IsLowercase(Py_UCS4 ch)
171
{
172 173 174
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & LOWER_MASK) != 0;
175 176 177 178 179
}

/* Returns 1 for Unicode characters having the category 'Lu', 0
   otherwise. */

180
int _PyUnicode_IsUppercase(Py_UCS4 ch)
181
{
182 183 184
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & UPPER_MASK) != 0;
185 186 187 188 189
}

/* Returns the uppercase Unicode characters corresponding to ch or just
   ch if no uppercase mapping is known. */

190
Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
191
{
192
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
193 194

    if (ctype->flags & EXTENDED_CASE_MASK)
195
        return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
196
    return ch + ctype->upper;
197 198 199 200 201
}

/* Returns the lowercase Unicode characters corresponding to ch or just
   ch if no lowercase mapping is known. */

202
Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
203
{
204
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
205 206

    if (ctype->flags & EXTENDED_CASE_MASK)
207
        return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
208
    return ch + ctype->lower;
209 210 211 212 213 214 215
}

/* Copy n consecutive entries of _PyUnicode_ExtendedCase, starting at
   index, into res and return n.  Nothing is copied when n <= 0. */
static int
copy_extended_case(int index, int n, Py_UCS4 *res)
{
    int i;

    for (i = 0; i < n; i++)
        res[i] = _PyUnicode_ExtendedCase[index + i];
    return n;
}

/* Full lowercase mapping of ch.  Writes the resulting code points to res
   and returns how many were written.  res is supplied by the caller and
   must be large enough for the mapping. */
int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    if (ctype->flags & EXTENDED_CASE_MASK) {
        /* Low 16 bits: offset into _PyUnicode_ExtendedCase;
           bits 24 and up: number of code points in the mapping. */
        return copy_extended_case(ctype->lower & 0xFFFF,
                                  ctype->lower >> 24, res);
    }
    /* Simple case: the field is a delta applied to ch. */
    res[0] = ch + ctype->lower;
    return 1;
}

/* Full titlecase mapping of ch.  Writes the resulting code points to res
   and returns how many were written.  res is supplied by the caller and
   must be large enough for the mapping. */
int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    if (ctype->flags & EXTENDED_CASE_MASK) {
        /* Same packing as the lower field: offset in the low 16 bits,
           length in bits 24 and up. */
        return copy_extended_case(ctype->title & 0xFFFF,
                                  ctype->title >> 24, res);
    }
    res[0] = ch + ctype->title;
    return 1;
}

/* Full uppercase mapping of ch.  Writes the resulting code points to res
   and returns how many were written.  res is supplied by the caller and
   must be large enough for the mapping. */
int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    if (ctype->flags & EXTENDED_CASE_MASK) {
        /* Same packing as the lower field: offset in the low 16 bits,
           length in bits 24 and up. */
        return copy_extended_case(ctype->upper & 0xFFFF,
                                  ctype->upper >> 24, res);
    }
    res[0] = ch + ctype->upper;
    return 1;
}

/* Full case-folded mapping of ch.  Writes the resulting code points to
   res and returns how many were written.  Falls back to the full
   lowercase mapping when the record carries no separate folding. */
int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    if (ctype->flags & EXTENDED_CASE_MASK) {
        /* Bits 20-22 of the lower field give the length of a dedicated
           folding; zero means there is none. */
        int n = (ctype->lower >> 20) & 7;
        if (n != 0) {
            /* The folding entries start right after the lowercase
               entries: offset plus lowercase length. */
            int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
            return copy_extended_case(index, n, res);
        }
    }
    return _PyUnicode_ToLowerFull(ch, res);
}

274 275 276 277 278 279 280 281 282 283 284 285
/* Returns 1 if the type record for ch has CASED_MASK set, 0 otherwise. */
int _PyUnicode_IsCased(Py_UCS4 ch)
{
    return (gettyperecord(ch)->flags & CASED_MASK) ? 1 : 0;
}

/* Returns 1 if the type record for ch has CASE_IGNORABLE_MASK set,
   0 otherwise. */
int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
{
    return (gettyperecord(ch)->flags & CASE_IGNORABLE_MASK) ? 1 : 0;
}

288 289 290
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
   'Lo' or 'Lm',  0 otherwise. */

291
int _PyUnicode_IsAlpha(Py_UCS4 ch)
292
{
293
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
294

295
    return (ctype->flags & ALPHA_MASK) != 0;
296 297
}