/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com).

   Copyright (c) Corporation for National Research Initiatives.

*/

#include "Python.h"
#include "unicodeobject.h"

/* Bit flags stored in the `flags` member of a type record.  Each mask
   occupies a distinct bit, so they can be OR-ed together freely. */
#define ALPHA_MASK 0x01       /* tested by _PyUnicode_IsAlpha() */
#define DECIMAL_MASK 0x02     /* record's `decimal` member is valid */
#define DIGIT_MASK 0x04       /* record's `digit` member is valid */
#define LOWER_MASK 0x08       /* tested by _PyUnicode_IsLowercase() */
#define LINEBREAK_MASK 0x10   /* not referenced in this file; presumably the
                                 line-break property -- verify against users */
#define SPACE_MASK 0x20       /* not referenced in this file; presumably the
                                 whitespace property -- verify against users */
#define TITLE_MASK 0x40       /* tested by _PyUnicode_IsTitlecase() */
#define UPPER_MASK 0x80       /* tested by _PyUnicode_IsUppercase() */
#define NODELTA_MASK 0x100    /* upper/lower/title hold the mapped character
                                 itself instead of a 16-bit delta */
#define NUMERIC_MASK 0x200    /* tested by _PyUnicode_IsNumeric() */

/* One entry of the character type table that gettyperecord() indexes
   (_PyUnicode_TypeRecords; not defined in this file, presumably it comes
   from unicodetype_db.h -- confirm).

   NOTE(review): the member order is kept exactly as it was; a generated
   table with positional initializers would depend on it -- confirm before
   reordering. */
typedef struct {
    /* Case mappings.  Each holds a 16-bit delta to add to the character,
       or the mapped character itself when NODELTA_MASK is set in flags. */
    const Py_UNICODE upper;
    const Py_UNICODE lower;
    const Py_UNICODE title;
    /* Decimal value; only meaningful when DECIMAL_MASK is set. */
    const unsigned char decimal;
    /* Digit value; only meaningful when DIGIT_MASK is set. */
    const unsigned char digit;
    /* Combination of the *_MASK bits. */
    const unsigned short flags;
} _PyUnicode_TypeRecord;

#include "unicodetype_db.h"

static const _PyUnicode_TypeRecord *
37
gettyperecord(Py_UNICODE code)
38 39
{
    int index;
40

41
#ifdef Py_UNICODE_WIDE
42
    if (code >= 0x110000)
43
        index = 0;
44 45 46
    else
#endif
    {
47 48 49
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }
50

51 52
    return &_PyUnicode_TypeRecords[index];
}
53

54 55 56
/* Returns the titlecase Unicode characters corresponding to ch or just
   ch if no titlecase mapping is known. */

57
Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
58
{
59
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
60
    int delta = ctype->title;
61

62 63 64
    if (ctype->flags & NODELTA_MASK)
	return delta;

65 66 67 68
    if (delta >= 32768)
	    delta -= 65536;

    return ch + delta;
69 70 71 72 73
}

/* Returns 1 for Unicode characters having the category 'Lt', 0
   otherwise. */

74
int _PyUnicode_IsTitlecase(Py_UNICODE ch)
75
{
76 77 78
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & TITLE_MASK) != 0;
79 80 81 82 83
}

/* Returns the integer decimal (0-9) for Unicode characters having
   this property, -1 otherwise. */

84
int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
85
{
86 87 88
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
89 90
}

91
int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
92 93 94 95 96 97 98 99 100
{
    if (_PyUnicode_ToDecimalDigit(ch) < 0)
	return 0;
    return 1;
}

/* Returns the integer digit (0-9) for Unicode characters having
   this property, -1 otherwise. */

101
int _PyUnicode_ToDigit(Py_UNICODE ch)
102
{
103 104 105
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
106 107
}

108
int _PyUnicode_IsDigit(Py_UNICODE ch)
109 110 111 112 113 114 115 116 117
{
    if (_PyUnicode_ToDigit(ch) < 0)
	return 0;
    return 1;
}

/* Returns the numeric value as double for Unicode characters having
   this property, -1.0 otherwise. */

118
int _PyUnicode_IsNumeric(Py_UNICODE ch)
119
{
120 121 122
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & NUMERIC_MASK) != 0;
123 124 125 126 127 128 129
}

#ifndef WANT_WCTYPE_FUNCTIONS

/* Returns 1 for Unicode characters having the category 'Ll', 0
   otherwise. */

130
int _PyUnicode_IsLowercase(Py_UNICODE ch)
131
{
132 133 134
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & LOWER_MASK) != 0;
135 136 137 138 139
}

/* Returns 1 for Unicode characters having the category 'Lu', 0
   otherwise. */

140
int _PyUnicode_IsUppercase(Py_UNICODE ch)
141
{
142 143 144
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & UPPER_MASK) != 0;
145 146 147 148 149
}

/* Returns the uppercase Unicode characters corresponding to ch or just
   ch if no uppercase mapping is known. */

150
Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
151
{
152
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
153
    int delta = ctype->upper;
154 155
    if (ctype->flags & NODELTA_MASK)
	return delta;
156 157 158
    if (delta >= 32768)
	    delta -= 65536;
    return ch + delta;
159 160 161 162 163
}

/* Returns the lowercase Unicode characters corresponding to ch or just
   ch if no lowercase mapping is known. */

164
Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
165
{
166
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
167
    int delta = ctype->lower;
168 169
    if (ctype->flags & NODELTA_MASK)
	return delta;
170 171 172
    if (delta >= 32768)
	    delta -= 65536;
    return ch + delta;
173 174
}

175 176 177
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
   'Lo' or 'Lm',  0 otherwise. */

178
int _PyUnicode_IsAlpha(Py_UNICODE ch)
179
{
180
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
181

182
    return (ctype->flags & ALPHA_MASK) != 0;
183 184
}

#else

/* Export the interfaces using the wchar_t type for portability
   reasons: */

190
int _PyUnicode_IsLowercase(Py_UNICODE ch)
191 192 193 194
{
    return iswlower(ch);
}

195
int _PyUnicode_IsUppercase(Py_UNICODE ch)
196 197 198 199
{
    return iswupper(ch);
}

200
Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
201 202 203 204
{
    return towlower(ch);
}

205
Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
206 207 208 209
{
    return towupper(ch);
}

210
int _PyUnicode_IsAlpha(Py_UNICODE ch)
211 212 213 214
{
    return iswalpha(ch);
}

#endif