# mkstringprep.py: generate the stringprep tables module from rfc3454.txt.
import re, sys
from unicodedata import ucd_3_2_0 as unicodedata
# The generators below walk every code point up to 0x10FFFF via chr(), so a
# narrow build (sys.maxunicode == 65535) cannot run this script.
if sys.maxunicode == 65535:
    raise RuntimeError("need UCS-4 Python")

def gen_category(cats):
    """Yield, in ascending order, each code point whose category is in *cats*.

    Every code point in range(0, 0x110000) is examined using the
    ``unicodedata`` object bound at file level (the Unicode 3.2.0 database).

    cats -- container of general-category abbreviations, e.g. ["Cn"].
    """
    for codepoint in range(0, 0x110000):
        if unicodedata.category(chr(codepoint)) in cats:
            yield codepoint

def gen_bidirectional(cats):
    """Yield, in ascending order, each code point whose bidi class is in *cats*.

    Every code point in range(0, 0x110000) is examined using the
    ``unicodedata`` object bound at file level (the Unicode 3.2.0 database).

    cats -- container of bidirectional class names, e.g. ["R", "AL"].
    """
    for codepoint in range(0, 0x110000):
        if unicodedata.bidirectional(chr(codepoint)) in cats:
            yield codepoint

def compact_set(l):
    """Return Python source text for a set holding the integers in *l*.

    *l* is expected to be in ascending order (the callers in this file pass
    sorted lists).  Consecutive values are detected as runs:

    * a run of four or more values that is followed by a gap is written as
      a ``range(start, stop)`` expression;
    * shorter runs followed by a gap are written out as individual numbers;
    * the final run is written as a range when it has at least two values,
      otherwise as an individual number.

    An empty *l* yields ``"set()"``.
    """
    single = []   # values emitted one by one
    ranges = []   # (start, stop) pairs, stop exclusive
    prev = None   # first value of the run currently being accumulated
    span = 0      # how many further values belong to that run
    for e in l:
        if prev is None:
            prev = e
            span = 0
            continue
        if prev + span + 1 != e:
            # Gap: flush the run that just ended and start a new one at e.
            if span > 2:
                ranges.append((prev, prev + span + 1))
            else:
                single.extend(range(prev, prev + span + 1))
            prev = e
            span = 0
        else:
            span += 1
    # Flush the last run; prev is still None only when l was empty, in
    # which case there is nothing to emit.
    if prev is not None:
        if span:
            ranges.append((prev, prev + span + 1))
        else:
            single.append(prev)
    if not single and len(ranges) == 1:
        # A lone range can be handed to set() directly.
        ranges_src = "range(%d,%d)" % ranges[0]
    else:
        ranges_src = " + ".join("list(range(%d,%d))" % t for t in ranges)
    if not single:
        return "set(%s)" % ranges_src
    if not ranges_src:
        return "set(%r)" % (single,)
    return "set(%r + %s)" % (single, ranges_src)

############## Read the tables in the RFC #######################

# Load the whole RFC text as a list of lines; it is parsed below.
with open("rfc3454.txt") as f:
    data = f.readlines()

# tables collects (name, dict) pairs in the order the tables appear in the
# RFC.  Each dict maps a code point to itself (set-like tables), to a list
# of replacement code points, or to None (empty mapping).
tables = []
curname = None   # name of the table currently being read, if any
for l in data:
    l = l.strip()
    if not l:
        continue
    # Skip RFC page breaks
    if l.startswith(("Hoffman & Blanchet", "RFC 3454")):
        continue
    # Find start/end lines.  The dot separating the components of a table
    # name (e.g. "C.1.1") is matched literally.
    m = re.match(r"----- (Start|End) Table ([A-Z](\.[0-9])+) -----", l)
    if m:
        if m.group(1) == "Start":
            if curname:
                raise RuntimeError("Double Start", (curname, l))
            curname = m.group(2)
            table = {}
            tables.append((curname, table))
            continue
        else:
            if not curname:
                raise RuntimeError("End without start", l)
            if curname != m.group(2):
                raise RuntimeError("Unexpected end", l)
            curname = None
            continue
    if not curname:
        # Text outside any table is ignored.
        continue
    # Now we are in a table
    fields = l.split(";")
    if len(fields) > 1:
        # Drop comment field
        fields = fields[:-1]
    if len(fields) == 1:
        # Set-like entry: a single code point or a "start-end" range.
        fields = fields[0].split("-")
        if len(fields) > 1:
            # range
            try:
                start, end = fields
            except ValueError:
                raise RuntimeError("Unpacking problem", l)
        else:
            start = end = fields[0]
        start = int(start, 16)
        end = int(end, 16)
        for i in range(start, end+1):
            table[i] = i
    else:
        # Mapping entry: code point followed by space-separated targets.
        code, value = fields
        value = value.strip()
        if value:
            value = [int(v, 16) for v in value.split(" ")]
        else:
            # table B.1
            value = None
        table[int(code, 16)] = value

########### Generate compact Python versions of the tables #############

# Everything printed from here on forms the generated module: first its
# header comment, docstring and import ...
print("""# This file is generated by mkstringprep.py. DO NOT EDIT.
\"\"\"Library that exposes various tables found in the StringPrep RFC 3454.

There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
\"\"\"

from unicodedata import ucd_3_2_0 as unicodedata
""")

# ... then a guard pinning the generated module to the database version
# this script ran against.
print("assert unicodedata.unidata_version == %r" % (unicodedata.unidata_version,))

# A.1 is the table of unassigned characters
# XXX Plane 15 PUA is listed as unassigned in Python.
name, table = tables[0]
del tables[0]
assert name == "A.1"
table = set(table.keys())
Cn = set(gen_category(["Cn"]))

# FDD0..FDEF are process internal codes
Cn -= set(range(0xFDD0, 0xFDF0))
# not a character
Cn -= set(range(0xFFFE, 0x110000, 0x10000))
Cn -= set(range(0xFFFF, 0x110000, 0x10000))

# The comparison against the RFC table is disabled, so `table` and `Cn`
# are computed but not checked here.
# assert table == Cn

print("""
def in_table_a1(code):
    if unicodedata.category(code) != 'Cn': return False
    c = ord(code)
    if 0xFDD0 <= c < 0xFDF0: return False
    return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
""")

# B.1 cannot easily be derived
name, table = tables[0]
del tables[0]
assert name == "B.1"
# The table is emitted verbatim as a compact set literal.
table = sorted(table.keys())
print("""
b1_set = """ + compact_set(table) + """
def in_table_b1(code):
    return ord(code) in b1_set
""")

# Tables B.2 and B.3 both describe case folding.
# They take CaseFolding.txt into account, which is
# not available in the Python database. Since
# B.2 is derived from B.3, B.3 is processed first.
# B.3 supposedly *is* CaseFolding-3.2.0.txt.

# Take the next two tables off the front of the list, checking their names.
name, table_b2 = tables.pop(0)
assert name == "B.2"

name, table_b3 = tables.pop(0)
assert name == "B.3"

# B.3 is mostly Python's .lower, except for a number
# of special cases, e.g. considering canonical forms.

b3_exceptions = {}

# Record every entry whose mapping differs from str.lower().
# NOTE(review): this iterates table_b2 although the comment above talks
# about B.3 -- confirm that this is intended before changing it.
for k,v in table_b2.items():
    if list(map(ord, chr(k).lower())) != v:
        b3_exceptions[k] = "".join(map(chr,v))

b3 = sorted(b3_exceptions.items())

# Emit the exceptions as a dict literal, four entries per output line.
print("""
b3_exceptions = {""")
for i, kv in enumerate(b3):
    print("0x%x:%a," % kv, end=' ')
    if i % 4 == 3:
        print()
print("}")

print("""
def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()
""")

def map_table_b3(code):
    """Return the B.3 mapping of the one-character string *code*.

    An entry in ``b3_exceptions`` (keyed by code point) takes precedence;
    otherwise the result is ``code.lower()``.
    """
    override = b3_exceptions.get(ord(code))
    return code.lower() if override is None else override

# B.2 is case folding for NFKC. This is the same as B.3,
# except where NormalizeWithKC(Fold(a)) !=
# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))

def map_table_b2(a):
    """Return the B.2 mapping of the one-character string *a*.

    The character is folded with map_table_b3 and NFKC-normalized; the
    result is folded and normalized once more.  If the second pass changes
    anything, the twice-processed string is returned, otherwise the plain
    B.3 folding of *a*.
    """
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = "".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al

# Collect the table_b2 entries that map_table_b2 fails to reproduce.
specials = {}
for k,v in table_b2.items():
    if list(map(ord, map_table_b2(chr(k)))) != v:
        specials[k] = v

# B.3 should not add any additional special cases
assert specials == {}

# The emitted function is the same text as map_table_b2 defined above.
print("""
def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = "".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al
""")

# C.1.1 is a table with a single character
name, table = tables[0]
del tables[0]
assert name == "C.1.1"
# The only entry is U+0020 (space).
assert table == {0x20:0x20}

print("""
def in_table_c11(code):
    return code == " "
""")

# C.1.2 is the rest of all space characters
name, table = tables[0]
del tables[0]
assert name == "C.1.2"

# Disabled consistency check of the RFC table against category Zs:
# table = set(table.keys())
# Zs = set(gen_category(["Zs"])) - {0x20}
# assert Zs == table

print("""
def in_table_c12(code):
    return unicodedata.category(code) == "Zs" and code != " "

def in_table_c11_c12(code):
    return unicodedata.category(code) == "Zs"
""")

# C.2.1 ASCII control characters
name, table_c21 = tables[0]
del tables[0]
assert name == "C.2.1"

# The RFC table must equal the category-Cc code points below 128.
# Cc and Cc_ascii are reused by the C.2.2 section further down.
Cc = set(gen_category(["Cc"]))
Cc_ascii = Cc & set(range(128))
table_c21 = set(table_c21.keys())
assert Cc_ascii == table_c21

print("""
def in_table_c21(code):
    return ord(code) < 128 and unicodedata.category(code) == "Cc"
""")

# C.2.2 Non-ASCII control characters. It also includes
# a number of characters in category Cf.
name, table_c22 = tables[0]
del tables[0]
assert name == "C.2.2"

# Every non-ASCII Cc code point must appear in the RFC table.
Cc_nonascii = Cc - Cc_ascii
table_c22 = set(table_c22.keys())
assert len(Cc_nonascii - table_c22) == 0

# Whatever the table lists beyond category Cc is emitted as an explicit set.
specials = list(table_c22 - Cc_nonascii)
specials.sort()

print("""c22_specials = """ + compact_set(specials) + """
def in_table_c22(code):
    c = ord(code)
    if c < 128: return False
    if unicodedata.category(code) == "Cc": return True
    return c in c22_specials

def in_table_c21_c22(code):
    return unicodedata.category(code) == "Cc" or \\
           ord(code) in c22_specials
""")

# C.3 Private use
name, table = tables[0]
del tables[0]
assert name == "C.3"

# The RFC table must coincide with category Co.
Co = set(gen_category(["Co"]))
assert set(table.keys()) == Co

print("""
def in_table_c3(code):
    return unicodedata.category(code) == "Co"
""")

# C.4 Non-character code points, xFFFE, xFFFF
# plus process internal codes
name, table = tables[0]
del tables[0]
assert name == "C.4"

# Expected contents: FDD0..FDEF plus xxFFFE and xxFFFF of every plane.
nonchar = set(range(0xFDD0,0xFDF0))
nonchar.update(range(0xFFFE,0x110000,0x10000))
nonchar.update(range(0xFFFF,0x110000,0x10000))
table = set(table.keys())
assert table == nonchar

print("""
def in_table_c4(code):
    c = ord(code)
    if c < 0xFDD0: return False
    if c < 0xFDF0: return True
    return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
""")

# C.5 Surrogate codes
name, table = tables[0]
del tables[0]
assert name == "C.5"

# The RFC table must coincide with category Cs.
Cs = set(gen_category(["Cs"]))
assert set(table.keys()) == Cs

print("""
def in_table_c5(code):
    return unicodedata.category(code) == "Cs"
""")

# C.6 Inappropriate for plain text
name, table = tables[0]
del tables[0]
assert name == "C.6"

# Emitted verbatim as a compact set literal.
table = sorted(table.keys())

print("""
c6_set = """ + compact_set(table) + """
def in_table_c6(code):
    return ord(code) in c6_set
""")

# C.7 Inappropriate for canonical representation
name, table = tables[0]
del tables[0]
assert name == "C.7"

# Emitted verbatim as a compact set literal.
table = sorted(table.keys())

print("""
c7_set = """ + compact_set(table) + """
def in_table_c7(code):
    return ord(code) in c7_set
""")

# C.8 Change display properties or are deprecated
name, table = tables[0]
del tables[0]
assert name == "C.8"

# Emitted verbatim as a compact set literal.
table = sorted(table.keys())

print("""
c8_set = """ + compact_set(table) + """
def in_table_c8(code):
    return ord(code) in c8_set
""")

# C.9 Tagging characters
name, table = tables[0]
del tables[0]
assert name == "C.9"

# Emitted verbatim as a compact set literal.
table = sorted(table.keys())

print("""
c9_set = """ + compact_set(table) + """
def in_table_c9(code):
    return ord(code) in c9_set
""")

# D.1 Characters with bidirectional property "R" or "AL"
name, table = tables[0]
del tables[0]
assert name == "D.1"

# The RFC table must coincide with the code points of bidi class R or AL.
RandAL = set(gen_bidirectional(["R","AL"]))
assert set(table.keys()) == RandAL

print("""
def in_table_d1(code):
    return unicodedata.bidirectional(code) in ("R","AL")
""")

# D.2 Characters with bidirectional property "L"
name, table = tables[0]
del tables[0]
assert name == "D.2"

# The RFC table must coincide with the code points of bidi class L.
L = set(gen_bidirectional(["L"]))
assert set(table.keys()) == L

print("""
def in_table_d2(code):
    return unicodedata.bidirectional(code) == "L"
""")