Kaydet (Commit) b83942c7 authored tarafından Raymond Hettinger's avatar Raymond Hettinger Kaydeden (comit) Miss Islington (bot)

Cleanup and improve the regex tokenizer example. (GH-10426)



1) Convert weird field name "typ" to the more standard "type".
2) For the NUMBER type, convert the value to an int() or float().
3) Simplify ``group(kind)`` to the shorter and faster ``group()`` call.
4) Simplify logic to a single if-elif chain to make this easier to extend.
5) Reorder the tests to match the order the tokens are specified.
   This isn't necessary for correctness but does make the example
   easier to follow.
6) Move the "column" calculation before the if-elif chain so that
   users have the option of using this value in error messages.
üst 216aaaa0
...@@ -1609,7 +1609,7 @@ successive matches:: ...@@ -1609,7 +1609,7 @@ successive matches::
import collections import collections
import re import re
Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column']) Token = collections.namedtuple('Token', ['type', 'value', 'line', 'column'])
def tokenize(code): def tokenize(code):
keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'} keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
...@@ -1621,25 +1621,27 @@ successive matches:: ...@@ -1621,25 +1621,27 @@ successive matches::
('OP', r'[+\-*/]'), # Arithmetic operators ('OP', r'[+\-*/]'), # Arithmetic operators
('NEWLINE', r'\n'), # Line endings ('NEWLINE', r'\n'), # Line endings
('SKIP', r'[ \t]+'), # Skip over spaces and tabs ('SKIP', r'[ \t]+'), # Skip over spaces and tabs
('MISMATCH',r'.'), # Any other character ('MISMATCH', r'.'), # Any other character
] ]
tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification) tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
line_num = 1 line_num = 1
line_start = 0 line_start = 0
for mo in re.finditer(tok_regex, code): for mo in re.finditer(tok_regex, code):
kind = mo.lastgroup kind = mo.lastgroup
value = mo.group(kind) value = mo.group()
if kind == 'NEWLINE': column = mo.start() - line_start
if kind == 'NUMBER':
value = float(value) if '.' in value else int(value)
elif kind == 'ID' and value in keywords:
kind = value
elif kind == 'NEWLINE':
line_start = mo.end() line_start = mo.end()
line_num += 1 line_num += 1
continue
elif kind == 'SKIP': elif kind == 'SKIP':
pass continue
elif kind == 'MISMATCH': elif kind == 'MISMATCH':
raise RuntimeError(f'{value!r} unexpected on line {line_num}') raise RuntimeError(f'{value!r} unexpected on line {line_num}')
else:
if kind == 'ID' and value in keywords:
kind = value
column = mo.start() - line_start
yield Token(kind, value, line_num, column) yield Token(kind, value, line_num, column)
statements = ''' statements = '''
...@@ -1654,25 +1656,25 @@ successive matches:: ...@@ -1654,25 +1656,25 @@ successive matches::
The tokenizer produces the following output:: The tokenizer produces the following output::
Token(typ='IF', value='IF', line=2, column=4) Token(type='IF', value='IF', line=2, column=4)
Token(typ='ID', value='quantity', line=2, column=7) Token(type='ID', value='quantity', line=2, column=7)
Token(typ='THEN', value='THEN', line=2, column=16) Token(type='THEN', value='THEN', line=2, column=16)
Token(typ='ID', value='total', line=3, column=8) Token(type='ID', value='total', line=3, column=8)
Token(typ='ASSIGN', value=':=', line=3, column=14) Token(type='ASSIGN', value=':=', line=3, column=14)
Token(typ='ID', value='total', line=3, column=17) Token(type='ID', value='total', line=3, column=17)
Token(typ='OP', value='+', line=3, column=23) Token(type='OP', value='+', line=3, column=23)
Token(typ='ID', value='price', line=3, column=25) Token(type='ID', value='price', line=3, column=25)
Token(typ='OP', value='*', line=3, column=31) Token(type='OP', value='*', line=3, column=31)
Token(typ='ID', value='quantity', line=3, column=33) Token(type='ID', value='quantity', line=3, column=33)
Token(typ='END', value=';', line=3, column=41) Token(type='END', value=';', line=3, column=41)
Token(typ='ID', value='tax', line=4, column=8) Token(type='ID', value='tax', line=4, column=8)
Token(typ='ASSIGN', value=':=', line=4, column=12) Token(type='ASSIGN', value=':=', line=4, column=12)
Token(typ='ID', value='price', line=4, column=15) Token(type='ID', value='price', line=4, column=15)
Token(typ='OP', value='*', line=4, column=21) Token(type='OP', value='*', line=4, column=21)
Token(typ='NUMBER', value='0.05', line=4, column=23) Token(type='NUMBER', value=0.05, line=4, column=23)
Token(typ='END', value=';', line=4, column=27) Token(type='END', value=';', line=4, column=27)
Token(typ='ENDIF', value='ENDIF', line=5, column=4) Token(type='ENDIF', value='ENDIF', line=5, column=4)
Token(typ='END', value=';', line=5, column=9) Token(type='END', value=';', line=5, column=9)
.. [Frie09] Friedl, Jeffrey. Mastering Regular Expressions. 3rd ed., O'Reilly .. [Frie09] Friedl, Jeffrey. Mastering Regular Expressions. 3rd ed., O'Reilly
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment