Tokenization is an important step for any parser/compiler/interpreter.
# Sample expression that the tokenizer demos in this file take as input.
text = 'foo = 23 + 42 * 10'

# NOTE(review): not referenced in the visible part of this file; presumably a
# width for justified/padded output -- confirm before removing.
just_len = 60
import re
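`(?P<NAME>...)` is the regex syntax for a named capture group; each token pattern below uses one so that the name of the group that matched can serve as the token type.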
# Each token type is a regex wrapped in a named group, (?P<TYPE>...), so a
# match can report which alternative fired.
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'  # identifier
NUM = r'(?P<NUM>\d+)'  # run of digits
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'  # run of whitespace

# Alternatives are tried left to right, so this tuple is also the priority
# order of the token types.
_alternatives = (NAME, NUM, PLUS, TIMES, EQ, WS)
master_pattern = re.compile('|'.join(_alternatives))
from collections import namedtuple
# A token is a (type, value) pair: the kind of token and the text it covers.
Token = namedtuple('Token', 'type value')
Generate all tokens that match the pattern.
def generate_tokens(pattern, text):
    """Yield a ``Token`` for each consecutive match of *pattern* in *text*.

    Args:
        pattern: A compiled regular expression. Its alternatives are expected
            to be named groups, because the name of the group that matched is
            used as the token type.
        text: The string to tokenize.

    Yields:
        Token: ``Token(type, value)`` where ``type`` is the match's
        ``lastgroup`` and ``value`` is the matched text.

    Note:
        Each match must start where the previous one ended. Iteration stops
        as soon as ``scanner.match()`` returns ``None``, so text after the
        first unmatched character is dropped silently rather than reported
        as an error.
    """
    scanner = pattern.scanner(text)
    # iter(callable, sentinel): call scanner.match() repeatedly until it
    # returns None (end of text, or nothing matches at the current position).
    for m in iter(scanner.match, None):
        yield Token(m.lastgroup, m.group())
# Demo: print every token found in the sample text, whitespace tokens included.
for token in generate_tokens(master_pattern, text):
    print(token)
Filter out the tokens we do not want (here, whitespace).
# Generator expression that skips whitespace tokens. It is lazy: nothing is
# tokenized until the loop below starts pulling items from it.
excluded_ws = (
    token
    for token in generate_tokens(master_pattern, text)
    if token.type != 'WS'
)
for token in excluded_ws:
    print(token)
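The order of the patterns matters, because regex alternation picks the first alternative that matches rather than the longest one. Longer patterns should therefore be listed first; for example, <= should come before = or <.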
# Token patterns for the comparison-operator demo. NUM and EQ are rebound to
# the same regexes they already hold from earlier in the file.
NUM = r'(?P<NUM>\d+)'
LT = r'(?P<LT><)'
LE = r'(?P<LE><=)'
EQ = r'(?P<EQ>=)'

print('\npattern in right order')
# LE is listed before LT and EQ, so '<=' is consumed as a single LE token
# before the shorter alternatives get a chance to match.
master_pattern_in_right_order = re.compile('|'.join([NUM, LE, LT, EQ]))
token_correct = list(generate_tokens(master_pattern_in_right_order, '3<=4'))
for token in token_correct:
    print(token)
If < is listed before <=, then <= will be split into two tokens, < and =, which is incorrect.
print('\npattern in wrong order')
# Deliberately wrong ordering: LT and EQ come before LE, so '<' matches first
# and '<=' is split into an LT token followed by an EQ token.
# Bound to its own name so the correctly ordered pattern is not overwritten
# by a pattern that contradicts the variable's name.
master_pattern_in_wrong_order = re.compile('|'.join([NUM, LT, EQ, LE]))
token_incorrect = list(generate_tokens(master_pattern_in_wrong_order, '3<=4'))
for token in token_incorrect:
    print(token)