8. Examples

8.1. National Identification Numbers (Worldwide)

8.2. Finding all Adverbs

Listing 335. Finding all Adverbs
import re

# Crude adverb detection: treat every word ending in "ly" as an adverb.
TEXT = 'He was carefully disguised but captured quickly by police.'
ADVERBS = r'\w+ly'

re.compile(ADVERBS).findall(TEXT)
# ['carefully', 'quickly']

8.3. Making a Phonebook

Listing 336. Practical example of Regex usage
import re

TEXT = """Jan Twardowski: 834.345.1254 Polish Space Agency

Mark Watney: 892.345.3428 Johnson Space Center
Matt Kowalski: 925.541.7625 Kennedy Space Center


Melissa Lewis: 548.326.4584 Bajkonur, Kazakhstan"""

entries = re.split('\n+', TEXT)
# [
#   'Jan Twardowski: 834.345.1254 Polish Space Agency',
#   'Mark Watney: 892.345.3428 Johnson Space Center',
#   'Matt Kowalski: 925.541.7625 Kennedy Space Center',
#   'Melissa Lewis: 548.326.4584 Bajkonur, Kazakhstan'
# ]

out = [re.split(':?\s', entry, maxsplit=3) for entry in entries]
# [
#   ['Jan', 'Twardowski', '834.345.1254', 'Polish Space Agency'],
#   ['Mark', 'Watney', '892.345.3428', 'Johnson Space Center'],
#   ['Matt', 'Kowalski', '925.541.7625', 'Kennedy Space Center'],
#   ['Melissa', 'Lewis', '548.326.4584', 'Bajkonur, Kazakhstan']
# ]

8.4. Writing a Tokenizer

Listing 337. Writing a Tokenizer
import collections
import re

"""
A tokenizer or scanner analyzes a string to categorize groups of characters.
This is a useful first step in writing a compiler or interpreter.

The text categories are specified with regular expressions.
The technique is to combine those into a single master regular
expression and to loop over successive matches.
"""

# One lexical unit: its category, raw text, and position in the source.
Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])


def tokenize(code):
    """Yield Token tuples scanned from *code*.

    Categories are defined by regex rules combined into one master
    pattern; keywords are recognized as a special case of identifiers.
    Raises RuntimeError on any character no rule accepts.
    """
    keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
    # (name, pattern) rules; order matters — the first alternative wins.
    rules = (
        ('NUMBER',  r'\d+(\.\d*)?'),   # Integer or decimal number
        ('ASSIGN',  r':='),            # Assignment operator
        ('END',     r';'),             # Statement terminator
        ('ID',      r'[A-Za-z]+'),     # Identifiers
        ('OP',      r'[+\-*/]'),       # Arithmetic operators
        ('NEWLINE', r'\n'),            # Line endings
        ('SKIP',    r'[ \t]+'),        # Skip over spaces and tabs
        ('MISMATCH', r'.'),            # Any other character
    )
    master = '|'.join(f'(?P<{name}>{pattern})' for name, pattern in rules)
    line_num = 1
    line_start = 0

    for match in re.finditer(master, code):
        kind = match.lastgroup
        value = match.group(kind)

        # Guard clauses for the rules that do not produce a token.
        if kind == 'SKIP':
            continue
        if kind == 'NEWLINE':
            line_num += 1
            line_start = match.end()
            continue
        if kind == 'MISMATCH':
            raise RuntimeError(f'{value!r} unexpected on line {line_num}')

        # A keyword is an identifier whose text is in the keyword set.
        if kind == 'ID' and value in keywords:
            kind = value
        yield Token(kind, value, line_num, match.start() - line_start)

# A small program in the toy language above, used to drive the scanner.
statements = '''
    IF quantity THEN
        total := total + price * quantity;
        tax := price * 0.05;
    ENDIF;
'''

# Print every token the scanner produces, one per line.
for tok in tokenize(statements):
    print(tok)

# Token(typ='IF', value='IF', line=2, column=4)
# Token(typ='ID', value='quantity', line=2, column=7)
# Token(typ='THEN', value='THEN', line=2, column=16)
# Token(typ='ID', value='total', line=3, column=8)
# Token(typ='ASSIGN', value=':=', line=3, column=14)
# Token(typ='ID', value='total', line=3, column=17)
# Token(typ='OP', value='+', line=3, column=23)
# Token(typ='ID', value='price', line=3, column=25)
# Token(typ='OP', value='*', line=3, column=31)
# Token(typ='ID', value='quantity', line=3, column=33)
# Token(typ='END', value=';', line=3, column=41)
# Token(typ='ID', value='tax', line=4, column=8)
# Token(typ='ASSIGN', value=':=', line=4, column=12)
# Token(typ='ID', value='price', line=4, column=15)
# Token(typ='OP', value='*', line=4, column=21)
# Token(typ='NUMBER', value='0.05', line=4, column=23)
# Token(typ='END', value=';', line=4, column=27)
# Token(typ='ENDIF', value='ENDIF', line=5, column=4)
# Token(typ='END', value=';', line=5, column=9)