tokenizer.py

#!/usr/bin/env python3
import re
from collections import namedtuple
from functools import partial

Position = namedtuple('Position', ['row', 'col'])
Token = namedtuple('Token', ['type', 'position', 'length', 'value'])

class TokenizationError(Exception):
    def __init__(self, errors):
        """ Creates a TokenizationError from the specified list of errors.
            The errors should each be a tuple (position, value), where
            position is the Position where the unmatched characters begin,
            and value is the value of the unmatched characters.
        """
        self._errors = tuple(errors)

    def __str__(self):
        return "\n".join(
            'Invalid token "{}" found at row {}, col {}.'.format(
                e[1], e[0].row, e[0].col) for e in self._errors)

    def get_errors(self):
        return self._errors

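# Illustrative sketch (assumed values, not from the original module):
# TokenizationError renders one message line per unmatched run, e.g.
#
#     err = TokenizationError([(Position(0, 4), '$$')])
#     str(err)  # 'Invalid token "$$" found at row 0, col 4.'
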
class Tokenizer:
    def __init__(self, tokens):
        """ Creates a tokenizer that uses the specified list of tokens.
            tokens should be an iterable of tuples t where
                t[0] = The token type (can be anything to identify the token)
                t[1] = The regex that matches the token
            Where there are tokenization conflicts, tokens are matched
            in the same order as they appear in the tokens list.
        """
        # Materialize tokens so it can be indexed later even if a
        # one-shot iterable (e.g. a generator) was passed in.
        self._tokens = tuple(tokens)
        allPattern = '|'.join("(" + t[1] + ")" for t in self._tokens)
        self._re = re.compile(allPattern)
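
    # Illustrative sketch (assumed token types, not part of the original
    # module): since conflicts resolve in list order, a longer pattern
    # such as '==' must be listed before its prefix '=':
    #
    #     Tokenizer([('EQ', r'=='), ('ASSIGN', r'=')])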

    def tokenize(self, value):
        """ Tokenizes the value.
            Returns a list of Tokens (type, position, length, value) where
                type is the token type specified in tokens[i][0]
                position is a Position with the row and column of the token
                length is the length of the token
                value is the literal value of the token
            If non-tokens are discovered, a TokenizationError is raised.
        """
        tokens = []
        errors = []
        for lineNum, line in enumerate(value.splitlines()):
            if line:
                matches = tuple(self._re.finditer(line))
                errors.extend(self._find_unmatched(line, lineNum, matches))
                tokens.extend(map(partial(self._ret_from_match, lineNum), matches))
        if errors:
            raise TokenizationError(errors)
        return tokens

    def _ret_from_match(self, lineNum, match):
        """ Builds a Token from a regex match found on line lineNum. """
        return Token(
            self._tokens[match.lastindex - 1][0],
            Position(lineNum, match.start()),
            match.end() - match.start(),
            match.group(match.lastindex)
        )

    def _find_unmatched(self, line, lineNum, matches):
        """ For every substring of line that is not matched by one of matches,
            a tuple (pos, substring) is yielded.
        """
        start = 0
        for match in matches:
            if start < match.start():
                yield (Position(lineNum, start), line[start:match.start()])
            start = match.end()
        if start < len(line):
            yield (Position(lineNum, start), line[start:])
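
# A minimal usage sketch (the token types and inputs below are assumptions
# for illustration, not part of the original module):
#
#     tokenizer = Tokenizer([
#         ('NUMBER', r'\d+'),
#         ('PLUS', r'\+'),
#         ('SPACE', r' +'),
#     ])
#     for tok in tokenizer.tokenize('1 + 23'):
#         print(tok)
#
# which prints, one Token per match:
#
#     Token(type='NUMBER', position=Position(row=0, col=0), length=1, value='1')
#     Token(type='SPACE', position=Position(row=0, col=1), length=1, value=' ')
#     Token(type='PLUS', position=Position(row=0, col=2), length=1, value='+')
#     Token(type='SPACE', position=Position(row=0, col=3), length=1, value=' ')
#     Token(type='NUMBER', position=Position(row=0, col=4), length=2, value='23')
#
# Unmatched characters are collected and raised as a TokenizationError:
#
#     tokenizer.tokenize('1 $ 2')
#     # TokenizationError: Invalid token "$" found at row 0, col 2.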

if __name__ == "__main__":
    import test
    exit(test.tokenizer_test())