summaryrefslogtreecommitdiff
path: root/projects/hackc/tokens.py
blob: 1ed94ae80f7c11ff98f3159ae33e3f35f27f6569 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import re

# Reserved words recognised by the tokenizer; an identifier-shaped word found
# in this list is classified as a "keyword" instead of an "identifier".
# The grouping below is only for readability -- the list order is unchanged.
KEYWORDS = [
    # program structure
    "class", "constructor", "function", "method",
    # variable declarations
    "field", "static", "var",
    # type names
    "int", "char", "boolean", "void",
    # constants
    "true", "false", "null", "this",
    # statements
    "let", "do", "if", "else", "while", "return",
]

# Each character of this string is a complete one-character "symbol" token.
SYMBOLS = "{}()[].,;+-*/&|<>=~"

# Names used for the ``type`` attribute of a token.
TOKEN_TYPES = [
    "keyword",
    "symbol",
    "integer",
    "string",
    "identifier",
]


class Token:
    """A token in JACK.

    Attributes:
        type: Token category; ``from_line`` produces "symbol", "integer",
            "string", "keyword" or "identifier".
        token: The token text exactly as it appears in the source (string
            tokens keep their surrounding double quotes).
        line_no: Line number supplied by the caller.
        column: Column supplied by the caller.
    """

    # Precompiled patterns; all are applied with ``match`` so they only ever
    # look at the start of the text handed to ``from_line``.
    _INTEGER_RE = re.compile(r"[0-9]+")
    # Plain strings: everything up to the next double quote.
    _STRING_RE = re.compile(r'".*?"')
    # "escape" extension: the body is any run of ordinary characters or
    # backslash-escaped characters, so \" does not terminate the string and
    # empty / one-character strings are matched correctly.
    _ESCAPED_STRING_RE = re.compile(r'"(?:[^"\\]|\\.)*"')
    _WORD_RE = re.compile(r"[_A-Za-z][_A-Za-z0-9]*")

    def __init__(self, type: str, token: str, line_no: int, column: int):
        """Store the token's category, text and position."""
        self.type = type
        self.token = token
        self.line_no = line_no
        self.column = column

    def __len__(self) -> int:
        """Return the number of characters in the token text."""
        return self.length()

    def __eq__(self, other) -> bool:
        """Compare by token text against a ``str`` or another ``Token``.

        Type and position are deliberately ignored. For any other operand
        ``NotImplemented`` is returned so Python can try the reflected
        comparison and finally fall back to ``False``.
        """
        if isinstance(other, str):
            return self.token == other
        if isinstance(other, Token):
            return self.token == other.token
        return NotImplemented

    def __hash__(self) -> int:
        """Hash on the token text, consistent with ``__eq__``.

        Without this, defining ``__eq__`` would leave tokens unhashable.
        """
        return hash(self.token)

    def __str__(self) -> str:
        """Return the token text."""
        return self.token

    def __repr__(self) -> str:
        """Return a debugging representation showing every attribute."""
        return (
            f"{self.__class__.__name__}(type={self.type!r}, "
            f"token={self.token!r}, line_no={self.line_no!r}, "
            f"column={self.column!r})"
        )

    @classmethod
    def from_line(cls, line: str, line_no: int, column: int, extensions=None):
        """Extract first token from line and return it as an instance of Token.

        Args:
            line: Text to tokenize; the token must start at index 0 (leading
                whitespace is not skipped here).
            line_no: Line number recorded on the token.
            column: Column recorded on the token.
            extensions: Optional collection of enabled extension names. When
                it contains "escape", string literals may contain
                backslash-escaped characters such as an escaped double quote.

        Returns:
            The first token of ``line``, or ``None`` when ``line`` is empty
            or does not start with a recognisable token.
        """
        if not line:
            return None

        first_char = line[0]
        if first_char in SYMBOLS:
            return cls("symbol", first_char, line_no, column)

        int_match = cls._INTEGER_RE.match(line)
        if int_match is not None:
            return cls("integer", int_match.group(0), line_no, column)

        # ``extensions`` defaults to None rather than a shared mutable list.
        if extensions and "escape" in extensions:
            str_match = cls._ESCAPED_STRING_RE.match(line)
        else:
            str_match = cls._STRING_RE.match(line)
        if str_match is not None:
            return cls("string", str_match.group(0), line_no, column)

        # keyword or identifier
        word_match = cls._WORD_RE.match(line)
        if word_match is not None:
            word = word_match.group(0)
            kind = "keyword" if word in KEYWORDS else "identifier"
            return cls(kind, word, line_no, column)

        return None

    def length(self) -> int:
        """Return the number of characters in the token text."""
        return len(self.token)