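"""Tokenizer for the Jack language.

Reads a Jack source file, strips // and /* */ comments, and produces a
flat list of tokens (see tokens.py for the Token class).
"""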
from .tokens import Token
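# Note: tokens.py is not shown here. From its use below, Token is assumed to
# expose a from_line(text, line_no, column) constructor that reads one token
# from the start of `text` (returning None if nothing matches), a length()
# method giving the number of characters consumed, and line_no, column,
# type, and token attributes.
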
KEYWORDS = [
    "class",
    "constructor",
    "function",
    "method",
    "field",
    "static",
    "var",
    "int",
    "char",
    "boolean",
    "void",
    "true",
    "false",
    "null",
    "this",
    "let",
    "do",
    "if",
    "else",
    "while",
    "return",
]
SYMBOLS = "{}()[].,;+-*/&|<>=~"


class Parser:
    """Tokenizes a Jack source file into a list of Token objects."""

    def __init__(self, fp):
        self._fp = fp
        self.tokens = []

    def print_tokens(self):
        # Dump a tab-separated table: 1-based line/column, a 3-letter type
        # abbreviation, and the token text.
        print("LINE\tCOL\tTYPE\tTOKEN")
        for token in self.tokens:
            print(f"{token.line_no + 1}\t{token.column + 1}\t{token.type[:3]}\t{token.token}")
        print(f"===== {len(self.tokens)} tokens =====")
    def tokenize(self):
        """Read the source file, strip comments, and fill self.tokens."""
        # read file
        with open(self._fp) as input_file:
            source_lines = input_file.read().splitlines()
        # tokenize code
        self.tokens = []
        in_multicomment = False  # True when inside /* */
        for line_no, line in enumerate(source_lines):
            pos = 0  # current position in line
            line_width = len(line)
            if in_multicomment:
                multicomment_close_idx = line.find("*/")
                if multicomment_close_idx == -1:
                    # this line is entirely comment
                    continue
                # skip until the comment ends
                pos = multicomment_close_idx + 2
                in_multicomment = False
            # advance in line until exhausted
            while pos < line_width:
                rem = line[pos:]  # remainder of line
                ws_count = len(rem) - len(rem.lstrip())
                if ws_count > 0:
                    # skip leading whitespace
                    pos += ws_count
                    continue
                if rem.startswith("/*"):
                    multicomment_close_idx = rem.find("*/")
                    if multicomment_close_idx == -1:
                        in_multicomment = True
                        break  # the rest of this line is comment
                    # skip a comment that closes on the same line, then
                    # re-scan for whitespace or further comments
                    pos += multicomment_close_idx + 2
                    continue
                if rem.startswith("//"):
                    break  # line comment: ignore the rest of the line
                token = Token.from_line(rem, line_no, pos)
                if token is None:
                    # nothing matched; fail loudly rather than loop forever
                    raise ValueError(
                        f"unrecognized input at line {line_no + 1}, "
                        f"column {pos + 1}"
                    )
                self.tokens.append(token)
                pos += token.length()
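

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): tokenize the
    # file named on the command line and print the token table. Because of
    # the relative import above, run this as a module, e.g.
    # `python -m <package>.parser Main.jack` (package name is an assumption).
    import sys

    parser = Parser(sys.argv[1])
    parser.tokenize()
    parser.print_tokens()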