projects/hackc/parser.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103

from .tokens import Token
from .syntax import Class
from .utils import *

# Reserved words, kept as a list in a fixed order and grouped here by role:
# declarations, built-in types, constants, then statements.
KEYWORDS = (
    "class constructor function method field static var "
    "int char boolean void "
    "true false null this "
    "let do if else while return"
).split()

# Every single-character symbol, in order: brackets, punctuation, operators.
SYMBOLS = "{}()[]" ".,;" "+-*/&|<>=~"


class Parser:
    """Load a source file, split it into tokens, and parse the tokens.

    Attributes:
        source: Full text of the file read in ``__init__``.
        lines: ``source`` split into lines, without line terminators.
        tokens: Tokens produced by the most recent ``tokenize()`` call
            (empty until then).
    """

    def __init__(self, fp, extensions=None):
        """Read the file at ``fp`` and prepare an empty token list.

        Args:
            fp: Path of the source file; also used as the file name in
                error messages.
            extensions: Passed through unchanged to ``Token.from_line`` as
                its ``extensions`` keyword. Defaults to a fresh empty list.
        """
        self._fp = fp
        # A fresh list per instance: a literal ``[]`` default would be one
        # object shared (and mutable) across every Parser ever created.
        self._extensions = [] if extensions is None else extensions
        self.tokens = []

        # load source code; ``with`` closes the file even if read() raises
        with open(fp) as input_file:
            self.source = input_file.read()
        self.lines = self.source.splitlines()

    def print_tokens(self):
        """Print a tab-separated table of ``self.tokens`` and a total count.

        Line and column are shown one-based (the stored values plus one);
        the type column shows the first three characters of ``token.type``.
        """
        print("LINE\tCOL\tTYPE\tTOKEN")
        for token in self.tokens:
            print(f"{token.line_no + 1}\t{token.column + 1}\t{token.type[:3]}\t{token}")
        print(f"===== {len(self.tokens)} tokens =====")

    def _abort(self, line_no, column, message, exit_code):
        """Print a source-located diagnostic via ``print_err`` and exit.

        Emits ``<file>:<one-based line>``, the offending source line, and a
        caret placed under ``column`` followed by ``message``.

        Raises:
            SystemExit: Always, carrying ``exit_code``.
        """
        print_err(f"{self._fp}:{line_no + 1}")
        print_err(self.lines[line_no])
        print_err(" " * column + "^ " + message)
        # Raise directly instead of calling the site-provided ``exit()``
        # helper, which is meant for interactive sessions.
        raise SystemExit(exit_code)

    def tokenize(self):
        """Rebuild ``self.tokens`` from ``self.lines``.

        Whitespace, ``//`` comments (to end of line) and ``/* ... */``
        comments (possibly spanning several lines) are skipped. Everything
        else is handed to ``Token.from_line`` together with the zero-based
        line index and column; the scan advances by ``token.length()``.

        Raises:
            SystemExit: With ``EXIT_CODE_INVALID_TOKEN`` after printing a
                diagnostic, when ``Token.from_line`` returns ``None``.
        """
        self.tokens = []
        in_multicomment = False  # True when inside /* */
        for line_no, line in enumerate(self.lines):
            pos = 0  # current position in line
            line_width = len(line)
            if in_multicomment:
                multicomment_close_idx = line.find("*/")
                if multicomment_close_idx == -1:
                    # this line is entirely comment
                    continue
                # skip until comment ends
                pos = multicomment_close_idx + 2
                in_multicomment = False

            # advance in line until exhausted
            while pos < line_width:
                rem = line[pos:]  # remainder of line
                ws_count = len(rem) - len(rem.lstrip())
                if ws_count > 0:
                    # remainder begins with whitespace
                    pos += ws_count
                    continue
                if rem.startswith("/*"):
                    # Search past the opener so the "*/" overlapping it in
                    # "/*/" is not mistaken for the terminator.
                    multicomment_close_idx = rem.find("*/", 2)
                    if multicomment_close_idx == -1:
                        in_multicomment = True
                        break  # this line is all comment beyond this point
                    # Skip the comment, then rescan: what follows may be
                    # nothing, whitespace, or another comment rather than
                    # a token.
                    pos += multicomment_close_idx + 2
                    continue
                if rem.startswith("//"):
                    break

                token = Token.from_line(rem, line_no, pos, extensions=self._extensions)
                if token is None:
                    self._abort(line_no, pos, "Invalid token", EXIT_CODE_INVALID_TOKEN)
                self.tokens.append(token)
                pos += token.length()

    def parse(self):
        """Build a syntax tree from ``self.tokens``.

        Returns:
            The value returned by ``Class.from_tokens(self.tokens)``.

        Raises:
            SystemExit: With ``EXIT_CODE_SYNTAX_ERROR`` after printing a
                diagnostic located at ``err.token``, when a
                ``JackSyntaxError`` is raised while parsing.
        """
        try:
            syntax_tree = Class.from_tokens(self.tokens)
        except JackSyntaxError as err:
            self._abort(
                err.token.line_no,
                err.token.column,
                err.message,
                EXIT_CODE_SYNTAX_ERROR,
            )
        return syntax_tree