hackc: tokenizer

author: Frederick Yin <fkfd@fkfd.me> 2022-08-29 20:20:08 +0800
committer: Frederick Yin <fkfd@fkfd.me> 2022-08-29 20:20:08 +0800
commit: 51e1667e716ea8c6b20f37cdec1f99eef55eccd6 (patch)
tree: 3b023734a7337de535923bd0c08cf86cc4a4a647 /projects/hackc/parser.py
parent: ca3e66d0cb0825285af7ea34a73355cf34e00a62 (diff)
1 files changed, 85 insertions, 0 deletions
diff --git a/projects/hackc/parser.py b/projects/hackc/parser.py
new file mode 100644
index 0000000..c3056d7
--- /dev/null
+++ b/projects/hackc/parser.py
@@ -0,0 +1,85 @@
+from .tokens import Token
+
+# Reserved words of the language being tokenized, listed in groups of
+# related words (the group labels are descriptive only).
+KEYWORDS = [
+    # program structure
+    "class", "constructor", "function", "method",
+    # variable declarations
+    "field", "static", "var",
+    # type names
+    "int", "char", "boolean", "void",
+    # keyword constants
+    "true", "false", "null", "this",
+    # statements
+    "let", "do", "if", "else", "while", "return",
+]
+
+# All one-character symbols, concatenated into a single string.
+SYMBOLS = "{}()[].,;+-*/&|<>=~"
+
+
+class Parser:
+    """Read a source file and split it into a flat list of tokens.
+
+    Whitespace, ``// ...`` line comments and ``/* ... */`` block comments
+    (which may span several lines) are skipped here; recognizing the
+    tokens themselves is delegated to ``Token.from_line``.
+    """
+
+    def __init__(self, fp):
+        """Remember the source path; nothing is read until ``tokenize()``.
+
+        Args:
+            fp: Path of the source file, passed unchanged to ``open``.
+        """
+        self._fp = fp
+        self.tokens = []
+
+    def print_tokens(self):
+        """Print the collected tokens as a tab-separated table.
+
+        Each row shows ``line_no + 1``, ``column + 1``, the first three
+        characters of the token type, and the token text. A final line
+        reports the number of tokens.
+        """
+        print("LINE\tCOL\tTYPE\tTOKEN")
+        for token in self.tokens:
+            print(f"{token.line_no + 1}\t{token.column + 1}\t{token.type[:3]}\t{token.token}")
+        print(f"===== {len(self.tokens)} tokens =====")
+
+    def tokenize(self):
+        """Tokenize the file at ``self._fp`` into ``self.tokens``.
+
+        Any tokens from a previous call are discarded. Each token is built
+        by ``Token.from_line(rest_of_line, line_no, pos)`` with zero-based
+        ``line_no`` and ``pos``; the text handed over never starts with
+        whitespace or a comment opener.
+
+        Raises:
+            OSError: If the file cannot be opened or read.
+            ValueError: If ``Token.from_line`` returns ``None``, i.e. no
+                token is recognized at the current position.
+        """
+        # The context manager closes the file even when read() raises.
+        with open(self._fp) as input_file:
+            source_lines = input_file.read().splitlines()
+
+        self.tokens = []
+        in_multicomment = False  # True while inside an unterminated /* */
+        for line_no, line in enumerate(source_lines):
+            pos = 0  # current position in line
+            line_width = len(line)
+            if in_multicomment:
+                close_idx = line.find("*/")
+                if close_idx == -1:
+                    # this line is entirely comment
+                    continue
+                # resume right after the closing "*/"
+                pos = close_idx + 2
+                in_multicomment = False
+
+            # advance in line until exhausted
+            while pos < line_width:
+                rem = line[pos:]  # remainder of line
+                ws_count = len(rem) - len(rem.lstrip())
+                if ws_count > 0:
+                    pos += ws_count
+                    continue
+                if rem.startswith("/*"):
+                    # Search past the opener so that "/*/" is not taken
+                    # for a complete comment.
+                    close_idx = rem.find("*/", 2)
+                    if close_idx == -1:
+                        in_multicomment = True
+                        break  # this line is all comment beyond this point
+                    pos += close_idx + 2
+                    # Re-examine what follows the comment: it may be
+                    # whitespace, another comment, or the end of the line.
+                    continue
+                if rem.startswith("//"):
+                    break
+
+                token = Token.from_line(rem, line_no, pos)
+                if token is None:
+                    # Without a token there is nothing to advance past;
+                    # looping again would never terminate.
+                    raise ValueError(
+                        f"{self._fp}:{line_no + 1}:{pos + 1}: "
+                        f"unrecognized input {rem!r}"
+                    )
+                self.tokens.append(token)
+                pos += token.length()
author	Frederick Yin <fkfd@fkfd.me>	2022-08-29 20:20:08 +0800
committer	Frederick Yin <fkfd@fkfd.me>	2022-08-29 20:20:08 +0800
commit	51e1667e716ea8c6b20f37cdec1f99eef55eccd6 (patch)
tree	3b023734a7337de535923bd0c08cf86cc4a4a647 /projects/hackc/parser.py
parent	ca3e66d0cb0825285af7ea34a73355cf34e00a62 (diff)