summary | refs | log | tree | commit | diff
path: root/projects/hackc/parser.py
diff options
context:
space:
mode:
author Frederick Yin <fkfd@fkfd.me> 2022-08-29 20:20:08 +0800
committer Frederick Yin <fkfd@fkfd.me> 2022-08-29 20:20:08 +0800
commit 51e1667e716ea8c6b20f37cdec1f99eef55eccd6 (patch)
tree 3b023734a7337de535923bd0c08cf86cc4a4a647 /projects/hackc/parser.py
parent ca3e66d0cb0825285af7ea34a73355cf34e00a62 (diff)
hackc: tokenizer
Diffstat (limited to 'projects/hackc/parser.py')
-rw-r--r-- projects/hackc/parser.py | 85
1 files changed, 85 insertions, 0 deletions
diff --git a/projects/hackc/parser.py b/projects/hackc/parser.py
new file mode 100644
index 0000000..c3056d7
--- /dev/null
+++ b/projects/hackc/parser.py
@@ -0,0 +1,85 @@
+from .tokens import Token
+
# Reserved words of the language being tokenized, kept in their original order.
# NOTE(review): not referenced anywhere in this module; presumably consumed by
# the tokens module -- confirm before removing or reordering.
KEYWORDS = [
    "class", "constructor", "function", "method",
    "field", "static", "var",
    "int", "char", "boolean", "void",
    "true", "false", "null", "this",
    "let", "do", "if", "else", "while", "return",
]

# All single-character symbols, concatenated into one string so membership can
# be tested with `ch in SYMBOLS`.
SYMBOLS = "{}()[].,;+-*/&|<>=~"
+
+
class Parser:
    """Read a source file and split it into a flat list of tokens.

    Comments (``// ...`` to end of line and ``/* ... */``, possibly spanning
    several lines) and whitespace are skipped; everything else is handed to
    ``Token.from_line`` one token at a time.

    Attributes:
        tokens: list of tokens produced by the most recent ``tokenize()``
            call (empty until then).
    """

    def __init__(self, fp):
        """Remember the path of the file to tokenize.

        Args:
            fp: path passed verbatim to ``open()`` when ``tokenize()`` runs.
        """
        self._fp = fp
        self.tokens = []

    def print_tokens(self):
        """Print a tab-separated table of the collected tokens to stdout.

        Line and column are stored zero-based and shown one-based; the token
        type is abbreviated to its first three characters.
        """
        print("LINE\tCOL\tTYPE\tTOKEN")
        for token in self.tokens:
            print(f"{token.line_no + 1}\t{token.column + 1}\t{token.type[:3]}\t{token.token}")
        print(f"===== {len(self.tokens)} tokens =====")

    def tokenize(self):
        """Tokenize the file at ``self._fp`` and store the result in ``self.tokens``.

        Any tokens from a previous call are discarded first.

        Raises:
            OSError: if the file cannot be opened or read.
            ValueError: if ``Token.from_line`` returns ``None`` for text that
                is neither whitespace nor a comment (previously this case made
                the scan loop spin forever because the position never moved).
        """
        # `with` guarantees the handle is closed even if read() raises.
        with open(self._fp) as input_file:
            source_lines = input_file.read().splitlines()

        self.tokens = []
        in_multicomment = False  # True while inside an unterminated /* */
        for line_no, line in enumerate(source_lines):
            pos = 0  # current position in line
            line_width = len(line)
            if in_multicomment:
                close_idx = line.find("*/")
                if close_idx == -1:
                    # this line is entirely comment
                    continue
                # resume scanning right after the comment terminator
                pos = close_idx + 2
                in_multicomment = False

            # advance in line until exhausted
            while pos < line_width:
                rem = line[pos:]  # remainder of line

                ws_count = len(rem) - len(rem.lstrip())
                if ws_count > 0:
                    pos += ws_count
                    continue

                if rem.startswith("/*"):
                    # Search past the opener so that "/*/" is not mistaken
                    # for a complete comment (its "*" belongs to "/*").
                    close_idx = rem.find("*/", 2)
                    if close_idx == -1:
                        in_multicomment = True
                        break  # this line is all comment beyond this point
                    pos += close_idx + 2
                    # Re-run the checks: what follows may be whitespace,
                    # another comment, or the end of the line.
                    continue

                if rem.startswith("//"):
                    break  # rest of the line is a comment

                token = Token.from_line(rem, line_no, pos)
                if token is None:
                    # Nothing was consumed, so looping again could never
                    # make progress; report the position instead.
                    raise ValueError(
                        f"unrecognized token at line {line_no + 1}, "
                        f"column {pos + 1}: {rem!r}"
                    )
                self.tokens.append(token)
                pos += token.length()