author    Frederick Yin <fkfd@fkfd.me>    2022-08-29 20:20:08 +0800
committer Frederick Yin <fkfd@fkfd.me>    2022-08-29 20:20:08 +0800
commit    51e1667e716ea8c6b20f37cdec1f99eef55eccd6 (patch)
tree      3b023734a7337de535923bd0c08cf86cc4a4a647 /projects/hackc
parent    ca3e66d0cb0825285af7ea34a73355cf34e00a62 (diff)
hackc: tokenizer
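
Illustrative usage, not part of the committed code: assuming the package is
run as a module, and given an invented one-line Main.jack containing
`class Main {}`, the token dump would look like:

    $ python -m hackc Main.jack
    LINE  COL  TYPE  TOKEN
    1     1    key   class
    1     7    ide   Main
    1     12   sym   {
    1     13   sym   }
    ===== 4 tokens =====

(The real output separates columns with tabs.)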
Diffstat (limited to 'projects/hackc')
-rw-r--r--  projects/hackc/__main__.py  29
-rw-r--r--  projects/hackc/parser.py    85
-rw-r--r--  projects/hackc/tokens.py    66
3 files changed, 180 insertions, 0 deletions
diff --git a/projects/hackc/__main__.py b/projects/hackc/__main__.py
new file mode 100644
index 0000000..a7bc06f
--- /dev/null
+++ b/projects/hackc/__main__.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+from argparse import ArgumentParser
+import os
+from .parser import Parser
+
+
+def compile_jack(input_path: Path, verbose: bool):
+    # collect sources: input_path may be a directory of .jack files or one file
+    try:
+        filenames = os.listdir(input_path)
+        files = [input_path / f for f in filenames]
+        jack_files = filter(lambda f: f.suffix == ".jack", files)
+    except NotADirectoryError:
+        jack_files = [Path(input_path)]
+    except OSError:
+        # TODO: error
+        return
+
+    for input_fn in jack_files:
+        parser = Parser(input_fn)
+        parser.tokenize()
+        parser.print_tokens()
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("hackc")
+    parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode")
+    parser.add_argument("input_path", help="Jack file or directory")
+    args = parser.parse_args()
+    compile_jack(Path(args.input_path), args.verbose)
diff --git a/projects/hackc/parser.py b/projects/hackc/parser.py
new file mode 100644
index 0000000..c3056d7
--- /dev/null
+++ b/projects/hackc/parser.py
@@ -0,0 +1,85 @@
+from .tokens import Token
+
+KEYWORDS = [
+    "class",
+    "constructor",
+    "function",
+    "method",
+    "field",
+    "static",
+    "var",
+    "int",
+    "char",
+    "boolean",
+    "void",
+    "true",
+    "false",
+    "null",
+    "this",
+    "let",
+    "do",
+    "if",
+    "else",
+    "while",
+    "return",
+]
+
+SYMBOLS = "{}()[].,;+-*/&|<>=~"
+
+
+class Parser:
+    def __init__(self, fp):
+        self._fp = fp
+        self.tokens = []
+
+    def print_tokens(self):
+        print("LINE\tCOL\tTYPE\tTOKEN")
+        for token in self.tokens:
+            print(f"{token.line_no + 1}\t{token.column + 1}\t{token.type[:3]}\t{token.token}")
+        print(f"===== {len(self.tokens)} tokens =====")
+
+    def tokenize(self):
+        # read file
+        with open(self._fp) as input_file:
+            source_lines = input_file.read().splitlines()
+
+        # tokenize code
+        self.tokens = []
+        in_multicomment = False  # True when inside /* */
+        for line_no, line in enumerate(source_lines):
+            pos = 0  # current position in line
+            line_width = len(line)
+            if in_multicomment:
+                multicomment_close_idx = line.find("*/")
+                if multicomment_close_idx == -1:
+                    # this line is entirely comment
+                    continue
+                # skip until comment ends
+                pos = multicomment_close_idx + 2
+                in_multicomment = False
+
+            # advance in line until exhausted
+            while pos < line_width:
+                rem = line[pos:]  # remainder of line
+                ws_count = len(rem) - len(rem.lstrip())
+                if ws_count > 0:
+                    # line begins with whitespace
+                    pos += ws_count
+                    continue
+                elif rem.startswith("/*"):
+                    multicomment_close_idx = rem.find("*/")
+                    if multicomment_close_idx == -1:
+                        in_multicomment = True
+                        break  # this line is all comment beyond this point
+                    # skip until comment ends on the same line, then rescan
+                    pos += multicomment_close_idx + 2
+                    continue
+                elif rem.startswith("//"):
+                    break
+
+                token = Token.from_line(rem, line_no, pos)
+                if token is None:
+                    # unrecognized character: advance one column so the loop
+                    # cannot stall on input that matches no token rule
+                    pos += 1
+                    continue
+                self.tokens.append(token)
+                pos += token.length()
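+
+# Example (illustrative): for the line `let x = 1;  // init`, tokenize()
+# emits the five tokens let, x, =, 1, ; and the // branch then discards
+# the rest of the line.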
diff --git a/projects/hackc/tokens.py b/projects/hackc/tokens.py
new file mode 100644
index 0000000..7ae37ce
--- /dev/null
+++ b/projects/hackc/tokens.py
@@ -0,0 +1,66 @@
+import re
+
+KEYWORDS = [
+    "class",
+    "constructor",
+    "function",
+    "method",
+    "field",
+    "static",
+    "var",
+    "int",
+    "char",
+    "boolean",
+    "void",
+    "true",
+    "false",
+    "null",
+    "this",
+    "let",
+    "do",
+    "if",
+    "else",
+    "while",
+    "return",
+]
+SYMBOLS = "{}()[].,;+-*/&|<>=~"
+TOKEN_TYPES = ["keyword", "symbol", "integer", "string", "identifier"]
+
+class Token:
+    def __init__(self, type: str, token: str, line_no: int, column: int):
+        """A token in Jack."""
+        self.type = type
+        self.token = token
+        self.line_no = line_no
+        self.column = column
+
+    @classmethod
+    def from_line(cls, line: str, line_no: int, column: int):
+        """Extract the first token from line and return it as an instance of Token."""
+        if not line:
+            return None
+
+        if line[0] in SYMBOLS:
+            return Token("symbol", line[0], line_no, column)
+
+        # integer constant: a run of decimal digits
+        int_match = re.match("([0-9]+)", line)
+        if int_match is not None:
+            return Token("integer", int_match.group(1), line_no, column)
+
+        # string constant: Jack strings cannot contain a double quote, so
+        # stop at the first closing quote instead of matching greedily
+        str_match = re.match('("[^"]*")', line)
+        if str_match is not None:
+            return Token("string", str_match.group(1), line_no, column)
+
+        # keyword or identifier
+        kwid_match = re.match("([_A-Za-z][_A-Za-z0-9]*)", line)
+        if kwid_match is not None:
+            kwid = kwid_match.group(1)
+            type = "identifier"
+            if kwid in KEYWORDS:
+                type = "keyword"
+            return Token(type, kwid, line_no, column)
+
+        return None
+
+    def length(self) -> int:
+        return len(self.token)
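
A quick interactive check of the tokenizer, as an illustrative sketch (it
assumes the package is importable as `hackc`; the sample line is invented):

    >>> from hackc.tokens import Token
    >>> t = Token.from_line("let x = 5;", 0, 0)
    >>> (t.type, t.token, t.length())
    ('keyword', 'let', 3)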