From 51e1667e716ea8c6b20f37cdec1f99eef55eccd6 Mon Sep 17 00:00:00 2001
From: Frederick Yin <fkfd@fkfd.me>
Date: Mon, 29 Aug 2022 20:20:08 +0800
Subject: hackc: tokenizer

---
 projects/hackc/__main__.py | 29 ++++++++++++++++
 projects/hackc/parser.py   | 85 ++++++++++++++++++++++++++++++++++++++++++++++
 projects/hackc/tokens.py   | 66 +++++++++++++++++++++++++++++++++++
 3 files changed, 180 insertions(+)
 create mode 100644 projects/hackc/__main__.py
 create mode 100644 projects/hackc/parser.py
 create mode 100644 projects/hackc/tokens.py

(limited to 'projects/hackc')

diff --git a/projects/hackc/__main__.py b/projects/hackc/__main__.py
new file mode 100644
index 0000000..a7bc06f
--- /dev/null
+++ b/projects/hackc/__main__.py
@@ -0,0 +1,50 @@
+"""Command-line entry point for hackc; for now it only tokenizes Jack sources."""
+
+from pathlib import Path
+from argparse import ArgumentParser
+import os
+from .parser import Parser
+
+
+def compile_jack(input_path: Path, verbose: bool):
+    """Tokenize Jack source code and print a token table for each file.
+
+    Args:
+        input_path: A Jack file, or a directory. For a directory, every entry
+            directly inside it whose suffix is ".jack" is processed, in
+            sorted name order.
+        verbose: Verbose-mode flag from the command line. It is accepted but
+            not used yet.
+
+    Raises:
+        SystemExit: If input_path cannot be listed for a reason other than
+            not being a directory, e.g. because it does not exist.
+    """
+    input_path = Path(input_path)  # also accept a plain string path
+    try:
+        # sorted() gives a deterministic order; os.listdir() promises none.
+        filenames = sorted(os.listdir(input_path))
+    except NotADirectoryError:
+        # Not a directory, so treat input_path itself as the file to process.
+        jack_files = [input_path]
+    except OSError as err:
+        # Surface the failure (missing path, permission denied, ...) rather
+        # than silently doing nothing.
+        raise SystemExit(f"hackc: {err}") from err
+    else:
+        candidates = (input_path / name for name in filenames)
+        jack_files = [path for path in candidates if path.suffix == ".jack"]
+
+    for input_fn in jack_files:
+        parser = Parser(input_fn)
+        parser.tokenize()
+        parser.print_tokens()
+
+
+if __name__ == "__main__":
+    # Named arg_parser so it is not confused with the Parser tokenizer class.
+    arg_parser = ArgumentParser("hackc")
+    arg_parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode")
+    arg_parser.add_argument("input_path", help="Jack file or directory")
+    args = arg_parser.parse_args()
+    compile_jack(Path(args.input_path), args.verbose)
diff --git a/projects/hackc/parser.py b/projects/hackc/parser.py
new file mode 100644
index 0000000..c3056d7
--- /dev/null
+++ b/projects/hackc/parser.py
@@ -0,0 +1,95 @@
+"""Tokenizer driver that turns a Jack source file into a list of tokens."""
+
+# KEYWORDS and SYMBOLS are not used in this module. They are imported, rather
+# than defined a second time here, so the names stay available from this
+# module while the tables live in one place (tokens.py).
+from .tokens import KEYWORDS, SYMBOLS, Token  # noqa: F401
+
+
+class Parser:
+    """Tokenizer for a single Jack source file.
+
+    Attributes:
+        tokens: Token objects produced by the latest tokenize() call, in
+            source order; empty until tokenize() has run.
+    """
+
+    def __init__(self, fp):
+        """Store the source location; the file is not read until tokenize().
+
+        Args:
+            fp: Path of the Jack source file, in any form open() accepts.
+        """
+        self._fp = fp
+        self.tokens = []
+
+    def print_tokens(self):
+        """Print the tokens as a tab-separated table, then a token count.
+
+        Line and column numbers are printed 1-based, and the token type is
+        abbreviated to its first three letters.
+        """
+        print("LINE\tCOL\tTYPE\tTOKEN")
+        for token in self.tokens:
+            print(f"{token.line_no + 1}\t{token.column + 1}\t{token.type[:3]}\t{token.token}")
+        print(f"===== {len(self.tokens)} tokens =====")
+
+    def tokenize(self):
+        """Read the source file and rebuild self.tokens from its contents.
+
+        Whitespace, "//" comments (to end of line) and "/* */" comments
+        (which may span several lines) are skipped. Each token records its
+        0-based line number and column.
+
+        Raises:
+            OSError: If the source file cannot be opened or read.
+            ValueError: If some text cannot be recognized as a token, for
+                example a stray character or an unterminated string.
+        """
+        # The context manager closes the file even if reading fails.
+        with open(self._fp) as input_file:
+            source_lines = input_file.read().splitlines()
+
+        # tokenize code
+        self.tokens = []
+        in_multicomment = False  # True when inside /* */
+        for line_no, line in enumerate(source_lines):
+            pos = 0  # current position in line
+            line_width = len(line)
+            if in_multicomment:
+                multicomment_close_idx = line.find("*/")
+                if multicomment_close_idx == -1:
+                    # this line is entirely comment
+                    continue
+                # skip until comment ends
+                pos = multicomment_close_idx + 2
+                in_multicomment = False
+
+            # advance in line until exhausted
+            while pos < line_width:
+                if line[pos].isspace():
+                    pos += 1
+                    continue
+                if line.startswith("//", pos):
+                    break  # the rest of the line is comment
+                if line.startswith("/*", pos):
+                    # Search after the opener so that "/*/" is not mistaken
+                    # for a comment that is already closed.
+                    multicomment_close_idx = line.find("*/", pos + 2)
+                    if multicomment_close_idx == -1:
+                        in_multicomment = True
+                        break  # this line is all comment beyond this point
+                    # Skip the comment, then re-check for whitespace or
+                    # another comment before looking for a token.
+                    pos = multicomment_close_idx + 2
+                    continue
+
+                token = Token.from_line(line[pos:], line_no, pos)
+                if token is None:
+                    # Unrecognized text: fail loudly, because pos would
+                    # otherwise never advance and this loop would not end.
+                    raise ValueError(
+                        f"{self._fp}:{line_no + 1}:{pos + 1}: unrecognized token"
+                    )
+                self.tokens.append(token)
+                pos += token.length()
diff --git a/projects/hackc/tokens.py b/projects/hackc/tokens.py
new file mode 100644
index 0000000..7ae37ce
--- /dev/null
+++ b/projects/hackc/tokens.py
@@ -0,0 +1,107 @@
+"""Token type and lexical tables for the Jack language."""
+
+import re
+
+KEYWORDS = [
+    "class",
+    "constructor",
+    "function",
+    "method",
+    "field",
+    "static",
+    "var",
+    "int",
+    "char",
+    "boolean",
+    "void",
+    "true",
+    "false",
+    "null",
+    "this",
+    "let",
+    "do",
+    "if",
+    "else",
+    "while",
+    "return",
+]
+SYMBOLS = "{}()[].,;+-*/&|<>=~"
+TOKEN_TYPES = ["keyword", "symbol", "integer", "string", "identifier"]
+
+# Patterns are compiled once at import time and matched at the start of the
+# text handed to Token.from_line().
+_INTEGER_RE = re.compile(r"[0-9]+")
+# A string ends at the next double quote, so that two string constants on the
+# same line are not merged into one token.
+_STRING_RE = re.compile(r'"[^"\n]*"')
+_WORD_RE = re.compile(r"[_A-Za-z][_A-Za-z0-9]*")
+
+
+class Token:
+    """A token in JACK.
+
+    Attributes:
+        type: Token category; from_line() uses the values in TOKEN_TYPES.
+        token: The token text as written in the source. String tokens keep
+            their surrounding double quotes.
+        line_no: Line number supplied by the creator of the token.
+        column: Column supplied by the creator of the token.
+    """
+
+    def __init__(self, type: str, token: str, line_no: int, column: int):
+        """Create a token from its category, text and source position."""
+        # The parameter name "type" shadows the builtin inside this method; it
+        # is kept so that any keyword-argument callers keep working.
+        self.type = type
+        self.token = token
+        self.line_no = line_no
+        self.column = column
+
+    def __repr__(self) -> str:
+        """Return a constructor-like representation for debugging."""
+        return (
+            f"{type(self).__name__}({self.type!r}, {self.token!r}, "
+            f"{self.line_no!r}, {self.column!r})"
+        )
+
+    @classmethod
+    def from_line(cls, line: str, line_no: int, column: int):
+        """Extract first token from line and return it as an instance of Token.
+
+        Args:
+            line: Text that begins at the token. Leading whitespace and
+                comments are not skipped here.
+            line_no: Line number to record on the token.
+            column: Column to record on the token.
+
+        Returns:
+            The token at the start of line, or None if line is empty or does
+            not start with a recognizable token.
+        """
+        if not line:
+            return None
+
+        if line[0] in SYMBOLS:
+            return cls("symbol", line[0], line_no, column)
+
+        int_match = _INTEGER_RE.match(line)
+        if int_match is not None:
+            return cls("integer", int_match.group(), line_no, column)
+
+        str_match = _STRING_RE.match(line)
+        if str_match is not None:
+            return cls("string", str_match.group(), line_no, column)
+
+        # keyword or identifier
+        kwid_match = _WORD_RE.match(line)
+        if kwid_match is not None:
+            kwid = kwid_match.group()
+            # named token_type so the builtin type() is not shadowed
+            token_type = "keyword" if kwid in KEYWORDS else "identifier"
+            return cls(token_type, kwid, line_no, column)
+
+        return None
+
+    def length(self) -> int:
+        """Return the number of characters the token occupies in the source."""
+        return len(self.token)
-- 
cgit v1.2.3