def compile_jack(input_path: Path, verbose: bool):
    """Tokenize one Jack file, or every ``.jack`` file in a directory.

    Each selected file is passed to ``Parser``; its ``tokenize()`` method is
    run and the resulting tokens are printed with ``print_tokens()``.

    Args:
        input_path: Either a directory, which is scanned (non-recursively)
            for entries whose suffix is ``.jack``, or the path of a single
            source file. A single file is processed whatever its suffix.
        verbose: Accepted for the ``-v`` command-line flag; currently unused.

    Returns:
        None. When ``input_path`` cannot be listed (missing, unreadable, ...)
        a message is written to stderr and no file is processed.
    """
    import sys  # local import: only needed for error reporting

    # Accept plain strings as well as Path objects.
    input_path = Path(input_path)

    try:
        candidates = [input_path / name for name in os.listdir(input_path)]
    except NotADirectoryError:
        # input_path names a file rather than a directory: process only it.
        jack_files = [input_path]
    except OSError as err:
        # Report the failure instead of silently doing nothing.
        print(f"hackc: cannot read {input_path}: {err}", file=sys.stderr)
        return
    else:
        # os.listdir() order is arbitrary; sort so the output is reproducible.
        jack_files = sorted(p for p in candidates if p.suffix == ".jack")

    for input_fn in jack_files:
        parser = Parser(input_fn)
        parser.tokenize()
        parser.print_tokens()


if __name__ == "__main__":
    # Command-line entry point: hackc [-v] INPUT_PATH
    arg_parser = ArgumentParser("hackc")
    arg_parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode")
    arg_parser.add_argument("input_path", help="Jack file or directory")
    args = arg_parser.parse_args()
    compile_jack(Path(args.input_path), args.verbose)
class Parser:
    """Tokenizer front end for a single Jack source file.

    ``tokenize()`` reads the file given to the constructor, skips whitespace
    and comments (``// ...`` and ``/* ... */``, the latter possibly spanning
    several lines) and stores the remaining tokens in ``self.tokens``.
    """

    def __init__(self, fp):
        """Remember the source path; no I/O happens until ``tokenize()``.

        Args:
            fp: Path of the Jack source file, passed as-is to ``open()``.
        """
        self._fp = fp
        self.tokens = []  # filled by tokenize()

    def print_tokens(self):
        """Print a tab-separated table of the tokens followed by a count.

        Line and column numbers are stored 0-based and printed 1-based; the
        token type is abbreviated to its first three characters.
        """
        print("LINE\tCOL\tTYPE\tTOKEN")
        for token in self.tokens:
            print(f"{token.line_no + 1}\t{token.column + 1}\t{token.type[:3]}\t{token.token}")
        print(f"===== {len(self.tokens)} tokens =====")

    def tokenize(self):
        """Read the source file and rebuild ``self.tokens`` from scratch.

        Raises:
            OSError: If the file cannot be opened or read.
            ValueError: If some text is neither whitespace, a comment nor a
                token recognized by ``Token.from_line``. The message carries
                the file, the 1-based line and the 1-based column.
        """
        # The context manager closes the file even if reading fails.
        with open(self._fp) as input_file:
            source_lines = input_file.read().splitlines()

        self.tokens = []
        in_multicomment = False  # True while inside an unclosed /* */
        for line_no, line in enumerate(source_lines):
            pos = 0  # current position in line
            line_width = len(line)

            if in_multicomment:
                close_idx = line.find("*/")
                if close_idx == -1:
                    # this line is entirely comment
                    continue
                # resume right after the end of the comment
                pos = close_idx + 2
                in_multicomment = False

            # advance in line until exhausted
            while pos < line_width:
                if line[pos].isspace():
                    pos += 1
                    continue

                if line.startswith("//", pos):
                    break  # the rest of the line is a comment

                if line.startswith("/*", pos):
                    # Search past the opener so that "/*/" is not mistaken
                    # for a complete comment.
                    close_idx = line.find("*/", pos + 2)
                    if close_idx == -1:
                        in_multicomment = True
                        break  # this line is all comment beyond this point
                    pos = close_idx + 2
                    # Re-examine what follows: it may be whitespace or
                    # another comment rather than a token.
                    continue

                token = Token.from_line(line[pos:], line_no, pos)
                if token is None:
                    # Without this the loop would never advance past the
                    # offending character.
                    raise ValueError(
                        f"{self._fp}:{line_no + 1}:{pos + 1}: "
                        f"unrecognized token at {line[pos:]!r}"
                    )
                self.tokens.append(token)
                pos += token.length()
KEYWORDS = [
    "class",
    "constructor",
    "function",
    "method",
    "field",
    "static",
    "var",
    "int",
    "char",
    "boolean",
    "void",
    "true",
    "false",
    "null",
    "this",
    "let",
    "do",
    "if",
    "else",
    "while",
    "return",
]
SYMBOLS = "{}()[].,;+-*/&|<>=~"
TOKEN_TYPES = ["keyword", "symbol", "integer", "string", "identifier"]


class Token:
    """A single lexical token of Jack source code."""

    # Patterns are compiled once instead of on every from_line() call.
    _INTEGER_RE = re.compile(r"[0-9]+")
    # A string constant ends at the FIRST closing quote, so two strings on
    # the same line are not merged into one token.
    _STRING_RE = re.compile(r'"[^"\n]*"')
    _WORD_RE = re.compile(r"[_A-Za-z][_A-Za-z0-9]*")

    def __init__(self, type: str, token: str, line_no: int, column: int):
        """Create a token.

        Args:
            type: One of the names listed in ``TOKEN_TYPES``.
            token: The exact source text of the token (string constants keep
                their surrounding double quotes).
            line_no: Line index as supplied by the caller.
            column: Column index as supplied by the caller.
        """
        self.type = type
        self.token = token
        self.line_no = line_no
        self.column = column

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}({self.type!r}, {self.token!r}, "
            f"{self.line_no!r}, {self.column!r})"
        )

    @classmethod
    def from_line(cls, line: str, line_no: int, column: int) -> "Token | None":
        """Extract the token that starts at the very beginning of ``line``.

        Candidates are tried in this order: single-character symbol, integer
        constant, string constant, keyword or identifier. Leading whitespace
        is not skipped.

        Args:
            line: Source text whose first character starts the token.
            line_no: Stored unchanged on the returned token.
            column: Stored unchanged on the returned token.

        Returns:
            A new token, or ``None`` when ``line`` is empty or does not begin
            with any recognized token (for example an unterminated string).
        """
        if not line:
            return None

        first = line[0]
        if first in SYMBOLS:
            return cls("symbol", first, line_no, column)

        int_match = cls._INTEGER_RE.match(line)
        if int_match is not None:
            return cls("integer", int_match.group(0), line_no, column)

        str_match = cls._STRING_RE.match(line)
        if str_match is not None:
            return cls("string", str_match.group(0), line_no, column)

        # keyword or identifier
        word_match = cls._WORD_RE.match(line)
        if word_match is not None:
            word = word_match.group(0)
            kind = "keyword" if word in KEYWORDS else "identifier"
            return cls(kind, word, line_no, column)

        return None

    def length(self) -> int:
        """Return the number of source characters the token occupies."""
        return len(self.token)