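"""Tokenizer for the Jack language.

Reads a Jack source file, strips // and /* */ comments, and produces a
flat list of tokens (see tokens.py for the Token class).
"""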
from .tokens import Token
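# Note: tokens.py is not shown here. From its use below, Token is assumed to
# expose a from_line(text, line_no, column) constructor that reads one token
# from the start of `text` (returning None if nothing matches), a length()
# method giving the number of characters consumed, and line_no, column,
# type, and token attributes.
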
KEYWORDS = [
    "class",
    "constructor",
    "function",
    "method",
    "field",
    "static",
    "var",
    "int",
    "char",
    "boolean",
    "void",
    "true",
    "false",
    "null",
    "this",
    "let",
    "do",
    "if",
    "else",
    "while",
    "return",
]
SYMBOLS = "{}()[].,;+-*/&|<>=~"


class Parser:
    """Tokenizes a Jack source file into a list of Token objects."""

    def __init__(self, fp):
        self._fp = fp
        self.tokens = []

    def print_tokens(self):
        # Dump a tab-separated table: 1-based line/column, a 3-letter type
        # abbreviation, and the token text.
        print("LINE\tCOL\tTYPE\tTOKEN")
        for token in self.tokens:
            print(f"{token.line_no + 1}\t{token.column + 1}\t{token.type[:3]}\t{token.token}")
        print(f"===== {len(self.tokens)} tokens =====")
    def tokenize(self):
        """Read the source file, strip comments, and fill self.tokens."""
        # read file
        with open(self._fp) as input_file:
            source_lines = input_file.read().splitlines()
        # tokenize code
        self.tokens = []
        in_multicomment = False  # True when inside /* */
        for line_no, line in enumerate(source_lines):
            pos = 0  # current position in line
            line_width = len(line)
            if in_multicomment:
                multicomment_close_idx = line.find("*/")
                if multicomment_close_idx == -1:
                    # this line is entirely comment
                    continue
                # skip until the comment ends
                pos = multicomment_close_idx + 2
                in_multicomment = False
            # advance in line until exhausted
            while pos < line_width:
                rem = line[pos:]  # remainder of line
                ws_count = len(rem) - len(rem.lstrip())
                if ws_count > 0:
                    # skip leading whitespace
                    pos += ws_count
                    continue
                if rem.startswith("/*"):
                    multicomment_close_idx = rem.find("*/")
                    if multicomment_close_idx == -1:
                        in_multicomment = True
                        break  # the rest of this line is comment
                    # skip a comment that closes on the same line, then
                    # re-scan for whitespace or further comments
                    pos += multicomment_close_idx + 2
                    continue
                if rem.startswith("//"):
                    break  # line comment: ignore the rest of the line
                token = Token.from_line(rem, line_no, pos)
                if token is None:
                    # nothing matched; fail loudly rather than loop forever
                    raise ValueError(
                        f"unrecognized input at line {line_no + 1}, "
                        f"column {pos + 1}"
                    )
                self.tokens.append(token)
                pos += token.length()
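

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): tokenize the
    # file named on the command line and print the token table. Because of
    # the relative import above, run this as a module, e.g.
    # `python -m <package>.parser Main.jack` (package name is an assumption).
    import sys

    parser = Parser(sys.argv[1])
    parser.tokenize()
    parser.print_tokens()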