/*
 * hack-as.c -- assembler for the Hack machine language (nand2tetris project 06).
 *
 * Provenance: added as projects/06/hack-as/hack-as.c by commit
 * 9542deeb483a00b6fabed7574720926ce97d7511 ("Projects, 01-06 completed",
 * Frederick Yin, Tue, 16 Aug 2022 11:54:23 +0800).
 *
 * The assembler works in three passes over the input:
 *   1. strip comments/whitespace, collect instruction lines and (LABEL)s;
 *   2. replace every symbolic "@name" with its numeric address, allocating
 *      variables from RAM address 16 upwards;
 *   3. translate each instruction line into one 16-bit word.
 * The result is written as text, one line of sixteen '0'/'1' chars per word.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_ASM_LINE_LEN 64
#define MAX_INST_LEN 32768
#define MAX_ADDR 32767
// capacity of the symbol table, predefined symbols included
#define MAX_SYMBOLS MAX_INST_LEN

#define EXIT_CODE_FILE_ERROR 1
#define EXIT_CODE_ILLEGAL_CHAR 2
#define EXIT_CODE_SIZE_EXCEEDED 3
#define EXIT_CODE_SYNTAX_ERROR 4
#define EXIT_CODE_ADDR_ERROR 5

// value returned by find_illegal_symbol_char() when every char is legal;
// spelled as a char so the comparison also works where char is unsigned
#define NO_ILLEGAL_CHAR ((char) -1)

struct symbol {
    char *label;
    int addr;
};

// one assembly mnemonic and the bit pattern it contributes (unshifted)
struct mnemonic {
    const char *text;
    uint16_t bits;
};

// everything the passes of the assembler share
struct program {
    char **lines;            // instruction lines, each MAX_ASM_LINE_LEN + 1 bytes
    int line_cnt;            // no. of entries used in lines
    struct symbol *symbols;  // predefined symbols first, then user symbols
    int symbol_cnt;          // no. of entries used in symbols
};

static const struct symbol PREDEFINED_SYMBOLS[] = {
    {"SP", 0}, {"LCL", 1}, {"ARG", 2}, {"THIS", 3}, {"THAT", 4},
    {"R0", 0}, {"R1", 1}, {"R2", 2}, {"R3", 3},
    {"R4", 4}, {"R5", 5}, {"R6", 6}, {"R7", 7},
    {"R8", 8}, {"R9", 9}, {"R10", 10}, {"R11", 11},
    {"R12", 12}, {"R13", 13}, {"R14", 14}, {"R15", 15},
    {"SCREEN", 16384}, {"KBD", 24576},
};
#define PREDEF_SYMBOL_CNT \
    ((int) (sizeof PREDEFINED_SYMBOLS / sizeof PREDEFINED_SYMBOLS[0]))

// destination field, bits 5..3
static const struct mnemonic DEST_TABLE[] = {
    {"M", 0x1}, {"D", 0x2}, {"MD", 0x3}, {"A", 0x4},
    {"AM", 0x5}, {"AD", 0x6}, {"AMD", 0x7},
};

// jump field, bits 2..0
static const struct mnemonic JUMP_TABLE[] = {
    {"JGT", 0x1}, {"JEQ", 0x2}, {"JGE", 0x3}, {"JLT", 0x4},
    {"JNE", 0x5}, {"JLE", 0x6}, {"JMP", 0x7},
};

// computation field, bits 12..6 (7 bits: a, c1..c6)
static const struct mnemonic COMP_TABLE[] = {
    {"0", 0x2A}, {"1", 0x3F}, {"-1", 0x3A},
    {"D", 0x0C}, {"A", 0x30}, {"M", 0x70},
    {"!D", 0x0D}, {"!A", 0x31}, {"!M", 0x71},
    {"-D", 0x0F}, {"-A", 0x33}, {"-M", 0x73},
    {"D+1", 0x1F}, {"A+1", 0x37}, {"M+1", 0x77},
    {"D-1", 0x0E}, {"A-1", 0x32}, {"M-1", 0x72},
    {"D+A", 0x02}, {"A+D", 0x02}, {"D+M", 0x42}, {"M+D", 0x42},
    {"D-A", 0x13}, {"D-M", 0x53}, {"A-D", 0x07}, {"M-D", 0x47},
    {"D&A", 0x00}, {"A&D", 0x00}, {"D&M", 0x40}, {"M&D", 0x40},
    {"D|A", 0x15}, {"A|D", 0x15}, {"D|M", 0x55}, {"M|D", 0x55},
};

#define TABLE_LEN(table) (sizeof(table) / sizeof((table)[0]))

// malloc that terminates the program instead of returning NULL
static void *xmalloc(size_t size) {
    void *mem = malloc(size > 0 ? size : 1);
    if (mem == NULL) {
        fprintf(stderr, "Out of memory\n");
        exit(EXIT_FAILURE);
    }
    return mem;
}

// calloc that terminates the program instead of returning NULL
static void *xcalloc(size_t count, size_t size) {
    void *mem = calloc(count, size);
    if (mem == NULL) {
        fprintf(stderr, "Out of memory\n");
        exit(EXIT_FAILURE);
    }
    return mem;
}

// return a heap copy of the \0-terminated string s; the caller frees it
static char *dup_string(const char *s) {
    size_t size = strlen(s) + 1;
    char *copy = xmalloc(size);
    memcpy(copy, s, size);
    return copy;
}

// Return the first character of `symbol` that may not appear in a symbol,
// or NO_ILLEGAL_CHAR ((char) -1) if the symbol is well-formed.
// A leading digit counts as illegal; otherwise letters, digits and _ . $ :
// are accepted.
char find_illegal_symbol_char(const char *symbol) {
    // symbol should not begin with number
    // nand2tetris implementation allows it, but the standard says otherwise
    if (*symbol >= '0' && *symbol <= '9') {
        return *symbol;
    }
    for (const char *c = symbol; *c != '\0'; c++) {
        bool legal = (*c >= 'A' && *c <= 'Z') || (*c >= 'a' && *c <= 'z')
            || (*c >= '0' && *c <= '9')
            || *c == '_' || *c == '.' || *c == '$' || *c == ':';
        if (!legal) {
            return *c;
        }
    }
    return NO_ILLEGAL_CHAR;
}

// print `cnt` symbols as a label/address table on stdout
void print_symbols(struct symbol *symbols, int cnt) {
    printf("====== SYMBOLS =====\nlabel\taddr\n");
    for (int i = 0; i < cnt; i++) {
        printf("%s\t%d\n", symbols[i].label, symbols[i].addr);
    }
}

// render `word` as sixteen '0'/'1' chars, MSB first, plus terminating \0
static void format_word(uint16_t word, char out[17]) {
    for (int b = 0; b < 16; b++) {
        out[b] = (char) ('0' + ((word >> (15 - b)) & 1));
    }
    out[16] = '\0';
}

// print `cnt` assembled words next to the assembly line each came from
void print_binary_and_asm(uint16_t *binary, char **asm_lines, int cnt) {
    printf("\n====== RESULTS =====\naddr\tbinary          \tinst\n");
    for (int i = 0; i < cnt; i++) {
        char binary_str[17];
        format_word(binary[i], binary_str);
        printf("%d\t%s\t%s\n", i, binary_str, asm_lines[i]);
    }
}

// write `cnt` assembled words to `file`, one 16-char binary string per line
void write_binary(FILE *file, uint16_t *binary, int cnt) {
    for (int i = 0; i < cnt; i++) {
        char binary_str[17];
        format_word(binary[i], binary_str);
        fprintf(file, "%s\n", binary_str);
    }
}

// Look up the `text_len` chars starting at `text` in `table`.
// On a match store the bit pattern in *bits and return true.
static bool lookup_mnemonic(const struct mnemonic *table, size_t table_len,
                            const char *text, size_t text_len, uint16_t *bits) {
    for (size_t i = 0; i < table_len; i++) {
        if (strlen(table[i].text) == text_len
            && strncmp(table[i].text, text, text_len) == 0) {
            *bits = table[i].bits;
            return true;
        }
    }
    return false;
}

// Assemble one line of assembly, terminated with \0, into a 16-bit word.
// Labels and variables must be replaced with corresponding addresses
// beforehand, and the line must not contain whitespace or comments.
// Exits with EXIT_CODE_ADDR_ERROR if an address is outside 0..MAX_ADDR and
// with EXIT_CODE_SYNTAX_ERROR on any malformed field.
uint16_t assemble_inst(const char *asm_line) {
    if (*asm_line == '@') {
        // A instruction: the word is the address itself (MSB is 0)
        const char *addr_str = asm_line + 1;
        char *end = NULL;
        long addr = strtol(addr_str, &end, 10);
        if (end == addr_str || *end != '\0') {
            fprintf(stderr, "Invalid address: %s\n", addr_str);
            exit(EXIT_CODE_SYNTAX_ERROR);
        }
        // strtol saturates on overflow, so this also catches huge literals
        if (addr < 0 || addr > MAX_ADDR) {
            fprintf(stderr, "Address out of range: %ld\n", addr);
            exit(EXIT_CODE_ADDR_ERROR);
        }
        return (uint16_t) addr;
    }

    // C instruction: [dest=]comp[;jump]
    uint16_t inst = 0xe000; // set 3 MSBs to 1
    uint16_t bits = 0;

    // destination is whatever precedes the first equal sign, if any
    const char *eq = strchr(asm_line, '=');
    const char *comp = (eq != NULL) ? eq + 1 : asm_line;
    size_t dest_len = (eq != NULL) ? (size_t) (eq - asm_line) : 0;
    if (dest_len > 0) {
        if (!lookup_mnemonic(DEST_TABLE, TABLE_LEN(DEST_TABLE),
                             asm_line, dest_len, &bits)) {
            fprintf(stderr, "Invalid destination: %.*s\n", (int) dest_len, asm_line);
            exit(EXIT_CODE_SYNTAX_ERROR);
        }
        inst |= (uint16_t) (bits << 3);
    }

    // jump is whatever follows the first semicolon after the destination;
    // ignore if there's no semicolon, or there's nothing after it
    const char *semi = strchr(comp, ';');
    size_t comp_len = (semi != NULL) ? (size_t) (semi - comp) : strlen(comp);
    if (semi != NULL && semi[1] != '\0') {
        const char *jump = semi + 1;
        if (!lookup_mnemonic(JUMP_TABLE, TABLE_LEN(JUMP_TABLE),
                             jump, strlen(jump), &bits)) {
            fprintf(stderr, "Invalid jump instruction: %s\n", jump);
            exit(EXIT_CODE_SYNTAX_ERROR);
        }
        inst |= bits;
    }

    // computation sits between the two and is mandatory
    if (!lookup_mnemonic(COMP_TABLE, TABLE_LEN(COMP_TABLE),
                         comp, comp_len, &bits)) {
        fprintf(stderr, "Invalid computation: %.*s\n", (int) comp_len, comp);
        exit(EXIT_CODE_SYNTAX_ERROR);
    }
    inst |= (uint16_t) (bits << 6);
    return inst;
}

// Read the whole file at `path` into a heap buffer and store the number of
// bytes read in *size. Exits with EXIT_CODE_FILE_ERROR on failure.
static char *read_file(const char *path, size_t *size) {
    // binary mode so the size from ftell matches what fread delivers;
    // carriage returns are skipped by the parser anyway
    FILE *file = fopen(path, "rb");
    if (file == NULL) {
        fprintf(stderr, "Cannot open input file: %s\n", path);
        exit(EXIT_CODE_FILE_ERROR);
    }
    long end = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        end = ftell(file);
    }
    if (end < 0 || fseek(file, 0, SEEK_SET) != 0) {
        fprintf(stderr, "Cannot read input file: %s\n", path);
        exit(EXIT_CODE_FILE_ERROR);
    }
    char *content = xmalloc((size_t) end + 1);
    *size = fread(content, 1, (size_t) end, file);
    if (ferror(file)) {
        fprintf(stderr, "Cannot read input file: %s\n", path);
        exit(EXIT_CODE_FILE_ERROR);
    }
    fclose(file);
    return content;
}

// append a symbol to the table; `label` is owned by the table from now on
static void add_symbol(struct program *prog, char *label, int addr) {
    if (prog->symbol_cnt == MAX_SYMBOLS) {
        fprintf(stderr, "Max number of symbols (%d) exceeded\n", MAX_SYMBOLS);
        exit(EXIT_CODE_SIZE_EXCEEDED);
    }
    prog->symbols[prog->symbol_cnt] = (struct symbol) {label, addr};
    prog->symbol_cnt++;
}

// return the first symbol named `label`, or NULL if there is none
static const struct symbol *find_symbol(const struct program *prog, const char *label) {
    for (int s = 0; s < prog->symbol_cnt; s++) {
        if (strcmp(label, prog->symbols[s].label) == 0) {
            return &prog->symbols[s];
        }
    }
    return NULL;
}

// Record one non-empty, whitespace-free source line of `len` chars:
// "(LABEL)" becomes a symbol pointing at the next instruction, anything
// else is stored as an instruction line.
static void store_line(struct program *prog, const char *line, size_t len) {
    if (len >= 2 && line[0] == '(' && line[len - 1] == ')') {
        // this line may be a label; extract label from between the parentheses
        size_t label_len = len - 2;
        char *label = xmalloc(label_len + 1);
        memcpy(label, line + 1, label_len);
        label[label_len] = '\0';
        char illegal_char = find_illegal_symbol_char(label);
        if (illegal_char != NO_ILLEGAL_CHAR) {
            fprintf(stderr, "Illegal character: %c\n", illegal_char);
            exit(EXIT_CODE_ILLEGAL_CHAR);
        }
        // TODO: error on repeated label
        add_symbol(prog, label, prog->line_cnt);
        return;
    }

    if (prog->line_cnt == MAX_INST_LEN) {
        fprintf(stderr, "Max number of instruction (%d) exceeded\n", MAX_INST_LEN);
        exit(EXIT_CODE_SIZE_EXCEEDED);
    }
    // always a full-size buffer: the line is later rewritten in place
    // when its symbol is replaced with a decimal address
    char *copy = xmalloc(MAX_ASM_LINE_LEN + 1);
    memcpy(copy, line, len + 1);
    prog->lines[prog->line_cnt] = copy;
    prog->line_cnt++;
}

// First pass: strip away comments, blank lines and whitespace from `text`,
// resulting in lines of what looks like instructions in assembly but is not
// necessarily correct. Labels in parentheses are assigned the corresponding
// address in ROM and collected in the symbol table.
static void parse_source(struct program *prog, const char *text, size_t size) {
    char line[MAX_ASM_LINE_LEN + 1]; // one line of (probably) assembly
    size_t len = 0;                  // no. of chars written into line

    for (size_t i = 0; i < size; i++) {
        char ch = text[i];
        switch (ch) {
        case '\n':
            // end of line; blank lines and comment lines leave len at 0
            if (len > 0) {
                line[len] = '\0';
                store_line(prog, line, len);
                len = 0;
            }
            break;
        case '/':
            if (i + 1 >= size || text[i + 1] != '/') {
                fprintf(stderr, "Illegal character: /\n");
                exit(EXIT_CODE_ILLEGAL_CHAR);
            }
            // we encountered a comment; skip to last char of line
            while (i + 1 < size && text[i + 1] != '\n') {
                i++;
            }
            break;
        case ' ':
        case '\t':
        case '\r':
            break; // ignore whitespace and CR
        default:
            if (len == MAX_ASM_LINE_LEN) {
                fprintf(stderr, "Max assembly line length (%d) exceeded\n", MAX_ASM_LINE_LEN);
                exit(EXIT_CODE_SIZE_EXCEEDED);
            }
            line[len] = ch;
            len++;
            break;
        }
    }

    // the last line need not end with a newline
    if (len > 0) {
        line[len] = '\0';
        store_line(prog, line, len);
    }
}

// true if `s` consists of decimal digits only
static bool is_decimal(const char *s) {
    for (; *s != '\0'; s++) {
        if (*s < '0' || *s > '9') {
            return false;
        }
    }
    return true;
}

// Second pass: overwrite every symbolic A-instruction with its decimal
// form. Symbols not yet in the table are variables and get consecutive
// addresses in order of first appearance.
static void resolve_symbols(struct program *prog) {
    int next_var_addr = 16; // variable addresses start at 16

    for (int i = 0; i < prog->line_cnt; i++) {
        char *line = prog->lines[i];
        if (line[0] != '@') {
            continue; // not an A-instruction
        }
        const char *name = line + 1; // whatever comes after the @
        if (*name == '\0') {
            fprintf(stderr, "Address cannot be empty\n");
            exit(EXIT_CODE_SYNTAX_ERROR);
        }
        if (is_decimal(name)) {
            continue; // address is decimal constant
        }
        char illegal_char = find_illegal_symbol_char(name);
        if (illegal_char != NO_ILLEGAL_CHAR) {
            fprintf(stderr, "Illegal character: %c\n", illegal_char);
            exit(EXIT_CODE_ILLEGAL_CHAR);
        }

        int addr;
        const struct symbol *known = find_symbol(prog, name);
        if (known != NULL) {
            addr = known->addr;
        } else {
            addr = next_var_addr;
            next_var_addr++;
            // copy the name before the line it lives in is overwritten
            add_symbol(prog, dup_string(name), addr);
        }
        snprintf(line, MAX_ASM_LINE_LEN + 1, "@%d", addr);
    }
}

// output_fn = input_fn[:-4] + ".hack" if input_fn.endswith(".asm")
//             else input_fn + ".hack"; the caller frees the result
static char *make_output_name(const char *input_fn) {
    size_t stem_len = strlen(input_fn);
    if (stem_len >= 4 && strcmp(input_fn + stem_len - 4, ".asm") == 0) {
        stem_len -= 4;
    }
    char *output_fn = xmalloc(stem_len + sizeof ".hack");
    memcpy(output_fn, input_fn, stem_len);
    memcpy(output_fn + stem_len, ".hack", sizeof ".hack");
    return output_fn;
}

// Assemble the file `input_fn` and write the result next to it, with the
// extension .asm replaced by (or, if absent, extended with) .hack.
// With `verbose` set, the user symbols and a listing go to stdout.
// Returns the number of instructions assembled; exits with one of the
// EXIT_CODE_* values on any error.
size_t assembler(const char *input_fn, bool verbose) {
    if (input_fn == NULL) {
        fprintf(stderr, "No input file given\n");
        exit(EXIT_CODE_FILE_ERROR);
    }

    size_t file_size = 0;
    char *file_content = read_file(input_fn, &file_size);

    // both tables live on the heap; together they are far too large
    // to be comfortable on the stack
    struct program prog = {0};
    prog.lines = xcalloc(MAX_INST_LEN, sizeof *prog.lines);
    prog.symbols = xcalloc(MAX_SYMBOLS, sizeof *prog.symbols);
    memcpy(prog.symbols, PREDEFINED_SYMBOLS, sizeof PREDEFINED_SYMBOLS);
    prog.symbol_cnt = PREDEF_SYMBOL_CNT;

    parse_source(&prog, file_content, file_size);
    free(file_content);

    resolve_symbols(&prog);

    // start assembling
    size_t inst_cnt = (size_t) prog.line_cnt;
    uint16_t *binary = xcalloc(MAX_INST_LEN, sizeof *binary);
    for (size_t i = 0; i < inst_cnt; i++) {
        binary[i] = assemble_inst(prog.lines[i]);
    }

    if (verbose) {
        print_symbols(prog.symbols + PREDEF_SYMBOL_CNT,
                      prog.symbol_cnt - PREDEF_SYMBOL_CNT);
        print_binary_and_asm(binary, prog.lines, (int) inst_cnt);
        printf("\n");
    }

    for (int i = 0; i < prog.line_cnt; i++) {
        free(prog.lines[i]);
    }
    // labels of predefined symbols are string literals; only free user ones
    for (int s = PREDEF_SYMBOL_CNT; s < prog.symbol_cnt; s++) {
        free(prog.symbols[s].label);
    }
    free(prog.lines);
    free(prog.symbols);

    // write binary
    char *output_fn = make_output_name(input_fn);
    FILE *output_file = fopen(output_fn, "w");
    if (output_file == NULL) {
        fprintf(stderr, "Cannot open output file: %s\n", output_fn);
        exit(EXIT_CODE_FILE_ERROR);
    }
    write_binary(output_file, binary, (int) inst_cnt);
    // fclose flushes, so this is where a full disk shows up
    if (fclose(output_file) != 0) {
        fprintf(stderr, "Cannot write output file: %s\n", output_fn);
        exit(EXIT_CODE_FILE_ERROR);
    }
    free(binary);
    printf("Binary written to %s\n", output_fn);
    free(output_fn);

    return inst_cnt;
}

// define HACK_AS_NO_MAIN to build the assembler without its entry point,
// e.g. to call assemble_inst()/assembler() from a test driver
#ifndef HACK_AS_NO_MAIN
int main(int argc, char *argv[]) {
    const char *input_fn = NULL;
    bool verbose = false;
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-h") == 0) {
            printf("Usage: %s [-v] <input.asm>\n-v -- verbose mode\n", argv[0]);
            return 0;
        } else if (strcmp(argv[i], "-v") == 0) {
            verbose = true;
        } else {
            input_fn = argv[i];
        }
    }

    if (input_fn == NULL) {
        fprintf(stderr, "Usage: %s [-v] <input.asm>\n-v -- verbose mode\n", argv[0]);
        return EXIT_CODE_FILE_ERROR;
    }

    assembler(input_fn, verbose);
    return 0;
}
#endif