Implement parser, push/pop VM instructions
x1phosura x1phosura@x1phosura.zone
Sat, 04 Nov 2023 04:12:21 -0700
5 files changed,
422 insertions(+),
29 deletions(-)
M
projects/07/src/codewriter.h
→
projects/07/src/codewriter.h
@@ -15,11 +15,214 @@ #include "util.h"
#define _DEBUG +// memory mapping: +// 0-15 virtual registers R0-R15 +// 16-255 static variables +// 256-2047 stack +#define SP (0) // points to stack +#define LCL (1) // points to local segment +#define ARG (2) // points to argument segment +#define POINTER (3) +#define THIS (3) // +#define THAT (4) // +#define TEMP (5) // +#define R13 (13) // R13-R15 are scratch space that +#define R14 (14) // VM-generated assembly can use +#define R15 (15) // for whatever. +#define STATIC (16) // start of static variables segment (240 words long) + +char vm_init[] = "@256\n" // starting address of stack + "D=A\n" // D = 256 + "@SP\n" // A = <constant representing address of SP> + "M=D\n" // <memory pointed to by SP> = 256 + "\n"; +// TODO: add initializers for argument, local, static, constant, this, that + +// static segment index indexes into map, retrieves hack assembly symbol offset +#define MAX_STATIC_SYMBOLS (240) // for whatever. +uint16_t static_symbol_map[MAX_STATIC_SYMBOLS]; +uint8_t g_symbol_offset_bump = 0; // holds current largest symbol offset, bumps + + +void write_vm_init(FILE *fp) +{ + //fprintf(fp, vm_init); + fprintf(fp, "// TODO eventually output VM initialization assembly\n"); +} + +static bool write_arithmetic(struct vm_instruction_t *vm_instr, FILE *fp) +{ + // TODO: write assembly code for 'add' and 'neg' instructions, turn into + // codegen + fprintf(fp, "ARITHMETIC INSTRUCTION: "); + print_vm_instruction(vm_instr); + return true; +} + +static bool resolve_static_address(struct vm_instruction_t *vm_instr, + uint16_t *addr) +{ + uint16_t symbol_offset; + if (vm_instr->arg2 >= MAX_STATIC_SYMBOLS) { + err("error: arg2 too large, >= %u\n", MAX_STATIC_SYMBOLS); + return false; + } + + symbol_offset = static_symbol_map[vm_instr->arg2]; + if (symbol_offset == 0xffff) { // new offset not in map (-1 special) + if (g_symbol_offset_bump >= MAX_STATIC_SYMBOLS) { + err("error: symbol offset grew too large (>= %u), " + "too many static variables\n", 
MAX_STATIC_SYMBOLS); + return false; + } + static_symbol_map[vm_instr->arg2] = g_symbol_offset_bump; + *addr = STATIC + g_symbol_offset_bump; + ++g_symbol_offset_bump; // bump global symbol offset + } else { + // offset was found in map, return symbol value/index + *addr = STATIC + symbol_offset; + } + + return true; +} + +// returns addr +static bool resolve_segment_offset(struct vm_instruction_t *vm_instr, + uint16_t *addr) +{ + // TODO implement + // resolve base (address) of segment + // check index is valid for that segment + // add index vm_instr->arg2 to address + // set to *addr + return true; +} + +// push 16-bit value from segment offset onto top of stack +static bool write_push(struct vm_instruction_t *vm_instr, FILE *fp) +{ + uint16_t addr, arg2 = vm_instr->arg2; + + // TODO: maybe add SP counter/check to catch overflows + // TODO: if vm_instr->arg1 == "constant", push_constant, else push_segment + char push_const_boilerplate[] = "@%hu\n" // A = constant + "D=A\n" // D = constant + "@SP\n" + "M=M+1\n" // RAM[SP]++ // inc SP + "A=M\n" // A = RAM[SP] + "M=D\n"; // RAM[SP] = constant + char push_addr_boilerplate[] = "@%hu\n" // A = segment + index + "D=M\n" // D = RAM[segment + index] + "@SP\n" + "M=M+1\n" // RAM[SP]++ // inc SP + "A=M\n" // A = RAM[SP] + "M=D\n"; // RAM[SP] = constant + char push_indirect_boilerplate[] = "@%hu\n" // A = segment + "D=A\n" // D = segment + "@%hu\n" // A = index + "A=A+D\n" // A = segment + index + "D=M\n" // D = RAM[segment + index] + "@SP\n" + "M=M+1\n" // RAM[SP]++ + "A=M\n" // A = RAM[SP] + "M=D\n"; + + if (!strcmp(vm_instr->arg1, "constant")) { + // TODO: check size of constant (allowed to be > 32,767?) 
+ // TODO: look up in nand2tetris forums in case issue already noted + fprintf(fp, push_const_boilerplate, arg2); + } else { + // TODO: move segment resolution to separate function + if (!strcmp(vm_instr->arg1, "argument")) { + fprintf(fp, push_indirect_boilerplate, ARG, arg2); + } else if (!strcmp(vm_instr->arg1, "local")) { + fprintf(fp, push_indirect_boilerplate, LCL, arg2); + } else if (!strcmp(vm_instr->arg1, "static")) { + if (!resolve_static_address(vm_instr, &addr)) { + return false; + } + fprintf(fp, push_addr_boilerplate, addr); + } else if (!strcmp(vm_instr->arg1, "this")) { + fprintf(fp, push_indirect_boilerplate, THIS, arg2); + } else if (!strcmp(vm_instr->arg1, "that")) { + fprintf(fp, push_indirect_boilerplate, THAT, arg2); + } else if (!strcmp(vm_instr->arg1, "pointer")) { + addr = POINTER + vm_instr->arg2; + fprintf(fp, push_addr_boilerplate, addr); + } else if (!strcmp(vm_instr->arg1, "temp")) { + addr = TEMP + vm_instr->arg2; + fprintf(fp, push_addr_boilerplate, addr); + } else { + err("error: invalid segment name \"%s\"\n", + vm_instr->arg1); + return false; + } + } + return true; +} + +// pop 16-bit value from top of stack into segment offset +static bool write_pop(struct vm_instruction_t *vm_instr, FILE *fp) +{ + // TODO: maybe add SP counter/check to catch overflows + // can use R13-R15 for scratch space + uint16_t addr, arg2 = vm_instr->arg2; + char pop_addr_boilerplate[] = "@%hu\n" // A = segment + index + "D=M\n" // D = RAM[segment + index] + "@SP\n" + "AM=M-1\n" // RAM[SP]--, A = RAM[SP] + "M=D\n"; // D = RAM[SP] + char pop_indirect_boilerplate[] = "@%hu\n" // @segment + "D=A\n" // D = segment + "@%hu\n" // @index + "A=A+D\n" // A = segment + index + "D=M\n" // D = RAM[segment + index] + "@SP\n" + "AM=M-1\n" // RAM[SP]--; A = RAM[SP] + "M=D\n"; + + // TODO: move segment resolution to separate function + if (!strcmp(vm_instr->arg1, "argument")) { + fprintf(fp, pop_indirect_boilerplate, ARG, arg2); + } else if (!strcmp(vm_instr->arg1, 
"local")) { + fprintf(fp, pop_indirect_boilerplate, LCL, arg2); + } else if (!strcmp(vm_instr->arg1, "static")) { + if (!resolve_static_address(vm_instr, &addr)) { + return false; + } + fprintf(fp, pop_addr_boilerplate, addr); + } else if (!strcmp(vm_instr->arg1, "this")) { + fprintf(fp, pop_indirect_boilerplate, THIS, arg2); + } else if (!strcmp(vm_instr->arg1, "that")) { + fprintf(fp, pop_indirect_boilerplate, THAT, arg2); + } else if (!strcmp(vm_instr->arg1, "pointer")) { + addr = POINTER + vm_instr->arg2; + fprintf(fp, pop_addr_boilerplate, addr); + } else if (!strcmp(vm_instr->arg1, "temp")) { + addr = TEMP + vm_instr->arg2; + fprintf(fp, pop_addr_boilerplate, addr); + } else { + err("error: invalid segment name \"%s\"\n", + vm_instr->arg1); + return false; + } + + return true; +} bool write_instruction(struct vm_instruction_t *vm_instr, FILE *fp) { - // TODO implement - // STUB! + fprintf(fp, "\n// %lu: %s\n", file_line_no, vm_instr->line); + if (vm_instr->cmd == C_ARITHMETIC) { + write_arithmetic(vm_instr, fp); + } else if (vm_instr->cmd == C_PUSH) { + write_push(vm_instr, fp); + } else if (vm_instr->cmd == C_POP) { + write_pop(vm_instr, fp); + } else { + // TODO: eventually error if unrecognized instruction + print_vm_instruction(vm_instr); + } return true; }
M
projects/07/src/parser.h
→
projects/07/src/parser.h
@@ -14,6 +14,10 @@ #include "util.h"
#define _DEBUG +#define CMD_STR_MAX_LEN (8) +#define ARG2_MAX_LEN (5) // arg2 can only be a number between 0 and 65535 +#define ARG1_MAX_LEN (MAX_LINE_LEN - CMD_STR_MAX_LEN - ARG2_MAX_LEN - 3) +//#define ARG1_MAX_LEN 128 // temp value for debugging enum vm_command_t { C_ARITHMETIC,@@ -24,39 +28,201 @@ C_GOTO,
C_IF, C_FUNCTION, C_RETURN, - C_CALL + C_CALL, + C_UNUSED }; +char *arith_cmd_lut[] = {"add", "sub", "neg", "eq", "gt", "lt", "and", + "or", "not"}; +char *cmd_lut[] = {"push", "pop", "label", "goto", "if-goto", "function", + "return", "call"}; +enum vm_command_t cmd_lut_vals[] = {C_PUSH, C_POP, C_LABEL, C_GOTO, C_IF, + C_FUNCTION, C_RETURN, C_CALL}; + struct vm_instruction_t { enum vm_command_t cmd; - char *arg1; - uint16_t arg2; - const char *original_vm_line; + char *arg1; // if C_ARITHMETIC, cmd_str, else first arg + uint16_t arg2; // 2nd arg (for push/pop/call/function) + const char *line; + char line_len; }; -bool parse_command_type(struct vm_instruction_t *vm_instr) { - return true; // STUB TODO implement +bool str_to_u16(uint16_t *res, char *s) +{ + size_t i; + *res = 0; + for (i = 0; i < 5 && s[i] != '\0'; ++i) { + if (!is_number(s[i])) { + err("error: invalid char '%c' in \"%s\"\n", s[i], s); + return false; + } + *res *= 10; + *res += s[i] - '0'; // add number to result + } + return true; } -bool parse_arg1(struct vm_instruction_t *vm_instr) { - return true; // STUB TODO implement +bool cleanup_vm_instr(struct vm_instruction_t *vm_instr) +{ + if (vm_instr->arg1 == NULL) + return true; + + free(vm_instr->arg1); + vm_instr->arg1 = NULL; + return true; } -bool parse_arg2(struct vm_instruction_t *vm_instr) { - return true; // STUB TODO implement +void print_vm_instruction(struct vm_instruction_t *vm_instr) +{ + printf("{\n\tcmd: %d,\n\targ1: \"%s\",\n\targ2: %hu,\n}\n", + vm_instr->cmd, vm_instr->arg1, vm_instr->arg2); } -bool parse_line(const char *line, struct vm_instruction_t *vm_instr) +// TODO: May not need parse_arg1(), parse_arg2(), could just have +// 'parse_command()' and manually read vm_instr attributes when needed. This +// makes sense because the entire command will need to be parsed anyway in +// order to set all the vm_instr attributes; no sense in having redundant +// functions. +// Or not. 
Separate functions could be useful, as each could have its own +// lookup table rather than 3 separate LUTs cluttering and bloating a single +// function. It doesn't matter _really_, but I do want clean code if possible. +// Will need to think on this... +// +// Expects the vm instruction line to _not_ start with whitespace (previously +// trimmed) +static bool parse_command_type(struct vm_instruction_t *vm_instr) { - vm_instr->original_vm_line = line; - DBGLOG("// %s\n", vm_instr->original_vm_line); + size_t i; + char cmd_str[CMD_STR_MAX_LEN + 1]; + + // TODO check line_len against CMD_STR_MAX_LEN + for (i = 0; i < CMD_STR_MAX_LEN && vm_instr->line[i] != '\0' + && (!is_whitespace(vm_instr->line[i])); ++i) { + cmd_str[i] = vm_instr->line[i]; + // TODO: check if too large, change loop + } + cmd_str[i] = '\0'; + + for (i = 0; i < (sizeof(arith_cmd_lut) / sizeof(arith_cmd_lut[0])); ++i) { + if (!strncmp(cmd_str, arith_cmd_lut[i], CMD_STR_MAX_LEN)) { + vm_instr->cmd = C_ARITHMETIC; + vm_instr->arg1 = malloc(CMD_STR_MAX_LEN + 1); + strncpy(vm_instr->arg1, arith_cmd_lut[i], CMD_STR_MAX_LEN); + return true; + } + } + for (i = 0; i < (sizeof(cmd_lut) / sizeof(cmd_lut[0])); ++i) { + if (!strncmp(cmd_str, cmd_lut[i], CMD_STR_MAX_LEN)) { + vm_instr->cmd = cmd_lut_vals[i]; + return true; + } + } + + err("error: illegal instruction in line \"%s\"\n", vm_instr->line); + return false; +} + +static bool parse_arg1(struct vm_instruction_t *vm_instr) +{ + size_t i, k; + + if (vm_instr->cmd == C_ARITHMETIC || vm_instr->cmd == C_RETURN) { + return true; // 'return' or arg1 is already parsed and correct + } else if (C_ARITHMETIC < vm_instr->cmd && vm_instr->cmd < C_UNUSED) { + vm_instr->arg1 = malloc(ARG1_MAX_LEN + 1); + // skip first token + for (i = 0; i < CMD_STR_MAX_LEN && vm_instr->line[i] != '\0' + && (!is_whitespace(vm_instr->line[i])); ++i) + ; + if (vm_instr->line[i] == '\0' || vm_instr->line[i] == '\n' + || vm_instr->line[i] == '\r') { + err("error: end of line encountered, 
no first arg\n"); + free(vm_instr->arg1); + return false; + } + // now at second token + // WARNING possibly an overflow here! TODO fix when I'm not lazy + for (++i, k = 0; is_symbol_char(vm_instr->line[i]) + && i < MAX_LINE_LEN; ++i, ++k) { + vm_instr->arg1[k] = vm_instr->line[i]; + } + vm_instr->arg1[k] = '\0'; + return true; + } + + err("error: illegal first argument in line \"%s\"\n", vm_instr->line); + return false; +} + +static bool parse_arg2(struct vm_instruction_t *vm_instr) +{ + bool in_whitespace = false; + uint8_t ws_count = 0; + uint16_t arg2_u16; + size_t i, k; + char c, arg2_str[ARG2_MAX_LEN + 1]; + + if (vm_instr->cmd == C_PUSH || vm_instr->cmd == C_POP + || vm_instr->cmd == C_CALL || vm_instr->cmd == C_FUNCTION) { + + + // skip first two tokens + for (i = 0; i < MAX_LINE_LEN && vm_instr->line[i] != '\0'; ++i) { + c = vm_instr->line[i]; + if (c == ' ' || c == '\t') { + if (!in_whitespace) { + ++ws_count; + in_whitespace = true; + } + } else { + in_whitespace = false; + } + + if (ws_count >= 2) + break; // break after two spaces/tabs found + } + if (vm_instr->line[i] == '\0' || vm_instr->line[i] == '\n' + || i >= MAX_LINE_LEN) { + err("error: end of line encountered, no first arg\n"); + free(vm_instr->arg1); + return false; + } + + // now at second token + for (++i, k = 0; k < ARG2_MAX_LEN + && is_number(vm_instr->line[i]); ++i, ++k) { + arg2_str[k] = vm_instr->line[i]; + } + arg2_str[k] = '\0'; + + if (!str_to_u16(&arg2_u16, arg2_str)) { + return false; + } + // TODO check if > 65535 (maybe?) 
+ vm_instr->arg2 = arg2_u16; + return true; + } + + err("error: can't parse 2nd arg from instruction type\n"); + return false; +} + +bool parse_line(struct vm_instruction_t *vm_instr) +{ + vm_instr->cmd = C_UNUSED; + vm_instr->arg1 = NULL; + vm_instr->arg2 = 0; // 0 is still valid value + if (!parse_command_type(vm_instr)) return false; if (!parse_arg1(vm_instr)) return false; - if (!parse_arg2(vm_instr)) - return false; + if (vm_instr->cmd == C_PUSH || vm_instr->cmd == C_POP + || vm_instr->cmd == C_FUNCTION || vm_instr->cmd == C_CALL) + if (!parse_arg2(vm_instr)) + return false; + return true; }
M
projects/07/src/util.h
→
projects/07/src/util.h
@@ -18,8 +18,11 @@ #else
#define DBGLOG(...) #endif -#define err(...) (fprintf(stderr, __VA_ARGS__), \ - fprintf(stderr, "%lu | %s\n", file_line_no, file_line), false) +extern char *file_line; // reference to currently-read line (for convenience) +extern size_t file_line_no; // line number, regardless of line content + +#define err(...) (fprintf(stdout, __VA_ARGS__), \ + fprintf(stdout, "%lu | %s\n", file_line_no, file_line), false) #define die(...) do { fprintf(stderr, __VA_ARGS__); exit(-1); } while (0) #define is_symbol_char(c) (('0' <= c && c <= '9') || ('A' <= c && c <= 'Z') \ || ('a' <= c && c <= 'z') || c == '_' || c == '.' \@@ -27,7 +30,7 @@ || c == '$' || c == ':')
#define is_whitespace(c) ((c == ' ' || c == '\t' || c == '\n' || c == '\r')) #define is_number(c) (('0' <= c && c <= '9')) -#define MAX_LINE_LEN 256 +#define MAX_LINE_LEN (256) size_t skip_whitespace(const char *line, size_t n)
M
projects/07/src/vmtranslator.c
→
projects/07/src/vmtranslator.c
@@ -10,16 +10,21 @@ #include "util.h"
#define _DEBUG -char *file_line; // reference to currently-read line (for convenience) -size_t file_line_no; // line number, regardless of line content +char *file_line; +size_t file_line_no; - -bool translate(FILE *in_file) +// translate: iterate over lines in in_file, translate VM instructions to +// assembly, write to out_file +bool translate(FILE *in_file, FILE *out_file) { - char *line, in_line[MAX_LINE_LEN], *translated; + char *line, in_line[MAX_LINE_LEN]; struct vm_instruction_t vm_instr; size_t i, line_len; + write_vm_init(out_file); + for (i = 0; i < MAX_STATIC_SYMBOLS; ++i) + static_symbol_map[i] = -1; + file_line_no = 0; //instruction_offset = 0; // TODO: unnecessary? while (fgets(in_line, MAX_LINE_LEN, in_file) != NULL) { // parse loop@@ -42,10 +47,13 @@ if (line_len > 1) {
if (line[0] == '/' && line[1] == '/') { // if comment continue; } else { - if (!parse_line(line, &vm_instr)) + vm_instr.line = line; + vm_instr.line_len = line_len; + if (!parse_line(&vm_instr)) return false; - if (!write_instruction(&vm_instr, in_file)) + if (!write_instruction(&vm_instr, out_file)) return false; + cleanup_vm_instr(&vm_instr); } } }@@ -74,12 +82,12 @@ in_file = fopen(argv[1], "r"); // read input file
if (in_file == NULL) die("failed to open %s for reading\n", argv[1]); - if(!translate(in_file)) // first pass + if(!translate(in_file, out_file)) // first pass die("failed to translate VM code in file\n"); if (fclose(in_file)) die("failed to close VM file\n"); - if (fclose(out_file)) + if (fclose(out_file)) // TODO check if stdout die("failed to close assembly output file\n"); return 0;
A
projects/07/tmp_test_run.sh
@@ -0,0 +1,13 @@
#!/bin/sh
# Print a VM source file, then run the VM translator on it.
# usage: tmp_test_run.sh [vm_file]
# Without an argument the default test file below is used.

#vm_file="test/mine/BasicTest.vm"
#vm_file="test/mine/PointerTest.vm"
#vm_file="test/mine/SimpleAdd.vm"
#vm_file="test/mine/StackTest.vm"
vm_file="${1:-test/mine/StaticTest.vm}"
#vm_file="test/StackArithmetic/SimpleAdd/SimpleAdd.vm"

# Stop early instead of running the translator on a file that cannot be read.
if [ ! -r "$vm_file" ]; then
	printf 'error: cannot read "%s"\n' "$vm_file" >&2
	exit 1
fi

cat "$vm_file" || exit 1
printf "\n====================================================\n"
./bin/vmtranslator "$vm_file"