From 3eb89830285e242597d22a8403dbd6c4426eae40 Mon Sep 17 00:00:00 2001
From: caandt
Date: Wed, 4 Jun 2025 21:35:30 -0500
Subject: [PATCH] check for valid op in tokenizer
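
Merge the separate prefix/infix handler maps into a single Operator
table whose entries fall back to no_prefix/no_infix, and hand that
table to the tokenizer. While scanning an operator the tokenizer now
tries the longest registered match first (three characters, then two,
then one) and emits .invalid when nothing is registered, instead of
consuming every operator character and emitting whatever op_kind()
says for the whole run.

Operator kinds pack five bits per character (char0 + (char1 << 5) +
(char2 << 10)), so candidate kinds can be built up incrementally while
scanning; the value assembled for "+=" is the same one op_kind("+=")
returns, which is how the ops.contains() lookups line up with
registration. For example, with "+" and "+=" registered but not "++",
"a++b" now lexes as name, "+", "+", name rather than as an
unregistered "++" token that the parser would reject.

Parser.init builds the tokenizer itself now, so callers pass source
text directly (see src/main.zig).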
---
 src/frontend/parse.zig | 52 +++++++++++++++++++++-------------
 src/frontend/token.zig | 68 +++++++++++++++++++++++++++---------------
 src/main.zig           |  3 +-
 3 files changed, 77 insertions(+), 46 deletions(-)

diff --git a/src/frontend/parse.zig b/src/frontend/parse.zig
index 9e2df45..dd64075 100644
--- a/src/frontend/parse.zig
+++ b/src/frontend/parse.zig
@@ -37,9 +37,18 @@ pub const Expr = union(enum) {
     }
 };
 
-const Error = error{ InvalidOp, SyntaxError, OutOfMemory };
-const PrefixHandler = struct { precedence: u32, parse: *const fn (*Parser, u32, Token) Error!*const Expr };
-const InfixHandler = struct { precedence: u32, parse: *const fn (*Parser, u32, *const Expr, Token) Error!*const Expr };
+pub const Error = error{ InvalidOp, SyntaxError, OutOfMemory };
+fn no_prefix(_: *Parser, _: u32, _: Token) Error!*const Expr {
+    return Error.InvalidOp;
+}
+fn no_infix(_: *Parser, _: u32, _: *const Expr, _: Token) Error!*const Expr {
+    return Error.InvalidOp;
+}
+pub const Operator = struct {
+    precedence: u32 = 0,
+    parse_prefix: *const fn (*Parser, u32, Token) Error!*const Expr = no_prefix,
+    parse_infix: *const fn (*Parser, u32, *const Expr, Token) Error!*const Expr = no_infix,
+};
 
 fn prefix_atom(parser: *Parser, _: u32, tok: Token) Error!*const Expr {
     return try parser.make_expr(.{ .atom = tok });
@@ -63,18 +72,17 @@ fn infix_binop(parser: *Parser, precedence: u32, lhs: *const Expr, tok: Token) E
 pub const Parser = struct {
     tokenizer: token.Tokenizer,
     allocator: std.mem.Allocator,
-    prefixes: std.AutoHashMap(TokenKind, PrefixHandler),
-    infixes: std.AutoHashMap(TokenKind, InfixHandler),
+    ops: std.AutoHashMap(TokenKind, Operator),
     pub fn parse(self: *Parser, precedence: u32) Error!*const Expr {
         const tok = self.tokenizer.next();
-        const prefix = self.prefixes.get(tok.kind) orelse return Error.InvalidOp;
-        var left = try prefix.parse(self, prefix.precedence, tok);
-        var infix: InfixHandler = undefined;
+        const prefix = self.ops.get(tok.kind) orelse return Error.InvalidOp;
+        var left = try prefix.parse_prefix(self, prefix.precedence, tok);
+        var infix: Operator = undefined;
         while (w: {
-            infix = self.infixes.get(self.tokenizer.peek().kind) orelse break :w false;
+            infix = self.ops.get(self.tokenizer.peek().kind) orelse break :w false;
             break :w infix.precedence > precedence;
         }) {
-            left = try infix.parse(self, infix.precedence, left, self.tokenizer.next());
+            left = try infix.parse_infix(self, infix.precedence, left, self.tokenizer.next());
         }
         return left;
     }
@@ -84,19 +92,20 @@ pub const Parser = struct {
         return ptr;
     }
     fn register_unop(self: *Parser, op: []const u8, precedence: u32) !void {
-        try self.prefixes.put(op_kind(op), .{ .precedence = precedence, .parse = prefix_unop });
+        const op_p = try self.ops.getOrPutValue(op_kind(op), .{ .precedence = precedence });
+        op_p.value_ptr.parse_prefix = prefix_unop;
    }
     fn register_binop(self: *Parser, op: []const u8, precedence: u32) !void {
-        try self.infixes.put(op_kind(op), .{ .precedence = precedence, .parse = infix_binop });
+        const op_p = try self.ops.getOrPutValue(op_kind(op), .{ .precedence = precedence });
+        op_p.value_ptr.parse_infix = infix_binop;
     }
-    pub fn init(tokenizer: token.Tokenizer, allocator: std.mem.Allocator) !Parser {
-        const prefixes = std.AutoHashMap(TokenKind, PrefixHandler).init(allocator);
-        const infixes = std.AutoHashMap(TokenKind, InfixHandler).init(allocator);
-        var p = Parser{ .tokenizer = tokenizer, .allocator = allocator, .prefixes = prefixes, .infixes = infixes };
+    pub fn init(src: []const u8, allocator: std.mem.Allocator) !Parser {
+        const ops = std.AutoHashMap(TokenKind, Operator).init(allocator);
+        var p = Parser{ .tokenizer = token.Tokenizer.init(src, ops), .allocator = allocator, .ops = ops };
 
-        try p.prefixes.put(TokenKind.name, .{ .precedence = 0, .parse = prefix_atom });
-        try p.prefixes.put(TokenKind.number, .{ .precedence = 0, .parse = prefix_atom });
-        try p.prefixes.put(op_kind("("), .{ .precedence = 0, .parse = prefix_paren });
+        try p.ops.put(TokenKind.name, .{ .parse_prefix = prefix_atom });
+        try p.ops.put(TokenKind.number, .{ .parse_prefix = prefix_atom });
+        try p.ops.put(op_kind("("), .{ .parse_prefix = prefix_paren });
 
         try p.register_unop("+", 4);
         try p.register_unop("-", 4);
@@ -107,7 +116,8 @@ pub const Parser = struct {
+        // p.ops may have reallocated during registration; refresh the tokenizer's by-value copy
+        p.tokenizer.ops = p.ops;
         return p;
     }
     pub fn free(self: *Parser) void {
-        self.prefixes.clearAndFree();
-        self.infixes.clearAndFree();
+        self.ops.clearAndFree();
     }
 };
diff --git a/src/frontend/token.zig b/src/frontend/token.zig
index d239479..13048f9 100644
--- a/src/frontend/token.zig
+++ b/src/frontend/token.zig
@@ -61,12 +61,14 @@ pub fn op_kind(op: []const u8) TokenKind {
     return @enumFromInt(v);
 }
 
+const Operator = @import("parse.zig").Operator;
 pub const Tokenizer = struct {
     src: []const u8,
-    i: usize,
-    cache: ?Token,
-    pub fn init(src: []const u8) Tokenizer {
-        return Tokenizer{ .src = src, .i = 0, .cache = null };
+    i: usize = 0,
+    cache: ?Token = null,
+    ops: std.AutoHashMap(TokenKind, Operator),
+    pub fn init(src: []const u8, ops: std.AutoHashMap(TokenKind, Operator)) Tokenizer {
+        return Tokenizer{ .src = src, .ops = ops };
     }
     fn get_char(self: *Tokenizer) ?u8 {
         if (self.i < self.src.len) {
@@ -122,10 +124,27 @@ pub const Tokenizer = struct {
                 break :st .invalid;
             },
             .op => {
-                while (is_op(self.peek_char() orelse 0)) {
-                    self.i += 1;
+                // greedily match the longest operator that is actually
+                // registered: three characters, then two, then one
+                const first: u16 = op_map[self.src[start]];
+                if (is_op(self.peek_char() orelse 0)) {
+                    const second = @as(u16, op_map[self.src[self.i]]) << 5;
+                    if (self.i + 1 < self.src.len and is_op(self.src[self.i + 1])) {
+                        const third = @as(u16, op_map[self.src[self.i + 1]]) << 10;
+                        if (self.ops.contains(@enumFromInt(first + second + third))) {
+                            self.i += 2;
+                            break :st @enumFromInt(first + second + third);
+                        }
+                    }
+                    if (self.ops.contains(@enumFromInt(first + second))) {
+                        self.i += 1;
+                        break :st @enumFromInt(first + second);
+                    }
                 }
-                break :st op_kind(self.src[start..self.i]);
+                if (self.ops.contains(@enumFromInt(first))) {
+                    break :st @enumFromInt(first);
+                }
+                break :st .invalid;
             },
             .name => {
                 while (is_alphanumeric(self.peek_char() orelse 0)) self.i += 1;
@@ -146,23 +165,26 @@ pub const Tokenizer = struct {
 };
 
 fn check_tokenizer(comptime input: []const u8, comptime expected: []const u8) !void {
-    comptime {
-        var tokenizer = Tokenizer.init(input);
-        var i = std.mem.indexOfNone(u8, expected, " \\") orelse unreachable;
-        while (i < expected.len) {
-            const j = std.mem.indexOfNonePos(u8, expected, i, &[_]u8{expected[i]}) orelse expected.len;
-            const k = switch (expected[i]) {
-                'a' => .name,
-                '0' => .number,
-                'o' => op_kind(input[i..j]),
-                ';' => .semicolon,
-                else => unreachable,
-            };
-            try std.testing.expectEqual(Token{ .kind = k, .start = i, .end = j }, tokenizer.next());
-            i = std.mem.indexOfNonePos(u8, expected, j, " \\") orelse expected.len;
-        }
-        try std.testing.expectEqual(Token{ .kind = .eof, .start = expected.len, .end = expected.len }, tokenizer.next());
+    var ops = std.AutoHashMap(TokenKind, Operator).init(std.testing.allocator);
+    defer ops.clearAndFree();
+    for ([_][]const u8{ "(", "+", "-", "*", "/", "+++", "+=", "-=", "=" }) |x| {
+        try ops.put(op_kind(x), .{});
     }
+    var tokenizer = Tokenizer.init(input, ops);
+    var i = std.mem.indexOfNone(u8, expected, " \\") orelse unreachable;
+    while (i < expected.len) {
+        const j = std.mem.indexOfNonePos(u8, expected, i, &[_]u8{expected[i]}) orelse expected.len;
+        const k = switch (expected[i]) {
+            'a' => .name,
+            '0' => .number,
+            'o' => op_kind(input[i..j]),
+            ';' => .semicolon,
+            else => unreachable,
+        };
+        try std.testing.expectEqual(Token{ .kind = k, .start = i, .end = j }, tokenizer.next());
+        i = std.mem.indexOfNonePos(u8, expected, j, " \\") orelse expected.len;
+    }
+    try std.testing.expectEqual(Token{ .kind = .eof, .start = expected.len, .end = expected.len }, tokenizer.next());
 }
 
 test "basic" {
diff --git a/src/main.zig b/src/main.zig
index d71c24d..03d8979 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -18,8 +18,7 @@ pub fn main() !void {
     const content = try file.readToEndAlloc(allocator, std.math.maxInt(usize));
     defer allocator.free(content);
 
-    const t = lib.token.Tokenizer.init(content);
-    var p = try lib.parse.Parser.init(t, allocator);
+    var p = try lib.parse.Parser.init(content, allocator);
     defer p.free();
     std.debug.print("{}\n", .{(try p.parse(0)).eval(content)});
 }
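
P.S. A possible follow-up test for the longest-match scan, sketched in
the style of the existing check_tokenizer cases. This is only a sketch:
it assumes whitespace is skipped the way the expected-string notation
above implies, and the elided test "basic" body may already cover some
of these inputs.

    test "longest registered op wins" {
        // "+++" is preferred over three separate "+" tokens
        try check_tokenizer("a+++b", "aoooa");
        // "+=" is preferred over "+" followed by "="
        try check_tokenizer("a+=b", "aooa");
        // a lone "=" still matches as a single-character op
        try check_tokenizer("a=b", "aoa");
    }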