const std = @import("std");

/// Token kinds. Operators are encoded as unnamed enum values: the raw bytes of
/// the operator text (up to three bytes) are packed into the u24 tag, so every
/// punctuation sequence gets a distinct kind without naming each one.
pub const TokenKind = enum(u24) {
    name,
    number,
    eof,
    semicolon,
    invalid,
    _,
};

pub const Token = struct {
    kind: TokenKind,
    start: usize,
    end: usize,
};

fn is_alpha(c: u8) bool {
    return switch (c) {
        'a'...'z', 'A'...'Z', '_' => true,
        else => false,
    };
}

fn is_numeric(c: u8) bool {
    return switch (c) {
        '0'...'9' => true,
        else => false,
    };
}

fn is_whitespace(c: u8) bool {
    return switch (c) {
        ' ', '\t', '\n' => true,
        else => false,
    };
}

fn is_alphanumeric(c: u8) bool {
    return is_alpha(c) or is_numeric(c);
}

fn is_op(c: u8) bool {
    return switch (c) {
        '!', '@', '$', '%', '^', '&', '*', '(', ')', '-', '+', '=', '~', '[', ']', '{', '}', '|', ':', '<', '>', ',', '.', '?', '/' => true,
        else => false,
    };
}

/// Packs an operator's bytes (one to three characters) into a TokenKind tag
/// value. Longer operator runs map to `.invalid`.
pub fn op_kind(op: []const u8) TokenKind {
    const v: u24 = switch (op.len) {
        1 => @intCast(op[0]),
        2 => @intCast(@as(u16, @bitCast(op[0..2].*))),
        3 => @bitCast(op[0..3].*),
        else => @intFromEnum(TokenKind.invalid),
    };
    return @enumFromInt(v);
}

pub const Tokenizer = struct {
    src: []const u8,
    i: usize,
    /// Holds a token produced by `peek` until `next` consumes it.
    cache: ?Token,

    pub fn init(src: []const u8) Tokenizer {
        return Tokenizer{ .src = src, .i = 0, .cache = null };
    }

    fn get_char(self: *Tokenizer) ?u8 {
        if (self.i < self.src.len) {
            defer self.i += 1;
            return self.src[self.i];
        } else {
            return null;
        }
    }

    fn peek_char(self: *Tokenizer) ?u8 {
        return if (self.i < self.src.len) self.src[self.i] else null;
    }

    pub fn at_end(self: *Tokenizer) bool {
        return self.i >= self.src.len;
    }

    pub fn peek(self: *Tokenizer) Token {
        if (self.cache) |c| {
            return c;
        }
        self.cache = self.get_next();
        return self.cache.?;
    }

    pub fn next(self: *Tokenizer) Token {
        if (self.cache) |c| {
            self.cache = null;
            return c;
        }
        return self.get_next();
    }

    /// State machine over a labeled switch: dispatch on the first character,
    /// then consume the rest of the token in the matching state.
    fn get_next(self: *Tokenizer) Token {
        const State = enum {
            start,
            op,
            name,
            whitespace,
            number,
        };
        var start = self.i;
        const kind: TokenKind = st: switch (State.start) {
            .start => {
                const c = self.get_char() orelse break :st .eof;
                if (is_alpha(c))
                    continue :st .name
                else if (is_numeric(c))
                    continue :st .number
                else if (is_whitespace(c))
                    continue :st .whitespace
                else if (is_op(c))
                    continue :st .op
                else if (c == ';')
                    break :st .semicolon
                else
                    break :st .invalid;
            },
            .op => {
                while (is_op(self.peek_char() orelse 0)) {
                    self.i += 1;
                }
                break :st op_kind(self.src[start..self.i]);
            },
            .name => {
                while (is_alphanumeric(self.peek_char() orelse 0)) self.i += 1;
                break :st .name;
            },
            .whitespace => {
                // Skip whitespace entirely and restart token recognition.
                while (is_whitespace(self.peek_char() orelse 0)) self.i += 1;
                start = self.i;
                continue :st .start;
            },
            .number => {
                while (is_numeric(self.peek_char() orelse 0)) self.i += 1;
                break :st .number;
            },
        };
        return .{ .kind = kind, .start = start, .end = self.i };
    }
};

/// Checks the token stream for `input` against `expected`, a same-length
/// string where 'a' marks a name, '0' a number, 'o' an operator, ';' a
/// semicolon, and ' ' or '\' whitespace to be skipped.
fn check_tokenizer(comptime input: []const u8, comptime expected: []const u8) !void {
    comptime {
        var tokenizer = Tokenizer.init(input);
        var i = std.mem.indexOfNone(u8, expected, " \\") orelse unreachable;
        while (i < expected.len) {
            const j = std.mem.indexOfNonePos(u8, expected, i, &[_]u8{expected[i]}) orelse expected.len;
            const k = switch (expected[i]) {
                'a' => .name,
                '0' => .number,
                'o' => op_kind(input[i..j]),
                ';' => .semicolon,
                else => unreachable,
            };
            try std.testing.expectEqual(Token{ .kind = k, .start = i, .end = j }, tokenizer.next());
            i = std.mem.indexOfNonePos(u8, expected, j, " \\") orelse expected.len;
        }
        try std.testing.expectEqual(Token{ .kind = .eof, .start = expected.len, .end = expected.len }, tokenizer.next());
    }
}

test "basic" {
    const i = "aaa-bb+c-";
    const e = "aaaoaaoao";
    try check_tokenizer(i, e);
}

test "number" {
    const i = "1 + 2 + 3";
    const e = "0 o 0 o 0";
    try check_tokenizer(i, e);
}

test "spacing" {
    const i = "\n 1 \n\t\t \t +++ 2292929 + 3\t";
    const e = "\\ 0 \\\\\\ \\ ooo 0000000 o 0\\";
    try check_tokenizer(i, e);
}

test "equals" {
    const i = "a += 2; b -= 3; c = 5;";
    const e = "a oo 0; a oo 0; a o 0;";
    try check_tokenizer(i, e);
}
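// A minimal usage sketch, not part of the original test suite: it walks a
// short, made-up input with `peek`/`next` and spells out the expected kinds
// directly instead of going through `check_tokenizer`.
test "usage sketch" {
    var tokenizer = Tokenizer.init("x + 1;");
    // `peek` caches the upcoming token, so the following `next` returns the same one.
    try std.testing.expectEqual(tokenizer.peek(), tokenizer.next());
    try std.testing.expectEqual(op_kind("+"), tokenizer.next().kind);
    try std.testing.expectEqual(TokenKind.number, tokenizer.next().kind);
    try std.testing.expectEqual(TokenKind.semicolon, tokenizer.next().kind);
    try std.testing.expectEqual(TokenKind.eof, tokenizer.next().kind);
    try std.testing.expect(tokenizer.at_end());
}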