//! suzu/src/frontend/token.zig
//! (export metadata: 2025-05-29 19:20:42 -05:00 · 179 lines · 5 KiB · Zig)
const std = @import("std");
/// Kind of a lexed token.
///
/// Non-exhaustive (`_`) on purpose: operator tokens are represented by
/// packing the operator's 1-3 ASCII bytes directly into the u24 tag (see
/// `op_kind`). Operator bytes are all >= 33, so packed values never collide
/// with the named tags below (0..4).
pub const TokenKind = enum(u24) {
    name,
    number,
    eof,
    semicolon,
    invalid,
    _,
};
/// A single token: its kind plus the half-open byte range [start, end)
/// into the source slice it was lexed from.
pub const Token = struct {
    kind: TokenKind,
    // Byte offset of the first character of the token.
    start: usize,
    // Byte offset one past the last character of the token.
    end: usize,
};
/// True for ASCII letters and underscore (valid name-start characters).
fn is_alpha(c: u8) bool {
    if (c == '_') return true;
    // Folding bit 0x20 maps 'A'..'Z' onto 'a'..'z'; everything outside
    // the letter ranges stays outside after folding.
    const folded = c | 0x20;
    return folded >= 'a' and folded <= 'z';
}
/// True for ASCII decimal digits '0'..'9'.
fn is_numeric(c: u8) bool {
    return c >= '0' and c <= '9';
}
/// True for characters the tokenizer skips between tokens.
///
/// Fix: include '\r' so CRLF (Windows) line endings are skipped like any
/// other whitespace instead of being lexed as `.invalid` tokens.
fn is_whitespace(c: u8) bool {
    return switch (c) {
        ' ', '\t', '\n', '\r' => true,
        else => false,
    };
}
/// True for characters valid inside a name: letters, '_', or digits.
fn is_alphanumeric(c: u8) bool {
    return is_numeric(c) or is_alpha(c);
}
/// True for punctuation characters that may form operator tokens.
/// Note ';' is deliberately absent: it is lexed as its own `.semicolon` kind.
fn is_op(c: u8) bool {
    const operator_chars = "!@$%^&*()-+=~[]{}|:<>,.?/";
    return std.mem.indexOfScalar(u8, operator_chars, c) != null;
}
/// Maps an operator lexeme (1-3 bytes) to a TokenKind by packing the bytes
/// into the enum's u24 tag.
///
/// The 2- and 3-byte cases use @bitCast, so the packed value depends on the
/// host's native byte order; that is consistent within one build, which is
/// all callers rely on (they compare against values produced by this same
/// function). Operator bytes are all >= '!' (33), so the packed value can
/// never collide with the named tags (0..4). Lexemes longer than 3 bytes
/// map to `.invalid`.
pub fn op_kind(op: []const u8) TokenKind {
    const v: u24 = switch (op.len) {
        1 => @intCast(op[0]),
        2 => @intCast(@as(u16, @bitCast(op[0..2].*))),
        3 => @bitCast(op[0..3].*),
        else => @intFromEnum(TokenKind.invalid),
    };
    return @enumFromInt(v);
}
/// Single-pass tokenizer over a source slice, with one token of lookahead.
pub const Tokenizer = struct {
    /// Full source text being scanned (not owned).
    src: []const u8,
    /// Byte offset of the next unread character.
    i: usize,
    /// One-token lookahead buffer, filled by `peek`, drained by `next`.
    cache: ?Token,
    pub fn init(src: []const u8) Tokenizer {
        return Tokenizer{ .src = src, .i = 0, .cache = null };
    }
    // Returns the current character and advances past it, or null at end.
    fn get_char(self: *Tokenizer) ?u8 {
        if (self.i < self.src.len) {
            // `defer` runs after the return value is computed, so the
            // character at the pre-increment offset is returned.
            defer self.i += 1;
            return self.src[self.i];
        } else {
            return null;
        }
    }
    // Returns the current character without advancing, or null at end.
    fn peek_char(self: *Tokenizer) ?u8 {
        return if (self.i < self.src.len) self.src[self.i] else null;
    }
    /// True once every byte of input has been consumed.
    /// NOTE(review): ignores `cache`, so a peeked token may still be pending
    /// even when this returns true — confirm callers expect that.
    pub fn at_end(self: *Tokenizer) bool {
        return self.i >= self.src.len;
    }
    /// Returns the next token without consuming it; repeated calls return
    /// the same token.
    pub fn peek(self: *Tokenizer) Token {
        if (self.cache) |c| {
            return c;
        }
        self.cache = self.get_next();
        return self.cache.?;
    }
    /// Returns and consumes the next token (draining the lookahead cache
    /// first if `peek` filled it).
    pub fn next(self: *Tokenizer) Token {
        if (self.cache) |c| {
            self.cache = null;
            return c;
        }
        return self.get_next();
    }
    // Core scanner: a state machine built on a labeled switch —
    // `continue :st .x` jumps to state .x, `break :st kind` yields the
    // token kind. `start`/`self.i` bracket the token's byte range.
    fn get_next(self: *Tokenizer) Token {
        const State = enum {
            start,
            op,
            name,
            whitespace,
            number,
        };
        var start = self.i;
        const kind: TokenKind = st: switch (State.start) {
            .start => {
                // Classify (and consume) the first character.
                const c = self.get_char() orelse break :st .eof;
                if (is_alpha(c))
                    continue :st .name
                else if (is_numeric(c))
                    continue :st .number
                else if (is_whitespace(c))
                    continue :st .whitespace
                else if (is_op(c))
                    continue :st .op
                else if (c == ';')
                    break :st .semicolon
                else
                    break :st .invalid;
            },
            .op => {
                // Greedily extend to a maximal run of operator characters,
                // then pack the lexeme into a TokenKind (see op_kind).
                while (is_op(self.peek_char() orelse 0)) {
                    self.i += 1;
                }
                break :st op_kind(self.src[start..self.i]);
            },
            .name => {
                while (is_alphanumeric(self.peek_char() orelse 0)) self.i += 1;
                break :st .name;
            },
            .whitespace => {
                // Skip the whitespace run, reset the token start, and
                // rescan from the new offset.
                while (is_whitespace(self.peek_char() orelse 0)) self.i += 1;
                start = self.i;
                continue :st .start;
            },
            .number => {
                while (is_numeric(self.peek_char() orelse 0)) self.i += 1;
                break :st .number;
            },
        };
        return .{ .kind = kind, .start = start, .end = self.i };
    }
};
/// Comptime test helper: tokenizes `input` and checks the token stream
/// against `expected`, a same-length picture string where each position
/// marks the kind of the input character at that position:
///   'a' = name, '0' = number, 'o' = operator, ';' = semicolon,
///   ' ' and '\\' = skipped (whitespace in the input).
/// Runs of identical marker characters delimit one token's [start, end).
fn check_tokenizer(comptime input: []const u8, comptime expected: []const u8) !void {
    comptime {
        var tokenizer = Tokenizer.init(input);
        // First non-skip position; `unreachable` means the picture must
        // contain at least one token marker.
        var i = std.mem.indexOfNone(u8, expected, " \\") orelse unreachable;
        while (i < expected.len) {
            // j = one past the run of identical marker chars starting at i.
            const j = std.mem.indexOfNonePos(u8, expected, i, &[_]u8{expected[i]}) orelse expected.len;
            const k = switch (expected[i]) {
                'a' => .name,
                '0' => .number,
                // Operator markers align with the input, so input[i..j] is
                // the operator lexeme itself.
                'o' => op_kind(input[i..j]),
                ';' => .semicolon,
                else => unreachable,
            };
            try std.testing.expectEqual(Token{ .kind = k, .start = i, .end = j }, tokenizer.next());
            // Advance past any skip markers to the next token.
            i = std.mem.indexOfNonePos(u8, expected, j, " \\") orelse expected.len;
        }
        // Stream must end with an eof token positioned at end-of-input.
        try std.testing.expectEqual(Token{ .kind = .eof, .start = expected.len, .end = expected.len }, tokenizer.next());
    }
}
test "basic" {
    // Names split by single-char operators; trailing operator token.
    const input = "aaa-bb+c-";
    const expected = "aaaoaaoao";
    try check_tokenizer(input, expected);
}
test "number" {
    // Numbers separated by spaced operators.
    const input = "1 + 2 + 3";
    const expected = "0 o 0 o 0";
    try check_tokenizer(input, expected);
}
test "spacing" {
    // Mixed tabs/newlines ('\\' markers), a 3-char operator run, and a
    // multi-digit number.
    const input = "\n 1 \n\t\t \t +++ 2292929 + 3\t";
    const expected = "\\ 0 \\\\\\ \\ ooo 0000000 o 0\\";
    try check_tokenizer(input, expected);
}
test "equals" {
    // Two-char compound assignment operators plus semicolon tokens.
    const input = "a += 2; b -= 3; c = 5;";
    const expected = "a oo 0; a oo 0; a o 0;";
    try check_tokenizer(input, expected);
}