const std = @import("std");
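
/// The kind of a token. Operators are not listed here: `op_kind` packs an
/// operator's bytes (up to three of them) directly into the `u24` tag, so the
/// enum is non-exhaustive and the named tags sit at low values that no
/// printable operator byte can collide with.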
pub const TokenKind = enum(u24) {
    name,
    number,
    eof,
    semicolon,
    invalid,
    _,
};
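
/// A token is a kind plus the byte range `start..end` into the source slice;
/// tokens never copy text out of the source.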
pub const Token = struct {
    kind: TokenKind,
    start: usize,
    end: usize,
};
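
// Single-byte character classes used by the scanner's state machine.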
fn is_alpha(c: u8) bool {
    return switch (c) {
        'a'...'z', 'A'...'Z', '_' => true,
        else => false,
    };
}

fn is_numeric(c: u8) bool {
    return switch (c) {
        '0'...'9' => true,
        else => false,
    };
}

fn is_whitespace(c: u8) bool {
    return switch (c) {
        ' ', '\t', '\n' => true,
        else => false,
    };
}

fn is_alphanumeric(c: u8) bool {
    return is_alpha(c) or is_numeric(c);
}

fn is_op(c: u8) bool {
    return switch (c) {
        '!', '@', '$', '%', '^', '&', '*', '(', ')', '-', '+', '=', '~', '[', ']', '{', '}', '|', ':', '<', '>', ',', '.', '?', '/' => true,
        else => false,
    };
}
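
/// Maps an operator's source text to a `TokenKind` by bit-packing up to three
/// bytes into the enum's `u24` tag, so each distinct operator gets a distinct
/// kind without an explicit table. Operators longer than three bytes map to
/// `.invalid`.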
pub fn op_kind(op: []const u8) TokenKind {
    const v: u24 = switch (op.len) {
        1 => @intCast(op[0]),
        2 => @intCast(@as(u16, @bitCast(op[0..2].*))),
        3 => @bitCast(op[0..3].*),
        else => @intFromEnum(TokenKind.invalid),
    };
    return @enumFromInt(v);
}
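
/// A streaming tokenizer with one token of lookahead: `peek` computes the
/// next token once and stashes it in `cache`, and `next` drains the stash.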
pub const Tokenizer = struct {
    src: []const u8,
    i: usize,
    cache: ?Token,

    pub fn init(src: []const u8) Tokenizer {
        return Tokenizer{ .src = src, .i = 0, .cache = null };
    }

    fn get_char(self: *Tokenizer) ?u8 {
        if (self.i < self.src.len) {
            // `defer` advances the cursor after the current byte is returned.
            defer self.i += 1;
            return self.src[self.i];
        } else {
            return null;
        }
    }

    fn peek_char(self: *Tokenizer) ?u8 {
        return if (self.i < self.src.len) self.src[self.i] else null;
    }

    pub fn at_end(self: *Tokenizer) bool {
        return self.i >= self.src.len;
    }
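
    /// Returns the upcoming token without consuming it.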
    pub fn peek(self: *Tokenizer) Token {
        if (self.cache) |c| {
            return c;
        }
        self.cache = self.get_next();
        return self.cache.?;
    }
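
    /// Returns the upcoming token and advances past it, preferring the
    /// token cached by an earlier `peek`.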
    pub fn next(self: *Tokenizer) Token {
        if (self.cache) |c| {
            self.cache = null;
            return c;
        }
        return self.get_next();
    }
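
    /// The scanner proper: a state machine written as a labeled switch,
    /// where `continue :st .state` jumps to another state and
    /// `break :st kind` finishes the token.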
    fn get_next(self: *Tokenizer) Token {
        const State = enum {
            start,
            op,
            name,
            whitespace,
            number,
        };
        var start = self.i;
        const kind: TokenKind = st: switch (State.start) {
            .start => {
                // Dispatch on the first byte of the token.
                const c = self.get_char() orelse break :st .eof;
                if (is_alpha(c))
                    continue :st .name
                else if (is_numeric(c))
                    continue :st .number
                else if (is_whitespace(c))
                    continue :st .whitespace
                else if (is_op(c))
                    continue :st .op
                else if (c == ';')
                    break :st .semicolon
                else
                    break :st .invalid;
            },
            .op => {
                // Greedily consume the whole operator run, then derive the
                // kind from the consumed bytes.
                while (is_op(self.peek_char() orelse 0)) {
                    self.i += 1;
                }
                break :st op_kind(self.src[start..self.i]);
            },
            .name => {
                while (is_alphanumeric(self.peek_char() orelse 0)) self.i += 1;
                break :st .name;
            },
            .whitespace => {
                // Skip the run, reset the token start, and scan again.
                while (is_whitespace(self.peek_char() orelse 0)) self.i += 1;
                start = self.i;
                continue :st .start;
            },
            .number => {
                while (is_numeric(self.peek_char() orelse 0)) self.i += 1;
                break :st .number;
            },
        };
        return .{ .kind = kind, .start = start, .end = self.i };
    }
};
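
/// Comptime test helper. `expected` mirrors `input` byte-for-byte with one
/// marker per byte: 'a' for a name, '0' for a number, 'o' for an operator,
/// ';' for a semicolon, and ' ' or '\' for skipped whitespace ('\' stands in
/// for '\n' and '\t'). Each run of identical markers is checked as one token.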
fn check_tokenizer(comptime input: []const u8, comptime expected: []const u8) !void {
    comptime {
        var tokenizer = Tokenizer.init(input);
        var i = std.mem.indexOfNone(u8, expected, " \\") orelse unreachable;
        while (i < expected.len) {
            const j = std.mem.indexOfNonePos(u8, expected, i, &[_]u8{expected[i]}) orelse expected.len;
            const k = switch (expected[i]) {
                'a' => .name,
                '0' => .number,
                'o' => op_kind(input[i..j]),
                ';' => .semicolon,
                else => unreachable,
            };
            try std.testing.expectEqual(Token{ .kind = k, .start = i, .end = j }, tokenizer.next());
            i = std.mem.indexOfNonePos(u8, expected, j, " \\") orelse expected.len;
        }
        try std.testing.expectEqual(Token{ .kind = .eof, .start = expected.len, .end = expected.len }, tokenizer.next());
    }
}
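
// Each test pairs an input with an expected marker string; see
// `check_tokenizer` for the marker legend.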
test "basic" {
|
|
const i = "aaa-bb+c-";
|
|
const e = "aaaoaaoao";
|
|
try check_tokenizer(i, e);
|
|
}
|
|
test "number" {
|
|
const i = "1 + 2 + 3";
|
|
const e = "0 o 0 o 0";
|
|
try check_tokenizer(i, e);
|
|
}
|
|
test "spacing" {
|
|
const i = "\n 1 \n\t\t \t +++ 2292929 + 3\t";
|
|
const e = "\\ 0 \\\\\\ \\ ooo 0000000 o 0\\";
|
|
try check_tokenizer(i, e);
|
|
}
|
|
test "equals" {
|
|
const i = "a += 2; b -= 3; c = 5;";
|
|
const e = "a oo 0; a oo 0; a o 0;";
|
|
try check_tokenizer(i, e);
|
|
}
|
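
// A minimal usage sketch (an illustrative addition, not one of the original
// tests): pull tokens with `peek`/`next` until `.eof` and slice each token's
// text back out of the source.
test "usage sketch" {
    const src = "abc + 12;";
    var tokenizer = Tokenizer.init(src);
    var texts: [4][]const u8 = undefined;
    var n: usize = 0;
    while (tokenizer.peek().kind != .eof) : (n += 1) {
        const tok = tokenizer.next();
        texts[n] = src[tok.start..tok.end];
    }
    try std.testing.expectEqual(@as(usize, 4), n);
    try std.testing.expectEqualStrings("abc", texts[0]);
    try std.testing.expectEqualStrings("+", texts[1]);
    try std.testing.expectEqualStrings("12", texts[2]);
    try std.testing.expectEqualStrings(";", texts[3]);
}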