Skip to content

Commit 377e857

Browse files
committed
std.zig.tokenizer: simplify
I pointed a fuzzer at the tokenizer and it crashed immediately. Upon inspection, I was dissatisfied with the implementation. This commit removes several mechanisms: * Removes the "invalid byte" compile error note. * Dramatically simplifies tokenizer recovery by making recovery always occur at newlines, and never otherwise. * Removes UTF-8 validation. * Moves some character validation logic to `std.zig.parseCharLiteral`. Removing UTF-8 validation is a regression of #663, however, the existing implementation was already buggy. When adding this functionality back, it must be fuzz-tested while checking the property that it matches an independent Unicode validation implementation on the same file. While we're at it, fuzzing should check the other properties of that proposal, such as no ASCII control characters existing inside the source code. Other changes included in this commit: * Deprecate `std.unicode.utf8Decode` and its WTF-8 counterpart. This function has an awkward API that is too easy to misuse. * Make `utf8Decode2` and friends use arrays as parameters, eliminating a runtime assertion in favor of using the type system. After this commit, the crash found by fuzzing, which was "\x07\xd5\x80\xc3=o\xda|a\xfc{\x9a\xec\x91\xdf\x0f\\\x1a^\xbe;\x8c\xbf\xee\xea" no longer causes a crash. However, I did not feel the need to add this test case because the simplified logic eradicates most crashes of this nature.
1 parent c08effc commit 377e857

12 files changed

+234
-392
lines changed

lib/std/unicode.zig

+14-19
Original file line numberDiff line numberDiff line change
@@ -95,16 +95,13 @@ pub inline fn utf8EncodeComptime(comptime c: u21) [
9595

9696
const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;
9797

98-
/// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
99-
/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
100-
/// If you already know the length at comptime, you can call one of
101-
/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
98+
/// Deprecated. This function has an awkward API that is too easy to use incorrectly.
10299
pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 {
103100
return switch (bytes.len) {
104-
1 => @as(u21, bytes[0]),
105-
2 => utf8Decode2(bytes),
106-
3 => utf8Decode3(bytes),
107-
4 => utf8Decode4(bytes),
101+
1 => bytes[0],
102+
2 => utf8Decode2(bytes[0..2].*),
103+
3 => utf8Decode3(bytes[0..3].*),
104+
4 => utf8Decode4(bytes[0..4].*),
108105
else => unreachable,
109106
};
110107
}
@@ -113,8 +110,7 @@ const Utf8Decode2Error = error{
113110
Utf8ExpectedContinuation,
114111
Utf8OverlongEncoding,
115112
};
116-
pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
117-
assert(bytes.len == 2);
113+
pub fn utf8Decode2(bytes: [2]u8) Utf8Decode2Error!u21 {
118114
assert(bytes[0] & 0b11100000 == 0b11000000);
119115
var value: u21 = bytes[0] & 0b00011111;
120116

@@ -130,7 +126,7 @@ pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
130126
const Utf8Decode3Error = Utf8Decode3AllowSurrogateHalfError || error{
131127
Utf8EncodesSurrogateHalf,
132128
};
133-
pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 {
129+
pub fn utf8Decode3(bytes: [3]u8) Utf8Decode3Error!u21 {
134130
const value = try utf8Decode3AllowSurrogateHalf(bytes);
135131

136132
if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf;
@@ -142,8 +138,7 @@ const Utf8Decode3AllowSurrogateHalfError = error{
142138
Utf8ExpectedContinuation,
143139
Utf8OverlongEncoding,
144140
};
145-
pub fn utf8Decode3AllowSurrogateHalf(bytes: []const u8) Utf8Decode3AllowSurrogateHalfError!u21 {
146-
assert(bytes.len == 3);
141+
pub fn utf8Decode3AllowSurrogateHalf(bytes: [3]u8) Utf8Decode3AllowSurrogateHalfError!u21 {
147142
assert(bytes[0] & 0b11110000 == 0b11100000);
148143
var value: u21 = bytes[0] & 0b00001111;
149144

@@ -165,8 +160,7 @@ const Utf8Decode4Error = error{
165160
Utf8OverlongEncoding,
166161
Utf8CodepointTooLarge,
167162
};
168-
pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
169-
assert(bytes.len == 4);
163+
pub fn utf8Decode4(bytes: [4]u8) Utf8Decode4Error!u21 {
170164
assert(bytes[0] & 0b11111000 == 0b11110000);
171165
var value: u21 = bytes[0] & 0b00000111;
172166

@@ -1637,12 +1631,13 @@ pub fn wtf8Encode(c: u21, out: []u8) error{CodepointTooLarge}!u3 {
16371631

16381632
const Wtf8DecodeError = Utf8Decode2Error || Utf8Decode3AllowSurrogateHalfError || Utf8Decode4Error;
16391633

1634+
/// Deprecated. This function has an awkward API that is too easy to use incorrectly.
16401635
pub fn wtf8Decode(bytes: []const u8) Wtf8DecodeError!u21 {
16411636
return switch (bytes.len) {
1642-
1 => @as(u21, bytes[0]),
1643-
2 => utf8Decode2(bytes),
1644-
3 => utf8Decode3AllowSurrogateHalf(bytes),
1645-
4 => utf8Decode4(bytes),
1637+
1 => bytes[0],
1638+
2 => utf8Decode2(bytes[0..2].*),
1639+
3 => utf8Decode3AllowSurrogateHalf(bytes[0..3].*),
1640+
4 => utf8Decode4(bytes[0..4].*),
16461641
else => unreachable,
16471642
};
16481643
}

lib/std/zig/Ast.zig

+1-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ pub fn parse(gpa: Allocator, source: [:0]const u8, mode: Mode) Allocator.Error!A
6969
const token = tokenizer.next();
7070
try tokens.append(gpa, .{
7171
.tag = token.tag,
72-
.start = @as(u32, @intCast(token.loc.start)),
72+
.start = @intCast(token.loc.start),
7373
});
7474
if (token.tag == .eof) break;
7575
}

lib/std/zig/AstGen.zig

+3-12
Original file line numberDiff line numberDiff line change
@@ -11351,6 +11351,9 @@ fn failWithStrLitError(astgen: *AstGen, err: std.zig.string_literal.Error, token
1135111351
.{raw_string[bad_index]},
1135211352
);
1135311353
},
11354+
.empty_char_literal => {
11355+
return astgen.failOff(token, offset, "empty character literal", .{});
11356+
},
1135411357
}
1135511358
}
1135611359

@@ -13820,21 +13823,9 @@ fn lowerAstErrors(astgen: *AstGen) !void {
1382013823
var msg: std.ArrayListUnmanaged(u8) = .{};
1382113824
defer msg.deinit(gpa);
1382213825

13823-
const token_starts = tree.tokens.items(.start);
13824-
const token_tags = tree.tokens.items(.tag);
13825-
1382613826
var notes: std.ArrayListUnmanaged(u32) = .{};
1382713827
defer notes.deinit(gpa);
1382813828

13829-
const tok = parse_err.token + @intFromBool(parse_err.token_is_prev);
13830-
if (token_tags[tok] == .invalid) {
13831-
const bad_off: u32 = @intCast(tree.tokenSlice(tok).len);
13832-
const byte_abs = token_starts[tok] + bad_off;
13833-
try notes.append(gpa, try astgen.errNoteTokOff(tok, bad_off, "invalid byte: '{'}'", .{
13834-
std.zig.fmtEscapes(tree.source[byte_abs..][0..1]),
13835-
}));
13836-
}
13837-
1383813829
for (tree.errors[1..]) |note| {
1383913830
if (!note.is_note) break;
1384013831

lib/std/zig/parser_test.zig

-1
Original file line numberDiff line numberDiff line change
@@ -6061,7 +6061,6 @@ test "recovery: invalid container members" {
60616061
, &[_]Error{
60626062
.expected_expr,
60636063
.expected_comma_after_field,
6064-
.expected_type_expr,
60656064
.expected_semi_after_stmt,
60666065
});
60676066
}

lib/std/zig/string_literal.zig

+19-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
const std = @import("../std.zig");
22
const assert = std.debug.assert;
3-
const utf8Decode = std.unicode.utf8Decode;
43
const utf8Encode = std.unicode.utf8Encode;
54

65
pub const ParseError = error{
@@ -37,12 +36,16 @@ pub const Error = union(enum) {
3736
expected_single_quote: usize,
3837
/// The character at this index cannot be represented without an escape sequence.
3938
invalid_character: usize,
39+
/// `''`. Not returned for string literals.
40+
empty_char_literal,
4041
};
4142

42-
/// Only validates escape sequence characters.
43-
/// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between.
43+
/// Asserts the slice starts and ends with single-quotes.
44+
/// Returns an error if there is not exactly one UTF-8 codepoint in between.
4445
pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
45-
assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\'');
46+
if (slice.len < 3) return .{ .failure = .empty_char_literal };
47+
assert(slice[0] == '\'');
48+
assert(slice[slice.len - 1] == '\'');
4649

4750
switch (slice[1]) {
4851
'\\' => {
@@ -55,7 +58,18 @@ pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
5558
},
5659
0 => return .{ .failure = .{ .invalid_character = 1 } },
5760
else => {
58-
const codepoint = utf8Decode(slice[1 .. slice.len - 1]) catch unreachable;
61+
const inner = slice[1 .. slice.len - 1];
62+
const n = std.unicode.utf8ByteSequenceLength(inner[0]) catch return .{
63+
.failure = .{ .invalid_unicode_codepoint = 1 },
64+
};
65+
if (inner.len > n) return .{ .failure = .{ .expected_single_quote = 1 + n } };
66+
const codepoint = switch (n) {
67+
1 => inner[0],
68+
2 => std.unicode.utf8Decode2(inner[0..2].*),
69+
3 => std.unicode.utf8Decode3(inner[0..3].*),
70+
4 => std.unicode.utf8Decode4(inner[0..4].*),
71+
else => unreachable,
72+
} catch return .{ .failure = .{ .invalid_unicode_codepoint = 1 } };
5973
return .{ .success = codepoint };
6074
},
6175
}

0 commit comments

Comments
 (0)