From 377e8579f9539c56fc5988c9a452a01438af89f3 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Wed, 31 Jul 2024 11:51:19 -0700 Subject: [PATCH 1/3] std.zig.tokenizer: simplify I pointed a fuzzer at the tokenizer and it crashed immediately. Upon inspection, I was dissatisfied with the implementation. This commit removes several mechanisms: * Removes the "invalid byte" compile error note. * Dramatically simplifies tokenizer recovery by making recovery always occur at newlines, and never otherwise. * Removes UTF-8 validation. * Moves some character validation logic to `std.zig.parseCharLiteral`. Removing UTF-8 validation is a regression of #663, however, the existing implementation was already buggy. When adding this functionality back, it must be fuzz-tested while checking the property that it matches an independent Unicode validation implementation on the same file. While we're at it, fuzzing should check the other properties of that proposal, such as no ASCII control characters existing inside the source code. Other changes included in this commit: * Deprecate `std.unicode.utf8Decode` and its WTF-8 counterpart. This function has an awkward API that is too easy to misuse. * Make `utf8Decode2` and friends use arrays as parameters, eliminating a runtime assertion in favor of using the type system. After this commit, the crash found by fuzzing, which was "\x07\xd5\x80\xc3=o\xda|a\xfc{\x9a\xec\x91\xdf\x0f\\\x1a^\xbe;\x8c\xbf\xee\xea" no longer causes a crash. However, I did not feel the need to add this test case because the simplified logic eradicates most crashes of this nature. --- lib/std/unicode.zig | 33 +- lib/std/zig/Ast.zig | 2 +- lib/std/zig/AstGen.zig | 15 +- lib/std/zig/parser_test.zig | 1 - lib/std/zig/string_literal.zig | 24 +- lib/std/zig/tokenizer.zig | 512 ++++++------------ src/Package/Manifest.zig | 3 + test/cases/compile_errors/empty_char_lit.zig | 9 + .../invalid_legacy_unicode_escape.zig | 3 +- .../compile_errors/invalid_unicode_escape.zig | 3 +- .../normal_string_with_newline.zig | 3 +- test/compile_errors.zig | 18 +- 12 files changed, 234 insertions(+), 392 deletions(-) create mode 100644 test/cases/compile_errors/empty_char_lit.zig diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index bab075114d50..a8fa1454a531 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -95,16 +95,13 @@ pub inline fn utf8EncodeComptime(comptime c: u21) [ const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error; -/// Decodes the UTF-8 codepoint encoded in the given slice of bytes. -/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable. -/// If you already know the length at comptime, you can call one of -/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function. +/// Deprecated. This function has an awkward API that is too easy to use incorrectly. pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 { return switch (bytes.len) { - 1 => @as(u21, bytes[0]), - 2 => utf8Decode2(bytes), - 3 => utf8Decode3(bytes), - 4 => utf8Decode4(bytes), + 1 => bytes[0], + 2 => utf8Decode2(bytes[0..2].*), + 3 => utf8Decode3(bytes[0..3].*), + 4 => utf8Decode4(bytes[0..4].*), else => unreachable, }; } @@ -113,8 +110,7 @@ const Utf8Decode2Error = error{ Utf8ExpectedContinuation, Utf8OverlongEncoding, }; -pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 { - assert(bytes.len == 2); +pub fn utf8Decode2(bytes: [2]u8) Utf8Decode2Error!u21 { assert(bytes[0] & 0b11100000 == 0b11000000); var value: u21 = bytes[0] & 0b00011111; @@ -130,7 +126,7 @@ pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 { const Utf8Decode3Error = Utf8Decode3AllowSurrogateHalfError || error{ Utf8EncodesSurrogateHalf, }; -pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 { +pub fn utf8Decode3(bytes: [3]u8) Utf8Decode3Error!u21 { const value = try utf8Decode3AllowSurrogateHalf(bytes); if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf; @@ -142,8 +138,7 @@ const Utf8Decode3AllowSurrogateHalfError = error{ Utf8ExpectedContinuation, Utf8OverlongEncoding, }; -pub fn utf8Decode3AllowSurrogateHalf(bytes: []const u8) Utf8Decode3AllowSurrogateHalfError!u21 { - assert(bytes.len == 3); +pub fn utf8Decode3AllowSurrogateHalf(bytes: [3]u8) Utf8Decode3AllowSurrogateHalfError!u21 { assert(bytes[0] & 0b11110000 == 0b11100000); var value: u21 = bytes[0] & 0b00001111; @@ -165,8 +160,7 @@ const Utf8Decode4Error = error{ Utf8OverlongEncoding, Utf8CodepointTooLarge, }; -pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 { - assert(bytes.len == 4); +pub fn utf8Decode4(bytes: [4]u8) Utf8Decode4Error!u21 { assert(bytes[0] & 0b11111000 == 0b11110000); var value: u21 = bytes[0] & 0b00000111; @@ -1637,12 +1631,13 @@ pub fn wtf8Encode(c: u21, out: []u8) error{CodepointTooLarge}!u3 { const Wtf8DecodeError = Utf8Decode2Error || Utf8Decode3AllowSurrogateHalfError || Utf8Decode4Error; +/// Deprecated. This function has an awkward API that is too easy to use incorrectly. pub fn wtf8Decode(bytes: []const u8) Wtf8DecodeError!u21 { return switch (bytes.len) { - 1 => @as(u21, bytes[0]), - 2 => utf8Decode2(bytes), - 3 => utf8Decode3AllowSurrogateHalf(bytes), - 4 => utf8Decode4(bytes), + 1 => bytes[0], + 2 => utf8Decode2(bytes[0..2].*), + 3 => utf8Decode3AllowSurrogateHalf(bytes[0..3].*), + 4 => utf8Decode4(bytes[0..4].*), else => unreachable, }; } diff --git a/lib/std/zig/Ast.zig b/lib/std/zig/Ast.zig index f55d78b6cedd..1f734cef63f0 100644 --- a/lib/std/zig/Ast.zig +++ b/lib/std/zig/Ast.zig @@ -69,7 +69,7 @@ pub fn parse(gpa: Allocator, source: [:0]const u8, mode: Mode) Allocator.Error!A const token = tokenizer.next(); try tokens.append(gpa, .{ .tag = token.tag, - .start = @as(u32, @intCast(token.loc.start)), + .start = @intCast(token.loc.start), }); if (token.tag == .eof) break; } diff --git a/lib/std/zig/AstGen.zig b/lib/std/zig/AstGen.zig index a6be743c2bc0..c24aa6d06325 100644 --- a/lib/std/zig/AstGen.zig +++ b/lib/std/zig/AstGen.zig @@ -11351,6 +11351,9 @@ fn failWithStrLitError(astgen: *AstGen, err: std.zig.string_literal.Error, token .{raw_string[bad_index]}, ); }, + .empty_char_literal => { + return astgen.failOff(token, offset, "empty character literal", .{}); + }, } } @@ -13820,21 +13823,9 @@ fn lowerAstErrors(astgen: *AstGen) !void { var msg: std.ArrayListUnmanaged(u8) = .{}; defer msg.deinit(gpa); - const token_starts = tree.tokens.items(.start); - const token_tags = tree.tokens.items(.tag); - var notes: std.ArrayListUnmanaged(u32) = .{}; defer notes.deinit(gpa); - const tok = parse_err.token + @intFromBool(parse_err.token_is_prev); - if (token_tags[tok] == .invalid) { - const bad_off: u32 = @intCast(tree.tokenSlice(tok).len); - const byte_abs = token_starts[tok] + bad_off; - try notes.append(gpa, try astgen.errNoteTokOff(tok, bad_off, "invalid byte: '{'}'", .{ - std.zig.fmtEscapes(tree.source[byte_abs..][0..1]), - })); - } - for (tree.errors[1..]) |note| { if (!note.is_note) break; diff --git a/lib/std/zig/parser_test.zig b/lib/std/zig/parser_test.zig index 5130ce403757..530aa924d08c 100644 --- a/lib/std/zig/parser_test.zig +++ b/lib/std/zig/parser_test.zig @@ -6061,7 +6061,6 @@ test "recovery: invalid container members" { , &[_]Error{ .expected_expr, .expected_comma_after_field, - .expected_type_expr, .expected_semi_after_stmt, }); } diff --git a/lib/std/zig/string_literal.zig b/lib/std/zig/string_literal.zig index c160d16b07bd..69178098379b 100644 --- a/lib/std/zig/string_literal.zig +++ b/lib/std/zig/string_literal.zig @@ -1,6 +1,5 @@ const std = @import("../std.zig"); const assert = std.debug.assert; -const utf8Decode = std.unicode.utf8Decode; const utf8Encode = std.unicode.utf8Encode; pub const ParseError = error{ @@ -37,12 +36,16 @@ pub const Error = union(enum) { expected_single_quote: usize, /// The character at this index cannot be represented without an escape sequence. invalid_character: usize, + /// `''`. Not returned for string literals. + empty_char_literal, }; -/// Only validates escape sequence characters. -/// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between. +/// Asserts the slice starts and ends with single-quotes. +/// Returns an error if there is not exactly one UTF-8 codepoint in between. pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral { - assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\''); + if (slice.len < 3) return .{ .failure = .empty_char_literal }; + assert(slice[0] == '\''); + assert(slice[slice.len - 1] == '\''); switch (slice[1]) { '\\' => { @@ -55,7 +58,18 @@ pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral { }, 0 => return .{ .failure = .{ .invalid_character = 1 } }, else => { - const codepoint = utf8Decode(slice[1 .. slice.len - 1]) catch unreachable; + const inner = slice[1 .. slice.len - 1]; + const n = std.unicode.utf8ByteSequenceLength(inner[0]) catch return .{ + .failure = .{ .invalid_unicode_codepoint = 1 }, + }; + if (inner.len > n) return .{ .failure = .{ .expected_single_quote = 1 + n } }; + const codepoint = switch (n) { + 1 => inner[0], + 2 => std.unicode.utf8Decode2(inner[0..2].*), + 3 => std.unicode.utf8Decode3(inner[0..3].*), + 4 => std.unicode.utf8Decode4(inner[0..4].*), + else => unreachable, + } catch return .{ .failure = .{ .invalid_unicode_codepoint = 1 } }; return .{ .success = codepoint }; }, } diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig index 36cbf9a856fb..32e11b1b9a6d 100644 --- a/lib/std/zig/tokenizer.zig +++ b/lib/std/zig/tokenizer.zig @@ -320,7 +320,7 @@ pub const Token = struct { pub fn symbol(tag: Tag) []const u8 { return tag.lexeme() orelse switch (tag) { - .invalid => "invalid bytes", + .invalid => "invalid token", .identifier => "an identifier", .string_literal, .multiline_string_literal_line => "a string literal", .char_literal => "a character literal", @@ -338,22 +338,22 @@ pub const Tokenizer = struct { buffer: [:0]const u8, index: usize, - /// For debugging purposes + /// For debugging purposes. pub fn dump(self: *Tokenizer, token: *const Token) void { std.debug.print("{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.loc.start..token.loc.end] }); } pub fn init(buffer: [:0]const u8) Tokenizer { - // Skip the UTF-8 BOM if present - const src_start: usize = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0; - return Tokenizer{ + // Skip the UTF-8 BOM if present. + return .{ .buffer = buffer, - .index = src_start, + .index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0, }; } const State = enum { start, + expect_newline, identifier, builtin, string_literal, @@ -361,10 +361,6 @@ pub const Tokenizer = struct { multiline_string_literal_line, char_literal, char_literal_backslash, - char_literal_hex_escape, - char_literal_unicode_escape_saw_u, - char_literal_unicode_escape, - char_literal_end, backslash, equal, bang, @@ -400,32 +396,38 @@ pub const Tokenizer = struct { period_2, period_asterisk, saw_at_sign, + invalid, }; + /// After this returns invalid, it will reset on the next newline, returning tokens starting from there. + /// An eof token will always be returned at the end. pub fn next(self: *Tokenizer) Token { var state: State = .start; - var result = Token{ - .tag = .eof, + var result: Token = .{ + .tag = undefined, .loc = .{ .start = self.index, .end = undefined, }, }; - var seen_escape_digits: usize = undefined; while (true) : (self.index += 1) { const c = self.buffer[self.index]; switch (state) { .start => switch (c) { 0 => { - if (self.index != self.buffer.len) { - result.tag = .invalid; - result.loc.end = self.index; - self.index += 1; - return result; - } - break; - }, - ' ', '\n', '\t', '\r' => { + if (self.index == self.buffer.len) return .{ + .tag = .eof, + .loc = .{ + .start = self.index, + .end = self.index, + }, + }; + state = .invalid; + }, + '\r' => { + state = .expect_newline; + }, + ' ', '\n', '\t' => { result.loc.start = self.index + 1; }, '"' => { @@ -434,6 +436,7 @@ pub const Tokenizer = struct { }, '\'' => { state = .char_literal; + result.tag = .char_literal; }, 'a'...'z', 'A'...'Z', '_' => { state = .identifier; @@ -545,14 +548,37 @@ pub const Tokenizer = struct { result.tag = .number_literal; }, else => { + state = .invalid; + }, + }, + + .expect_newline => switch (c) { + '\n' => { + result.loc.start = self.index + 1; + state = .start; + }, + else => { + state = .invalid; + }, + }, + + .invalid => switch (c) { + 0 => if (self.index == self.buffer.len) { + result.tag = .invalid; + break; + }, + '\n' => { result.tag = .invalid; - result.loc.end = self.index; - self.index += std.unicode.utf8ByteSequenceLength(c) catch 1; - return result; + break; }, + else => continue, }, .saw_at_sign => switch (c) { + 0, '\n' => { + result.tag = .invalid; + break; + }, '"' => { result.tag = .identifier; state = .string_literal; @@ -562,8 +588,7 @@ pub const Tokenizer = struct { result.tag = .builtin; }, else => { - result.tag = .invalid; - break; + state = .invalid; }, }, @@ -698,7 +723,7 @@ pub const Tokenizer = struct { }, .identifier => switch (c) { - 'a'...'z', 'A'...'Z', '_', '0'...'9' => {}, + 'a'...'z', 'A'...'Z', '_', '0'...'9' => continue, else => { if (Token.getKeyword(self.buffer[result.loc.start..self.index])) |tag| { result.tag = tag; @@ -707,26 +732,37 @@ pub const Tokenizer = struct { }, }, .builtin => switch (c) { - 'a'...'z', 'A'...'Z', '_', '0'...'9' => {}, + 'a'...'z', 'A'...'Z', '_', '0'...'9' => continue, else => break, }, .backslash => switch (c) { + 0 => { + result.tag = .invalid; + break; + }, '\\' => { state = .multiline_string_literal_line; }, - else => { + '\n' => { result.tag = .invalid; break; }, + else => { + state = .invalid; + }, }, .string_literal => switch (c) { - 0, '\n' => { - result.tag = .invalid; - result.loc.end = self.index; + 0 => { if (self.index != self.buffer.len) { - self.index += 1; + state = .invalid; + continue; } - return result; + result.tag = .invalid; + break; + }, + '\n' => { + result.tag = .invalid; + break; }, '\\' => { state = .string_literal_backslash; @@ -735,150 +771,74 @@ pub const Tokenizer = struct { self.index += 1; break; }, - else => { - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; + 0x01...0x09, 0x0b...0x1f, 0x7f => { + state = .invalid; }, + else => continue, }, .string_literal_backslash => switch (c) { 0, '\n' => { result.tag = .invalid; - result.loc.end = self.index; - if (self.index != self.buffer.len) { - self.index += 1; - } - return result; + break; }, else => { state = .string_literal; - - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; }, }, .char_literal => switch (c) { - 0, '\n', '\'' => { - result.tag = .invalid; - result.loc.end = self.index; + 0 => { if (self.index != self.buffer.len) { - self.index += 1; + state = .invalid; + continue; } - return result; + result.tag = .invalid; + break; + }, + '\n' => { + result.tag = .invalid; + break; }, '\\' => { state = .char_literal_backslash; }, - else => { - state = .char_literal_end; - - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; + '\'' => { + self.index += 1; + break; }, + 0x01...0x09, 0x0b...0x1f, 0x7f => { + state = .invalid; + }, + else => continue, }, .char_literal_backslash => switch (c) { - 0, '\n' => { - result.tag = .invalid; - result.loc.end = self.index; + 0 => { if (self.index != self.buffer.len) { - self.index += 1; - } - return result; - }, - 'x' => { - state = .char_literal_hex_escape; - seen_escape_digits = 0; - }, - 'u' => { - state = .char_literal_unicode_escape_saw_u; - }, - else => { - state = .char_literal_end; - - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; - }, - }, - - .char_literal_hex_escape => switch (c) { - '0'...'9', 'a'...'f', 'A'...'F' => { - seen_escape_digits += 1; - if (seen_escape_digits == 2) { - state = .char_literal_end; + state = .invalid; + continue; } - }, - else => { result.tag = .invalid; break; }, - }, - - .char_literal_unicode_escape_saw_u => switch (c) { - '{' => { - state = .char_literal_unicode_escape; - }, - else => { - result.tag = .invalid; - break; - }, - }, - - .char_literal_unicode_escape => switch (c) { - '0'...'9', 'a'...'f', 'A'...'F' => {}, - '}' => { - state = .char_literal_end; // too many/few digits handled later - }, - else => { + '\n' => { result.tag = .invalid; break; }, - }, - - .char_literal_end => switch (c) { - '\'' => { - result.tag = .char_literal; - self.index += 1; - break; + 0x01...0x09, 0x0b...0x1f, 0x7f => { + state = .invalid; }, else => { - result.tag = .invalid; - break; + state = .char_literal; }, }, .multiline_string_literal_line => switch (c) { 0 => { if (self.index != self.buffer.len) { - result.tag = .invalid; - result.loc.end = self.index; - self.index += 1; - return result; + state = .invalid; + continue; } break; }, @@ -886,17 +846,10 @@ pub const Tokenizer = struct { self.index += 1; break; }, - '\t' => {}, - else => { - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; + 0x01...0x08, 0x0b...0x1f, 0x7f => { + state = .invalid; }, + else => continue, }, .bang => switch (c) { @@ -1113,12 +1066,16 @@ pub const Tokenizer = struct { .line_comment_start => switch (c) { 0 => { if (self.index != self.buffer.len) { - result.tag = .invalid; - result.loc.end = self.index; - self.index += 1; - return result; + state = .invalid; + continue; } - break; + return .{ + .tag = .eof, + .loc = .{ + .start = self.index, + .end = self.index, + }, + }; }, '/' => { state = .doc_comment_start; @@ -1127,105 +1084,74 @@ pub const Tokenizer = struct { result.tag = .container_doc_comment; state = .doc_comment; }, + '\r' => { + state = .expect_newline; + }, '\n' => { state = .start; result.loc.start = self.index + 1; }, - '\t' => { - state = .line_comment; + 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { + state = .invalid; }, else => { state = .line_comment; - - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; }, }, .doc_comment_start => switch (c) { - '/' => { - state = .line_comment; - }, - 0 => { - if (self.index != self.buffer.len) { - result.tag = .invalid; - result.loc.end = self.index; - self.index += 1; - return result; - } + 0, '\n', '\r' => { result.tag = .doc_comment; break; }, - '\n' => { - result.tag = .doc_comment; - break; + '/' => { + state = .line_comment; }, - '\t' => { - state = .doc_comment; - result.tag = .doc_comment; + 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { + state = .invalid; }, else => { state = .doc_comment; result.tag = .doc_comment; - - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; }, }, .line_comment => switch (c) { 0 => { if (self.index != self.buffer.len) { - result.tag = .invalid; - result.loc.end = self.index; - self.index += 1; - return result; + state = .invalid; + continue; } - break; + return .{ + .tag = .eof, + .loc = .{ + .start = self.index, + .end = self.index, + }, + }; + }, + '\r' => { + state = .expect_newline; }, '\n' => { state = .start; result.loc.start = self.index + 1; }, - '\t' => {}, - else => { - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; + 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { + state = .invalid; }, + else => continue, }, .doc_comment => switch (c) { - 0, '\n' => break, - '\t' => {}, - else => { - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; + 0, '\n', '\r' => { + break; }, + 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { + state = .invalid; + }, + else => continue, }, .int => switch (c) { '.' => state = .int_period, - '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {}, + '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => continue, 'e', 'E', 'p', 'P' => state = .int_exponent, else => break, }, @@ -1249,7 +1175,7 @@ pub const Tokenizer = struct { }, }, .float => switch (c) { - '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {}, + '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => continue, 'e', 'E', 'p', 'P' => state = .float_exponent, else => break, }, @@ -1263,57 +1189,9 @@ pub const Tokenizer = struct { } } - if (result.tag == .eof) { - result.loc.start = self.index; - } - result.loc.end = self.index; return result; } - - fn invalidCharacterLength(self: *Tokenizer) ?u3 { - const c0 = self.buffer[self.index]; - if (std.ascii.isAscii(c0)) { - if (c0 == '\r') { - if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') { - // Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise - // they constitute an illegal byte! - return null; - } else { - return 1; - } - } else if (std.ascii.isControl(c0)) { - // ascii control codes are never allowed - // (note that \n was checked before we got here) - return 1; - } - // looks fine to me. - return null; - } else { - // check utf8-encoded character. - const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1; - if (self.index + length > self.buffer.len) { - return @as(u3, @intCast(self.buffer.len - self.index)); - } - const bytes = self.buffer[self.index .. self.index + length]; - switch (length) { - 2 => { - const value = std.unicode.utf8Decode2(bytes) catch return length; - if (value == 0x85) return length; // U+0085 (NEL) - }, - 3 => { - const value = std.unicode.utf8Decode3(bytes) catch return length; - if (value == 0x2028) return length; // U+2028 (LS) - if (value == 0x2029) return length; // U+2029 (PS) - }, - 4 => { - _ = std.unicode.utf8Decode4(bytes) catch return length; - }, - else => unreachable, - } - return null; - } - } }; test "keywords" { @@ -1355,7 +1233,7 @@ test "code point literal with hex escape" { , &.{.char_literal}); try testTokenize( \\'\x1' - , &.{ .invalid, .invalid }); + , &.{.char_literal}); } test "newline in char literal" { @@ -1396,40 +1274,30 @@ test "code point literal with unicode escapes" { // Invalid unicode escapes try testTokenize( \\'\u' - , &.{ .invalid, .invalid }); + , &.{.char_literal}); try testTokenize( \\'\u{{' - , &.{ .invalid, .l_brace, .invalid }); + , &.{.char_literal}); try testTokenize( \\'\u{}' , &.{.char_literal}); try testTokenize( \\'\u{s}' - , &.{ - .invalid, - .identifier, - .r_brace, - .invalid, - }); + , &.{.char_literal}); try testTokenize( \\'\u{2z}' - , &.{ - .invalid, - .identifier, - .r_brace, - .invalid, - }); + , &.{.char_literal}); try testTokenize( \\'\u{4a' - , &.{ .invalid, .invalid }); // 4a is valid + , &.{.char_literal}); // Test old-style unicode literals try testTokenize( \\'\u0333' - , &.{ .invalid, .number_literal, .invalid }); + , &.{.char_literal}); try testTokenize( \\'\U0333' - , &.{ .invalid, .number_literal, .invalid }); + , &.{.char_literal}); } test "code point literal with unicode code point" { @@ -1465,24 +1333,15 @@ test "invalid token characters" { try testTokenize("`", &.{.invalid}); try testTokenize("'c", &.{.invalid}); try testTokenize("'", &.{.invalid}); - try testTokenize("''", &.{.invalid}); + try testTokenize("''", &.{.char_literal}); try testTokenize("'\n'", &.{ .invalid, .invalid }); } test "invalid literal/comment characters" { - try testTokenize("\"\x00\"", &.{ - .invalid, - .invalid, // Incomplete string literal starting after invalid - }); - try testTokenize("//\x00", &.{ - .invalid, - }); - try testTokenize("//\x1f", &.{ - .invalid, - }); - try testTokenize("//\x7f", &.{ - .invalid, - }); + try testTokenize("\"\x00\"", &.{.invalid}); + try testTokenize("//\x00", &.{.invalid}); + try testTokenize("//\x1f", &.{.invalid}); + try testTokenize("//\x7f", &.{.invalid}); } test "utf8" { @@ -1491,46 +1350,24 @@ test "utf8" { } test "invalid utf8" { - try testTokenize("//\x80", &.{ - .invalid, - }); - try testTokenize("//\xbf", &.{ - .invalid, - }); - try testTokenize("//\xf8", &.{ - .invalid, - }); - try testTokenize("//\xff", &.{ - .invalid, - }); - try testTokenize("//\xc2\xc0", &.{ - .invalid, - }); - try testTokenize("//\xe0", &.{ - .invalid, - }); - try testTokenize("//\xf0", &.{ - .invalid, - }); - try testTokenize("//\xf0\x90\x80\xc0", &.{ - .invalid, - }); + try testTokenize("//\x80", &.{}); + try testTokenize("//\xbf", &.{}); + try testTokenize("//\xf8", &.{}); + try testTokenize("//\xff", &.{}); + try testTokenize("//\xc2\xc0", &.{}); + try testTokenize("//\xe0", &.{}); + try testTokenize("//\xf0", &.{}); + try testTokenize("//\xf0\x90\x80\xc0", &.{}); } test "illegal unicode codepoints" { // unicode newline characters.U+0085, U+2028, U+2029 try testTokenize("//\xc2\x84", &.{}); - try testTokenize("//\xc2\x85", &.{ - .invalid, - }); + try testTokenize("//\xc2\x85", &.{}); try testTokenize("//\xc2\x86", &.{}); try testTokenize("//\xe2\x80\xa7", &.{}); - try testTokenize("//\xe2\x80\xa8", &.{ - .invalid, - }); - try testTokenize("//\xe2\x80\xa9", &.{ - .invalid, - }); + try testTokenize("//\xe2\x80\xa8", &.{}); + try testTokenize("//\xe2\x80\xa9", &.{}); try testTokenize("//\xe2\x80\xaa", &.{}); } @@ -1892,8 +1729,8 @@ test "multi line string literal with only 1 backslash" { } test "invalid builtin identifiers" { - try testTokenize("@()", &.{ .invalid, .l_paren, .r_paren }); - try testTokenize("@0()", &.{ .invalid, .number_literal, .l_paren, .r_paren }); + try testTokenize("@()", &.{.invalid}); + try testTokenize("@0()", &.{.invalid}); } test "invalid token with unfinished escape right before eof" { @@ -1921,12 +1758,12 @@ test "saturating operators" { } test "null byte before eof" { - try testTokenize("123 \x00 456", &.{ .number_literal, .invalid, .number_literal }); + try testTokenize("123 \x00 456", &.{ .number_literal, .invalid }); try testTokenize("//\x00", &.{.invalid}); try testTokenize("\\\\\x00", &.{.invalid}); try testTokenize("\x00", &.{.invalid}); try testTokenize("// NUL\x00\n", &.{.invalid}); - try testTokenize("///\x00\n", &.{.invalid}); + try testTokenize("///\x00\n", &.{ .doc_comment, .invalid }); try testTokenize("/// NUL\x00\n", &.{ .doc_comment, .invalid }); } @@ -1936,6 +1773,9 @@ fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !v const token = tokenizer.next(); try std.testing.expectEqual(expected_token_tag, token.tag); } + // Last token should always be eof, even when the last token was invalid, + // in which case the tokenizer is in an invalid state, which can only be + // recovered by opinionated means outside the scope of this implementation. const last_token = tokenizer.next(); try std.testing.expectEqual(Token.Tag.eof, last_token.tag); try std.testing.expectEqual(source.len, last_token.loc.start); diff --git a/src/Package/Manifest.zig b/src/Package/Manifest.zig index d9c39f5ab3fe..eb82ef039da2 100644 --- a/src/Package/Manifest.zig +++ b/src/Package/Manifest.zig @@ -549,6 +549,9 @@ const Parse = struct { .{raw_string[bad_index]}, ); }, + .empty_char_literal => { + try p.appendErrorOff(token, offset, "empty character literal", .{}); + }, } } diff --git a/test/cases/compile_errors/empty_char_lit.zig b/test/cases/compile_errors/empty_char_lit.zig new file mode 100644 index 000000000000..99d80778b19b --- /dev/null +++ b/test/cases/compile_errors/empty_char_lit.zig @@ -0,0 +1,9 @@ +export fn entry() u8 { + return ''; +} + +// error +// backend=stage2 +// target=native +// +// :2:12: error: empty character literal diff --git a/test/cases/compile_errors/invalid_legacy_unicode_escape.zig b/test/cases/compile_errors/invalid_legacy_unicode_escape.zig index cc4e78f6e4ed..c22d70f3eba3 100644 --- a/test/cases/compile_errors/invalid_legacy_unicode_escape.zig +++ b/test/cases/compile_errors/invalid_legacy_unicode_escape.zig @@ -6,5 +6,4 @@ export fn entry() void { // backend=stage2 // target=native // -// :2:15: error: expected expression, found 'invalid bytes' -// :2:18: note: invalid byte: '1' +// :2:17: error: invalid escape character: 'U' diff --git a/test/cases/compile_errors/invalid_unicode_escape.zig b/test/cases/compile_errors/invalid_unicode_escape.zig index 1555f2be801a..956b4a37a2c7 100644 --- a/test/cases/compile_errors/invalid_unicode_escape.zig +++ b/test/cases/compile_errors/invalid_unicode_escape.zig @@ -6,6 +6,5 @@ export fn entry() void { // backend=stage2 // target=native // -// :2:15: error: expected expression, found 'invalid bytes' -// :2:21: note: invalid byte: 'z' +// :2:21: error: expected hex digit or '}', found 'z' diff --git a/test/cases/compile_errors/normal_string_with_newline.zig b/test/cases/compile_errors/normal_string_with_newline.zig index 19e15133ee34..f19ce59ec814 100644 --- a/test/cases/compile_errors/normal_string_with_newline.zig +++ b/test/cases/compile_errors/normal_string_with_newline.zig @@ -5,5 +5,4 @@ b"; // backend=stage2 // target=native // -// :1:13: error: expected expression, found 'invalid bytes' -// :1:15: note: invalid byte: '\n' +// :1:13: error: expected expression, found 'invalid token' diff --git a/test/compile_errors.zig b/test/compile_errors.zig index 5c5a574caf5b..c7a3be8f9fc6 100644 --- a/test/compile_errors.zig +++ b/test/compile_errors.zig @@ -42,8 +42,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { const case = ctx.obj("isolated carriage return in multiline string literal", b.graph.host); case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{ - ":1:13: error: expected expression, found 'invalid bytes'", - ":1:19: note: invalid byte: '\\r'", + ":1:13: error: expected expression, found 'invalid token'", }); } @@ -179,8 +178,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { \\ return true; \\} , &[_][]const u8{ - ":1:1: error: expected type expression, found 'invalid bytes'", - ":1:1: note: invalid byte: '\\xff'", + ":1:1: error: expected type expression, found 'invalid token'", }); } @@ -222,8 +220,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { const case = ctx.obj("invalid byte in string", b.graph.host); case.addError("_ = \"\x01Q\";", &[_][]const u8{ - ":1:5: error: expected expression, found 'invalid bytes'", - ":1:6: note: invalid byte: '\\x01'", + ":1:5: error: expected expression, found 'invalid token'", }); } @@ -231,8 +228,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { const case = ctx.obj("invalid byte in comment", b.graph.host); case.addError("//\x01Q", &[_][]const u8{ - ":1:1: error: expected type expression, found 'invalid bytes'", - ":1:3: note: invalid byte: '\\x01'", + ":1:1: error: expected type expression, found 'invalid token'", }); } @@ -240,8 +236,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { const case = ctx.obj("control character in character literal", b.graph.host); case.addError("const c = '\x01';", &[_][]const u8{ - ":1:11: error: expected expression, found 'invalid bytes'", - ":1:12: note: invalid byte: '\\x01'", + ":1:11: error: expected expression, found 'invalid token'", }); } @@ -249,8 +244,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { const case = ctx.obj("invalid byte at start of token", b.graph.host); case.addError("x = \x00Q", &[_][]const u8{ - ":1:5: error: expected expression, found 'invalid bytes'", - ":1:5: note: invalid byte: '\\x00'", + ":1:5: error: expected expression, found 'invalid token'", }); } } From a7029496d153bd2ba4e91ef561a399cad6d77307 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Wed, 31 Jul 2024 14:03:20 -0700 Subject: [PATCH 2/3] remove hard tabs from source code these are illegal according to the spec --- lib/std/crypto/ml_kem.zig | 6 +- lib/std/macho.zig | 9 ++- lib/std/zig/system/darwin/macos.zig | 92 ++++++++++++++--------------- lib/std/zig/system/linux.zig | 60 +++++++++---------- src/link/tapi/yaml/test.zig | 5 +- test/run_translated_c.zig | 12 ++-- test/translate_c.zig | 34 +++++------ 7 files changed, 107 insertions(+), 111 deletions(-) diff --git a/lib/std/crypto/ml_kem.zig b/lib/std/crypto/ml_kem.zig index c3cb5805eef5..23ea7bb8ec3e 100644 --- a/lib/std/crypto/ml_kem.zig +++ b/lib/std/crypto/ml_kem.zig @@ -677,10 +677,10 @@ fn montReduce(x: i32) i16 { // Note gcd(2¹⁶, q) = 1 as q is prime. Write q' := 62209 = q⁻¹ mod R. // First we compute // - // m := ((x mod R) q') mod R + // m := ((x mod R) q') mod R // = x q' mod R - // = int16(x q') - // = int16(int32(x) * int32(q')) + // = int16(x q') + // = int16(int32(x) * int32(q')) // // Note that x q' might be as big as 2³² and could overflow the int32 // multiplication in the last line. However for any int32s a and b, diff --git a/lib/std/macho.zig b/lib/std/macho.zig index d477befbc4cd..75aa91e5361d 100644 --- a/lib/std/macho.zig +++ b/lib/std/macho.zig @@ -203,8 +203,7 @@ pub const symtab_command = extern struct { /// local symbols (static and debugging symbols) - grouped by module /// defined external symbols - grouped by module (sorted by name if not lib) /// undefined external symbols (sorted by name if MH_BINDATLOAD is not set, -/// and in order the were seen by the static -/// linker if MH_BINDATLOAD is set) +/// and in order the were seen by the static linker if MH_BINDATLOAD is set) /// In this load command there are offsets and counts to each of the three groups /// of symbols. /// @@ -219,9 +218,9 @@ pub const symtab_command = extern struct { /// shared library. For executable and object modules, which are files /// containing only one module, the information that would be in these three /// tables is determined as follows: -/// table of contents - the defined external symbols are sorted by name -/// module table - the file contains only one module so everything in the -/// file is part of the module. +/// table of contents - the defined external symbols are sorted by name +/// module table - the file contains only one module so everything in the file +/// is part of the module. /// reference symbol table - is the defined and undefined external symbols /// /// For dynamically linked shared library files this load command also contains diff --git a/lib/std/zig/system/darwin/macos.zig b/lib/std/zig/system/darwin/macos.zig index bf26079e5ab0..0bc08b319cc4 100644 --- a/lib/std/zig/system/darwin/macos.zig +++ b/lib/std/zig/system/darwin/macos.zig @@ -303,16 +303,16 @@ test "detect" { \\ \\ \\ - \\ ProductBuildVersion - \\ 7W98 - \\ ProductCopyright - \\ Apple Computer, Inc. 1983-2004 - \\ ProductName - \\ Mac OS X - \\ ProductUserVisibleVersion - \\ 10.3.9 - \\ ProductVersion - \\ 10.3.9 + \\ ProductBuildVersion + \\ 7W98 + \\ ProductCopyright + \\ Apple Computer, Inc. 1983-2004 + \\ ProductName + \\ Mac OS X + \\ ProductUserVisibleVersion + \\ 10.3.9 + \\ ProductVersion + \\ 10.3.9 \\ \\ , @@ -323,18 +323,18 @@ test "detect" { \\ \\ \\ - \\ ProductBuildVersion - \\ 19G68 - \\ ProductCopyright - \\ 1983-2020 Apple Inc. - \\ ProductName - \\ Mac OS X - \\ ProductUserVisibleVersion - \\ 10.15.6 - \\ ProductVersion - \\ 10.15.6 - \\ iOSSupportVersion - \\ 13.6 + \\ ProductBuildVersion + \\ 19G68 + \\ ProductCopyright + \\ 1983-2020 Apple Inc. + \\ ProductName + \\ Mac OS X + \\ ProductUserVisibleVersion + \\ 10.15.6 + \\ ProductVersion + \\ 10.15.6 + \\ iOSSupportVersion + \\ 13.6 \\ \\ , @@ -345,18 +345,18 @@ test "detect" { \\ \\ \\ - \\ ProductBuildVersion - \\ 20A2408 - \\ ProductCopyright - \\ 1983-2020 Apple Inc. - \\ ProductName - \\ macOS - \\ ProductUserVisibleVersion - \\ 11.0 - \\ ProductVersion - \\ 11.0 - \\ iOSSupportVersion - \\ 14.2 + \\ ProductBuildVersion + \\ 20A2408 + \\ ProductCopyright + \\ 1983-2020 Apple Inc. + \\ ProductName + \\ macOS + \\ ProductUserVisibleVersion + \\ 11.0 + \\ ProductVersion + \\ 11.0 + \\ iOSSupportVersion + \\ 14.2 \\ \\ , @@ -367,18 +367,18 @@ test "detect" { \\ \\ \\ - \\ ProductBuildVersion - \\ 20C63 - \\ ProductCopyright - \\ 1983-2020 Apple Inc. - \\ ProductName - \\ macOS - \\ ProductUserVisibleVersion - \\ 11.1 - \\ ProductVersion - \\ 11.1 - \\ iOSSupportVersion - \\ 14.3 + \\ ProductBuildVersion + \\ 20C63 + \\ ProductCopyright + \\ 1983-2020 Apple Inc. + \\ ProductName + \\ macOS + \\ ProductUserVisibleVersion + \\ 11.1 + \\ ProductVersion + \\ 11.1 + \\ iOSSupportVersion + \\ 14.3 \\ \\ , diff --git a/lib/std/zig/system/linux.zig b/lib/std/zig/system/linux.zig index 5f58d6d926a5..7e37185f3e2b 100644 --- a/lib/std/zig/system/linux.zig +++ b/lib/std/zig/system/linux.zig @@ -109,12 +109,12 @@ const RiscvCpuinfoParser = CpuinfoParser(RiscvCpuinfoImpl); test "cpuinfo: RISC-V" { try testParser(RiscvCpuinfoParser, .riscv64, &Target.riscv.cpu.sifive_u74, - \\processor : 0 - \\hart : 1 - \\isa : rv64imafdc - \\mmu : sv39 - \\isa-ext : - \\uarch : sifive,u74-mc + \\processor : 0 + \\hart : 1 + \\isa : rv64imafdc + \\mmu : sv39 + \\isa-ext : + \\uarch : sifive,u74-mc ); } @@ -177,16 +177,16 @@ const PowerpcCpuinfoParser = CpuinfoParser(PowerpcCpuinfoImpl); test "cpuinfo: PowerPC" { try testParser(PowerpcCpuinfoParser, .powerpc, &Target.powerpc.cpu.@"970", - \\processor : 0 - \\cpu : PPC970MP, altivec supported - \\clock : 1250.000000MHz - \\revision : 1.1 (pvr 0044 0101) + \\processor : 0 + \\cpu : PPC970MP, altivec supported + \\clock : 1250.000000MHz + \\revision : 1.1 (pvr 0044 0101) ); try testParser(PowerpcCpuinfoParser, .powerpc64le, &Target.powerpc.cpu.pwr8, - \\processor : 0 - \\cpu : POWER8 (raw), altivec supported - \\clock : 2926.000000MHz - \\revision : 2.0 (pvr 004d 0200) + \\processor : 0 + \\cpu : POWER8 (raw), altivec supported + \\clock : 2926.000000MHz + \\revision : 2.0 (pvr 004d 0200) ); } @@ -304,25 +304,25 @@ test "cpuinfo: ARM" { \\CPU revision : 7 ); try testParser(ArmCpuinfoParser, .arm, &Target.arm.cpu.cortex_a7, - \\processor : 0 - \\model name : ARMv7 Processor rev 3 (v7l) - \\BogoMIPS : 18.00 - \\Features : half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 lpae - \\CPU implementer : 0x41 + \\processor : 0 + \\model name : ARMv7 Processor rev 3 (v7l) + \\BogoMIPS : 18.00 + \\Features : half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 lpae + \\CPU implementer : 0x41 \\CPU architecture: 7 - \\CPU variant : 0x0 - \\CPU part : 0xc07 - \\CPU revision : 3 + \\CPU variant : 0x0 + \\CPU part : 0xc07 + \\CPU revision : 3 \\ - \\processor : 4 - \\model name : ARMv7 Processor rev 3 (v7l) - \\BogoMIPS : 90.00 - \\Features : half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 lpae - \\CPU implementer : 0x41 + \\processor : 4 + \\model name : ARMv7 Processor rev 3 (v7l) + \\BogoMIPS : 90.00 + \\Features : half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 lpae + \\CPU implementer : 0x41 \\CPU architecture: 7 - \\CPU variant : 0x2 - \\CPU part : 0xc0f - \\CPU revision : 3 + \\CPU variant : 0x2 + \\CPU part : 0xc0f + \\CPU revision : 3 ); try testParser(ArmCpuinfoParser, .aarch64, &Target.aarch64.cpu.cortex_a72, \\processor : 0 diff --git a/src/link/tapi/yaml/test.zig b/src/link/tapi/yaml/test.zig index 8db943588587..d354a94b1dcc 100644 --- a/src/link/tapi/yaml/test.zig +++ b/src/link/tapi/yaml/test.zig @@ -237,10 +237,7 @@ test "double quoted string" { try testing.expectEqualStrings( \\"here" are some escaped quotes , arr[1]); - try testing.expectEqualStrings( - \\newlines and tabs - \\are supported - , arr[2]); + try testing.expectEqualStrings("newlines and tabs\nare\tsupported", arr[2]); try testing.expectEqualStrings( \\let's have \\some fun! diff --git a/test/run_translated_c.zig b/test/run_translated_c.zig index 744d08f27a1d..73aa7d01f434 100644 --- a/test/run_translated_c.zig +++ b/test/run_translated_c.zig @@ -26,17 +26,17 @@ pub fn addCases(cases: *tests.RunTranslatedCContext) void { \\void baz(void); \\struct foo { int x; }; \\void bar() { - \\ struct foo tmp; + \\ struct foo tmp; \\} \\ \\void baz() { - \\ struct foo tmp; + \\ struct foo tmp; \\} \\ \\int main(void) { - \\ bar(); - \\ baz(); - \\ return 0; + \\ bar(); + \\ baz(); + \\ return 0; \\} , ""); @@ -53,7 +53,7 @@ pub fn addCases(cases: *tests.RunTranslatedCContext) void { cases.add("parenthesized string literal", \\void foo(const char *s) {} \\int main(void) { - \\ foo(("bar")); + \\ foo(("bar")); \\} , ""); diff --git a/test/translate_c.zig b/test/translate_c.zig index c07b29f77248..2100e23ea3ee 100644 --- a/test/translate_c.zig +++ b/test/translate_c.zig @@ -133,20 +133,20 @@ pub fn addCases(cases: *tests.TranslateCContext) void { cases.add("scoped typedef", \\void foo() { - \\ typedef union { - \\ int A; - \\ int B; - \\ int C; - \\ } Foo; - \\ Foo a = {0}; - \\ { - \\ typedef union { - \\ int A; - \\ int B; - \\ int C; - \\ } Foo; - \\ Foo a = {0}; - \\ } + \\ typedef union { + \\ int A; + \\ int B; + \\ int C; + \\ } Foo; + \\ Foo a = {0}; + \\ { + \\ typedef union { + \\ int A; + \\ int B; + \\ int C; + \\ } Foo; + \\ Foo a = {0}; + \\ } \\} , &[_][]const u8{ \\pub export fn foo() void { @@ -2043,18 +2043,18 @@ pub fn addCases(cases: *tests.TranslateCContext) void { \\ break; \\ } \\ case 4: - \\ case 5: + \\ case 5: \\ res = 69; \\ { \\ res = 5; - \\ return; + \\ return; \\ } \\ case 6: \\ switch (res) { \\ case 9: break; \\ } \\ res = 1; - \\ return; + \\ return; \\ } \\} , &[_][]const u8{ From c2b8afcac9e427102370dc5bac8c3d9621eee6d8 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Wed, 31 Jul 2024 14:04:15 -0700 Subject: [PATCH 3/3] tokenizer: tabs and carriage returns spec conformance --- lib/std/zig/tokenizer.zig | 129 +++++++++++++++++++++++++++----------- test/compile_errors.zig | 8 --- 2 files changed, 94 insertions(+), 43 deletions(-) diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig index 32e11b1b9a6d..c375818770ab 100644 --- a/lib/std/zig/tokenizer.zig +++ b/lib/std/zig/tokenizer.zig @@ -424,10 +424,7 @@ pub const Tokenizer = struct { }; state = .invalid; }, - '\r' => { - state = .expect_newline; - }, - ' ', '\n', '\t' => { + ' ', '\n', '\t', '\r' => { result.loc.start = self.index + 1; }, '"' => { @@ -553,6 +550,13 @@ pub const Tokenizer = struct { }, .expect_newline => switch (c) { + 0 => { + if (self.index == self.buffer.len) { + result.tag = .invalid; + break; + } + state = .invalid; + }, '\n' => { result.loc.start = self.index + 1; state = .start; @@ -846,7 +850,15 @@ pub const Tokenizer = struct { self.index += 1; break; }, - 0x01...0x08, 0x0b...0x1f, 0x7f => { + '\r' => { + if (self.buffer[self.index + 1] == '\n') { + self.index += 2; + break; + } else { + state = .invalid; + } + }, + 0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { state = .invalid; }, else => continue, @@ -1091,7 +1103,7 @@ pub const Tokenizer = struct { state = .start; result.loc.start = self.index + 1; }, - 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { + 0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { state = .invalid; }, else => { @@ -1099,14 +1111,23 @@ pub const Tokenizer = struct { }, }, .doc_comment_start => switch (c) { - 0, '\n', '\r' => { + 0, '\n' => { result.tag = .doc_comment; break; }, + '\r' => { + if (self.buffer[self.index + 1] == '\n') { + self.index += 1; + result.tag = .doc_comment; + break; + } else { + state = .invalid; + } + }, '/' => { state = .line_comment; }, - 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { + 0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { state = .invalid; }, else => { @@ -1135,16 +1156,24 @@ pub const Tokenizer = struct { state = .start; result.loc.start = self.index + 1; }, - 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { + 0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { state = .invalid; }, else => continue, }, .doc_comment => switch (c) { - 0, '\n', '\r' => { + 0, '\n' => { break; }, - 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { + '\r' => { + if (self.buffer[self.index + 1] == '\n') { + self.index += 1; + break; + } else { + state = .invalid; + } + }, + 0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { state = .invalid; }, else => continue, @@ -1386,30 +1415,6 @@ test "string identifier and builtin fns" { }); } -test "multiline string literal with literal tab" { - try testTokenize( - \\\\foo bar - , &.{ - .multiline_string_literal_line, - }); -} - -test "comments with literal tab" { - try testTokenize( - \\//foo bar - \\//!foo bar - \\///foo bar - \\// foo - \\/// foo - \\/// /foo - , &.{ - .container_doc_comment, - .doc_comment, - .doc_comment, - .doc_comment, - }); -} - test "pipe and then invalid" { try testTokenize("||=", &.{ .pipe_pipe, @@ -1767,6 +1772,60 @@ test "null byte before eof" { try testTokenize("/// NUL\x00\n", &.{ .doc_comment, .invalid }); } +test "invalid tabs and carriage returns" { + // "Inside Line Comments and Documentation Comments, Any TAB is rejected by + // the grammar since it is ambiguous how it should be rendered." + // https://github.com/ziglang/zig-spec/issues/38 + try testTokenize("//\t", &.{.invalid}); + try testTokenize("// \t", &.{.invalid}); + try testTokenize("///\t", &.{.invalid}); + try testTokenize("/// \t", &.{.invalid}); + try testTokenize("//!\t", &.{.invalid}); + try testTokenize("//! \t", &.{.invalid}); + + // "Inside Line Comments and Documentation Comments, CR directly preceding + // NL is unambiguously part of the newline sequence. It is accepted by the + // grammar and removed by zig fmt, leaving only NL. CR anywhere else is + // rejected by the grammar." + // https://github.com/ziglang/zig-spec/issues/38 + try testTokenize("//\r", &.{.invalid}); + try testTokenize("// \r", &.{.invalid}); + try testTokenize("///\r", &.{.invalid}); + try testTokenize("/// \r", &.{.invalid}); + try testTokenize("//\r ", &.{.invalid}); + try testTokenize("// \r ", &.{.invalid}); + try testTokenize("///\r ", &.{.invalid}); + try testTokenize("/// \r ", &.{.invalid}); + try testTokenize("//\r\n", &.{}); + try testTokenize("// \r\n", &.{}); + try testTokenize("///\r\n", &.{.doc_comment}); + try testTokenize("/// \r\n", &.{.doc_comment}); + try testTokenize("//!\r", &.{.invalid}); + try testTokenize("//! \r", &.{.invalid}); + try testTokenize("//!\r ", &.{.invalid}); + try testTokenize("//! \r ", &.{.invalid}); + try testTokenize("//!\r\n", &.{.container_doc_comment}); + try testTokenize("//! \r\n", &.{.container_doc_comment}); + + // The control characters TAB and CR are rejected by the grammar inside multi-line string literals, + // except if CR is directly before NL. + // https://github.com/ziglang/zig-spec/issues/38 + try testTokenize("\\\\\r", &.{.invalid}); + try testTokenize("\\\\\r ", &.{.invalid}); + try testTokenize("\\\\ \r", &.{.invalid}); + try testTokenize("\\\\\t", &.{.invalid}); + try testTokenize("\\\\\t ", &.{.invalid}); + try testTokenize("\\\\ \t", &.{.invalid}); + try testTokenize("\\\\\r\n", &.{.multiline_string_literal_line}); + + // "TAB used as whitespace is...accepted by the grammar. CR used as + // whitespace, whether directly preceding NL or stray, is...accepted by the + // grammar." + // https://github.com/ziglang/zig-spec/issues/38 + try testTokenize("\tpub\tswitch\t", &.{ .keyword_pub, .keyword_switch }); + try testTokenize("\rpub\rswitch\r", &.{ .keyword_pub, .keyword_switch }); +} + fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void { var tokenizer = Tokenizer.init(source); for (expected_token_tags) |expected_token_tag| { diff --git a/test/compile_errors.zig b/test/compile_errors.zig index c7a3be8f9fc6..07ad178859db 100644 --- a/test/compile_errors.zig +++ b/test/compile_errors.zig @@ -38,14 +38,6 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { }); } - { - const case = ctx.obj("isolated carriage return in multiline string literal", b.graph.host); - - case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{ - ":1:13: error: expected expression, found 'invalid token'", - }); - } - { const case = ctx.obj("missing semicolon at EOF", b.graph.host); case.addError(