
Commit e676305

Only accept carriage returns before line feeds
The previous commit was much less strict about this; this more closely matches the desired spec of only allowing CR characters as part of a CRLF pair, but not otherwise.
1 parent 23c4061 commit e676305
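
For illustration, the rule this commit enforces can be stated as: a '\r' byte in the source is acceptable only when the very next byte is '\n', i.e. it is the first half of a CRLF line ending. Below is a minimal standalone sketch of that check (a hypothetical helper, not code from this commit):

const std = @import("std");

// Hypothetical helper, not part of this commit: returns true only when the
// '\r' at `index` is immediately followed by '\n', i.e. it belongs to a CRLF
// pair; a lone CR (including one at the end of the buffer) is not accepted.
fn isCrOfCrlfPair(buffer: []const u8, index: usize) bool {
    return index + 1 < buffer.len and buffer[index + 1] == '\n';
}

test "carriage return is only valid immediately before a line feed" {
    try std.testing.expect(isCrOfCrlfPair("a\r\nb", 1));
    try std.testing.expect(!isCrOfCrlfPair("a\rb", 1));
    try std.testing.expect(!isCrOfCrlfPair("a\r", 1));
}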

2 files changed: +34 −30 lines changed

lib/std/zig/tokenizer.zig (+27 −7)
@@ -442,7 +442,7 @@ pub const Tokenizer = struct {
             switch (state) {
                 .start => switch (c) {
                     0 => break,
-                    ' ', '\n', '\t', '\r' => {
+                    ' ', '\n', '\t' => {
                         result.loc.start = self.index + 1;
                     },
                     '"' => {
@@ -565,6 +565,18 @@ pub const Tokenizer = struct {
                         state = .int_literal_dec;
                         result.tag = .integer_literal;
                     },
+                    '\r' => {
+                        // Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise
+                        // they constitute an illegal byte!
+                        if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') {
+                            result.loc.start = self.index + 1;
+                        } else {
+                            result.tag = .invalid;
+                            result.loc.end = self.index;
+                            self.index += 1;
+                            return result;
+                        }
+                    },
                     else => {
                         result.tag = .invalid;
                         result.loc.end = self.index;
@@ -903,7 +915,7 @@ pub const Tokenizer = struct {
                         self.index += 1;
                         break;
                     },
-                    '\t', '\r' => {},
+                    '\t' => {},
                     else => self.checkLiteralCharacter(),
                 },

@@ -1137,7 +1149,7 @@ pub const Tokenizer = struct {
                         state = .start;
                         result.loc.start = self.index + 1;
                     },
-                    '\t', '\r' => state = .line_comment,
+                    '\t' => state = .line_comment,
                     else => {
                         state = .line_comment;
                         self.checkLiteralCharacter();
@@ -1151,7 +1163,7 @@ pub const Tokenizer = struct {
                         result.tag = .doc_comment;
                         break;
                     },
-                    '\t', '\r' => {
+                    '\t' => {
                         state = .doc_comment;
                         result.tag = .doc_comment;
                     },
@@ -1167,12 +1179,12 @@ pub const Tokenizer = struct {
                         state = .start;
                         result.loc.start = self.index + 1;
                     },
-                    '\t', '\r' => {},
+                    '\t' => {},
                     else => self.checkLiteralCharacter(),
                 },
                 .doc_comment => switch (c) {
                     0, '\n' => break,
-                    '\t', '\r' => {},
+                    '\t' => {},
                     else => self.checkLiteralCharacter(),
                 },
                 .zero => switch (c) {
@@ -1433,7 +1445,15 @@ pub const Tokenizer = struct {
     fn getInvalidCharacterLength(self: *Tokenizer) u3 {
         const c0 = self.buffer[self.index];
         if (std.ascii.isASCII(c0)) {
-            if (std.ascii.isCntrl(c0)) {
+            if (c0 == '\r') {
+                if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') {
+                    // Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise
+                    // they constitute an illegal byte!
+                    return 0;
+                } else {
+                    return 1;
+                }
+            } else if (std.ascii.isCntrl(c0)) {
                 // ascii control codes are never allowed
                 // (note that \n was checked before we got here)
                 return 1;
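
As a hedged illustration of the tokenizer-level effect (assuming the std.zig.Tokenizer API of this period: init over a sentinel-terminated buffer and next() returning tokens with a tag field), a CRLF line ending still tokenizes cleanly while a lone CR should now surface as an .invalid token:

const std = @import("std");

test "lone carriage return is rejected, CRLF is accepted" {
    // CRLF line ending: no invalid tokens expected.
    var ok = std.zig.Tokenizer.init("const a = 1;\r\n");
    while (true) {
        const token = ok.next();
        if (token.tag == .eof) break;
        try std.testing.expect(token.tag != .invalid);
    }

    // Lone CR between tokens: an .invalid token is expected somewhere in the stream.
    var bad = std.zig.Tokenizer.init("const a = 1;\rconst b = 2;");
    var saw_invalid = false;
    while (true) {
        const token = bad.next();
        if (token.tag == .eof) break;
        if (token.tag == .invalid) saw_invalid = true;
    }
    try std.testing.expect(saw_invalid);
}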

src/AstGen.zig (+7 −23)
@@ -9961,35 +9961,19 @@ fn strLitNodeAsString(astgen: *AstGen, node: Ast.Node.Index) !IndexSlice {
     var tok_i = start;
     {
         const slice = tree.tokenSlice(tok_i);
-        const line_bytes = slice[2 .. slice.len - 1];
-        const carriage_return_count = mem.count(u8, line_bytes, "\r");
-        if (carriage_return_count > 0) {
-            try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len - carriage_return_count);
-            for (line_bytes) |line_byte| {
-                if (line_byte == '\r') continue;
-                string_bytes.appendAssumeCapacity(line_byte);
-            }
-        } else {
-            try string_bytes.appendSlice(gpa, line_bytes);
-        }
+        const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
+        const line_bytes = slice[2 .. slice.len - carriage_return_ending];
+        try string_bytes.appendSlice(gpa, line_bytes);
         tok_i += 1;
     }
     // Following lines: each line prepends a newline.
     while (tok_i <= end) : (tok_i += 1) {
         const slice = tree.tokenSlice(tok_i);
-        const line_bytes = slice[2 .. slice.len - 1];
-
-        const carriage_return_count = mem.count(u8, line_bytes, "\r");
-        try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len - carriage_return_count + 1);
+        const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
+        const line_bytes = slice[2 .. slice.len - carriage_return_ending];
+        try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len + 1);
         string_bytes.appendAssumeCapacity('\n');
-        if (carriage_return_count > 0) {
-            for (line_bytes) |line_byte| {
-                if (line_byte == '\r') continue;
-                string_bytes.appendAssumeCapacity(line_byte);
-            }
-        } else {
-            string_bytes.appendSliceAssumeCapacity(line_bytes);
-        }
+        string_bytes.appendSliceAssumeCapacity(line_bytes);
     }
     const len = string_bytes.items.len - str_index;
     try string_bytes.append(gpa, 0);
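
With the tokenizer now guaranteeing that any CR is immediately followed by LF, the AstGen side only has to deal with the line terminator of each multiline-string line token: the token slice ends in either "\n" or "\r\n", so checking the second-to-last byte is sufficient. A minimal standalone sketch of that trimming (a hypothetical helper, not code from this commit):

const std = @import("std");

// Hypothetical helper, not part of this commit: given one multiline-string
// line token (from its leading backslashes through its newline), return just
// the contents, dropping the leading "\\" and the "\n" or "\r\n" terminator.
fn multilineStringLineContents(slice: []const u8) []const u8 {
    const ending_len: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
    return slice[2 .. slice.len - ending_len];
}

test "line trimming handles both LF and CRLF endings" {
    try std.testing.expectEqualStrings("hello", multilineStringLineContents("\\\\hello\n"));
    try std.testing.expectEqualStrings("hello", multilineStringLineContents("\\\\hello\r\n"));
}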
