Skip to content

Commit db47cb1

Browse files
authored
Merge pull request #14962 from omochi/syntax-nul-trivia
[Syntax] support nul character as garbage text trivia in libSyntax
2 parents adfead0 + 190af6c commit db47cb1

File tree

5 files changed

+112
-42
lines changed

5 files changed

+112
-42
lines changed

include/swift/Parse/Lexer.h

+12
Original file line number | Diff line number | Diff line change
@@ -461,6 +461,16 @@ class Lexer {
461461
};
462462

463463
private:
464+
/// Classifies what a nul ('\0') character found in the buffer represents.
/// This is the result type of getNulCharacterKind().
465+
enum class NulCharacterKind {
466+
/// The nul character located at BufferEnd, i.e. the string buffer terminator.
467+
BufferEnd,
468+
/// A nul character embedded in the buffer: one that is neither at BufferEnd
/// nor at CodeCompletionPtr.
469+
Embedded,
470+
/// The nul character located at CodeCompletionPtr, i.e. the code completion
/// marker.
471+
CodeCompletion
472+
};
473+
464474
/// For a source location in the current buffer, returns the corresponding
465475
/// pointer.
466476
const char *getBufferPtrForSourceLoc(SourceLoc Loc) const {
@@ -520,6 +530,8 @@ class Lexer {
520530
/// Try to lex conflict markers by checking for the presence of the start and
521531
/// end of the marker in diff3 or Perforce style respectively.
522532
bool tryLexConflictMarker(bool EatNewline);
533+
534+
NulCharacterKind getNulCharacterKind(const char *Ptr) const;
523535
};
524536

525537
/// Given an ordered token \param Array , get the iterator pointing to the first

lib/Parse/Lexer.cpp

+67 -37
Original file line number | Diff line number | Diff line change
@@ -351,16 +351,19 @@ void Lexer::skipToEndOfLine(bool EatNewline) {
351351
}
352352
break; // Otherwise, eat other characters.
353353
case 0:
354-
// If this is a random nul character in the middle of a buffer, skip it as
355-
// whitespace.
356-
if (CurPtr-1 != BufferEnd) {
354+
switch (getNulCharacterKind(CurPtr - 1)) {
355+
case NulCharacterKind::Embedded:
356+
// If this is a random nul character in the middle of a buffer, skip it
357+
// as whitespace.
357358
diagnoseEmbeddedNul(Diags, CurPtr-1);
358-
break;
359+
LLVM_FALLTHROUGH;
360+
case NulCharacterKind::CodeCompletion:
361+
continue;
362+
case NulCharacterKind::BufferEnd:
363+
// Otherwise, the last line of the file does not have a newline.
364+
--CurPtr;
365+
return;
359366
}
360-
361-
// Otherwise, the last line of the file does not have a newline.
362-
--CurPtr;
363-
return;
364367
}
365368
}
366369
}
@@ -422,26 +425,30 @@ void Lexer::skipSlashStarComment() {
422425

423426
break; // Otherwise, eat other characters.
424427
case 0:
425-
// If this is a random nul character in the middle of a buffer, skip it as
426-
// whitespace.
427-
if (CurPtr-1 != BufferEnd) {
428-
diagnoseEmbeddedNul(Diags, CurPtr-1);
429-
break;
430-
}
431-
432-
// Otherwise, we have an unterminated /* comment.
433-
--CurPtr;
428+
switch (getNulCharacterKind(CurPtr - 1)) {
429+
case NulCharacterKind::Embedded:
430+
// If this is a random nul character in the middle of a buffer, skip it
431+
// as whitespace.
432+
diagnoseEmbeddedNul(Diags, CurPtr - 1);
433+
LLVM_FALLTHROUGH;
434+
case NulCharacterKind::CodeCompletion:
435+
continue;
436+
case NulCharacterKind::BufferEnd: {
437+
// Otherwise, we have an unterminated /* comment.
438+
--CurPtr;
434439

435-
// Count how many levels deep we are.
436-
llvm::SmallString<8> Terminator("*/");
437-
while (--Depth != 0)
438-
Terminator += "*/";
440+
// Count how many levels deep we are.
441+
llvm::SmallString<8> Terminator("*/");
442+
while (--Depth != 0)
443+
Terminator += "*/";
439444

440-
const char *EOL = (CurPtr[-1] == '\n') ? (CurPtr - 1) : CurPtr;
441-
diagnose(EOL, diag::lex_unterminated_block_comment)
442-
.fixItInsert(getSourceLoc(EOL), Terminator);
443-
diagnose(StartPtr, diag::lex_comment_start);
444-
return;
445+
const char *EOL = (CurPtr[-1] == '\n') ? (CurPtr - 1) : CurPtr;
446+
diagnose(EOL, diag::lex_unterminated_block_comment)
447+
.fixItInsert(getSourceLoc(EOL), Terminator);
448+
diagnose(StartPtr, diag::lex_comment_start);
449+
return;
450+
}
451+
}
445452
}
446453
}
447454
}
@@ -1857,6 +1864,16 @@ bool Lexer::tryLexConflictMarker(bool EatNewline) {
18571864
return false;
18581865
}
18591866

1867+
/// Classify the nul character that \p Ptr points at.
///
/// \param Ptr Pointer to the character to classify. It must be non-null and
/// must point at a nul ('\0') character; both conditions are asserted.
///
/// \returns NulCharacterKind::CodeCompletion if \p Ptr equals
/// CodeCompletionPtr, NulCharacterKind::BufferEnd if it equals BufferEnd, and
/// NulCharacterKind::Embedded otherwise.
Lexer::NulCharacterKind Lexer::getNulCharacterKind(const char *Ptr) const {
1868+
// Precondition: callers only ask about a character they have already seen
// to be nul.
assert(Ptr != nullptr && *Ptr == 0);
1869+
// The code completion check comes first, so it takes precedence should
// CodeCompletionPtr and BufferEnd ever be the same address.
if (Ptr == CodeCompletionPtr) {
1870+
return NulCharacterKind::CodeCompletion;
1871+
}
1872+
if (Ptr == BufferEnd) {
1873+
return NulCharacterKind::BufferEnd;
1874+
}
1875+
// Any other nul character is treated as embedded in the buffer contents.
return NulCharacterKind::Embedded;
1876+
}
18601877

18611878
void Lexer::tryLexEditorPlaceholder() {
18621879
assert(CurPtr[-1] == '<' && CurPtr[0] == '#');
@@ -2164,22 +2181,23 @@ void Lexer::lexImpl() {
21642181
return formToken(tok::unknown, TokStart);
21652182

21662183
case 0:
2167-
if (CurPtr-1 == CodeCompletionPtr)
2184+
switch (getNulCharacterKind(CurPtr - 1)) {
2185+
case NulCharacterKind::CodeCompletion:
21682186
return formToken(tok::code_complete, TokStart);
21692187

2170-
// If this is a random nul character in the middle of a buffer, skip it as
2171-
// whitespace.
2172-
if (CurPtr-1 != BufferEnd) {
2188+
case NulCharacterKind::Embedded:
2189+
// If this is a random nul character in the middle of a buffer, skip it as
2190+
// whitespace.
21732191
diagnoseEmbeddedNul(Diags, CurPtr-1);
21742192
goto Restart;
2193+
case NulCharacterKind::BufferEnd:
2194+
// Otherwise, this is the real end of the buffer. Put CurPtr back into
2195+
// buffer bounds.
2196+
--CurPtr;
2197+
// Return EOF.
2198+
return formToken(tok::eof, TokStart);
21752199
}
21762200

2177-
// Otherwise, this is the real end of the buffer. Put CurPtr back into
2178-
// buffer bounds.
2179-
--CurPtr;
2180-
// Return EOF.
2181-
return formToken(tok::eof, TokStart);
2182-
21832201
case '@': return formToken(tok::at_sign, TokStart);
21842202
case '{': return formToken(tok::l_brace, TokStart);
21852203
case '[': {
@@ -2323,7 +2341,6 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
23232341
Restart:
23242342
const char *TriviaStart = CurPtr;
23252343

2326-
// TODO: Handle random nul('\0') character in the middle of a buffer.
23272344
// TODO: Handle invalid UTF8 sequence which is skipped in lexImpl().
23282345
switch (*CurPtr++) {
23292346
case '\n':
@@ -2403,6 +2420,19 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
24032420
goto Restart;
24042421
}
24052422
break;
2423+
case 0:
2424+
switch (getNulCharacterKind(CurPtr - 1)) {
2425+
case NulCharacterKind::Embedded: {
2426+
diagnoseEmbeddedNul(Diags, CurPtr - 1);
2427+
size_t Length = CurPtr - TriviaStart;
2428+
Pieces.push_back(TriviaPiece::garbageText({TriviaStart, Length}));
2429+
goto Restart;
2430+
}
2431+
case NulCharacterKind::CodeCompletion:
2432+
case NulCharacterKind::BufferEnd:
2433+
break;
2434+
}
2435+
break;
24062436
default:
24072437
break;
24082438
}

test/Syntax/lexer_invalid_nul.swift

-5
This file was deleted.

test/Syntax/round_trip_nul.swift

+5
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
// RUN: cat %s | tr '\132' '\0' > %t.tr
2+
// RUN: cp -f %t.tr %t
3+
// RUN: %round-trip-syntax-test --swift-syntax-test %swift-syntax-test --file %t
4+
let a = Z3Z // nul(Z)
5+
func b() {}

test/Syntax/tokens_nul.swift

+28
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,28 @@
1+
// RUN: cat %s | tr '\132' '\0' > %t.tmp
2+
// RUN: cp -f %t.tmp %t
3+
// RUN: %swift-syntax-test -input-source-filename %t -dump-full-tokens 2>&1 | %FileCheck %t
4+
let a = Z3Z // nul(Z)
5+
func b() {}
6+
7+
// CHECK: 4:9: warning: nul character embedded in middle of file
8+
// CHECK: 4:11: warning: nul character embedded in middle of file
9+
// CHECK: 4:20: warning: nul character embedded in middle of file
10+
11+
// CHECK-LABEL: 4:7
12+
// CHECK-NEXT:(Token equal
13+
// CHECK-NEXT: (text="=")
14+
// CHECK-NEXT: (trivia space 1)
15+
// CHECK-NEXT: (trivia garbage_text \000))
16+
17+
// CHECK-LABEL: 4:10
18+
// CHECK-NEXT:(Token integer_literal
19+
// CHECK-NEXT: (text="3")
20+
// CHECK-NEXT: (trivia garbage_text \000)
21+
// CHECK-NEXT: (trivia space 1))
22+
23+
// CHECK-LABEL: 5:1
24+
// CHECK-NEXT:(Token kw_func
25+
// CHECK-NEXT: (trivia line_comment // nul(\000))
26+
// CHECK-NEXT: (trivia newline 1)
27+
// CHECK-NEXT: (text="func")
28+
// CHECK-NEXT: (trivia space 1))

0 commit comments

Comments
 (0)