Skip to content

Commit c8c2b46

Browse files
committed
[Demangle][Rust] Parse non-ASCII identifiers
Rust allows use of non-ASCII identifiers, which in the Rust mangling scheme are encoded using Punycode. The encoding deviates from the standard by using an underscore as the separator between the ASCII part and a base-36 encoding of non-ASCII characters (avoiding hyphen-minus in the symbol name). Other than that, the encoding follows the standard, and the decoder implemented here in turn follows the one given in RFC 3492. To avoid an extra intermediate memory allocation while decoding Punycode, the interface of OutputStream is extended with an insert method. Reviewed By: dblaikie Differential Revision: https://reviews.llvm.org/D104366
1 parent df672f6 commit c8c2b46

File tree

6 files changed

+297
-4
lines changed

6 files changed

+297
-4
lines changed

libcxxabi/src/demangle/Utility.h

+10
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,16 @@ class OutputStream {
126126
return this->operator<<(static_cast<unsigned long long>(N));
127127
}
128128

129+
// Inserts the N bytes starting at S into the stream at offset Pos, shifting
// the bytes currently in [Pos, CurrentPosition) up by N. Pos must not exceed
// CurrentPosition (asserted). A zero-length insert leaves the stream untouched.
void insert(size_t Pos, const char *S, size_t N) {
  assert(Pos <= CurrentPosition);
  if (N != 0) {
    grow(N);
    // Form the pointer only after grow(N), in case it relocates Buffer.
    char *const Gap = Buffer + Pos;
    // memmove, not memcpy: the source and destination ranges may overlap.
    std::memmove(Gap + N, Gap, CurrentPosition - Pos);
    std::memcpy(Gap, S, N);
    CurrentPosition += N;
  }
}
138+
129139
// Returns the stream's current position (the value of CurrentPosition).
size_t getCurrentPosition() const { return CurrentPosition; }
130140
// Sets the stream's current position to NewPos. No bounds check is made here.
void setCurrentPosition(size_t NewPos) { CurrentPosition = NewPos; }
131141

llvm/include/llvm/Demangle/Utility.h

+10
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,16 @@ class OutputStream {
126126
return this->operator<<(static_cast<unsigned long long>(N));
127127
}
128128

129+
// Inserts the N bytes starting at S into the stream at offset Pos, shifting
// the bytes currently in [Pos, CurrentPosition) up by N. Pos must not exceed
// CurrentPosition (asserted). A zero-length insert leaves the stream untouched.
void insert(size_t Pos, const char *S, size_t N) {
  assert(Pos <= CurrentPosition);
  if (N != 0) {
    grow(N);
    // Form the pointer only after grow(N), in case it relocates Buffer.
    char *const Gap = Buffer + Pos;
    // memmove, not memcpy: the source and destination ranges may overlap.
    std::memmove(Gap + N, Gap, CurrentPosition - Pos);
    std::memcpy(Gap, S, N);
    CurrentPosition += N;
  }
}
138+
129139
// Returns the stream's current position (the value of CurrentPosition).
size_t getCurrentPosition() const { return CurrentPosition; }
130140
// Sets the stream's current position to NewPos. No bounds check is made here.
void setCurrentPosition(size_t NewPos) { CurrentPosition = NewPos; }
131141

llvm/lib/Demangle/RustDemangle.cpp

+172-4
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ class Demangler {
135135
void printDecimalNumber(uint64_t N);
136136
void printBasicType(BasicType);
137137
void printLifetime(uint64_t Index);
138+
void printIdentifier(Identifier Ident);
138139

139140
char look() const;
140141
char consume();
@@ -283,8 +284,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
283284
switch (consume()) {
284285
case 'C': {
285286
parseOptionalBase62Number('s');
286-
Identifier Ident = parseIdentifier();
287-
print(Ident.Name);
287+
printIdentifier(parseIdentifier());
288288
break;
289289
}
290290
case 'M': {
@@ -333,7 +333,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
333333
print(NS);
334334
if (!Ident.empty()) {
335335
print(":");
336-
print(Ident.Name);
336+
printIdentifier(Ident);
337337
}
338338
print('#');
339339
printDecimalNumber(Disambiguator);
@@ -342,7 +342,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
342342
// Implementation internal namespaces.
343343
if (!Ident.empty()) {
344344
print("::");
345-
print(Ident.Name);
345+
printIdentifier(Ident);
346346
}
347347
}
348348
break;
@@ -669,6 +669,8 @@ void Demangler::demangleFnSig() {
669669
print("C");
670670
} else {
671671
Identifier Ident = parseIdentifier();
672+
if (Ident.Punycode)
673+
Error = true;
672674
for (char C : Ident.Name) {
673675
// When mangling ABI string, the "-" is replaced with "_".
674676
if (C == '_')
@@ -1078,6 +1080,172 @@ void Demangler::printLifetime(uint64_t Index) {
10781080
}
10791081
}
10801082

1083+
static inline bool decodePunycodeDigit(char C, size_t &Value) {
1084+
if (isLower(C)) {
1085+
Value = C - 'a';
1086+
return true;
1087+
}
1088+
1089+
if (isDigit(C)) {
1090+
Value = 26 + (C - '0');
1091+
return true;
1092+
}
1093+
1094+
return false;
1095+
}
1096+
1097+
// Drops every '\0' byte from the portion of Output that lies between StartIdx
// and the current position, compacting the remaining bytes in place, and then
// moves the current position back to the end of the compacted data.
static void removeNullBytes(OutputStream &Output, size_t StartIdx) {
  char *const Base = Output.getBuffer();
  char *const CompactedEnd =
      std::remove(Base + StartIdx, Base + Output.getCurrentPosition(), '\0');
  Output.setCurrentPosition(CompactedEnd - Base);
}
1103+
1104+
// Writes the UTF-8 encoding of CodePoint into Output, using between one and
// four bytes. Bytes of Output past the encoded length are left untouched.
// Returns false, writing nothing, when CodePoint is not a valid Unicode scalar
// value (a surrogate in 0xD800..0xDFFF, or anything above 0x10FFFF).
static inline bool encodeUTF8(size_t CodePoint, char *Output) {
  const bool IsSurrogate = CodePoint >= 0xD800 && CodePoint <= 0xDFFF;
  if (IsSurrogate || CodePoint > 0x10FFFF)
    return false;

  // Builds a continuation byte (10xxxxxx) from the low six bits of Bits.
  auto Continuation = [](size_t Bits) {
    return static_cast<char>(0x80 | (Bits & 0x3F));
  };

  if (CodePoint < 0x80) {
    Output[0] = static_cast<char>(CodePoint);
  } else if (CodePoint < 0x800) {
    Output[0] = static_cast<char>(0xC0 | (CodePoint >> 6));
    Output[1] = Continuation(CodePoint);
  } else if (CodePoint < 0x10000) {
    Output[0] = static_cast<char>(0xE0 | (CodePoint >> 12));
    Output[1] = Continuation(CodePoint >> 6);
    Output[2] = Continuation(CodePoint);
  } else {
    Output[0] = static_cast<char>(0xF0 | (CodePoint >> 18));
    Output[1] = Continuation(CodePoint >> 12);
    Output[2] = Continuation(CodePoint >> 6);
    Output[3] = Continuation(CodePoint);
  }
  return true;
}
1138+
1139+
// Decodes a Punycode-encoded identifier (using Rust's '_' delimiter variant)
// and appends the result, encoded as UTF-8, to Output.
1139+
// Returns true if decoding was successful. On failure false is returned and
// Output may already contain partially decoded (zero-padded) data, since this
// function does not restore the stream position itself.
1140+
static bool decodePunycode(StringView Input, OutputStream &Output) {
1141+
// Position in Output where the decoded identifier starts.
size_t OutputSize = Output.getCurrentPosition();
1142+
size_t InputIdx = 0;
1143+
1144+
// Rust uses an underscore as a delimiter.
// The scan keeps the position of the LAST underscore in the input.
1145+
size_t DelimiterPos = StringView::npos;
1146+
for (size_t I = 0; I != Input.size(); ++I)
1147+
if (Input[I] == '_')
1148+
DelimiterPos = I;
1149+
1150+
if (DelimiterPos != StringView::npos) {
1151+
// Copy basic code points before the last delimiter to the output.
1152+
for (; InputIdx != DelimiterPos; ++InputIdx) {
1153+
char C = Input[InputIdx];
1154+
if (!isValid(C))
1155+
return false;
1156+
// Code points are padded with zeros while decoding is in progress.
// Every code point therefore occupies exactly 4 bytes in Output until the
// padding is stripped by removeNullBytes at the end.
1157+
char UTF8[4] = {C};
1158+
Output += StringView(UTF8, UTF8 + 4);
1159+
}
1160+
// Skip over the delimiter.
1161+
++InputIdx;
1162+
}
1163+
1164+
// Decoder parameters and initial state (Bias, N).
size_t Base = 36;
1165+
size_t Skew = 38;
1166+
size_t Bias = 72;
1167+
size_t N = 0x80;
1168+
size_t TMin = 1;
1169+
size_t TMax = 26;
1170+
size_t Damp = 700;
1171+
1172+
// Bias adaptation. Damp is captured by reference: it is 700 for the first
// call only and 2 for every later call.
auto Adapt = [&](size_t Delta, size_t NumPoints) {
1173+
Delta /= Damp;
1174+
Delta += Delta / NumPoints;
1175+
Damp = 2;
1176+
1177+
size_t K = 0;
1178+
while (Delta > (Base - TMin) * TMax / 2) {
1179+
Delta /= Base - TMin;
1180+
K += Base;
1181+
}
1182+
return K + (((Base - TMin + 1) * Delta) / (Delta + Skew));
1183+
};
1184+
1185+
// Main decoding loop.
// Each iteration decodes one variable-length integer (a delta) and inserts
// one code point; I is advanced by one after every insertion.
1186+
for (size_t I = 0; InputIdx != Input.size(); I += 1) {
1187+
size_t OldI = I;
1188+
size_t W = 1;
1189+
size_t Max = std::numeric_limits<size_t>::max();
1190+
for (size_t K = Base; true; K += Base) {
1191+
// Input ended in the middle of a variable-length integer.
if (InputIdx == Input.size())
1192+
return false;
1193+
char C = Input[InputIdx++];
1194+
size_t Digit = 0;
1195+
if (!decodePunycodeDigit(C, Digit))
1196+
return false;
1197+
1198+
// Reject input for which I + Digit * W would overflow size_t.
if (Digit > (Max - I) / W)
1199+
return false;
1200+
I += Digit * W;
1201+
1202+
// Threshold for this digit position: K - Bias clamped to [TMin, TMax].
size_t T;
1203+
if (K <= Bias)
1204+
T = TMin;
1205+
else if (K >= Bias + TMax)
1206+
T = TMax;
1207+
else
1208+
T = K - Bias;
1209+
1210+
// A digit below the threshold terminates the current integer.
if (Digit < T)
1211+
break;
1212+
1213+
// Reject input for which W * (Base - T) would overflow size_t.
if (W > Max / (Base - T))
1214+
return false;
1215+
W *= (Base - T);
1216+
}
1217+
// Number of code points decoded so far (4 bytes each) plus the one about to
// be inserted.
size_t NumPoints = (Output.getCurrentPosition() - OutputSize) / 4 + 1;
1218+
Bias = Adapt(I - OldI, NumPoints);
1219+
1220+
// Reject input for which N + I / NumPoints would overflow size_t.
if (I / NumPoints > Max - N)
1221+
return false;
1222+
N += I / NumPoints;
1223+
I = I % NumPoints;
1224+
1225+
// Insert N at position I in the output.
// The code point is stored zero-padded to 4 bytes, so that position I
// corresponds to byte offset I * 4 from the start of the identifier.
1226+
char UTF8[4] = {};
1227+
if (!encodeUTF8(N, UTF8))
1228+
return false;
1229+
Output.insert(OutputSize + I * 4, UTF8, 4);
1230+
}
1231+
1232+
// Strip the zero padding, leaving plain UTF-8 in the output.
removeNullBytes(Output, OutputSize);
1233+
return true;
1234+
}
1236+
1237+
// Prints Ident to the output, decoding it from Punycode first when it is
// flagged as Punycode. Does nothing when an error has already been recorded
// or printing is disabled. A Punycode decoding failure sets Error.
void Demangler::printIdentifier(Identifier Ident) {
  if (Error || !Print)
    return;

  if (!Ident.Punycode) {
    print(Ident.Name);
    return;
  }

  const bool Decoded = decodePunycode(Ident.Name, Output);
  if (!Decoded)
    Error = true;
}
1248+
10811249
char Demangler::look() const {
10821250
if (Error || Position >= Input.size())
10831251
return 0;

llvm/test/Demangle/rust.test

+43
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,11 @@ CHECK: function::<extern "cdecl" fn()>
237237
CHECK: function::<unsafe extern "C-cmse-nonsecure-call" fn()>
238238
_RIC8functionFUK21C_cmse_nonsecure_callEuE
239239

240+
; Invalid ABI with punycode.
241+
242+
CHECK: _RIC8functionFKu3n3hEuE
243+
_RIC8functionFKu3n3hEuE
244+
240245
; Trait objects
241246

242247
CHECK: trait::<dyn >
@@ -456,6 +461,44 @@ CHECK: dot (.llvm.1234)
456461
CHECK: dot (.llvm.6789)
457462
_RC3dotC5crate.llvm.6789
458463

464+
; Punycode
465+
466+
CHECK: punycode::東京
467+
_RNvC8punycodeu7_1lqs71d
468+
469+
CHECK: punycode::zażółć_gęślą_jaźń
470+
_RNvC8punycodeu29za_gl_ja_w3a7psa2tqtgb10airva
471+
472+
CHECK: punycode::საჭმელად_გემრიელი_სადილი
473+
_RNvC8punycodeu30____7hkackfecea1cbdathfdh9hlq6y
474+
475+
CHECK: Gödel::Escher::Bach
476+
_RNtNvCu8Gdel_5qa6Escher4Bach
477+
478+
CHECK: punycode::🦁🐅
479+
_RNvC8punycodeu7wn8hx1g
480+
481+
; Punycode - invalid code point
482+
483+
CHECK: _RCu5r731r
484+
_RCu5r731r
485+
486+
CHECK: _RCu8b44444yy
487+
_RCu8b44444yy
488+
489+
CHECK: _RNvC1au25zzzzzzzzzzzzzzzzzzzzzzzzz
490+
_RNvC1au25zzzzzzzzzzzzzzzzzzzzzzzzz
491+
492+
; Punycode - early EOF
493+
494+
CHECK: _RCu8_CCCAR_u4
495+
_RCu8_CCCAR_u4
496+
497+
; Punycode - overflow
498+
499+
CHECK: _RNvC1au21p18888888888888888888
500+
_RNvC1au21p18888888888888888888
501+
459502
; Invalid mangled characters
460503

461504
CHECK: _RNvC2a.1c

llvm/unittests/Demangle/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ set(LLVM_LINK_COMPONENTS
66
add_llvm_unittest(DemangleTests
77
DemangleTest.cpp
88
ItaniumDemangleTest.cpp
9+
OutputStreamTest.cpp
910
PartialDemangleTest.cpp
1011
RustDemangleTest.cpp
1112
StringViewTest.cpp
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
//===- llvm/unittest/OutputStreamTest.cpp - OutputStream unit tests -------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "llvm/Demangle/Utility.h"
10+
#include "gtest/gtest.h"
11+
#include <string>
12+
13+
using namespace llvm;
14+
using llvm::itanium_demangle::OutputStream;
15+
16+
// Copies the bytes of OS, from the start of its buffer up to its current
// position, into a std::string.
static std::string toString(OutputStream &OS) {
  return std::string(OS.getBuffer(), OS.getCurrentPosition());
}
19+
20+
template <typename T> static std::string printToString(const T &Value) {
21+
OutputStream OS;
22+
OS << Value;
23+
std::string s = toString(OS);
24+
std::free(OS.getBuffer());
25+
return s;
26+
}
27+
28+
// Checks the text produced by streaming integers, characters and a string
// literal into an OutputStream.
TEST(OutputStreamTest, Format) {
29+
// Integers, including negative values.
EXPECT_EQ("0", printToString(0));
30+
EXPECT_EQ("1", printToString(1));
31+
EXPECT_EQ("-1", printToString(-1));
32+
EXPECT_EQ("-90", printToString(-90));
33+
EXPECT_EQ("109", printToString(109));
34+
EXPECT_EQ("400", printToString(400));
35+
36+
// Single characters.
EXPECT_EQ("a", printToString('a'));
37+
EXPECT_EQ("?", printToString('?'));
38+
39+
// String literal.
EXPECT_EQ("abc", printToString("abc"));
40+
}
41+
42+
// Exercises OutputStream::insert on one stream. The steps are cumulative, so
// each expectation depends on all of the inserts before it.
TEST(OutputStreamTest, Insert) {
43+
OutputStream OS;
44+
45+
// Zero-length insert into an empty stream.
OS.insert(0, "", 0);
46+
EXPECT_EQ("", toString(OS));
47+
48+
// Insert into an empty stream.
OS.insert(0, "abcd", 4);
49+
EXPECT_EQ("abcd", toString(OS));
50+
51+
// Insert at the front.
OS.insert(0, "x", 1);
52+
EXPECT_EQ("xabcd", toString(OS));
53+
54+
// Insert at the end (Pos equal to the current position).
OS.insert(5, "y", 1);
55+
EXPECT_EQ("xabcdy", toString(OS));
56+
57+
// Insert in the middle.
OS.insert(3, "defghi", 6);
58+
EXPECT_EQ("xabdefghicdy", toString(OS));
59+
60+
// The buffer is released manually.
std::free(OS.getBuffer());
61+
}

0 commit comments

Comments
 (0)