Skip to content

Commit 907068c

Browse files
anonrig authored and juanarbol committed
buffer: add buffer.isUtf8 for utf8 validation
PR-URL: #45947 Reviewed-By: Robert Nagy <[email protected]> Reviewed-By: Matteo Collina <[email protected]> Reviewed-By: Luigi Pinca <[email protected]> Reviewed-By: Rafael Gonzaga <[email protected]> Reviewed-By: Ben Noordhuis <[email protected]> Reviewed-By: Anna Henningsen <[email protected]>
1 parent cb767a2 commit 907068c

File tree

7 files changed

+131
-1
lines changed

7 files changed

+131
-1
lines changed

doc/api/buffer.md

+11
Original file line numberDiff line numberDiff line change
@@ -5126,6 +5126,17 @@ For code running using Node.js APIs, converting between base64-encoded strings
51265126
and binary data should be performed using `Buffer.from(str, 'base64')` and
51275127
`buf.toString('base64')`.**
51285128

5129+
### `buffer.isUtf8(input)`
5130+
5131+
<!-- YAML
5132+
added: REPLACEME
5133+
-->
5134+
5135+
* `input` {Buffer | ArrayBuffer | TypedArray} The input to validate.
5136+
* Returns: {boolean} Returns `true` if and only if the input is valid UTF-8.
5137+
5138+
This function returns `true` if `input` contains only valid UTF-8-encoded data, including the case in which `input` is empty. It throws if `input` is a detached `ArrayBuffer`.
5139+
51295140
### `buffer.INSPECT_MAX_BYTES`
51305141

51315142
<!-- YAML

lib/buffer.js

+12-1
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ const {
5757
compareOffset,
5858
createFromString,
5959
fill: bindingFill,
60+
isUtf8: bindingIsUtf8,
6061
indexOfBuffer,
6162
indexOfNumber,
6263
indexOfString,
@@ -83,7 +84,8 @@ const {
8384
const {
8485
isAnyArrayBuffer,
8586
isArrayBufferView,
86-
isUint8Array
87+
isUint8Array,
88+
isTypedArray,
8789
} = require('internal/util/types');
8890
const {
8991
inspect: utilInspect
@@ -1322,13 +1324,22 @@ function atob(input) {
13221324
return Buffer.from(input, 'base64').toString('latin1');
13231325
}
13241326

1327+
/**
 * Reports whether `input` consists solely of valid UTF-8 encoded bytes.
 *
 * The validation itself is delegated to the native `isUtf8` binding; this
 * wrapper only screens the argument type.
 * @param {Buffer | ArrayBuffer | TypedArray} input The bytes to validate.
 * @returns {boolean} The value returned by the native binding for `input`.
 * @throws {ERR_INVALID_ARG_TYPE} When `input` is neither a TypedArray nor
 *   a value accepted by `isAnyArrayBuffer()`.
 */
function isUtf8(input) {
  if (isTypedArray(input) || isAnyArrayBuffer(input)) {
    return bindingIsUtf8(input);
  }

  // ArrayBuffer inputs are accepted by the guard above, so the expected-type
  // list in the error must name ArrayBuffer alongside Buffer and TypedArray.
  throw new ERR_INVALID_ARG_TYPE(
    'input',
    ['ArrayBuffer', 'Buffer', 'TypedArray'],
    input,
  );
}
}
1334+
13251335
module.exports = {
13261336
Blob,
13271337
File,
13281338
resolveObjectURL,
13291339
Buffer,
13301340
SlowBuffer,
13311341
transcode,
1342+
isUtf8,
13321343

13331344
// Legacy
13341345
kMaxLength,

src/node_buffer.cc

+18
Original file line numberDiff line numberDiff line change
@@ -1223,6 +1223,20 @@ static void EncodeInto(const FunctionCallbackInfo<Value>& args) {
12231223
results[1] = written;
12241224
}
12251225

1226+
static void IsUtf8(const FunctionCallbackInfo<Value>& args) {
1227+
Environment* env = Environment::GetCurrent(args);
1228+
CHECK_EQ(args.Length(), 1);
1229+
CHECK(args[0]->IsTypedArray() || args[0]->IsArrayBuffer() ||
1230+
args[0]->IsSharedArrayBuffer());
1231+
ArrayBufferViewContents<char> abv(args[0]);
1232+
1233+
if (abv.WasDetached()) {
1234+
return node::THROW_ERR_INVALID_STATE(
1235+
env, "Cannot validate on a detached buffer");
1236+
}
1237+
1238+
args.GetReturnValue().Set(simdutf::validate_utf8(abv.data(), abv.length()));
1239+
}
12261240

12271241
void SetBufferPrototype(const FunctionCallbackInfo<Value>& args) {
12281242
Environment* env = Environment::GetCurrent(args);
@@ -1358,6 +1372,8 @@ void Initialize(Local<Object> target,
13581372
SetMethod(context, target, "encodeInto", EncodeInto);
13591373
SetMethodNoSideEffect(context, target, "encodeUtf8String", EncodeUtf8String);
13601374

1375+
SetMethodNoSideEffect(context, target, "isUtf8", IsUtf8);
1376+
13611377
target
13621378
->Set(context,
13631379
FIXED_ONE_BYTE_STRING(isolate, "kMaxLength"),
@@ -1413,6 +1429,8 @@ void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
14131429
registry->Register(EncodeInto);
14141430
registry->Register(EncodeUtf8String);
14151431

1432+
registry->Register(IsUtf8);
1433+
14161434
registry->Register(StringSlice<ASCII>);
14171435
registry->Register(StringSlice<BASE64>);
14181436
registry->Register(StringSlice<BASE64URL>);

src/node_errors.h

+1
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ void OOMErrorHandler(const char* location, bool is_heap_oom);
6868
V(ERR_INVALID_ARG_TYPE, TypeError) \
6969
V(ERR_INVALID_OBJECT_DEFINE_PROPERTY, TypeError) \
7070
V(ERR_INVALID_MODULE, Error) \
71+
V(ERR_INVALID_STATE, Error) \
7172
V(ERR_INVALID_THIS, TypeError) \
7273
V(ERR_INVALID_TRANSFER_OBJECT, TypeError) \
7374
V(ERR_MEMORY_ALLOCATION_FAILED, Error) \

src/util-inl.h

+1
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,7 @@ void ArrayBufferViewContents<T, S>::ReadValue(v8::Local<v8::Value> buf) {
555555
auto ab = buf.As<v8::ArrayBuffer>();
556556
length_ = ab->ByteLength();
557557
data_ = static_cast<T*>(ab->Data());
558+
was_detached_ = ab->WasDetached();
558559
} else {
559560
CHECK(buf->IsSharedArrayBuffer());
560561
auto sab = buf.As<v8::SharedArrayBuffer>();

src/util.h

+2
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,7 @@ class ArrayBufferViewContents {
510510
inline void Read(v8::Local<v8::ArrayBufferView> abv);
511511
inline void ReadValue(v8::Local<v8::Value> buf);
512512

513+
inline bool WasDetached() const { return was_detached_; }
513514
inline const T* data() const { return data_; }
514515
inline size_t length() const { return length_; }
515516

@@ -524,6 +525,7 @@ class ArrayBufferViewContents {
524525
T stack_storage_[kStackStorageSize];
525526
T* data_ = nullptr;
526527
size_t length_ = 0;
528+
bool was_detached_ = false;
527529
};
528530

529531
class Utf8Value : public MaybeStackBuffer<char> {

test/parallel/test-buffer-isutf8.js

+86
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
'use strict';

// Exercises buffer.isUtf8() with well-formed input, malformed byte
// sequences, unsupported argument types and a detached ArrayBuffer.

require('../common');
const assert = require('assert');
const { isUtf8, Buffer } = require('buffer');
const { TextEncoder } = require('util');

const encoder = new TextEncoder();

// Well-formed text, as well as an empty buffer, must be reported as valid.
for (const text of ['hello', 'ğ']) {
  assert.strictEqual(isUtf8(encoder.encode(text)), true);
}
assert.strictEqual(isUtf8(Buffer.from([])), true);

// Byte sequences that must be rejected.
// Taken from test/fixtures/wpt/encoding/textdecoder-fatal.any.js
const invalidSequences = [
  [0xFF], // 'invalid code'
  [0xC0], // 'ends early'
  [0xE0], // 'ends early 2'
  [0xC0, 0x00], // 'invalid trail'
  [0xC0, 0xC0], // 'invalid trail 2'
  [0xE0, 0x00], // 'invalid trail 3'
  [0xE0, 0xC0], // 'invalid trail 4'
  [0xE0, 0x80, 0x00], // 'invalid trail 5'
  [0xE0, 0x80, 0xC0], // 'invalid trail 6'
  [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // '> 0x10FFFF'
  [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], // 'obsolete lead byte'

  // Overlong encodings
  [0xC0, 0x80], // 'overlong U+0000 - 2 bytes'
  [0xE0, 0x80, 0x80], // 'overlong U+0000 - 3 bytes'
  [0xF0, 0x80, 0x80, 0x80], // 'overlong U+0000 - 4 bytes'
  [0xF8, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 5 bytes'
  [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 6 bytes'

  [0xC1, 0xBF], // 'overlong U+007F - 2 bytes'
  [0xE0, 0x81, 0xBF], // 'overlong U+007F - 3 bytes'
  [0xF0, 0x80, 0x81, 0xBF], // 'overlong U+007F - 4 bytes'
  [0xF8, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 5 bytes'
  [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 6 bytes'

  [0xE0, 0x9F, 0xBF], // 'overlong U+07FF - 3 bytes'
  [0xF0, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 4 bytes'
  [0xF8, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 5 bytes'
  [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 6 bytes'

  [0xF0, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 4 bytes'
  [0xF8, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 5 bytes'
  [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 6 bytes'

  [0xF8, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 5 bytes'
  [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 6 bytes'

  // UTF-16 surrogates encoded as code points in UTF-8
  [0xED, 0xA0, 0x80], // 'lead surrogate'
  [0xED, 0xB0, 0x80], // 'trail surrogate'
  [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], // 'surrogate pair'
];

for (const bytes of invalidSequences) {
  assert.strictEqual(isUtf8(Buffer.from(bytes)), false);
}

// Arguments of an unsupported type must be rejected with
// ERR_INVALID_ARG_TYPE.
const unsupportedInputs = [
  null,
  undefined,
  'hello',
  true,
  false,
];

for (const input of unsupportedInputs) {
  assert.throws(
    () => { isUtf8(input); },
    {
      code: 'ERR_INVALID_ARG_TYPE',
    },
  );
}

{
  // Transferring the ArrayBuffer detaches it; validating it afterwards must
  // fail with ERR_INVALID_STATE.
  const detached = new ArrayBuffer(1024);
  structuredClone(detached, { transfer: [detached] });
  assert.throws(
    () => { isUtf8(detached); },
    {
      code: 'ERR_INVALID_STATE'
    }
  );
}

0 commit comments

Comments
 (0)