Add fast path for utf16le encoding in stringToBuffer()/bufferToString()#981
Add fast path for utf16le encoding in stringToBuffer()/bufferToString() #981 — wh201906 wants to merge 13 commits into margelo:main from
Conversation
Add argument count check. Wrap exceptions in the same style as Nitro HybridFunction
And use createFromUtf8() override with less overhead
No significant performance improvements for normal cases
|
Test cases from Node.js v24.15.0 Roundtrips ASCII text through utf16le encoding.Current test(SUITE, '[Node.js] Roundtrips ASCII text through utf16le encoding.', () => {
const str = 'foo';
const ab = stringToBuffer(str, 'utf16le');
expect(bufferToString(ab, 'utf16le')).to.equal(str);
});Original Node.js ( // utf8, ucs2, ascii, latin1, utf16le
for (const encoding of [
'utf8',
'utf-8',
'ucs2',
'ucs-2',
'ascii',
'latin1',
'binary',
'utf16le',
'utf-16le',
].flatMap(e => [e, e.toUpperCase()])) {
assert.strictEqual(Buffer.from('foo', encoding).toString(encoding), 'foo');
}Roundtrips UTF-16LE text containing an unpaired high surrogate.Current test(
SUITE,
'Roundtrips UTF-16LE text containing an unpaired high surrogate.',
() => {
const str = 'A\uD83DB';
const ab = stringToBuffer(str, 'utf16le');
expect(toU8(ab)).to.deep.equal(
new Uint8Array([0x41, 0x00, 0x3d, 0xd8, 0x42, 0x00]),
);
expect(bufferToString(ab, 'utf16le')).to.equal(str);
},
);Original Node.js: No direct matching test case was found in Node.js Verified Node.js runtime behavior: const str = 'A\uD83DB';
const buf = Buffer.from(str, 'utf16le');
assert.deepStrictEqual([...buf], [0x41, 0x00, 0x3d, 0xd8, 0x42, 0x00]);
assert.strictEqual(buf.toString('utf16le'), str);Roundtrips UTF-16LE text containing an unpaired low surrogate.Current test(
SUITE,
'Roundtrips UTF-16LE text containing an unpaired low surrogate.',
() => {
const str = 'A\uDC00B';
const ab = stringToBuffer(str, 'utf16le');
expect(toU8(ab)).to.deep.equal(
new Uint8Array([0x41, 0x00, 0x00, 0xdc, 0x42, 0x00]),
);
expect(bufferToString(ab, 'utf16le')).to.equal(str);
},
);Original Node.js: No direct matching test case was found in Node.js Verified Node.js runtime behavior: const str = 'A\uDC00B';
const buf = Buffer.from(str, 'utf16le');
assert.deepStrictEqual([...buf], [0x41, 0x00, 0x00, 0xdc, 0x42, 0x00]);
assert.strictEqual(buf.toString('utf16le'), str);UTF-16LE encoding of "über"Current test(SUITE, '[Node.js] UTF-16LE encoding of "über"', () => {
expect(toU8(stringToBuffer('über', 'utf16le'))).to.deep.equal(
new Uint8Array([252, 0, 98, 0, 101, 0, 114, 0]),
);
});Original Node.js ( ['ucs2', 'ucs-2', 'utf16le', 'utf-16le'].forEach(encoding => {
{
// Test for proper UTF16LE encoding, length should be 8
const f = Buffer.from('über', encoding);
assert.deepStrictEqual(f, Buffer.from([252, 0, 98, 0, 101, 0, 114, 0]));
}
});UTF-16LE encoding of "привет"Current test(SUITE, '[Node.js] UTF-16LE encoding of "привет"', () => {
const encoded = toU8(stringToBuffer('привет', 'utf16le'));
expect(encoded).to.deep.equal(
new Uint8Array([63, 4, 64, 4, 56, 4, 50, 4, 53, 4, 66, 4]),
);
expect(bufferToString(encoded.buffer as ArrayBuffer, 'utf16le')).to.equal(
'привет',
);
});Original Node.js ( ['ucs2', 'ucs-2', 'utf16le', 'utf-16le'].forEach(encoding => {
{
// Length should be 12
const f = Buffer.from('привет', encoding);
assert.deepStrictEqual(
f,
Buffer.from([63, 4, 64, 4, 56, 4, 50, 4, 53, 4, 66, 4]),
);
assert.strictEqual(f.toString(encoding), 'привет');
}
});UTF-16LE encoding of Thumbs up sign (U+1F44D)Current test(SUITE, '[Node.js] UTF-16LE encoding of Thumbs up sign (U+1F44D)', () => {
expect(toU8(stringToBuffer('\uD83D\uDC4D', 'utf16le'))).to.deep.equal(
new Uint8Array([0x3d, 0xd8, 0x4d, 0xdc]),
);
});Original Node.js ( {
const f = Buffer.from('\uD83D\uDC4D', 'utf-16le'); // THUMBS UP SIGN (U+1F44D)
assert.strictEqual(f.length, 4);
assert.deepStrictEqual(f, Buffer.from('3DD84DDC', 'hex'));
}Decodes UTF-16LE bytes back to Japanese text.Current test(SUITE, '[Node.js] Decodes UTF-16LE bytes back to Japanese text.', () => {
const bytes = new Uint8Array([
0x42, 0x30, 0x44, 0x30, 0x46, 0x30, 0x48, 0x30, 0x4a, 0x30,
]);
expect(bufferToString(bytes.buffer as ArrayBuffer, 'utf16le')).to.equal(
'あいうえお',
);
});Original Node.js ( ['ucs2', 'ucs-2', 'utf16le', 'utf-16le'].forEach(encoding => {
const b = Buffer.allocUnsafe(10);
b.write('あいうえお', encoding);
assert.strictEqual(b.toString(encoding), 'あいうえお');
});Decodes UTF-16LE bytes correctly from a sliced buffer starting at byte offset 1.Current test(
SUITE,
'[Node.js] Decodes UTF-16LE bytes correctly from a sliced buffer starting at byte offset 1.',
() => {
const bytes = new Uint8Array([
0xff, 0x42, 0x30, 0x44, 0x30, 0x46, 0x30, 0x48, 0x30, 0x4a, 0x30,
]);
expect(
bufferToString(bytes.slice(1).buffer as ArrayBuffer, 'utf16le'),
).to.equal('あいうえお');
},
);Original Node.js ( ['ucs2', 'ucs-2', 'utf16le', 'utf-16le'].forEach(encoding => {
const b = Buffer.allocUnsafe(11);
b.write('あいうえお', 1, encoding);
assert.strictEqual(b.toString(encoding, 1), 'あいうえお');
}); |
|
This PR is ready for review. |
The native implementation is much faster
Screenshot
In the current mainstream React Native JavaScript engine, Hermes, strings are internally represented using UTF-16 or ASCII. Therefore, when the native side needs access to the UTF-16 representation of a string, Hermes can provide the underlying data with minimal overhead. However, in the current implementation of Nitro, JavaScript strings are always converted to UTF-8 by default. For UTF-16 data, this introduces unnecessary conversion overhead and may also lead to data loss (e.g., unpaired surrogates) during the conversion process.
To address this, I bypass the Nitrogen-generated conversion path from JS string to `std::string` by accessing the `jsi::String` object directly. For other encodings, the existing Nitrogen-like code path is preserved (it calls `jsi::String::utf8()`, as Nitro does). For UTF-16 encoding, a lower-level fast path is used whenever possible (it calls `jsi::String::getStringData()`).
Note: this optimized UTF-16 encoding/decoding path is only available in the Hermes environment and for React Native 0.78+. Therefore, I added conditional checks on both the JavaScript side and the C++ side to selectively enable this feature.
For testing, I added UTF-16LE-related test cases based on Node.js, as well as performance benchmarks for the UTF-16 encoding path.
(text polished by ChatGPT)