From 2a30acce6bb7f06855971977bae2472b94464d4b Mon Sep 17 00:00:00 2001 From: Nikita Skovoroda Date: Wed, 28 Jan 2026 10:12:20 +0400 Subject: [PATCH] lib: add utf16 fast path for TextDecoder --- lib/internal/encoding.js | 80 +++++++++---------- lib/internal/encoding/util.js | 57 ++++++++----- ...test-whatwg-encoding-custom-textdecoder.js | 2 +- 3 files changed, 74 insertions(+), 65 deletions(-) diff --git a/lib/internal/encoding.js b/lib/internal/encoding.js index 7d4747abc23bb9..5f1655426d5bd5 100644 --- a/lib/internal/encoding.js +++ b/lib/internal/encoding.js @@ -20,8 +20,8 @@ const { FastBuffer } = require('internal/buffer'); const { ERR_ENCODING_NOT_SUPPORTED, ERR_INVALID_ARG_TYPE, + ERR_ENCODING_INVALID_ENCODED_DATA, ERR_INVALID_THIS, - ERR_NO_ICU, } = require('internal/errors').codes; const kSingleByte = Symbol('single-byte'); const kHandle = Symbol('handle'); @@ -30,11 +30,11 @@ const kEncoding = Symbol('encoding'); const kDecoder = Symbol('decoder'); const kChunk = Symbol('chunk'); const kFatal = Symbol('kFatal'); -const kUTF8FastPath = Symbol('kUTF8FastPath'); +const kUnicode = Symbol('kUnicode'); const kIgnoreBOM = Symbol('kIgnoreBOM'); const { isSinglebyteEncoding, createSinglebyteDecoder } = require('internal/encoding/single-byte'); -const { unfinishedBytesUtf8, mergePrefixUtf8 } = require('internal/encoding/util'); +const { unfinishedBytes, mergePrefix } = require('internal/encoding/util'); const { getConstructorOf, @@ -419,11 +419,24 @@ if (hasIntl) { const kBOMSeen = Symbol('BOM seen'); -let StringDecoder; -function lazyStringDecoder() { - if (StringDecoder === undefined) - ({ StringDecoder } = require('string_decoder')); - return StringDecoder; +function fixupDecodedString(res, ignoreBom, fatal, encoding) { + if (res.length === 0) return ''; + if (!ignoreBom && res[0] === '\ufeff') res = StringPrototypeSlice(res, 1); + if (!fatal) return res.toWellFormed(); + if (!res.isWellFormed()) throw new ERR_ENCODING_INVALID_ENCODED_DATA(encoding, undefined); 
+ return res; +} + +function decodeUTF16le(input, ignoreBom, fatal) { + return fixupDecodedString(parseInput(input).ucs2Slice(), ignoreBom, fatal, 'utf-16le'); +} + +function decodeUTF16be(input, ignoreBom, fatal) { + const be = parseInput(input); + const le = new FastBuffer(be.length); + le.set(be); + le.swap16(); + return fixupDecodedString(le.ucs2Slice(), ignoreBom, fatal, 'utf-16be'); +} class TextDecoder { @@ -446,33 +459,29 @@ class TextDecoder { this[kEncoding] = enc; this[kIgnoreBOM] = Boolean(options?.ignoreBOM); this[kFatal] = Boolean(options?.fatal); - this[kUTF8FastPath] = false; + this[kUnicode] = undefined; this[kHandle] = undefined; this[kSingleByte] = undefined; // Does not care about streaming or BOM this[kChunk] = null; // A copy of previous streaming tail or null if (enc === 'utf-8') { - this[kUTF8FastPath] = true; + this[kUnicode] = decodeUTF8; + this[kBOMSeen] = false; + } else if (enc === 'utf-16le') { + this[kUnicode] = decodeUTF16le; + this[kBOMSeen] = false; + } else if (enc === 'utf-16be') { + this[kUnicode] = decodeUTF16be; + this[kBOMSeen] = false; } else if (isSinglebyteEncoding(enc)) { this[kSingleByte] = createSinglebyteDecoder(enc, this[kFatal]); - } else { - this.#prepareConverter(); // Need to throw early if we don't support the encoding - } - } - - #prepareConverter() { - if (hasIntl) { + } else if (hasIntl) { let icuEncoding = this[kEncoding]; if (icuEncoding === 'gbk') icuEncoding = 'gb18030'; // 10.1.1. 
GBK's decoder is gb18030's decoder const handle = icuGetConverter(icuEncoding, this[kFlags]); if (handle === undefined) throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]); this[kHandle] = handle; - } else if (this[kEncoding] === 'utf-16le') { - if (this[kFatal]) throw new ERR_NO_ICU('"fatal" option'); - this[kHandle] = new (lazyStringDecoder())(this[kEncoding]); - this[kBOMSeen] = false; } else { throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]); } @@ -485,19 +494,19 @@ class TextDecoder { if (this[kSingleByte]) return this[kSingleByte](parseInput(input)); const stream = options?.stream; - if (this[kUTF8FastPath]) { + if (this[kUnicode]) { const chunk = this[kChunk]; const ignoreBom = this[kIgnoreBOM] || this[kBOMSeen]; if (!stream) { this[kBOMSeen] = false; - if (!chunk) return decodeUTF8(input, ignoreBom, this[kFatal]); + if (!chunk) return this[kUnicode](input, ignoreBom, this[kFatal]); } let u = parseInput(input); if (u.length === 0 && stream) return ''; // no state change let prefix; if (chunk) { - const merged = mergePrefixUtf8(u, this[kChunk]); + const merged = mergePrefix(u, this[kChunk], this[kEncoding]); if (u.length < 3) { u = merged; // Might be unfinished, but fully consumed old u } else { @@ -510,7 +519,7 @@ class TextDecoder { } if (stream) { - const trail = unfinishedBytesUtf8(u, u.length); + const trail = unfinishedBytes(u, u.length, this[kEncoding]); if (trail > 0) { this[kChunk] = new FastBuffer(u.subarray(-trail)); // copy if (!prefix && trail === u.length) return ''; // No further state change @@ -519,8 +528,8 @@ class TextDecoder { } try { - const res = (prefix ? decodeUTF8(prefix, ignoreBom, this[kFatal]) : '') + - decodeUTF8(u, ignoreBom || prefix, this[kFatal]); + const res = (prefix ? 
this[kUnicode](prefix, ignoreBom, this[kFatal]) : '') + + this[kUnicode](u, ignoreBom || prefix, this[kFatal]); // "BOM seen" is set on the current decode call only if it did not error, // in "serialize I/O queue" after decoding @@ -541,22 +550,7 @@ class TextDecoder { return icuDecode(this[kHandle], input, flags, this[kEncoding]); } - input = parseInput(input); - - let result = stream ? this[kHandle].write(input) : this[kHandle].end(input); - - if (result.length > 0 && !this[kBOMSeen] && !this[kIgnoreBOM]) { - // If the very first result in the stream is a BOM, and we are not - // explicitly told to ignore it, then we discard it. - if (result[0] === '\ufeff') { - result = StringPrototypeSlice(result, 1); - } - this[kBOMSeen] = true; - } - - if (!stream) this[kBOMSeen] = false; - - return result; + // Unreachable } } diff --git a/lib/internal/encoding/util.js b/lib/internal/encoding/util.js index 107a0f41b5d811..80d0cb9fc3028f 100644 --- a/lib/internal/encoding/util.js +++ b/lib/internal/encoding/util.js @@ -7,39 +7,54 @@ const { Uint8Array, } = primordials; - /** * Get a number of last bytes in an Uint8Array `data` ending at `len` that don't * form a codepoint yet, but can be a part of a single codepoint on more data. 
- * @param {Uint8Array} data Uint8Array of potentially UTF-8 bytes + * @param {Uint8Array} data Uint8Array of potentially UTF-8/UTF-16 bytes * @param {number} len Position to look behind from - * @returns {number} Number of unfinished potentially valid UTF-8 bytes ending at position `len` + * @param {string} enc Encoding to use: utf-8, utf-16le, or utf-16be + * @returns {number} Number (0-3) of unfinished potentially valid UTF bytes ending at position `len` */ -function unfinishedBytesUtf8(data, len) { - // 0-3 - let pos = 0; - while (pos < 2 && pos < len && (data[len - pos - 1] & 0xc0) === 0x80) pos++; // Go back 0-2 trailing bytes - if (pos === len) return 0; // no space for lead - const lead = data[len - pos - 1]; - if (lead < 0xc2 || lead > 0xf4) return 0; // not a lead - if (pos === 0) return 1; // Nothing to recheck, we have only lead, return it. 2-byte must return here - if (lead < 0xe0 || (lead < 0xf0 && pos >= 2)) return 0; // 2-byte, or 3-byte or less and we already have 2 trailing - const lower = lead === 0xf0 ? 0x90 : lead === 0xe0 ? 0xa0 : 0x80; - const upper = lead === 0xf4 ? 0x8f : lead === 0xed ? 0x9f : 0xbf; - const next = data[len - pos]; - return next >= lower && next <= upper ? pos + 1 : 0; +function unfinishedBytes(data, len, enc) { + switch (enc) { + case 'utf-8': { + // 0-3 + let pos = 0; + while (pos < 2 && pos < len && (data[len - pos - 1] & 0xc0) === 0x80) pos++; // Go back 0-2 trailing bytes + if (pos === len) return 0; // no space for lead + const lead = data[len - pos - 1]; + if (lead < 0xc2 || lead > 0xf4) return 0; // not a lead + if (pos === 0) return 1; // Nothing to recheck, we have only lead, return it. 2-byte must return here + if (lead < 0xe0 || (lead < 0xf0 && pos >= 2)) return 0; // 2-byte, 3-byte or less and we already have 2 trailing + const lower = lead === 0xf0 ? 0x90 : lead === 0xe0 ? 0xa0 : 0x80; + const upper = lead === 0xf4 ? 0x8f : lead === 0xed ? 
0x9f : 0xbf; + const next = data[len - pos]; + return next >= lower && next <= upper ? pos + 1 : 0; + } + + case 'utf-16le': + case 'utf-16be': { + // 0-3 + const uneven = len % 2; // Uneven byte length adds 1 + if (len < 2) return uneven; + const l = len - uneven - 1; + const last = enc === 'utf-16le' ? (data[l] << 8) ^ data[l - 1] : (data[l - 1] << 8) ^ data[l]; + return last >= 0xd8_00 && last < 0xdc_00 ? uneven + 2 : uneven; // lone lead adds 2 + } + } } /** * Merge prefix `chunk` with `data` and return new combined prefix. * For data.length < 3, fully consumes data and can return unfinished data, * otherwise returns a prefix with no unfinished bytes - * @param {Uint8Array} data Uint8Array of potentially UTF-8 bytes + * @param {Uint8Array} data Uint8Array of potentially UTF-8/UTF-16 bytes * @param {Uint8Array} chunk Prefix to prepend before `data` + * @param {string} enc Encoding to use: utf-8, utf-16le, or utf-16be * @returns {Uint8Array} If data.length >= 3: an Uint8Array containing `chunk` and a slice of `data` - * so that the result has no unfinished UTF-8 codepoints. If data.length < 3: concat(chunk, data). + * so that the result has no unfinished codepoints. If data.length < 3: concat(chunk, data). 
*/ -function mergePrefixUtf8(data, chunk) { +function mergePrefix(data, chunk, enc) { if (data.length === 0) return chunk; if (data.length < 3) { // No reason to bruteforce offsets, also it's possible this doesn't yet end the sequence @@ -57,7 +72,7 @@ function mergePrefixUtf8(data, chunk) { // Stop at the first offset where unfinished bytes reaches 0 or fits into data // If that doesn't happen (data too short), just concat chunk and data completely (above) for (let i = 1; i <= 3; i++) { - const unfinished = unfinishedBytesUtf8(temp, chunk.length + i); // 0-3 + const unfinished = unfinishedBytes(temp, chunk.length + i, enc); // 0-3 if (unfinished <= i) { // Always reachable at 3, but we still need 'unfinished' value for it const add = i - unfinished; // 0-3 @@ -69,4 +84,4 @@ function mergePrefixUtf8(data, chunk) { return null; } -module.exports = { unfinishedBytesUtf8, mergePrefixUtf8 }; +module.exports = { unfinishedBytes, mergePrefix }; diff --git a/test/parallel/test-whatwg-encoding-custom-textdecoder.js b/test/parallel/test-whatwg-encoding-custom-textdecoder.js index 9734825b6b27a5..10ef410f5bf77b 100644 --- a/test/parallel/test-whatwg-encoding-custom-textdecoder.js +++ b/test/parallel/test-whatwg-encoding-custom-textdecoder.js @@ -101,7 +101,7 @@ assert(TextDecoder); } // Test TextDecoder, UTF-16be -if (common.hasIntl) { +{ const dec = new TextDecoder('utf-16be'); const res = dec.decode(Buffer.from('test€', 'utf-16le').swap16()); assert.strictEqual(res, 'test€');