From c63b474e11417b5de07720089d7fc44950c4766e Mon Sep 17 00:00:00 2001 From: Mert Can Altin Date: Sun, 1 Feb 2026 01:37:48 +0300 Subject: [PATCH 1/2] src: optimize utf-8 byte length calculation using simdutf --- src/node_buffer.cc | 89 +++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/src/node_buffer.cc b/src/node_buffer.cc index d4a63cf610ca7f..af45a1a1019fa6 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -761,9 +761,38 @@ void StringWrite(const FunctionCallbackInfo& args) { void SlowByteLengthUtf8(const FunctionCallbackInfo& args) { CHECK(args[0]->IsString()); - // Fast case: avoid StringBytes on UTF8 string. Jump to v8. - size_t result = args[0].As()->Utf8LengthV2(args.GetIsolate()); - args.GetReturnValue().Set(static_cast(result)); + Isolate* isolate = args.GetIsolate(); + Local source = args[0].As(); + + static constexpr int kSmallStringThreshold = 128; + if (source->Length() <= kSmallStringThreshold) { + size_t result = source->Utf8LengthV2(isolate); + args.GetReturnValue().Set(static_cast(result)); + return; + } + + String::ValueView view(isolate, source); + size_t length = view.length(); + size_t utf8_length; + + if (view.is_one_byte()) { + auto data = reinterpret_cast(view.data8()); + simdutf::result result = simdutf::validate_ascii_with_errors(data, length); + if (result.error == simdutf::SUCCESS) { + utf8_length = length; // Pure ASCII, length stays the same + } else { + utf8_length = simdutf::utf8_length_from_latin1(data, length); + } + } else { + auto data = reinterpret_cast(view.data16()); + if (simdutf::validate_utf16(data, length)) { + utf8_length = simdutf::utf8_length_from_utf16(data, length); + } else { + utf8_length = source->Utf8LengthV2(isolate); + } + } + + args.GetReturnValue().Set(static_cast(utf8_length)); } uint32_t FastByteLengthUtf8( @@ -776,49 +805,29 @@ uint32_t FastByteLengthUtf8( CHECK(sourceValue->IsString()); Local sourceStr = sourceValue.As(); - if 
(!sourceStr->IsExternalOneByte()) { + // For short inputs, use V8's path - function call overhead not worth it + static constexpr int kSmallStringThreshold = 128; + if (sourceStr->Length() <= kSmallStringThreshold) { return sourceStr->Utf8LengthV2(isolate); } - auto source = sourceStr->GetExternalOneByteStringResource(); - // For short inputs, the function call overhead to simdutf is maybe - // not worth it, reserve simdutf for long strings. - if (source->length() > 128) { - return simdutf::utf8_length_from_latin1(source->data(), source->length()); - } - - uint32_t length = source->length(); - const auto input = reinterpret_cast(source->data()); - - uint32_t answer = length; - uint32_t i = 0; - auto pop = [](uint64_t v) { - return static_cast(((v >> 7) & UINT64_C(0x0101010101010101)) * - UINT64_C(0x0101010101010101) >> - 56); - }; + String::ValueView view(isolate, sourceStr); + size_t length = view.length(); - for (; i + 32 <= length; i += 32) { - uint64_t v; - memcpy(&v, input + i, 8); - answer += pop(v); - memcpy(&v, input + i + 8, 8); - answer += pop(v); - memcpy(&v, input + i + 16, 8); - answer += pop(v); - memcpy(&v, input + i + 24, 8); - answer += pop(v); - } - for (; i + 8 <= length; i += 8) { - uint64_t v; - memcpy(&v, input + i, 8); - answer += pop(v); - } - for (; i + 1 <= length; i += 1) { - answer += input[i] >> 7; + if (view.is_one_byte()) { + auto data = reinterpret_cast(view.data8()); + simdutf::result result = simdutf::validate_ascii_with_errors(data, length); + if (result.error == simdutf::SUCCESS) { + return length; // Pure ASCII, length stays the same + } + return simdutf::utf8_length_from_latin1(data, length); } - return answer; + auto data = reinterpret_cast(view.data16()); + if (simdutf::validate_utf16(data, length)) { + return simdutf::utf8_length_from_utf16(data, length); + } + return sourceStr->Utf8LengthV2(isolate); } static CFunction fast_byte_length_utf8(CFunction::Make(FastByteLengthUtf8)); From a9817fcc9f197a3d4d5ad74b4d5ee0a1765335e3 
Mon Sep 17 00:00:00 2001 From: Mert Can Altin Date: Sun, 1 Feb 2026 17:30:47 +0300 Subject: [PATCH 2/2] use IsOneByte & IsExternalOneByte path for regression --- src/node_buffer.cc | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/src/node_buffer.cc b/src/node_buffer.cc index af45a1a1019fa6..ca16064429a76a 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -771,18 +771,23 @@ void SlowByteLengthUtf8(const FunctionCallbackInfo& args) { return; } + size_t utf8_length; + + // Fast path for external one-byte strings (common case for ASCII/Latin1) + if (source->IsExternalOneByte()) { + auto ext = source->GetExternalOneByteStringResource(); + utf8_length = simdutf::utf8_length_from_latin1(ext->data(), ext->length()); + args.GetReturnValue().Set(static_cast(utf8_length)); + return; + } + + // For non-external strings, use ValueView String::ValueView view(isolate, source); size_t length = view.length(); - size_t utf8_length; if (view.is_one_byte()) { auto data = reinterpret_cast(view.data8()); - simdutf::result result = simdutf::validate_ascii_with_errors(data, length); - if (result.error == simdutf::SUCCESS) { - utf8_length = length; // Pure ASCII, length stays the same - } else { - utf8_length = simdutf::utf8_length_from_latin1(data, length); - } + utf8_length = simdutf::utf8_length_from_latin1(data, length); } else { auto data = reinterpret_cast(view.data16()); if (simdutf::validate_utf16(data, length)) { @@ -805,21 +810,32 @@ uint32_t FastByteLengthUtf8( CHECK(sourceValue->IsString()); Local sourceStr = sourceValue.As(); + int length = sourceStr->Length(); + // For short inputs, use V8's path - function call overhead not worth it static constexpr int kSmallStringThreshold = 128; - if (sourceStr->Length() <= kSmallStringThreshold) { + if (length <= kSmallStringThreshold) { + return sourceStr->Utf8LengthV2(isolate); + } + + // Fast path for external one-byte strings (common case for ASCII/Latin1) + if
(sourceStr->IsExternalOneByte()) { + auto ext = sourceStr->GetExternalOneByteStringResource(); + return simdutf::utf8_length_from_latin1(ext->data(), ext->length()); + } + + // For one-byte (Latin1/ASCII) strings, V8 is already fast and ValueView + // creation has overhead. Use higher threshold before switching to simdutf. + static constexpr int kOneByteLargeThreshold = 1024; + if (sourceStr->IsOneByte() && length <= kOneByteLargeThreshold) { return sourceStr->Utf8LengthV2(isolate); } + // For larger strings or two-byte strings, use ValueView + simdutf String::ValueView view(isolate, sourceStr); - size_t length = view.length(); if (view.is_one_byte()) { auto data = reinterpret_cast(view.data8()); - simdutf::result result = simdutf::validate_ascii_with_errors(data, length); - if (result.error == simdutf::SUCCESS) { - return length; // Pure ASCII, length stays the same - } return simdutf::utf8_length_from_latin1(data, length); }