Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions benchmark/buffers/buffer-tostring-utf8-latin1.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
'use strict';

const common = require('../common.js');

// Parameter matrix for the benchmark; `main` receives one combination per run:
//   size    - byte length of the buffer that gets decoded
//   content - which byte pattern buildBuffer() fills the buffer with
//   n       - number of toString('utf8') calls timed per run
const configs = {
  size: [64, 1024, 16384, 262144, 4194304],
  content: ['ascii', 'latin1', 'utf8_mixed', 'latin1_then_cjk'],
  n: [1e4],
};

const bench = common.createBenchmark(main, configs);

/**
 * Builds a zero-initialised Buffer of `size` bytes filled with a UTF-8 byte
 * pattern selected by `kind`:
 *
 *   'ascii'           - every byte is 0x61 ('a').
 *   'latin1'          - repeated 2-byte sequence C3 A9 (U+00E9).
 *   'utf8_mixed'      - repeated 4-byte group: 0x61 followed by E4 B8 AD
 *                       (U+4E2D).
 *   'latin1_then_cjk' - C3 A9 pairs up to an even midpoint, one E4 B8 AD
 *                       sequence at the midpoint, then C3 A9 pairs again.
 *
 * A pattern is only written where it fits completely, so any leftover bytes
 * at the end of the buffer keep their initial value of 0x00.
 *
 * @param {string} kind One of the content names listed above.
 * @param {number} size Length of the returned buffer in bytes.
 * @returns {Buffer}
 * @throws {Error} If `kind` is not a recognised content name.
 */
function buildBuffer(kind, size) {
  // Copies `pattern` back to back into `target`, starting at `from`, for as
  // long as a whole copy still ends at or before `limit`.
  const tile = (target, pattern, from, limit) => {
    const step = pattern.length;
    for (let pos = from; pos + step <= limit; pos += step) {
      pattern.copy(target, pos);
    }
  };

  switch (kind) {
    case 'ascii':
      return Buffer.alloc(size, 0x61);

    case 'latin1': {
      const out = Buffer.alloc(size);
      tile(out, Buffer.from([0xC3, 0xA9]), 0, size);
      return out;
    }

    case 'utf8_mixed': {
      // One ASCII byte followed by one 3-byte sequence, as a single unit.
      const out = Buffer.alloc(size);
      tile(out, Buffer.from([0x61, 0xE4, 0xB8, 0xAD]), 0, size);
      return out;
    }

    case 'latin1_then_cjk': {
      const eAcute = Buffer.from([0xC3, 0xA9]);
      const han = Buffer.from([0xE4, 0xB8, 0xAD]);
      const out = Buffer.alloc(size);
      // Half the size, rounded down to an even offset so that the leading
      // run consists of whole 2-byte pairs.
      const half = (size >> 1) & ~1;
      tile(out, eAcute, 0, half);
      han.copy(out, half);
      tile(out, eAcute, half + 3, size);
      return out;
    }

    default:
      throw new Error('unknown content: ' + kind);
  }
}

/**
 * Benchmark body: builds the input buffer once, outside the timed region,
 * then times `n` UTF-8 decodes of that same buffer.
 *
 * @param {object} conf One combination from the benchmark matrix.
 * @param {number} conf.n Number of toString('utf8') calls to time.
 * @param {number} conf.size Byte length of the buffer to decode.
 * @param {string} conf.content Content name forwarded to buildBuffer().
 */
function main({ n, size, content }) {
  const input = buildBuffer(content, size);

  bench.start();
  for (let iter = 0; iter < n; ++iter) {
    // The decoded string is deliberately discarded; only the call is timed.
    input.toString('utf8');
  }
  bench.end(n);
}
92 changes: 78 additions & 14 deletions src/string_bytes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -582,21 +582,85 @@ MaybeLocal<Value> StringBytes::Encode(Isolate* isolate,
return ExternOneByteString::NewFromCopy(isolate, buf, buflen);
}

if (buflen >= 32 && simdutf::validate_utf8(buf, buflen)) {
// We know that we are non-ASCII (and are unlikely Latin1), use 2-byte
// In the most likely case of valid UTF-8, we can use this fast impl
// For very short input, it is slower, so we limit min size
size_t u16size = simdutf::utf16_length_from_utf8(buf, buflen);
if (u16size > static_cast<size_t>(v8::String::kMaxLength)) {
isolate->ThrowException(ERR_STRING_TOO_LONG(isolate));
return MaybeLocal<Value>();
// Latin1-fits fast path: one-byte V8 string, half the heap of UTF-16.
// Capped at 1 MiB (above that the prescan cost erases the win).
constexpr size_t kLatin1Max = 1u << 20;
if (buflen >= 256 && buflen <= kLatin1Max) {
// Skip the allocation when any byte >= 0xC4: such a lead byte starts a
// code point above U+00FF (0xC4 begins U+0100), which Latin1 cannot hold.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

C4 is not a 3-byte UTF-8 lead. The actual code seems fine, but the comment isn't right

// Inner loop has no early exit so clang vectorizes it.
constexpr size_t kChunk = 64;
bool maybe_latin1 = true;
size_t i = 0;
for (; i + kChunk <= buflen; i += kChunk) {
uint8_t acc = 0;
for (size_t j = 0; j < kChunk; j++) {
acc |= static_cast<uint8_t>(buf[i + j]) >= 0xC4 ? 1 : 0;
}
if (acc) {
maybe_latin1 = false;
break;
}
}
if (maybe_latin1) {
for (; i < buflen; i++) {
if (static_cast<uint8_t>(buf[i]) >= 0xC4) {
maybe_latin1 = false;
break;
}
}
}
if (maybe_latin1) {
MaybeStackBuffer<char, 4096> latin1;
latin1.AllocateSufficientStorage(buflen);
simdutf::result l1 = simdutf::convert_utf8_to_latin1_with_errors(
buf, buflen, latin1.out());
if (l1.error == simdutf::error_code::SUCCESS) {
return ExternOneByteString::NewFromCopy(
isolate, latin1.out(), l1.count);
}
}
}

if (buflen >= 32) {
// Single-pass UTF-16: over-allocate (1 char16_t per byte), then
// shrink. Above 1 MiB the exact-size 3-pass below is cheaper.
constexpr size_t kSinglePassMax = 1u << 20;
if (buflen <= kSinglePassMax) {
MaybeStackBuffer<uint16_t, 256> u16;
u16.AllocateSufficientStorage(buflen);
simdutf::result r = simdutf::convert_utf8_to_utf16_with_errors(
buf, buflen, reinterpret_cast<char16_t*>(u16.out()));
if (r.error == simdutf::error_code::SUCCESS) {
if (r.count > static_cast<size_t>(v8::String::kMaxLength)) {
isolate->ThrowException(ERR_STRING_TOO_LONG(isolate));
return MaybeLocal<Value>();
}
if (u16.IsAllocated()) {
uint16_t* data = u16.out();
u16.Release();
uint16_t* shrunk = static_cast<uint16_t*>(
realloc(data, r.count * sizeof(uint16_t)));
if (shrunk == nullptr) shrunk = data;
return ExternTwoByteString::New(isolate, shrunk, r.count);
}
return String::NewFromTwoByte(isolate,
u16.out(),
v8::NewStringType::kNormal,
static_cast<int>(r.count));
}
} else if (simdutf::validate_utf8(buf, buflen)) {
size_t u16size = simdutf::utf16_length_from_utf8(buf, buflen);
if (u16size > static_cast<size_t>(v8::String::kMaxLength)) {
isolate->ThrowException(ERR_STRING_TOO_LONG(isolate));
return MaybeLocal<Value>();
}
return EncodeTwoByteString(
isolate, u16size, [buf, buflen, u16size](uint16_t* dst) {
size_t written = simdutf::convert_valid_utf8_to_utf16(
buf, buflen, reinterpret_cast<char16_t*>(dst));
CHECK_EQ(written, u16size);
});
}
return EncodeTwoByteString(
isolate, u16size, [buf, buflen, u16size](uint16_t* dst) {
size_t written = simdutf::convert_valid_utf8_to_utf16(
buf, buflen, reinterpret_cast<char16_t*>(dst));
CHECK_EQ(written, u16size);
});
}

val =
Expand Down
Loading