// lts/src/string_decoder.cc
#include "string_decoder.h" // NOLINT(build/include_inline)
#include "string_decoder-inl.h"
#include "env-inl.h"
#include "node_buffer.h"
#include "string_bytes.h"
#include "util.h"
using v8::Array;
using v8::ArrayBufferView;
using v8::Context;
using v8::FunctionCallbackInfo;
using v8::Integer;
using v8::Isolate;
using v8::Local;
using v8::MaybeLocal;
using v8::Object;
using v8::String;
using v8::Value;
namespace node {
namespace {
// Builds a JS string from `length` bytes at `data`, interpreted as `encoding`.
//
// UTF-8 input is handed to String::NewFromUtf8() and its result is returned
// as-is. Every other encoding goes through StringBytes::Encode(); if that
// yields nothing, the error it reported is thrown on `isolate` and an empty
// MaybeLocal is returned.
MaybeLocal<String> MakeString(Isolate* isolate,
                              const char* data,
                              size_t length,
                              enum encoding encoding) {
  if (encoding == UTF8) {
    return String::NewFromUtf8(
        isolate, data, v8::NewStringType::kNormal, length);
  }

  Local<Value> error;
  MaybeLocal<Value> encoded =
      StringBytes::Encode(isolate, data, length, encoding, &error);

  Local<Value> value;
  if (!encoded.ToLocal(&value)) {
    // A failed Encode() must have filled in the error to throw.
    CHECK(!error.IsEmpty());
    isolate->ThrowException(error);
    return MaybeLocal<String>();
  }

  DCHECK(value->IsString());
  return value.As<String>();
}
} // anonymous namespace
// Decodes the `*nread_ptr` bytes starting at `data` into a string, carrying
// characters that straddle a chunk boundary over to the next call.
//
// For UTF8, UCS2 and BASE64 the bytes of a trailing incomplete character are
// copied into IncompleteCharacterBuffer(); state_[kBufferedBytes] tracks how
// many bytes are stashed and state_[kMissingBytes] how many are still needed.
// Once a later call supplies the missing bytes, the completed character is
// decoded separately and prepended to that call's output. ASCII, HEX and
// LATIN1 input is passed straight to MakeString(); any other encoding fails
// the CHECK in the final branch.
//
// `*nread_ptr` is updated in place: BufferedBytes() is added when a buffered
// character is completed, and the number of bytes held back from the end of
// this chunk is subtracted.
//
// Returns the decoded string, or an empty MaybeLocal if MakeString() failed.
//
// NOTE(review): MissingBytes() / BufferedBytes() are defined elsewhere; this
// code relies on them reflecting state_[kMissingBytes] / state_[kBufferedBytes].
MaybeLocal<String> StringDecoder::DecodeData(Isolate* isolate,
const char* data,
size_t* nread_ptr) {
Local<String> prepend, body;
// Number of bytes of the current chunk that have not been consumed yet.
size_t nread = *nread_ptr;
if (Encoding() == UTF8 || Encoding() == UCS2 || Encoding() == BASE64) {
// See if we want bytes to finish a character from the previous
// chunk; if so, copy the new bytes to the missing bytes buffer
// and create a small string from it that is to be prepended to the
// main body.
if (MissingBytes() > 0) {
// There are never more bytes missing than the pre-calculated maximum.
CHECK_LE(MissingBytes() + BufferedBytes(),
kIncompleteCharactersEnd);
if (Encoding() == UTF8) {
// For UTF-8, we need special treatment to align with the V8 decoder:
// If an incomplete character is found at a chunk boundary, we use
// its remainder and pass it to V8 as-is.
for (size_t i = 0; i < nread && i < MissingBytes(); ++i) {
if ((data[i] & 0xC0) != 0x80) {
// This byte is not a continuation byte even though it should have
// been one. We stop decoding of the incomplete character at this
// point (but still use the rest of the incomplete bytes from this
// chunk) and assume that the new, unexpected byte starts a new one.
state_[kMissingBytes] = 0;
memcpy(IncompleteCharacterBuffer() + BufferedBytes(), data, i);
state_[kBufferedBytes] += i;
data += i;
nread -= i;
break;
}
}
}
// Take as many of the missing bytes as this chunk can provide. This is 0
// if the UTF-8 loop above already gave up on the buffered character.
size_t found_bytes =
std::min(nread, static_cast<size_t>(MissingBytes()));
memcpy(IncompleteCharacterBuffer() + BufferedBytes(),
data,
found_bytes);
// Adjust the two buffers.
data += found_bytes;
nread -= found_bytes;
state_[kMissingBytes] -= found_bytes;
state_[kBufferedBytes] += found_bytes;
if (LIKELY(MissingBytes() == 0)) {
// If no more bytes are missing, create a small string that we
// will later prepend.
if (!MakeString(isolate,
IncompleteCharacterBuffer(),
BufferedBytes(),
Encoding()).ToLocal(&prepend)) {
return MaybeLocal<String>();
}
*nread_ptr += BufferedBytes();
// No more buffered bytes.
state_[kBufferedBytes] = 0;
}
}
// It could be that trying to finish the previous chunk already
// consumed all data that we received in this chunk.
if (UNLIKELY(nread == 0)) {
// The completed character (if any) is the whole result; clearing `prepend`
// makes the return below hand back `body` without concatenating.
body = !prepend.IsEmpty() ? prepend : String::Empty(isolate);
prepend = Local<String>();
} else {
// If not, that means there is no character left to finish at this point.
DCHECK_EQ(MissingBytes(), 0);
DCHECK_EQ(BufferedBytes(), 0);
// See whether there is a character that we may have to cut off and
// finish when receiving the next chunk.
if (Encoding() == UTF8 && data[nread - 1] & 0x80) {
// This is UTF-8 encoded data and we ended on a non-ASCII UTF-8 byte.
// This means we'll need to figure out where the character to which
// the byte belongs begins.
// Walk backwards from the last byte; every path through the body ends in
// a `break` no later than i == 0, so the index never wraps around.
for (size_t i = nread - 1; ; --i) {
DCHECK_LT(i, nread);
state_[kBufferedBytes]++;
if ((data[i] & 0xC0) == 0x80) {
// This byte does not start a character (a "trailing" byte).
if (state_[kBufferedBytes] >= 4 || i == 0) {
// We either have more than 4 trailing bytes (which means
// the current character would not be inside the range for
// valid Unicode, and in particular cannot be represented
// through JavaScript's UTF-16-based approach to strings), or the
// current buffer does not contain the start of an UTF-8 character
// at all. Either way, this is invalid UTF8 and we can just
// let the engine's decoder handle it.
state_[kBufferedBytes] = 0;
break;
}
} else {
// Found the first byte of a UTF-8 character. By looking at the
// upper bits we can tell how long the character *should* be.
// kMissingBytes temporarily holds that total length; the bytes
// already seen are subtracted further down.
if ((data[i] & 0xE0) == 0xC0) {
state_[kMissingBytes] = 2;
} else if ((data[i] & 0xF0) == 0xE0) {
state_[kMissingBytes] = 3;
} else if ((data[i] & 0xF8) == 0xF0) {
state_[kMissingBytes] = 4;
} else {
// This lead byte would indicate a character outside of the
// representable range.
state_[kBufferedBytes] = 0;
break;
}
if (BufferedBytes() >= MissingBytes()) {
// Received more or exactly as many trailing bytes than the lead
// character would indicate. In the "==" case, we have valid
// data and don't need to slice anything off;
// in the ">" case, this is invalid UTF-8 anyway.
state_[kMissingBytes] = 0;
state_[kBufferedBytes] = 0;
}
// Convert "total length" into "bytes still needed" (0 - 0 if the
// branch above just reset both counters).
state_[kMissingBytes] -= state_[kBufferedBytes];
break;
}
}
} else if (Encoding() == UCS2) {
if ((nread % 2) == 1) {
// We got half a codepoint, and need the second byte of it.
state_[kBufferedBytes] = 1;
state_[kMissingBytes] = 1;
} else if ((data[nread - 1] & 0xFC) == 0xD8) {
// Half a split UTF-16 character.
state_[kBufferedBytes] = 2;
state_[kMissingBytes] = 2;
}
} else if (Encoding() == BASE64) {
// Hold back the trailing bytes that do not fill a complete 3-byte group.
state_[kBufferedBytes] = nread % 3;
if (state_[kBufferedBytes] > 0)
state_[kMissingBytes] = 3 - BufferedBytes();
}
if (BufferedBytes() > 0) {
// Copy the requested number of buffered bytes from the end of the
// input into the incomplete character buffer.
nread -= BufferedBytes();
*nread_ptr -= BufferedBytes();
memcpy(IncompleteCharacterBuffer(), data + nread, BufferedBytes());
}
// Decode whatever is left after holding back the incomplete tail.
if (nread > 0) {
if (!MakeString(isolate, data, nread, Encoding()).ToLocal(&body))
return MaybeLocal<String>();
} else {
body = String::Empty(isolate);
}
}
if (prepend.IsEmpty()) {
return body;
} else {
return String::Concat(isolate, prepend, body);
}
} else {
// These encodings are decoded in one go, without any buffering here.
CHECK(Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1);
return MakeString(isolate, data, nread, Encoding());
}
}
// Emits whatever is still sitting in the incomplete-character buffer as a
// string and resets the buffered/missing byte counters.
//
// Returns the empty string when nothing is buffered. For UCS2, a single
// dangling byte is dropped before flushing.
MaybeLocal<String> StringDecoder::FlushData(Isolate* isolate) {
  switch (Encoding()) {
    case ASCII:
    case HEX:
    case LATIN1:
      // These encodings must never have left anything behind.
      CHECK_EQ(MissingBytes(), 0);
      CHECK_EQ(BufferedBytes(), 0);
      break;
    default:
      break;
  }

  const bool has_dangling_ucs2_byte =
      Encoding() == UCS2 && BufferedBytes() % 2 == 1;
  if (has_dangling_ucs2_byte) {
    // Ignore a single trailing byte, like the JS decoder does.
    --state_[kMissingBytes];
    --state_[kBufferedBytes];
  }

  if (BufferedBytes() == 0)
    return String::Empty(isolate);

  MaybeLocal<String> flushed = MakeString(isolate,
                                          IncompleteCharacterBuffer(),
                                          BufferedBytes(),
                                          Encoding());

  // The buffer has been handed out; start from a clean slate.
  state_[kMissingBytes] = 0;
  state_[kBufferedBytes] = 0;

  return flushed;
}
namespace {
void DecodeData(const FunctionCallbackInfo<Value>& args) {
StringDecoder* decoder =
reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
CHECK_NOT_NULL(decoder);
CHECK(args[1]->IsArrayBufferView());
ArrayBufferViewContents<char> content(args[1].As<ArrayBufferView>());
size_t length = content.length();
MaybeLocal<String> ret =
decoder->DecodeData(args.GetIsolate(), content.data(), &length);
if (!ret.IsEmpty())
args.GetReturnValue().Set(ret.ToLocalChecked());
}
void FlushData(const FunctionCallbackInfo<Value>& args) {
StringDecoder* decoder =
reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
CHECK_NOT_NULL(decoder);
MaybeLocal<String> ret = decoder->FlushData(args.GetIsolate());
if (!ret.IsEmpty())
args.GetReturnValue().Set(ret.ToLocalChecked());
}
// Populates `target` with everything the JS side of the string decoder needs:
//   - the StringDecoder layout constants (kIncompleteCharactersStart, ...,
//     kNumFields), each under its C++ name,
//   - `encodings`, an array mapping each encoding enum value to its JS name,
//   - `kSize`, sizeof(StringDecoder),
//   - the `decode` and `flush` methods.
//
// `unused` and `priv` are part of the initializer signature and are not read.
void InitializeStringDecoder(Local<Object> target,
                             Local<Value> unused,
                             Local<Context> context,
                             void* priv) {
  Environment* env = Environment::GetCurrent(context);
  Isolate* isolate = env->isolate();

  // Only the success check of Set() matters here, so use Check() like the
  // direct Set() calls below rather than FromJust() with a discarded value.
#define SET_DECODER_CONSTANT(name)                                            \
  target->Set(context,                                                        \
              FIXED_ONE_BYTE_STRING(isolate, #name),                          \
              Integer::New(isolate, StringDecoder::name)).Check()

  SET_DECODER_CONSTANT(kIncompleteCharactersStart);
  SET_DECODER_CONSTANT(kIncompleteCharactersEnd);
  SET_DECODER_CONSTANT(kMissingBytes);
  SET_DECODER_CONSTANT(kBufferedBytes);
  SET_DECODER_CONSTANT(kEncodingField);
  SET_DECODER_CONSTANT(kNumFields);

  // Keep the helper macro from leaking into the rest of the translation unit.
#undef SET_DECODER_CONSTANT

  Local<Array> encodings = Array::New(isolate);

  // The array index is the numeric value of the encoding enum.
#define ADD_TO_ENCODINGS_ARRAY(cname, jsname)                                 \
  encodings->Set(context,                                                     \
                 static_cast<int32_t>(cname),                                 \
                 FIXED_ONE_BYTE_STRING(isolate, jsname)).Check()

  ADD_TO_ENCODINGS_ARRAY(ASCII, "ascii");
  ADD_TO_ENCODINGS_ARRAY(UTF8, "utf8");
  ADD_TO_ENCODINGS_ARRAY(BASE64, "base64");
  ADD_TO_ENCODINGS_ARRAY(UCS2, "utf16le");
  ADD_TO_ENCODINGS_ARRAY(HEX, "hex");
  ADD_TO_ENCODINGS_ARRAY(BUFFER, "buffer");
  ADD_TO_ENCODINGS_ARRAY(LATIN1, "latin1");

#undef ADD_TO_ENCODINGS_ARRAY

  target->Set(context,
              FIXED_ONE_BYTE_STRING(isolate, "encodings"),
              encodings).Check();

  target->Set(context,
              FIXED_ONE_BYTE_STRING(isolate, "kSize"),
              Integer::New(isolate, sizeof(StringDecoder))).Check();

  env->SetMethod(target, "decode", DecodeData);
  env->SetMethod(target, "flush", FlushData);
}
} // anonymous namespace
} // namespace node
// Hooks node::InitializeStringDecoder up as the initializer of the
// context-aware internal module named `string_decoder`.
NODE_MODULE_CONTEXT_AWARE_INTERNAL(string_decoder,
node::InitializeStringDecoder)