mirror of https://github.com/nodejs/node.git
Implemented Utf8Decoder module
Allows safely decoding a UTF-8 stream into strings without splitting multibyte characters.
pull/22966/head
parent
e232f09d38
commit
caba9c70c3
|
@ -0,0 +1,89 @@
|
|||
var Buffer = require('buffer').Buffer;
|
||||
|
||||
/**
 * Incremental UTF-8 decoder.
 *
 * Feed it Buffers through write(); decoded text is delivered by calling
 * `this.onString(str)`, which the user must assign on the instance before
 * writing. A multibyte character whose bytes are spread over several
 * write() calls is held back until all of its bytes have arrived, so it is
 * never emitted in pieces.
 */
var Utf8Decoder = exports.Utf8Decoder = function() {
  // Scratch space for a character that straddles write() calls
  // (a UTF-8 character is at most 4 bytes long).
  this.charBuffer = new Buffer(4);
  // Number of bytes of the pending character stored in charBuffer so far.
  this.charReceived = 0;
  // Total byte length of the pending character; 0 means nothing is pending.
  this.charLength = 0;
};

/**
 * Decodes the next chunk of a UTF-8 byte stream.
 *
 * Every complete character available after this call is passed to
 * `this.onString` in a single string. Trailing bytes that only start a
 * multibyte character are kept in `this.charBuffer` until a later write()
 * completes them. If the chunk does not complete a pending character,
 * `onString` is not called at all.
 *
 * @param {Buffer} buffer - The next bytes of the stream.
 * @returns {undefined}
 */
Utf8Decoder.prototype.write = function(buffer) {
  var chunk = buffer;  // part of `buffer` that still has to be processed
  var charStr = '';    // character completed by this write, if any
  var consumed;        // bytes of this write that belong to the pending char
  var tailLength;      // number of trailing bytes forming an incomplete char
  var leadByte;
  var completeLength;

  // if our last write ended with an incomplete multibyte character
  if (this.charLength) {
    // determine how many remaining bytes this buffer has to offer for it
    consumed = Math.min(chunk.length, this.charLength - this.charReceived);

    // add the new bytes to the char buffer
    chunk.copy(this.charBuffer, this.charReceived, 0, consumed);
    this.charReceived += consumed;

    if (this.charReceived < this.charLength) {
      // still not enough bytes for this character? wait for more ...
      return;
    }

    // get the character that was split
    charStr = this.charBuffer.slice(0, this.charLength).toString();
    this.charReceived = this.charLength = 0;

    if (consumed === chunk.length) {
      // if there are no more bytes in this buffer, just emit our char
      this.onString(charStr);
      return;
    }

    // otherwise cut off the character's end from the beginning of the buffer
    chunk = chunk.slice(consumed, chunk.length);
  }

  // Only the last 3 bytes can start an unfinished character, because the
  // longest sequence handled here is 4 bytes.
  // See http://en.wikipedia.org/wiki/UTF-8#Description
  for (tailLength = Math.min(3, chunk.length); tailLength > 0; tailLength--) {
    leadByte = chunk[chunk.length - tailLength];

    // 110XXXXX: 2-byte lead, incomplete only when it is the very last byte
    if (tailLength === 1 && leadByte >> 5 === 0x06) {
      this.charLength = 2;
      break;
    }

    // 1110XXXX: 3-byte lead, incomplete when at most 2 bytes are present
    if (tailLength <= 2 && leadByte >> 4 === 0x0E) {
      this.charLength = 3;
      break;
    }

    // 11110XXX: 4-byte lead, incomplete when at most 3 bytes are present
    if (tailLength <= 3 && leadByte >> 3 === 0x1E) {
      this.charLength = 4;
      break;
    }
  }

  if (!this.charLength) {
    // no incomplete char at the end of this buffer, emit the whole thing
    this.onString(charStr + chunk.toString());
    return;
  }

  // buffer the incomplete character bytes we got
  completeLength = chunk.length - tailLength;
  chunk.copy(this.charBuffer, 0, completeLength, chunk.length);
  this.charReceived = tailLength;

  if (completeLength > 0) {
    // buffer had more bytes before the incomplete char, emit them
    this.onString(charStr + chunk.slice(0, completeLength).toString());
  } else if (charStr) {
    // or just emit the charStr if any
    this.onString(charStr);
  }
};
|
|
@ -1820,6 +1820,7 @@ static Handle<Value> Binding(const Arguments& args) {
|
|||
exports->Set(String::New("utils"), String::New(native_utils));
|
||||
exports->Set(String::New("path"), String::New(native_path));
|
||||
exports->Set(String::New("module"), String::New(native_module));
|
||||
exports->Set(String::New("utf8decoder"), String::New(native_utf8decoder));
|
||||
binding_cache->Set(module, exports);
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
require('../common');
|
||||
var Utf8Decoder = require('utf8decoder').Utf8Decoder,
|
||||
Buffer = require('buffer').Buffer,
|
||||
decoder = new Utf8Decoder(),
|
||||
buffer,
|
||||
onStringCalled = 0;
|
||||
|
||||
decoder.onString = function(str) {
|
||||
onStringCalled++;
|
||||
assert.deepEqual(str, buffer.toString());
|
||||
};
|
||||
|
||||
buffer = new Buffer('$');
|
||||
decoder.write(buffer);
|
||||
assert.equal(onStringCalled, 1);
|
||||
|
||||
buffer = new Buffer('¢');
|
||||
decoder.write(buffer.slice(0, 1));
|
||||
decoder.write(buffer.slice(1, 2));
|
||||
assert.equal(onStringCalled, 2);
|
||||
|
||||
buffer = new Buffer('€');
|
||||
decoder.write(buffer.slice(0, 1));
|
||||
decoder.write(buffer.slice(1, 2));
|
||||
decoder.write(buffer.slice(2, 3));
|
||||
assert.equal(onStringCalled, 3);
|
||||
|
||||
buffer = new Buffer([0xF0, 0xA4, 0xAD, 0xA2]);
|
||||
decoder.write(buffer.slice(0, 1));
|
||||
decoder.write(buffer.slice(1, 2));
|
||||
decoder.write(buffer.slice(2, 3));
|
||||
decoder.write(buffer.slice(3, 4));
|
||||
assert.equal(onStringCalled, 4);
|
||||
|
||||
// A mixed ascii and non-ascii string
|
||||
// Test stolen from deps/v8/test/cctest/test-strings.cc
|
||||
// U+02E4 -> CB A4
|
||||
// U+0064 -> 64
|
||||
// U+12E4 -> E1 8B A4
|
||||
// U+0030 -> 30
|
||||
// U+3045 -> E3 81 85
|
||||
expected = "\u02e4\u0064\u12e4\u0030\u3045";
|
||||
buffer = new Buffer([0xCB, 0xA4, 0x64, 0xE1, 0x8B, 0xA4, 0x30, 0xE3, 0x81, 0x85]);
|
||||
charLengths = [0, 0, 1, 2, 2, 2, 3, 4, 4, 4, 5, 5];
|
||||
|
||||
// Split the buffer into 3 segments
|
||||
// |----|------|-------|
|
||||
// 0 i j buffer.length
|
||||
// Scan through every possible 3 segment combination
|
||||
// and make sure that the string is always parsed.
|
||||
print('scanning ');
|
||||
for (var j = 2; j < buffer.length; j++) {
|
||||
for (var i = 1; i < j; i++) {
|
||||
var decoder = new Utf8Decoder();
|
||||
var sum = "";
|
||||
decoder.onString = function (s) { sum += s; };
|
||||
|
||||
decoder.write(buffer.slice(0, i));
|
||||
|
||||
// just check that we've received the right amount
|
||||
// after the first write
|
||||
assert.equal(charLengths[i], sum.length);
|
||||
|
||||
decoder.write(buffer.slice(i, j));
|
||||
decoder.write(buffer.slice(j, buffer.length));
|
||||
assert.equal(expected, sum);
|
||||
print(".");
|
||||
}
|
||||
}
|
||||
puts(" crayon!");
|
||||
|
Loading…
Reference in New Issue