Skip to content

Commit c682ae3

Browse files
authored
fix(deserialize): fix deserialization of 0xFFFD
When deserializing a string, we previously relied on the appearance of 0xFFFD to denote an invalid Unicode character. However, 0xFFFD is a valid Unicode character if that is what was originally input. Fixes NODE-1718. Fixes #277.
1 parent 06af813 commit c682ae3

File tree

3 files changed

+65
-5
lines changed

3 files changed

+65
-5
lines changed

Diff for: lib/parser/deserializer.js

+5-5
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ const DBRef = require('../db_ref');
1414
const BSONRegExp = require('../regexp');
1515
const Binary = require('../binary');
1616
const constants = require('../constants');
17+
const validateUtf8 = require('../validate_utf8').validateUtf8;
1718

1819
// Internal long versions
1920
const JS_INT_MAX_LONG = Long.fromNumber(constants.JS_INT_MAX);
@@ -134,13 +135,12 @@ function deserializeObject(buffer, index, options, isArray) {
134135
)
135136
throw new Error('bad string length in bson');
136137

137-
const s = buffer.toString('utf8', index, index + stringSize - 1);
138-
for (i = 0; i < s.length; i++) {
139-
if (s.charCodeAt(i) === 0xfffd) {
140-
throw new Error('Invalid UTF-8 string in BSON document');
141-
}
138+
if (!validateUtf8(buffer, index, index + stringSize - 1)) {
139+
throw new Error('Invalid UTF-8 string in BSON document');
142140
}
143141

142+
const s = buffer.toString('utf8', index, index + stringSize - 1);
143+
144144
object[name] = s;
145145
index = index + stringSize;
146146
} else if (elementType === constants.BSON_DATA_OID) {

Diff for: lib/validate_utf8.js

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
'use strict';
2+
3+
// Masks selecting the leading 1..5 bits of a byte.
const FIRST_BIT = 0x80;
const FIRST_TWO_BITS = 0xc0;
const FIRST_THREE_BITS = 0xe0;
const FIRST_FOUR_BITS = 0xf0;
const FIRST_FIVE_BITS = 0xf8;

// Expected values of the masked leading bits for each kind of byte.
const TWO_BIT_CHAR = 0xc0; // 110xxxxx: lead byte of a 2-byte sequence
const THREE_BIT_CHAR = 0xe0; // 1110xxxx: lead byte of a 3-byte sequence
const FOUR_BIT_CHAR = 0xf0; // 11110xxx: lead byte of a 4-byte sequence
const CONTINUING_CHAR = 0x80; // 10xxxxxx: continuation byte

// Default inclusive range of a continuation byte.
const CONTINUATION_MIN = 0x80;
const CONTINUATION_MAX = 0xbf;

/**
 * Determines if the passed in bytes are well-formed utf8.
 *
 * Besides checking the lead/continuation bit patterns, this rejects the
 * sequences that have a valid shape but are still ill-formed UTF-8:
 * overlong encodings (lead bytes 0xC0/0xC1, 0xE0 followed by < 0xA0,
 * 0xF0 followed by < 0x90), UTF-16 surrogates (0xED followed by > 0x9F)
 * and code points above U+10FFFF (0xF4 followed by > 0x8F, lead bytes
 * 0xF5..0xF7). A genuine U+FFFD (0xEF 0xBF 0xBD) is accepted.
 *
 * @param {Buffer|Uint8Array} bytes An array of 8-bit bytes. Must be indexable and have length property
 * @param {Number} start The index to start validating (inclusive)
 * @param {Number} end The index to end validating (exclusive)
 * @returns {boolean} True if valid utf8
 */
function validateUtf8(bytes, start, end) {
  // Number of continuation bytes still expected for the current sequence.
  let continuation = 0;
  // Inclusive range allowed for the NEXT continuation byte. Only the first
  // continuation byte after certain lead bytes is restricted further.
  let lowerBound = CONTINUATION_MIN;
  let upperBound = CONTINUATION_MAX;

  for (let i = start; i < end; i += 1) {
    const byte = bytes[i];

    if (continuation) {
      // The bit-pattern test is kept in addition to the range test so that
      // a non-numeric value (e.g. reading past the end) is still rejected.
      if (
        (byte & FIRST_TWO_BITS) !== CONTINUING_CHAR ||
        byte < lowerBound ||
        byte > upperBound
      ) {
        return false;
      }
      // Only the first continuation byte carries a tightened range.
      lowerBound = CONTINUATION_MIN;
      upperBound = CONTINUATION_MAX;
      continuation -= 1;
    } else if (byte & FIRST_BIT) {
      if ((byte & FIRST_THREE_BITS) === TWO_BIT_CHAR) {
        // 0xC0 and 0xC1 could only encode code points below U+0080 (overlong).
        if (byte < 0xc2) {
          return false;
        }
        continuation = 1;
      } else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR) {
        continuation = 2;
        if (byte === 0xe0) {
          // Below 0xA0 would be an overlong encoding of a code point < U+0800.
          lowerBound = 0xa0;
        } else if (byte === 0xed) {
          // Above 0x9F would encode a surrogate (U+D800..U+DFFF).
          upperBound = 0x9f;
        }
      } else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR) {
        // 0xF5..0xF7 can only encode code points above U+10FFFF.
        if (byte > 0xf4) {
          return false;
        }
        continuation = 3;
        if (byte === 0xf0) {
          // Below 0x90 would be an overlong encoding of a code point < U+10000.
          lowerBound = 0x90;
        } else if (byte === 0xf4) {
          // Above 0x8F would encode a code point above U+10FFFF.
          upperBound = 0x8f;
        }
      } else {
        // Stray continuation byte or a lead byte of the form 11111xxx.
        return false;
      }
    }
  }

  // A sequence that is cut off by `end` is invalid.
  return !continuation;
}
47+
48+
module.exports.validateUtf8 = validateUtf8;

Diff for: test/node/string_test.js

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
'use strict';
2+
3+
const BSON = require('../../lib/bson');
4+
5+
describe('string tests', function() {
  // Node's built-in assert module; required here so the suite is self-contained.
  const assert = require('assert');

  // U+FFFD is the Unicode replacement character. A string that legitimately
  // contains it must survive a serialize/deserialize round trip unchanged,
  // not merely deserialize without throwing.
  it('can serialize and deserialize 0xFFFD', function() {
    const unicodeString = String.fromCharCode(0x41, 0x42, 0xfffd, 0x43, 0x44); // "AB�CD"

    const serialized = BSON.serialize({ value: unicodeString });
    const deserialized = BSON.deserialize(serialized);

    assert.strictEqual(deserialized.value, unicodeString);
  });
});

0 commit comments

Comments
 (0)