Skip to content

Commit ae8bac7

Browse files
fix(NODE-6123): utf8 validation is insufficiently strict (#676)
1 parent c58d1e2 commit ae8bac7

File tree

10 files changed

+191
-89
lines changed

10 files changed

+191
-89
lines changed
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
import MagicString from 'magic-string';
22

3-
const REQUIRE_POLYFILLS =
4-
`const { TextEncoder, TextDecoder } = require('../vendor/text-encoding');
3+
const REQUIRE_WEB_UTILS_POLYFILLS =
4+
`const { TextEncoder } = require('../vendor/text-encoding');
55
const { encode: btoa, decode: atob } = require('../vendor/base64');\n`
66

7+
const REQUIRE_PARSE_UTF8_POLYFILLS =
8+
`const { TextDecoder } = require('../vendor/text-encoding');\n`;
9+
710
export class RequireVendor {
811
/**
912
* Take the compiled source code input; types are expected to already have been removed.
@@ -14,17 +17,24 @@ export class RequireVendor {
1417
* @returns {{ code: string; map: import('magic-string').SourceMap }}
1518
*/
1619
transform(code, id) {
17-
if (!id.includes('web_byte_utils')) {
18-
return;
19-
}
20+
if (id.includes('parse_utf8')) {
21+
// MagicString lets us edit the source code and still generate an accurate source map
22+
const magicString = new MagicString(code);
23+
magicString.prepend(REQUIRE_PARSE_UTF8_POLYFILLS);
2024

21-
// MagicString lets us edit the source code and still generate an accurate source map
22-
const magicString = new MagicString(code);
23-
magicString.prepend(REQUIRE_POLYFILLS);
25+
return {
26+
code: magicString.toString(),
27+
map: magicString.generateMap({ hires: true })
28+
};
29+
} else if (id.includes('web_byte_utils')) {
30+
// MagicString lets us edit the source code and still generate an accurate source map
31+
const magicString = new MagicString(code);
32+
magicString.prepend(REQUIRE_WEB_UTILS_POLYFILLS);
2433

25-
return {
26-
code: magicString.toString(),
27-
map: magicString.generateMap({ hires: true })
28-
};
34+
return {
35+
code: magicString.toString(),
36+
map: magicString.generateMap({ hires: true })
37+
};
38+
}
2939
}
3040
}

src/parse_utf8.ts

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import { BSONError } from './error';
2+
3+
type TextDecoder = {
4+
readonly encoding: string;
5+
readonly fatal: boolean;
6+
readonly ignoreBOM: boolean;
7+
decode(input?: Uint8Array): string;
8+
};
9+
type TextDecoderConstructor = {
10+
new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder;
11+
};
12+
13+
// parse utf8 globals
14+
declare const TextDecoder: TextDecoderConstructor;
15+
let TextDecoderFatal: TextDecoder;
16+
let TextDecoderNonFatal: TextDecoder;
17+
18+
/**
19+
* Determines if the passed in bytes are valid utf8
20+
* @param bytes - An array of 8-bit bytes. Must be indexable and have length property
21+
* @param start - The index to start validating
22+
* @param end - The index to end validating
23+
*/
24+
export function parseUtf8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string {
25+
if (fatal) {
26+
TextDecoderFatal ??= new TextDecoder('utf8', { fatal: true });
27+
try {
28+
return TextDecoderFatal.decode(buffer.subarray(start, end));
29+
} catch (cause) {
30+
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
31+
}
32+
}
33+
TextDecoderNonFatal ??= new TextDecoder('utf8', { fatal: false });
34+
return TextDecoderNonFatal.decode(buffer.subarray(start, end));
35+
}

src/parser/deserializer.ts

+1-7
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ import { BSONSymbol } from '../symbol';
1616
import { Timestamp } from '../timestamp';
1717
import { ByteUtils } from '../utils/byte_utils';
1818
import { NumberUtils } from '../utils/number_utils';
19-
import { validateUtf8 } from '../validate_utf8';
2019

2120
/** @public */
2221
export interface DeserializeOptions {
@@ -604,12 +603,7 @@ function deserializeObject(
604603
)
605604
throw new BSONError('bad string length in bson');
606605
// Namespace
607-
if (validation != null && validation.utf8) {
608-
if (!validateUtf8(buffer, index, index + stringSize - 1)) {
609-
throw new BSONError('Invalid UTF-8 string in BSON document');
610-
}
611-
}
612-
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false);
606+
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey);
613607
// Update parse index position
614608
index = index + stringSize;
615609

src/utils/node_byte_utils.ts

+2-5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { BSONError } from '../error';
2-
import { validateUtf8 } from '../validate_utf8';
2+
import { parseUtf8 } from '../parse_utf8';
33
import { tryReadBasicLatin, tryWriteBasicLatin } from './latin';
44

55
type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary';
@@ -136,12 +136,9 @@ export const nodeJsByteUtils = {
136136

137137
const string = nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end);
138138
if (fatal) {
139-
// TODO(NODE-4930): Insufficiently strict BSON UTF8 validation
140139
for (let i = 0; i < string.length; i++) {
141140
if (string.charCodeAt(i) === 0xfffd) {
142-
if (!validateUtf8(buffer, start, end)) {
143-
throw new BSONError('Invalid UTF-8 string in BSON document');
144-
}
141+
parseUtf8(buffer, start, end, true);
145142
break;
146143
}
147144
}

src/utils/web_byte_utils.ts

+2-8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { BSONError } from '../error';
22
import { tryReadBasicLatin } from './latin';
3+
import { parseUtf8 } from '../parse_utf8';
34

45
type TextDecoder = {
56
readonly encoding: string;
@@ -179,14 +180,7 @@ export const webByteUtils = {
179180
return basicLatin;
180181
}
181182

182-
if (fatal) {
183-
try {
184-
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
185-
} catch (cause) {
186-
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
187-
}
188-
}
189-
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
183+
return parseUtf8(uint8array, start, end, fatal);
190184
},
191185

192186
utf8ByteLength(input: string): number {

src/validate_utf8.ts

-47
This file was deleted.

test/node/byte_utils.test.ts

+16-8
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils';
88
import * as sinon from 'sinon';
99
import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson';
1010
import * as crypto from 'node:crypto';
11+
import { utf8WebPlatformSpecTests } from './data/utf8_wpt_error_cases';
1112

1213
type ByteUtilTest<K extends keyof ByteUtils> = {
1314
name: string;
@@ -399,6 +400,7 @@ const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [
399400
}
400401
}
401402
];
403+
402404
const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
403405
{
404406
name: 'should create utf8 string from buffer input',
@@ -416,22 +418,28 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
416418
expect(output).to.be.a('string').with.lengthOf(0);
417419
}
418420
},
419-
{
420-
name: 'should throw an error if fatal is set and string is invalid',
421-
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
422-
expectation({ error }) {
423-
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
424-
}
425-
},
426421
{
427422
name: 'should insert replacement character fatal is false and string is invalid',
428423
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
429424
expectation({ error, output }) {
430425
expect(error).to.not.exist;
431426
expect(output).to.equal('abc\uFFFD');
432427
}
433-
}
428+
},
429+
...utf8WebPlatformSpecTests.map(t => ({
430+
name: t.name,
431+
inputs: [Uint8Array.from(t.input), 0, t.input.length, true] as [
432+
buffer: Uint8Array,
433+
start: number,
434+
end: number,
435+
fatal: boolean
436+
],
437+
expectation({ error }) {
438+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
439+
}
440+
}))
434441
];
442+
435443
const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
436444
{
437445
name: 'should return zero for empty string',
+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
// extra error cases copied from wpt/encoding/textdecoder-fatal.any.js
2+
// commit sha: 7c9f867
3+
// link: https://github.com/web-platform-tests/wpt/commit/7c9f8674d9809731e8919073d957d6233f6e0544
4+
5+
export const utf8WebPlatformSpecTests = [
6+
{ encoding: 'utf-8', input: [0xff], name: 'invalid code' },
7+
{ encoding: 'utf-8', input: [0xc0], name: 'ends early' },
8+
{ encoding: 'utf-8', input: [0xe0], name: 'ends early 2' },
9+
{ encoding: 'utf-8', input: [0xc0, 0x00], name: 'invalid trail' },
10+
{ encoding: 'utf-8', input: [0xc0, 0xc0], name: 'invalid trail 2' },
11+
{ encoding: 'utf-8', input: [0xe0, 0x00], name: 'invalid trail 3' },
12+
{ encoding: 'utf-8', input: [0xe0, 0xc0], name: 'invalid trail 4' },
13+
{ encoding: 'utf-8', input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' },
14+
{ encoding: 'utf-8', input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' },
15+
{ encoding: 'utf-8', input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' },
16+
{ encoding: 'utf-8', input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },
17+
18+
// Overlong encodings
19+
{ encoding: 'utf-8', input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' },
20+
{ encoding: 'utf-8', input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
21+
{ encoding: 'utf-8', input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
22+
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
23+
{
24+
encoding: 'utf-8',
25+
input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80],
26+
name: 'overlong U+0000 - 6 bytes'
27+
},
28+
29+
{ encoding: 'utf-8', input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' },
30+
{ encoding: 'utf-8', input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' },
31+
{ encoding: 'utf-8', input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' },
32+
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' },
33+
{
34+
encoding: 'utf-8',
35+
input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf],
36+
name: 'overlong U+007f - 6 bytes'
37+
},
38+
39+
{ encoding: 'utf-8', input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' },
40+
{ encoding: 'utf-8', input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' },
41+
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' },
42+
{
43+
encoding: 'utf-8',
44+
input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf],
45+
name: 'overlong U+07ff - 6 bytes'
46+
},
47+
48+
{ encoding: 'utf-8', input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' },
49+
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' },
50+
{
51+
encoding: 'utf-8',
52+
input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf],
53+
name: 'overlong U+ffff - 6 bytes'
54+
},
55+
56+
{ encoding: 'utf-8', input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' },
57+
{
58+
encoding: 'utf-8',
59+
input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf],
60+
name: 'overlong U+10ffff - 6 bytes'
61+
},
62+
63+
// UTf-16 surrogates encoded as code points in UTf-8
64+
{ encoding: 'utf-8', input: [0xed, 0xa0, 0x80], name: 'lead surrogate' },
65+
{ encoding: 'utf-8', input: [0xed, 0xb0, 0x80], name: 'trail surrogate' },
66+
{ encoding: 'utf-8', input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' }
67+
];

test/node/parser/deserializer.test.ts

+45-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import * as BSON from '../../register-bson';
22
import { expect } from 'chai';
3-
import { bufferFromHexArray } from '../tools/utils';
3+
import { bufferFromHexArray, int32LEToHex } from '../tools/utils';
4+
import { utf8WebPlatformSpecTests } from '../data/utf8_wpt_error_cases';
45

56
describe('deserializer()', () => {
67
describe('when the fieldsAsRaw options is present and has a value that corresponds to a key in the object', () => {
@@ -58,4 +59,47 @@ describe('deserializer()', () => {
5859
expect(resultCodeWithScope).to.have.deep.nested.property('a.scope', { b: true });
5960
});
6061
});
62+
63+
describe('utf8 validation', () => {
64+
for (const test of utf8WebPlatformSpecTests) {
65+
const inputStringSize = int32LEToHex(test.input.length + 1); // int32 size of string
66+
const inputHexString = Buffer.from(test.input).toString('hex');
67+
const buffer = bufferFromHexArray([
68+
'02', // string
69+
'6100', // 'a' key with null terminator
70+
inputStringSize,
71+
inputHexString,
72+
'00'
73+
]);
74+
context(`when utf8 validation is on and input is ${test.name}`, () => {
75+
it(`throws error containing 'Invalid UTF-8'`, () => {
76+
// global case
77+
expect(() => BSON.deserialize(buffer, { validation: { utf8: true } })).to.throw(
78+
BSON.BSONError,
79+
/Invalid UTF-8 string in BSON document/i
80+
);
81+
82+
// specific case
83+
expect(() => BSON.deserialize(buffer, { validation: { utf8: { a: true } } })).to.throw(
84+
BSON.BSONError,
85+
/Invalid UTF-8 string in BSON document/i
86+
);
87+
});
88+
});
89+
90+
context(`when utf8 validation is off and input is ${test.name}`, () => {
91+
it('returns a string containing at least 1 replacement character', () => {
92+
// global case
93+
expect(BSON.deserialize(buffer, { validation: { utf8: false } }))
94+
.to.have.property('a')
95+
.that.includes('\uFFFD');
96+
97+
// specific case
98+
expect(BSON.deserialize(buffer, { validation: { utf8: { a: false } } }))
99+
.to.have.property('a')
100+
.that.includes('\uFFFD');
101+
});
102+
});
103+
}
104+
});
61105
});

test/node/release.test.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ const REQUIRED_FILES = [
5050
'src/utils/number_utils.ts',
5151
'src/utils/web_byte_utils.ts',
5252
'src/utils/latin.ts',
53-
'src/validate_utf8.ts',
53+
'src/parse_utf8.ts',
5454
'vendor/base64/base64.js',
5555
'vendor/base64/package.json',
5656
'vendor/base64/LICENSE-MIT.txt',

0 commit comments

Comments
 (0)