Skip to content

fix(NODE-6123): utf8 validation is insufficiently strict #676

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 22 additions & 12 deletions etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import MagicString from 'magic-string';

const REQUIRE_POLYFILLS =
`const { TextEncoder, TextDecoder } = require('../vendor/text-encoding');
const REQUIRE_WEB_UTILS_POLYFILLS =
`const { TextEncoder } = require('../vendor/text-encoding');
const { encode: btoa, decode: atob } = require('../vendor/base64');\n`

const REQUIRE_PARSE_UTF8_POLYFILLS =
`const { TextDecoder } = require('../vendor/text-encoding');\n`;

export class RequireVendor {
/**
* Take the compiled source code input; types are expected to already have been removed.
Expand All @@ -14,17 +17,24 @@ export class RequireVendor {
* @returns {{ code: string; map: import('magic-string').SourceMap }}
*/
transform(code, id) {
if (!id.includes('web_byte_utils')) {
return;
}
if (id.includes('parse_utf8')) {
// MagicString lets us edit the source code and still generate an accurate source map
const magicString = new MagicString(code);
magicString.prepend(REQUIRE_PARSE_UTF8_POLYFILLS);

// MagicString lets us edit the source code and still generate an accurate source map
const magicString = new MagicString(code);
magicString.prepend(REQUIRE_POLYFILLS);
return {
code: magicString.toString(),
map: magicString.generateMap({ hires: true })
};
} else if (id.includes('web_byte_utils')) {
// MagicString lets us edit the source code and still generate an accurate source map
const magicString = new MagicString(code);
magicString.prepend(REQUIRE_WEB_UTILS_POLYFILLS);

return {
code: magicString.toString(),
map: magicString.generateMap({ hires: true })
};
return {
code: magicString.toString(),
map: magicString.generateMap({ hires: true })
};
}
}
}
35 changes: 35 additions & 0 deletions src/parse_utf8.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import { BSONError } from './error';

type TextDecoder = {
readonly encoding: string;
readonly fatal: boolean;
readonly ignoreBOM: boolean;
decode(input?: Uint8Array): string;
};
type TextDecoderConstructor = {
new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder;
};

// parse utf8 globals
declare const TextDecoder: TextDecoderConstructor;
let TextDecoderFatal: TextDecoder;
let TextDecoderNonFatal: TextDecoder;

/**
* Determines if the passed in bytes are valid utf8
* @param bytes - An array of 8-bit bytes. Must be indexable and have length property
* @param start - The index to start validating
* @param end - The index to end validating
*/
export function parseUtf8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string {
if (fatal) {
try {
TextDecoderFatal ??= new TextDecoder('utf8', { fatal: true });
return TextDecoderFatal.decode(buffer.subarray(start, end));
} catch (cause) {
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
}
}
TextDecoderNonFatal ??= new TextDecoder('utf8', { fatal: false });
return TextDecoderNonFatal.decode(buffer.subarray(start, end));
}
8 changes: 1 addition & 7 deletions src/parser/deserializer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ import { BSONSymbol } from '../symbol';
import { Timestamp } from '../timestamp';
import { ByteUtils } from '../utils/byte_utils';
import { NumberUtils } from '../utils/number_utils';
import { validateUtf8 } from '../validate_utf8';

/** @public */
export interface DeserializeOptions {
Expand Down Expand Up @@ -604,12 +603,7 @@ function deserializeObject(
)
throw new BSONError('bad string length in bson');
// Namespace
if (validation != null && validation.utf8) {
if (!validateUtf8(buffer, index, index + stringSize - 1)) {
throw new BSONError('Invalid UTF-8 string in BSON document');
}
}
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false);
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey);
// Update parse index position
index = index + stringSize;

Expand Down
7 changes: 2 additions & 5 deletions src/utils/node_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { BSONError } from '../error';
import { validateUtf8 } from '../validate_utf8';
import { parseUtf8 } from '../parse_utf8';
import { tryReadBasicLatin, tryWriteBasicLatin } from './latin';

type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary';
Expand Down Expand Up @@ -136,12 +136,9 @@ export const nodeJsByteUtils = {

const string = nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end);
if (fatal) {
// TODO(NODE-4930): Insufficiently strict BSON UTF8 validation
for (let i = 0; i < string.length; i++) {
if (string.charCodeAt(i) === 0xfffd) {
if (!validateUtf8(buffer, start, end)) {
throw new BSONError('Invalid UTF-8 string in BSON document');
}
parseUtf8(buffer, start, end, true);
break;
}
}
Expand Down
10 changes: 2 additions & 8 deletions src/utils/web_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { BSONError } from '../error';
import { tryReadBasicLatin } from './latin';
import { parseUtf8 } from '../parse_utf8';

type TextDecoder = {
readonly encoding: string;
Expand Down Expand Up @@ -179,14 +180,7 @@ export const webByteUtils = {
return basicLatin;
}

if (fatal) {
try {
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
} catch (cause) {
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
}
}
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
return parseUtf8(uint8array, start, end, fatal);
},

utf8ByteLength(input: string): number {
Expand Down
47 changes: 0 additions & 47 deletions src/validate_utf8.ts

This file was deleted.

124 changes: 118 additions & 6 deletions test/node/byte_utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils';
import * as sinon from 'sinon';
import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson';
import * as crypto from 'node:crypto';
import { BSONError } from '../../src/error';

type ByteUtilTest<K extends keyof ByteUtils> = {
name: string;
Expand Down Expand Up @@ -399,6 +400,7 @@ const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [
}
}
];

const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
{
name: 'should create utf8 string from buffer input',
Expand All @@ -417,21 +419,57 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
}
},
{
name: 'should throw an error if fatal is set and string is invalid',
name: 'should insert replacement character fatal is false and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
expectation({ error, output }) {
expect(error).to.not.exist;
expect(output).to.equal('abc\uFFFD');
}
},
{
name: 'should throw an error if fatal is set and string is a sequence that decodes to an invalid code point',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'should insert replacement character fatal is false and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
expectation({ error, output }) {
expect(error).to.not.exist;
expect(output).to.equal('abc\uFFFD');
name: 'throw an error if fatal is set and string contains overlong encoding',
inputs: [Buffer.from('11000000025f0005000000f08282ac0000', 'hex'), 0, 18, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'throw an error if fatal is set and string contains invalid bytes',
inputs: [Buffer.from('abcff', 'hex'), 0, 2, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'throw an error if fatal is set and string contains an unexpected continuation byte',
inputs: [Buffer.from('7F80', 'hex'), 0, 2, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'throw an error if fatal is set and string contains a non-continuation byte before the end of the character',
inputs: [Buffer.from('c000', 'hex'), 0, 2, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'throw an error if fatal is set and string ends before the end of the character',
inputs: [Buffer.from('c0', 'hex'), 0, 1, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
}
];

const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
{
name: 'should return zero for empty string',
Expand Down Expand Up @@ -493,6 +531,72 @@ const randomBytesTests: ByteUtilTest<'randomBytes'>[] = [
}
];

// extra error cases copied from wpt/encoding/textdecoder-fatal.any.js
// commit sha: 7c9f867
const toUTF8WebPlatformSpecTests = [
{ encoding: 'utf-8', input: [0xff], name: 'invalid code' },
{ encoding: 'utf-8', input: [0xc0], name: 'ends early' },
{ encoding: 'utf-8', input: [0xe0], name: 'ends early 2' },
{ encoding: 'utf-8', input: [0xc0, 0x00], name: 'invalid trail' },
{ encoding: 'utf-8', input: [0xc0, 0xc0], name: 'invalid trail 2' },
{ encoding: 'utf-8', input: [0xe0, 0x00], name: 'invalid trail 3' },
{ encoding: 'utf-8', input: [0xe0, 0xc0], name: 'invalid trail 4' },
{ encoding: 'utf-8', input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' },
{ encoding: 'utf-8', input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' },
{ encoding: 'utf-8', input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' },
{ encoding: 'utf-8', input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },

// Overlong encodings
{ encoding: 'utf-8', input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' },
{ encoding: 'utf-8', input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
{ encoding: 'utf-8', input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80],
name: 'overlong U+0000 - 6 bytes'
},

{ encoding: 'utf-8', input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' },
{ encoding: 'utf-8', input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' },
{ encoding: 'utf-8', input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' },
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf],
name: 'overlong U+007f - 6 bytes'
},

{ encoding: 'utf-8', input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' },
{ encoding: 'utf-8', input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' },
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf],
name: 'overlong U+07ff - 6 bytes'
},

{ encoding: 'utf-8', input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' },
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf],
name: 'overlong U+ffff - 6 bytes'
},

{ encoding: 'utf-8', input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf],
name: 'overlong U+10ffff - 6 bytes'
},

// UTf-16 surrogates encoded as code points in UTf-8
{ encoding: 'utf-8', input: [0xed, 0xa0, 0x80], name: 'lead surrogate' },
{ encoding: 'utf-8', input: [0xed, 0xb0, 0x80], name: 'trail surrogate' },
{ encoding: 'utf-8', input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' }
];

const utils = new Map([
['nodeJsByteUtils', nodeJsByteUtils],
['webByteUtils', webByteUtils]
Expand Down Expand Up @@ -798,6 +902,14 @@ describe('ByteUtils', () => {
test.expectation({ web: byteUtilsName === 'webByteUtils', output, error });
});
}
if (utility === 'toUTF8')
for (const test of toUTF8WebPlatformSpecTests) {
it(`throws error when fatal is set and provided ${test.name} as input`, () => {
expect(() =>
byteUtils[utility](Uint8Array.from(test.input), 0, test.input.length, true)
).to.throw(BSONError, /Invalid UTF-8 string in BSON document/i);
});
}
});
}
}
Expand Down
2 changes: 1 addition & 1 deletion test/node/release.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ const REQUIRED_FILES = [
'src/utils/number_utils.ts',
'src/utils/web_byte_utils.ts',
'src/utils/latin.ts',
'src/validate_utf8.ts',
'src/parse_utf8.ts',
'vendor/base64/base64.js',
'vendor/base64/package.json',
'vendor/base64/LICENSE-MIT.txt',
Expand Down