Skip to content

Commit 7c3ef65

Browse files
temp try to detect overlong encoding w/o textDecoder
1 parent 88fb767 commit 7c3ef65

File tree

7 files changed

+289
-11
lines changed

7 files changed

+289
-11
lines changed

etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ export class RequireVendor {
1414
* @returns {{ code: string; map: import('magic-string').SourceMap }}
1515
*/
1616
transform(code, id) {
17+
// TODO(NODE-4930)
1718
if (!id.includes('web_byte_utils')) {
1819
return;
1920
}

src/error.ts

+10
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,13 @@ export class BSONOffsetError extends BSONError {
103103
this.offset = offset;
104104
}
105105
}
106+
107+
/**
 * Error subclass of `BSONError` for UTF-8 problems. Because it extends `BSONError`,
 * an `instanceof BSONError` check also matches instances of this class.
 *
 * Elsewhere in this change it is thrown with the message
 * 'Invalid UTF-8 string in BSON document' when a fatal `TextDecoder` decode fails,
 * with the decoder's error passed along as `cause`.
 */
export class BSONUTF8Error extends BSONError {
108+
// The return type is the string literal itself, so `error.name` can be compared exactly.
public get name(): 'BSONUTF8Error' {
109+
return 'BSONUTF8Error';
110+
}
111+
112+
// Forwards `message` and the optional `{ cause }` bag to `BSONError` unchanged.
constructor(message: string, options?: { cause?: unknown }) {
113+
super(message, options);
114+
}
115+
}

src/test.ts

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
/**
 * Decodes the code point carried by one UTF-8 sequence.
 *
 * The sequence length is taken from the lead byte (`arr[0]`), not from `arr.length`,
 * so a caller may pass only a prefix of the sequence; missing trailing bytes
 * contribute zero payload bits. Extra elements beyond the sequence are ignored.
 * Trailing bytes are not checked for the `10xxxxxx` form; only their low six bits
 * are used. The input array is never modified.
 *
 * @param arr - bytes of the sequence, lead byte first
 * @returns the decoded value (which may be smaller than the minimum for that length,
 *   i.e. an overlong form), or `Infinity` when `arr[0]` cannot start a UTF-8 sequence
 *   (a continuation byte or 0xF8-0xFF), so it never compares as "fits in N bytes"
 */
function parseUtf8Bits(arr: number[]): number {
  const lead = arr[0] ?? 0;

  let trailing: number;
  let codePoint: number;
  if ((lead & 0x80) === 0) {
    // 0xxxxxxx: single byte, the byte is the code point
    return lead;
  } else if ((lead & 0xe0) === 0xc0) {
    // 110xxxxx: 5 payload bits + 1 trailing byte
    trailing = 1;
    codePoint = lead & 0x1f;
  } else if ((lead & 0xf0) === 0xe0) {
    // 1110xxxx: 4 payload bits + 2 trailing bytes
    trailing = 2;
    codePoint = lead & 0x0f;
  } else if ((lead & 0xf8) === 0xf0) {
    // 11110xxx: 3 payload bits + 3 trailing bytes
    trailing = 3;
    codePoint = lead & 0x07;
  } else {
    return Number.POSITIVE_INFINITY;
  }

  for (let i = 1; i <= trailing; i++) {
    // each continuation byte (10xxxxxx) contributes its low six bits
    codePoint = (codePoint << 6) | ((arr[i] ?? 0) & 0x3f);
  }
  return codePoint;
}

src/utils/node_byte_utils.ts

+23-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { BSONError } from '../error';
1+
import { BSONError, BSONUTF8Error } from '../error';
22
import { validateUtf8 } from '../validate_utf8';
33
import { tryReadBasicLatin, tryWriteBasicLatin } from './latin';
44

@@ -27,6 +27,28 @@ type NodeJsBufferConstructor = Omit<Uint8ArrayConstructor, 'from'> & {
2727
declare const Buffer: NodeJsBufferConstructor;
2828
declare const require: (mod: 'crypto') => { randomBytes: (byteLength: number) => Uint8Array };
2929

30+
/**
 * Local structural type for a text decoder instance: three readonly settings
 * plus `decode`, which turns an optional `Uint8Array` into a string.
 * NOTE(review): presumably declared here so this file does not rely on DOM/Node
 * lib typings for `TextDecoder` — confirm against the project's tsconfig.
 */
type TextDecoder = {
31+
readonly encoding: string;
32+
readonly fatal: boolean;
33+
readonly ignoreBOM: boolean;
34+
decode(input?: Uint8Array): string;
35+
};
36+
/**
 * Constructor shape for `TextDecoder`. Only the label 'utf8' is accepted by this type,
 * and the options object with `fatal` is required.
 */
type TextDecoderConstructor = {
37+
new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder;
38+
};
39+
40+
/** Local structural type for a text encoder instance: `encoding` plus `encode`. */
type TextEncoder = {
41+
readonly encoding: string;
42+
encode(input?: string): Uint8Array;
43+
};
44+
/** Constructor shape for `TextEncoder`; it takes no arguments. */
type TextEncoderConstructor = {
45+
new (): TextEncoder;
46+
};
47+
48+
// Ambient declarations for the `TextDecoder` / `TextEncoder` globals: these only tell the
// compiler the names exist with the constructor shapes above; nothing is defined here.
49+
declare const TextDecoder: TextDecoderConstructor;
50+
declare const TextEncoder: TextEncoderConstructor;
51+
3052
/** @internal */
3153
export function nodejsMathRandomBytes(byteLength: number) {
3254
return nodeJsByteUtils.fromNumberArray(

src/utils/web_byte_utils.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { BSONError } from '../error';
1+
import { BSONError, BSONUTF8Error } from '../error';
22
import { tryReadBasicLatin } from './latin';
33

44
type TextDecoder = {
@@ -183,7 +183,7 @@ export const webByteUtils = {
183183
try {
184184
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
185185
} catch (cause) {
186-
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
186+
throw new BSONUTF8Error('Invalid UTF-8 string in BSON document', { cause });
187187
}
188188
}
189189
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));

src/validate_utf8.ts

+24-3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import { NumberUtils } from "./utils/number_utils";
2+
13
const FIRST_BIT = 0x80;
24
const FIRST_TWO_BITS = 0xc0;
35
const FIRST_THREE_BITS = 0xe0;
@@ -9,6 +11,12 @@ const THREE_BIT_CHAR = 0xe0;
911
const FOUR_BIT_CHAR = 0xf0;
1012
const CONTINUING_CHAR = 0x80;
1113

14+
// Largest code point representable by a UTF-8 sequence of the given length.
// A decoded value at or below the maximum of a shorter length is an overlong encoding.
const ONE_BYTE_MAX = 0x7f; // 0xxxxxxx: 7 payload bits
const TWO_BYTE_MAX = 0x7ff; // 110xxxxx 10xxxxxx: 5 + 6 payload bits
const THREE_BYTE_MAX = 0xffff; // 1110xxxx 10xxxxxx 10xxxxxx: 4 + 6 + 6 payload bits
18+
19+
1220
/**
1321
* Determines if the passed in bytes are valid utf8
1422
* @param bytes - An array of 8-bit bytes. Must be indexable and have length property
@@ -30,12 +38,15 @@ export function validateUtf8(
3038
return false;
3139
}
3240
continuation -= 1;
33-
} else if (byte & FIRST_BIT) {
41+
} else if (byte & FIRST_BIT &&
42+
parseUtf8Bytes([byte, bytes[i+1]]) > ONE_BYTE_MAX) {
3443
if ((byte & FIRST_THREE_BITS) === TWO_BIT_CHAR) {
3544
continuation = 1;
36-
} else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR) {
45+
} else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR &&
46+
parseUtf8Bytes([byte, bytes[i+1], bytes[i+2]]) > TWO_BYTE_MAX) {
3747
continuation = 2;
38-
} else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR) {
48+
} else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR &&
49+
parseUtf8Bytes([byte, bytes[i+1], bytes[i+2], bytes[i+3]]) > THREE_BYTE_MAX) {
3950
continuation = 3;
4051
} else {
4152
return false;
@@ -45,3 +56,13 @@ export function validateUtf8(
4556

4657
return !continuation;
4758
}
59+
60+
/**
 * Decodes the code point carried by one UTF-8 sequence.
 *
 * The sequence length is taken from the lead byte (`arr[0]`), not from `arr.length`,
 * so a caller may pass only a prefix of the sequence; missing trailing bytes
 * (including `undefined` read past the end of a buffer) contribute zero payload bits.
 * Extra elements beyond the sequence are ignored. Trailing bytes are not checked for
 * the `10xxxxxx` form; only their low six bits are used. The input array is never modified.
 *
 * @param arr - bytes of the sequence, lead byte first
 * @returns the decoded value (which may be smaller than the minimum for that length,
 *   i.e. an overlong form), or `Infinity` when `arr[0]` cannot start a UTF-8 sequence
 *   (a continuation byte or 0xF8-0xFF), so it never compares as "fits in N bytes"
 */
function parseUtf8Bytes(arr: number[]): number {
  const lead = arr[0] ?? 0;

  let trailing: number;
  let codePoint: number;
  if ((lead & 0x80) === 0) {
    // 0xxxxxxx: single byte, the byte is the code point
    return lead;
  } else if ((lead & 0xe0) === 0xc0) {
    // 110xxxxx: 5 payload bits + 1 trailing byte
    trailing = 1;
    codePoint = lead & 0x1f;
  } else if ((lead & 0xf0) === 0xe0) {
    // 1110xxxx: 4 payload bits + 2 trailing bytes
    trailing = 2;
    codePoint = lead & 0x0f;
  } else if ((lead & 0xf8) === 0xf0) {
    // 11110xxx: 3 payload bits + 3 trailing bytes
    trailing = 3;
    codePoint = lead & 0x07;
  } else {
    return Number.POSITIVE_INFINITY;
  }

  for (let i = 1; i <= trailing; i++) {
    // each continuation byte (10xxxxxx) contributes its low six bits
    codePoint = (codePoint << 6) | ((arr[i] ?? 0) & 0x3f);
  }
  return codePoint;
}

test/node/byte_utils.test.ts

+220-5
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils';
88
import * as sinon from 'sinon';
99
import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson';
1010
import * as crypto from 'node:crypto';
11+
import { BSONError, BSONUTF8Error } from '../../src/error';
1112

1213
type ByteUtilTest<K extends keyof ByteUtils> = {
1314
name: string;
@@ -399,6 +400,8 @@ const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [
399400
}
400401
}
401402
];
403+
404+
402405
const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
403406
{
404407
name: 'should create utf8 string from buffer input',
@@ -416,6 +419,14 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
416419
expect(output).to.be.a('string').with.lengthOf(0);
417420
}
418421
},
422+
{
423+
name: 'should insert replacement character fatal is false and string is invalid',
424+
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
425+
expectation({ error, output }) {
426+
expect(error).to.not.exist;
427+
expect(output).to.equal('abc\uFFFD');
428+
}
429+
},
419430
{
420431
name: 'should throw an error if fatal is set and string is invalid',
421432
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
@@ -424,14 +435,168 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
424435
}
425436
},
426437
{
427-
name: 'should insert replacement character fatal is false and string is invalid',
428-
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
429-
expectation({ error, output }) {
430-
expect(error).to.not.exist;
431-
expect(output).to.equal('abc\uFFFD');
438+
name: 'throw an error if fatal is set and string contains overlong encoding',
439+
inputs: [Buffer.from('11000000025f0005000000f08282ac0000', 'hex'), 0, 18, true],
440+
expectation({ error }) {
441+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
442+
}
443+
},
444+
{
445+
name: 'throw an error if fatal is set and string contains invalid bytes',
446+
inputs: [Buffer.from('abcff', 'hex'), 0, 2, true],
447+
expectation({ error }) {
448+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
449+
}
450+
},
451+
{
452+
name: 'throw an error if fatal is set and string contains an unexpected continuation byte',
453+
inputs: [Buffer.from('7F80', 'hex'), 0, 2, true],
454+
expectation({ error }) {
455+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
456+
}
457+
},
458+
{ inputs: [Buffer.from('0xFF', 'hex'), 0, 1, true], name: 'throws when provided with invalid code' , expectation({ error }) {
459+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
460+
}
461+
},
462+
{ inputs: [Buffer.from('0xC0', 'hex'), 0, 1, true], name: 'throws when provided with ends early' , expectation({ error }) {
463+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
464+
}
465+
},
466+
{ inputs: [Buffer.from('0xE0', 'hex'), 0, 1, true], name: 'throws when provided with ends early 2' , expectation({ error }) {
467+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
468+
}
469+
},
470+
{ inputs: [Buffer.from('0xC000', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail' , expectation({ error }) {
471+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
472+
}
473+
},
474+
{ inputs: [Buffer.from('0xC0C0', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail 2' , expectation({ error }) {
475+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
476+
}
477+
},
478+
{ inputs: [Buffer.from('0xE000', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail 3' , expectation({ error }) {
479+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
480+
}
481+
},
482+
{ inputs: [Buffer.from('0xE0C0', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail 4' , expectation({ error }) {
483+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
484+
}
485+
},
486+
{ inputs: [Buffer.from('0xE08000', 'hex'), 0, 3, true], name: 'throws when provided with invalid trail 5' , expectation({ error }) {
487+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
488+
}
489+
},
490+
{ inputs: [Buffer.from('0xE080C0', 'hex'), 0, 3, true], name: 'throws when provided with invalid trail 6' , expectation({ error }) {
491+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
492+
}
493+
},
494+
{ inputs: [Buffer.from('0xFC8080808080', 'hex'), 0, 6, true], name: 'throws when provided with > 0x10FFFF' , expectation({ error }) {
495+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
496+
}
497+
},
498+
{ inputs: [Buffer.from('0xFE8080808080', 'hex'), 0, 6, true], name: 'throws when provided with obsolete lead byte' , expectation({ error }) {
499+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
500+
}
501+
},
502+
503+
// Overlong encodings
504+
{ inputs: [Buffer.from('0xC080', 'hex'), 0, 2, true], name: 'throws when provided with overlong U+0000 - 2 bytes' , expectation({ error }) {
505+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
506+
}
507+
},
508+
{ inputs: [Buffer.from('0xE08080', 'hex'), 0, 3, true], name: 'throws when provided with overlong U+0000 - 3 bytes' , expectation({ error }) {
509+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
510+
}
511+
},
512+
{ inputs: [Buffer.from('0xF0808080', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+0000 - 4 bytes' , expectation({ error }) {
513+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
514+
}
515+
},
516+
{ inputs: [Buffer.from('0xF880808080', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+0000 - 5 bytes' , expectation({ error }) {
517+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
518+
}
519+
},
520+
{ inputs: [Buffer.from('0xFC8080808080', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+0000 - 6 bytes' , expectation({ error }) {
521+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
522+
}
523+
},
524+
525+
{ inputs: [Buffer.from('0xC1BF', 'hex'), 0, 2, true], name: 'throws when provided with overlong U+007F - 2 bytes' , expectation({ error }) {
526+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
527+
}
528+
},
529+
{ inputs: [Buffer.from('0xE081BF', 'hex'), 0, 3, true], name: 'throws when provided with overlong U+007F - 3 bytes' , expectation({ error }) {
530+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
531+
}
532+
},
533+
{ inputs: [Buffer.from('0xF08081BF', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+007F - 4 bytes' , expectation({ error }) {
534+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
535+
}
536+
},
537+
{ inputs: [Buffer.from('0xF8808081BF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+007F - 5 bytes' , expectation({ error }) {
538+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
539+
}
540+
},
541+
{ inputs: [Buffer.from('0xFC80808081BF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+007F - 6 bytes' , expectation({ error }) {
542+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
543+
}
544+
},
545+
546+
{ inputs: [Buffer.from('0xE09FBF', 'hex'), 0, 3, true], name: 'throws when provided with overlong U+07FF - 3 bytes' , expectation({ error }) {
547+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
548+
}
549+
},
550+
{ inputs: [Buffer.from('0xF0809FBF', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+07FF - 4 bytes' , expectation({ error }) {
551+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
552+
}
553+
},
554+
{ inputs: [Buffer.from('0xF880809FBF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+07FF - 5 bytes' , expectation({ error }) {
555+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
556+
}
557+
},
558+
{ inputs: [Buffer.from('0xFC8080809FBF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+07FF - 6 bytes' , expectation({ error }) {
559+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
560+
}
561+
},
562+
563+
{ inputs: [Buffer.from('0xF08FBFBF', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+FFFF - 4 bytes' , expectation({ error }) {
564+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
565+
}
566+
},
567+
{ inputs: [Buffer.from('0xF8808FBFBF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+FFFF - 5 bytes' , expectation({ error }) {
568+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
569+
}
570+
},
571+
{ inputs: [Buffer.from('0xFC80808FBFBF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+FFFF - 6 bytes' , expectation({ error }) {
572+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
573+
}
574+
},
575+
576+
{ inputs: [Buffer.from('0xF8848FBFBF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+10FFFF - 5 bytes' , expectation({ error }) {
577+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
578+
}
579+
},
580+
{ inputs: [Buffer.from('0xFC80848FBFBF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+10FFFF - 6 bytes' , expectation({ error }) {
581+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
582+
}
583+
},
584+
585+
// UTF-16 surrogates encoded as code points in UTF-8
586+
{ inputs: [Buffer.from('0xEDA080', 'hex'), 0, 3, true], name: 'throws when provided with lead surrogate' , expectation({ error }) {
587+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
588+
}
589+
},
590+
{ inputs: [Buffer.from('0xEDB080', 'hex'), 0, 3, true], name: 'throws when provided with trail surrogate' , expectation({ error }) {
591+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
592+
}
593+
},
594+
{ inputs: [Buffer.from('0xEDA080EDB080', 'hex'), 0, 6, true], name: 'throws when provided with surrogate pair' , expectation({ error }) {
595+
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
432596
}
433597
}
434598
];
599+
435600
const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
436601
{
437602
name: 'should return zero for empty string',
@@ -801,4 +966,54 @@ describe('ByteUtils', () => {
801966
});
802967
}
803968
}
969+
970+
// Byte sequences that `nodeJsByteUtils.toUTF8` is expected to reject when `fatal` is true.
// Never reassigned, hence `const`.
const bad = [
  { encoding: 'utf-8', input: [0xFF], name: 'invalid code' },
  { encoding: 'utf-8', input: [0xC0], name: 'ends early' },
  { encoding: 'utf-8', input: [0xE0], name: 'ends early 2' },
  { encoding: 'utf-8', input: [0xC0, 0x00], name: 'invalid trail' },
  { encoding: 'utf-8', input: [0xC0, 0xC0], name: 'invalid trail 2' },
  { encoding: 'utf-8', input: [0xE0, 0x00], name: 'invalid trail 3' },
  { encoding: 'utf-8', input: [0xE0, 0xC0], name: 'invalid trail 4' },
  { encoding: 'utf-8', input: [0xE0, 0x80, 0x00], name: 'invalid trail 5' },
  { encoding: 'utf-8', input: [0xE0, 0x80, 0xC0], name: 'invalid trail 6' },
  { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10FFFF' },
  { encoding: 'utf-8', input: [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },

  // Overlong encodings
  { encoding: 'utf-8', input: [0xC0, 0x80], name: 'overlong U+0000 - 2 bytes' },
  { encoding: 'utf-8', input: [0xE0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
  { encoding: 'utf-8', input: [0xF0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
  { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
  { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 6 bytes' },

  { encoding: 'utf-8', input: [0xC1, 0xBF], name: 'overlong U+007F - 2 bytes' },
  { encoding: 'utf-8', input: [0xE0, 0x81, 0xBF], name: 'overlong U+007F - 3 bytes' },
  { encoding: 'utf-8', input: [0xF0, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 4 bytes' },
  { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 5 bytes' },
  { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 6 bytes' },

  { encoding: 'utf-8', input: [0xE0, 0x9F, 0xBF], name: 'overlong U+07FF - 3 bytes' },
  { encoding: 'utf-8', input: [0xF0, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 4 bytes' },
  { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 5 bytes' },
  { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 6 bytes' },

  { encoding: 'utf-8', input: [0xF0, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 4 bytes' },
  { encoding: 'utf-8', input: [0xF8, 0x80, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 5 bytes' },
  { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 6 bytes' },

  { encoding: 'utf-8', input: [0xF8, 0x84, 0x8F, 0xBF, 0xBF], name: 'overlong U+10FFFF - 5 bytes' },
  { encoding: 'utf-8', input: [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], name: 'overlong U+10FFFF - 6 bytes' },

  // UTF-16 surrogates encoded as code points in UTF-8
  { encoding: 'utf-8', input: [0xED, 0xA0, 0x80], name: 'lead surrogate' },
  { encoding: 'utf-8', input: [0xED, 0xB0, 0x80], name: 'trail surrogate' },
  { encoding: 'utf-8', input: [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], name: 'surrogate pair' },
];

for (const test of bad) {
  // Plain `it`, not `it.only`: an exclusive test left in the file makes the runner
  // skip every test that is not marked exclusive.
  it(`${test.name}`, () => {
    expect(() =>
      nodeJsByteUtils.toUTF8(Uint8Array.from(test.input), 0, test.input.length, true)
    ).to.throw(BSONError);
  });
}
8041019
});

0 commit comments

Comments
 (0)