Skip to content

Commit d7bdcec

Browse files
authored
feat(NODE-6537): add support for binary vectors (#730)
1 parent f6e86bb commit d7bdcec

12 files changed

+585
-77
lines changed

Diff for: .evergreen/run-big-endian-test.sh

-5
This file was deleted.

Diff for: etc/Dockerfile

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
FROM node:22 AS build
2+
3+
WORKDIR /bson
4+
COPY . .
5+
6+
RUN rm -rf node_modules && npm install && npm test
7+
8+
FROM scratch
9+
10+
COPY --from=build /bson/docs/ /

Diff for: etc/run-big-endian-test.sh

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env bash
2+
3+
# At the time of writing, this script is not used in CI,
4+
# but can be used to locally iterate on big endian bugs.
5+
# buildx requires an output, so ./docs is used as the destination, which should be a no-op.
6+
7+
set -o errexit
8+
set -o nounset
9+
set -o pipefail
10+
set -o xtrace
11+
12+
# If you get an error you may have an outdated buildkit version
13+
# Try running this:
14+
# docker buildx rm builder && docker buildx create --name builder --bootstrap --use
15+
16+
docker buildx build \
17+
--progress=plain \
18+
--platform linux/s390x \
19+
--build-arg="NODE_ARCH=s390x" \
20+
-f ./etc/Dockerfile \
21+
--output type=local,dest=./docs,platform-split=false \
22+
.

Diff for: src/binary.ts

+218
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import { BSONError } from './error';
44
import { BSON_BINARY_SUBTYPE_UUID_NEW } from './constants';
55
import { ByteUtils } from './utils/byte_utils';
66
import { BSONValue } from './bson_value';
7+
import { NumberUtils } from './utils/number_utils';
78

89
/** @public */
910
export type BinarySequence = Uint8Array | number[];
@@ -58,9 +59,18 @@ export class Binary extends BSONValue {
5859
static readonly SUBTYPE_COLUMN = 7;
5960
/** Sensitive BSON type */
6061
static readonly SUBTYPE_SENSITIVE = 8;
62+
/** Vector BSON type */
63+
static readonly SUBTYPE_VECTOR = 9;
6164
/** User BSON type */
6265
static readonly SUBTYPE_USER_DEFINED = 128;
6366

67+
/** datatype of a Binary Vector (subtype: 9) */
68+
static readonly VECTOR_TYPE = Object.freeze({
69+
Int8: 0x03,
70+
Float32: 0x27,
71+
PackedBit: 0x10
72+
} as const);
73+
6474
/**
6575
* The bytes of the Binary value.
6676
*
@@ -238,6 +248,11 @@ export class Binary extends BSONValue {
238248
/** @internal */
239249
toExtendedJSON(options?: EJSONOptions): BinaryExtendedLegacy | BinaryExtended {
240250
options = options || {};
251+
252+
if (this.sub_type === Binary.SUBTYPE_VECTOR) {
253+
validateBinaryVector(this);
254+
}
255+
241256
const base64String = ByteUtils.toBase64(this.buffer);
242257

243258
const subType = Number(this.sub_type).toString(16);
@@ -310,6 +325,209 @@ export class Binary extends BSONValue {
310325
const subTypeArg = inspect(this.sub_type, options);
311326
return `Binary.createFromBase64(${base64Arg}, ${subTypeArg})`;
312327
}
328+
329+
/**
330+
* If this Binary represents an Int8 Vector (`binary.buffer[0] === Binary.VECTOR_TYPE.Int8`),
331+
* returns a copy of the bytes in a new Int8Array.
332+
*
333+
* If the Binary is not a Vector, or the datatype is not Int8, an error is thrown.
334+
*/
335+
public toInt8Array(): Int8Array {
336+
if (this.sub_type !== Binary.SUBTYPE_VECTOR) {
337+
throw new BSONError('Binary sub_type is not Vector');
338+
}
339+
340+
if (this.buffer[0] !== Binary.VECTOR_TYPE.Int8) {
341+
throw new BSONError('Binary datatype field is not Int8');
342+
}
343+
344+
return new Int8Array(
345+
this.buffer.buffer.slice(this.buffer.byteOffset + 2, this.buffer.byteOffset + this.position)
346+
);
347+
}
348+
349+
/**
350+
* If this Binary represents a Float32 Vector (`binary.buffer[0] === Binary.VECTOR_TYPE.Float32`),
351+
* returns a copy of the bytes in a new Float32Array.
352+
*
353+
* If the Binary is not a Vector, or the datatype is not Float32, an error is thrown.
354+
*/
355+
public toFloat32Array(): Float32Array {
356+
if (this.sub_type !== Binary.SUBTYPE_VECTOR) {
357+
throw new BSONError('Binary sub_type is not Vector');
358+
}
359+
360+
if (this.buffer[0] !== Binary.VECTOR_TYPE.Float32) {
361+
throw new BSONError('Binary datatype field is not Float32');
362+
}
363+
364+
const floatBytes = new Uint8Array(
365+
this.buffer.buffer.slice(this.buffer.byteOffset + 2, this.buffer.byteOffset + this.position)
366+
);
367+
368+
if (NumberUtils.isBigEndian) ByteUtils.swap32(floatBytes);
369+
370+
return new Float32Array(floatBytes.buffer);
371+
}
372+
373+
/**
374+
* If this Binary represents a packed bit Vector (`binary.buffer[0] === Binary.VECTOR_TYPE.PackedBit`),
375+
* returns a copy of the bytes that are packed bits.
376+
*
377+
* Use `toBits` to get the unpacked bits.
378+
*
379+
* If the Binary is not a Vector, or the datatype is not PackedBit, an error is thrown.
380+
*/
381+
public toPackedBits(): Uint8Array {
382+
if (this.sub_type !== Binary.SUBTYPE_VECTOR) {
383+
throw new BSONError('Binary sub_type is not Vector');
384+
}
385+
386+
if (this.buffer[0] !== Binary.VECTOR_TYPE.PackedBit) {
387+
throw new BSONError('Binary datatype field is not packed bit');
388+
}
389+
390+
return new Uint8Array(
391+
this.buffer.buffer.slice(this.buffer.byteOffset + 2, this.buffer.byteOffset + this.position)
392+
);
393+
}
394+
395+
/**
396+
* If this Binary represents a Packed bit Vector (`binary.buffer[0] === Binary.VECTOR_TYPE.PackedBit`),
397+
* returns a copy of the bits unpacked into a new Int8Array.
398+
*
399+
* Use `toPackedBits` to get the bits still in packed form.
400+
*
401+
* If the Binary is not a Vector, or the datatype is not PackedBit, an error is thrown.
402+
*/
403+
public toBits(): Int8Array {
404+
if (this.sub_type !== Binary.SUBTYPE_VECTOR) {
405+
throw new BSONError('Binary sub_type is not Vector');
406+
}
407+
408+
if (this.buffer[0] !== Binary.VECTOR_TYPE.PackedBit) {
409+
throw new BSONError('Binary datatype field is not packed bit');
410+
}
411+
412+
const byteCount = this.length() - 2;
413+
const bitCount = byteCount * 8 - this.buffer[1];
414+
const bits = new Int8Array(bitCount);
415+
416+
for (let bitOffset = 0; bitOffset < bits.length; bitOffset++) {
417+
const byteOffset = (bitOffset / 8) | 0;
418+
const byte = this.buffer[byteOffset + 2];
419+
const shift = 7 - (bitOffset % 8);
420+
const bit = (byte >> shift) & 1;
421+
bits[bitOffset] = bit;
422+
}
423+
424+
return bits;
425+
}
426+
427+
/**
428+
* Constructs a Binary representing an Int8 Vector.
429+
* @param array - The array whose bytes are copied into the new Binary instance
430+
*/
431+
public static fromInt8Array(array: Int8Array): Binary {
432+
const buffer = ByteUtils.allocate(array.byteLength + 2);
433+
buffer[0] = Binary.VECTOR_TYPE.Int8;
434+
buffer[1] = 0;
435+
const intBytes = new Uint8Array(array.buffer, array.byteOffset, array.byteLength);
436+
buffer.set(intBytes, 2);
437+
return new this(buffer, this.SUBTYPE_VECTOR);
438+
}
439+
440+
/** Constructs a Binary representing a Float32 Vector. */
441+
public static fromFloat32Array(array: Float32Array): Binary {
442+
const binaryBytes = ByteUtils.allocate(array.byteLength + 2);
443+
binaryBytes[0] = Binary.VECTOR_TYPE.Float32;
444+
binaryBytes[1] = 0;
445+
446+
const floatBytes = new Uint8Array(array.buffer, array.byteOffset, array.byteLength);
447+
binaryBytes.set(floatBytes, 2);
448+
449+
if (NumberUtils.isBigEndian) ByteUtils.swap32(new Uint8Array(binaryBytes.buffer, 2));
450+
451+
return new this(binaryBytes, this.SUBTYPE_VECTOR);
452+
}
453+
454+
/**
455+
* Constructs a Binary representing a packed bit Vector.
456+
*
457+
* Use `fromBits` to pack an array of 1s and 0s.
458+
*/
459+
public static fromPackedBits(array: Uint8Array, padding = 0): Binary {
460+
const buffer = ByteUtils.allocate(array.byteLength + 2);
461+
buffer[0] = Binary.VECTOR_TYPE.PackedBit;
462+
buffer[1] = padding;
463+
buffer.set(array, 2);
464+
return new this(buffer, this.SUBTYPE_VECTOR);
465+
}
466+
467+
/**
468+
* Constructs a Binary representing a Packed Bit Vector.
469+
* @param bits - The array of 1s and 0s to pack into the Binary instance
470+
*/
471+
public static fromBits(bits: ArrayLike<number>): Binary {
472+
const byteLength = (bits.length + 7) >>> 3; // ceil(bits.length / 8)
473+
const bytes = new Uint8Array(byteLength + 2);
474+
bytes[0] = Binary.VECTOR_TYPE.PackedBit;
475+
476+
const remainder = bits.length % 8;
477+
bytes[1] = remainder === 0 ? 0 : 8 - remainder;
478+
479+
for (let bitOffset = 0; bitOffset < bits.length; bitOffset++) {
480+
const byteOffset = bitOffset >>> 3; // floor(bitOffset / 8)
481+
const bit = bits[bitOffset];
482+
483+
if (bit !== 0 && bit !== 1) {
484+
throw new BSONError(
485+
`Invalid bit value at ${bitOffset}: must be 0 or 1, found ${bits[bitOffset]}`
486+
);
487+
}
488+
489+
if (bit === 0) continue;
490+
491+
const shift = 7 - (bitOffset % 8);
492+
bytes[byteOffset + 2] |= bit << shift;
493+
}
494+
495+
return new this(bytes, Binary.SUBTYPE_VECTOR);
496+
}
497+
}
498+
499+
export function validateBinaryVector(vector: Binary): void {
500+
if (vector.sub_type !== Binary.SUBTYPE_VECTOR) return;
501+
502+
const size = vector.position;
503+
504+
// NOTE: Validation is only applied to **KNOWN** vector types
505+
// If a new datatype is introduced, a future version of the library will need to add validation
506+
const datatype = vector.buffer[0];
507+
508+
// NOTE: We do not enable noUncheckedIndexedAccess so TS believes this is always number
509+
// a Binary vector may be empty, in which case the padding is undefined
510+
// this possible value is tolerable for our validation checks
511+
const padding: number | undefined = vector.buffer[1];
512+
513+
if (
514+
(datatype === Binary.VECTOR_TYPE.Float32 || datatype === Binary.VECTOR_TYPE.Int8) &&
515+
padding !== 0
516+
) {
517+
throw new BSONError('Invalid Vector: padding must be zero for int8 and float32 vectors');
518+
}
519+
520+
if (datatype === Binary.VECTOR_TYPE.PackedBit && padding !== 0 && size === 2) {
521+
throw new BSONError(
522+
'Invalid Vector: padding must be zero for packed bit vectors that are empty'
523+
);
524+
}
525+
526+
if (datatype === Binary.VECTOR_TYPE.PackedBit && padding > 7) {
527+
throw new BSONError(
528+
`Invalid Vector: padding must be a value between 0 and 7. found: ${padding}`
529+
);
530+
}
313531
}
314532

315533
/** @public */

Diff for: src/parser/serializer.ts

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { Binary } from '../binary';
1+
import { Binary, validateBinaryVector } from '../binary';
22
import type { BSONSymbol, DBRef, Document, MaxKey } from '../bson';
33
import type { Code } from '../code';
44
import * as constants from '../constants';
@@ -495,6 +495,10 @@ function serializeBinary(buffer: Uint8Array, key: string, value: Binary, index:
495495
index += NumberUtils.setInt32LE(buffer, index, size);
496496
}
497497

498+
if (value.sub_type === Binary.SUBTYPE_VECTOR) {
499+
validateBinaryVector(value);
500+
}
501+
498502
if (size <= 16) {
499503
for (let i = 0; i < size; i++) buffer[index + i] = data[i];
500504
} else {

Diff for: src/utils/byte_utils.ts

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ export type ByteUtils = {
3939
encodeUTF8Into: (destination: Uint8Array, source: string, byteOffset: number) => number;
4040
/** Generate a Uint8Array filled with random bytes with byteLength */
4141
randomBytes: (byteLength: number) => Uint8Array;
42+
/** Interprets `buffer` as an array of 32-bit values and swaps the byte order in-place. */
43+
swap32: (buffer: Uint8Array) => Uint8Array;
4244
};
4345

4446
declare const Buffer: { new (): unknown; prototype?: { _isBuffer?: boolean } } | undefined;

Diff for: src/utils/node_byte_utils.ts

+6-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ type NodeJsBuffer = ArrayBufferView &
99
copy(target: Uint8Array, targetStart: number, sourceStart: number, sourceEnd: number): number;
1010
toString: (this: Uint8Array, encoding: NodeJsEncoding, start?: number, end?: number) => string;
1111
equals: (this: Uint8Array, other: Uint8Array) => boolean;
12+
swap32: (this: NodeJsBuffer) => NodeJsBuffer;
1213
};
1314
type NodeJsBufferConstructor = Omit<Uint8ArrayConstructor, 'from'> & {
1415
alloc: (size: number) => NodeJsBuffer;
@@ -159,5 +160,9 @@ export const nodeJsByteUtils = {
159160
return nodeJsByteUtils.toLocalBufferType(buffer).write(source, byteOffset, undefined, 'utf8');
160161
},
161162

162-
randomBytes: nodejsRandomBytes
163+
randomBytes: nodejsRandomBytes,
164+
165+
swap32(buffer: Uint8Array): NodeJsBuffer {
166+
return nodeJsByteUtils.toLocalBufferType(buffer).swap32();
167+
}
163168
};

Diff for: src/utils/number_utils.ts

+4
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ const isBigEndian = FLOAT_BYTES[7] === 0;
1313
* A collection of functions that get or set various numeric types and bit widths from a Uint8Array.
1414
*/
1515
export type NumberUtils = {
16+
/** Is true if the current system is big endian. */
17+
isBigEndian: boolean;
1618
/**
1719
* Parses a signed int32 at offset. Throws a `RangeError` if value is negative.
1820
*/
@@ -35,6 +37,8 @@ export type NumberUtils = {
3537
* @public
3638
*/
3739
export const NumberUtils: NumberUtils = {
40+
isBigEndian,
41+
3842
getNonnegativeInt32LE(source: Uint8Array, offset: number): number {
3943
if (source[offset + 3] > 127) {
4044
throw new RangeError(`Size cannot be negative at offset: ${offset}`);

Diff for: src/utils/web_byte_utils.ts

+20-1
Original file line numberDiff line numberDiff line change
@@ -193,5 +193,24 @@ export const webByteUtils = {
193193
return bytes.byteLength;
194194
},
195195

196-
randomBytes: webRandomBytes
196+
randomBytes: webRandomBytes,
197+
198+
swap32(buffer: Uint8Array): Uint8Array {
199+
if (buffer.length % 4 !== 0) {
200+
throw new RangeError('Buffer size must be a multiple of 32-bits');
201+
}
202+
203+
for (let i = 0; i < buffer.length; i += 4) {
204+
const byte0 = buffer[i];
205+
const byte1 = buffer[i + 1];
206+
const byte2 = buffer[i + 2];
207+
const byte3 = buffer[i + 3];
208+
buffer[i] = byte3;
209+
buffer[i + 1] = byte2;
210+
buffer[i + 2] = byte1;
211+
buffer[i + 3] = byte0;
212+
}
213+
214+
return buffer;
215+
}
197216
};

0 commit comments

Comments
 (0)