feat(NODE-6537): add support for binary vectors (#730)

nbbeeken · web-flow · commit d7bdcec04349 · 2024-11-18T16:05:52.000-07:00
diff --git a/.evergreen/run-big-endian-test.sh b/.evergreen/run-big-endian-test.sh
diff --git a/etc/Dockerfile b/etc/Dockerfile
@@ -0,0 +1,10 @@
+# Build stage: copies the repository into the image, installs dependencies and runs the tests.
+FROM node:22 AS build
+
+WORKDIR /bson
+COPY . .
+
+# Drop any node_modules that came along with the build context, reinstall, then run the test suite.
+# A failing test fails this RUN step and therefore the whole image build.
+RUN rm -rf node_modules && npm install && npm test
+
+# Export stage: an empty image that carries only the docs directory from the build stage.
+FROM scratch
+
+COPY --from=build /bson/docs/ /
diff --git a/etc/run-big-endian-test.sh b/etc/run-big-endian-test.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# At the time of writing. This script is not used in CI.
+# but can be used to locally iterate on big endian bugs.
+# buildx requires an output, so I put docs which should be a no-op.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+set -o xtrace
+
+# If you get an error you may have an outdated buildkit version
+# Try running this:
+# docker buildx rm builder && docker buildx create --name builder --bootstrap --use
+
+docker buildx build \
+    --progress=plain \
+    --platform linux/s390x \
+    --build-arg="NODE_ARCH=s390x" \
+    -f ./etc/Dockerfile \
+    --output type=local,dest=./docs,platform-split=false \
+    .
diff --git a/src/binary.ts b/src/binary.ts
@@ -4,6 +4,7 @@ import { BSONError } from './error';
 import { BSON_BINARY_SUBTYPE_UUID_NEW } from './constants';
 import { ByteUtils } from './utils/byte_utils';
 import { BSONValue } from './bson_value';
+import { NumberUtils } from './utils/number_utils';
 
 /** @public */
 export type BinarySequence = Uint8Array | number[];
@@ -58,9 +59,18 @@ export class Binary extends BSONValue {
   static readonly SUBTYPE_COLUMN = 7;
   /** Sensitive BSON type */
   static readonly SUBTYPE_SENSITIVE = 8;
+  /** Vector BSON type */
+  static readonly SUBTYPE_VECTOR = 9;
   /** User BSON type */
   static readonly SUBTYPE_USER_DEFINED = 128;
 
+  /**
+   * Datatype tags of a Binary Vector (subtype: 9).
+   *
+   * The tag is stored in the first byte of the Binary's buffer; the vector helper methods on this class
+   * compare `buffer[0]` against these values. The object is frozen so the tags cannot be reassigned.
+   */
+  static readonly VECTOR_TYPE = Object.freeze({
+    Int8: 0x03,
+    Float32: 0x27,
+    PackedBit: 0x10
+  } as const);
+
   /**
    * The bytes of the Binary value.
    *
@@ -238,6 +248,11 @@ export class Binary extends BSONValue {
   /** @internal */
   toExtendedJSON(options?: EJSONOptions): BinaryExtendedLegacy | BinaryExtended {
     options = options || {};
+
+    if (this.sub_type === Binary.SUBTYPE_VECTOR) {
+      validateBinaryVector(this);
+    }
+
     const base64String = ByteUtils.toBase64(this.buffer);
 
     const subType = Number(this.sub_type).toString(16);
@@ -310,6 +325,209 @@ export class Binary extends BSONValue {
     const subTypeArg = inspect(this.sub_type, options);
     return `Binary.createFromBase64(${base64Arg}, ${subTypeArg})`;
   }
+
+  /**
+   * Copies the elements of an Int8 Vector out of this Binary.
+   *
+   * The Binary must have the Vector sub_type and its first byte must be `Binary.VECTOR_TYPE.Int8`.
+   * The two leading header bytes are skipped and the remaining bytes up to `position` are copied.
+   *
+   * @returns a new Int8Array backed by its own ArrayBuffer
+   * @throws BSONError if the sub_type is not Vector or the datatype byte is not Int8
+   */
+  public toInt8Array(): Int8Array {
+    const { buffer: bytes, sub_type: subType, position: end } = this;
+
+    if (subType !== Binary.SUBTYPE_VECTOR) {
+      throw new BSONError('Binary sub_type is not Vector');
+    }
+    if (bytes[0] !== Binary.VECTOR_TYPE.Int8) {
+      throw new BSONError('Binary datatype field is not Int8');
+    }
+
+    // ArrayBuffer slice offsets are relative to the backing store, not to the Uint8Array view.
+    const copy = bytes.buffer.slice(bytes.byteOffset + 2, bytes.byteOffset + end);
+    return new Int8Array(copy);
+  }
+
+  /**
+   * Copies the elements of a Float32 Vector out of this Binary.
+   *
+   * The Binary must have the Vector sub_type and its first byte must be `Binary.VECTOR_TYPE.Float32`.
+   * The bytes after the two header bytes, up to `position`, are copied; on big-endian hosts the copy
+   * is byte-swapped per 32-bit word before being reinterpreted as floats.
+   *
+   * @returns a new Float32Array backed by its own ArrayBuffer
+   * @throws BSONError if the sub_type is not Vector or the datatype byte is not Float32
+   */
+  public toFloat32Array(): Float32Array {
+    if (this.sub_type !== Binary.SUBTYPE_VECTOR) {
+      throw new BSONError('Binary sub_type is not Vector');
+    }
+    if (this.buffer[0] !== Binary.VECTOR_TYPE.Float32) {
+      throw new BSONError('Binary datatype field is not Float32');
+    }
+
+    const start = this.buffer.byteOffset + 2;
+    const end = this.buffer.byteOffset + this.position;
+    const floatBytes = new Uint8Array(this.buffer.buffer.slice(start, end));
+
+    // Only the private copy is swapped; the Binary's own bytes are left untouched.
+    if (NumberUtils.isBigEndian) ByteUtils.swap32(floatBytes);
+
+    return new Float32Array(floatBytes.buffer);
+  }
+
+  /**
+   * Returns a copy of the packed bytes of a packed bit Vector
+   * (`binary.buffer[0] === Binary.VECTOR_TYPE.PackedBit`), without the two header bytes.
+   *
+   * The bits stay packed eight to a byte; use `toBits` to get one array element per bit.
+   *
+   * @returns a new Uint8Array backed by its own ArrayBuffer
+   * @throws BSONError if the sub_type is not Vector or the datatype byte is not PackedBit
+   */
+  public toPackedBits(): Uint8Array {
+    const isVector = this.sub_type === Binary.SUBTYPE_VECTOR;
+    if (!isVector) {
+      throw new BSONError('Binary sub_type is not Vector');
+    }
+
+    const datatype = this.buffer[0];
+    if (datatype !== Binary.VECTOR_TYPE.PackedBit) {
+      throw new BSONError('Binary datatype field is not packed bit');
+    }
+
+    const { buffer: backingStore, byteOffset } = this.buffer;
+    const packed = backingStore.slice(byteOffset + 2, byteOffset + this.position);
+    return new Uint8Array(packed);
+  }
+
+  /**
+   * If this Binary represents a packed bit Vector (`binary.buffer[0] === Binary.VECTOR_TYPE.PackedBit`),
+   * returns a copy of the bits unpacked into a new Int8Array, one element (0 or 1) per bit.
+   *
+   * Bits are read most-significant first within each byte, and the number of trailing bits given by
+   * the padding byte (`binary.buffer[1]`) is left out of the result.
+   *
+   * Use `toPackedBits` to get the bits still in packed form.
+   *
+   * @returns a new Int8Array containing only 0s and 1s
+   * @throws BSONError if the Binary is not a Vector, the datatype is not PackedBit,
+   * or the padding byte is invalid for a packed bit vector
+   */
+  public toBits(): Int8Array {
+    if (this.sub_type !== Binary.SUBTYPE_VECTOR) {
+      throw new BSONError('Binary sub_type is not Vector');
+    }
+
+    if (this.buffer[0] !== Binary.VECTOR_TYPE.PackedBit) {
+      throw new BSONError('Binary datatype field is not packed bit');
+    }
+
+    // The bit count below is derived from the padding byte. Validate it first so that a padding
+    // above 7, or a non-zero padding on an empty vector, is reported as a BSONError instead of
+    // yielding a wrong bit count or a RangeError from the Int8Array constructor.
+    validateBinaryVector(this);
+
+    const byteCount = this.length() - 2;
+    const bitCount = byteCount * 8 - this.buffer[1];
+    const bits = new Int8Array(bitCount);
+
+    for (let bitOffset = 0; bitOffset < bits.length; bitOffset++) {
+      const byteOffset = (bitOffset / 8) | 0;
+      // Skip the two header bytes (datatype, padding) when indexing into the payload.
+      const byte = this.buffer[byteOffset + 2];
+      const shift = 7 - (bitOffset % 8);
+      const bit = (byte >> shift) & 1;
+      bits[bitOffset] = bit;
+    }
+
+    return bits;
+  }
+
+  /**
+   * Constructs a Binary representing an Int8 Vector.
+   *
+   * The result holds a two byte header (datatype `Binary.VECTOR_TYPE.Int8`, padding 0) followed by
+   * a copy of the bytes of `array`; later changes to `array` do not affect the Binary.
+   *
+   * @param array - The Int8 values to copy into the new Binary
+   */
+  public static fromInt8Array(array: Int8Array): Binary {
+    const headerSize = 2;
+    const vectorBytes = ByteUtils.allocate(headerSize + array.byteLength);
+
+    vectorBytes[0] = Binary.VECTOR_TYPE.Int8;
+    vectorBytes[1] = 0;
+    // View exactly the bytes of `array`, which may itself be a window onto a larger ArrayBuffer.
+    vectorBytes.set(new Uint8Array(array.buffer, array.byteOffset, array.byteLength), headerSize);
+
+    return new this(vectorBytes, this.SUBTYPE_VECTOR);
+  }
+
+  /**
+   * Constructs a Binary representing a Float32 Vector.
+   *
+   * The result holds a two byte header (datatype `Binary.VECTOR_TYPE.Float32`, padding 0) followed
+   * by a copy of the bytes of `array`. On big-endian hosts the copied payload is byte-swapped per
+   * 32-bit word; `array` itself is never modified.
+   *
+   * @param array - The Float32 values to copy into the new Binary
+   */
+  public static fromFloat32Array(array: Float32Array): Binary {
+    const binaryBytes = ByteUtils.allocate(array.byteLength + 2);
+    binaryBytes[0] = Binary.VECTOR_TYPE.Float32;
+    binaryBytes[1] = 0;
+
+    const floatBytes = new Uint8Array(array.buffer, array.byteOffset, array.byteLength);
+    binaryBytes.set(floatBytes, 2);
+
+    // Swap through a view derived from binaryBytes itself, so the swapped range is exactly the
+    // payload even if allocate() returns a view with a non-zero byteOffset or a larger backing store.
+    if (NumberUtils.isBigEndian) ByteUtils.swap32(binaryBytes.subarray(2));
+
+    return new this(binaryBytes, this.SUBTYPE_VECTOR);
+  }
+
+  /**
+   * Constructs a Binary representing a packed bit Vector.
+   *
+   * The result holds a two byte header (datatype `Binary.VECTOR_TYPE.PackedBit`, then `padding`)
+   * followed by a copy of `array`.
+   *
+   * Use `fromBits` to pack an array of 1s and 0s.
+   *
+   * @param array - The already packed bits, eight per byte
+   * @param padding - The number of bits in the final byte that are to be ignored. It is stored in
+   * a single byte. Whether it is valid for a packed bit vector is checked by
+   * `validateBinaryVector` when that is run on the Binary, not here.
+   * @throws BSONError if `padding` is not an integer that fits in one byte
+   */
+  public static fromPackedBits(array: Uint8Array, padding = 0): Binary {
+    // Writing into a Uint8Array truncates silently (256 becomes 0, 3.5 becomes 3), which would
+    // turn an invalid padding into one that later passes validation. Reject those values up front.
+    if (!Number.isInteger(padding) || padding < 0 || padding > 0xff) {
+      throw new BSONError(
+        `Invalid Vector: padding must be an integer that fits in one byte. found: ${padding}`
+      );
+    }
+
+    const buffer = ByteUtils.allocate(array.byteLength + 2);
+    buffer[0] = Binary.VECTOR_TYPE.PackedBit;
+    buffer[1] = padding;
+    buffer.set(array, 2);
+    return new this(buffer, this.SUBTYPE_VECTOR);
+  }
+
+  /**
+   * Constructs a Binary representing a packed bit Vector from unpacked bits.
+   *
+   * Bits are packed most-significant first, eight per byte. When the number of bits is not a
+   * multiple of eight, the unused low bits of the last byte stay 0 and their count is stored as
+   * the padding byte.
+   *
+   * @param bits - The array of 1s and 0s to pack into the Binary instance
+   * @throws BSONError if any element is neither 0 nor 1
+   */
+  public static fromBits(bits: ArrayLike<number>): Binary {
+    const bitCount = bits.length;
+    const packedLength = (bitCount + 7) >>> 3; // ceil(bitCount / 8)
+    const usedInLastByte = bitCount % 8;
+
+    const bytes = new Uint8Array(packedLength + 2);
+    bytes[0] = Binary.VECTOR_TYPE.PackedBit;
+    bytes[1] = usedInLastByte === 0 ? 0 : 8 - usedInLastByte;
+
+    for (let index = 0; index < bitCount; index++) {
+      const bit = bits[index];
+
+      if (bit === 1) {
+        // 0x80 is the most significant bit; move it right to the bit's slot within its byte.
+        bytes[(index >>> 3) + 2] |= 0x80 >>> (index % 8);
+      } else if (bit !== 0) {
+        throw new BSONError(
+          `Invalid bit value at ${index}: must be 0 or 1, found ${bits[index]}`
+        );
+      }
+    }
+
+    return new this(bytes, Binary.SUBTYPE_VECTOR);
+  }
+}
+
+/**
+ * Validates the header and payload length of a Binary Vector (subtype: 9).
+ *
+ * Binaries of any other sub_type are accepted without inspection. For a Vector, the first byte is
+ * read as the datatype and the second as the padding, and the following rules are enforced:
+ * - Int8 and Float32 vectors must have a padding of 0
+ * - Float32 vectors must carry a payload whose length is a multiple of 4 bytes
+ * - PackedBit vectors with no payload must have a padding of 0
+ * - PackedBit vectors must have a padding of at most 7
+ *
+ * @param vector - The Binary to check
+ * @throws BSONError if one of the rules above is violated
+ */
+export function validateBinaryVector(vector: Binary): void {
+  if (vector.sub_type !== Binary.SUBTYPE_VECTOR) return;
+
+  const size = vector.position;
+
+  // NOTE: Validation is only applied to **KNOWN** vector types
+  // If a new datatype is introduced, a future version of the library will need to add validation
+  const datatype = vector.buffer[0];
+
+  // NOTE: We do not enable noUncheckedIndexedAccess so TS believes this is always number
+  // a Binary vector may be empty, in which case the padding is undefined
+  // this possible value is tolerable for our validation checks
+  const padding: number | undefined = vector.buffer[1];
+
+  if (
+    (datatype === Binary.VECTOR_TYPE.Float32 || datatype === Binary.VECTOR_TYPE.Int8) &&
+    padding !== 0
+  ) {
+    throw new BSONError('Invalid Vector: padding must be zero for int8 and float32 vectors');
+  }
+
+  // Every float32 element occupies 4 bytes, so a payload (everything after the 2 byte header)
+  // of any other length cannot be decoded into whole elements.
+  if (datatype === Binary.VECTOR_TYPE.Float32 && size > 2 && (size - 2) % 4 !== 0) {
+    throw new BSONError('Invalid Vector: Float32 vector must contain a multiple of 4 bytes');
+  }
+
+  if (datatype === Binary.VECTOR_TYPE.PackedBit && padding !== 0 && size === 2) {
+    throw new BSONError(
+      'Invalid Vector: padding must be zero for packed bit vectors that are empty'
+    );
+  }
+
+  if (datatype === Binary.VECTOR_TYPE.PackedBit && padding > 7) {
+    throw new BSONError(
+      `Invalid Vector: padding must be a value between 0 and 7. found: ${padding}`
+    );
+  }
 }
 
 /** @public */
diff --git a/src/parser/serializer.ts b/src/parser/serializer.ts
@@ -1,4 +1,4 @@
-import { Binary } from '../binary';
+import { Binary, validateBinaryVector } from '../binary';
 import type { BSONSymbol, DBRef, Document, MaxKey } from '../bson';
 import type { Code } from '../code';
 import * as constants from '../constants';
@@ -495,6 +495,10 @@ function serializeBinary(buffer: Uint8Array, key: string, value: Binary, index:
     index += NumberUtils.setInt32LE(buffer, index, size);
   }
 
+  if (value.sub_type === Binary.SUBTYPE_VECTOR) {
+    validateBinaryVector(value);
+  }
+
   if (size <= 16) {
     for (let i = 0; i < size; i++) buffer[index + i] = data[i];
   } else {
diff --git a/src/utils/byte_utils.ts b/src/utils/byte_utils.ts
@@ -39,6 +39,8 @@ export type ByteUtils = {
   encodeUTF8Into: (destination: Uint8Array, source: string, byteOffset: number) => number;
   /** Generate a Uint8Array filled with random bytes with byteLength */
   randomBytes: (byteLength: number) => Uint8Array;
+  /** Interprets `buffer` as an array of 32-bit values and swaps the byte order in-place. */
+  swap32: (buffer: Uint8Array) => Uint8Array;
 };
 
 declare const Buffer: { new (): unknown; prototype?: { _isBuffer?: boolean } } | undefined;
diff --git a/src/utils/node_byte_utils.ts b/src/utils/node_byte_utils.ts
@@ -9,6 +9,7 @@ type NodeJsBuffer = ArrayBufferView &
     copy(target: Uint8Array, targetStart: number, sourceStart: number, sourceEnd: number): number;
     toString: (this: Uint8Array, encoding: NodeJsEncoding, start?: number, end?: number) => string;
     equals: (this: Uint8Array, other: Uint8Array) => boolean;
+    swap32: (this: NodeJsBuffer) => NodeJsBuffer;
   };
 type NodeJsBufferConstructor = Omit<Uint8ArrayConstructor, 'from'> & {
   alloc: (size: number) => NodeJsBuffer;
@@ -159,5 +160,9 @@ export const nodeJsByteUtils = {
     return nodeJsByteUtils.toLocalBufferType(buffer).write(source, byteOffset, undefined, 'utf8');
   },
 
-  randomBytes: nodejsRandomBytes
+  randomBytes: nodejsRandomBytes,
+
+  /**
+   * Converts `buffer` with `toLocalBufferType` and calls the Node.js Buffer `swap32` method on the
+   * result, returning whatever that call returns.
+   */
+  swap32(buffer: Uint8Array): NodeJsBuffer {
+    const nodeBuffer = nodeJsByteUtils.toLocalBufferType(buffer);
+    return nodeBuffer.swap32();
+  }
 };
diff --git a/src/utils/number_utils.ts b/src/utils/number_utils.ts
@@ -13,6 +13,8 @@ const isBigEndian = FLOAT_BYTES[7] === 0;
  * A collection of functions that get or set various numeric types and bit widths from a Uint8Array.
  */
 export type NumberUtils = {
+  /** Is true if the current system is big endian. */
+  isBigEndian: boolean;
   /**
    * Parses a signed int32 at offset. Throws a `RangeError` if value is negative.
    */
@@ -35,6 +37,8 @@ export type NumberUtils = {
  * @public
  */
 export const NumberUtils: NumberUtils = {
+  isBigEndian,
+
   getNonnegativeInt32LE(source: Uint8Array, offset: number): number {
     if (source[offset + 3] > 127) {
       throw new RangeError(`Size cannot be negative at offset: ${offset}`);
diff --git a/src/utils/web_byte_utils.ts b/src/utils/web_byte_utils.ts
@@ -193,5 +193,24 @@ export const webByteUtils = {
     return bytes.byteLength;
   },
 
-  randomBytes: webRandomBytes
+  randomBytes: webRandomBytes,
+
+  /**
+   * Reverses the byte order of every 4 byte group of `buffer` in place.
+   *
+   * @param buffer - The bytes to swap; its length must be a multiple of 4
+   * @returns the same `buffer` instance that was passed in
+   * @throws RangeError if the length of `buffer` is not a multiple of 4
+   */
+  swap32(buffer: Uint8Array): Uint8Array {
+    const size = buffer.length;
+    if (size % 4 !== 0) {
+      throw new RangeError('Buffer size must be a multiple of 32-bits');
+    }
+
+    for (let word = 0; word < size; word += 4) {
+      // Hold on to the leading pair, then mirror the group around its middle.
+      const first = buffer[word];
+      const second = buffer[word + 1];
+      buffer[word] = buffer[word + 3];
+      buffer[word + 1] = buffer[word + 2];
+      buffer[word + 2] = second;
+      buffer[word + 3] = first;
+    }
+
+    return buffer;
+  }
 };
diff --git a/test/node/binary.test.ts b/test/node/binary.test.ts
diff --git a/test/node/bson_binary_vector.spec.test.ts b/test/node/bson_binary_vector.spec.test.ts
diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts