
Commit 34be58f

Add word-level timestamp example
1 parent 9064ef7 commit 34be58f

3 files changed: +66 additions, −30 deletions

whisper-node/index.js (+11 −30)
@@ -1,41 +1,22 @@
 import { pipeline } from "@huggingface/transformers";
-import wavefile from "wavefile";
+import { read_audio } from "./utils.js";
 
 // Load model
 const transcriber = await pipeline(
   "automatic-speech-recognition",
-  "Xenova/whisper-tiny.en",
+  "onnx-community/whisper-tiny.en",
+  { dtype: { encoder_model: "fp32", decoder_model_merged: "q4" } },
 );
 
 // Load audio data
-const url =
-  "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav";
-const buffer = Buffer.from(await fetch(url).then((x) => x.arrayBuffer()));
-
-// Read .wav file and convert it to required format
-const wav = new wavefile.WaveFile(buffer);
-wav.toBitDepth("32f"); // Pipeline expects input as a Float32Array
-wav.toSampleRate(16000); // Whisper expects audio with a sampling rate of 16000
-let audioData = wav.getSamples();
-if (Array.isArray(audioData)) {
-  if (audioData.length > 1) {
-    const SCALING_FACTOR = Math.sqrt(2);
-
-    // Merge channels (into first channel to save memory)
-    for (let i = 0; i < audioData[0].length; ++i) {
-      audioData[0][i] =
-        (SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2;
-    }
-  }
-
-  // Select first channel
-  audioData = audioData[0];
-}
+const audio = await read_audio(
+  "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav",
+  transcriber.processor.feature_extractor.config.sampling_rate,
+);
 
-// Run model
-const start = performance.now();
-const output = await transcriber(audioData);
-const end = performance.now();
-console.log(`Execution duration: ${(end - start) / 1000} seconds`);
+// Run model w/ default settings
+console.time("Execution time");
+const output = await transcriber(audio);
+console.timeEnd("Execution time");
 console.log(output);
 // { text: ' And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.' }
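Note on the new pipeline options: the dtype map selects a precision per sub-model (fp32 for the encoder, 4-bit quantization for the merged decoder). As a point of comparison, here is a minimal sketch (not part of this commit) that assumes Transformers.js also accepts a single dtype string applied to every sub-model:

const transcriber = await pipeline(
  "automatic-speech-recognition",
  "onnx-community/whisper-tiny.en",
  { dtype: "q8" }, // assumed: one 8-bit setting shared by encoder and decoder
);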

whisper-node/utils.js (new file, +25)
@@ -0,0 +1,25 @@
+import wavefile from "wavefile";
+
+export async function read_audio(url, sampling_rate = 16000) {
+  const buffer = Buffer.from(await fetch(url).then((x) => x.arrayBuffer()));
+
+  // Read .wav file and convert it to required format
+  const wav = new wavefile.WaveFile(buffer);
+  wav.toBitDepth("32f");
+  wav.toSampleRate(sampling_rate);
+  let samples = wav.getSamples();
+  if (Array.isArray(samples)) {
+    if (samples.length > 1) {
+      const SCALING_FACTOR = Math.sqrt(2);
+
+      // Merge channels (into first channel to save memory)
+      for (let i = 0; i < samples[0].length; ++i) {
+        samples[0][i] = (SCALING_FACTOR * (samples[0][i] + samples[1][i])) / 2;
+      }
+    }
+
+    // Select first channel
+    samples = samples[0];
+  }
+  return samples;
+}
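For reference, a minimal usage sketch of the new helper on its own (not part of the commit; it reuses the JFK sample URL from index.js and the default 16 kHz rate):

import { read_audio } from "./utils.js";

// Fetch the .wav file, convert to 32-bit float at 16 kHz, and down-mix to mono.
const samples = await read_audio(
  "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav",
  16000,
);
console.log(samples.length); // length of the mono Float32Array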

whisper-node/word-level-timestamps.js (new file, +30)
@@ -0,0 +1,30 @@
+import { pipeline } from "@huggingface/transformers";
+import { read_audio } from "./utils.js";
+
+// Load model
+const transcriber = await pipeline(
+  "automatic-speech-recognition",
+  "onnx-community/whisper-tiny.en_timestamped",
+  { dtype: { encoder_model: "fp32", decoder_model_merged: "q4" } },
+);
+
+// Load audio data
+const audio = await read_audio(
+  "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav",
+  transcriber.processor.feature_extractor.config.sampling_rate,
+);
+
+// Run model w/ default settings
+console.time("Execution time");
+const output = await transcriber(audio, { return_timestamps: "word" });
+console.timeEnd("Execution time");
+console.log(output);
+// {
+//   text: ' And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.',
+//   chunks: [
+//     { text: ' And', timestamp: [0, 0.76] },
+//     { text: ' so', timestamp: [0.76, 1.06] },
+//     ...
+//     { text: ' country.', timestamp: [10.22, 10.72] },
+//   ],
+// }
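Since each chunk carries a [start, end] pair in seconds, the word-level output can be flattened into caption-style lines. A minimal post-processing sketch (not part of the commit), assuming the chunk shape printed above:

// Turn the word chunks into "start --> end  word" lines for quick inspection.
const captions = output.chunks
  .map(({ text, timestamp: [start, end] }) =>
    `${start.toFixed(2)} --> ${end.toFixed(2)}  ${text.trim()}`)
  .join("\n");
console.log(captions);
// 0.00 --> 0.76  And
// 0.76 --> 1.06  so
// ...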

0 commit comments