
Commit 34be58f

Add word-level timestamp example
1 parent 9064ef7 commit 34be58f

3 files changed: +66 additions, −30 deletions

whisper-node/index.js (+11 −30)
@@ -1,41 +1,22 @@
 import { pipeline } from "@huggingface/transformers";
-import wavefile from "wavefile";
+import { read_audio } from "./utils.js";
 
 // Load model
 const transcriber = await pipeline(
   "automatic-speech-recognition",
-  "Xenova/whisper-tiny.en",
+  "onnx-community/whisper-tiny.en",
+  { dtype: { encoder_model: "fp32", decoder_model_merged: "q4" } },
 );
 
 // Load audio data
-const url =
-  "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav";
-const buffer = Buffer.from(await fetch(url).then((x) => x.arrayBuffer()));
-
-// Read .wav file and convert it to required format
-const wav = new wavefile.WaveFile(buffer);
-wav.toBitDepth("32f"); // Pipeline expects input as a Float32Array
-wav.toSampleRate(16000); // Whisper expects audio with a sampling rate of 16000
-let audioData = wav.getSamples();
-if (Array.isArray(audioData)) {
-  if (audioData.length > 1) {
-    const SCALING_FACTOR = Math.sqrt(2);
-
-    // Merge channels (into first channel to save memory)
-    for (let i = 0; i < audioData[0].length; ++i) {
-      audioData[0][i] =
-        (SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2;
-    }
-  }
-
-  // Select first channel
-  audioData = audioData[0];
-}
+const audio = await read_audio(
+  "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav",
+  transcriber.processor.feature_extractor.config.sampling_rate,
+);
 
-// Run model
-const start = performance.now();
-const output = await transcriber(audioData);
-const end = performance.now();
-console.log(`Execution duration: ${(end - start) / 1000} seconds`);
+// Run model w/ default settings
+console.time("Execution time");
+const output = await transcriber(audio);
+console.timeEnd("Execution time");
 console.log(output);
 // { text: ' And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.' }
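Note on the new pipeline options: the dtype map selects a precision per sub-model (fp32 for the encoder, 4-bit quantization for the merged decoder). As a point of comparison, here is a minimal sketch (not part of this commit) that assumes Transformers.js also accepts a single dtype string applied to every sub-model:

const transcriber = await pipeline(
  "automatic-speech-recognition",
  "onnx-community/whisper-tiny.en",
  { dtype: "q8" }, // assumed: one 8-bit setting shared by encoder and decoder
);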

whisper-node/utils.js (new file, +25)
@@ -0,0 +1,25 @@
+import wavefile from "wavefile";
+
+export async function read_audio(url, sampling_rate = 16000) {
+  const buffer = Buffer.from(await fetch(url).then((x) => x.arrayBuffer()));
+
+  // Read .wav file and convert it to required format
+  const wav = new wavefile.WaveFile(buffer);
+  wav.toBitDepth("32f");
+  wav.toSampleRate(sampling_rate);
+  let samples = wav.getSamples();
+  if (Array.isArray(samples)) {
+    if (samples.length > 1) {
+      const SCALING_FACTOR = Math.sqrt(2);
+
+      // Merge channels (into first channel to save memory)
+      for (let i = 0; i < samples[0].length; ++i) {
+        samples[0][i] = (SCALING_FACTOR * (samples[0][i] + samples[1][i])) / 2;
+      }
+    }
+
+    // Select first channel
+    samples = samples[0];
+  }
+  return samples;
+}
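For reference, a minimal usage sketch of the new helper on its own (not part of the commit; it reuses the JFK sample URL from index.js and the default 16 kHz rate):

import { read_audio } from "./utils.js";

// Fetch the .wav file, convert to 32-bit float at 16 kHz, and down-mix to mono.
const samples = await read_audio(
  "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav",
  16000,
);
console.log(samples.length); // length of the mono Float32Array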

whisper-node/word-level-timestamps.js (new file, +30)
@@ -0,0 +1,30 @@
+import { pipeline } from "@huggingface/transformers";
+import { read_audio } from "./utils.js";
+
+// Load model
+const transcriber = await pipeline(
+  "automatic-speech-recognition",
+  "onnx-community/whisper-tiny.en_timestamped",
+  { dtype: { encoder_model: "fp32", decoder_model_merged: "q4" } },
+);
+
+// Load audio data
+const audio = await read_audio(
+  "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav",
+  transcriber.processor.feature_extractor.config.sampling_rate,
+);
+
+// Run model w/ default settings
+console.time("Execution time");
+const output = await transcriber(audio, { return_timestamps: "word" });
+console.timeEnd("Execution time");
+console.log(output);
+// {
+//   text: ' And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.',
+//   chunks: [
+//     { text: ' And', timestamp: [0, 0.76] },
+//     { text: ' so', timestamp: [0.76, 1.06] },
+//     ...
+//     { text: ' country.', timestamp: [10.22, 10.72] },
+//   ],
+// }
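Since each chunk carries a [start, end] pair in seconds, the word-level output can be flattened into caption-style lines. A minimal post-processing sketch (not part of the commit), assuming the chunk shape printed above:

// Turn the word chunks into "start --> end  word" lines for quick inspection.
const captions = output.chunks
  .map(({ text, timestamp: [start, end] }) =>
    `${start.toFixed(2)} --> ${end.toFixed(2)}  ${text.trim()}`)
  .join("\n");
console.log(captions);
// 0.00 --> 0.76  And
// 0.76 --> 1.06  so
// ...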

0 commit comments