@@ -1,41 +1,22 @@
 import { pipeline } from "@huggingface/transformers";
-import wavefile from "wavefile";
+import { read_audio } from "./utils.js";
 
 // Load model
 const transcriber = await pipeline(
   "automatic-speech-recognition",
-  "Xenova/whisper-tiny.en",
+  "onnx-community/whisper-tiny.en",
+  { dtype: { encoder_model: "fp32", decoder_model_merged: "q4" } },
 );
 
 // Load audio data
-const url =
-  "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav";
-const buffer = Buffer.from(await fetch(url).then((x) => x.arrayBuffer()));
-
-// Read .wav file and convert it to required format
-const wav = new wavefile.WaveFile(buffer);
-wav.toBitDepth("32f"); // Pipeline expects input as a Float32Array
-wav.toSampleRate(16000); // Whisper expects audio with a sampling rate of 16000
-let audioData = wav.getSamples();
-if (Array.isArray(audioData)) {
-  if (audioData.length > 1) {
-    const SCALING_FACTOR = Math.sqrt(2);
-
-    // Merge channels (into first channel to save memory)
-    for (let i = 0; i < audioData[0].length; ++i) {
-      audioData[0][i] =
-        (SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2;
-    }
-  }
-
-  // Select first channel
-  audioData = audioData[0];
-}
+const audio = await read_audio(
+  "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav",
+  transcriber.processor.feature_extractor.config.sampling_rate,
+);
 
-// Run model
-const start = performance.now();
-const output = await transcriber(audioData);
-const end = performance.now();
-console.log(`Execution duration: ${(end - start) / 1000} seconds`);
+// Run model w/ default settings
+console.time("Execution time");
+const output = await transcriber(audio);
+console.timeEnd("Execution time");
 console.log(output);
 // { text: ' And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.' }
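
The updated script imports `read_audio` from a local `./utils.js` that is not part of this diff. For reference, here is a minimal sketch of what such a helper could look like, assuming it simply wraps the wavefile-based fetch, resample, and channel-merge logic that this commit removes from the main script; the function name and `(url, sampling_rate)` signature come from the import above, while the implementation below is an assumption, not code from the commit:

```js
// utils.js — hypothetical sketch, reconstructed from the removed inline logic
import wavefile from "wavefile";

export async function read_audio(url, sampling_rate) {
  // Fetch the .wav file and wrap the bytes in a Node Buffer
  const buffer = Buffer.from(await fetch(url).then((x) => x.arrayBuffer()));

  // Decode and convert to the format the pipeline expects
  const wav = new wavefile.WaveFile(buffer);
  wav.toBitDepth("32f"); // Pipeline expects input as a Float32Array
  wav.toSampleRate(sampling_rate); // e.g. 16000 for Whisper

  let audioData = wav.getSamples();
  if (Array.isArray(audioData)) {
    if (audioData.length > 1) {
      const SCALING_FACTOR = Math.sqrt(2);
      // Merge channels (into the first channel to save memory)
      for (let i = 0; i < audioData[0].length; ++i) {
        audioData[0][i] =
          (SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2;
      }
    }
    // Select the first channel
    audioData = audioData[0];
  }
  return audioData;
}
```

Transformers.js also ships its own audio utility (`import { read_audio } from "@huggingface/transformers";`) with the same `(url, sampling_rate)` shape, which may be what `./utils.js` re-exports in practice.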