Skip to content

Commit 1c4c630

Browse files
authored
move over to whisper (#6)
* move over to whisper * TODO * cleanup * more todo
1 parent 7ef30b5 commit 1c4c630

20 files changed

+113
-539
lines changed

.gitattributes

-2
This file was deleted.

.gitmodules

-3
This file was deleted.

README.md

+4-23
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
<p>
55
A simple, offline program to automatically get transcriptions from files with English audio.<br />
6-
Built with F# alongside Coqui's STT 1.3.0, FFMpeg + FFMpegCore, and Avalonia.
6+
Built with F# alongside OpenAI's Whisper, FFMpeg + FFMpegCore, and Avalonia.
77
</p>
88
</div>
99

@@ -23,31 +23,12 @@ This repo is split into two main parts:
2323

2424
- [`TranscripterLib`](./TranscripterLib): The "core" library for transcription logic. This isn't really much more than a
2525
simple wrapper on top of
26-
Coqui's [STT](https://github.com/coqui-ai/STT) for speech-to-text and FFMpeg via FFMpegCore to convert video files
27-
into the appropriate audio files for STT.
26+
OpenAI's [Whisper](https://github.com/openai/whisper) for speech-to-text and FFMpeg via FFMpegCore to convert video files
27+
into the appropriate audio files for Whisper.
2828

2929
- [`TranscripterUI`](./TranscripterUI): Handles the UI and application logic to transcribe files, built
3030
on [Avalonia](https://avaloniaui.net/).
3131

32-
Transcripter's speech-to-text is currently based on version 1.3.0 of STT, and a submodule from v1.3.0
33-
of [the STT repo](https://github.com/coqui-ai/STT) is also included in the repo for usage in .NET - the main
34-
important part is the [.NET library](https://github.com/coqui-ai/STT/tree/main/native_client/dotnet) portion of the
35-
repo.
36-
37-
The English language model and scorer used for speech-to-text is also from STT (English, 1.0.0). This can be found
38-
inside the `TranscripterLib` portion of the repo [here](./TranscripterLib/model), and
39-
the model/scorer itself can be found from Coqui's website [here](https://coqui.ai/english/coqui/v1.0.0-huge-vocab).
40-
41-
### Running
42-
43-
If you're running via something like Rider's Run, you may need to set the `LD_LIBRARY_PATH` environment variable
44-
to not be blank (if it is) for the STT shared libraries to be detected. For example:
45-
46-
```bash
47-
LD_LIBRARY_PATH=:
48-
```
49-
50-
seems to work. This appears to be [a bug](https://github.com/dotnet/sdk/issues/9586) with dotnet in general.
5132

5233
## Disclaimer
5334

@@ -61,4 +42,4 @@ accept bug reports and PRs though.
6142
## Motivation
6243

6344
I was looking for a simple offline program to get transcripts from a bunch of lecture recordings I had, and also wanted
64-
a bit of an excuse to get my feet wet with F#, Avalonia, and STT.
45+
a bit of an excuse to get my feet wet with F# and Avalonia.

STT

-1
This file was deleted.

TranscripterLib/Library.fs

+45-33
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@ open System
44
open System.IO
55
open System.Threading
66
open FFMpegCore
7+
open FSharp.Control
8+
open Microsoft.FSharp.Core
79
open Microsoft.VisualBasic.FileIO
810
open NAudio.Wave
9-
open STTClient
11+
open Whisper.net
12+
open Whisper.net.Ggml
1013

1114
module Transcripter =
1215
let private deleteIfExists filePath =
@@ -22,13 +25,11 @@ module Transcripter =
2225
| _ -> return false
2326
}
2427

25-
type public TranscripterClient(stt: STT) =
26-
let client = stt
28+
type public TranscripterClient(client: WhisperProcessor) =
29+
let client = client
2730

28-
member this.Transcribe(inputPath: string, ?numAttempts: uint, ?token: CancellationToken) =
31+
member this.Transcribe(inputPath: string, ?token: CancellationToken) =
2932
if File.Exists inputPath then
30-
let sampleRate = client.GetModelSampleRate()
31-
3233
let wavPath =
3334
let tmp = FileSystem.GetTempFileName()
3435

@@ -50,9 +51,9 @@ module Transcripter =
5051
true,
5152
fun options ->
5253
options
53-
.WithAudioSamplingRate(sampleRate)
5454
.WithFastStart()
5555
.WithCustomArgument("-ac 1")
56+
.WithCustomArgument("-ar 16000")
5657
.WithCustomArgument("-async 1")
5758
|> ignore
5859
)
@@ -65,22 +66,9 @@ module Transcripter =
6566
.Wait()
6667
| None -> args.ProcessAsynchronously(true).Wait()
6768

68-
let inputBytes = File.ReadAllBytes(wavPath)
69-
70-
let buffer = WaveBuffer inputBytes
71-
72-
let bufferSize =
73-
Convert.ToUInt32(buffer.MaxSize / 2)
74-
75-
let numAttempts =
76-
match numAttempts with
77-
| Some numAttempts -> numAttempts
78-
| None -> 1u
79-
80-
let result =
81-
client.SpeechToTextWithMetadata(buffer.ShortBuffer, bufferSize, numAttempts)
82-
83-
buffer.Clear()
69+
let fileStream = File.OpenRead(wavPath)
70+
71+
let result = client.ProcessAsync(fileStream) |> TaskSeq.toList
8472

8573
deleteIfExists wavPath
8674
Ok result
@@ -91,15 +79,39 @@ module Transcripter =
9179
else
9280
Error($"{inputPath} does not exist.")
9381

94-
let public NewClient (useScorer: bool, modelFilePath: string, scorerFilePath: string) =
95-
if not (File.Exists modelFilePath) then
96-
Error($"Failed to load model file at `{modelFilePath}`.")
97-
else if useScorer && not (File.Exists scorerFilePath) then
98-
Error($"Failed to load scorer file at `{scorerFilePath}`.")
99-
else
100-
let client = new STT(modelFilePath)
101-
102-
if useScorer then
103-
client.EnableExternalScorer(scorerFilePath)
82+
let public NewClient (modelFilePath: string, numThreads: int) =
83+
let validFile =
84+
if not (File.Exists modelFilePath) then
85+
try
86+
// printfn "Model doesn't exist at path, going to try downloading model..."
87+
88+
let parent = Directory.GetParent(modelFilePath)
89+
Directory.CreateDirectory(parent.FullName) |> ignore
90+
91+
let writer = File.OpenWrite(modelFilePath)
92+
let stream = WhisperGgmlDownloader.GetGgmlModelAsync(GgmlType.Base) |> Async.AwaitTask |> Async.RunSynchronously
93+
stream.CopyToAsync(writer).Wait()
94+
writer.Dispose()
95+
96+
// printfn "Finished downloading model."
97+
98+
Ok(())
99+
with
100+
| ex ->
101+
Error($"Failed to load model file at `{modelFilePath}`; failed to download due to {ex}.")
102+
else
103+
Ok(())
104+
105+
match validFile with
106+
| Ok _ ->
107+
// printfn $"Using model at {modelFilePath}."
108+
let factory = WhisperFactory.FromPath(modelFilePath)
109+
let client = factory
110+
.CreateBuilder()
111+
.WithLanguage("auto")
112+
.WithThreads(numThreads)
113+
.Build()
104114

105115
Ok(TranscripterClient(client))
116+
| Error err ->
117+
Error(err)

TranscripterLib/README.md

+4-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# TranscripterLib
22

3-
The bit of code that handles converting media files to audio via FFMpeg, and then runs speech-to-text over it using Coqui's STT.
3+
The bit of code that handles converting media files to audio via FFMpeg, and then runs speech-to-text over it using
4+
Whisper.
45

5-
Note that not all required files are in this repo, due to size limitations and the like. Run the `setup.sh` to download these files.
6+
Note that not all required files are in this repo, due to size limitations and the like. Run `setup.sh` to download
7+
these files.
68

TranscripterLib/TranscripterLib.fsproj

+6-7
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,12 @@
1010
</ItemGroup>
1111

1212
<ItemGroup>
13-
<PackageReference Include="FFMpegCore" Version="4.8.0"/>
14-
<PackageReference Include="NAudio" Version="2.1.0"/>
15-
<PackageReference Update="FSharp.Core" Version="6.0.4"/>
16-
</ItemGroup>
17-
18-
<ItemGroup>
19-
<ProjectReference Include="..\STT\native_client\dotnet\STTClient\STTClient.csproj"/>
13+
<PackageReference Include="FFMpegCore" Version="5.1.0" />
14+
<PackageReference Include="FSharp.Control.TaskSeq" Version="0.3.0" />
15+
<PackageReference Include="NAudio" Version="2.2.1" />
16+
<PackageReference Update="FSharp.Core" Version="8.0.101" />
17+
<PackageReference Include="Whisper.net" Version="1.5.0" />
18+
<PackageReference Include="Whisper.net.Runtime" Version="1.5.0" />
2019
</ItemGroup>
2120

2221
</Project>

TranscripterLib/TranscripterLib.sln

-6
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
Microsoft Visual Studio Solution File, Format Version 12.00
33
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "TranscripterLib", "TranscripterLib.fsproj", "{7B9ADACA-CB7F-416C-87CA-5696268A9A91}"
44
EndProject
5-
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "STTClient", "..\STT\native_client\dotnet\STTClient\STTClient.csproj", "{87565D0F-F803-4D1D-A3AF-2D56B69C8BE7}"
6-
EndProject
75
Global
86
GlobalSection(SolutionConfigurationPlatforms) = preSolution
97
Debug|Any CPU = Debug|Any CPU
@@ -14,9 +12,5 @@ Global
1412
{7B9ADACA-CB7F-416C-87CA-5696268A9A91}.Debug|Any CPU.Build.0 = Debug|Any CPU
1513
{7B9ADACA-CB7F-416C-87CA-5696268A9A91}.Release|Any CPU.ActiveCfg = Release|Any CPU
1614
{7B9ADACA-CB7F-416C-87CA-5696268A9A91}.Release|Any CPU.Build.0 = Release|Any CPU
17-
{87565D0F-F803-4D1D-A3AF-2D56B69C8BE7}.Debug|Any CPU.ActiveCfg = Debug|x64
18-
{87565D0F-F803-4D1D-A3AF-2D56B69C8BE7}.Debug|Any CPU.Build.0 = Debug|x64
19-
{87565D0F-F803-4D1D-A3AF-2D56B69C8BE7}.Release|Any CPU.ActiveCfg = Release|x64
20-
{87565D0F-F803-4D1D-A3AF-2D56B69C8BE7}.Release|Any CPU.Build.0 = Release|x64
2115
EndGlobalSection
2216
EndGlobal

TranscripterUI/TranscripterUI.fsproj

+5-24
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
<Project Sdk="Microsoft.NET.Sdk">
22
<PropertyGroup>
33
<OutputType>WinExe</OutputType>
4-
<TargetFramework>net6.0</TargetFramework>
4+
<TargetFramework>net8.0</TargetFramework>
55
<!-- Avalonia doesn't support TrimMode=link currently, but we are working on that https://github.com/AvaloniaUI/Avalonia/issues/6892 -->
66
<TrimMode>copyused</TrimMode>
77
<BuiltInComInteropSupport>true</BuiltInComInteropSupport>
@@ -18,26 +18,6 @@
1818
<ItemGroup>
1919
<Folder Include="Models\" />
2020
<AvaloniaResource Include="Assets\**" />
21-
<Content Include="model\english_huge_1.0.0_model.tflite">
22-
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
23-
<CopyToPublishDirectory>PreserveNewest</CopyToPublishDirectory>
24-
</Content>
25-
<Content Include="model\huge-vocabulary.scorer">
26-
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
27-
<CopyToPublishDirectory>PreserveNewest</CopyToPublishDirectory>
28-
</Content>
29-
<None Include="shared_libs\libstt.tflite.Linux\libkenlm.so" Condition="$([MSBuild]::IsOSPlatform('Linux'))">
30-
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
31-
<TargetPath>libkenlm.so</TargetPath>
32-
</None>
33-
<None Include="shared_libs\libstt.tflite.Linux\libsox.so.3" Condition="$([MSBuild]::IsOSPlatform('Linux'))">
34-
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
35-
<TargetPath>libsox.so.3</TargetPath>
36-
</None>
37-
<None Include="shared_libs\libstt.tflite.Linux\libstt.so" Condition="$([MSBuild]::IsOSPlatform('Linux'))">
38-
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
39-
<TargetPath>libstt.so</TargetPath>
40-
</None>
4121
<None Remove=".gitignore" />
4222
<None Include="NLog.config">
4323
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
@@ -79,11 +59,12 @@
7959
<PackageReference Condition="'$(Configuration)' == 'Debug'" Include="Avalonia.Diagnostics" Version="0.10.14" />
8060
<PackageReference Include="Avalonia.ReactiveUI" Version="0.10.14" />
8161
<PackageReference Include="FSharp.Collections.ParallelSeq" Version="1.2.0" />
82-
<PackageReference Include="NLog" Version="5.0.0" />
83-
<PackageReference Update="FSharp.Core" Version="6.0.4" />
62+
<PackageReference Include="NLog" Version="5.2.8" />
63+
<PackageReference Update="FSharp.Core" Version="8.0.101" />
64+
<PackageReference Include="Whisper.net" Version="1.5.0" />
65+
<PackageReference Include="Whisper.net.Runtime" Version="1.5.0" />
8466
</ItemGroup>
8567
<ItemGroup>
86-
<ProjectReference Include="..\STT\native_client\dotnet\STTClient\STTClient.csproj" />
8768
<ProjectReference Include="..\TranscripterLib\TranscripterLib.fsproj" />
8869
</ItemGroup>
8970
<ItemGroup>

TranscripterUI/TranscripterUI.sln

-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@ Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "TranscripterUI", "Transcrip
44
EndProject
55
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "TranscripterLib", "..\TranscripterLib\TranscripterLib.fsproj", "{B1FB803A-C705-4F18-917F-663ACEAEEDEC}"
66
EndProject
7-
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "STTClient", "..\STT\native_client\dotnet\STTClient\STTClient.csproj", "{D103BD3B-B115-459D-829B-E534CB2CDC11}"
8-
EndProject
97
Global
108
GlobalSection(SolutionConfigurationPlatforms) = preSolution
119
Debug|Any CPU = Debug|Any CPU

0 commit comments

Comments
 (0)