Skip to content

Commit a6839fd

Browse files
authored
feat: [whisper] Partial support for verbose_json format in transcribe endpoint (#721)
1 parent f3063f9 commit a6839fd

File tree

2 files changed

+35
-12
lines changed

2 files changed

+35
-12
lines changed

api/openai.go

+1-1
Original file line number | Diff line number | Diff line change
@@ -737,7 +737,7 @@ func transcriptEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {
737737

738738
log.Debug().Msgf("Trascribed: %+v", tr)
739739
// TODO: handle different outputs here
740-
return c.Status(http.StatusOK).JSON(fiber.Map{"text": tr})
740+
return c.Status(http.StatusOK).JSON(tr)
741741
}
742742
}
743743

pkg/whisper/whisper.go

+34-11
Original file line number | Diff line number | Diff line change
@@ -5,11 +5,25 @@ import (
55
"os"
66
"os/exec"
77
"path/filepath"
8+
"time"
89

910
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
1011
wav "github.com/go-audio/wav"
1112
)
1213

14+
// Segment is one transcribed segment as it is serialized in the JSON response.
// Transcript builds it from a whisper segment s: Id from s.Num, Start and End
// from s.Start and s.End, Text from s.Text.
type Segment struct {
15+
Id int `json:"id"`
16+
// NOTE(review): Start and End are time.Duration, which encoding/json writes
// as an integer count of nanoseconds. Confirm whether verbose_json clients
// expect seconds instead.
Start time.Duration `json:"start"`
17+
End time.Duration `json:"end"`
18+
Text string `json:"text"`
19+
// Tokens holds the Id of every token in the segment. Transcript leaves it
// nil when the segment has no tokens, which marshals as JSON null.
Tokens []int `json:"tokens"`
20+
}
21+
22+
// Result is the value returned by Transcript and sent as the JSON body of the
// transcription endpoint.
type Result struct {
23+
// Segments lists the segments in the order Transcript read them from the
// whisper context.
Segments []Segment `json:"segments"`
24+
// Text is the concatenation of every segment's Text, with no separator added.
Text string `json:"text"`
25+
}
26+
1327
func sh(c string) (string, error) {
1428
cmd := exec.Command("/bin/sh", "-c", c)
1529
cmd.Env = os.Environ()
@@ -28,40 +42,41 @@ func audioToWav(src, dst string) error {
2842
return nil
2943
}
3044

31-
func Transcript(model whisper.Model, audiopath, language string, threads uint) (string, error) {
45+
func Transcript(model whisper.Model, audiopath, language string, threads uint) (Result, error) {
46+
res := Result{}
3247

3348
dir, err := os.MkdirTemp("", "whisper")
3449
if err != nil {
35-
return "", err
50+
return res, err
3651
}
3752
defer os.RemoveAll(dir)
3853

3954
convertedPath := filepath.Join(dir, "converted.wav")
4055

4156
if err := audioToWav(audiopath, convertedPath); err != nil {
42-
return "", err
57+
return res, err
4358
}
4459

4560
// Open samples
4661
fh, err := os.Open(convertedPath)
4762
if err != nil {
48-
return "", err
63+
return res, err
4964
}
5065
defer fh.Close()
5166

5267
// Read samples
5368
d := wav.NewDecoder(fh)
5469
buf, err := d.FullPCMBuffer()
5570
if err != nil {
56-
return "", err
71+
return res, err
5772
}
5873

5974
data := buf.AsFloat32Buffer().Data
6075

6176
// Process samples
6277
context, err := model.NewContext()
6378
if err != nil {
64-
return "", err
79+
return res, err
6580

6681
}
6782

@@ -74,17 +89,25 @@ func Transcript(model whisper.Model, audiopath, language string, threads uint) (
7489
}
7590

7691
if err := context.Process(data, nil, nil); err != nil {
77-
return "", err
92+
return res, err
7893
}
7994

80-
text := ""
8195
for {
82-
segment, err := context.NextSegment()
96+
s, err := context.NextSegment()
8397
if err != nil {
8498
break
8599
}
86-
text += segment.Text
100+
101+
var tokens []int
102+
for _, t := range(s.Tokens) {
103+
tokens = append(tokens, t.Id)
104+
}
105+
106+
segment := Segment{Id: s.Num, Text: s.Text, Start:s.Start, End: s.End, Tokens: tokens}
107+
res.Segments = append(res.Segments, segment)
108+
109+
res.Text += s.Text
87110
}
88111

89-
return text, nil
112+
return res, nil
90113
}

0 commit comments

Comments
 (0)