@@ -9,7 +9,22 @@ export class Transcriptions extends APIResource {
9
9
/**
 * Transcribes audio into the input language.
 *
 * The declared result type follows `body.response_format`:
 * - `'json'` or omitted: {@link Transcription}
 * - `'verbose_json'`: {@link TranscriptionVerbose}
 * - `'srt'`, `'vtt'` or `'text'`: `string`
 *
 * @param body - Transcription request parameters.
 * @param options - Optional per-request options; spread into the same object as `body`
 *   before the request is built.
 * @returns A promise for the transcription, typed according to the list above.
 */
create(
  body: TranscriptionCreateParams<'json' | undefined>,
  options?: Core.RequestOptions,
): Core.APIPromise<Transcription>;
create(
  body: TranscriptionCreateParams<'verbose_json'>,
  options?: Core.RequestOptions,
): Core.APIPromise<TranscriptionVerbose>;
create(
  body: TranscriptionCreateParams<'srt' | 'vtt' | 'text'>,
  options?: Core.RequestOptions,
): Core.APIPromise<string>;
// Fallback for callers whose `body` is typed as the un-narrowed `TranscriptionCreateParams`
// (its `response_format` is the full union, which matches none of the overloads above).
// It keeps the signature this method had before the overloads were introduced, so
// existing call sites continue to compile.
create(body: TranscriptionCreateParams, options?: Core.RequestOptions): Core.APIPromise<Transcription>;
create(
  body: TranscriptionCreateParams,
  options?: Core.RequestOptions,
): Core.APIPromise<TranscriptionCreateResponse | string> {
  return this._client.post('/audio/transcriptions', Core.multipartFormRequestOptions({ body, ...options }));
}
15
30
}
@@ -25,7 +40,118 @@ export interface Transcription {
25
40
text : string ;
26
41
}
27
42
28
- export interface TranscriptionCreateParams {
43
/**
 * A single segment of transcribed text with its timing and per-segment metrics.
 * Element type of `TranscriptionVerbose.segments`.
 */
export interface TranscriptionSegment {
  /**
   * Unique identifier of the segment.
   */
  id: number;

  /**
   * Average logprob of the segment. If the value is lower than -1, consider the
   * logprobs failed.
   */
  avg_logprob: number;

  /**
   * Compression ratio of the segment. If the value is greater than 2.4, consider the
   * compression failed.
   */
  compression_ratio: number;

  /**
   * End time of the segment in seconds.
   */
  end: number;

  /**
   * Probability of no speech in the segment. If the value is higher than 1.0 and the
   * `avg_logprob` is below -1, consider this segment silent.
   *
   * NOTE(review): a probability cannot exceed 1.0, so the threshold as written can
   * never be met — confirm the intended value against the API reference.
   */
  no_speech_prob: number;

  /**
   * Seek offset of the segment.
   */
  seek: number;

  /**
   * Start time of the segment in seconds.
   */
  start: number;

  /**
   * Temperature parameter used for generating the segment.
   */
  temperature: number;

  /**
   * Text content of the segment.
   */
  text: string;

  /**
   * Array of token IDs for the text content.
   */
  tokens: Array<number>;
}
97
+
98
/**
 * Represents a verbose json transcription response returned by model, based on the
 * provided input.
 */
export interface TranscriptionVerbose {
  /**
   * The duration of the input audio.
   *
   * NOTE(review): declared as `string` and no unit is stated here — confirm the wire
   * format against the API reference before parsing this value.
   */
  duration: string;

  /**
   * The language of the input audio.
   */
  language: string;

  /**
   * The transcribed text.
   */
  text: string;

  /**
   * Segments of the transcribed text and their corresponding details.
   */
  segments?: Array<TranscriptionSegment>;

  /**
   * Extracted words and their corresponding timestamps.
   */
  words?: Array<TranscriptionWord>;
}
128
+
129
/**
 * A single word with its start and end timestamps.
 * Element type of `TranscriptionVerbose.words`.
 */
export interface TranscriptionWord {
  /**
   * End time of the word in seconds.
   */
  end: number;

  /**
   * Start time of the word in seconds.
   */
  start: number;

  /**
   * The text content of the word.
   */
  word: string;
}
145
+
146
/**
 * Represents a transcription response returned by model, based on the provided
 * input.
 *
 * Either the plain {@link Transcription} or the richer {@link TranscriptionVerbose}.
 */
export type TranscriptionCreateResponse = Transcription | TranscriptionVerbose;
151
+
152
+ export interface TranscriptionCreateParams <
153
+ ResponseFormat extends AudioAPI . AudioResponseFormat | undefined = AudioAPI . AudioResponseFormat | undefined ,
154
+ > {
29
155
/**
30
156
* The audio file object (not file name) to transcribe, in one of these formats:
31
157
* flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
@@ -57,7 +183,7 @@ export interface TranscriptionCreateParams {
57
183
* The format of the output, in one of these options: `json`, `text`, `srt`,
58
184
* `verbose_json`, or `vtt`.
59
185
*/
60
- response_format ?: AudioAPI . AudioResponseFormat ;
186
+ response_format ?: ResponseFormat ;
61
187
62
188
/**
63
189
* The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
@@ -80,5 +206,9 @@ export interface TranscriptionCreateParams {
80
206
81
207
/**
 * Re-exports the transcription types from `TranscriptionsAPI` as members of the
 * `Transcriptions` namespace.
 */
export namespace Transcriptions {
  export import Transcription = TranscriptionsAPI.Transcription;
  export import TranscriptionSegment = TranscriptionsAPI.TranscriptionSegment;
  export import TranscriptionVerbose = TranscriptionsAPI.TranscriptionVerbose;
  export import TranscriptionWord = TranscriptionsAPI.TranscriptionWord;
  export import TranscriptionCreateResponse = TranscriptionsAPI.TranscriptionCreateResponse;
  export import TranscriptionCreateParams = TranscriptionsAPI.TranscriptionCreateParams;
}
0 commit comments