// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
import { APIResource } from '../../../resource';
import * as Core from '../../../core';
export class TranscriptionSessions extends APIResource {
/**
* Create an ephemeral API token for use in client-side applications with the
* Realtime API specifically for realtime transcriptions. Can be configured with
* the same session parameters as the `transcription_session.update` client event.
*
* It responds with a session object, plus a `client_secret` key which contains a
* usable ephemeral API token that can be used to authenticate browser clients for
* the Realtime API.
*/
create(
body: TranscriptionSessionCreateParams,
options?: Core.RequestOptions,
): Core.APIPromise<TranscriptionSession> {
return this._client.post('/realtime/transcription_sessions', {
body,
...options,
headers: { 'OpenAI-Beta': 'assistants=v2', ...options?.headers },
});
}
}
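/*
* Usage sketch (illustrative only, not generated from the spec): minting an
* ephemeral key server-side with the `create` call above. The accessor path
* `client.beta.realtime.transcriptionSessions` and the client construction are
* assumptions about the surrounding SDK, not something this file defines.
*
*   import OpenAI from 'openai';
*
*   const client = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
*
*   async function mintTranscriptionKey() {
*     const session = await client.beta.realtime.transcriptionSessions.create({
*       input_audio_format: 'pcm16',
*       input_audio_transcription: { model: 'gpt-4o-mini-transcribe', language: 'en' },
*     });
*     // Hand this short-lived key to the browser client; it expires quickly.
*     return session.client_secret.value;
*   }
*/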
/**
* A new Realtime transcription session configuration.
*
* When a session is created on the server via REST API, the session object also
* contains an ephemeral key. Default TTL for keys is one minute. This property is
* not present when a session is updated via the WebSocket API.
*/
export interface TranscriptionSession {
/**
* Ephemeral key returned by the API. Only present when the session is created on
* the server via REST API.
*/
client_secret: TranscriptionSession.ClientSecret;
/**
* The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
input_audio_format?: string;
/**
* Configuration of the transcription model.
*/
input_audio_transcription?: TranscriptionSession.InputAudioTranscription;
/**
* The set of modalities the model can respond with. To disable audio, set this to
* ["text"].
*/
modalities?: Array<'text' | 'audio'>;
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
turn_detection?: TranscriptionSession.TurnDetection;
}
export namespace TranscriptionSession {
/**
* Ephemeral key returned by the API. Only present when the session is created on
* the server via REST API.
*/
export interface ClientSecret {
/**
* Timestamp for when the token expires. Currently, all tokens expire after one
* minute.
*/
expires_at: number;
/**
* Ephemeral key usable in client environments to authenticate connections to the
* Realtime API. Use this in client-side environments rather than a standard API
* token, which should only be used server-side.
*/
value: string;
}
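/*
* Sketch (not part of the generated surface): one way a server might forward the
* ephemeral key to a browser client. Assumes `expires_at` is a Unix timestamp in
* seconds; the helper name and payload shape are hypothetical.
*
*   function toBrowserPayload(session: TranscriptionSession) {
*     const { value, expires_at } = session.client_secret;
*     if (expires_at * 1000 <= Date.now()) {
*       throw new Error('Ephemeral key expired; create a new transcription session.');
*     }
*     return { ephemeralKey: value, expiresAt: expires_at };
*   }
*/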
/**
* Configuration of the transcription model.
*/
export interface InputAudioTranscription {
/**
* The language of the input audio. Supplying the input language in
* [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
* format will improve accuracy and latency.
*/
language?: string;
/**
* The model to use for transcription. Can be `gpt-4o-transcribe`,
* `gpt-4o-mini-transcribe`, or `whisper-1`.
*/
model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
/**
* Optional text to guide the model's style or continue a previous audio segment.
* The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
* should match the audio language.
*/
prompt?: string;
}
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
export interface TurnDetection {
/**
* Amount of audio to include before the VAD detected speech (in milliseconds).
* Defaults to 300ms.
*/
prefix_padding_ms?: number;
/**
* Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
* With shorter values the model will respond more quickly, but may jump in on
* short pauses from the user.
*/
silence_duration_ms?: number;
/**
* Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
* threshold will require louder audio to activate the model, and thus might
* perform better in noisy environments.
*/
threshold?: number;
/**
* Type of turn detection, only `server_vad` is currently supported.
*/
type?: string;
}
}
export interface TranscriptionSessionCreateParams {
/**
* The set of items to include in the transcription. Currently available items are:
*
* - `item.input_audio_transcription.logprobs`
*/
include?: Array<string>;
/**
* The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
* `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
* (mono), and little-endian byte order.
*/
input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* Configuration for input audio noise reduction. This can be set to `null` to turn
* off. Noise reduction filters audio added to the input audio buffer before it is
* sent to VAD and the model. Filtering the audio can improve VAD and turn
* detection accuracy (reducing false positives) and model performance by improving
* perception of the input audio.
*/
input_audio_noise_reduction?: TranscriptionSessionCreateParams.InputAudioNoiseReduction;
/**
* Configuration for input audio transcription. The client can optionally set the
* language and prompt for transcription; these offer additional guidance to the
* transcription service.
*/
input_audio_transcription?: TranscriptionSessionCreateParams.InputAudioTranscription;
/**
* The set of modalities the model can respond with. To disable audio, set this to
* ["text"].
*/
modalities?: Array<'text' | 'audio'>;
/**
* Configuration for turn detection, either Server VAD or Semantic VAD. This can be
* set to `null` to turn off, in which case the client must manually trigger model
* response. Server VAD means that the model will detect the start and end of
* speech based on audio volume and respond at the end of user speech. Semantic VAD
* is more advanced and uses a turn detection model (in conjunction with VAD) to
* semantically estimate whether the user has finished speaking, then dynamically
* sets a timeout based on this probability. For example, if user audio trails off
* with "uhhm", the model will score a low probability of turn end and wait longer
* for the user to continue speaking. This can be useful for more natural
* conversations, but may have a higher latency.
*/
turn_detection?: TranscriptionSessionCreateParams.TurnDetection;
}
export namespace TranscriptionSessionCreateParams {
/**
* Configuration for input audio noise reduction. This can be set to `null` to turn
* off. Noise reduction filters audio added to the input audio buffer before it is
* sent to VAD and the model. Filtering the audio can improve VAD and turn
* detection accuracy (reducing false positives) and model performance by improving
* perception of the input audio.
*/
export interface InputAudioNoiseReduction {
/**
* Type of noise reduction. `near_field` is for close-talking microphones such as
* headphones, `far_field` is for far-field microphones such as laptop or
* conference room microphones.
*/
type?: 'near_field' | 'far_field';
}
/**
* Configuration for input audio transcription. The client can optionally set the
* language and prompt for transcription; these offer additional guidance to the
* transcription service.
*/
export interface InputAudioTranscription {
/**
* The language of the input audio. Supplying the input language in
* [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
* format will improve accuracy and latency.
*/
language?: string;
/**
* The model to use for transcription; current options are `gpt-4o-transcribe`,
* `gpt-4o-mini-transcribe`, and `whisper-1`.
*/
model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
/**
* Optional text to guide the model's style or continue a previous audio
* segment. For `whisper-1`, the
* [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
* For `gpt-4o-transcribe` models, the prompt is a free text string, for example
* "expect words related to technology".
*/
prompt?: string;
}
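/*
* Illustrative configs for the two prompt styles described above; the prompt
* strings are invented for demonstration.
*
*   // `whisper-1` expects a keyword-style prompt.
*   const whisperConfig: TranscriptionSessionCreateParams.InputAudioTranscription = {
*     model: 'whisper-1',
*     prompt: 'OpenAI, Realtime API, WebRTC, ephemeral key',
*   };
*
*   // `gpt-4o-transcribe` models take a free-text prompt.
*   const gpt4oConfig: TranscriptionSessionCreateParams.InputAudioTranscription = {
*     model: 'gpt-4o-transcribe',
*     language: 'en',
*     prompt: 'Expect words related to technology and audio streaming.',
*   };
*/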
/**
* Configuration for turn detection, either Server VAD or Semantic VAD. This can be
* set to `null` to turn off, in which case the client must manually trigger model
* response. Server VAD means that the model will detect the start and end of
* speech based on audio volume and respond at the end of user speech. Semantic VAD
* is more advanced and uses a turn detection model (in conjunction with VAD) to
* semantically estimate whether the user has finished speaking, then dynamically
* sets a timeout based on this probability. For example, if user audio trails off
* with "uhhm", the model will score a low probability of turn end and wait longer
* for the user to continue speaking. This can be useful for more natural
* conversations, but may have a higher latency.
*/
export interface TurnDetection {
/**
* Whether or not to automatically generate a response when a VAD stop event
* occurs. Not available for transcription sessions.
*/
create_response?: boolean;
/**
* Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
* will wait longer for the user to continue speaking, `high` will respond more
* quickly. `auto` is the default and is equivalent to `medium`.
*/
eagerness?: 'low' | 'medium' | 'high' | 'auto';
/**
* Whether or not to automatically interrupt any ongoing response with output to
* the default conversation (i.e. `conversation` of `auto`) when a VAD start event
* occurs. Not available for transcription sessions.
*/
interrupt_response?: boolean;
/**
* Used only for `server_vad` mode. Amount of audio to include before the VAD
* detected speech (in milliseconds). Defaults to 300ms.
*/
prefix_padding_ms?: number;
/**
* Used only for `server_vad` mode. Duration of silence to detect speech stop (in
* milliseconds). Defaults to 500ms. With shorter values the model will respond
* more quickly, but may jump in on short pauses from the user.
*/
silence_duration_ms?: number;
/**
* Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0); this
* defaults to 0.5. A higher threshold will require louder audio to activate the
* model, and thus might perform better in noisy environments.
*/
threshold?: number;
/**
* Type of turn detection.
*/
type?: 'server_vad' | 'semantic_vad';
}
}
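/*
* Putting the pieces together (illustrative values only): a create-params object
* exercising the transcript `include` option, near-field noise reduction, and
* semantic VAD turn detection documented above, plus a server-VAD variant spelled
* out with its documented defaults.
*
*   const semanticVadParams: TranscriptionSessionCreateParams = {
*     include: ['item.input_audio_transcription.logprobs'],
*     input_audio_format: 'pcm16',
*     input_audio_noise_reduction: { type: 'near_field' },
*     input_audio_transcription: { model: 'gpt-4o-transcribe', language: 'en' },
*     turn_detection: { type: 'semantic_vad', eagerness: 'low' },
*   };
*
*   // Server VAD with the documented defaults written out explicitly.
*   const serverVadTurnDetection: TranscriptionSessionCreateParams.TurnDetection = {
*     type: 'server_vad',
*     prefix_padding_ms: 300,
*     silence_duration_ms: 500,
*     threshold: 0.5,
*   };
*/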
export declare namespace TranscriptionSessions {
export {
type TranscriptionSession as TranscriptionSession,
type TranscriptionSessionCreateParams as TranscriptionSessionCreateParams,
};
}