@@ -66,6 +66,12 @@ export interface ChatCompletion {
66
66
*/
67
67
object : 'chat.completion' ;
68
68
69
+ /**
70
+ * The service tier used for processing the request. This field is only included if
71
+ * the `service_tier` parameter is specified in the request.
72
+ */
73
+ service_tier ?: 'scale' | 'default' | null ;
74
+
69
75
/**
70
76
* This fingerprint represents the backend configuration that the model runs with.
71
77
*
@@ -205,6 +211,12 @@ export interface ChatCompletionChunk {
205
211
*/
206
212
object : 'chat.completion.chunk' ;
207
213
214
+ /**
215
+ * The service tier used for processing the request. This field is only included if
216
+ * the `service_tier` parameter is specified in the request.
217
+ */
218
+ service_tier ?: 'scale' | 'default' | null ;
219
+
208
220
/**
209
221
* This fingerprint represents the backend configuration that the model runs with.
210
222
* Can be used in conjunction with the `seed` request parameter to understand when
@@ -800,6 +812,19 @@ export interface ChatCompletionCreateParamsBase {
800
812
*/
801
813
seed ?: number | null ;
802
814
815
+ /**
816
+ * Specifies the latency tier to use for processing the request. This parameter is
817
+ * relevant for customers subscribed to the scale tier service:
818
+ *
819
+ * - If set to `auto`, the system will utilize scale tier credits until they are
820
+ * exhausted.
821
+ * - If set to `default`, the request will be processed in the shared cluster.
822
+ *
823
+ * When this parameter is set, the response body will include the `service_tier`
824
+ * utilized.
825
+ */
826
+ service_tier ?: 'auto' | 'default' | null ;
827
+
803
828
/**
804
829
* Up to 4 sequences where the API will stop generating further tokens.
805
830
*/
0 commit comments