Skip to content

Commit e9ef479

Browse files
VinayGuthal and daymxn authored
Enable multimodal response generation in android (#6901)
This change enables the use of multiple response modalities when calling generateContent on the model. It adds a new responseModalities field to GenerationConfig, which is sent to the server. For easier response handling, it also exposes an inlineDataParts property that returns all the inline data sent back by the model. --------- Co-authored-by: Daymon <[email protected]>
1 parent 534cc53 commit e9ef479

File tree

6 files changed

+38
-6
lines changed

6 files changed

+38
-6
lines changed

firebase-vertexai/CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
* [fixed] Fixed an issue with `LiveContentResponse` audio data not being present when the model was
1414
interrupted or the turn completed. (#6870)
1515
* [fixed] Fixed an issue with `LiveSession` not converting exceptions to `FirebaseVertexAIException`. (#6870)
16+
* [feature] Enable response generation in multiple modalities. (#6901)
1617

1718

1819
# 16.3.0

firebase-vertexai/api.txt

+6-3
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,9 @@ package com.google.firebase.vertexai.java {
132132
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> send(String text);
133133
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> sendFunctionResponse(java.util.List<com.google.firebase.vertexai.type.FunctionResponsePart> functionList);
134134
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> sendMediaStream(java.util.List<com.google.firebase.vertexai.type.MediaData> mediaChunks);
135-
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation();
135+
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation();
136136
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.vertexai.type.FunctionCallPart,com.google.firebase.vertexai.type.FunctionResponsePart>? functionCallHandler);
137-
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> stopAudioConversation();
137+
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> stopAudioConversation();
138138
method public abstract void stopReceiving();
139139
field public static final com.google.firebase.vertexai.java.LiveSessionFutures.Companion Companion;
140140
}
@@ -330,11 +330,13 @@ package com.google.firebase.vertexai.type {
330330
ctor public GenerateContentResponse(java.util.List<com.google.firebase.vertexai.type.Candidate> candidates, com.google.firebase.vertexai.type.PromptFeedback? promptFeedback, com.google.firebase.vertexai.type.UsageMetadata? usageMetadata);
331331
method public java.util.List<com.google.firebase.vertexai.type.Candidate> getCandidates();
332332
method public java.util.List<com.google.firebase.vertexai.type.FunctionCallPart> getFunctionCalls();
333+
method public java.util.List<com.google.firebase.vertexai.type.InlineDataPart> getInlineDataParts();
333334
method public com.google.firebase.vertexai.type.PromptFeedback? getPromptFeedback();
334335
method public String? getText();
335336
method public com.google.firebase.vertexai.type.UsageMetadata? getUsageMetadata();
336337
property public final java.util.List<com.google.firebase.vertexai.type.Candidate> candidates;
337338
property public final java.util.List<com.google.firebase.vertexai.type.FunctionCallPart> functionCalls;
339+
property public final java.util.List<com.google.firebase.vertexai.type.InlineDataPart> inlineDataParts;
338340
property public final com.google.firebase.vertexai.type.PromptFeedback? promptFeedback;
339341
property public final String? text;
340342
property public final com.google.firebase.vertexai.type.UsageMetadata? usageMetadata;
@@ -352,6 +354,7 @@ package com.google.firebase.vertexai.type {
352354
field public Integer? maxOutputTokens;
353355
field public Float? presencePenalty;
354356
field public String? responseMimeType;
357+
field public java.util.List<com.google.firebase.vertexai.type.ResponseModality>? responseModalities;
355358
field public com.google.firebase.vertexai.type.Schema? responseSchema;
356359
field public java.util.List<java.lang.String>? stopSequences;
357360
field public Float? temperature;
@@ -690,7 +693,7 @@ package com.google.firebase.vertexai.type {
690693
public final class RequestTimeoutException extends com.google.firebase.vertexai.type.FirebaseVertexAIException {
691694
}
692695

693-
@com.google.firebase.vertexai.type.PublicPreviewAPI public final class ResponseModality {
696+
public final class ResponseModality {
694697
method public int getOrdinal();
695698
property public final int ordinal;
696699
field public static final com.google.firebase.vertexai.type.ResponseModality AUDIO;

firebase-vertexai/src/main/kotlin/com/google/firebase/vertexai/type/GenerateContentResponse.kt

+12
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,18 @@ public class GenerateContentResponse(
4444
candidates.first().content.parts.filterIsInstance<FunctionCallPart>()
4545
}
4646

47+
/**
48+
* Convenience property listing every [InlineDataPart] in the first candidate's content.
49+
*
50+
* Any [ImagePart] in that content is converted via `toInlineDataPart()` and included as well.
* Converted [ImagePart]s are placed first, followed by the parts that were already
* [InlineDataPart]s, so the result does not preserve the original ordering of the parts.
*
* The list is computed lazily on first access and then cached.
*
* @throws NoSuchElementException on first access if [candidates] is empty, because
* `candidates.first()` is used to pick the candidate.
51+
*/
52+
public val inlineDataParts: List<InlineDataPart> by lazy {
53+
candidates.first().content.parts.let { parts ->
54+
parts.filterIsInstance<ImagePart>().map { it.toInlineDataPart() } +
55+
parts.filterIsInstance<InlineDataPart>()
56+
}
57+
}
58+
4759
@Serializable
4860
internal data class Internal(
4961
val candidates: List<Candidate.Internal>? = null,

firebase-vertexai/src/main/kotlin/com/google/firebase/vertexai/type/GenerationConfig.kt

+11-1
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ import kotlinx.serialization.Serializable
6969
* @property responseSchema Output schema of the generated candidate text. If set, a compatible
7070
* [responseMimeType] must also be set.
7171
*
72+
* @property responseModalities The modalities (formats of data) in which the model should respond.
73+
*
7274
* Compatible MIME types:
7375
* - `application/json`: Schema for JSON response.
7476
*
@@ -88,6 +90,7 @@ private constructor(
8890
internal val stopSequences: List<String>?,
8991
internal val responseMimeType: String?,
9092
internal val responseSchema: Schema?,
93+
internal val responseModalities: List<ResponseModality>?,
9194
) {
9295

9396
/**
@@ -115,6 +118,9 @@ private constructor(
115118
* @property responseMimeType See [GenerationConfig.responseMimeType].
116119
*
117120
* @property responseSchema See [GenerationConfig.responseSchema].
121+
*
122+
* @property responseModalities See [GenerationConfig.responseModalities].
123+
*
118124
* @see [generationConfig]
119125
*/
120126
public class Builder {
@@ -128,6 +134,7 @@ private constructor(
128134
@JvmField public var stopSequences: List<String>? = null
129135
@JvmField public var responseMimeType: String? = null
130136
@JvmField public var responseSchema: Schema? = null
137+
@JvmField public var responseModalities: List<ResponseModality>? = null
131138

132139
/** Create a new [GenerationConfig] with the attached arguments. */
133140
public fun build(): GenerationConfig =
@@ -142,6 +149,7 @@ private constructor(
142149
frequencyPenalty = frequencyPenalty,
143150
responseMimeType = responseMimeType,
144151
responseSchema = responseSchema,
152+
responseModalities = responseModalities
145153
)
146154
}
147155

@@ -156,7 +164,8 @@ private constructor(
156164
frequencyPenalty = frequencyPenalty,
157165
presencePenalty = presencePenalty,
158166
responseMimeType = responseMimeType,
159-
responseSchema = responseSchema?.toInternal()
167+
responseSchema = responseSchema?.toInternal(),
168+
responseModalities = responseModalities?.map { it.toInternal() }
160169
)
161170

162171
@Serializable
@@ -171,6 +180,7 @@ private constructor(
171180
@SerialName("presence_penalty") val presencePenalty: Float? = null,
172181
@SerialName("frequency_penalty") val frequencyPenalty: Float? = null,
173182
@SerialName("response_schema") val responseSchema: Schema.Internal? = null,
183+
@SerialName("response_modalities") val responseModalities: List<String>? = null
174184
)
175185

176186
public companion object {

firebase-vertexai/src/main/kotlin/com/google/firebase/vertexai/type/Part.kt

+8-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,14 @@ public class TextPart(public val text: String) : Part {
4545
*
4646
* @param image [Bitmap] to convert into a [Part]
4747
*/
48-
public class ImagePart(public val image: Bitmap) : Part
48+
public class ImagePart(public val image: Bitmap) : Part {
49+
50+
/**
 * Converts this part into an [InlineDataPart] carrying the encoded bytes of [image], labelled
 * with the MIME type `image/jpeg`.
 *
 * The output of `encodeBitmapToBase64Png(image)` is decoded back into raw bytes with
 * `android.util.Base64.decode` using `BASE_64_FLAGS`.
 *
 * NOTE(review): the helper's name suggests PNG output while the MIME type passed here is
 * `image/jpeg`. Confirm which format the helper actually produces so the two agree.
 */
internal fun toInlineDataPart() =
51+
InlineDataPart(
52+
android.util.Base64.decode(encodeBitmapToBase64Png(image), BASE_64_FLAGS),
53+
"image/jpeg"
54+
)
55+
}
4956

5057
/**
5158
* Represents binary data with an associated MIME type sent to and received from requests.

firebase-vertexai/src/main/kotlin/com/google/firebase/vertexai/type/ResponseModality.kt

-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ import kotlinx.serialization.KSerializer
2121
import kotlinx.serialization.Serializable
2222

2323
/** Represents the type of content present in a response (e.g., text, image, audio). */
24-
@PublicPreviewAPI
2524
public class ResponseModality private constructor(public val ordinal: Int) {
2625

2726
@Serializable(Internal.Serializer::class)

0 commit comments

Comments
 (0)