@@ -95,14 +95,15 @@ class GenericLlmRequest
     using RequestPtr = std::shared_ptr<GenericLlmRequest>;
     using MillisecondsType = std::chrono::milliseconds;
 
-    // 45 parameters, 52 items in initialization list
+    // 46 parameters, 53 items in initialization list
     GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> const& inputTokens,
         runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
         std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
         std::optional<TensorPtr> badWordsList = std::nullopt, std::optional<TensorPtr> stopWordsList = std::nullopt,
         std::optional<std::shared_ptr<std::vector<SizeType32>>> positionIds = std::nullopt,
         std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
         std::optional<SizeType32> promptVocabSize = std::nullopt,
+        std::optional<TensorPtr> multimodalEmbedding = std::nullopt,
         std::optional<TensorPtr> mropeRotaryCosSin = std::nullopt,
         std::optional<SizeType32> mropePositionDeltas = std::nullopt,
         std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
@@ -147,6 +148,7 @@ class GenericLlmRequest
         , mPositionIds(std::move(positionIds))
         , mPromptEmbeddingTable(std::move(promptEmbeddingTable))
         , mPromptVocabSize(promptVocabSize)
+        , mMultimodalEmbedding(std::move(multimodalEmbedding))
         , mMropeRotaryCosSin(std::move(mropeRotaryCosSin))
         , mMropePositionDeltas(mropePositionDeltas)
         , mLoraTaskId(loraTaskId)
@@ -854,6 +856,11 @@ class GenericLlmRequest
         return mPromptVocabSize;
     }
 
+    [[nodiscard]] std::optional<TensorPtr> getMultimodalEmbedding() const
+    {
+        return mMultimodalEmbedding;
+    }
+
     [[nodiscard]] std::optional<TensorPtr> getMropeRotaryCosSin() const
     {
         return mMropeRotaryCosSin;
@@ -1818,6 +1825,7 @@ class GenericLlmRequest
 
     std::optional<TensorPtr> mPromptEmbeddingTable{std::nullopt};
     std::optional<SizeType32> mPromptVocabSize{std::nullopt};
+    std::optional<TensorPtr> mMultimodalEmbedding{std::nullopt};
     std::optional<TensorPtr> mMropeRotaryCosSin{std::nullopt};
     std::optional<SizeType32> mMropePositionDeltas{std::nullopt};
 
@@ -2076,14 +2084,15 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
     using TokenExtraIdType = Base::TokenExtraIdType;
     using VecTokenExtraIds = Base::VecTokenExtraIds;
 
-    // 45 parameters, 45 parameters in Base class constructor
+    // 46 parameters, 46 parameters in Base class constructor
    LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> inputTokens,
         runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
         std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
         std::optional<TensorPtr> badWordsList = std::nullopt, std::optional<TensorPtr> stopWordsList = std::nullopt,
         std::optional<std::shared_ptr<std::vector<SizeType32>>> positionIds = std::nullopt,
         std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
         std::optional<SizeType32> promptVocabSize = std::nullopt,
+        std::optional<TensorPtr> multimodalEmbedding = std::nullopt,
         std::optional<TensorPtr> mropeRotaryCosSin = std::nullopt,
         std::optional<SizeType32> mropePositionDeltas = std::nullopt,
         std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
@@ -2111,26 +2120,27 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
         std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt)
         : Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
             std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
-            std::move(promptEmbeddingTable), promptVocabSize, std::move(mropeRotaryCosSin), mropePositionDeltas,
-            loraTaskId, std::move(loraWeights), std::move(loraConfig), std::move(lookaheadConfig),
-            std::move(kvCacheRetentionConfig), returnLogProbs, returnContextLogits, returnGenerationLogits,
-            std::move(draftTokens), std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
-            applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
-            std::move(encoderInputFeatures), std::move(encoderOutputLength), std::move(crossAttentionMask),
-            llmRequestType, std::move(inputTokenExtraIds), numReturnSequences, std::move(eagleConfig),
-            std::move(skipCrossAttnBlocks), returnPerfMetrics, std::move(guidedDecodingParams), languageAdapterUid,
-            allottedTimeMs, contextPhaseParams)
+            std::move(promptEmbeddingTable), promptVocabSize, std::move(multimodalEmbedding),
+            std::move(mropeRotaryCosSin), mropePositionDeltas, loraTaskId, std::move(loraWeights),
+            std::move(loraConfig), std::move(lookaheadConfig), std::move(kvCacheRetentionConfig), returnLogProbs,
+            returnContextLogits, returnGenerationLogits, std::move(draftTokens), std::move(draftLogits),
+            excludeInputFromOutput, std::move(logitsPostProcessor), applyLogitsPostProcessorBatched,
+            std::move(encoderInputTokens), returnEncoderOutput, clientId, priority, std::move(encoderInputFeatures),
+            std::move(encoderOutputLength), std::move(crossAttentionMask), llmRequestType,
+            std::move(inputTokenExtraIds), numReturnSequences, std::move(eagleConfig), std::move(skipCrossAttnBlocks),
+            returnPerfMetrics, std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams)
     {
     }
 
-    // 45 parameters, 45 parameters in Base class constructor
+    // 46 parameters, 46 parameters in Base class constructor
     LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::vector<TokenIdType> inputTokens,
         runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
         std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
         std::optional<TensorPtr> badWordsList = std::nullopt, std::optional<TensorPtr> stopWordsList = std::nullopt,
         std::optional<std::vector<SizeType32>> positionIds = std::nullopt,
         std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
         std::optional<SizeType32> promptVocabSize = std::nullopt,
+        std::optional<TensorPtr> multimodalEmbedding = std::nullopt,
         std::optional<TensorPtr> mropeRotaryCosSin = std::nullopt,
         std::optional<SizeType32> mropePositionDeltas = std::nullopt,
         std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
@@ -2159,9 +2169,10 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
             std::move(stopWordsList),
             positionIds.has_value() ? std::make_shared<std::vector<SizeType32>>(std::move(positionIds.value()))
                                     : std::optional<std::shared_ptr<std::vector<SizeType32>>>(std::nullopt),
-            std::move(promptEmbeddingTable), promptVocabSize, std::move(mropeRotaryCosSin), mropePositionDeltas,
-            loraTaskId, std::move(loraWeights), std::move(loraConfig), lookaheadConfig,
-            std::move(kvCacheRetentionConfig), returnLogProbs, returnContextLogits, returnGenerationLogits,
+            std::move(promptEmbeddingTable), promptVocabSize, std::move(multimodalEmbedding),
+            std::move(mropeRotaryCosSin), mropePositionDeltas, loraTaskId, std::move(loraWeights),
+            std::move(loraConfig), lookaheadConfig, std::move(kvCacheRetentionConfig), returnLogProbs,
+            returnContextLogits, returnGenerationLogits,
             draftTokens.has_value() ? std::make_shared<VecTokens>(std::move(draftTokens.value()))
                                     : std::make_shared<VecTokens>(),
             std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
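
Usage note (not part of the diff): the change threads a new optional multimodalEmbedding tensor through both LlmRequest constructors, stores it as mMultimodalEmbedding, and exposes it via getMultimodalEmbedding(). Below is a minimal, untested sketch of how a caller might pass the tensor in and read it back; the include path, namespace aliases, chosen parameter values, and the way the embedding tensor is produced are illustrative assumptions, not something this diff defines.

// Sketch only: assumes the caller already holds an ITensor::SharedPtr carrying the
// precomputed multimodal embedding (e.g. output of a vision encoder).
#include "tensorrt_llm/batch_manager/llmRequest.h"

#include <optional>
#include <utility>
#include <vector>

namespace tb = tensorrt_llm::batch_manager;
namespace tr = tensorrt_llm::runtime;

tb::LlmRequest makeMultimodalRequest(tb::LlmRequest::RequestIdType requestId,
    std::vector<tb::LlmRequest::TokenIdType> inputTokens, tb::LlmRequest::TensorPtr multimodalEmbedding)
{
    // Parameters up to promptVocabSize keep their defaults; the new
    // multimodalEmbedding argument is the 14th parameter of this overload.
    return tb::LlmRequest(requestId, /*maxNewTokens=*/64, std::move(inputTokens),
        tr::SamplingConfig{}, /*isStreaming=*/false,
        /*endId=*/std::nullopt, /*padId=*/std::nullopt, /*embeddingBias=*/std::nullopt,
        /*badWordsList=*/std::nullopt, /*stopWordsList=*/std::nullopt, /*positionIds=*/std::nullopt,
        /*promptEmbeddingTable=*/std::nullopt, /*promptVocabSize=*/std::nullopt,
        /*multimodalEmbedding=*/std::move(multimodalEmbedding));
}

void consume(tb::LlmRequest const& request)
{
    // getMultimodalEmbedding() returns std::optional<TensorPtr> by value.
    if (auto const mm = request.getMultimodalEmbedding())
    {
        // *mm is the ITensor::SharedPtr attached above; downstream code can
        // forward it to the model runner alongside the token inputs.
    }
}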