Skip to content

Commit 7dbe618

Browse files
feat: Add multimodal embedding field in LlmRequest (#3855)
* Add a new param to LlmRequest and Request to natively support mm Signed-off-by: Kate Cheng <[email protected]> * update comment Signed-off-by: Kate Cheng <[email protected]> * Update tests to match the new LlmRequest constructor parameters Signed-off-by: Kate Cheng <[email protected]> * Modify unitTest and modify mm_embedding's dict name in llama4 Signed-off-by: Kate Cheng <[email protected]> * Fix based on comments Signed-off-by: Kate Cheng <[email protected]> * Fix comment Signed-off-by: Kate Cheng <[email protected]> * Fix LlmRequest initialization in kvCacheManagerTest Signed-off-by: Kate Cheng <[email protected]> * Clean up code for prompt_tuning_config Signed-off-by: Kate Cheng <[email protected]> * Clean up prompt_tuning_config in GenerationRequest Signed-off-by: Kate Cheng <[email protected]> --------- Signed-off-by: Kate Cheng <[email protected]> Co-authored-by: Haohang Huang <[email protected]>
1 parent 1e317c9 commit 7dbe618

File tree

26 files changed

+290
-164
lines changed

26 files changed

+290
-164
lines changed

benchmarks/cpp/disaggServerBenchmark.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,7 @@ texec::Request makeExecutorContextRequest(Sample const& sample, SizeType32 const
535535
std::nullopt, // embeddingBias
536536
std::nullopt, // speculativeDecoding
537537
std::nullopt, // pTuning
538+
std::nullopt, // multimodalEmbedding
538539
std::nullopt, // mRopeConfig
539540
loraConfig, // loraConfig
540541
lookaheadConfig, // lookaheadConfig

benchmarks/cpp/gptManagerBenchmark.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -828,6 +828,7 @@ texec::Request makeExecutorRequest(Sample const& sample, SizeType32 const& beamW
828828
std::nullopt, // embeddingBias
829829
std::nullopt, // speculativeDecoding
830830
std::nullopt, // pTuning
831+
std::nullopt, // multimodalEmbedding
831832
std::nullopt, // mRopeConfig
832833
loraConfig, // loraConfig
833834
lookaheadConfig, // lookaheadConfig

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

+26-15
Original file line numberDiff line numberDiff line change
@@ -95,14 +95,15 @@ class GenericLlmRequest
9595
using RequestPtr = std::shared_ptr<GenericLlmRequest>;
9696
using MillisecondsType = std::chrono::milliseconds;
9797

98-
// 45 parameters, 52 items in initialization list
98+
// 46 parameters, 53 items in initialization list
9999
GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> const& inputTokens,
100100
runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
101101
std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
102102
std::optional<TensorPtr> badWordsList = std::nullopt, std::optional<TensorPtr> stopWordsList = std::nullopt,
103103
std::optional<std::shared_ptr<std::vector<SizeType32>>> positionIds = std::nullopt,
104104
std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
105105
std::optional<SizeType32> promptVocabSize = std::nullopt,
106+
std::optional<TensorPtr> multimodalEmbedding = std::nullopt,
106107
std::optional<TensorPtr> mropeRotaryCosSin = std::nullopt,
107108
std::optional<SizeType32> mropePositionDeltas = std::nullopt,
108109
std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
@@ -147,6 +148,7 @@ class GenericLlmRequest
147148
, mPositionIds(std::move(positionIds))
148149
, mPromptEmbeddingTable(std::move(promptEmbeddingTable))
149150
, mPromptVocabSize(promptVocabSize)
151+
, mMultimodalEmbedding(std::move(multimodalEmbedding))
150152
, mMropeRotaryCosSin(std::move(mropeRotaryCosSin))
151153
, mMropePositionDeltas(mropePositionDeltas)
152154
, mLoraTaskId(loraTaskId)
@@ -854,6 +856,11 @@ class GenericLlmRequest
854856
return mPromptVocabSize;
855857
}
856858

859+
[[nodiscard]] std::optional<TensorPtr> getMultimodalEmbedding() const
860+
{
861+
return mMultimodalEmbedding;
862+
}
863+
857864
[[nodiscard]] std::optional<TensorPtr> getMropeRotaryCosSin() const
858865
{
859866
return mMropeRotaryCosSin;
@@ -1818,6 +1825,7 @@ class GenericLlmRequest
18181825

18191826
std::optional<TensorPtr> mPromptEmbeddingTable{std::nullopt};
18201827
std::optional<SizeType32> mPromptVocabSize{std::nullopt};
1828+
std::optional<TensorPtr> mMultimodalEmbedding{std::nullopt};
18211829
std::optional<TensorPtr> mMropeRotaryCosSin{std::nullopt};
18221830
std::optional<SizeType32> mMropePositionDeltas{std::nullopt};
18231831

@@ -2076,14 +2084,15 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
20762084
using TokenExtraIdType = Base::TokenExtraIdType;
20772085
using VecTokenExtraIds = Base::VecTokenExtraIds;
20782086

2079-
// 45 parameters, 45 parameters in Base class constructor
2087+
// 46 parameters, 46 parameters in Base class constructor
20802088
LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> inputTokens,
20812089
runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
20822090
std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
20832091
std::optional<TensorPtr> badWordsList = std::nullopt, std::optional<TensorPtr> stopWordsList = std::nullopt,
20842092
std::optional<std::shared_ptr<std::vector<SizeType32>>> positionIds = std::nullopt,
20852093
std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
20862094
std::optional<SizeType32> promptVocabSize = std::nullopt,
2095+
std::optional<TensorPtr> multimodalEmbedding = std::nullopt,
20872096
std::optional<TensorPtr> mropeRotaryCosSin = std::nullopt,
20882097
std::optional<SizeType32> mropePositionDeltas = std::nullopt,
20892098
std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
@@ -2111,26 +2120,27 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
21112120
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt)
21122121
: Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
21132122
std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
2114-
std::move(promptEmbeddingTable), promptVocabSize, std::move(mropeRotaryCosSin), mropePositionDeltas,
2115-
loraTaskId, std::move(loraWeights), std::move(loraConfig), std::move(lookaheadConfig),
2116-
std::move(kvCacheRetentionConfig), returnLogProbs, returnContextLogits, returnGenerationLogits,
2117-
std::move(draftTokens), std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
2118-
applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
2119-
std::move(encoderInputFeatures), std::move(encoderOutputLength), std::move(crossAttentionMask),
2120-
llmRequestType, std::move(inputTokenExtraIds), numReturnSequences, std::move(eagleConfig),
2121-
std::move(skipCrossAttnBlocks), returnPerfMetrics, std::move(guidedDecodingParams), languageAdapterUid,
2122-
allottedTimeMs, contextPhaseParams)
2123+
std::move(promptEmbeddingTable), promptVocabSize, std::move(multimodalEmbedding),
2124+
std::move(mropeRotaryCosSin), mropePositionDeltas, loraTaskId, std::move(loraWeights),
2125+
std::move(loraConfig), std::move(lookaheadConfig), std::move(kvCacheRetentionConfig), returnLogProbs,
2126+
returnContextLogits, returnGenerationLogits, std::move(draftTokens), std::move(draftLogits),
2127+
excludeInputFromOutput, std::move(logitsPostProcessor), applyLogitsPostProcessorBatched,
2128+
std::move(encoderInputTokens), returnEncoderOutput, clientId, priority, std::move(encoderInputFeatures),
2129+
std::move(encoderOutputLength), std::move(crossAttentionMask), llmRequestType,
2130+
std::move(inputTokenExtraIds), numReturnSequences, std::move(eagleConfig), std::move(skipCrossAttnBlocks),
2131+
returnPerfMetrics, std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams)
21232132
{
21242133
}
21252134

2126-
// 45 parameters, 45 parameters in Base class constructor
2135+
// 46 parameters, 46 parameters in Base class constructor
21272136
LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::vector<TokenIdType> inputTokens,
21282137
runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
21292138
std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
21302139
std::optional<TensorPtr> badWordsList = std::nullopt, std::optional<TensorPtr> stopWordsList = std::nullopt,
21312140
std::optional<std::vector<SizeType32>> positionIds = std::nullopt,
21322141
std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
21332142
std::optional<SizeType32> promptVocabSize = std::nullopt,
2143+
std::optional<TensorPtr> multimodalEmbedding = std::nullopt,
21342144
std::optional<TensorPtr> mropeRotaryCosSin = std::nullopt,
21352145
std::optional<SizeType32> mropePositionDeltas = std::nullopt,
21362146
std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
@@ -2159,9 +2169,10 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
21592169
std::move(stopWordsList),
21602170
positionIds.has_value() ? std::make_shared<std::vector<SizeType32>>(std::move(positionIds.value()))
21612171
: std::optional<std::shared_ptr<std::vector<SizeType32>>>(std::nullopt),
2162-
std::move(promptEmbeddingTable), promptVocabSize, std::move(mropeRotaryCosSin), mropePositionDeltas,
2163-
loraTaskId, std::move(loraWeights), std::move(loraConfig), lookaheadConfig,
2164-
std::move(kvCacheRetentionConfig), returnLogProbs, returnContextLogits, returnGenerationLogits,
2172+
std::move(promptEmbeddingTable), promptVocabSize, std::move(multimodalEmbedding),
2173+
std::move(mropeRotaryCosSin), mropePositionDeltas, loraTaskId, std::move(loraWeights),
2174+
std::move(loraConfig), lookaheadConfig, std::move(kvCacheRetentionConfig), returnLogProbs,
2175+
returnContextLogits, returnGenerationLogits,
21652176
draftTokens.has_value() ? std::make_shared<VecTokens>(std::move(draftTokens.value()))
21662177
: std::make_shared<VecTokens>(),
21672178
std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),

cpp/include/tensorrt_llm/executor/executor.h

+6-1
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,8 @@ class Request
609609
/// @param embeddingBias The embedding bias tensor. Expected shape is [vocab_size]
610610
/// @param externalDraftTokensConfig The speculative decoding with external draft tokens configuration
611611
/// @param pTuningConfig The prompt tuning configuration
612+
/// @param multimodalEmbedding The multimodal embedding tensor. Expected shape is [num_multimodal_tokens,
613+
/// hidden_dim]
612614
/// @param mRopeConfig The mrope configuration
613615
/// @param loraConfig The LoRA configuration
614616
/// @param lookaheadConfig The lookahead speculative decoding configuration
@@ -646,7 +648,8 @@ class Request
646648
std::optional<Tensor> embeddingBias = std::nullopt,
647649
std::optional<ExternalDraftTokensConfig> externalDraftTokensConfig = std::nullopt,
648650
std::optional<PromptTuningConfig> pTuningConfig = std::nullopt,
649-
std::optional<MropeConfig> mRopeConfig = std::nullopt, std::optional<LoraConfig> loraConfig = std::nullopt,
651+
std::optional<Tensor> multimodalEmbedding = std::nullopt, std::optional<MropeConfig> mRopeConfig = std::nullopt,
652+
std::optional<LoraConfig> loraConfig = std::nullopt,
650653
std::optional<LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
651654
std::optional<KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
652655
std::optional<std::string> logitsPostProcessorName = std::nullopt,
@@ -688,6 +691,7 @@ class Request
688691
[[nodiscard]] std::optional<Tensor> getEmbeddingBias() const;
689692
[[nodiscard]] std::optional<ExternalDraftTokensConfig> getExternalDraftTokensConfig() const;
690693
[[nodiscard]] std::optional<PromptTuningConfig> getPromptTuningConfig() const;
694+
[[nodiscard]] std::optional<Tensor> getMultimodalEmbedding() const;
691695
[[nodiscard]] std::optional<MropeConfig> getMropeConfig() const;
692696
[[nodiscard]] std::optional<LoraConfig> getLoraConfig() const;
693697
[[nodiscard]] std::optional<LookaheadDecodingConfig> getLookaheadConfig() const;
@@ -722,6 +726,7 @@ class Request
722726
void setEmbeddingBias(Tensor const& embeddingBias);
723727
void setExternalDraftTokensConfig(ExternalDraftTokensConfig const& externalDraftTokensConfig);
724728
void setPromptTuningConfig(PromptTuningConfig const& pTuningConfig);
729+
void setMultimodalEmbedding(Tensor const& multimodalEmbedding);
725730
void setMropeConfig(MropeConfig const& mRopeConfig);
726731
void setLoraConfig(LoraConfig const& loraConfig);
727732
void setLookaheadConfig(LookaheadDecodingConfig const& lookaheadConfig);

cpp/tensorrt_llm/executor/request.cpp

+19-9
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,14 @@
2525

2626
namespace tensorrt_llm::executor
2727
{
28-
// 34 parameters
28+
// 35 parameters
2929
Request::Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming, SamplingConfig const& samplingConfig,
3030
OutputConfig const& outputConfig, std::optional<SizeType32> const& endId, std::optional<SizeType32> const& padId,
3131
std::optional<std::vector<SizeType32>> positionIds, std::optional<std::list<VecTokens>> badWords,
3232
std::optional<std::list<VecTokens>> stopWords, std::optional<Tensor> embeddingBias,
3333
std::optional<ExternalDraftTokensConfig> externalDraftTokensConfig, std::optional<PromptTuningConfig> pTuningConfig,
34-
std::optional<MropeConfig> mRopeConfig, std::optional<LoraConfig> loraConfig,
35-
std::optional<LookaheadDecodingConfig> lookaheadConfig,
34+
std::optional<Tensor> multimodalEmbedding, std::optional<MropeConfig> mRopeConfig,
35+
std::optional<LoraConfig> loraConfig, std::optional<LookaheadDecodingConfig> lookaheadConfig,
3636
std::optional<KvCacheRetentionConfig> kvCacheRetentionConfig, std::optional<std::string> logitsPostProcessorName,
3737
std::optional<LogitsPostProcessor> logitslogitsPostProcessor, std::optional<VecTokens> encoderInputTokenIds,
3838
std::optional<IdType> clientId, bool returnAllGeneratedTokens, float priority, RequestType type,
@@ -43,12 +43,12 @@ Request::Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming,
4343
std::optional<MillisecondsType> allottedTimeMs)
4444
: mImpl(std::make_unique<Impl>(std::move(inputTokenIds), maxTokens, streaming, samplingConfig, outputConfig, endId,
4545
padId, std::move(positionIds), std::move(badWords), std::move(stopWords), std::move(embeddingBias),
46-
std::move(externalDraftTokensConfig), std::move(pTuningConfig), std::move(mRopeConfig), std::move(loraConfig),
47-
lookaheadConfig, std::move(kvCacheRetentionConfig), std::move(logitsPostProcessorName),
48-
std::move(logitslogitsPostProcessor), std::move(encoderInputTokenIds), clientId, returnAllGeneratedTokens,
49-
priority, type, std::move(contextPhaseParams), std::move(encoderInputFeatures), encoderOutputLength,
50-
crossAttentionMask, numReturnSequences, eagleConfig, skipCrossAttnBlocks, std::move(guidedDecodingParams),
51-
languageAdapterUid, allottedTimeMs))
46+
std::move(externalDraftTokensConfig), std::move(pTuningConfig), std::move(multimodalEmbedding),
47+
std::move(mRopeConfig), std::move(loraConfig), lookaheadConfig, std::move(kvCacheRetentionConfig),
48+
std::move(logitsPostProcessorName), std::move(logitslogitsPostProcessor), std::move(encoderInputTokenIds),
49+
clientId, returnAllGeneratedTokens, priority, type, std::move(contextPhaseParams),
50+
std::move(encoderInputFeatures), encoderOutputLength, crossAttentionMask, numReturnSequences, eagleConfig,
51+
skipCrossAttnBlocks, std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs))
5252
{
5353
}
5454

@@ -143,6 +143,11 @@ std::optional<PromptTuningConfig> Request::getPromptTuningConfig() const
143143
return mImpl->getPromptTuningConfig();
144144
}
145145

146+
std::optional<Tensor> Request::getMultimodalEmbedding() const
147+
{
148+
return mImpl->getMultimodalEmbedding();
149+
}
150+
146151
std::optional<MropeConfig> Request::getMropeConfig() const
147152
{
148153
return mImpl->getMropeConfig();
@@ -306,6 +311,11 @@ void Request::setPromptTuningConfig(PromptTuningConfig const& pTuningConfig)
306311
return mImpl->setPromptTuningConfig(pTuningConfig);
307312
}
308313

314+
void Request::setMultimodalEmbedding(Tensor const& multimodalEmbedding)
315+
{
316+
return mImpl->setMultimodalEmbedding(multimodalEmbedding);
317+
}
318+
309319
void Request::setMropeConfig(MropeConfig const& mRopeConfig)
310320
{
311321
return mImpl->setMropeConfig(mRopeConfig);

cpp/tensorrt_llm/executor/requestImpl.h

+17-3
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,15 @@ class Request::Impl
3232
{
3333

3434
public:
35-
// 34 parameters, 34 items in initialization list
35+
// 35 parameters, 35 items in initialization list
3636
Impl(VecTokens inputTokenIds, SizeType32 maxNewTokens, bool streaming, SamplingConfig const& samplingConfig,
3737
OutputConfig outputConfig, std::optional<TokenIdType> const& endId, std::optional<TokenIdType> const& padId,
3838
std::optional<std::vector<SizeType32>> positionIds, std::optional<std::list<VecTokens>> badWords,
3939
std::optional<std::list<VecTokens>> stopWords, std::optional<Tensor> embeddingBias,
4040
std::optional<ExternalDraftTokensConfig> externalDraftTokensConfig,
41-
std::optional<PromptTuningConfig> pTuningConfig, std::optional<MropeConfig> mRopeConfig,
42-
std::optional<LoraConfig> loraConfig, std::optional<LookaheadDecodingConfig> lookaheadConfig,
41+
std::optional<PromptTuningConfig> pTuningConfig, std::optional<Tensor> multimodalEmbedding,
42+
std::optional<MropeConfig> mRopeConfig, std::optional<LoraConfig> loraConfig,
43+
std::optional<LookaheadDecodingConfig> lookaheadConfig,
4344
std::optional<KvCacheRetentionConfig> kvCacheRetentionConfig,
4445
std::optional<std::string> logitsPostProcessorName, std::optional<LogitsPostProcessor> logitsPostProcessor,
4546
std::optional<VecTokens> encoderInputTokenIds, std::optional<IdType> clientId, bool returnAllGeneratedTokens,
@@ -61,6 +62,7 @@ class Request::Impl
6162
, mEmbeddingBias(checkEmbeddingBias(std::move(embeddingBias)))
6263
, mExternalDraftTokensConfig(std::move(externalDraftTokensConfig))
6364
, mPTuningConfig(std::move(pTuningConfig))
65+
, mMultimodalEmbedding(std::move(multimodalEmbedding))
6466
, mMropeConfig(std::move(mRopeConfig))
6567
, mLoraConfig(std::move(loraConfig))
6668
, mLookaheadConfig(lookaheadConfig)
@@ -175,6 +177,11 @@ class Request::Impl
175177
return mPTuningConfig;
176178
}
177179

180+
[[nodiscard]] std::optional<Tensor> getMultimodalEmbedding() const
181+
{
182+
return mMultimodalEmbedding;
183+
}
184+
178185
[[nodiscard]] std::optional<MropeConfig> getMropeConfig() const
179186
{
180187
return mMropeConfig;
@@ -338,6 +345,11 @@ class Request::Impl
338345
mPTuningConfig = pTuningConfig;
339346
}
340347

348+
void setMultimodalEmbedding(Tensor const& multimodalEmbedding)
349+
{
350+
mMultimodalEmbedding = multimodalEmbedding;
351+
}
352+
341353
void setMropeConfig(MropeConfig const& mRopeConfig)
342354
{
343355
mMropeConfig = mRopeConfig;
@@ -498,6 +510,7 @@ class Request::Impl
498510
lambda(mEmbeddingBias);
499511
lambda(mExternalDraftTokensConfig);
500512
lambda(mPTuningConfig);
513+
lambda(mMultimodalEmbedding);
501514
lambda(mMropeConfig);
502515
lambda(mLoraConfig);
503516
lambda(mLookaheadConfig);
@@ -533,6 +546,7 @@ class Request::Impl
533546
std::optional<Tensor> mEmbeddingBias;
534547
std::optional<ExternalDraftTokensConfig> mExternalDraftTokensConfig;
535548
std::optional<PromptTuningConfig> mPTuningConfig;
549+
std::optional<Tensor> mMultimodalEmbedding;
536550
std::optional<MropeConfig> mMropeConfig;
537551
std::optional<LoraConfig> mLoraConfig;
538552
std::optional<LookaheadDecodingConfig> mLookaheadConfig;

0 commit comments

Comments
 (0)