
Commit dbd9a83

feat: Integrate GPUDirect Storage (GDS) into Executor API (#3582)

* feat: Integrate GPUDirect Storage (GDS) into Executor API

  Squash of several dev commits

Signed-off-by: Dom Brown <[email protected]>

1 parent 90a28b9 · commit dbd9a83

23 files changed, +410 −82 lines
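Taken together, the commit threads a single useGpuDirectStorage boolean from the public ExecutorConfig through TrtGptModelOptionalParams down to TllmRuntime's engine loading, so a serialized engine can be read with GPUDirect Storage (NVIDIA's cuFile path) instead of a host read followed by a host-to-device copy. A minimal sketch of how a client might opt in after this commit (engine path and model type are illustrative):

#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;

int main()
{
    tle::ExecutorConfig config;
    // New in this commit: load the engine via GPUDirect Storage.
    config.setUseGpuDirectStorage(true);

    // Path and model type are placeholders for illustration.
    tle::Executor executor("/path/to/engine_dir", tle::ModelType::kDECODER_ONLY, config);
    return 0;
}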

benchmarks/cpp/bertBenchmark.cpp (+9 −5)

@@ -74,15 +74,17 @@ std::string engineFilename(
 }
 
 void benchmarkBert(std::string const& modelName, std::filesystem::path const& dataPath,
-    std::vector<int> const& batchSizes, std::vector<int> const& inLens, std::vector<float> const& gpuWeightsPercents,
-    std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp, int numRuns, int duration)
+    std::vector<int> const& batchSizes, std::vector<int> const& inLens, bool useGpuDirectStorage,
+    std::vector<float> const& gpuWeightsPercents, std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp,
+    int numRuns, int duration)
 {
     auto const worldConfig = WorldConfig::mpi();
     auto const enginePath = dataPath / engineFilename(dataPath, worldConfig, modelName);
 
     for (float gpuWeightsPercent : gpuWeightsPercents)
     {
-        auto rt = std::make_shared<TllmRuntime>(RawEngine(enginePath), logger.get(), gpuWeightsPercent);
+        auto rt = std::make_shared<TllmRuntime>(
+            RawEngine(enginePath), logger.get(), useGpuDirectStorage, gpuWeightsPercent);
         rt->addContext(0);
         for (auto inLen : inLens)
         {

@@ -174,6 +176,8 @@ int main(int argc, char* argv[])
         "by \";\", "
         "example: \"0.0;0.5;1.0\".",
         cxxopts::value<std::string>()->default_value("1.0"));
+    options.add_options()("use_gpu_direct_storage", "Enable GPUDirect Storage (GDS) for loading engine.",
+        cxxopts::value<bool>()->default_value("false"));
 
     auto result = options.parse(argc, argv);
 
@@ -258,8 +262,8 @@
     try
     {
         benchmarkBert(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inLens,
-            gpuWeightsPercents, logger, result["warm_up"].as<int>(), result["num_runs"].as<int>(),
-            result["duration"].as<int>());
+            result["use_gpu_direct_storage"].as<bool>(), gpuWeightsPercents, logger, result["warm_up"].as<int>(),
+            result["num_runs"].as<int>(), result["duration"].as<int>());
     }
     catch (std::exception const& e)
     {
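With the flag threaded through main, GDS can be enabled from the benchmark command line. A hypothetical invocation (model name and paths illustrative; cxxopts treats the bare boolean flag as true):

bertBenchmark --model bert_base --engine_dir /path/to/engines --use_gpu_direct_storage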

cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h (+12 −9)

@@ -41,9 +41,9 @@ class TrtGptModelOptionalParams
         std::optional<std::vector<SizeType32>> deviceIds = std::nullopt, bool normalizeLogProbs = true,
         bool enableChunkedContext = true,
         PeftCacheManagerConfig const& peftCacheManagerConfig = PeftCacheManagerConfig{},
-        executor::DecodingConfig decodingConfig = executor::DecodingConfig{}, float gpuWeightsPercent = 1,
-        std::optional<SizeType32> maxBeamWidth = std::nullopt, std::optional<SizeType32> maxBatchSize = std::nullopt,
-        std::optional<SizeType32> maxNumTokens = std::nullopt,
+        executor::DecodingConfig decodingConfig = executor::DecodingConfig{}, bool useGpuDirectStorage = false,
+        float gpuWeightsPercent = 1, std::optional<SizeType32> maxBeamWidth = std::nullopt,
+        std::optional<SizeType32> maxBatchSize = std::nullopt, std::optional<SizeType32> maxNumTokens = std::nullopt,
         executor::SchedulerConfig schedulerConfig = executor::SchedulerConfig{},
         executor::ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig
             = executor::ExtendedRuntimePerfKnobConfig{},

@@ -61,6 +61,7 @@ class TrtGptModelOptionalParams
         , enableChunkedContext{enableChunkedContext}
         , peftCacheManagerConfig(peftCacheManagerConfig)
         , decodingConfig(std::move(decodingConfig))
+        , useGpuDirectStorage(useGpuDirectStorage)
         , gpuWeightsPercent(gpuWeightsPercent)
         , maxBeamWidth(maxBeamWidth)
         , maxBatchSize(maxBatchSize)

@@ -87,12 +88,12 @@ class TrtGptModelOptionalParams
             executorConfig.getNormalizeLogProbs(), executorConfig.getEnableChunkedContext(),
             PeftCacheManagerConfig(executorConfig.getPeftCacheConfig().value_or(executor::PeftCacheConfig())),
             executorConfig.getDecodingConfig().value_or(executor::DecodingConfig{}),
-            executorConfig.getGpuWeightsPercent(), executorConfig.getMaxBeamWidth(), executorConfig.getMaxBatchSize(),
-            executorConfig.getMaxNumTokens(), executorConfig.getSchedulerConfig(),
-            executorConfig.getExtendedRuntimePerfKnobConfig(), executorConfig.getDebugConfig(),
-            executorConfig.getMaxSeqIdleMicroseconds(), executorConfig.getSpecDecConfig(),
-            executorConfig.getGuidedDecodingConfig(), isLeaderInOrchMode, executorConfig.getAdditionalModelOutputs(),
-            executorConfig.getGatherGenerationLogits())
+            executorConfig.getUseGpuDirectStorage(), executorConfig.getGpuWeightsPercent(),
+            executorConfig.getMaxBeamWidth(), executorConfig.getMaxBatchSize(), executorConfig.getMaxNumTokens(),
+            executorConfig.getSchedulerConfig(), executorConfig.getExtendedRuntimePerfKnobConfig(),
+            executorConfig.getDebugConfig(), executorConfig.getMaxSeqIdleMicroseconds(),
+            executorConfig.getSpecDecConfig(), executorConfig.getGuidedDecodingConfig(), isLeaderInOrchMode,
+            executorConfig.getAdditionalModelOutputs(), executorConfig.getGatherGenerationLogits())
     {
     }

@@ -106,6 +107,8 @@ class TrtGptModelOptionalParams
     bool enableChunkedContext;
     PeftCacheManagerConfig peftCacheManagerConfig;
     executor::DecodingConfig decodingConfig;
+    // Use GDS to load the engines?
+    bool useGpuDirectStorage;
     // Percentage of weights on the gpu at runtime
     float gpuWeightsPercent;
     std::optional<SizeType32> maxBeamWidth;
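Note that the flag is inserted ahead of gpuWeightsPercent in both the primary constructor and the ExecutorConfig-based delegating constructor, so any caller passing those arguments positionally must be updated. Since the struct's members are public (see the pybind bindings below), a minimal sketch can simply set the new field on a default-constructed object (assuming the remaining constructor parameters keep their defaults):

tensorrt_llm::batch_manager::TrtGptModelOptionalParams params{};
params.useGpuDirectStorage = true; // new member added by this commit
params.gpuWeightsPercent = 1.0F;   // existing member that now follows the flag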

cpp/include/tensorrt_llm/executor/executor.h (+7 −2)

@@ -1400,8 +1400,8 @@ class ExecutorConfig
         std::optional<ParallelConfig> parallelConfig = std::nullopt,
         std::optional<PeftCacheConfig> const& peftCacheConfig = std::nullopt,
         std::optional<LogitsPostProcessorConfig> logitsPostProcessorConfig = std::nullopt,
-        std::optional<DecodingConfig> decodingConfig = std::nullopt, float gpuWeightsPercent = 1,
-        std::optional<SizeType32> maxQueueSize = std::nullopt,
+        std::optional<DecodingConfig> decodingConfig = std::nullopt, bool useGpuDirectStorage = false,
+        float gpuWeightsPercent = 1, std::optional<SizeType32> maxQueueSize = std::nullopt,
         ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig = ExtendedRuntimePerfKnobConfig(),
         std::optional<DebugConfig> debugConfig = std::nullopt, SizeType32 recvPollPeriodMs = 0,
         uint64_t maxSeqIdleMicroseconds = kDefaultMaxSeqIdleMicroseconds,

@@ -1429,6 +1429,7 @@ class ExecutorConfig
     [[nodiscard]] std::optional<PeftCacheConfig> getPeftCacheConfig() const;
     [[nodiscard]] std::optional<LogitsPostProcessorConfig> getLogitsPostProcessorConfig() const;
     [[nodiscard]] std::optional<DecodingConfig> getDecodingConfig() const;
+    [[nodiscard]] bool getUseGpuDirectStorage() const;
     [[nodiscard]] float getGpuWeightsPercent() const;
     [[nodiscard]] std::optional<SizeType32> getMaxQueueSize() const;
     [[nodiscard]] ExtendedRuntimePerfKnobConfig getExtendedRuntimePerfKnobConfig() const;

@@ -1455,6 +1456,7 @@ class ExecutorConfig
     void setPeftCacheConfig(PeftCacheConfig const& peftCacheConfig);
     void setLogitsPostProcessorConfig(LogitsPostProcessorConfig const& logitsPostProcessorConfig);
     void setDecodingConfig(DecodingConfig const& decodingConfig);
+    void setUseGpuDirectStorage(bool const& useGpuDirectStorage);
     void setGpuWeightsPercent(float const& gpuWeightsPercent);
     void setMaxQueueSize(std::optional<SizeType32> const& maxQueueSize);
     void setExtendedRuntimePerfKnobConfig(ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig);

@@ -1510,6 +1512,9 @@ class ExecutorConfig
     /// @brief Decoding configuration.
     std::optional<DecodingConfig> mDecodingConfig;
 
+    /// @brief Enable/disable use of GPU Direct Storage (GDS) to load engines.
+    bool mUseGpuDirectStorage;
+
     /// @brief GPU weights percent for weight streaming.
     float mGpuWeightsPercent;
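The new accessor pair mirrors the adjacent gpuWeightsPercent API, and the constructor default above means a freshly built config reports GDS disabled. A quick hypothetical sanity check (assuming the other constructor arguments keep their defaults):

#include <cassert>
#include "tensorrt_llm/executor/executor.h"

void checkGdsFlag()
{
    tensorrt_llm::executor::ExecutorConfig cfg;
    assert(!cfg.getUseGpuDirectStorage()); // default is false, per the constructor signature
    cfg.setUseGpuDirectStorage(true);
    assert(cfg.getUseGpuDirectStorage());
}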

cpp/include/tensorrt_llm/runtime/gptSession.h (+3 −0)

@@ -99,6 +99,9 @@ class [[deprecated("Use the executor API instead.")]] GptSession
         SizeType32 maxBeamWidth;
         // The length of the longest input sequence
         SizeType32 maxSequenceLength;
+        // Enable/disable GPUDirectStorage
+        // Not supported by GptSession so hard-coded as false
+        bool useGpuDirectStorage{false};
         // Percentage of weights on the gpu at runtime
         float gpuWeightsPercent;
         // Whether the session will use a different decoder per request.

cpp/tensorrt_llm/batch_manager/trtEncoderModel.cpp (+2 −1)

@@ -45,7 +45,8 @@ TrtEncoderModel::TrtEncoderModel(runtime::ModelConfig const& modelConfig, WorldC
     , mWorldConfig{worldConfig}
     , mDevice{runtime::utils::initDevice(worldConfig)}
     , mLogger{logger ? std::move(logger) : std::make_shared<TllmLogger>()}
-    , mRuntime{std::make_shared<TllmRuntime>(rawEngine, mLogger.get(), optionalParams.gpuWeightsPercent)}
+    , mRuntime{std::make_shared<TllmRuntime>(
+          rawEngine, mLogger.get(), optionalParams.useGpuDirectStorage, optionalParams.gpuWeightsPercent)}
     , mMicroBatchId(0)
     , mCopyBufferManager{std::make_shared<CudaStream>()}
 {

cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp (+2 −2)

@@ -138,8 +138,8 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer
     , mDebugConfig{optionalParams.debugConfig}
     , mAdditionalModelOutputs{optionalParams.additionalModelOutputs}
     , mLogger{logger ? std::move(logger) : std::make_shared<TllmLogger>()}
-    , mRuntime{std::make_shared<TllmRuntime>(
-          rawEngine, mLogger.get(), optionalParams.gpuWeightsPercent, modelConfig.useShapeInference())}
+    , mRuntime{std::make_shared<TllmRuntime>(rawEngine, mLogger.get(), optionalParams.useGpuDirectStorage,
+          optionalParams.gpuWeightsPercent, modelConfig.useShapeInference())}
     , mCopyBufferManager{std::make_shared<CudaStream>()}
     , mCtxGenFusion(ctxGenFusion)
     , mOperatingBeamWidth{getMaxBeamWidth()}
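Both batch-manager models now pass the flag to TllmRuntime directly after the logger. The runtime constructor itself is not part of this excerpt; reconstructed from the two call sites, its post-commit signature is presumably close to the following (parameter defaults are assumptions):

// Reconstructed from the call sites above; not the actual header.
TllmRuntime(RawEngine const& rawEngine, nvinfer1::ILogger* logger,
    bool useGpuDirectStorage, float gpuWeightsPercent = 1.0F,
    bool useShapeInference = true);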

cpp/tensorrt_llm/executor/executorConfig.cpp (+12 −1)

@@ -28,7 +28,7 @@ ExecutorConfig::ExecutorConfig(SizeType32 maxBeamWidth, SchedulerConfig schedule
     std::optional<SizeType32> maxNumTokens, std::optional<ParallelConfig> parallelConfig,
     std::optional<PeftCacheConfig> const& peftCacheConfig,
     std::optional<LogitsPostProcessorConfig> logitsPostProcessorConfig, std::optional<DecodingConfig> decodingConfig,
-    float gpuWeightPercent, std::optional<SizeType32> maxQueueSize,
+    bool useGpuDirectStorage, float gpuWeightPercent, std::optional<SizeType32> maxQueueSize,
     ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig, std::optional<DebugConfig> debugConfig,
     SizeType32 recvPollPeriodMs, uint64_t maxSeqIdleMicroseconds,
     std::optional<SpeculativeDecodingConfig> specDecConfig, std::optional<GuidedDecodingConfig> guidedDecodingConfig,

@@ -48,6 +48,7 @@ ExecutorConfig::ExecutorConfig(SizeType32 maxBeamWidth, SchedulerConfig schedule
     , mPeftCacheConfig(peftCacheConfig)
     , mLogitsPostProcessorConfig(std::move(logitsPostProcessorConfig))
     , mDecodingConfig(std::move(decodingConfig))
+    , mUseGpuDirectStorage((useGpuDirectStorage))
     , mGpuWeightsPercent(gpuWeightPercent)
     , mMaxQueueSize(maxQueueSize)
     , mExtendedRuntimePerfKnobConfig(extendedRuntimePerfKnobConfig)

@@ -146,6 +147,11 @@ std::optional<DecodingConfig> ExecutorConfig::getDecodingConfig() const
     return mDecodingConfig;
 }
 
+bool ExecutorConfig::getUseGpuDirectStorage() const
+{
+    return mUseGpuDirectStorage;
+}
+
 float ExecutorConfig::getGpuWeightsPercent() const
 {
     return mGpuWeightsPercent;

@@ -276,6 +282,11 @@ void ExecutorConfig::setDecodingConfig(DecodingConfig const& decodingConfig)
     mDecodingConfig = decodingConfig;
 }
 
+void ExecutorConfig::setUseGpuDirectStorage(bool const& useGpuDirectStorage)
+{
+    mUseGpuDirectStorage = useGpuDirectStorage;
+}
+
 void ExecutorConfig::setGpuWeightsPercent(float const& gpuWeightsPercent)
 {
     mGpuWeightsPercent = gpuWeightsPercent;

cpp/tensorrt_llm/executor/serialization.cpp (+6 −3)

@@ -978,6 +978,7 @@ ExecutorConfig Serialization::deserializeExecutorConfig(std::istream& is)
     auto parallelConfig = su::deserializeWithGetterType<decltype(&ExecutorConfig::getParallelConfig)>(is);
     auto peftCacheConfig = su::deserializeWithGetterType<decltype(&ExecutorConfig::getPeftCacheConfig)>(is);
     auto decodingConfig = su::deserializeWithGetterType<decltype(&ExecutorConfig::getDecodingConfig)>(is);
+    auto useGpuDirectStorage = su::deserializeWithGetterType<decltype(&ExecutorConfig::getUseGpuDirectStorage)>(is);
     auto gpuWeightsPercent = su::deserializeWithGetterType<decltype(&ExecutorConfig::getGpuWeightsPercent)>(is);
     auto maxQueueSize = su::deserializeWithGetterType<decltype(&ExecutorConfig::getMaxQueueSize)>(is);
     auto extendedRuntimePerfKnobConfig

@@ -995,9 +996,9 @@ ExecutorConfig Serialization::deserializeExecutorConfig(std::istream& is)
 
     return ExecutorConfig{maxBeamWidth, schedulerConfig, kvCacheConfig, enableChunkedContext, normalizeLogProbs,
         iterStatsMaxIterations, requestStatsMaxIterations, batchingType, maxBatchSize, maxNumTokens, parallelConfig,
-        peftCacheConfig, std::nullopt, decodingConfig, gpuWeightsPercent, maxQueueSize, extendedRuntimePerfKnobConfig,
-        debugConfig, recvPollPeriodMs, maxSeqIdleMicroseconds, specDecConfig, guidedDecodingConfig,
-        additionalModelOutputs, gatherGenerationLogits};
+        peftCacheConfig, std::nullopt, decodingConfig, useGpuDirectStorage, gpuWeightsPercent, maxQueueSize,
+        extendedRuntimePerfKnobConfig, debugConfig, recvPollPeriodMs, maxSeqIdleMicroseconds, specDecConfig,
+        guidedDecodingConfig, additionalModelOutputs, gatherGenerationLogits};
 }
 
 size_t Serialization::serializedSize(ExecutorConfig const& executorConfig)

@@ -1020,6 +1021,7 @@ size_t Serialization::serializedSize(ExecutorConfig const& executorConfig)
     totalSize += su::serializedSize(executorConfig.getParallelConfig());
     totalSize += su::serializedSize(executorConfig.getPeftCacheConfig());
     totalSize += su::serializedSize(executorConfig.getDecodingConfig());
+    totalSize += su::serializedSize(executorConfig.getUseGpuDirectStorage());
     totalSize += su::serializedSize(executorConfig.getGpuWeightsPercent());
     totalSize += su::serializedSize(executorConfig.getMaxQueueSize());
     totalSize += su::serializedSize(executorConfig.getExtendedRuntimePerfKnobConfig());

@@ -1052,6 +1054,7 @@ void Serialization::serialize(ExecutorConfig const& executorConfig, std::ostream
     su::serialize(executorConfig.getParallelConfig(), os);
     su::serialize(executorConfig.getPeftCacheConfig(), os);
     su::serialize(executorConfig.getDecodingConfig(), os);
+    su::serialize(executorConfig.getUseGpuDirectStorage(), os);
     su::serialize(executorConfig.getGpuWeightsPercent(), os);
     su::serialize(executorConfig.getMaxQueueSize(), os);
     su::serialize(executorConfig.getExtendedRuntimePerfKnobConfig(), os);
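The flag is written and read between the decoding config and the GPU-weights percentage, which changes the wire layout: a config serialized before this commit will generally not deserialize correctly with the new code, and vice versa. A round-trip sketch using the two functions touched here (stream choice illustrative):

#include <sstream>

#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/executor/serialization.h"

namespace tle = tensorrt_llm::executor;

void roundTripExecutorConfig()
{
    tle::ExecutorConfig cfg;
    cfg.setUseGpuDirectStorage(true);

    std::stringstream ss;
    tle::Serialization::serialize(cfg, ss); // writes the flag right after decodingConfig
    auto const restored = tle::Serialization::deserializeExecutorConfig(ss);
    // restored.getUseGpuDirectStorage() is now true
}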

cpp/tensorrt_llm/pybind/bindings.cpp (+1 −0)

@@ -527,6 +527,7 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
         .def_readwrite("enable_chunked_context", &tb::TrtGptModelOptionalParams::enableChunkedContext)
         .def_readwrite("normalize_log_probs", &tb::TrtGptModelOptionalParams::normalizeLogProbs)
         .def_readwrite("decoding_config", &tb::TrtGptModelOptionalParams::decodingConfig)
+        .def_readwrite("use_gpu_direct_storage", &tb::TrtGptModelOptionalParams::useGpuDirectStorage)
         .def_readwrite("gpu_weights_percent", &tb::TrtGptModelOptionalParams::gpuWeightsPercent)
         .def_readwrite("max_beam_width", &tb::TrtGptModelOptionalParams::maxBeamWidth)
         .def_readwrite("scheduler_config", &tb::TrtGptModelOptionalParams::schedulerConfig)
