Skip to content

Commit 23996bb

Browse files
authored
[CUDA][ROCm][Training] Fix cuda/rocm provider info hash (#19398)
When I tested a new provider option, the training pipeline failed. I found that training uses the hash code of the provider info to try to get a provider instance. If a provider option is not used in hashing, the provider instance fetched from the cache might have a different configuration for that option. Here I fix the hashing to use all provider options (except the default Arena config, which cannot be set from the Python API, since training is used with PyTorch in most cases). Fixed a few obvious typos in the touched files. Added regression test cases.
1 parent 06a84c8 commit 23996bb

File tree

5 files changed

+75
-38
lines changed

5 files changed

+75
-38
lines changed

onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc

+9-8
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,9 @@ constexpr const char* kTunableOpEnable = "tunable_op_enable";
3131
constexpr const char* kTunableOpTuningEnable = "tunable_op_tuning_enable";
3232
constexpr const char* kTunableOpMaxTuningDurationMs = "tunable_op_max_tuning_duration_ms";
3333
constexpr const char* kEnableSkipLayerNormStrictMode = "enable_skip_layer_norm_strict_mode";
34-
constexpr const char* kPreferNCHWMode = "prefer_nhwc";
35-
constexpr const char* KUseEPLevelUnifiedStream = "use_ep_level_unified_stream";
34+
constexpr const char* kPreferNHWCMode = "prefer_nhwc";
35+
constexpr const char* kUseEPLevelUnifiedStream = "use_ep_level_unified_stream";
36+
3637
} // namespace provider_option_names
3738
} // namespace cuda
3839

@@ -112,8 +113,8 @@ CUDAExecutionProviderInfo CUDAExecutionProviderInfo::FromProviderOptions(const P
112113
.AddAssignmentToReference(cuda::provider_option_names::kEnableCudaGraph, info.enable_cuda_graph)
113114
.AddAssignmentToReference(cuda::provider_option_names::kCudnnConv1dPadToNc1d, info.cudnn_conv1d_pad_to_nc1d)
114115
.AddAssignmentToReference(cuda::provider_option_names::kEnableSkipLayerNormStrictMode, info.enable_skip_layer_norm_strict_mode)
115-
.AddAssignmentToReference(cuda::provider_option_names::kPreferNCHWMode, info.prefer_nhwc)
116-
.AddAssignmentToReference(cuda::provider_option_names::KUseEPLevelUnifiedStream, info.use_ep_level_unified_stream)
116+
.AddAssignmentToReference(cuda::provider_option_names::kPreferNHWCMode, info.prefer_nhwc)
117+
.AddAssignmentToReference(cuda::provider_option_names::kUseEPLevelUnifiedStream, info.use_ep_level_unified_stream)
117118
.AddValueParser(
118119
cuda::provider_option_names::kTunableOpEnable,
119120
[&info](const std::string& value_str) -> Status {
@@ -164,8 +165,8 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const CUDAExecution
164165
{cuda::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op.tuning_enable)},
165166
{cuda::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op.max_tuning_duration_ms)},
166167
{cuda::provider_option_names::kEnableSkipLayerNormStrictMode, MakeStringWithClassicLocale(info.enable_skip_layer_norm_strict_mode)},
167-
{cuda::provider_option_names::kPreferNCHWMode, MakeStringWithClassicLocale(info.prefer_nhwc)},
168-
{cuda::provider_option_names::KUseEPLevelUnifiedStream, MakeStringWithClassicLocale(info.use_ep_level_unified_stream)},
168+
{cuda::provider_option_names::kPreferNHWCMode, MakeStringWithClassicLocale(info.prefer_nhwc)},
169+
{cuda::provider_option_names::kUseEPLevelUnifiedStream, MakeStringWithClassicLocale(info.use_ep_level_unified_stream)},
169170
};
170171

171172
return options;
@@ -185,8 +186,8 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const OrtCUDAProvid
185186
{cuda::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op_enable)},
186187
{cuda::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op_tuning_enable)},
187188
{cuda::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op_max_tuning_duration_ms)},
188-
{cuda::provider_option_names::kPreferNCHWMode, MakeStringWithClassicLocale(info.prefer_nhwc)},
189-
{cuda::provider_option_names::KUseEPLevelUnifiedStream, MakeStringWithClassicLocale(info.use_ep_level_unified_stream)},
189+
{cuda::provider_option_names::kPreferNHWCMode, MakeStringWithClassicLocale(info.prefer_nhwc)},
190+
{cuda::provider_option_names::kUseEPLevelUnifiedStream, MakeStringWithClassicLocale(info.use_ep_level_unified_stream)},
190191
};
191192

192193
return options;

onnxruntime/core/providers/cuda/cuda_execution_provider_info.h

+31-7
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,36 @@ struct CUDAExecutionProviderInfo {
8383
} // namespace onnxruntime
8484

8585
template <>
86-
struct std::hash<::onnxruntime::cuda::TunableOpInfo> {
87-
size_t operator()(const ::onnxruntime::cuda::TunableOpInfo& info) const {
88-
size_t seed_and_value{0xbc9f1d34};
89-
onnxruntime::HashCombine(info.enable, seed_and_value);
90-
onnxruntime::HashCombine(info.tuning_enable, seed_and_value);
91-
onnxruntime::HashCombine(info.max_tuning_duration_ms, seed_and_value);
92-
return seed_and_value;
86+
struct std::hash<::onnxruntime::CUDAExecutionProviderInfo> {
87+
size_t operator()(const ::onnxruntime::CUDAExecutionProviderInfo& info) const {
88+
size_t value{0xbc9f1d34}; // seed
89+
90+
// Bits: device_id (16), arena_extend_strategy/cudnn_conv_algo_search (reserved 2), boolean options (1 each)
91+
size_t data = static_cast<size_t>(info.device_id) ^
92+
(static_cast<size_t>(info.arena_extend_strategy) << 16) ^
93+
(static_cast<size_t>(info.cudnn_conv_algo_search) << 18) ^
94+
(static_cast<size_t>(info.do_copy_in_default_stream) << 20) ^
95+
(static_cast<size_t>(info.has_user_compute_stream) << 21) ^
96+
(static_cast<size_t>(info.cudnn_conv_use_max_workspace) << 22) ^
97+
(static_cast<size_t>(info.enable_cuda_graph) << 23) ^
98+
(static_cast<size_t>(info.tunable_op.enable) << 24) ^
99+
(static_cast<size_t>(info.tunable_op.tuning_enable) << 25) ^
100+
(static_cast<size_t>(info.cudnn_conv1d_pad_to_nc1d) << 26) ^
101+
(static_cast<size_t>(info.enable_skip_layer_norm_strict_mode) << 27) ^
102+
(static_cast<size_t>(info.prefer_nhwc) << 28) ^
103+
(static_cast<size_t>(info.use_ep_level_unified_stream) << 29);
104+
onnxruntime::HashCombine(data, value);
105+
106+
onnxruntime::HashCombine(info.gpu_mem_limit, value);
107+
onnxruntime::HashCombine(info.tunable_op.max_tuning_duration_ms, value);
108+
109+
// Memory pointers
110+
onnxruntime::HashCombine(reinterpret_cast<size_t>(info.user_compute_stream), value);
111+
onnxruntime::HashCombine(reinterpret_cast<size_t>(info.external_allocator_info.alloc), value);
112+
onnxruntime::HashCombine(reinterpret_cast<size_t>(info.external_allocator_info.free), value);
113+
onnxruntime::HashCombine(reinterpret_cast<size_t>(info.external_allocator_info.empty_cache), value);
114+
115+
// The default memory arena cfg is not used in hashing right now.
116+
return value;
93117
}
94118
};

onnxruntime/core/providers/rocm/rocm_execution_provider_info.h

+27-7
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,32 @@ struct ROCMExecutionProviderInfo {
7474
} // namespace onnxruntime
7575

7676
template <>
77-
struct std::hash<::onnxruntime::rocm::TunableOpInfo> {
78-
size_t operator()(const ::onnxruntime::rocm::TunableOpInfo& info) const {
79-
size_t seed_and_value{0xbc9f1d34};
80-
onnxruntime::HashCombine(info.enable, seed_and_value);
81-
onnxruntime::HashCombine(info.tuning_enable, seed_and_value);
82-
onnxruntime::HashCombine(info.max_tuning_duration_ms, seed_and_value);
83-
return seed_and_value;
77+
struct std::hash<::onnxruntime::ROCMExecutionProviderInfo> {
78+
size_t operator()(const ::onnxruntime::ROCMExecutionProviderInfo& info) const {
79+
size_t value{0xbc9f1d34}; // seed
80+
81+
// Bits: device_id (16), arena_extend_strategy/miopen_conv_exhaustive_search (reserved 2), boolean options (1 each)
82+
size_t data = static_cast<size_t>(info.device_id) ^
83+
(static_cast<size_t>(info.arena_extend_strategy) << 16) ^
84+
(static_cast<size_t>(info.miopen_conv_exhaustive_search) << 18) ^
85+
(static_cast<size_t>(info.do_copy_in_default_stream) << 20) ^
86+
(static_cast<size_t>(info.has_user_compute_stream) << 21) ^
87+
(static_cast<size_t>(info.miopen_conv_use_max_workspace) << 22) ^
88+
(static_cast<size_t>(info.enable_hip_graph) << 23) ^
89+
(static_cast<size_t>(info.tunable_op.enable) << 24) ^
90+
(static_cast<size_t>(info.tunable_op.tuning_enable) << 25);
91+
onnxruntime::HashCombine(data, value);
92+
93+
onnxruntime::HashCombine(info.gpu_mem_limit, value);
94+
onnxruntime::HashCombine(info.tunable_op.max_tuning_duration_ms, value);
95+
96+
// Memory pointers
97+
onnxruntime::HashCombine(reinterpret_cast<size_t>(info.user_compute_stream), value);
98+
onnxruntime::HashCombine(reinterpret_cast<size_t>(info.external_allocator_info.alloc), value);
99+
onnxruntime::HashCombine(reinterpret_cast<size_t>(info.external_allocator_info.free), value);
100+
onnxruntime::HashCombine(reinterpret_cast<size_t>(info.external_allocator_info.empty_cache), value);
101+
102+
// The default memory arena cfg is not used in hashing right now.
103+
return value;
84104
}
85105
};

onnxruntime/test/python/onnxruntime_test_python.py

+4
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,8 @@ def test_get_and_set_option_with_values(option_name, option_values):
414414
str(option_value),
415415
)
416416

417+
test_get_and_set_option_with_values("enable_cuda_graph", ["1", "0"])
418+
417419
test_get_and_set_option_with_values("arena_extend_strategy", ["kNextPowerOfTwo", "kSameAsRequested"])
418420

419421
test_get_and_set_option_with_values("cudnn_conv_algo_search", ["DEFAULT", "EXHAUSTIVE", "HEURISTIC"])
@@ -553,6 +555,8 @@ def test_get_and_set_option_with_values(option_name, option_values):
553555

554556
test_get_and_set_option_with_values("tunable_op_max_tuning_duration_ms", ["-1", "1"])
555557

558+
test_get_and_set_option_with_values("enable_hip_graph", ["1", "0"])
559+
556560
run_rocm_options_test()
557561

558562
def test_invalid_set_providers(self):

orttraining/orttraining/python/orttraining_python_module.cc

+4-16
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ void addObjectMethodsForLazyTensor(py::module& m);
4747
#endif
4848
bool InitArray();
4949

50-
bool GetDyanmicExecutionProviderHash(
50+
bool GetDynamicExecutionProviderHash(
5151
const std::string& ep_shared_lib_path,
5252
const ProviderOptions& provider_options,
5353
size_t& hash,
@@ -87,13 +87,7 @@ bool GetProviderInstanceHash(const std::string& type,
8787
if (auto* cuda_provider_info = TryGetProviderInfo_CUDA()) {
8888
const CUDAExecutionProviderInfo info = GetCudaExecutionProviderInfo(cuda_provider_info,
8989
provider_options_map);
90-
hash = static_cast<size_t>(info.device_id) ^
91-
info.gpu_mem_limit ^
92-
(static_cast<size_t>(info.arena_extend_strategy) << 16) ^
93-
(static_cast<size_t>(info.cudnn_conv_algo_search) << 18) ^
94-
(static_cast<size_t>(info.do_copy_in_default_stream) << 20) ^
95-
(static_cast<size_t>(info.has_user_compute_stream) << 22) ^
96-
std::hash<cuda::TunableOpInfo>{}(info.tunable_op);
90+
hash = std::hash<CUDAExecutionProviderInfo>{}(info);
9791
return true;
9892
}
9993
#endif
@@ -102,13 +96,7 @@ bool GetProviderInstanceHash(const std::string& type,
10296
if (auto* rocm_provider_info = TryGetProviderInfo_ROCM()) {
10397
const ROCMExecutionProviderInfo info = GetRocmExecutionProviderInfo(rocm_provider_info,
10498
provider_options_map);
105-
hash = static_cast<size_t>(info.device_id) ^
106-
info.gpu_mem_limit ^
107-
(static_cast<size_t>(info.arena_extend_strategy) << 16) ^
108-
(static_cast<size_t>(info.miopen_conv_exhaustive_search) << 18) ^
109-
(static_cast<size_t>(info.do_copy_in_default_stream) << 20) ^
110-
(static_cast<size_t>(info.has_user_compute_stream) << 22) ^
111-
std::hash<rocm::TunableOpInfo>{}(info.tunable_op);
99+
hash = std::hash<ROCMExecutionProviderInfo>{}(info);
112100
return true;
113101
}
114102
#endif
@@ -128,7 +116,7 @@ bool GetProviderInstanceHash(const std::string& type,
128116
provider_options.insert(option);
129117
}
130118
}
131-
return GetDyanmicExecutionProviderHash(shared_lib_path_it->second, provider_options, hash);
119+
return GetDynamicExecutionProviderHash(shared_lib_path_it->second, provider_options, hash);
132120
}
133121
}
134122
}

0 commit comments

Comments
 (0)