Skip to content

Commit c5e30f4

Browse files
fs-eireguschmue
authored and committed
Fix delay load for WebGPU EP and DML EP (#23111)
### Description This change fixes the DLL delay-load problem for the WebGPU EP and the DirectML EP. See the detailed explanation below. ### Problem When onnxruntime.dll uses delay loading for its dependencies, the dependencies are loaded using `LoadLibraryEx()`, which searches the directory of the process (.exe) instead of the directory of this library (onnxruntime.dll). This is a problem for the Node.js binding and the Python binding, because Windows will try to find the dependencies in the directory of node.exe or python.exe, which is not the directory of onnxruntime.dll. There was a previous attempt to fix this by loading DirectML.dll during the initialization of the onnxruntime Node.js binding, which works for the DML EP but is not a good solution because it does not really "delay" the load. For WebGPU, the situation is worse because webgpu_dawn.dll depends on dxil.dll and dxcompiler.dll, which are explicitly loaded at runtime in the code using `LoadLibraryA()`. This has the same DLL search problem. ### Solutions For onnxruntime.dll loading its direct dependencies, the problem can be resolved by setting the [`__pfnDliNotifyHook2` hook](https://learn.microsoft.com/en-us/cpp/build/reference/understanding-the-helper-function?view=msvc-170#structure-and-constant-definitions) to load from an absolute path that is constructed from the onnxruntime.dll folder and the DLL name. For webgpu_dawn.dll loading dxil.dll and dxcompiler.dll, since they are explicitly loaded in the code, the hook does not work. Instead, the problem can be resolved by ~~using WIN32 API `SetDllDirectory()` to add the onnxruntime.dll folder to the search path.~~ preloading the two DLLs from the onnxruntime.dll folder.
1 parent 6fb416d commit c5e30f4

16 files changed

+324
-66
lines changed

cmake/onnxruntime.cmake

+1
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ if(WIN32)
7777
onnxruntime_add_shared_library(onnxruntime
7878
${SYMBOL_FILE}
7979
"${ONNXRUNTIME_ROOT}/core/dll/dllmain.cc"
80+
"${ONNXRUNTIME_ROOT}/core/dll/delay_load_hook.cc"
8081
"${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc"
8182
)
8283
elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)

cmake/onnxruntime_nodejs.cmake

+17-3
Original file line numberDiff line numberDiff line change
@@ -60,15 +60,26 @@ else()
6060
endif()
6161
endif()
6262

63+
# a list of DLLs that the Node.js binding depends on
64+
set(NODEJS_DLL_DEPS)
65+
6366
# setup providers
6467
if (onnxruntime_USE_CUDA)
6568
set(NODEJS_BINDING_USE_CUDA "--use_cuda")
6669
endif()
6770
if (onnxruntime_USE_DML)
6871
set(NODEJS_BINDING_USE_DML "--use_dml")
72+
list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:onnxruntime>/DirectML.dll")
6973
endif()
7074
if (onnxruntime_USE_WEBGPU)
7175
set(NODEJS_BINDING_USE_WEBGPU "--use_webgpu")
76+
if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
77+
list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:dxcompiler>/dxil.dll")
78+
list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:dxcompiler>/dxcompiler.dll")
79+
endif()
80+
if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
81+
list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE:dawn::webgpu_dawn>")
82+
endif()
7283
endif()
7384
if (onnxruntime_USE_TENSORRT)
7485
set(NODEJS_BINDING_USE_TENSORRT "--use_tensorrt")
@@ -94,9 +105,12 @@ add_custom_target(js_common_npm_ci ALL
94105

95106
add_custom_target(nodejs_binding_wrapper ALL
96107
COMMAND ${NPM_CLI} ci
97-
COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --onnxruntime-generator=${CMAKE_GENERATOR}
98-
--arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU} ${NODEJS_BINDING_USE_TENSORRT}
99-
${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
108+
COMMAND ${NPM_CLI} run build -- "--onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR}"
109+
--config=${CMAKE_BUILD_TYPE}
110+
"--onnxruntime-generator=${CMAKE_GENERATOR}"
111+
"--dll_deps=${NODEJS_DLL_DEPS}"
112+
--arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU}
113+
${NODEJS_BINDING_USE_TENSORRT} ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
100114
WORKING_DIRECTORY ${JS_NODE_ROOT}
101115
COMMENT "Using cmake-js to build OnnxRuntime Node.js binding")
102116

cmake/onnxruntime_providers_webgpu.cmake

+27-9
Original file line numberDiff line numberDiff line change
@@ -23,24 +23,42 @@
2323
onnxruntime_add_include_to_target(onnxruntime_providers_webgpu
2424
onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface)
2525

26+
set(onnxruntime_providers_webgpu_dll_deps)
27+
2628
if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
2729
target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn)
2830

29-
if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
30-
list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll")
31-
endif()
31+
if (WIN32)
32+
if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
33+
list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll")
34+
endif()
3235

33-
# Copy webgpu_dawn.dll to the output directory
34-
add_custom_command(
35-
TARGET onnxruntime_providers_webgpu
36-
POST_BUILD
37-
COMMAND ${CMAKE_COMMAND} -E copy_if_different "$<TARGET_FILE:dawn::webgpu_dawn>" "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>"
38-
VERBATIM )
36+
list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE:dawn::webgpu_dawn>")
37+
endif()
3938
else()
4039
if (NOT onnxruntime_USE_EXTERNAL_DAWN)
4140
target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native)
4241
endif()
4342
target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc)
4443
endif()
4544

45+
if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
46+
# Ensure dxil.dll and dxcompiler.dll exist in the output directory $<TARGET_FILE_DIR:dxcompiler>
47+
add_dependencies(onnxruntime_providers_webgpu copy_dxil_dll)
48+
add_dependencies(onnxruntime_providers_webgpu dxcompiler)
49+
50+
list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE_DIR:dxcompiler>/dxil.dll")
51+
list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE_DIR:dxcompiler>/dxcompiler.dll")
52+
endif()
53+
54+
if (onnxruntime_providers_webgpu_dll_deps)
55+
# Copy dependency DLLs to the output directory
56+
add_custom_command(
57+
TARGET onnxruntime_providers_webgpu
58+
POST_BUILD
59+
COMMAND ${CMAKE_COMMAND} -E copy_if_different "${onnxruntime_providers_webgpu_dll_deps}" "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>"
60+
COMMAND_EXPAND_LISTS
61+
VERBATIM )
62+
endif()
63+
4664
set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime")

cmake/onnxruntime_unittests.cmake

+12
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,9 @@ set (onnxruntime_global_thread_pools_test_SRC
525525
set (onnxruntime_webgpu_external_dawn_test_SRC
526526
${TEST_SRC_DIR}/webgpu/external_dawn/main.cc)
527527

528+
set (onnxruntime_webgpu_delay_load_test_SRC
529+
${TEST_SRC_DIR}/webgpu/delay_load/main.cc)
530+
528531
# tests from lowest level library up.
529532
# the order of libraries should be maintained, with higher libraries being added first in the list
530533

@@ -1864,4 +1867,13 @@ if (onnxruntime_USE_WEBGPU AND onnxruntime_USE_EXTERNAL_DAWN)
18641867
onnxruntime_add_include_to_target(onnxruntime_webgpu_external_dawn_test dawn::dawncpp_headers dawn::dawn_headers)
18651868
endif()
18661869

1870+
if (onnxruntime_USE_WEBGPU AND WIN32 AND onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT onnxruntime_MINIMAL_BUILD)
1871+
AddTest(DYN
1872+
TARGET onnxruntime_webgpu_delay_load_test
1873+
SOURCES ${onnxruntime_webgpu_delay_load_test_SRC}
1874+
LIBS ${SYS_PATH_LIB}
1875+
DEPENDS ${all_dependencies}
1876+
)
1877+
endif()
1878+
18671879
include(onnxruntime_fuzz_test.cmake)

js/node/CMakeLists.txt

+6-4
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,12 @@ endif()
113113
if (WIN32)
114114
file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.dll
115115
DESTINATION ${dist_folder})
116-
if (USE_DML)
117-
file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/DirectML.dll
118-
DESTINATION ${dist_folder})
119-
endif ()
116+
if (ORT_NODEJS_DLL_DEPS)
117+
foreach(dll ${ORT_NODEJS_DLL_DEPS})
118+
file(COPY ${dll} DESTINATION ${dist_folder})
119+
endforeach()
120+
endif()
121+
120122
elseif (APPLE)
121123
file(COPY ${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.dylib
122124
DESTINATION ${dist_folder} FOLLOW_SYMLINK_CHAIN)

js/node/script/build.ts

+5
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ const USE_TENSORRT = !!buildArgs.use_tensorrt;
3939
const USE_COREML = !!buildArgs.use_coreml;
4040
// --use_qnn
4141
const USE_QNN = !!buildArgs.use_qnn;
42+
// --dll_deps=
43+
const DLL_DEPS = buildArgs.dll_deps;
4244

4345
// build path
4446
const ROOT_FOLDER = path.join(__dirname, '..');
@@ -82,6 +84,9 @@ if (USE_COREML) {
8284
if (USE_QNN) {
8385
args.push('--CDUSE_QNN=ON');
8486
}
87+
if (DLL_DEPS) {
88+
args.push(`--CDORT_NODEJS_DLL_DEPS=${DLL_DEPS}`);
89+
}
8590

8691
// set CMAKE_OSX_ARCHITECTURES for macOS build
8792
if (os.platform() === 'darwin') {

js/node/src/directml_load_helper.cc

-37
This file was deleted.

js/node/src/directml_load_helper.h

-6
This file was deleted.

js/node/src/inference_session_wrap.cc

-4
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#include "onnxruntime_cxx_api.h"
55

66
#include "common.h"
7-
#include "directml_load_helper.h"
87
#include "inference_session_wrap.h"
98
#include "run_options_helper.h"
109
#include "session_options_helper.h"
@@ -19,9 +18,6 @@ Napi::FunctionReference& InferenceSessionWrap::GetTensorConstructor() {
1918
}
2019

2120
Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) {
22-
#if defined(USE_DML) && defined(_WIN32)
23-
LoadDirectMLDll(env);
24-
#endif
2521
// create ONNX runtime env
2622
Ort::InitApi();
2723
ORT_NAPI_THROW_ERROR_IF(
+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
// == workaround for delay loading of dependencies of onnxruntime.dll ==
5+
//
6+
// Problem:
7+
//
8+
// When onnxruntime.dll uses delay loading for its dependencies, the dependencies are loaded using LoadLibraryEx,
9+
// which searches the directory of the process (.exe) instead of this library (onnxruntime.dll). This is a problem for
10+
// usages of Node.js binding and python binding, because Windows will try to find the dependencies in the directory
11+
// of node.exe or python.exe, which is not the directory of onnxruntime.dll.
12+
//
13+
// Solution:
14+
//
15+
// By using the delay load hook `__pfnDliNotifyHook2`, we can intervene in the loading procedure by loading from an
16+
// absolute path. The absolute path is constructed by appending the name of the DLL to load to the directory of
17+
// onnxruntime.dll. This way, we can ensure that the dependencies are loaded from the same directory as onnxruntime.dll.
18+
//
19+
// See also:
20+
// - https://learn.microsoft.com/en-us/cpp/build/reference/understanding-the-helper-function?view=msvc-170#structure-and-constant-definitions
21+
// - https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order#alternate-search-order-for-unpackaged-apps
22+
//
23+
// The DLL DelayLoad hook is only enabled when the compiler is MSVC and at least one of the following is True:
24+
// - both USE_WEBGPU and BUILD_DAWN_MONOLITHIC_LIBRARY are defined
25+
// - USE_DML is defined
26+
//
27+
// ORT_DELAY_LOAD_WEBGPU_DAWN_DLL is 1 when both USE_WEBGPU and BUILD_DAWN_MONOLITHIC_LIBRARY are defined, else 0.
// The condition is evaluated here in a real #if: a `defined` operator that is produced by macro expansion inside
// a later #if is undefined behavior, so the macro must expand to a plain integer.
#if defined(USE_WEBGPU) && defined(BUILD_DAWN_MONOLITHIC_LIBRARY)
#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL 1
#else
#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL 0
#endif
28+
// ORT_DELAY_LOAD_DIRECTML_DLL is 1 when USE_DML is defined, else 0.
// The condition is evaluated here in a real #if: a `defined` operator that is produced by macro expansion inside
// a later #if is undefined behavior, so the macro must expand to a plain integer.
#if defined(USE_DML)
#define ORT_DELAY_LOAD_DIRECTML_DLL 1
#else
#define ORT_DELAY_LOAD_DIRECTML_DLL 0
#endif
29+
#if defined(_MSC_VER) && (ORT_DELAY_LOAD_WEBGPU_DAWN_DLL || ORT_DELAY_LOAD_DIRECTML_DLL)
30+
31+
#include <Windows.h>
32+
#include <delayimp.h>
33+
#include <stdlib.h>
34+
#include <string>
35+
36+
#include "core/platform/env.h"
37+
38+
namespace {
39+
40+
#define DEFINE_KNOWN_DLL(name) {#name ".dll", L#name L".dll"}
41+
42+
constexpr struct {
43+
const char* str;
44+
const wchar_t* wstr;
45+
} known_dlls[] = {
46+
#if ORT_DELAY_LOAD_WEBGPU_DAWN_DLL
47+
DEFINE_KNOWN_DLL(webgpu_dawn),
48+
#endif
49+
#if ORT_DELAY_LOAD_DIRECTML_DLL
50+
DEFINE_KNOWN_DLL(DirectML),
51+
#endif
52+
};
53+
} // namespace
54+
55+
FARPROC WINAPI delay_load_hook(unsigned dliNotify, PDelayLoadInfo pdli) {
56+
if (dliNotify == dliNotePreLoadLibrary) {
57+
for (size_t i = 0; i < _countof(known_dlls); ++i) {
58+
if (_stricmp(pdli->szDll, known_dlls[i].str) == 0) {
59+
// Try to load the DLL from the same directory as onnxruntime.dll
60+
61+
// First, get the path to onnxruntime.dll
62+
auto path = Env::Default().GetRuntimePath();
63+
if (path.empty()) {
64+
// Failed to get the path to onnxruntime.dll. In this case, we will just return NULL and let the system
65+
// search for the DLL in the default search order.
66+
return NULL;
67+
}
68+
69+
// Append the name of the DLL. Now `path` is the absolute path to the DLL to load.
70+
path.append(known_dlls[i].wstr);
71+
72+
// Load the DLL
73+
return FARPROC(LoadLibraryExW(path.c_str(), NULL,
74+
LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR));
75+
}
76+
}
77+
}
78+
return NULL;
79+
}
80+
81+
extern "C" const PfnDliHook __pfnDliNotifyHook2 = delay_load_hook;
82+
83+
#endif

onnxruntime/core/dll/dllmain.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
#pragma GCC diagnostic pop
1414
#endif
1515

16-
// dllmain.cpp : Defines the entry point for the DLL application.
16+
// dllmain.cc : Defines the entry point for the DLL application.
1717
BOOL APIENTRY DllMain(HMODULE /*hModule*/,
1818
DWORD ul_reason_for_call,
1919
LPVOID /*lpReserved*/

onnxruntime/core/providers/webgpu/webgpu_context.cc

+26
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
#endif
1111

1212
#include "core/common/common.h"
13+
#include "core/common/path_string.h"
14+
#include "core/platform/env.h"
1315

1416
#include "core/providers/webgpu/compute_context.h"
1517
#include "core/providers/webgpu/webgpu_context.h"
@@ -50,6 +52,30 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info
5052

5153
// Initialization.Step.2 - Create wgpu::Adapter
5254
if (adapter_ == nullptr) {
55+
#if !defined(__EMSCRIPTEN__) && defined(_MSC_VER) && defined(DAWN_ENABLE_D3D12) && !defined(USE_EXTERNAL_DAWN)
56+
// If we are using the D3D12 backend on Windows and the build does not use external Dawn, dxil.dll and dxcompiler.dll are required.
57+
//
58+
// Dawn will try to load them later, but if they are in a different directory from the executable, it may fail to find them.
59+
// To avoid this issue, we try to load them from the same directory as the current module (usually onnxruntime.dll).
60+
auto runtime_path = Env::Default().GetRuntimePath();
61+
if (!runtime_path.empty()) {
62+
Status status;
63+
void* module_handle = nullptr;
64+
65+
PathString dxil_path = runtime_path + ToPathString(L"dxil.dll");
66+
status = Env::Default().LoadDynamicLibrary(dxil_path, false, &module_handle);
67+
if (status.IsOK() && module_handle != nullptr) {
68+
modules_.Add(dxil_path, module_handle);
69+
}
70+
71+
PathString dxcompiler_path = runtime_path + ToPathString(L"dxcompiler.dll");
72+
status = Env::Default().LoadDynamicLibrary(dxcompiler_path, false, &module_handle);
73+
if (status.IsOK() && module_handle != nullptr) {
74+
modules_.Add(dxcompiler_path, module_handle);
75+
}
76+
}
77+
#endif
78+
5379
wgpu::RequestAdapterOptions req_adapter_options = {};
5480
wgpu::DawnTogglesDescriptor adapter_toggles_desc = {};
5581
req_adapter_options.nextInChain = &adapter_toggles_desc;

onnxruntime/core/providers/webgpu/webgpu_context.h

+3
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <webgpu/webgpu_cpp.h>
1414

1515
#include "core/common/common.h"
16+
#include "core/framework/library_handles.h"
1617
#include "core/providers/webgpu/webgpu_execution_provider.h"
1718
#include "core/providers/webgpu/buffer_manager.h"
1819
#include "core/providers/webgpu/program_manager.h"
@@ -153,6 +154,8 @@ class WebGpuContext final {
153154

154155
std::once_flag init_flag_;
155156

157+
LibraryHandles modules_;
158+
156159
wgpu::Instance instance_;
157160
wgpu::Adapter adapter_;
158161
wgpu::Device device_;

0 commit comments

Comments
 (0)