diff --git a/buildbot/configure.py b/buildbot/configure.py index fcb45d39a39b6..7a26a460108d2 100644 --- a/buildbot/configure.py +++ b/buildbot/configure.py @@ -29,7 +29,7 @@ def do_configure(args): libclc_targets_to_build = '' libclc_gen_remangled_variants = 'OFF' sycl_build_pi_cuda = 'OFF' - sycl_build_pi_esimd_cpu = 'ON' + sycl_build_pi_esimd_cpu = 'OFF' sycl_build_pi_rocm = 'OFF' sycl_build_pi_rocm_platform = 'AMD' sycl_werror = 'ON' @@ -45,8 +45,8 @@ def do_configure(args): if args.arm: llvm_targets_to_build = 'ARM;AArch64' - if args.disable_esimd_cpu: - sycl_build_pi_esimd_cpu = 'OFF' + if args.enable_esimd_cpu_emulation: + sycl_build_pi_esimd_cpu = 'ON' if args.cuda or args.rocm: llvm_enable_projects += ';libclc' @@ -181,7 +181,7 @@ def main(): parser.add_argument("--rocm", action='store_true', help="switch from OpenCL to ROCm") parser.add_argument("--rocm-platform", type=str, choices=['AMD', 'NVIDIA'], default='AMD', help="choose ROCm backend") parser.add_argument("--arm", action='store_true', help="build ARM support rather than x86") - parser.add_argument("--disable-esimd-cpu", action='store_true', help="build without ESIMD_CPU support") + parser.add_argument("--enable-esimd-cpu-emulation", action='store_true', help="build with ESIMD_CPU emulation support") parser.add_argument("--no-assertions", action='store_true', help="build without assertions") parser.add_argument("--docs", action='store_true', help="build Doxygen documentation") parser.add_argument("--no-werror", action='store_true', help="Don't treat warnings as errors") diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index ecb8c9369dcc0..1af0e2c8eb4d9 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -259,7 +259,6 @@ set( SYCL_TOOLCHAIN_DEPLOY_COMPONENTS sycl pi_opencl pi_level_zero - pi_esimd_cpu libsycldevice ${XPTIFW_LIBS} ) @@ -292,6 +291,19 @@ if(SYCL_BUILD_PI_ROCM) list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS libspirv-builtins pi_rocm) endif() +# TODO : Remove 'if (NOT 
MSVC)' when CM_EMU supports Windows +# environment +if (NOT MSVC) + if (SYCL_BUILD_PI_ESIMD_CPU) + list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS pi_esimd_cpu libcmrt-headers) + if (MSVC) + list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS libcmrt-libs libcmrt-dlls) + else() + list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS libcmrt-sos) + endif() + endif() +endif() + # Use it as fake dependency in order to force another command(s) to execute. add_custom_command(OUTPUT __force_it COMMAND "${CMAKE_COMMAND}" -E echo diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index 34eff1725cfa0..3c8cf47135808 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -11,6 +11,7 @@ and a wide range of compute accelerators such as GPU and FPGA. - [Build DPC++ toolchain with support for NVIDIA CUDA](#build-dpc-toolchain-with-support-for-nvidia-cuda) - [Build DPC++ toolchain with support for AMD ROCm](#build-dpc-toolchain-with-support-for-amd-rocm) - [Build DPC++ toolchain with support for NVIDIA ROCm](#build-dpc-toolchain-with-support-for-nvidia-rocm) + - [Build DPC++ toolchain with support for ESIMD CPU Emulation](#build-dpc-toolchain-with-support-for-esimd-cpu-emulation) - [Build Doxygen documentation](#build-doxygen-documentation) - [Deployment](#deployment) - [Use DPC++ toolchain](#use-dpc-toolchain) @@ -109,6 +110,7 @@ flags can be found by launching the script with `--help`): * `--cuda` -> use the cuda backend (see [Nvidia CUDA](#build-dpc-toolchain-with-support-for-nvidia-cuda)) * `--rocm` -> use the rocm backend (see [AMD ROCm](#build-dpc-toolchain-with-support-for-amd-rocm)) * `--rocm-platform` -> select the platform used by the rocm backend, `AMD` or `NVIDIA` (see [AMD ROCm](#build-dpc-toolchain-with-support-for-amd-rocm) or see [NVIDIA ROCm](#build-dpc-toolchain-with-support-for-nvidia-rocm)) +* `--enable-esimd-cpu-emulation` -> enable ESIMD CPU emulation (see [ESIMD CPU emulation](#build-dpc-toolchain-with-support-for-esimd-cpu-emulation)) +* `--shared-libs`
-> Build shared libraries * `-t` -> Build type (debug or release) * `-o` -> Path to build directory @@ -206,6 +208,48 @@ as well as CUDA to be installed, see Currently this was only tested on Linux with ROCm 4.2, CUDA 11 and a GeForce GTX 1060 card. +### Build DPC++ toolchain with support for ESIMD CPU Emulation + +There is experimental support in DPC++ for ESIMD CPU emulation. + +This feature supports ESIMD CPU emulation using the CM_EMU library from the [CM +Emulation +project](https://github.com/intel/cm-cpu-emulation). A pre-built library +package will be downloaded and installed in your deploy directory +during the toolchain build. + +To enable support for ESIMD CPU emulation, follow the instructions for +the Linux DPC++ toolchain, but add the `--enable-esimd-cpu-emulation` option. + +Enabling this flag requires the following packages to be installed. + +* Ubuntu 20.04 + * libva-dev / 2.7.0-2 + * libva-drm2 / 2.7.0-2 + * libva-glx2 / 2.7.0-2 + * libva-wayland2 / 2.7.0-2 + * libva-x11-2 / 2.7.0-2 + * libva2 / focal 2.7.0-2 + * libffi-dev / 3.3-4 + * libffi7 / 3.3-4 + * libdrm-amdgpu1 + * libdrm-common + * libdrm-dev + * libdrm-intel1 + * libdrm-nouveau2 + * libdrm-radeon1 + * libdrm2 +* RHEL 8.* + * libffi + * libffi-devel + * libdrm + * libdrm-devel + * libva + * libva-devel + +Currently, this feature has been tested and verified on the Ubuntu 20.04 +environment. + ### Build Doxygen documentation Building Doxygen documentation is similar to building the product itself.
First, diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/emu/detail/esimd_emu_functions_v1.h b/sycl/include/sycl/ext/intel/experimental/esimd/emu/detail/esimd_emu_functions_v1.h index 9fcde11e6e9d4..239e567dc0986 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/emu/detail/esimd_emu_functions_v1.h +++ b/sycl/include/sycl/ext/intel/experimental/esimd/emu/detail/esimd_emu_functions_v1.h @@ -44,3 +44,7 @@ void (*cm_fence_ptr)(void); char *(*sycl_get_surface_base_addr_ptr)(int); char *(*__cm_emu_get_slm_ptr)(void); void (*cm_slm_init_ptr)(size_t); +void (*sycl_get_cm_buffer_params_ptr)(void *, char **, uint32_t *, + std::mutex **); +void (*sycl_get_cm_image_params_ptr)(void *, char **, uint32_t *, uint32_t *, + uint32_t *, std::mutex **); diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/emu/detail/esimdcpu_device_interface.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/emu/detail/esimdcpu_device_interface.hpp index eb249f7c61781..ca24b20b38019 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/emu/detail/esimdcpu_device_interface.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/emu/detail/esimdcpu_device_interface.hpp @@ -24,6 +24,7 @@ // pointer table file ('esimd_emu_functions_v1.h') included in 'struct // ESIMDDeviceInterface' definition. 
#include +#include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { diff --git a/sycl/plugins/CMakeLists.txt b/sycl/plugins/CMakeLists.txt index ddc4e067beb40..10f22d881da84 100644 --- a/sycl/plugins/CMakeLists.txt +++ b/sycl/plugins/CMakeLists.txt @@ -15,6 +15,10 @@ endif() add_subdirectory(opencl) add_subdirectory(level_zero) -if (SYCL_BUILD_PI_ESIMD_CPU) - add_subdirectory(esimd_cpu) +# TODO : Remove 'if (NOT MSVC)' when CM_EMU supports Windows +# environment +if (NOT MSVC) + if (SYCL_BUILD_PI_ESIMD_CPU) + add_subdirectory(esimd_cpu) + endif() endif() diff --git a/sycl/plugins/esimd_cpu/CMakeLists.txt b/sycl/plugins/esimd_cpu/CMakeLists.txt index b6c12e47b5e0f..e520e63137efd 100755 --- a/sycl/plugins/esimd_cpu/CMakeLists.txt +++ b/sycl/plugins/esimd_cpu/CMakeLists.txt @@ -2,9 +2,99 @@ # PI Esimd CPU library # Create Shared library for libpi_esimd_cpu.so. +include(ExternalProject) + include_directories("${sycl_inc_dir}") +# FIXME/TODO: 'pi.h' is included in 'pi_esimd_cpu.cpp', and CL_*_INTEL +# and CL_*_KHR definitions in 'pi.h' are from +# ${OPENCL_INCLUDE}. 
Remove build dependency on OpenCL include_directories(${OpenCL_INCLUDE_DIR}) -include_directories(${LIBCMRT_INCLUDE}) + +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_build) +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install) + +if (MSVC) + set(LIBCM ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/lib/libcm${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(LIBIGFXCMRT_EMU ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/lib/igfxcmrt64_emu${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + set(LIBCM ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/lib/libcm${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(LIBIGFXCMRT_EMU ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/lib/libigfxcmrt_emu${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + +if (DEFINED CM_LOCAL_SOURCE_DIR) + # Using local CM directory for online building without downloading + if (MSVC) + ExternalProject_Add(cm-emu + DOWNLOAD_COMMAND "" + SOURCE_DIR ${CM_LOCAL_SOURCE_DIR} + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_build + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + BUILD_BYPRODUCTS ${LIBCM} ${LIBIGFXCMRT_EMU} + ) + else() + ExternalProject_Add(cm-emu + DOWNLOAD_COMMAND "" + SOURCE_DIR ${CM_LOCAL_SOURCE_DIR} + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_build + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install + CMAKE_ARGS -DLIBVA_INSTALL_PATH=/usr + -D__SYCL_EXPLICIT_SIMD_PLUGIN__=true + -DCMAKE_INSTALL_PREFIX= + BUILD_BYPRODUCTS ${LIBCM} ${LIBIGFXCMRT_EMU} + ) + endif() +else () + if (DEFINED CM_PACKAGE_URL) + # Downloading pre-built CM Package + file (MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install) + ExternalProject_Add(cm-emu + URL ${CM_PACKAGE_URL} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_BINARY_DIR}/cm-emu-prefix/src/cm-emu/ + BUILD_BYPRODUCTS ${LIBCM} ${LIBIGFXCMRT_EMU} + ) + else() + # Build from CM source tree fetched from github + if (MSVC) + 
message(FATAL_ERROR "Online-building of CM_EMU library is not supported under Windows environment") + else() + ExternalProject_Add(cm-emu + GIT_REPOSITORY https://github.com/intel/cm-cpu-emulation.git + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_build + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install + CMAKE_ARGS -DLIBVA_INSTALL_PATH=/usr + -D__SYCL_EXPLICIT_SIMD_PLUGIN__=true + -DCMAKE_INSTALL_PREFIX= + BUILD_BYPRODUCTS ${LIBCM} ${LIBIGFXCMRT_EMU} + ) + endif() + endif() +endif () +ExternalProject_Add_Step(cm-emu llvminstall + COMMAND ${CMAKE_COMMAND} -E make_directory ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps && ${CMAKE_COMMAND} -E copy_directory / ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps + COMMENT "Installing cm-emu into the LLVM binary directory" + DEPENDEES install +) + +include_directories(${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/include/igfxcmrt_emu) +include_directories(${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/include/libcm/cm) + +# Compilation flag to exclude lines in header files imported from CM +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__SYCL_EXPLICIT_SIMD_PLUGIN__") + +set(CMAKE_CXX_STANDARD 17) + +# Compilation option modification to prevent build termination caused by +# warnings from CM-imported files +if (MSVC) +string(REPLACE "/W4" " " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +else() +string(REPLACE "-pedantic" " " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +endif() add_library(pi_esimd_cpu SHARED "${sycl_inc_dir}/CL/sycl/detail/pi.h" @@ -31,16 +121,42 @@ else() ) endif() +add_dependencies(pi_esimd_cpu OpenCL-Headers) +add_dependencies(pi_esimd_cpu cm-emu) add_dependencies(sycl-toolchain pi_esimd_cpu) -add_dependencies(pi_esimd_cpu - OpenCL-Headers) - -target_link_libraries(pi_esimd_cpu PRIVATE sycl) +target_link_libraries(pi_esimd_cpu PRIVATE sycl ${LIBCM} ${LIBIGFXCMRT_EMU}) set_target_properties(pi_esimd_cpu PROPERTIES LINKER_LANGUAGE CXX) add_common_options(pi_esimd_cpu) install(TARGETS pi_esimd_cpu - LIBRARY DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT 
pi_esimd_cpu - RUNTIME DESTINATION "bin" COMPONENT pi_esimd_cpu) + LIBRARY DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT pi_esimd_cpu + RUNTIME DESTINATION "bin" COMPONENT pi_esimd_cpu) + +# Copy CM Header files to $(INSTALL)/include/sycl/CL/ +install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install/include/libcm/cm/ + DESTINATION ${SYCL_INCLUDE_DIR}/CL + COMPONENT libcmrt-headers + FILES_MATCHING PATTERN "*.h" +) + +# Copy '.so' files to '$(INSTALL)/lib' +if (MSVC) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install/lib/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + COMPONENT libcmrt-libs + FILES_MATCHING PATTERN "*.lib" + ) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install/bin/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/bin + COMPONENT libcmrt-dlls + FILES_MATCHING PATTERN "*.dll" + ) +else() + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install/lib/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + COMPONENT libcmrt-sos + FILES_MATCHING PATTERN "*.so" + ) +endif() diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp index 2b29c46382738..b98f18f393148 100644 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp @@ -28,6 +28,8 @@ #include #include +#include + #include #include #include @@ -38,17 +40,73 @@ #include #include -#ifdef __GNUC__ -// Linux -#include -#else -// Windows -#include -#endif - #include "pi_esimd_cpu.hpp" -#define PLACEHOLDER_UNUSED(x) (void)x +namespace { + +// Helper functions for unified 'Return' type declaration - imported +// from pi_level_zero.cpp +template +pi_result getInfoImpl(size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet, T Value, size_t ValueSize, + Assign &&AssignFunc) { + if (ParamValue != nullptr) { + if (ParamValueSize < ValueSize) { + return PI_INVALID_VALUE; + } + AssignFunc(ParamValue, Value, ValueSize); + } + if (ParamValueSizeRet != nullptr) { + *ParamValueSizeRet = ValueSize; + } + return 
PI_SUCCESS; +} + +template +pi_result getInfo(size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet, T Value) { + auto assignment = [](void *ParamValue, T Value, size_t ValueSize) { + *static_cast(ParamValue) = Value; + }; + return getInfoImpl(ParamValueSize, ParamValue, ParamValueSizeRet, Value, + sizeof(T), assignment); +} + +template +pi_result getInfoArray(size_t ArrayLength, size_t ParamValueSize, + void *ParamValue, size_t *ParamValueSizeRet, T *Value) { + return getInfoImpl(ParamValueSize, ParamValue, ParamValueSizeRet, Value, + ArrayLength * sizeof(T), memcpy); +} + +template <> +pi_result getInfo(size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet, const char *Value) { + return getInfoArray(strlen(Value) + 1, ParamValueSize, ParamValue, + ParamValueSizeRet, Value); +} + +class ReturnHelper { +public: + ReturnHelper(size_t ArgParamValueSize, void *ArgParamValue, + size_t *ArgParamValueSizeRet) + : ParamValueSize(ArgParamValueSize), ParamValue(ArgParamValue), + ParamValueSizeRet(ArgParamValueSizeRet) {} + + template pi_result operator()(const T &t) { + return getInfo(ParamValueSize, ParamValue, ParamValueSizeRet, t); + } + +private: + size_t ParamValueSize; + void *ParamValue; + size_t *ParamValueSizeRet; +}; + +} // anonymous namespace + +// Controls PI level tracing prints. +static bool PrintPiTrace = false; // Global variables used in PI_esimd_cpu // Note we only create a simple pointer variables such that C++ RT won't @@ -67,524 +125,489 @@ static sycl::detail::ESIMDEmuPluginOpaqueData *PiESimdDeviceAccess; using IDBuilder = sycl::detail::Builder; -// Lambda-call interface definition. 
-// 'extern "C"' is required as CM supports only C-style function calls -// while kernel is given as lambda function -// -#define LAMBDA_WRAPPER_TMPL(ARGTYPE, TAG, DIMS) \ - typedef std::function LambdaFunction_##TAG; \ - \ - extern "C" struct LambdaWrapper_##TAG { \ - LambdaFunction_##TAG Func; \ - const sycl::range &LocalSize; \ - const sycl::range &GlobalSize; \ - const sycl::id &GlobalOffset; \ - LambdaWrapper_##TAG(LambdaFunction_##TAG ArgFunc, \ - const sycl::range &ArgLocalSize, \ - const sycl::range &ArgGlobalSize, \ - const sycl::id &ArgGlobalOffset) \ - : Func(ArgFunc), LocalSize(ArgLocalSize), GlobalSize(ArgGlobalSize), \ - GlobalOffset(ArgGlobalOffset) {} \ - }; \ - \ - template \ - auto makeWrapper_##TAG(LambdaTy F, const sycl::range &LocalSize, \ - const sycl::range &GlobalSize, \ - const sycl::id &GlobalOffset) { \ - std::unique_ptr Wrapper = \ - std::make_unique(LambdaWrapper_##TAG( \ - LambdaFunction_##TAG(F), LocalSize, GlobalSize, GlobalOffset)); \ - return Wrapper; \ - } - -#define _COMMA_ , - -LAMBDA_WRAPPER_TMPL(sycl::id<1>, ID_1DIM, 1) -LAMBDA_WRAPPER_TMPL(sycl::id<2>, ID_2DIM, 2) -LAMBDA_WRAPPER_TMPL(sycl::id<3>, ID_3DIM, 3) -LAMBDA_WRAPPER_TMPL(sycl::item<1 _COMMA_ false>, ITEM_1DIM, 1) -LAMBDA_WRAPPER_TMPL(sycl::item<2 _COMMA_ false>, ITEM_2DIM, 2) -LAMBDA_WRAPPER_TMPL(sycl::item<3 _COMMA_ false>, ITEM_3DIM, 3) -LAMBDA_WRAPPER_TMPL(sycl::item<1 _COMMA_ true>, ITEM_OFFSET_1DIM, 1) -LAMBDA_WRAPPER_TMPL(sycl::item<2 _COMMA_ true>, ITEM_OFFSET_2DIM, 2) -LAMBDA_WRAPPER_TMPL(sycl::item<3 _COMMA_ true>, ITEM_OFFSET_3DIM, 3) -LAMBDA_WRAPPER_TMPL(sycl::nd_item<1>, NDITEM_1DIM, 1) -LAMBDA_WRAPPER_TMPL(sycl::nd_item<2>, NDITEM_2DIM, 2) -LAMBDA_WRAPPER_TMPL(sycl::nd_item<3>, NDITEM_3DIM, 3) - -#undef _COMMA_ -#undef LAMBDA_WRAPPER_TMPL - -extern "C" inline void invokeLambda_ID_1DIM(void *Wrapper) { - auto *LambdaWrapper = reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::id<1> instance using thread ID 
info - // retrieved from CM and call Lambda function - // LambdaWrapper->Func(id_1dim); -} - -extern "C" inline void invokeLambda_ID_2DIM(void *Wrapper) { - auto *LambdaWrapper = reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::id<2> instance using thread ID info - // retrieved from CM and call Lambda function - // LambdaWrapper->Func(id_2dim); -} - -extern "C" inline void invokeLambda_ID_3DIM(void *Wrapper) { - auto *LambdaWrapper = reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::id<3> instance using thread ID info - // retrieved from CM and call Lambda function - // LambdaWrapper->Func(id_3dim); -} - -extern "C" inline void invokeLambda_ITEM_1DIM(void *Wrapper) { - auto *LambdaWrapper = reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<1, false> instance using thread - // ID info retrieved from CM and call Lambda function - // LambdaWrapper->Func(item_1dim); -} - -extern "C" inline void invokeLambda_ITEM_2DIM(void *Wrapper) { - auto *LambdaWrapper = reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<2, false> instance using thread - // ID info retrieved from CM and call Lambda function - // LambdaWrapper->Func(item_2dim); -} - -extern "C" inline void invokeLambda_ITEM_3DIM(void *Wrapper) { - auto *LambdaWrapper = reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<3, false> instance using thread - // ID info retrieved from CM and call Lambda function - // LambdaWrapper->Func(item_3dim); -} - -extern "C" inline void invokeLambda_ITEM_OFFSET_1DIM(void *Wrapper) { - auto *LambdaWrapper = - reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<1, true> instance using thread - // ID info retrieved from CM with GlobalOffset info and call Lambda - // function - // 
LambdaWrapper->Func(item_offset_1dim); -} - -extern "C" inline void invokeLambda_ITEM_OFFSET_2DIM(void *Wrapper) { - auto *LambdaWrapper = - reinterpret_cast(Wrapper); +template +using KernelFunc = std::function &)>; + +// Struct to wrap dimension info and lambda function to be invoked by +// CM Kernel launcher that only accepts raw function pointer for +// kernel execution. Function instances of 'InvokeLambda' un-wrap this +// struct instance and invoke lambda function ('Func') +template struct LambdaWrapper { + KernelFunc Func; + const sycl::range &LocalSize; + const sycl::range &GlobalSize; + const sycl::id &GlobalOffset; + LambdaWrapper(KernelFunc ArgFunc, + const sycl::range &ArgLocalSize, + const sycl::range &ArgGlobalSize, + const sycl::id &ArgGlobalOffset) + : Func(ArgFunc), LocalSize(ArgLocalSize), GlobalSize(ArgGlobalSize), + GlobalOffset(ArgGlobalOffset) {} +}; - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<2, true> instance using thread - // ID info retrieved from CM with GlobalOffset info and call Lambda - // function - // LambdaWrapper->Func(item_offset_2dim); -} +// Function to generate a lambda wrapper object above +template +auto MakeLambdaWrapper(KernelFunc ArgFunc, + const sycl::range &LocalSize, + const sycl::range &GlobalSize, + const sycl::id &GlobalOffset) { + std::unique_ptr> Wrapper = + std::make_unique>(LambdaWrapper( + KernelFunc(ArgFunc), LocalSize, GlobalSize, GlobalOffset)); + return Wrapper; +} + +// A helper structure to create multi-dimensional range when +// dimensionality is given as a template parameter. `create` function +// in specializations accepts a template `Gen` function which +// generates range extent for a dimension given as an argument. 
+template struct RangeBuilder; + +template <> struct RangeBuilder<1> { + template static sycl::range<1> create(Gen G) { + return sycl::range<1>{G(0)}; + } +}; +template <> struct RangeBuilder<2> { + template static sycl::range<2> create(Gen G) { + return sycl::range<2>{G(0), G(1)}; + } +}; +template <> struct RangeBuilder<3> { + template static sycl::range<3> create(Gen G) { + return sycl::range<3>{G(0), G(1), G(2)}; + } +}; -extern "C" inline void invokeLambda_ITEM_OFFSET_3DIM(void *Wrapper) { - auto *LambdaWrapper = - reinterpret_cast(Wrapper); +// Function template to generate entry point of kernel execution as +// raw function pointer. CM kernel launcher executes one instance of +// this function per 'NDims' +template void InvokeLambda(void *Wrapper) { + auto *WrappedLambda = reinterpret_cast *>(Wrapper); + sycl::range GroupSize( + sycl::detail::InitializedVal::template get<0>()); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<3, true> instance using thread - // ID info retrieved from CM with GlobalOffset info and call Lambda - // function - // LambdaWrapper->Func(item_offset_3dim); -} + for (int I = 0; I < NDims /*Dims*/; ++I) { + GroupSize[I] = WrappedLambda->GlobalSize[I] / WrappedLambda->LocalSize[I]; + } -extern "C" inline void invokeLambda_NDITEM_1DIM(void *Wrapper) { - auto *LambdaWrapper = reinterpret_cast(Wrapper); + const sycl::id LocalID = RangeBuilder::create( + [](int i) { return cm_support::get_thread_idx(i); }); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::nd_item<1> instance using thread ID - // info retrieved from CM with GlobalOffset/GlobalSize/LocalSize - // info and call Lambda function - // LambdaWrapper->Func(nd_item_1dim); -} + const sycl::id GroupID = RangeBuilder::create( + [](int Id) { return cm_support::get_group_idx(Id); }); -extern "C" inline void invokeLambda_NDITEM_2DIM(void *Wrapper) { - auto *LambdaWrapper = reinterpret_cast(Wrapper); + const sycl::group Group = 
IDBuilder::createGroup( + WrappedLambda->GlobalSize, WrappedLambda->LocalSize, GroupSize, GroupID); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::nd_item<2> instance using thread ID - // info retrieved from CM with GlobalOffset/GlobalSize/LocalSize - // info and call Lambda function - // LambdaWrapper->Func(nd_item_2dim); -} + const sycl::id GlobalID = GroupID * WrappedLambda->LocalSize + + LocalID + WrappedLambda->GlobalOffset; + const sycl::item GlobalItem = + IDBuilder::createItem(WrappedLambda->GlobalSize, GlobalID, + WrappedLambda->GlobalOffset); + const sycl::item LocalItem = + IDBuilder::createItem(WrappedLambda->LocalSize, LocalID); -extern "C" inline void invokeLambda_NDITEM_3DIM(void *Wrapper) { - auto *LambdaWrapper = reinterpret_cast(Wrapper); + const sycl::nd_item NDItem = + IDBuilder::createNDItem(GlobalItem, LocalItem, Group); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::nd_item<3> instance using thread ID - // info retrieved from CM with GlobalOffset/GlobalSize/LocalSize - // info and call Lambda function - // LambdaWrapper->Func(nd_item_3dim); + WrappedLambda->Func(NDItem); } // libCMBatch class defines interface for lauching kernels with // software multi-threads -template class libCMBatch { +template class libCMBatch { private: // Kernel function - KernelType MKernel; + KernelFunc MKernel; // Space-dimension info std::vector GroupDim; std::vector SpaceDim; - // Number of threads for parallelization - const uint32_t hwThreads = (uint32_t)std::thread::hardware_concurrency(); - - using IDBuilder = sycl::detail::Builder; - const sycl::id UnusedID = - sycl::detail::InitializedVal::template get<0>(); - const sycl::range UnusedRange = - sycl::detail::InitializedVal::template get<0>(); - public: - libCMBatch(KernelType Kernel) - : MKernel(Kernel), GroupDim{1, 1, 1}, SpaceDim{1, 1, 1} { - assert(MKernel != nullptr); - } + libCMBatch(KernelFunc Kernel) + : MKernel(Kernel), GroupDim{1, 1, 1}, SpaceDim{1, 1, 1} 
{} + + /// Invoking kernel lambda function wrapped by 'LambdaWrapper' using + /// 'InvokeLambda' function. + void runIterationSpace(const sycl::range &LocalSize, + const sycl::range &GlobalSize, + const sycl::id &GlobalOffset) { + auto WrappedLambda = + MakeLambdaWrapper(MKernel, LocalSize, GlobalSize, GlobalOffset); - // ID_1DIM - template - typename std::enable_if<(DIMS == 1) && - (std::is_same>::value)>::type - runIterationSpace(const sycl::range<1> &Range) { - auto WrappedLambda_ID_1DIM = - makeWrapper_ID_1DIM(MKernel, UnusedRange, UnusedRange, UnusedID); + for (int I = 0; I < DIMS; I++) { + SpaceDim[I] = (uint32_t)LocalSize[I]; + GroupDim[I] = (uint32_t)(GlobalSize[I] / LocalSize[I]); + } - SpaceDim[0] = (uint32_t)Range[0]; + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda, GroupDim, SpaceDim); - PLACEHOLDER_UNUSED(WrappedLambda_ID_1DIM); - // TODO : Invoke invokeLambda_ID_1DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ID_1DIM and dimension info + ESimdCPU.launchMT(sizeof(struct LambdaWrapper), WrappedLambda.get()); } +}; - // ID_2DIM - template - typename std::enable_if<(DIMS == 2) && - (std::is_same>::value)>::type - runIterationSpace(const sycl::range<2> &Range) { - auto WrappedLambda_ID_2DIM = - makeWrapper_ID_2DIM(MKernel, UnusedRange, UnusedRange, UnusedID); - - SpaceDim[0] = (uint32_t)Range[0]; - SpaceDim[1] = (uint32_t)Range[1]; +// Function to provide buffer info for kernel compilation without +// dependency on '_pi_buffer' definition +void sycl_get_cm_buffer_params(void *PtrInput, char **BaseAddr, uint32_t *Width, + std::mutex **MtxLock) { + _pi_buffer *Buf = static_cast<_pi_buffer *>(PtrInput); - PLACEHOLDER_UNUSED(WrappedLambda_ID_2DIM); - // TODO : Invoke invokeLambda_ID_2DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ID_2DIM and dimension info - } + *BaseAddr = cm_support::get_surface_base_addr(Buf->SurfaceIndex); + *Width = static_cast(Buf->Size); - // ID_3DIM - template - typename 
std::enable_if<(DIMS == 3) && - (std::is_same>::value)>::type - runIterationSpace(const sycl::range<3> &Range) { - auto WrappedLambda_ID_3DIM = - makeWrapper_ID_3DIM(MKernel, UnusedRange, UnusedRange, UnusedID); - - SpaceDim[0] = (uint32_t)Range[0]; - SpaceDim[1] = (uint32_t)Range[1]; - SpaceDim[2] = (uint32_t)Range[2]; - - PLACEHOLDER_UNUSED(WrappedLambda_ID_3DIM); - // TODO : Invoke invokeLambda_ID_3DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ID_3DIM and dimension info - } + *MtxLock = &(Buf->mutexLock); +} - // Item w/o offset - template - typename std::enable_if< - (DIMS == 1) && - (std::is_same>::value)>::type - runIterationSpace(const sycl::range<1> &Range) { - auto WrappedLambda_ITEM_1DIM = - makeWrapper_ITEM_1DIM(MKernel, UnusedRange, UnusedRange, UnusedID); - - SpaceDim[0] = (uint32_t)Range[0]; - - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_1DIM); - // TODO : Invoke invokeLambda_ITEM_1DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_1DIM and dimension info - } +// Function to provide image info for kernel compilation without +// dependency on '_pi_image' definition +void sycl_get_cm_image_params(void *PtrInput, char **BaseAddr, uint32_t *Width, + uint32_t *Height, uint32_t *Bpp, + std::mutex **MtxLock) { + _pi_image *Img = static_cast<_pi_image *>(PtrInput); - template - typename std::enable_if< - (DIMS == 2) && - (std::is_same>::value)>::type - runIterationSpace(const sycl::range<2> &Range) { - auto WrappedLambda_ITEM_2DIM = - makeWrapper_ITEM_2DIM(MKernel, UnusedRange, UnusedRange, UnusedID); - - SpaceDim[0] = (uint32_t)Range[0]; - SpaceDim[1] = (uint32_t)Range[1]; - - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_2DIM); - // TODO : Invoke invokeLambda_ITEM_2DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_2DIM and dimension info - } - - template - typename std::enable_if< - (DIMS == 3) && - (std::is_same>::value)>::type - runIterationSpace(const sycl::range<3> &Range) { - auto 
WrappedLambda_ITEM_3DIM = - makeWrapper_ITEM_3DIM(MKernel, UnusedRange, UnusedRange, UnusedID); + *BaseAddr = cm_support::get_surface_base_addr(Img->SurfaceIndex); - SpaceDim[0] = (uint32_t)Range[0]; - SpaceDim[1] = (uint32_t)Range[1]; - SpaceDim[2] = (uint32_t)Range[2]; - - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_3DIM); - // TODO : Invoke invokeLambda_ITEM_3DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_3DIM and dimension info - } - - // Item w/ offset - template - typename std::enable_if< - (DIMS == 1) && - (std::is_same>::value)>::type - runIterationSpace(const sycl::range<1> &Range, const sycl::id<1> &Offset) { - auto WrappedLambda_ITEM_OFFSET_1DIM = - makeWrapper_ITEM_OFFSET_1DIM(MKernel, UnusedRange, UnusedRange, Offset); - - SpaceDim[0] = (uint32_t)Range[0]; + *Bpp = static_cast(Img->BytesPerPixel); + *Width = static_cast(Img->Width) * (*Bpp); + *Height = static_cast(Img->Height); - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_OFFSET_1DIM); - // TODO : Invoke invokeLambda_ITEM_OFFSET_1DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_OFFSET_1DIM and dimension info - } - - template - typename std::enable_if< - (DIMS == 2) && - (std::is_same>::value)>::type - runIterationSpace(const sycl::range<2> &Range, const sycl::id<2> &Offset) { - auto WrappedLambda_ITEM_OFFSET_2DIM = - makeWrapper_ITEM_OFFSET_2DIM(MKernel, UnusedRange, UnusedRange, Offset); - - SpaceDim[0] = (uint32_t)Range[0]; - SpaceDim[1] = (uint32_t)Range[1]; - - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_OFFSET_2DIM); - // TODO : Invoke invokeLambda_ITEM_OFFSET_2DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_OFFSET_2DIM and dimension info - } - - template - typename std::enable_if< - (DIMS == 3) && - (std::is_same>::value)>::type - runIterationSpace(const sycl::range<3> &Range, const sycl::id<3> &Offset) { - auto WrappedLambda_ITEM_OFFSET_3DIM = - makeWrapper_ITEM_OFFSET_3DIM(MKernel, UnusedRange, UnusedRange, Offset); - - 
SpaceDim[0] = (uint32_t)Range[0]; - SpaceDim[1] = (uint32_t)Range[1]; - SpaceDim[2] = (uint32_t)Range[2]; - - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_OFFSET_3DIM); - // TODO : Invoke invokeLambda_ITEM_OFFSET_3DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_OFFSET_3DIM and dimension info - } + *MtxLock = &(Img->mutexLock); +} - // NDItem_1DIM - template - typename std::enable_if<(DIMS == 1) && - (std::is_same>::value)>::type - runIterationSpace(const sycl::range<1> &LocalSize, - const sycl::range<1> &GlobalSize, - const sycl::id<1> &GlobalOffset) { - auto WrappedLambda_NDITEM_1DIM = - makeWrapper_NDITEM_1DIM(MKernel, LocalSize, GlobalSize, GlobalOffset); +/// Implementation for ESIMD_CPU device interface accessing ESIMD +/// intrinsics and LibCM functionalties requred by intrinsics +sycl::detail::ESIMDDeviceInterface::ESIMDDeviceInterface() { + version = ESIMDEmuPluginInterfaceVersion; + reserved = nullptr; - SpaceDim[0] = (uint32_t)LocalSize[0]; + /* From 'esimd_emu_functions_v1.h' : Start */ + cm_barrier_ptr = cm_support::barrier; + cm_sbarrier_ptr = cm_support::split_barrier; + cm_fence_ptr = cm_support::fence; - GroupDim[0] = (uint32_t)(GlobalSize[0] / LocalSize[0]); - - PLACEHOLDER_UNUSED(WrappedLambda_NDITEM_1DIM); - // TODO : Invoke invokeLambda_NDITEM_1DIM through CM's multi-threaded - // kernel launching with WrappedLambda_NDITEM_1DIM and dimension info - } - - // NDItem_2DIM - template - typename std::enable_if<(DIMS == 2) && - (std::is_same>::value)>::type - runIterationSpace(const sycl::range<2> &LocalSize, - const sycl::range<2> &GlobalSize, - const sycl::id<2> &GlobalOffset) { - auto WrappedLambda_NDITEM_2DIM = - makeWrapper_NDITEM_2DIM(MKernel, LocalSize, GlobalSize, GlobalOffset); + sycl_get_surface_base_addr_ptr = cm_support::get_surface_base_addr; + __cm_emu_get_slm_ptr = cm_support::get_slm_base; + cm_slm_init_ptr = cm_support::init_slm; - SpaceDim[0] = (uint32_t)LocalSize[0]; - SpaceDim[1] = (uint32_t)LocalSize[1]; + 
sycl_get_cm_buffer_params_ptr = sycl_get_cm_buffer_params; + sycl_get_cm_image_params_ptr = sycl_get_cm_image_params; + /* From 'esimd_emu_functions_v1.h' : End */ +} - GroupDim[0] = (uint32_t)(GlobalSize[0] / LocalSize[0]); - GroupDim[1] = (uint32_t)(GlobalSize[1] / LocalSize[1]); - - PLACEHOLDER_UNUSED(WrappedLambda_NDITEM_2DIM); - // TODO : Invoke invokeLambda_NDITEM_2DIM through CM's multi-threaded - // kernel launching with WrappedLambda_NDITEM_2DIM and dimension info - } +/// Implementation for Host Kernel Launch used by +/// piEnqueueKernelLaunch - // NDItem_3DIM - template - typename std::enable_if<(DIMS == 3) && - (std::is_same>::value)>::type - runIterationSpace(const sycl::range<3> &LocalSize, - const sycl::range<3> &GlobalSize, - const sycl::id<3> &GlobalOffset) { - auto WrappedLambda_NDITEM_3DIM = - makeWrapper_NDITEM_3DIM(MKernel, LocalSize, GlobalSize, GlobalOffset); +static bool isNull(int NDims, const size_t *R) { + return ((0 == R[0]) && (NDims < 2 || 0 == R[1]) && (NDims < 3 || 0 == R[2])); +} - SpaceDim[0] = (uint32_t)LocalSize[0]; - SpaceDim[1] = (uint32_t)LocalSize[1]; - SpaceDim[2] = (uint32_t)LocalSize[2]; - - GroupDim[0] = (uint32_t)(GlobalSize[0] / LocalSize[0]); - GroupDim[1] = (uint32_t)(GlobalSize[1] / LocalSize[1]); - GroupDim[2] = (uint32_t)(GlobalSize[2] / LocalSize[2]); +// NDims is the number of dimensions in the ND-range. Kernels are +// normalized in the handler so that all kernels take an sycl::nd_item +// as argument (see StoreLambda in CL/sycl/handler.hpp). For kernels +// whose workgroup size (LocalWorkSize) is unspecified, InvokeImpl +// sets LocalWorkSize to {1, 1, 1}, i.e. each workgroup contains just +// one work item. CM emulator will run several workgroups in parallel +// depending on environment settings. 
- PLACEHOLDER_UNUSED(WrappedLambda_NDITEM_3DIM); - // TODO : Invoke invokeLambda_NDITEM_3DIM through CM's multi-threaded - // kernel launching with WrappedLambda_NDITEM_3DIM and dimension info - } -}; +template struct InvokeImpl { -// Intrinsics -sycl::detail::ESIMDDeviceInterface::ESIMDDeviceInterface() { - reserved = nullptr; - version = ESIMDEmuPluginInterfaceVersion; + static sycl::range get_range(const size_t *Array) { + if constexpr (NDims == 1) + return sycl::range{Array[0]}; + else if constexpr (NDims == 2) + return sycl::range{Array[0], Array[1]}; + else if constexpr (NDims == 3) + return sycl::range{Array[0], Array[1], Array[2]}; + } - /// TODO : Fill *_ptr fields with function pointers from CM - /// functions prefixed with 'cm_support' + static void invoke(void *Fptr, const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, + const size_t *LocalWorkSize) { + auto GlobalSize = get_range(GlobalWorkSize); + auto LocalSize = get_range(LocalWorkSize); + sycl::id GlobalOffset = get_range(GlobalWorkOffset); - cm_barrier_ptr = nullptr; /* cm_support::barrier; */ - cm_sbarrier_ptr = nullptr; /* cm_support::split_barrier; */ - cm_fence_ptr = nullptr; /* cm_support::fence; */ + auto KFunc = reinterpret_cast *>(Fptr); + libCMBatch CmThreading(*KFunc); - sycl_get_surface_base_addr_ptr = - nullptr; /* cm_support::get_surface_base_addr; */ - __cm_emu_get_slm_ptr = nullptr; /* cm_support::get_slm_base; */ - cm_slm_init_ptr = nullptr; /* cm_support::init_slm; */ -} + CmThreading.runIterationSpace(LocalSize, GlobalSize, GlobalOffset); + } +}; extern "C" { #define DIE_NO_IMPLEMENTATION \ - std::cerr << "Not Implemented : " << __FUNCTION__ \ - << " - File : " << __FILE__; \ - std::cerr << " / Line : " << __LINE__ << std::endl; \ - die("Terminated") - -#define DIE_NO_SUPPORT \ - std::cerr << "Not Supported : " << __FUNCTION__ << " - File : " << __FILE__; \ - std::cerr << " / Line : " << __LINE__ << std::endl; \ - die("Terminated") + if (PrintPiTrace) { \ + std::cerr 
<< "Not Implemented : " << __FUNCTION__ \ + << " - File : " << __FILE__; \ + std::cerr << " / Line : " << __LINE__ << std::endl; \ + } \ + return PI_INVALID_OPERATION; #define CONTINUE_NO_IMPLEMENTATION \ - std::cerr << "Warning : Not Implemented : " << __FUNCTION__ \ - << " - File : " << __FILE__; \ - std::cerr << " / Line : " << __LINE__ << std::endl; + if (PrintPiTrace) { \ + std::cerr << "Warning : Not Implemented : " << __FUNCTION__ \ + << " - File : " << __FILE__; \ + std::cerr << " / Line : " << __LINE__ << std::endl; \ + } pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, pi_uint32 *NumPlatforms) { - (void)NumEntries; - (void)Platforms; - (void)NumPlatforms; - DIE_NO_IMPLEMENTATION; + + static const char *PiTrace = std::getenv("SYCL_PI_TRACE"); + static const int PiTraceValue = PiTrace ? std::stoi(PiTrace) : 0; + if (PiTraceValue == -1) { // Means print all PI traces + PrintPiTrace = true; + } + + if (NumEntries == 0 && Platforms != nullptr) { + return PI_INVALID_VALUE; + } + if (Platforms == nullptr && NumPlatforms == nullptr) { + return PI_INVALID_VALUE; + } + + if (Platforms && NumEntries > 0) { + *Platforms = new _pi_platform(); + Platforms[0]->CmEmuVersion = std::string("0.0.1"); + } + + if (NumPlatforms) { + *NumPlatforms = 1; + } + return PI_SUCCESS; } pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Platform; - (void)ParamName; - (void)ParamValueSize; - (void)ParamValue; - (void)ParamValueSizeRet; - DIE_NO_IMPLEMENTATION; + if (Platform == nullptr) { + return PI_INVALID_PLATFORM; + } + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + + switch (ParamName) { + case PI_PLATFORM_INFO_NAME: + return ReturnValue("Intel(R) ESIMD_CPU/GPU"); + + case PI_PLATFORM_INFO_VENDOR: + return ReturnValue("Intel(R) Corporation"); + + case PI_PLATFORM_INFO_VERSION: + return ReturnValue(Platform->CmEmuVersion); + + 
case PI_PLATFORM_INFO_PROFILE: + return ReturnValue("FULL_PROFILE"); + + case PI_PLATFORM_INFO_EXTENSIONS: + return ReturnValue(""); + + default: + // TODO: implement other parameters + die("Unsupported ParamName in piPlatformGetInfo"); + } + return PI_SUCCESS; } pi_result piextPlatformGetNativeHandle(pi_platform, pi_native_handle *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextPlatformCreateWithNativeHandle(pi_native_handle, pi_platform *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, pi_uint32 NumEntries, pi_device *Devices, pi_uint32 *NumDevices) { - (void)Platform; - (void)DeviceType; - (void)NumEntries; - (void)Devices; - (void)NumDevices; - DIE_NO_IMPLEMENTATION; + if (Platform == nullptr) { + return PI_INVALID_PLATFORM; + } + + // CM has single-root-device without sub-device support. + if (NumDevices) { + *NumDevices = 1; + } + + cm_support::CmDevice *CmDevice = nullptr; + // TODO FIXME Implement proper version checking and reporting: + // - version passed to cm_support::CreateCmDevice + // - CmEmuVersion + // - PluginVersion + // - ESIMDEmuPluginOpaqueData::version + // + // PI_DEVICE_INFO_DRIVER_VERSION could report the ESIMDDeviceInterface + // version, PI_PLATFORM_INFO_VERSION - the underlying libCM library version. + unsigned int Version = 0; + + int Result = cm_support::CreateCmDevice(CmDevice, Version); + + if (Result != cm_support::CM_SUCCESS) { + return PI_INVALID_DEVICE; + } + + // FIXME / TODO : piDevicesGet always must return same pointer for + // 'Devices[0]' from cached entry. 
Reference : level-zero + // platform/device implementation with PiDevicesCache and + // PiDevicesCache + if (Devices) { + Devices[0] = new _pi_device(Platform, CmDevice); + } + return PI_SUCCESS; } pi_result piDeviceRetain(pi_device Device) { - (void)Device; - DIE_NO_IMPLEMENTATION; + if (Device == nullptr) { + return PI_INVALID_DEVICE; + } + + // CM supports only single device, which is root-device. 'Retain' is + // No-op. return PI_SUCCESS; } -pi_result piDeviceRelease(pi_device) { - DIE_NO_IMPLEMENTATION; +pi_result piDeviceRelease(pi_device Device) { + if (Device == nullptr) { + return PI_INVALID_DEVICE; + } + + // CM supports only single device, which is root-device. 'Release' + // is No-op. return PI_SUCCESS; } pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Device; - (void)ParamName; - (void)ParamValueSize; - (void)ParamValue; - (void)ParamValueSizeRet; - DIE_NO_IMPLEMENTATION; + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + + switch (ParamName) { + case PI_DEVICE_INFO_TYPE: + return ReturnValue(PI_DEVICE_TYPE_GPU); + case PI_DEVICE_INFO_PARENT_DEVICE: + return ReturnValue(pi_device{0}); + case PI_DEVICE_INFO_PLATFORM: + return ReturnValue(Device->Platform); + case PI_DEVICE_INFO_NAME: + return ReturnValue("ESIMD_CPU"); + case PI_DEVICE_INFO_IMAGE_SUPPORT: + return ReturnValue(pi_bool{true}); + case PI_DEVICE_INFO_DRIVER_VERSION: + return ReturnValue("0.0.1"); + case PI_DEVICE_INFO_VENDOR: + return ReturnValue("Intel(R) Corporation"); + case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH: + return ReturnValue(size_t{8192}); + case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: + return ReturnValue(size_t{8192}); + case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY: + return ReturnValue(pi_bool{1}); + +#define UNSUPPORTED_INFO(info) \ + case info: \ + std::cerr << std::endl \ + << "Unsupported device info = " << #info << " from ESIMD_CPU" \ + << std::endl; \ + 
DIE_NO_IMPLEMENTATION; \ + break; + + UNSUPPORTED_INFO(PI_DEVICE_INFO_VENDOR_ID) + UNSUPPORTED_INFO(PI_DEVICE_INFO_EXTENSIONS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_COMPILER_AVAILABLE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_LINKER_AVAILABLE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_COMPUTE_UNITS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY) + UNSUPPORTED_INFO(PI_DEVICE_INFO_ADDRESS_BITS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_GLOBAL_MEM_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_LOCAL_MEM_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_AVAILABLE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_VERSION) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES) + UNSUPPORTED_INFO(PI_DEVICE_INFO_REFERENCE_COUNT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PARTITION_PROPERTIES) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PARTITION_TYPE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_OPENCL_C_VERSION) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PRINTF_BUFFER_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PROFILE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_BUILT_IN_KERNELS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_QUEUE_PROPERTIES) + UNSUPPORTED_INFO(PI_DEVICE_INFO_EXECUTION_CAPABILITIES) + UNSUPPORTED_INFO(PI_DEVICE_INFO_ENDIAN_LITTLE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION) + UNSUPPORTED_INFO(PI_DEVICE_INFO_LOCAL_MEM_TYPE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_CONSTANT_ARGS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE) + 
UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_PARAMETER_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_SAMPLERS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_SINGLE_FP_CONFIG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_HALF_FP_CONFIG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_DOUBLE_FP_CONFIG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IL_VERSION) + UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_HOST_SUPPORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_DEVICE_SUPPORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT) + 
UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT) + +#undef UNSUPPORTED_INFO + default: + DIE_NO_IMPLEMENTATION; + } return PI_SUCCESS; } pi_result piDevicePartition(pi_device, const pi_device_partition_property *, pi_uint32, pi_device *, pi_uint32 *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextDeviceGetNativeHandle(pi_device, pi_native_handle *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextDeviceCreateWithNativeHandle(pi_native_handle, pi_platform, pi_device *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piContextCreate(const pi_context_properties *Properties, @@ -593,149 +616,346 @@ pi_result piContextCreate(const pi_context_properties *Properties, const void *PrivateInfo, size_t CB, void *UserData), void *UserData, pi_context *RetContext) { - (void)Properties; - (void)NumDevices; - (void)Devices; - (void)PFnNotify; - (void)UserData; - (void)RetContext; - DIE_NO_IMPLEMENTATION; + if (NumDevices != 1) { + return PI_INVALID_VALUE; + } + if (Devices == nullptr) { + return PI_INVALID_DEVICE; + } + if (RetContext == nullptr) { + return PI_INVALID_VALUE; + } + + try { + /// Single-root-device + *RetContext = new _pi_context(Devices[0]); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; + } return PI_SUCCESS; } pi_result piContextGetInfo(pi_context, pi_context_info, size_t, void *, size_t *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextContextSetExtendedDeleter(pi_context, pi_context_extended_deleter, void *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextContextGetNativeHandle(pi_context, pi_native_handle *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextContextCreateWithNativeHandle(pi_native_handle, pi_uint32, const pi_device *, bool, pi_context *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } -pi_result piContextRetain(pi_context) { - DIE_NO_IMPLEMENTATION; +pi_result piContextRetain(pi_context Context) { + if (Context == nullptr) { + return PI_INVALID_CONTEXT; + } + + ++(Context->RefCount); + return PI_SUCCESS; } pi_result piContextRelease(pi_context Context) { - (void)Context; - DIE_NO_IMPLEMENTATION; + if (Context == nullptr || (Context->RefCount <= 0)) { + return PI_INVALID_CONTEXT; + } + + if (--(Context->RefCount) == 0) { + for (auto &Entry : Context->Addr2CmBufferSVM) { + Context->Device->CmDevicePtr->DestroyBufferSVM(Entry.second); + } + delete Context; + } + return PI_SUCCESS; } pi_result piQueueCreate(pi_context Context, pi_device Device, pi_queue_properties Properties, pi_queue *Queue) { - (void)Context; - (void)Device; - (void)Properties; - (void)Queue; - DIE_NO_IMPLEMENTATION; + cm_support::CmQueue *CmQueue; + + int Result = Context->Device->CmDevicePtr->CreateQueue(CmQueue); + if (Result != cm_support::CM_SUCCESS) { + return PI_INVALID_CONTEXT; + } + + try { + *Queue = new _pi_queue(Context, CmQueue); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; + } + return PI_SUCCESS; } pi_result piQueueGetInfo(pi_queue, pi_queue_info, size_t, void *, size_t *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } -pi_result piQueueRetain(pi_queue) { - DIE_NO_IMPLEMENTATION; +pi_result piQueueRetain(pi_queue Queue) { + if (Queue == nullptr) { + return PI_INVALID_QUEUE; + } + ++(Queue->RefCount); return PI_SUCCESS; } pi_result piQueueRelease(pi_queue Queue) { - (void)Queue; - DIE_NO_IMPLEMENTATION; + if ((Queue == nullptr) || (Queue->CmQueuePtr == nullptr)) { + return PI_INVALID_QUEUE; + } + + if (--(Queue->RefCount) == 0) { + // CM's 'DestoryQueue' is no-op + // Queue->Context->Device->CmDevicePTr->DestroyQueue(Queue->CmQueuePtr); + delete Queue; + } + return PI_SUCCESS; } pi_result piQueueFinish(pi_queue) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextQueueGetNativeHandle(pi_queue, pi_native_handle *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextQueueCreateWithNativeHandle(pi_native_handle, pi_context, pi_queue *, bool) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, void *HostPtr, pi_mem *RetMem, const pi_mem_properties *properties) { - (void)Context; - (void)Flags; - (void)Size; - (void)HostPtr; - (void)RetMem; - (void)properties; - DIE_NO_IMPLEMENTATION; + if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { + if (PrintPiTrace) { + std::cerr << "Invalid memory attribute for piMemBufferCreate"; + } + return PI_INVALID_OPERATION; + } + + if (Context == nullptr) { + return PI_INVALID_CONTEXT; + } + if (RetMem == nullptr) { + return PI_INVALID_VALUE; + } + + cm_support::CmBuffer *CmBuf = nullptr; + cm_support::SurfaceIndex *CmIndex; + + int Status = Context->Device->CmDevicePtr->CreateBuffer( + static_cast(Size), CmBuf); + + if (Status != cm_support::CM_SUCCESS) { + return PI_OUT_OF_HOST_MEMORY; + } + + Status = CmBuf->GetIndex(CmIndex); + + // Initialize the buffer with user data provided 
with 'HostPtr' + if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0) { + if (HostPtr != nullptr) { + Status = + CmBuf->WriteSurface(reinterpret_cast(HostPtr), + nullptr, static_cast(Size)); + } + } + + auto HostPtrOrNull = + (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) ? nullptr : pi_cast(HostPtr); + + try { + *RetMem = + new _pi_buffer(Context, HostPtrOrNull, CmBuf, + /* integer buffer index */ CmIndex->get_data(), Size); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } + return PI_SUCCESS; } pi_result piMemGetInfo(pi_mem, cl_mem_info, size_t, void *, size_t *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } -pi_result piMemRetain(pi_mem) { - DIE_NO_IMPLEMENTATION; +pi_result piMemRetain(pi_mem Mem) { + if (Mem == nullptr) { + return PI_INVALID_MEM_OBJECT; + } + ++(Mem->RefCount); return PI_SUCCESS; } pi_result piMemRelease(pi_mem Mem) { - (void)Mem; - DIE_NO_IMPLEMENTATION; + if ((Mem == nullptr) || (Mem->RefCount == 0)) { + return PI_INVALID_MEM_OBJECT; + } + + if (--(Mem->RefCount) == 0) { + if (Mem->getMemType() == PI_MEM_TYPE_BUFFER) { + _pi_buffer *PiBuf = static_cast<_pi_buffer *>(Mem); + // TODO implement libCM API failure logging mechanism, so that these + // failures are clearly distinguishable from other EMU plugin failures. 
+ int Result = + Mem->Context->Device->CmDevicePtr->DestroySurface(PiBuf->CmBufferPtr); + + if (Result != cm_support::CM_SUCCESS) { + return PI_INVALID_MEM_OBJECT; + } + } else if (Mem->getMemType() == PI_MEM_TYPE_IMAGE2D) { + _pi_image *PiImg = static_cast<_pi_image *>(Mem); + int Result = Mem->Context->Device->CmDevicePtr->DestroySurface( + PiImg->CmSurfacePtr); + if (Result != cm_support::CM_SUCCESS) { + return PI_INVALID_MEM_OBJECT; + } + } else { + return PI_INVALID_MEM_OBJECT; + } + + delete Mem; + } + return PI_SUCCESS; } +cm_support::CM_SURFACE_FORMAT +ConvertPiImageFormatToCmFormat(const pi_image_format *PiFormat) { + using ULongPair = std::pair; + using FmtMap = std::map; + static const FmtMap pi2cm = { + {{PI_IMAGE_CHANNEL_TYPE_UNORM_INT8, PI_IMAGE_CHANNEL_ORDER_RGBA}, + cm_support::CM_SURFACE_FORMAT_A8R8G8B8}, + + {{PI_IMAGE_CHANNEL_TYPE_UNORM_INT8, PI_IMAGE_CHANNEL_ORDER_ARGB}, + cm_support::CM_SURFACE_FORMAT_A8R8G8B8}, + + {{PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, PI_IMAGE_CHANNEL_ORDER_RGBA}, + cm_support::CM_SURFACE_FORMAT_A8R8G8B8}, + + {{PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, PI_IMAGE_CHANNEL_ORDER_RGBA}, + cm_support::CM_SURFACE_FORMAT_R32G32B32A32F}, + }; + auto Result = pi2cm.find( + {PiFormat->image_channel_data_type, PiFormat->image_channel_order}); + if (Result != pi2cm.end()) { + return Result->second; + } + return cm_support::CM_SURFACE_FORMAT_UNKNOWN; +} + pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, void *HostPtr, pi_mem *RetImage) { - (void)Context; - (void)Flags; - (void)ImageFormat; - (void)ImageDesc; - (void)HostPtr; - (void)RetImage; - DIE_NO_IMPLEMENTATION; + if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { + if (PrintPiTrace) { + std::cerr << "Invalid memory attribute for piMemImageCreate"; + } + return PI_INVALID_OPERATION; + } + + if (ImageFormat == nullptr || ImageDesc == nullptr) + return PI_INVALID_IMAGE_FORMAT_DESCRIPTOR; + + switch 
(ImageDesc->image_type) { + case PI_MEM_TYPE_IMAGE2D: + break; + default: + return PI_INVALID_MEM_OBJECT; + } + + auto BytesPerPixel = 4; + switch (ImageFormat->image_channel_data_type) { + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: + BytesPerPixel = 16; + break; + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: + case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8: + BytesPerPixel = 4; + break; + default: + return PI_IMAGE_FORMAT_NOT_SUPPORTED; + } + + cm_support::CmSurface2D *CmSurface = nullptr; + cm_support::SurfaceIndex *CmIndex; + cm_support::CM_SURFACE_FORMAT CmSurfFormat = + ConvertPiImageFormatToCmFormat(ImageFormat); + + if (CmSurfFormat == cm_support::CM_SURFACE_FORMAT_UNKNOWN) { + return PI_IMAGE_FORMAT_NOT_SUPPORTED; + } + + int Status = Context->Device->CmDevicePtr->CreateSurface2D( + static_cast(ImageDesc->image_width), + static_cast(ImageDesc->image_height), CmSurfFormat, + CmSurface); + + if (Status != cm_support::CM_SUCCESS) { + return PI_OUT_OF_HOST_MEMORY; + } + + Status = CmSurface->GetIndex(CmIndex); + + // Initialize the buffer with user data provided with 'HostPtr' + if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0) { + if (HostPtr != nullptr) { + Status = CmSurface->WriteSurface( + reinterpret_cast(HostPtr), nullptr, + static_cast(ImageDesc->image_width * + ImageDesc->image_height * BytesPerPixel)); + } + } + + auto HostPtrOrNull = + (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) ? nullptr : pi_cast(HostPtr); + + try { + *RetImage = new _pi_image(Context, HostPtrOrNull, CmSurface, + /* integer surface index */ CmIndex->get_data(), + ImageDesc->image_width, ImageDesc->image_height, + BytesPerPixel); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; + } + return PI_SUCCESS; } pi_result piextMemGetNativeHandle(pi_mem, pi_native_handle *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextMemCreateWithNativeHandle(pi_native_handle, pi_mem *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piProgramCreate(pi_context, const void *, size_t, pi_program *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piProgramCreateWithBinary(pi_context, pi_uint32, const pi_device *, @@ -743,33 +963,28 @@ pi_result piProgramCreateWithBinary(pi_context, pi_uint32, const pi_device *, size_t, const pi_device_binary_property *, pi_int32 *, pi_program *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piclProgramCreateWithBinary(pi_context, pi_uint32, const pi_device *, const size_t *, const unsigned char **, pi_int32 *, pi_program *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piclProgramCreateWithSource(pi_context, pi_uint32, const char **, const size_t *, pi_program *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piProgramGetInfo(pi_program, pi_program_info, size_t, void *, size_t *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piProgramLink(pi_context, pi_uint32, const pi_device *, const char *, pi_uint32, const pi_program *, void (*)(pi_program, void *), void *, pi_program *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piProgramCompile(pi_program, pi_uint32, const pi_device *, @@ -777,186 +992,185 @@ pi_result piProgramCompile(pi_program, pi_uint32, const pi_device *, const char **, void (*)(pi_program, void *), void *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piProgramBuild(pi_program, pi_uint32, const pi_device *, const char *, void (*)(pi_program, void *), void *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piProgramGetBuildInfo(pi_program, pi_device, cl_program_build_info, size_t, void *, size_t *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result 
piProgramRetain(pi_program) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piProgramRelease(pi_program) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextProgramGetNativeHandle(pi_program, pi_native_handle *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextProgramCreateWithNativeHandle(pi_native_handle, pi_context, pi_program *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piKernelCreate(pi_program, const char *, pi_kernel *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piKernelSetArg(pi_kernel, pi_uint32, size_t, const void *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextKernelSetArgMemObj(pi_kernel, pi_uint32, const pi_mem *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } // Special version of piKernelSetArg to accept pi_sampler. pi_result piextKernelSetArgSampler(pi_kernel, pi_uint32, const pi_sampler *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piKernelGetInfo(pi_kernel, pi_kernel_info, size_t, void *, size_t *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piKernelGetGroupInfo(pi_kernel, pi_device, pi_kernel_group_info, size_t, void *, size_t *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } -pi_result -piKernelGetSubGroupInfo(pi_kernel, pi_device, - pi_kernel_sub_group_info, // TODO: untie from OpenCL - size_t, const void *, size_t, void *, size_t *) { +pi_result piKernelGetSubGroupInfo(pi_kernel, pi_device, + pi_kernel_sub_group_info, size_t, + const void *, size_t, void *, size_t *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piKernelRetain(pi_kernel) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piKernelRelease(pi_kernel) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEventCreate(pi_context, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEventGetInfo(pi_event, pi_event_info, size_t, void *, size_t *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result 
piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Event; - (void)ParamName; - (void)ParamValueSize; - (void)ParamValue; - (void)ParamValueSizeRet; - DIE_NO_IMPLEMENTATION; + if (PrintPiTrace) { + std::cerr << "Warning : Profiling Not supported under PI_ESIMD_CPU" + << std::endl; + } return PI_SUCCESS; } pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { - (void)NumEvents; - (void)EventList; - DIE_NO_IMPLEMENTATION; + for (int i = 0; i < (int)NumEvents; i++) { + if (EventList[i]->IsDummyEvent) { + // Dummy event is already completed ones done by CM. Skip + // waiting. + continue; + } + if (EventList[i]->CmEventPtr == nullptr) { + return PI_INVALID_EVENT; + } + int Result = EventList[i]->CmEventPtr->WaitForTaskFinished(); + if (Result != cm_support::CM_SUCCESS) { + return PI_OUT_OF_RESOURCES; + } + } return PI_SUCCESS; } pi_result piEventSetCallback(pi_event, pi_int32, void (*)(pi_event, pi_int32, void *), void *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEventSetStatus(pi_event, pi_int32) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } -pi_result piEventRetain(pi_event) { - DIE_NO_IMPLEMENTATION; +pi_result piEventRetain(pi_event Event) { + if (Event == nullptr) { + return PI_INVALID_EVENT; + } + + ++(Event->RefCount); + return PI_SUCCESS; } pi_result piEventRelease(pi_event Event) { - (void)Event; - DIE_NO_IMPLEMENTATION; + if (Event == nullptr || (Event->RefCount <= 0)) { + return PI_INVALID_EVENT; + } + + if (--(Event->RefCount) == 0) { + if (!Event->IsDummyEvent) { + if ((Event->CmEventPtr == nullptr) || (Event->OwnerQueue == nullptr)) { + return PI_INVALID_EVENT; + } + int Result = Event->OwnerQueue->DestroyEvent(Event->CmEventPtr); + if (Result != cm_support::CM_SUCCESS) { + return PI_INVALID_EVENT; + } + } + delete Event; + } + return PI_SUCCESS; } pi_result piextEventGetNativeHandle(pi_event, pi_native_handle *) { 
DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextEventCreateWithNativeHandle(pi_native_handle, pi_context, bool, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } - pi_result piSamplerCreate(pi_context, const pi_sampler_properties *, pi_sampler *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piSamplerGetInfo(pi_sampler, pi_sampler_info, size_t, void *, size_t *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piSamplerRetain(pi_sampler) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piSamplerRelease(pi_sampler) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEnqueueEventsWait(pi_queue, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEnqueueEventsWaitWithBarrier(pi_queue, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src, @@ -965,16 +1179,41 @@ pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - (void)Queue; - (void)Src; - (void)BlockingRead; - (void)Offset; - (void)Size; - (void)Dst; - (void)NumEventsInWaitList; - (void)EventWaitList; - (void)Event; - DIE_NO_IMPLEMENTATION; + /// TODO : Support Blocked read, 'Queue' handling + if (BlockingRead) { + assert(false && + "ESIMD_CPU support for blocking piEnqueueMemBufferRead is NYI"); + } + if (NumEventsInWaitList != 0) { + return PI_INVALID_EVENT_WAIT_LIST; + } + + _pi_buffer *buf = static_cast<_pi_buffer *>(Src); + + int Status = + buf->CmBufferPtr->ReadSurface(reinterpret_cast(Dst), + nullptr, // event + static_cast(Size)); + + if (Status != cm_support::CM_SUCCESS) { + return PI_INVALID_MEM_OBJECT; + } + + if (Event) { + try { + *Event = new _pi_event(); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; + } + + // At this point, CM already completed buffer-read (ReadSurface) + // operation. Therefore, 'event' corresponding to this operation + // is marked as dummy one and ignored during events-waiting. + (*Event)->IsDummyEvent = true; + } + return PI_SUCCESS; } @@ -984,14 +1223,12 @@ pi_result piEnqueueMemBufferReadRect(pi_queue, pi_mem, pi_bool, size_t, size_t, void *, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEnqueueMemBufferWrite(pi_queue, pi_mem, pi_bool, size_t, size_t, const void *, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEnqueueMemBufferWriteRect(pi_queue, pi_mem, pi_bool, @@ -1000,14 +1237,12 @@ pi_result piEnqueueMemBufferWriteRect(pi_queue, pi_mem, pi_bool, size_t, size_t, const void *, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEnqueueMemBufferCopy(pi_queue, pi_mem, pi_mem, size_t, size_t, size_t, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEnqueueMemBufferCopyRect(pi_queue, pi_mem, pi_mem, @@ -1016,53 +1251,63 @@ pi_result piEnqueueMemBufferCopyRect(pi_queue, pi_mem, pi_mem, size_t, size_t, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEnqueueMemBufferFill(pi_queue, pi_mem, const void *, size_t, size_t, size_t, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEnqueueMemBufferMap(pi_queue, pi_mem, pi_bool, pi_map_flags, size_t, size_t, pi_uint32, const pi_event *, pi_event *, void **) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEnqueueMemUnmap(pi_queue, pi_mem, void *, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piMemImageGetInfo(pi_mem, pi_image_info, size_t, void *, size_t *) { DIE_NO_IMPLEMENTATION; - return 
PI_SUCCESS; } -pi_result piEnqueueMemImageRead(pi_queue command_queue, pi_mem image, - pi_bool blocking_read, pi_image_offset origin, - pi_image_region region, size_t row_pitch, - size_t slice_pitch, void *ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - (void)command_queue; - (void)image; - (void)blocking_read; - (void)origin; - (void)region; - (void)row_pitch; - (void)slice_pitch; - (void)ptr; - (void)num_events_in_wait_list; - (void)event_wait_list; - (void)event; - DIE_NO_IMPLEMENTATION; +pi_result piEnqueueMemImageRead(pi_queue CommandQueue, pi_mem Image, + pi_bool BlockingRead, pi_image_offset Origin, + pi_image_region Region, size_t RowPitch, + size_t SlicePitch, void *Ptr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + /// TODO : Support Blocked read, 'Queue' handling + if (BlockingRead) { + assert(false && "ESIMD_CPU does not support Blocking Read"); + } + _pi_image *PiImg = static_cast<_pi_image *>(Image); + int Status = + PiImg->CmSurfacePtr->ReadSurface(reinterpret_cast<unsigned char *>(Ptr), + nullptr, // event + RowPitch * (Region->height)); + if (Status != cm_support::CM_SUCCESS) { + return PI_INVALID_MEM_OBJECT; + } + + if (Event) { + try { + *Event = new _pi_event(); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } + + // At this point, CM already completed image-read (ReadSurface) + // operation. Therefore, 'event' corresponding to this operation + // is marked as dummy one and ignored during events-waiting.
+ (*Event)->IsDummyEvent = true; + } return PI_SUCCESS; } @@ -1070,27 +1315,23 @@ pi_result piEnqueueMemImageWrite(pi_queue, pi_mem, pi_bool, pi_image_offset, pi_image_region, size_t, size_t, const void *, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEnqueueMemImageCopy(pi_queue, pi_mem, pi_mem, pi_image_offset, pi_image_offset, pi_image_region, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEnqueueMemImageFill(pi_queue, pi_mem, const void *, const size_t *, const size_t *, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piMemBufferPartition(pi_mem, pi_mem_flags, pi_buffer_create_type, void *, pi_mem *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result @@ -1099,132 +1340,176 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - (void)Queue; - (void)Kernel; - (void)WorkDim; - (void)GlobalWorkOffset; - (void)GlobalWorkSize; - (void)LocalWorkSize; - (void)NumEventsInWaitList; - (void)EventWaitList; - (void)Event; - DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; + const size_t LocalWorkSz[] = {1, 1, 1}; + + if (Kernel == nullptr) { + return PI_INVALID_KERNEL; + } + + if ((WorkDim > 3) || (WorkDim == 0)) { + return PI_INVALID_WORK_GROUP_SIZE; + } + + if (isNull(WorkDim, LocalWorkSize)) { + LocalWorkSize = LocalWorkSz; + } + + for (pi_uint32 I = 0; I < WorkDim; I++) { + if ((GlobalWorkSize[I] % LocalWorkSize[I]) != 0) { + return PI_INVALID_WORK_GROUP_SIZE; + } + } + + switch (WorkDim) { + case 1: + InvokeImpl<1>::invoke(Kernel, GlobalWorkOffset, GlobalWorkSize, + LocalWorkSize); + return PI_SUCCESS; + + case 2: + InvokeImpl<2>::invoke(Kernel, GlobalWorkOffset, GlobalWorkSize, + LocalWorkSize); + return PI_SUCCESS; + + case 3: + 
InvokeImpl<3>::invoke(Kernel, GlobalWorkOffset, GlobalWorkSize, + LocalWorkSize); + return PI_SUCCESS; + + default: + DIE_NO_IMPLEMENTATION; + } } pi_result piextKernelCreateWithNativeHandle(pi_native_handle, pi_context, bool, pi_kernel *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextKernelGetNativeHandle(pi_kernel, pi_native_handle *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t, pi_uint32, const pi_mem *, const void **, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextGetDeviceFunctionPointer(pi_device, pi_program, const char *, pi_uint64 *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextUSMHostAlloc(void **, pi_context, pi_usm_mem_properties *, size_t, pi_uint32) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextUSMDeviceAlloc(void **, pi_context, pi_device, pi_usm_mem_properties *, size_t, pi_uint32) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, pi_uint32 Alignment) { - (void)ResultPtr; - (void)Context; - (void)Device; - (void)Properties; - (void)Size; - (void)Alignment; - DIE_NO_IMPLEMENTATION; + if (Context == nullptr || (Device != Context->Device)) { + return PI_INVALID_CONTEXT; + } + + if (ResultPtr == nullptr) { + return PI_INVALID_OPERATION; + } + + cm_support::CmBufferSVM *Buf = nullptr; + void *SystemMemPtr = nullptr; + int32_t Result = Context->Device->CmDevicePtr->CreateBufferSVM( + Size, SystemMemPtr, CM_SVM_ACCESS_FLAG_DEFAULT, Buf); + + if (Result != cm_support::CM_SUCCESS) { + return PI_OUT_OF_HOST_MEMORY; + } + *ResultPtr = SystemMemPtr; + auto Iter = Context->Addr2CmBufferSVM.find(SystemMemPtr); + if (Context->Addr2CmBufferSVM.end() != Iter) { + return PI_INVALID_MEM_OBJECT; + } + Context->Addr2CmBufferSVM[SystemMemPtr] = Buf; return 
PI_SUCCESS; } pi_result piextUSMFree(pi_context Context, void *Ptr) { - (void)Context; - (void)Ptr; - DIE_NO_IMPLEMENTATION; + if (Context == nullptr) { + return PI_INVALID_CONTEXT; + } + if (Ptr == nullptr) { + return PI_INVALID_OPERATION; + } + + cm_support::CmBufferSVM *Buf = Context->Addr2CmBufferSVM[Ptr]; + if (Buf == nullptr) { + return PI_INVALID_MEM_OBJECT; + } + auto Count = Context->Addr2CmBufferSVM.erase(Ptr); + if (Count != 1) { + return PI_INVALID_MEM_OBJECT; + } + int32_t Result = Context->Device->CmDevicePtr->DestroyBufferSVM(Buf); + if (cm_support::CM_SUCCESS != Result) { + return PI_ERROR_UNKNOWN; + } return PI_SUCCESS; } pi_result piextKernelSetArgPointer(pi_kernel, pi_uint32, size_t, const void *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextUSMEnqueueMemset(pi_queue, void *, pi_int32, size_t, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextUSMEnqueueMemcpy(pi_queue, pi_bool, void *, const void *, size_t, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextUSMEnqueueMemAdvise(pi_queue, const void *, size_t, pi_mem_advice, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextUSMGetMemAllocInfo(pi_context, const void *, pi_mem_info, size_t, void *, size_t *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piKernelSetExecInfo(pi_kernel, pi_kernel_exec_info, size_t, const void *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextProgramSetSpecializationConstant(pi_program, pi_uint32, size_t, const void *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextDeviceSelectBinary(pi_device, pi_device_binary *, pi_uint32, pi_uint32 *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } pi_result piextUSMEnqueuePrefetch(pi_queue, const void *, size_t, pi_usm_migration_flags, pi_uint32, const pi_event *, pi_event *) { DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; } -pi_result 
piextPluginGetOpaqueData(void *, void **opaque_data_return) { - *opaque_data_return = reinterpret_cast<void *>(PiESimdDeviceAccess); +pi_result piextPluginGetOpaqueData(void *, void **OpaqueDataReturn) { + *OpaqueDataReturn = reinterpret_cast<void *>(PiESimdDeviceAccess); return PI_SUCCESS; } @@ -1236,9 +1521,14 @@ pi_result piTearDown(void *) { } pi_result piPluginInit(pi_plugin *PluginInit) { - assert(PluginInit); + if (PluginInit == nullptr) { + return PI_INVALID_VALUE; + } + size_t PluginVersionSize = sizeof(PluginInit->PluginVersion); - assert(strlen(_PI_H_VERSION_STRING) < PluginVersionSize); + if (strlen(_PI_H_VERSION_STRING) >= PluginVersionSize) { + return PI_INVALID_VALUE; + } strncpy(PluginInit->PluginVersion, _PI_H_VERSION_STRING, PluginVersionSize); PiESimdDeviceAccess = new sycl::detail::ESIMDEmuPluginOpaqueData(); diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.hpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.hpp index 7f7df4d2fe71f..099794e2c56d3 100755 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.hpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.hpp @@ -23,6 +23,10 @@ #include +namespace cm_support { +#include <cm_rt.h> +} // namespace cm_support + template <typename To, typename From> To pi_cast(From Value) { // TODO: see if more sanity checks are possible. assert(sizeof(From) == sizeof(To)); @@ -43,4 +47,117 @@ template <> uint32_t pi_cast(uint64_t Value) { std::terminate(); } +// Base class to store common data +struct _pi_object { + _pi_object() : RefCount{1} {} + + std::atomic<pi_uint32> RefCount; +}; +struct _pi_platform { + _pi_platform() = default; + + // Keep Version information.
+ std::string CmEmuVersion; +}; + +struct _pi_device : _pi_object { + _pi_device(pi_platform ArgPlt, cm_support::CmDevice *ArgCmDev) + : Platform{ArgPlt}, CmDevicePtr{ArgCmDev} {} + + pi_platform Platform; + cm_support::CmDevice *CmDevicePtr = nullptr; +}; + +struct _pi_context : _pi_object { + _pi_context(pi_device ArgDevice) : Device{ArgDevice} {} + + /// One-to-one mapping between Context and Device + pi_device Device; + + /// Map SVM memory starting address to corresponding + /// CmBufferSVM object. CmBufferSVM object is needed to release memory. + std::unordered_map<void *, cm_support::CmBufferSVM *> Addr2CmBufferSVM; +}; + +struct _pi_queue : _pi_object { + _pi_queue(pi_context ContextArg, cm_support::CmQueue *CmQueueArg) + : Context{ContextArg}, CmQueuePtr{CmQueueArg} {} + + // Keeps the PI context to which this queue belongs. + pi_context Context = nullptr; + cm_support::CmQueue *CmQueuePtr = nullptr; +}; + +struct _pi_mem : _pi_object { + _pi_mem() = default; + + pi_context Context; + + char *MapHostPtr = nullptr; + + // Mutex for load/store accessing + std::mutex mutexLock; + + // Surface index used by CM + int SurfaceIndex; + + virtual ~_pi_mem() = default; + + _pi_mem_type getMemType() const { return MemType; }; + +protected: + _pi_mem(pi_context ctxt, char *HostPtr, _pi_mem_type MemTypeArg, + int SurfaceIdxArg) + : Context{ctxt}, MapHostPtr{HostPtr}, + SurfaceIndex{SurfaceIdxArg}, MemType{MemTypeArg} {} + +private: + _pi_mem_type MemType; +}; + +struct _pi_buffer final : _pi_mem { + // Buffer/Sub-buffer constructor + _pi_buffer(pi_context ctxt, char *HostPtr, cm_support::CmBuffer *CmBufArg, + int SurfaceIdxArg, size_t SizeArg) + : _pi_mem(ctxt, HostPtr, PI_MEM_TYPE_BUFFER, SurfaceIdxArg), + CmBufferPtr{CmBufArg}, Size{SizeArg} {} + + cm_support::CmBuffer *CmBufferPtr; + size_t Size; +}; + +struct _pi_image final : _pi_mem { + // Image constructor + _pi_image(pi_context ctxt, char *HostPtr, cm_support::CmSurface2D *CmSurfArg, + int SurfaceIdxArg, size_t WidthArg, size_t HeightArg, size_t
BPPArg) + : _pi_mem(ctxt, HostPtr, PI_MEM_TYPE_IMAGE2D, SurfaceIdxArg), + CmSurfacePtr{CmSurfArg}, Width{WidthArg}, Height{HeightArg}, + BytesPerPixel{BPPArg} {} + + cm_support::CmSurface2D *CmSurfacePtr; + size_t Width; + size_t Height; + size_t BytesPerPixel; +}; + +struct _pi_event : _pi_object { + _pi_event() {} + + cm_support::CmEvent *CmEventPtr = nullptr; + cm_support::CmQueue *OwnerQueue = nullptr; + pi_context Context = nullptr; + bool IsDummyEvent = false; +}; + +struct _pi_program : _pi_object { + _pi_program() {} + + // Keep the context of the program. + pi_context Context; +}; + +struct _pi_kernel : _pi_object { + _pi_kernel() {} +}; + #include