diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 37d00e130ac1e..0cda023406a8e 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -5524,7 +5524,8 @@ class OffloadingActionBuilder final { // device libraries are only needed when current toolchain is using // AOT compilation. bool SYCLDeviceLibLinked = false; - if (IsSPIR || IsNVPTX) { + Action *NativeCPULib = nullptr; + if (IsSPIR || IsNVPTX || IsSYCLNativeCPU) { bool UseJitLink = IsSPIR && Args.hasFlag(options::OPT_fsycl_device_lib_jit_link, @@ -5532,10 +5533,8 @@ class OffloadingActionBuilder final { bool UseAOTLink = IsSPIR && (IsSpirvAOT || !UseJitLink); SYCLDeviceLibLinked = addSYCLDeviceLibs( TC, SYCLDeviceLibs, UseAOTLink, - C.getDefaultToolChain().getTriple().isWindowsMSVCEnvironment()); - } - if (IsSYCLNativeCPU) { - SYCLDeviceLibLinked |= addSYCLNativeCPULibs(TC, SYCLDeviceLibs); + C.getDefaultToolChain().getTriple().isWindowsMSVCEnvironment(), + IsSYCLNativeCPU, NativeCPULib); } JobAction *LinkSYCLLibs = C.MakeAction(SYCLDeviceLibs, types::TY_LLVM_BC); @@ -5618,6 +5617,15 @@ class OffloadingActionBuilder final { }; Action *PostLinkAction = createPostLinkAction(); if (IsSYCLNativeCPU) { + if (NativeCPULib) { + // The native cpu device lib is linked without --only-needed + // as it contains builtins not referenced in source code but + // needed by the native cpu backend. + clang::driver::ActionList AllLibs = {FullDeviceLinkAction, + NativeCPULib}; + FullDeviceLinkAction = + C.MakeAction(AllLibs, types::TY_LLVM_BC); + } // for SYCL Native CPU, we just take the linked device // modules, lower them to an object file , and link it to the host // object file. @@ -5802,60 +5810,9 @@ class OffloadingActionBuilder final { } } - bool addSYCLNativeCPULibs(const ToolChain *TC, - ActionList &DeviceLinkObjects) { - std::string LibSpirvFile; - if (Args.hasArg(options::OPT_fsycl_libspirv_path_EQ)) { - auto ProvidedPath = - Args.getLastArgValue(options::OPT_fsycl_libspirv_path_EQ).str(); - if (llvm::sys::fs::exists(ProvidedPath)) - LibSpirvFile = ProvidedPath; - } else { - SmallVector LibraryPaths; - - // Expected path w/out install. - SmallString<256> WithoutInstallPath(C.getDriver().ResourceDir); - llvm::sys::path::append(WithoutInstallPath, Twine("../../clc")); - LibraryPaths.emplace_back(WithoutInstallPath.c_str()); - - // Expected path w/ install. - SmallString<256> WithInstallPath(C.getDriver().ResourceDir); - llvm::sys::path::append(WithInstallPath, Twine("../../../share/clc")); - LibraryPaths.emplace_back(WithInstallPath.c_str()); - - // Select libclc variant based on target triple. - // On Windows long is 32 bits, so we have to select the right remangled - // libclc version. - std::string LibSpirvTargetName = - (TC->getAuxTriple()->isOSWindows()) - ? "remangled-l32-signed_char.libspirv-" - : "remangled-l64-signed_char.libspirv-"; - LibSpirvTargetName.append(TC->getTripleString() + ".bc"); - - for (StringRef LibraryPath : LibraryPaths) { - SmallString<128> LibSpirvTargetFile(LibraryPath); - llvm::sys::path::append(LibSpirvTargetFile, LibSpirvTargetName); - if (llvm::sys::fs::exists(LibSpirvTargetFile) || - Args.hasArg(options::OPT__HASH_HASH_HASH)) { - LibSpirvFile = std::string(LibSpirvTargetFile.str()); - break; - } - } - } - - if (!LibSpirvFile.empty()) { - Arg *LibClcInputArg = MakeInputArg(Args, C.getDriver().getOpts(), - Args.MakeArgString(LibSpirvFile)); - auto *SYCLLibClcInputAction = - C.MakeAction(*LibClcInputArg, types::TY_LLVM_BC); - DeviceLinkObjects.push_back(SYCLLibClcInputAction); - return true; - } - return false; - } - bool addSYCLDeviceLibs(const ToolChain *TC, ActionList &DeviceLinkObjects, - bool isSpirvAOT, bool isMSVCEnv) { + bool isSpirvAOT, bool isMSVCEnv, bool isNativeCPU, + Action *&NativeCPULib) { int NumOfDeviceLibLinked = 0; SmallVector, 4> LibLocCandidates; SYCLInstallation.getSYCLDeviceLibPath(LibLocCandidates); @@ -5872,6 +5829,14 @@ class OffloadingActionBuilder final { SmallString<128> LibName(LLCandidate); llvm::sys::path::append(LibName, DeviceLib); if (llvm::sys::fs::exists(LibName)) { + + // NativeCPU currently only needs libsycl-nativecpu_utils and + // libclc, so temporarily skip other device libs in invocation. + // Todo: remove once NativeCPU tests the other libraries. + if (isNativeCPU && + !LibName.str().contains("libsycl-nativecpu_utils")) + continue; + ++NumOfDeviceLibLinked; Arg *InputArg = MakeInputArg(Args, C.getDriver().getOpts(), Args.MakeArgString(LibName)); @@ -5905,6 +5870,16 @@ class OffloadingActionBuilder final { } if (!LibLocSelected) LibLocSelected = !LibLocSelected; + + // The device link stage may remove symbols not referenced in the + // source code. Since libsycl-nativecpu_utils contains such symbols + // which are later needed by the NativeCPU backend passes we link + // that library separately afterwards without --only-needed. + if (isNativeCPU) { + assert(!NativeCPULib); + NativeCPULib = DeviceLinkObjects.back(); + DeviceLinkObjects.pop_back(); + } } } } @@ -5912,7 +5887,7 @@ class OffloadingActionBuilder final { // For NVPTX backend we need to also link libclc and CUDA libdevice // at the same stage that we link all of the unbundled SYCL libdevice // objects together. - if (TC->getTriple().isNVPTX() && NumOfDeviceLibLinked) { + if ((TC->getTriple().isNVPTX() || isNativeCPU) && NumOfDeviceLibLinked) { std::string LibSpirvFile; if (Args.hasArg(options::OPT_fsycl_libspirv_path_EQ)) { auto ProvidedPath = @@ -5932,13 +5907,18 @@ class OffloadingActionBuilder final { llvm::sys::path::append(WithInstallPath, Twine("../../../share/clc")); LibraryPaths.emplace_back(WithInstallPath.c_str()); + // TODO: check if the isNVPTX() path can also use + // TC->getTripleString() so that the conditional could be removed + const std::string TrStr = + isNativeCPU ? TC->getTripleString() : "nvptx64-nvidia-cuda"; + // Select remangled libclc variant - std::string LibSpirvTargetName = - (TC->getAuxTriple()->isOSWindows()) - ? "remangled-l32-signed_char.libspirv-nvptx64-nvidia-cuda." - "bc" - : "remangled-l64-signed_char.libspirv-nvptx64-nvidia-cuda." - "bc"; + StringRef LibSpirvTargetNamePref = + TC->getAuxTriple()->isOSWindows() + ? "remangled-l32-signed_char.libspirv-" + : "remangled-l64-signed_char.libspirv-"; + llvm::Twine LibSpirvTargetNameTemp = LibSpirvTargetNamePref + TrStr; + llvm::Twine LibSpirvTargetName = LibSpirvTargetNameTemp + ".bc"; for (StringRef LibraryPath : LibraryPaths) { SmallString<128> LibSpirvTargetFile(LibraryPath); @@ -5950,7 +5930,6 @@ class OffloadingActionBuilder final { } } } - if (!LibSpirvFile.empty()) { Arg *LibClcInputArg = MakeInputArg(Args, C.getDriver().getOpts(), Args.MakeArgString(LibSpirvFile)); @@ -5959,6 +5938,11 @@ class OffloadingActionBuilder final { DeviceLinkObjects.push_back(SYCLLibClcInputAction); } + if (isNativeCPU) { + // return here to not generate cuda actions + return NumOfDeviceLibLinked != 0; + } + const toolchains::CudaToolChain *CudaTC = static_cast(TC); for (const auto &LinkInputEnum : enumerate(DeviceLinkerInputs)) { @@ -9233,7 +9217,10 @@ InputInfoList Driver::BuildJobsForActionNoCache( Action::OffloadKind DependentOffloadKind; if (UI.DependentOffloadKind == Action::OFK_SYCL && TargetDeviceOffloadKind == Action::OFK_None && - !(isSYCLNativeCPU(Args) && isSYCLNativeCPU(C.getDefaultToolChain().getTriple(), TC->getTriple()))) + !(isSYCLNativeCPU(Args) && + isSYCLNativeCPU(C.getDefaultToolChain().getTriple(), + TC->getTriple()) && + UA->getDependentActionsInfo().size() > 1)) DependentOffloadKind = Action::OFK_Host; else DependentOffloadKind = UI.DependentOffloadKind; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 6d60e4d64eab8..db47af8c8a2d5 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5605,10 +5605,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, // Let the FE know we are doing a SYCL offload compilation, but we are // doing the host pass. CmdArgs.push_back("-fsycl-is-host"); - if (IsSYCLNativeCPU) { - CmdArgs.push_back("-D"); - CmdArgs.push_back("__SYCL_NATIVE_CPU__"); - } if (!D.IsCLMode()) { // SYCL library is guaranteed to work correctly only with dynamic diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 39a6d6d5534e8..add27c68e958d 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -290,6 +290,15 @@ SYCL::getDeviceLibraries(const Compilation &C, const llvm::Triple &TargetTriple, const SYCLDeviceLibsList SYCLDeviceSanitizerLibs = { {"libsycl-sanitizer", "internal"}}; #endif + + const SYCLDeviceLibsList SYCLNativeCpuDeviceLibs = { + {"libsycl-nativecpu_utils", "internal"}}; + + const bool isNativeCPU = + (driver::isSYCLNativeCPU(Args) && + driver::isSYCLNativeCPU(C.getDefaultToolChain().getTriple(), + TargetTriple)); + bool IsWindowsMSVCEnv = C.getDefaultToolChain().getTriple().isWindowsMSVCEnvironment(); bool IsNewOffload = C.getDriver().getUseNewOffloadingDriver(); @@ -368,6 +377,10 @@ SYCL::getDeviceLibraries(const Compilation &C, const llvm::Triple &TargetTriple, addLibraries(SYCLDeviceSanitizerLibs); } #endif + + if (isNativeCPU) + addLibraries(SYCLNativeCpuDeviceLibs); + return LibraryList; } diff --git a/clang/test/Driver/sycl-native-cpu-fsycl.cpp b/clang/test/Driver/sycl-native-cpu-fsycl.cpp index 9750f6cb09507..a352366103e8f 100644 --- a/clang/test/Driver/sycl-native-cpu-fsycl.cpp +++ b/clang/test/Driver/sycl-native-cpu-fsycl.cpp @@ -1,3 +1,4 @@ +//UNSUPPORTED: (system-windows && !native_cpu) //RUN: %clang -fsycl -fsycl-targets=native_cpu -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -ccc-print-phases %s 2>&1 | FileCheck %s --check-prefix=CHECK_ACTIONS //RUN: %clang -fsycl -fsycl-targets=native_cpu -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -ccc-print-bindings %s 2>&1 | FileCheck %s --check-prefix=CHECK_BINDINGS //RUN: %clang -fsycl -fsycl-targets=native_cpu -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | FileCheck %s --check-prefix=CHECK_INVO @@ -19,24 +20,29 @@ //CHECK_ACTIONS: +- 7: compiler, {6}, ir, (host-sycl) //CHECK_ACTIONS: +- 8: backend, {7}, assembler, (host-sycl) //CHECK_ACTIONS: +- 9: assembler, {8}, object, (host-sycl) - -//CHECK_ACTIONS:| +- 10: linker, {5}, ir, (device-sycl) -//CHECK_ACTIONS:| |- 11: input, "{{.*}}libspirv{{.*}}", ir, (device-sycl) -//CHECK_ACTIONS:| +- 12: linker, {10, 11}, ir, (device-sycl) -//CHECK_ACTIONS:| +- 13: backend, {12}, assembler, (device-sycl) -//CHECK_ACTIONS:| +- 14: assembler, {13}, object, (device-sycl) -//CHECK_ACTIONS:|- 15: offload, "device-sycl ({{.*}})" {14}, object -//CHECK_ACTIONS:| +- 16: sycl-post-link, {12}, tempfiletable, (device-sycl) -//CHECK_ACTIONS:| +- 17: clang-offload-wrapper, {16}, object, (device-sycl) -//CHECK_ACTIONS:|- 18: offload, "device-sycl ({{.*}})" {17}, object -//CHECK_ACTIONS:19: linker, {9, 15, 18}, image, (host-sycl) +//CHECK_ACTIONS: +- 10: linker, {5}, ir, (device-sycl) +//CHECK_ACTIONS: |- [[SPIRVLIB:.*]]: input, "{{.*}}libspirv{{.*}}", ir, (device-sycl) +//different libraries may be linked on different platforms, so just check the common stages +//CHECK_ACTIONS: +- [[LINKALL:.*]]: linker, {10, [[SPIRVLIB]]}, ir, (device-sycl) +//CHECK_ACTIONS: |- [[NCPUINP:.*]]: input, "{{.*}}nativecpu{{.*}}", ir, (device-sycl) +//CHECK_ACTIONS: +- [[NCPULINK:.*]]: linker, {[[LINKALL]], [[NCPUINP]]}, ir, (device-sycl) +//this is where we compile the device code to a shared lib, and we link the host shared lib and the device shared lib +//CHECK_ACTIONS:| +- [[VAL81:.*]]: backend, {[[NCPULINK]]}, assembler, (device-sycl) +//CHECK_ACTIONS:| +- [[VAL82:.*]]: assembler, {[[VAL81]]}, object, (device-sycl) +//CHECK_ACTIONS:|- [[VAL822:.*]]: offload, "device-sycl ({{.*}})" {[[VAL82]]}, object +//call sycl-post-link and clang-offload-wrapper +//CHECK_ACTIONS:| +- [[VAL83:.*]]: sycl-post-link, {[[LINKALL]]}, tempfiletable, (device-sycl) +//CHECK_ACTIONS:| +- [[VAL84:.*]]: clang-offload-wrapper, {[[VAL83]]}, object, (device-sycl) +//CHECK_ACTIONS:|- [[VAL85:.*]]: offload, "device-sycl ({{.*}})" {[[VAL84]]}, object +//CHECK_ACTIONS:[[VAL86:.*]]: linker, {9, [[VAL822]], [[VAL85]]}, image, (host-sycl) //CHECK_BINDINGS:# "{{.*}}" - "clang", inputs: ["{{.*}}sycl-native-cpu-fsycl.cpp"], output: "[[KERNELIR:.*]].bc" //CHECK_BINDINGS:# "{{.*}}" - "Append Footer to source", inputs: ["{{.*}}sycl-native-cpu-fsycl.cpp"], output: "[[SRCWFOOTER:.*]].cpp" //CHECK_BINDINGS:# "{{.*}}" - "clang", inputs: ["[[SRCWFOOTER]].cpp", "[[KERNELIR]].bc"], output: "[[HOSTOBJ:.*]].o" //CHECK_BINDINGS:# "{{.*}}" - "SYCL::Linker", inputs: ["[[KERNELIR]].bc"], output: "[[KERNELLINK:.*]].bc" //CHECK_BINDINGS:# "{{.*}}" - "SYCL::Linker", inputs: ["[[KERNELLINK]].bc", "{{.*}}.bc"], output: "[[KERNELLINKWLIB:.*]].bc" -//CHECK_BINDINGS:# "{{.*}}" - "clang", inputs: ["[[KERNELLINKWLIB]].bc"], output: "[[KERNELOBJ:.*]].o" +//CHECK_BINDINGS:# "{{.*}}" - "SYCL::Linker", inputs: ["[[KERNELLINKWLIB]].bc", "[[UNBUNDLEDNCPU:.*]].bc"], output: "[[KERNELLINKWLIB12:.*]].bc" +//CHECK_BINDINGS:# "{{.*}}" - "clang", inputs: ["[[KERNELLINKWLIB12]].bc"], output: "[[KERNELOBJ:.*]].o" //CHECK_BINDINGS:# "{{.*}}" - "SYCL post link", inputs: ["[[KERNELLINKWLIB]].bc"], output: "[[TABLEFILE:.*]].table" //CHECK_BINDINGS:# "{{.*}}" - "offload wrapper", inputs: ["[[TABLEFILE]].table"], output: "[[WRAPPEROBJ:.*]].o" //CHECK_BINDINGS:# "{{.*}}" - "{{.*}}::Linker", inputs: ["[[HOSTOBJ]].o", "[[KERNELOBJ]].o", "[[WRAPPEROBJ]].o"], output: "a.{{.*}}" @@ -49,8 +55,8 @@ // checks that the device and host triple is correct in the generated actions when it is set explicitly //CHECK_ACTIONS-AARCH64: +- 6: offload, "host-sycl (aarch64-unknown-linux-gnu)" {2}, "device-sycl (aarch64-unknown-linux-gnu)" {5}, c++-cpp-output -//CHECK_ACTIONS-AARCH64:|- 15: offload, "device-sycl (aarch64-unknown-linux-gnu)" {14}, object -//CHECK_ACTIONS-AARCH64:|- 18: offload, "device-sycl (aarch64-unknown-linux-gnu)" {17}, object +//CHECK_ACTIONS-AARCH64:|- 17: offload, "device-sycl (aarch64-unknown-linux-gnu)" {16}, object +//CHECK_ACTIONS-AARCH64:|- 20: offload, "device-sycl (aarch64-unknown-linux-gnu)" {19}, object // checks that bindings are correct when linking together multiple TUs on native cpu //CHECK_BINDINGS_MULTI_TU:# "{{.*}}" - "offload bundler", inputs: ["{{.*}}.o"], outputs: ["[[FILE1HOST:.*]].o", "[[FILE1DEV:.*]].o"] @@ -59,8 +65,7 @@ //CHECK_BINDINGS_MULTI_TU:# "{{.*}}" - "Convert SPIR-V to LLVM-IR if needed", inputs: ["[[FILE2DEV]].o"], output: "[[FILE2SPV:.*]].bc" //CHECK_BINDINGS_MULTI_TU:# "{{.*}}" - "SYCL::Linker", inputs: ["[[FILE1SPV]].bc", "[[FILE2SPV]].bc"], output: "[[LINK1:.*]].bc" //CHECK_BINDINGS_MULTI_TU:# "{{.*}}" - "SYCL::Linker", inputs: ["[[LINK1]].bc", "{{.*}}.bc"], output: "[[LINK2:.*]].bc" -//CHECK_BINDINGS_MULTI_TU:# "{{.*}}" - "clang", inputs: ["[[LINK2]].bc"], output: "[[KERNELO:.*]].o" +//CHECK_BINDINGS_MULTI_TU:# "{{.*}}" - "clang", inputs: ["{{.*}}.bc"], output: "[[KERNELO:.*]].o" //CHECK_BINDINGS_MULTI_TU:# "{{.*}}" - "SYCL post link", inputs: ["[[LINK2]].bc"], output: "[[POSTL:.*]].table" //CHECK_BINDINGS_MULTI_TU:# "{{.*}}" - "offload wrapper", inputs: ["[[POSTL]].table"], output: "[[WRAP:.*]].o" //CHECK_BINDINGS_MULTI_TU:# "{{.*}}" - "{{.*}}::Linker", inputs: ["[[FILE1HOST]].o", "[[FILE2HOST]].o", "[[KERNELO]].o", "[[WRAP]].o"], output: "{{.*}}" - diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake index ce3f826316e55..c1aac6d017eff 100644 --- a/libdevice/cmake/modules/SYCLLibdevice.cmake +++ b/libdevice/cmake/modules/SYCLLibdevice.cmake @@ -167,6 +167,16 @@ if (NOT MSVC) sycl-compiler) endif() +if("native_cpu" IN_LIST SYCL_ENABLE_PLUGINS) + if (NOT DEFINED NATIVE_CPU_DIR) + message( FATAL_ERROR "Undefined UR variable NATIVE_CPU_DIR. The name may have changed." ) + endif() + # Include NativeCPU UR adapter path to enable finding header file with state struct. + # libsycl-nativecpu_utils is only needed as BC file by NativeCPU. + # Todo: add versions for other targets (for cross-compilation) + add_devicelib_bc(libsycl-nativecpu_utils SRC nativecpu_utils.cpp DEP ${itt_obj_deps} EXTRA_ARGS -I ${NATIVE_CPU_DIR} -fsycl-targets=native_cpu) +endif() + add_devicelib(libsycl-itt-stubs SRC itt_stubs.cpp DEP ${itt_obj_deps}) add_devicelib(libsycl-itt-compiler-wrappers SRC itt_compiler_wrappers.cpp DEP ${itt_obj_deps}) add_devicelib(libsycl-itt-user-wrappers SRC itt_user_wrappers.cpp DEP ${itt_obj_deps}) diff --git a/libdevice/nativecpu_utils.cpp b/libdevice/nativecpu_utils.cpp new file mode 100755 index 0000000000000..24d3fbafad3bc --- /dev/null +++ b/libdevice/nativecpu_utils.cpp @@ -0,0 +1,361 @@ +//==--- nativecpu_utils.cpp - builtins for Native CPU ------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This file contains all device builtins and types that are either directly +// referenced in the SYCL source code (e.g. __spirv_...) or used by the +// NativeCPU passes. This is currently built as a device code library but +// could perhaps eventually be compiled as a standard C++ translation unit +// without SYCL attributes etc. + +#if defined(__SYCL_NATIVE_CPU__) + +#include "CL/__spirv/spirv_ops.hpp" +#include "device.h" +#include +#include + +// including state definition from Native CPU UR adapter +#include "nativecpu_state.hpp" +using __nativecpu_state = native_cpu::state; + +#undef DEVICE_EXTERNAL +#undef DEVICE_EXTERN_C +#define DEVICE_EXTERN_C extern "C" SYCL_EXTERNAL +#define DEVICE_EXTERNAL_C DEVICE_EXTERN_C __attribute__((always_inline)) +#define DEVICE_EXTERNAL SYCL_EXTERNAL __attribute__((always_inline)) + +#define OCL_LOCAL __attribute__((opencl_local)) +#define OCL_GLOBAL __attribute__((opencl_global)) + +DEVICE_EXTERNAL OCL_LOCAL void * +__spirv_GenericCastToPtrExplicit_ToLocal(void *p, int) { + return (OCL_LOCAL void *)p; +} + +DEVICE_EXTERNAL OCL_GLOBAL void * +__spirv_GenericCastToPtrExplicit_ToGlobal(void *p, int) { + return (OCL_GLOBAL void *)p; +} + +DEVICE_EXTERN_C void __mux_work_group_barrier(uint32_t id, uint32_t scope, + uint32_t semantics); +__SYCL_CONVERGENT__ DEVICE_EXTERNAL void +__spirv_ControlBarrier(uint32_t Execution, uint32_t Memory, + uint32_t Semantics) { + if (__spv::Scope::Flag::Workgroup == Execution) + // todo: check id and args; use mux constants + __mux_work_group_barrier(0, Execution, Semantics); +} + +// Turning clang format off here because it reorders macro invocations +// making the following code very difficult to read. +// clang-format off +#define DefSubgroupBlockINTEL1(Type, PType) \ + template <> \ + __SYCL_CONVERGENT__ DEVICE_EXTERNAL Type \ + __spirv_SubgroupBlockReadINTEL(const OCL_GLOBAL PType *Ptr) noexcept { \ + return *Ptr; \ + } \ + template <> \ + __SYCL_CONVERGENT__ DEVICE_EXTERNAL void \ + __spirv_SubgroupBlockWriteINTEL(PType OCL_GLOBAL * ptr, \ + Type v) noexcept { \ + *(Type *)ptr = v; \ + } + +#define DefSubgroupBlockINTEL_vt(Type, VT_name) \ + DefSubgroupBlockINTEL1(ncpu_types::vtypes::VT_name, Type) + +#define DefSubgroupBlockINTEL(Type) \ + DefSubgroupBlockINTEL1(Type, Type) DefSubgroupBlockINTEL_vt(Type, v2) \ + DefSubgroupBlockINTEL_vt(Type, v4) DefSubgroupBlockINTEL_vt(Type, v8) + +namespace ncpu_types { +template struct vtypes { + using v2 = typename sycl::detail::VecStorage::DataType; + using v4 = typename sycl::detail::VecStorage::DataType; + using v8 = typename sycl::detail::VecStorage::DataType; +}; +} // namespace ncpu_types + +DefSubgroupBlockINTEL(uint32_t) DefSubgroupBlockINTEL(uint64_t) +DefSubgroupBlockINTEL(uint8_t) DefSubgroupBlockINTEL(uint16_t) + +#define DefineGOp1(spir_sfx, mux_name)\ +DEVICE_EXTERN_C bool mux_name(bool);\ +DEVICE_EXTERNAL bool __spirv_Group ## spir_sfx(unsigned g, bool val) {\ + if (__spv::Scope::Flag::Subgroup == g)\ + return mux_name(val);\ + return false;\ +} + +DefineGOp1(Any, __mux_sub_group_any_i1) +DefineGOp1(All, __mux_sub_group_all_i1) + + +#define DefineGOp(Type, MuxType, spir_sfx, mux_sfx) \ + DEVICE_EXTERN_C MuxType __mux_sub_group_scan_inclusive_##mux_sfx(MuxType); \ + DEVICE_EXTERN_C MuxType __mux_sub_group_scan_exclusive_##mux_sfx(MuxType); \ + DEVICE_EXTERN_C MuxType __mux_sub_group_reduce_##mux_sfx(MuxType); \ + DEVICE_EXTERN_C MuxType __mux_work_group_scan_exclusive_##mux_sfx(uint32_t, \ + MuxType); \ + DEVICE_EXTERN_C MuxType __mux_work_group_scan_inclusive_##mux_sfx(uint32_t, \ + MuxType); \ + DEVICE_EXTERN_C MuxType __mux_work_group_reduce_##mux_sfx(uint32_t, MuxType);\ + DEVICE_EXTERNAL Type __spirv_Group##spir_sfx(uint32_t g, uint32_t id, \ + Type v) { \ + if (__spv::Scope::Flag::Subgroup == g) { \ + if (static_cast(__spv::GroupOperation::InclusiveScan) == id) \ + return __mux_sub_group_scan_inclusive_##mux_sfx(v); \ + if (static_cast(__spv::GroupOperation::ExclusiveScan) == id) \ + return __mux_sub_group_scan_exclusive_##mux_sfx(v); \ + if (static_cast(__spv::GroupOperation::Reduce) == id) \ + return __mux_sub_group_reduce_##mux_sfx(v); \ + } else if (__spv::Scope::Flag::Workgroup == g) { \ + uint32_t bid = 0; \ + if (static_cast(__spv::GroupOperation::ExclusiveScan) == id) \ + return __mux_work_group_scan_exclusive_##mux_sfx(bid, v); \ + if (static_cast(__spv::GroupOperation::InclusiveScan) == id) \ + return __mux_work_group_scan_inclusive_##mux_sfx(bid, v); \ + if (static_cast(__spv::GroupOperation::Reduce) == id) \ + return __mux_work_group_reduce_##mux_sfx(bid, v); \ + } \ + return Type(); /*todo: add support for other flags as they are tested*/ \ + } + +#define DefineSignedGOp(Name, MuxName, Bits)\ + DefineGOp(int##Bits##_t, int##Bits##_t, Name, MuxName##Bits) + +#define DefineUnsignedGOp(Name, MuxName, Bits)\ + DefineGOp(uint##Bits##_t, int##Bits##_t, Name, MuxName##Bits) + +#define Define_32_64(Define, Name, MuxName) \ + Define(Name, MuxName, 32) \ + Define(Name, MuxName, 64) + +// todo: add support for other integer and float types once there are tests +#define DefineIntGOps(Name, MuxName) \ + Define_32_64(DefineSignedGOp, Name, MuxName) \ + Define_32_64(DefineUnsignedGOp, Name, MuxName) + +#define DefineFPGOps(Name, MuxName) \ + DefineGOp(float, float, Name, MuxName##32) \ + DefineGOp(double, double, Name, MuxName##64) + +DefineIntGOps(IAdd, add_i) +DefineIntGOps(IMulKHR, mul_i) + +Define_32_64(DefineUnsignedGOp, UMin, umin_i) +Define_32_64(DefineUnsignedGOp, UMax, umax_i) +Define_32_64(DefineSignedGOp, SMin, smin_i) +Define_32_64(DefineSignedGOp, SMax, smax_i) + +DefineFPGOps(FMulKHR, fmul_f) +DefineFPGOps(FAdd, fadd_f) +DefineFPGOps(FMin, fmin_f) +DefineFPGOps(FMax, fmax_f) + +#define DefineBitwiseGroupOp(Type, MuxType, mux_sfx) \ + DefineGOp(Type, MuxType, BitwiseOrKHR, or_##mux_sfx) \ + DefineGOp(Type, MuxType, BitwiseXorKHR, xor_##mux_sfx) \ + DefineGOp(Type, MuxType, BitwiseAndKHR, and_##mux_sfx) + +DefineBitwiseGroupOp(int32_t, int32_t, i32) +DefineBitwiseGroupOp(uint32_t, int32_t, i32) +DefineBitwiseGroupOp(int64_t, int64_t, i64) +DefineBitwiseGroupOp(uint64_t, int64_t, i64) + +#define DefineBroadCastImpl(Type, Sfx, MuxType, IDType) \ + DEVICE_EXTERN_C MuxType __mux_work_group_broadcast_##Sfx( \ + int32_t id, MuxType val, int64_t lidx, int64_t lidy, int64_t lidz); \ + DEVICE_EXTERN_C MuxType __mux_sub_group_broadcast_##Sfx(MuxType val, \ + int32_t sg_lid); \ + DEVICE_EXTERNAL Type __spirv_GroupBroadcast(uint32_t g, Type v, \ + IDType l) { \ + if (__spv::Scope::Flag::Subgroup == g) \ + return __mux_sub_group_broadcast_##Sfx(v, l); \ + return Type(); /*todo: add support for other flags as they are tested*/ \ + } + +#define DefineBroadCast(Type, Sfx, MuxType)\ + DefineBroadCastImpl(Type, Sfx, MuxType, uint32_t) + +DefineBroadCast(int64_t, i64, int64_t) +DefineBroadCast(uint64_t, i64, int64_t) +DefineBroadCast(int32_t, i32, int32_t) +DefineBroadCast(uint32_t, i32, int32_t) +DefineBroadCast(float, f32, float) +DefineBroadCast(double, f64, double) + +DefineBroadCastImpl(int32_t, i32, int32_t, uint64_t) +DefineBroadCastImpl(float, f32, float, uint64_t) +DefineBroadCastImpl(double, f64, double, uint64_t) +DefineBroadCastImpl(uint64_t, i64, int64_t, uint64_t) + + +#define DefShuffleINTEL(Type, Sfx, MuxType) \ + DEVICE_EXTERN_C MuxType __mux_sub_group_shuffle_##Sfx(MuxType val, \ + int32_t lid); \ + template <> \ + DEVICE_EXTERNAL Type __spirv_SubgroupShuffleINTEL( \ + Type val, unsigned id) noexcept { \ + return (Type)__mux_sub_group_shuffle_##Sfx((MuxType)val, id); \ + } + +#define DefShuffleUpINTEL(Type, Sfx, MuxType) \ + DEVICE_EXTERN_C MuxType __mux_sub_group_shuffle_up_##Sfx( \ + MuxType prev, MuxType curr, int32_t delta); \ + template <> \ + DEVICE_EXTERNAL Type __spirv_SubgroupShuffleUpINTEL( \ + Type prev, Type curr, unsigned delta) noexcept { \ + return (Type)__mux_sub_group_shuffle_up_##Sfx((MuxType)prev, \ + (MuxType)curr, delta); \ + } + +#define DefShuffleDownINTEL(Type, Sfx, MuxType) \ + DEVICE_EXTERN_C MuxType __mux_sub_group_shuffle_down_##Sfx( \ + MuxType curr, MuxType next, int32_t delta); \ + template <> \ + DEVICE_EXTERNAL Type __spirv_SubgroupShuffleDownINTEL( \ + Type curr, Type next, unsigned delta) noexcept { \ + return (Type)__mux_sub_group_shuffle_down_##Sfx((MuxType)curr, \ + (MuxType)next, delta); \ + } + +#define DefShuffleXorINTEL(Type, Sfx, MuxType) \ + DEVICE_EXTERN_C MuxType __mux_sub_group_shuffle_xor_##Sfx(MuxType val, \ + int32_t xor_val); \ + template <> \ + DEVICE_EXTERNAL Type __spirv_SubgroupShuffleXorINTEL( \ + Type data, unsigned value) noexcept { \ + return (Type)__mux_sub_group_shuffle_xor_##Sfx((MuxType)data, value); \ + } + +#define DefShuffleINTEL_All(Type, Sfx, MuxType) \ + DefShuffleINTEL(Type, Sfx, MuxType) \ + DefShuffleUpINTEL(Type, Sfx, MuxType) \ + DefShuffleDownINTEL(Type, Sfx, MuxType) \ + DefShuffleXorINTEL(Type, Sfx, MuxType) + +DefShuffleINTEL_All(uint64_t, i64, int64_t) +DefShuffleINTEL_All(int64_t, i64, int64_t) +DefShuffleINTEL_All(int32_t, i32, int32_t) +DefShuffleINTEL_All(uint32_t, i32, int32_t) +DefShuffleINTEL_All(int16_t, i16, int16_t) +DefShuffleINTEL_All(uint16_t, i16, int16_t) +DefShuffleINTEL_All(double, f64, double) +DefShuffleINTEL_All(float, f32, float) + +#define DefineShuffleVec(T, N, Sfx, MuxType) \ + using vt##T##N = sycl::detail::VecStorage::DataType; \ + using vt##MuxType##N = sycl::detail::VecStorage::DataType; \ + DefShuffleINTEL_All(vt##T##N, v##N##Sfx, vt##MuxType##N) + +#define DefineShuffleVec2to16(Type, Sfx, MuxType) \ + DefineShuffleVec(Type, 2, Sfx, MuxType) \ + DefineShuffleVec(Type, 4, Sfx, MuxType) \ + DefineShuffleVec(Type, 8, Sfx, MuxType) \ + DefineShuffleVec(Type, 16, Sfx, MuxType) + +DefineShuffleVec2to16(int32_t, i32, int32_t) +DefineShuffleVec2to16(uint32_t, i32, int32_t) +DefineShuffleVec2to16(float, f32, float) + +#define Define2ArgForward(Type, Name, Callee)\ +DEVICE_EXTERNAL Type Name(Type a, Type b) { return Callee(a,b);} + +Define2ArgForward(uint64_t, __spirv_ocl_u_min, std::min) + + +#define GEN_u32(bname, muxname) \ + DEVICE_EXTERN_C uint32_t muxname(); \ + DEVICE_EXTERNAL uint32_t bname() { return muxname(); } +// subgroup +GEN_u32(__spirv_SubgroupLocalInvocationId, __mux_get_sub_group_local_id) +GEN_u32(__spirv_SubgroupMaxSize, __mux_get_max_sub_group_size) +GEN_u32(__spirv_SubgroupId, __mux_get_sub_group_id) +GEN_u32(__spirv_NumSubgroups, __mux_get_num_sub_groups) +GEN_u32(__spirv_SubgroupSize, __mux_get_sub_group_size) + +// I64_I32 +#define GEN_p(bname, muxname, arg) \ + DEVICE_EXTERN_C uint64_t muxname(uint32_t); \ + DEVICE_EXTERNAL uint64_t bname() { return muxname(arg); } + +#define GEN_xyz(bname, ncpu_name) \ + GEN_p(bname##_x, ncpu_name, 0) \ + GEN_p(bname##_y, ncpu_name, 1) \ + GEN_p(bname##_z, ncpu_name, 2) + +GEN_xyz(__spirv_GlobalInvocationId, __mux_get_global_id) +GEN_xyz(__spirv_GlobalSize, __mux_get_global_size) +GEN_xyz(__spirv_GlobalOffset, __mux_get_global_offset) +GEN_xyz(__spirv_LocalInvocationId, __mux_get_local_id) +GEN_xyz(__spirv_NumWorkgroups, __mux_get_num_groups) +GEN_xyz(__spirv_WorkgroupSize, __mux_get_local_size) +GEN_xyz(__spirv_WorkgroupId, __mux_get_group_id) + +#define NCPUPREFIX(name) __dpcpp_nativecpu##name + +template using MakeGlobalType = + typename sycl::detail::DecoratedType < T, sycl::access::address_space:: + global_space>::type; + +#define DefStateSetWithType(name, field, type) \ + DEVICE_EXTERNAL_C void NCPUPREFIX(name)( \ + type value, MakeGlobalType<__nativecpu_state> *s) { \ + s->field = value; \ + } + +// Subgroup setters +DefStateSetWithType(_set_num_sub_groups, NumSubGroups, uint32_t) +DefStateSetWithType(_set_sub_group_id, SubGroup_id, uint32_t) +DefStateSetWithType(_set_max_sub_group_size, SubGroup_size, uint32_t) + +#define DefineStateGetWithType(name, field, type)\ + DEVICE_EXTERNAL_C type NCPUPREFIX(name)( \ + MakeGlobalType<__nativecpu_state> *s) { \ + return s->field; \ + } +#define DefineStateGet_U32(name, field) \ + DefineStateGetWithType(name, field, uint32_t) + +// Subgroup getters +DefineStateGet_U32(_get_sub_group_id, SubGroup_id) +DefineStateGet_U32(_get_sub_group_local_id, SubGroup_local_id) +DefineStateGet_U32(_get_sub_group_size, SubGroup_size) +DefineStateGet_U32(_get_max_sub_group_size, SubGroup_size) +DefineStateGet_U32(_get_num_sub_groups, NumSubGroups) + +#define DefineStateGetWithType2(name, field, rtype, ptype) \ + DEVICE_EXTERNAL_C rtype NCPUPREFIX(name)(ptype dim, \ + MakeGlobalType<__nativecpu_state> *s) { \ + return s->field[dim]; \ + } + +#define DefineStateGet_U64(name, field) \ + DefineStateGetWithType2(name, field, uint64_t, uint32_t) + +// Workgroup getters +DefineStateGet_U64(_get_global_id, MGlobal_id) +DefineStateGet_U64(_get_global_range, MGlobal_range) +DefineStateGet_U64(_get_global_offset, MGlobalOffset) +DefineStateGet_U64(_get_local_id, MLocal_id) +DefineStateGet_U64(_get_num_groups, MNumGroups) +DefineStateGet_U64(_get_wg_size, MWorkGroup_size) +DefineStateGet_U64(_get_wg_id, MWorkGroup_id) + +DEVICE_EXTERNAL_C void + __dpcpp_nativecpu_set_local_id(uint32_t dim, uint64_t value, + MakeGlobalType<__nativecpu_state> *s) { + s->MLocal_id[dim] = value; + s->MGlobal_id[dim] = s->MWorkGroup_size[dim] * s->MWorkGroup_id[dim] + + s->MLocal_id[dim] + s->MGlobalOffset[dim]; +} + +#endif // __SYCL_NATIVE_CPU__ diff --git a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h index d1f5b0173991c..d1694a7b99696 100644 --- a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h +++ b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h @@ -21,22 +21,6 @@ namespace utils { void addSYCLNativeCPUBackendPasses(ModulePassManager &MPM, ModuleAnalysisManager &MAM, OptimizationLevel OptLevel); -const constexpr char NativeCPUGlobalId[] = "__dpcpp_nativecpu_get_global_id"; -const constexpr char NativeCPUGlobaRange[] = - "__dpcpp_nativecpu_get_global_range"; -const constexpr char NativeCPUGlobalOffset[] = - "__dpcpp_nativecpu_get_global_offset"; -const constexpr char NativeCPULocalId[] = "__dpcpp_nativecpu_get_local_id"; -const constexpr char NativeCPUNumGroups[] = "__dpcpp_nativecpu_get_num_groups"; -const constexpr char NativeCPUWGSize[] = "__dpcpp_nativecpu_get_wg_size"; -const constexpr char NativeCPUWGId[] = "__dpcpp_nativecpu_get_wg_id"; -const constexpr char NativeCPUSetNumSubgroups[] = - "__dpcpp_nativecpu_set_num_sub_groups"; -const constexpr char NativeCPUSetSubgroupId[] = - "__dpcpp_nativecpu_set_sub_group_id"; -const constexpr char NativeCPUSetMaxSubgroupSize[] = - "__dpcpp_nativecpu_set_max_sub_group_size"; -const constexpr char NativeCPUSetLocalId[] = "__dpcpp_nativecpu_set_local_id"; constexpr char SYCLNATIVECPUSUFFIX[] = ".SYCLNCPU"; constexpr char SYCLNATIVECPUKERNEL[] = ".NativeCPUKernel"; diff --git a/llvm/lib/SYCLNativeCPUUtils/ConvertToMuxBuiltinsSYCLNativeCPU.cpp b/llvm/lib/SYCLNativeCPUUtils/ConvertToMuxBuiltinsSYCLNativeCPU.cpp index 7ca462fdc05c7..6e3e73dc69901 100644 --- a/llvm/lib/SYCLNativeCPUUtils/ConvertToMuxBuiltinsSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLNativeCPUUtils/ConvertToMuxBuiltinsSYCLNativeCPU.cpp @@ -42,119 +42,12 @@ static void fixFunctionAttributes(Function *F) { F->addFnAttr("frame-pointer", "none"); } -// Helper macros for constructing builtin MS names -#define GENMS1(builtin_str) "?" builtin_str "@@YA_KXZ" - -#define GEN_IT_proc(b_str, len) "_Z" #len b_str "v" -#define GEN_p(b_str, len, ncpu_bstr, num) \ - { \ - {([]() { static_assert(sizeof(b_str) == len + 1); }, \ - GEN_IT_proc(b_str, len)), \ - GENMS1(b_str)}, \ - { \ - ncpu_bstr, num \ - } \ - } -#define GEN_xyz(b_name, len, ncpu_name) \ - GEN_p(#b_name "_x", len, #ncpu_name, 0), \ - GEN_p(#b_name "_y", len, #ncpu_name, 1), \ - GEN_p(#b_name "_z", len, #ncpu_name, 2) - -// Todo: add support for more SPIRV builtins here -static const std::pair, - std::pair> - BuiltinNamesMap[] = { - GEN_xyz(__spirv_GlobalInvocationId, 28, __mux_get_global_id), - GEN_xyz(__spirv_GlobalSize, 20, __mux_get_global_size), - GEN_xyz(__spirv_GlobalOffset, 22, __mux_get_global_offset), - GEN_xyz(__spirv_LocalInvocationId, 27, __mux_get_local_id), - GEN_xyz(__spirv_NumWorkgroups, 23, __mux_get_num_groups), - GEN_xyz(__spirv_WorkgroupSize, 23, __mux_get_local_size), - GEN_xyz(__spirv_WorkgroupId, 21, __mux_get_group_id), -}; - -static inline bool isForVisualStudio(StringRef TripleStr) { - llvm::Triple Triple(TripleStr); - return Triple.isKnownWindowsMSVCEnvironment(); -} - -static constexpr char SPIRVBarrier[] = "_Z22__spirv_ControlBarrierjjj"; -static constexpr char MuxBarrier[] = "__mux_work_group_barrier"; - -Function *getReplaceFunc(Module &M, StringRef Name) { - LLVMContext &Ctx = M.getContext(); - auto *MuxFTy = - FunctionType::get(Type::getInt64Ty(Ctx), {Type::getInt32Ty(Ctx)}, false); - auto F = M.getOrInsertFunction(Name, MuxFTy); - return cast(F.getCallee()); -} - -Function *getMuxBarrierFunc(Module &M) { - // void __mux_work_group_barrier(i32 %id, i32 %scope, i32 %semantics) - LLVMContext &Ctx = M.getContext(); - auto *Int32Ty = Type::getInt32Ty(Ctx); - auto *MuxFTy = FunctionType::get(Type::getVoidTy(Ctx), - {Int32Ty, Int32Ty, Int32Ty}, false); - auto FCallee = M.getOrInsertFunction(MuxBarrier, MuxFTy); - auto *F = dyn_cast(FCallee.getCallee()); - if (!F) { - report_fatal_error("Error while inserting mux builtins"); - } - return F; -} - static constexpr const char *MuxKernelAttrName = "mux-kernel"; void setIsKernelEntryPt(Function &F) { F.addFnAttr(MuxKernelAttrName, "entry-point"); } -bool replaceBarriers(Module &M) { - // DPC++ emits - //__spirv_ControlBarrier(__spv::Scope Execution, __spv::Scope Memory, - // uint32_t Semantics) noexcept; - // OCK expects void __mux_work_group_barrier(i32 %id, i32 %scope, i32 - // %semantics) - // __spv::Scope is - // enum Flag : uint32_t { - // CrossDevice = 0, - // Device = 1, - // Workgroup = 2, - // Subgroup = 3, - // Invocation = 4, - // }; - auto *SPIRVBarrierFunc = M.getFunction(SPIRVBarrier); - if (!SPIRVBarrierFunc) { - // No barriers are found, just return - return false; - } - static auto *MuxBarrierFunc = getMuxBarrierFunc(M); - SmallVector> ToRemove; - auto *Zero = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0); - for (auto &Use : SPIRVBarrierFunc->uses()) { - auto *I = dyn_cast(Use.getUser()); - if (!I) - report_fatal_error("Unsupported Value in SYCL Native CPU\n"); - SmallVector Args{Zero, I->getArgOperand(0), - I->getArgOperand(2)}; // todo: check how the - // args map to each other - auto *NewI = CallInst::Create(MuxBarrierFunc->getFunctionType(), - MuxBarrierFunc, Args, "", I); - ToRemove.push_back(std::pair(I, NewI)); - } - - for (auto &El : ToRemove) { - auto OldI = El.first; - auto NewI = El.second; - OldI->replaceAllUsesWith(NewI); - OldI->eraseFromParent(); - } - - SPIRVBarrierFunc->eraseFromParent(); - - return true; -} - } // namespace PreservedAnalyses @@ -165,42 +58,8 @@ ConvertToMuxBuiltinsSYCLNativeCPUPass::run(Module &M, if (F.getCallingConv() == llvm::CallingConv::SPIR_KERNEL) { fixFunctionAttributes(&F); setIsKernelEntryPt(F); - } - } - const bool VisualStudioMangling = isForVisualStudio(M.getTargetTriple()); - - // Then we iterate over all the supported builtins, find their uses and - // replace them with calls to our Native CPU functions. - for (auto &Entry : BuiltinNamesMap) { - auto *Glob = M.getFunction(VisualStudioMangling ? Entry.first.second - : Entry.first.first); - if (!Glob) - continue; - auto *ReplaceFunc = getReplaceFunc(M, Entry.second.first); - SmallVector> ToRemove; - for (auto &Use : Glob->uses()) { - auto *I = dyn_cast(Use.getUser()); - if (!I) - report_fatal_error("Unsupported Value in SYCL Native CPU\n"); - auto *Arg = ConstantInt::get(Type::getInt32Ty(M.getContext()), - Entry.second.second); - auto *NewI = CallInst::Create(ReplaceFunc->getFunctionType(), ReplaceFunc, - {Arg}, "mux_call", I); ModuleChanged = true; - ToRemove.push_back(std::make_pair(I, NewI)); - } - - for (auto &El : ToRemove) { - auto OldI = El.first; - auto NewI = El.second; - OldI->replaceAllUsesWith(NewI); - OldI->eraseFromParent(); } - - // Finally, we erase the builtin from the module - Glob->eraseFromParent(); } - - ModuleChanged |= replaceBarriers(M); return ModuleChanged ? PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp index 410d0aeb3f243..d5445227d1b39 100644 --- a/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp @@ -18,6 +18,7 @@ #ifdef NATIVECPU_USE_OCK #include "compiler/utils/builtin_info.h" +#include "compiler/utils/define_mux_builtins_pass.h" #include "compiler/utils/device_info.h" #include "compiler/utils/prepare_barriers_pass.h" #include "compiler/utils/sub_group_analysis.h" @@ -49,6 +50,10 @@ static cl::opt SYCLNativeCPUNoVecz("sycl-native-cpu-no-vecz", cl::init(false), cl::desc("Disable vectorizer for SYCL Native CPU")); +static cl::opt + SYCLDumpIR("sycl-native-dump-device-ir", cl::init(false), + cl::desc("Dump device IR after Native passes.")); + void llvm::sycl::utils::addSYCLNativeCPUBackendPasses( llvm::ModulePassManager &MPM, ModuleAnalysisManager &MAM, OptimizationLevel OptLevel) { @@ -84,5 +89,19 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses( MPM.addPass(AlwaysInlinerPass()); #endif MPM.addPass(PrepareSYCLNativeCPUPass()); +#ifdef NATIVECPU_USE_OCK + MPM.addPass(compiler::utils::DefineMuxBuiltinsPass()); +#endif MPM.addPass(RenameKernelSYCLNativeCPUPass()); + + if (SYCLDumpIR) { + // Fixme: Use PrintModulePass after PR to fix dependencies/--shared-libs + struct DumpIR : public PassInfoMixin { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM) { + M.print(llvm::outs(), nullptr); + return PreservedAnalyses::all(); + } + }; + MPM.addPass(DumpIR()); + } } diff --git a/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp index f430af168e424..bbc2a84e669dc 100644 --- a/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp @@ -8,7 +8,7 @@ // // Prepares the kernel for SYCL Native CPU: // * Handles kernel calling convention and attributes. -// * Materializes spirv buitlins. +// * Materializes spirv builtins. //===----------------------------------------------------------------------===// #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h" @@ -156,155 +156,55 @@ Function *cloneFunctionAndAddParam(Function *OldF, Type *T, return NewF; } +#define BMapEntry2(n1, n2) {"__mux_" #n1, "__dpcpp_nativecpu_" #n2} +#define BMapEntry1(n) BMapEntry2(n, n) + static const std::pair BuiltinNamesMap[]{ - {"__mux_get_global_id", NativeCPUGlobalId}, - {"__mux_get_global_size", NativeCPUGlobaRange}, - {"__mux_get_global_offset", NativeCPUGlobalOffset}, - {"__mux_get_local_id", NativeCPULocalId}, - {"__mux_get_num_groups", NativeCPUNumGroups}, - {"__mux_get_local_size", NativeCPUWGSize}, - {"__mux_get_group_id", NativeCPUWGId}, - {"__mux_set_num_sub_groups", NativeCPUSetNumSubgroups}, - {"__mux_set_sub_group_id", NativeCPUSetSubgroupId}, - {"__mux_set_max_sub_group_size", NativeCPUSetMaxSubgroupSize}, - {"__mux_set_local_id", NativeCPUSetLocalId}}; + BMapEntry1(get_global_id), + BMapEntry2(get_global_size, get_global_range), + BMapEntry1(get_global_offset), + BMapEntry1(get_local_id), + BMapEntry1(get_num_groups), + BMapEntry2(get_local_size, get_wg_size), + BMapEntry2(get_group_id, get_wg_id), + BMapEntry1(set_num_sub_groups), + BMapEntry1(set_sub_group_id), + BMapEntry1(set_max_sub_group_size), + BMapEntry1(set_local_id), + + BMapEntry1(get_sub_group_local_id), + BMapEntry1(get_max_sub_group_size), + BMapEntry1(get_sub_group_id), + BMapEntry1(get_num_sub_groups), + BMapEntry1(get_sub_group_size)}; static constexpr unsigned int NativeCPUGlobalAS = 1; static constexpr char StateTypeName[] = "struct.__nativecpu_state"; static Type *getStateType(Module &M) { - // %struct.__nativecpu_state = type { [3 x i64], [3 x i64], [3 x i64], [3 x - // i64], [3 x i64], [3 x i64], [3 x i64] } Check that there's no // __nativecpu_state type - auto Types = M.getIdentifiedStructTypes(); - bool HasStateT = - llvm::any_of(Types, [](auto T) { return T->getName() == StateTypeName; }); - if (HasStateT) - report_fatal_error("Native CPU state unexpectedly found in the module."); auto &Ctx = M.getContext(); - auto *I64Ty = Type::getInt64Ty(Ctx); - auto *Array3dTy = ArrayType::get(I64Ty, 3); - std::array Elements; - Elements.fill(Array3dTy); - auto *StateType = StructType::create(Ctx, StateTypeName); - StateType->setBody(Elements); - return StateType; + return StructType::create(Ctx, StateTypeName); } -static const StringMap OffsetMap{ - {NativeCPUGlobalId, 0}, {NativeCPUGlobaRange, 1}, - {NativeCPUWGSize, 2}, {NativeCPUWGId, 3}, - {NativeCPULocalId, 4}, {NativeCPUNumGroups, 5}, - {NativeCPUGlobalOffset, 6}}; - -static Function *addSetLocalIdFunc(Module &M, StringRef Name, Type *StateType) { - /* - void __dpcpp_nativecpu_set_local_id(unsigned dim, size_t value, - __nativecpu_state *s) { - s->MLocal_id[dim] = value; - s->MGlobal_id[dim] = s->MWorkGroup_size[dim] * s->MWorkGroup_id[dim] + - s->MLocal_id[dim] + s->MGlobalOffset[dim]; - } - */ - auto &Ctx = M.getContext(); - Type *I64Ty = Type::getInt64Ty(Ctx); - Type *I32Ty = Type::getInt32Ty(Ctx); - Type *RetTy = Type::getVoidTy(Ctx); - Type *DimTy = I32Ty; - Type *ValTy = I64Ty; - Type *PtrTy = PointerType::get(Ctx, NativeCPUGlobalAS); - FunctionType *FTy = FunctionType::get(RetTy, {DimTy, ValTy, PtrTy}, false); +static Function *addReplaceFunc(Module &M, StringRef Name, Type *RetTy, + const SmallVector &Args) { + // Emit declarations if builtins are not found, e.g. when the device library + // is not linked by some lit tests. TODO: Change those lit tests and + // raise compiler error instead. + llvm::SmallVector Paras; + for (Value *arg : Args) + Paras.push_back(arg->getType()); + FunctionType *FTy = FunctionType::get(RetTy, Paras, false); auto FCallee = M.getOrInsertFunction(Name, FTy); - auto *F = cast(FCallee.getCallee()); - IRBuilder<> Builder(Ctx); - BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F); - Builder.SetInsertPoint(BB); - auto *StatePtr = F->getArg(2); - auto *IdxProm = Builder.CreateZExt(F->getArg(0), DimTy, "idxprom"); - auto *Zero = ConstantInt::get(I64Ty, 0); - auto *Offset = ConstantInt::get(I32Ty, OffsetMap.at(NativeCPULocalId)); - auto *GEP = Builder.CreateGEP(StateType, StatePtr, {Zero, Offset, IdxProm}); - // store local id - auto *Val = F->getArg(1); - Builder.CreateStore(Val, GEP); - // update global id - auto loadHelper = [&](const char *BTName) { - auto *Offset = ConstantInt::get(I32Ty, OffsetMap.at(BTName)); - auto *Addr = - Builder.CreateGEP(StateType, StatePtr, {Zero, Offset, IdxProm}); - auto *Load = Builder.CreateLoad(I64Ty, Addr); - return Load; - }; - auto *WGId = loadHelper(NativeCPUWGId); - auto *WGSize = loadHelper(NativeCPUWGSize); - auto *GlobalOffset = loadHelper(NativeCPUGlobalOffset); - auto *Mul = Builder.CreateMul(WGId, WGSize); - auto *GId = Builder.CreateAdd(Builder.CreateAdd(Mul, GlobalOffset), Val); - auto *GIdOffset = ConstantInt::get(I32Ty, OffsetMap.at(NativeCPUGlobalId)); - auto *GIdAddr = - Builder.CreateGEP(StateType, StatePtr, {Zero, GIdOffset, IdxProm}); - Builder.CreateStore(GId, GIdAddr); - Builder.CreateRetVoid(); - return F; + return cast(FCallee.getCallee()); } -static Function *addGetFunc(Module &M, StringRef Name, Type *StateType) { - auto &Ctx = M.getContext(); - Type *I64Ty = Type::getInt64Ty(Ctx); - Type *I32Ty = Type::getInt32Ty(Ctx); - Type *RetTy = I64Ty; - Type *DimTy = I32Ty; - Type *PtrTy = PointerType::get(Ctx, NativeCPUGlobalAS); - static FunctionType *FTy = FunctionType::get(RetTy, {DimTy, PtrTy}, false); - auto FCallee = M.getOrInsertFunction(Name, FTy); - auto *F = cast(FCallee.getCallee()); - IRBuilder<> Builder(Ctx); - BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F); - Builder.SetInsertPoint(BB); - auto *IdxProm = Builder.CreateZExt(F->getArg(0), DimTy, "idxprom"); - auto *Zero = ConstantInt::get(I64Ty, 0); - auto *Offset = ConstantInt::get(I32Ty, OffsetMap.at(Name)); - auto *GEP = - Builder.CreateGEP(StateType, F->getArg(1), {Zero, Offset, IdxProm}); - auto *Load = Builder.CreateLoad(I64Ty, GEP); - Builder.CreateRet(Load); - return F; -} - -static Function *addReplaceFunc(Module &M, StringRef Name, Type *StateType) { - Function *Res; - const char GetPrefix[] = "__dpcpp_nativecpu_get"; - if (Name.starts_with(GetPrefix)) { - Res = addGetFunc(M, Name, StateType); - } else if (Name == NativeCPUSetLocalId) { - Res = addSetLocalIdFunc(M, Name, StateType); - } else { - // the other __dpcpp_nativecpu_set* builtins are subgroup-related and - // not supported yet, emit empty functions for now. - auto &Ctx = M.getContext(); - Type *I32Ty = Type::getInt32Ty(Ctx); - Type *RetTy = Type::getVoidTy(Ctx); - Type *ValTy = I32Ty; - Type *PtrTy = PointerType::get(Ctx, NativeCPUGlobalAS); - static FunctionType *FTy = FunctionType::get(RetTy, {ValTy, PtrTy}, false); - auto FCallee = M.getOrInsertFunction(Name, FTy); - auto *F = cast(FCallee.getCallee()); - IRBuilder<> Builder(Ctx); - BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F); - Builder.SetInsertPoint(BB); - Builder.CreateRetVoid(); - Res = F; - } - Res->setLinkage(GlobalValue::LinkageTypes::InternalLinkage); - return Res; -} - -static Function *getReplaceFunc(Module &M, StringRef Name, Type *StateType) { - Function *F = M.getFunction(Name); - if (!F) - return addReplaceFunc(M, Name, StateType); - assert(F && "Error retrieving replace function"); - return F; +static Function *getReplaceFunc(Module &M, StringRef Name, const Use &U, + const SmallVector &Args) { + if (Function *F = M.getFunction(Name)) + return F; + return addReplaceFunc(M, Name, U.getUser()->getType(), Args); } static Value *getStateArg(Function *F, llvm::Constant *StateTLS) { @@ -316,24 +216,63 @@ static Value *getStateArg(Function *F, llvm::Constant *StateTLS) { return BB.CreateLoad(StateTLS->getType(), V); } auto *FT = F->getFunctionType(); + assert(FT->getNumParams() > 0); return F->getArg(FT->getNumParams() - 1); } static inline bool IsNativeCPUKernel(const Function *F) { return F->getCallingConv() == llvm::CallingConv::SPIR_KERNEL; } + +static bool IsNonKernelCalledByNativeCPUKernel(const Function *F) { + if (IsNativeCPUKernel(F)) + return false; + llvm::DenseSet todo, checked; + for (;;) { + if (checked.insert(F).second) { + for (const auto &Use : F->uses()) { + if (auto *I = cast(Use.getUser())) { + const Function *const F2 = I->getFunction(); + if (IsNativeCPUKernel(F2)) + return true; + todo.insert(F2); + } + } + } + if (todo.empty()) + return false; + auto first = todo.begin(); + F = *first; + todo.erase(first); + } + return false; +} + +static bool IsUnusedBuiltinOrPrivateDef(const Function &F) { + if (F.getCallingConv() != llvm::CallingConv::SPIR_KERNEL && + F.getNumUses() == 0 && !F.isDeclaration()) { + if (F.hasFnAttribute(llvm::Attribute::AlwaysInline)) { + StringRef val = F.getFnAttribute("sycl-module-id").getValueAsString(); + if (val.ends_with("libdevice/nativecpu_utils.cpp")) + return true; + } + if (Function::isPrivateLinkage(F.getLinkage())) + return true; + } + return false; +} + static constexpr StringRef STATE_TLS_NAME = "_ZL28nativecpu_thread_local_state"; } // namespace -static llvm::Constant *CurrentStatePointerTLS; + PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, ModuleAnalysisManager &MAM) { bool ModuleChanged = false; SmallVector OldKernels; - for (auto &F : M) { + for (auto &F : M) if (F.getCallingConv() == llvm::CallingConv::SPIR_KERNEL) OldKernels.push_back(&F); - } // Materialize builtins // First we add a pointer to the Native CPU state as arg to all the @@ -342,9 +281,9 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, // Todo: fix this check since we are emitting the state type in the pass now if (!StateType) return PreservedAnalyses::all(); - Type *StatePtrType = PointerType::get(StateType, 1); + Type *StatePtrType = PointerType::get(StateType, NativeCPUGlobalAS); - CurrentStatePointerTLS = nullptr; + llvm::Constant *CurrentStatePointerTLS = nullptr; // check if any of the kernels is called by some other function. // This can happen e.g. with OCK, where wrapper functions are @@ -364,12 +303,13 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, auto *Glob = M.getFunction(Entry.first); if (!Glob) continue; - for (const auto &Use : Glob->uses()) { - auto *I = cast(Use.getUser()); - if (!IsNativeCPUKernel(I->getFunction()) || KernelIsCalled) { - // only use the threadlocal if we have kernels calling builtins - // indirectly, or if the kernel is called by some other func. - if (CurrentStatePointerTLS == nullptr) + if (CurrentStatePointerTLS == nullptr) { + for (const auto &Use : Glob->uses()) { + auto *I = cast(Use.getUser()); + if (KernelIsCalled || + IsNonKernelCalledByNativeCPUKernel(I->getFunction())) { + // only use the threadlocal if we have kernels calling builtins + // indirectly, or if the kernel is called by some other func. CurrentStatePointerTLS = M.getOrInsertGlobal( STATE_TLS_NAME, StatePtrType, [&M, StatePtrType]() { GlobalVariable *p = new GlobalVariable( @@ -384,7 +324,8 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, p->setInitializer(Constant::getNullValue(StatePtrType)); return p; }); - break; + break; + } } } UsedBuiltins.push_back({Glob, Entry.second}); @@ -400,7 +341,7 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, // so this wrapper steals the original kernel name. std::optional veczR = compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF); - if (veczR) { + if (veczR && veczR.value().first) { auto ScalarF = veczR.value().first; OldF->takeName(ScalarF); ScalarF->setName(OldF->getName() + "_scalar"); @@ -435,12 +376,25 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, // replace them with calls to our Native CPU functions. for (const auto &Entry : UsedBuiltins) { SmallVector> ToRemove; + SmallVector ToRemove2; Function *const Glob = Entry.first; + Function *ReplaceFunc = nullptr; for (const auto &Use : Glob->uses()) { - auto *ReplaceFunc = getReplaceFunc(M, Entry.second, StateType); - auto I = cast(Use.getUser()); + auto I = cast(Use.getUser()); + Function *const C = I->getFunction(); + if (IsUnusedBuiltinOrPrivateDef(*C)) { + ToRemove2.push_back(C); + continue; + } + Value *SVal = getStateArg(I->getFunction(), CurrentStatePointerTLS); + if (nullptr == SVal || SVal->getType() != StatePtrType) { + // Something went wrong, try to recover + SVal = Constant::getNullValue(StatePtrType); + } SmallVector Args(I->arg_begin(), I->arg_end()); - Args.push_back(getStateArg(I->getFunction(), CurrentStatePointerTLS)); + Args.push_back(SVal); + if (nullptr == ReplaceFunc) + ReplaceFunc = getReplaceFunc(M, Entry.second, Use, Args); auto *NewI = CallInst::Create(ReplaceFunc->getFunctionType(), ReplaceFunc, Args, "", I); // If the parent function has debug info, we need to make sure that the @@ -462,6 +416,8 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, OldI->replaceAllUsesWith(NewI); OldI->eraseFromParent(); } + for (auto temp : ToRemove2) + temp->eraseFromParent(); // Finally, we erase the builtin from the module Glob->eraseFromParent(); @@ -494,5 +450,27 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, Builder.CreateRetVoid(); } #endif + + // removing unused builtins + SmallVector UnusedLibBuiltins; + for (auto &F : M) { + if (IsUnusedBuiltinOrPrivateDef(F)) { + UnusedLibBuiltins.push_back(&F); + } + } + for (Function *f : UnusedLibBuiltins) { + f->eraseFromParent(); + ModuleChanged = true; + } + for (auto it = M.begin(); it != M.end();) { + auto Curr = it++; + Function &F = *Curr; + if (F.getNumUses() == 0 && F.isDeclaration() && + F.getName().starts_with("__mux_")) { + F.eraseFromParent(); + ModuleChanged = true; + } + } + return ModuleChanged ? PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvm/lib/SYCLNativeCPUUtils/RenameKernelSYCLNativeCPU.cpp b/llvm/lib/SYCLNativeCPUUtils/RenameKernelSYCLNativeCPU.cpp index 9c1166398cb16..b5b0c9931cffc 100644 --- a/llvm/lib/SYCLNativeCPUUtils/RenameKernelSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLNativeCPUUtils/RenameKernelSYCLNativeCPU.cpp @@ -33,6 +33,25 @@ RenameKernelSYCLNativeCPUPass::run(Module &M, ModuleAnalysisManager &MAM) { // Add NativeCPU suffix to module exports (kernels) and make other // function definitions private for (auto &F : M) { + + // Update call sites that still have SPIR calling conventions + const llvm::CallingConv::ID conv = F.getCallingConv(); + // assert(conv != llvm::CallingConv::SPIR_FUNC && + // conv != llvm::CallingConv::SPIR_KERNEL); + // todo: reenable assert + for (const auto &Use : F.uses()) { + if (auto I = dyn_cast(Use.getUser())) { + if (I->getCallingConv() == llvm::CallingConv::SPIR_FUNC || + I->getCallingConv() == llvm::CallingConv::SPIR_KERNEL) { + if (I->getCallingConv() == conv) + continue; + I->setCallingConv(conv); + ModuleChanged = true; + } + assert(I->getCallingConv() == conv); + } + } + if (F.hasFnAttribute(sycl::utils::ATTR_SYCL_MODULE_ID)) { F.setName(sycl::utils::addSYCLNativeCPUSuffix(F.getName())); ModuleChanged = true; diff --git a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp index d4b088ac3c0f7..9996c9072d938 100644 --- a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp +++ b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp @@ -1,22 +1,19 @@ -// RUN: %clangxx -fsycl-device-only -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm -o %t_temp.ll %s -// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s +// REQUIRES: native_cpu_ock +// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -mllvm -sycl-native-cpu-no-vecz -mllvm -sycl-native-dump-device-ir %s | FileCheck %s // RUN: %clangxx -fsycl-device-only -fsycl-targets=native_cpu -fno-inline -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm -o %t_temp.ll %s // RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL -// RUN: %clangxx -fsycl-device-only -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -Xclang -fenable-sycl-dae -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm %s -o %t_temp.ll -// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -Xclang -fenable-sycl-dae -mllvm -sycl-opt -mllvm -inline-threshold=500 -mllvm -sycl-native-cpu-no-vecz -mllvm -sycl-native-dump-device-ir %s | FileCheck %s // RUN: %clangxx -fsycl-device-only -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -Xclang -fenable-sycl-dae -mllvm -sycl-opt -fno-inline -S -emit-llvm %s -o %t_temp.ll // RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL -// Check that builtins are emitted as expected -// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-BT - // check that we added the state struct as a function argument, and that we // inject the calls to our builtins. -// CHECK: %struct.__nativecpu_state = type { [3 x i64], [3 x i64], [3 x i64], [3 x i64], [3 x i64], [3 x i64], [3 x i64] } +// CHECK-NOT: define internal{{.*}}__mux_sub_group_shuffle + #include "sycl.hpp" class Test1; class Test2; @@ -97,12 +94,5 @@ int main() { }); } -// check that builtins are generated as expected -// CHECK-BT: define internal i64 @__dpcpp_nativecpu_get_global_id(i32 %[[DIMARG:.*]], ptr addrspace(1) %[[PTRARG:.*]]) { -// CHECK-BT: %[[GEP:.*]] = getelementptr %struct.__nativecpu_state, ptr addrspace(1) %[[PTRARG]], i64 0, i32 0, i32 %[[DIMARG]] -// CHECK-BT: %[[LOAD:.*]] = load i64, ptr addrspace(1) %[[GEP]], align 8 -// CHECK-BT: ret i64 %[[LOAD]] -// CHECK-BT: } - // check that the generated module has the is-native-cpu module flag set // CHECK: !{{[0-9]*}} = !{i32 1, !"is-native-cpu", i32 1} diff --git a/sycl/test/check_device_code/native_cpu/sycl-native-cpu-libclc.cpp b/sycl/test/check_device_code/native_cpu/sycl-native-cpu-libclc.cpp index afebe7b677783..ac300a39fb9b9 100644 --- a/sycl/test/check_device_code/native_cpu/sycl-native-cpu-libclc.cpp +++ b/sycl/test/check_device_code/native_cpu/sycl-native-cpu-libclc.cpp @@ -1,5 +1,8 @@ // TODO: move this to clang/Driver once Native CPU is enabled in CI // REQUIRES: native_cpu +// TODO: currently disabled on Windows, but could work on Windows if Linux +// libraries are found +// UNSUPPORTED: (windows) // RUN: %clang -### -fsycl -fsycl-targets=native_cpu -target x86_64-unknown-linux-gnu %s 2> %t.ncpu.out // RUN: FileCheck %s --input-file %t.ncpu.out // CHECK: {{(\\|/)}}remangled-l64-signed_char.libspirv-x86_64-unknown-linux-gnu.bc" diff --git a/sycl/test/check_device_code/native_cpu/vectorization.cpp b/sycl/test/check_device_code/native_cpu/vectorization.cpp index 07e7a92805935..12b8a21cc069e 100644 --- a/sycl/test/check_device_code/native_cpu/vectorization.cpp +++ b/sycl/test/check_device_code/native_cpu/vectorization.cpp @@ -1,10 +1,15 @@ // REQUIRES: native_cpu_ock -// RUN: %clangxx -fsycl-device-only -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm -o %t_temp.ll %s -// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-DEFAULT -// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -sycl-native-cpu-vecz-width=16 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-16 -// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -sycl-native-cpu-vecz-width=4 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-4 -// RUN: %clangxx -O0 -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-O0 -// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -O2 -mllvm -sycl-native-cpu-backend -mllvm -sycl-native-cpu-no-vecz -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-DISABLE +// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -mllvm -sycl-native-dump-device-ir %s | FileCheck %s --check-prefix=CHECK-DEFAULT +// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -O2 -mllvm -sycl-native-cpu-vecz-width=16 -mllvm -sycl-native-dump-device-ir %s | FileCheck %s --check-prefix=CHECK-16 +// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -O2 -mllvm -sycl-native-cpu-vecz-width=4 -mllvm -sycl-native-dump-device-ir %s | FileCheck %s --check-prefix=CHECK-4 +// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -O0 -mllvm -sycl-native-dump-device-ir %s | FileCheck %s --check-prefix=CHECK-O0 +// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -O2 -mllvm -sycl-native-cpu-no-vecz -mllvm -sycl-native-dump-device-ir %s | FileCheck %s --check-prefix=CHECK-DISABLE + +// Invalid invocations: check that they don't crash the compiler +// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -mllvm -sycl-native-dump-device-ir %s > %t_temp.ll +// RUN: opt -O3 -verify-each %t_temp.ll -S -o %t_temp2.ll +// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -S -emit-llvm %t_temp.ll + #include class Test1; int main() { diff --git a/sycl/test/native_cpu/vector-add.cpp b/sycl/test/native_cpu/vector-add.cpp index 25d80f572699c..51c9593b61ccd 100644 --- a/sycl/test/native_cpu/vector-add.cpp +++ b/sycl/test/native_cpu/vector-add.cpp @@ -2,6 +2,10 @@ // RUN: %clangxx -fsycl -fsycl-targets=native_cpu %s -o %t // RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t +// Same test with -O0 to ensure unremoved builtins link +// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -O0 %s -o %t +// RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t + // Same test but with -g // RUN: %clangxx -fsycl -fsycl-targets=native_cpu %s -g -o %t-debug // RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t-debug