diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 6031069aa1f28..6b25922337810 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -41,6 +41,7 @@
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Passes/PassPlugin.h"
 #include "llvm/Passes/StandardInstrumentations.h"
+#include "llvm/SYCLLowerIR/LowerESIMD.h"
 #include "llvm/Support/BuryPointer.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -785,6 +786,25 @@ void EmitAssemblyHelper::CreatePasses(legacy::PassManager &MPM,
   PMBuilder.populateFunctionPassManager(FPM);
   PMBuilder.populateModulePassManager(MPM);
+
+  // Customize the tail of the module passes list for the ESIMD extension.
+  if (LangOpts.SYCLIsDevice && LangOpts.SYCLExplicitSIMD &&
+      CodeGenOpts.OptimizationLevel != 0) {
+    MPM.add(createESIMDLowerVecArgPass());
+    MPM.add(createESIMDLowerLoadStorePass());
+    MPM.add(createSROAPass());
+    MPM.add(createEarlyCSEPass(true));
+    MPM.add(createInstructionCombiningPass());
+    MPM.add(createDeadCodeEliminationPass());
+    MPM.add(createFunctionInliningPass(
+        CodeGenOpts.OptimizationLevel, CodeGenOpts.OptimizeSize,
+        (!CodeGenOpts.SampleProfileFile.empty() &&
+         CodeGenOpts.PrepareForThinLTO)));
+    MPM.add(createSROAPass());
+    MPM.add(createEarlyCSEPass(true));
+    MPM.add(createInstructionCombiningPass());
+    MPM.add(createDeadCodeEliminationPass());
+  }
 }
 
 static void setCommandLineOpts(const CodeGenOptions &CodeGenOpts) {
@@ -879,6 +899,11 @@ void EmitAssemblyHelper::EmitAssembly(BackendAction Action,
   PerFunctionPasses.add(
       createTargetTransformInfoWrapperPass(getTargetIRAnalysis()));
 
+  // The ESIMD extension always requires lowering of certain IR constructs,
+  // such as ESIMD C++ intrinsics, as the last front-end step.
+  if (LangOpts.SYCLIsDevice && LangOpts.SYCLExplicitSIMD)
+    PerModulePasses.add(createSYCLLowerESIMDPass());
+
   CreatePasses(PerModulePasses, PerFunctionPasses);
 
   legacy::PassManager CodeGenPasses;
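Aside: the tail sequence added above is ordinary legacy-pass-manager code, so it can be replayed over a module outside of clang when debugging ESIMD lowering. A minimal sketch, assuming the LowerESIMD.h header added by this patch; the helper name and the O2/size-0 inliner parameters are illustrative, not part of the patch:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/SYCLLowerIR/LowerESIMD.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Scalar.h"

// Hypothetical driver that re-runs the ESIMD tail pipeline from
// EmitAssemblyHelper::CreatePasses over an arbitrary module.
void runESIMDTailPipeline(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createESIMDLowerVecArgPass());    // ESIMD-specific lowerings
  PM.add(llvm::createESIMDLowerLoadStorePass()); // declared in LowerESIMD.h
  PM.add(llvm::createSROAPass());
  PM.add(llvm::createEarlyCSEPass(/*UseMemorySSA=*/true));
  PM.add(llvm::createInstructionCombiningPass());
  PM.add(llvm::createDeadCodeEliminationPass());
  PM.add(llvm::createFunctionInliningPass(/*OptLevel=*/2, /*SizeLevel=*/0,
                                          /*DisableInlineHotCallSite=*/false));
  PM.run(M);
}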
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 0a04ab5c940c9..63b31130df627 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -816,10 +816,14 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
       Args.getLastArg(OPT_emit_llvm_uselists, OPT_no_emit_llvm_uselists))
     Opts.EmitLLVMUseLists = A->getOption().getID() == OPT_emit_llvm_uselists;
 
+  // The ESIMD GPU back-end requires optimized IR.
+  bool IsSyclESIMD = Args.hasFlag(options::OPT_fsycl_esimd,
+                                  options::OPT_fno_sycl_esimd, false);
+
   Opts.DisableLLVMPasses =
       Args.hasArg(OPT_disable_llvm_passes) ||
       (Args.hasArg(OPT_fsycl_is_device) && Triple.isSPIR() &&
-       !Args.hasArg(OPT_fsycl_enable_optimizations));
+       !Args.hasArg(OPT_fsycl_enable_optimizations) && !IsSyclESIMD);
   Opts.DisableLifetimeMarkers = Args.hasArg(OPT_disable_lifetimemarkers);
 
   const llvm::Triple::ArchType DebugEntryValueArchs[] = {
diff --git a/clang/test/CodeGenSYCL/esimd-private-global.cpp b/clang/test/CodeGenSYCL/esimd-private-global.cpp
index 8e4aa7f81001f..b0e80eb34ee78 100644
--- a/clang/test/CodeGenSYCL/esimd-private-global.cpp
+++ b/clang/test/CodeGenSYCL/esimd-private-global.cpp
@@ -9,6 +9,6 @@ __attribute__((opencl_private)) __attribute__((register_num(17))) int vc;
 
 SYCL_EXTERNAL void init_vc(int x) {
   vc = x;
-  // CHECK: store i32 %0, i32* @vc
+  // CHECK: store i32 %{{[0-9a-zA-Z_]+}}, i32* @vc
 }
 // CHECK: attributes #0 = {{.*"VCByteOffset"="17".*"VCVolatile"}}
diff --git a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
index 232706c101d23..38cec50ac814e 100644
--- a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
@@ -820,7 +820,7 @@ translateSpirvIntrinsic(CallInst *CI, StringRef SpirvIntrName,
   auto translateSpirvIntr = [&SpirvIntrName, &ESIMDToErases,
                              CI](StringRef SpvIName, auto TranslateFunc) {
     if (SpirvIntrName.consume_front(SpvIName)) {
-      Value *TranslatedV = TranslateFunc(*CI, SpirvIntrName);
+      Value *TranslatedV = TranslateFunc(*CI, SpirvIntrName.substr(1, 1));
       CI->replaceAllUsesWith(TranslatedV);
       ESIMDToErases.push_back(CI);
     }
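The LowerESIMD.cpp change above is easy to misread: once consume_front() succeeds, the leftover of a name such as __spirv_LocalInvocationId_xv is "_xv", and substr(1, 1) picks out just the dimension letter that the translation functions expect. A self-contained model of the same StringRef arithmetic (the sample string is illustrative; the pass computes it from the callee name):

#include "llvm/ADT/StringRef.h"
#include <cassert>

int main() {
  // Leftover of "__spirv_LocalInvocationId_xv" once the "__spirv_"
  // prefix has been dropped.
  llvm::StringRef Suffix = "LocalInvocationId_xv";
  // consume_front() strips the matched intrinsic name in place...
  bool Matched = Suffix.consume_front("LocalInvocationId");
  assert(Matched && Suffix == "_xv");
  // ...so substr(1, 1) now yields only the dimension letter: "x".
  assert(Suffix.substr(1, 1) == "x");
  return 0;
}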
diff --git a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
index f29503b1bfce3..5ed90614a675b 100644
--- a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
+++ b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
@@ -170,6 +170,17 @@ define dso_local spir_kernel void @FUNC_30() !sycl_explicit_simd !1 {
 ; CHECK-NEXT: ret void
 }
 
+define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1 {
+; CHECK: define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1
+  %call = call spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
+; CHECK-NEXT: %call.esimd = call <3 x i32> @llvm.genx.local.id.v3i32()
+; CHECK-NEXT: %local_id.x = extractelement <3 x i32> %call.esimd, i32 0
+; CHECK-NEXT: %local_id.x.cast.ty = zext i32 %local_id.x to i64
+  ret void
+; CHECK-NEXT: ret void
+}
+
+declare dso_local spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
 declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic0ILN2cm3gen14CmAtomicOpTypeE2EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeENS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i16> %1)
 declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic1ILN2cm3gen14CmAtomicOpTypeE0EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i16> %2)
 declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic2ILN2cm3gen14CmAtomicOpTypeE7EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_S7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i32> %2, <32 x i16> %3)
diff --git a/llvm/test/SYCLLowerIR/esimd_subroutine.ll b/llvm/test/SYCLLowerIR/esimd_subroutine.ll
index 81bbcb9e0016b..6b5019bcc868e 100644
--- a/llvm/test/SYCLLowerIR/esimd_subroutine.ll
+++ b/llvm/test/SYCLLowerIR/esimd_subroutine.ll
@@ -49,7 +49,7 @@ entry:
 }
 
 ; Function Attrs: norecurse nounwind
-; CHECK: define linkonce_odr spir_func void @_ZN4simdIiLi16EEC1ERS0_(<16 x i32> addrspace(4)* [[OLDARG0:%[a-zA-Z0-9_]*]], <16 x i32> addrspace(4)* [[OLDARG1:%[a-zA-Z0-9_]*]]) unnamed_addr {{.+}}
+; CHECK: define linkonce_odr spir_func void @_ZN4simdIiLi16EEC1ERS0_(<16 x i32> addrspace(4)* [[OLDARG0:%[a-zA-Z0-9_]*]], <16 x i32> addrspace(4)*{{.*}} [[OLDARG1:%[a-zA-Z0-9_]*]]) unnamed_addr {{.+}}
 define linkonce_odr spir_func void @_ZN4simdIiLi16EEC1ERS0_(%class._ZTS4simdIiLi16EE.simd addrspace(4)* %this, %class._ZTS4simdIiLi16EE.simd addrspace(4)* align 64 dereferenceable(64) %other) unnamed_addr #0 comdat align 2 {
 entry:
 ; CHECK: [[NEWARG1:%[a-zA-Z0-9_]*]] = bitcast <16 x i32> addrspace(4)* [[OLDARG1]] to {{.+}}
diff --git a/sycl/test/basic_tests/esimd/vadd.cpp b/sycl/test/basic_tests/esimd/vadd.cpp
new file mode 100644
index 0000000000000..481c22a2718d9
--- /dev/null
+++ b/sycl/test/basic_tests/esimd/vadd.cpp
@@ -0,0 +1,105 @@
+// TODO ESIMD enable host device under -fsycl
+// RUN: %clangxx -I %sycl_include %s -o %t.out -lsycl
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/esimd.hpp>
+#include <iostream>
+
+using namespace cl::sycl;
+
+class ESIMDSelector : public device_selector {
+  // Require a GPU device unless HOST is requested in the SYCL_DEVICE_TYPE env
+  virtual int operator()(const device &device) const {
+    if (const char *dev_type = getenv("SYCL_DEVICE_TYPE")) {
+      if (!strcmp(dev_type, "GPU"))
+        return device.is_gpu() ? 1000 : -1;
+      if (!strcmp(dev_type, "HOST"))
+        return device.is_host() ? 1000 : -1;
+      std::cerr << "Supported 'SYCL_DEVICE_TYPE' env var values are 'GPU' and "
+                   "'HOST', '"
+                << dev_type << "' is not.\n";
+      return -1;
+    }
+    // If "SYCL_DEVICE_TYPE" is not defined, only allow a GPU device.
+    return device.is_gpu() ? 1000 : -1;
+  }
+};
+
+auto exception_handler = [](exception_list l) {
+  for (auto ep : l) {
+    try {
+      std::rethrow_exception(ep);
+    } catch (cl::sycl::exception &e0) {
+      std::cout << "sycl::exception: " << e0.what() << std::endl;
+    } catch (std::exception &e) {
+      std::cout << "std::exception: " << e.what() << std::endl;
+    } catch (...) {
+      std::cout << "generic exception\n";
+    }
+  }
+};
+
+int main(void) {
+  constexpr unsigned Size = 256;
+  constexpr unsigned VL = 32;
+  constexpr unsigned GroupSize = 2;
+
+  int A[Size];
+  int B[Size];
+  int C[Size] = {};
+
+  for (unsigned i = 0; i < Size; ++i) {
+    A[i] = B[i] = i;
+  }
+
+  {
+    cl::sycl::buffer<int, 1> bufA(A, Size);
+    cl::sycl::buffer<int, 1> bufB(B, Size);
+    cl::sycl::buffer<int, 1> bufC(C, Size);
+
+    // We need that many task groups
+    cl::sycl::range<1> GroupRange{Size / VL};
+
+    // We need that many tasks in each group
+    cl::sycl::range<1> TaskRange{GroupSize};
+
+    cl::sycl::nd_range<1> Range{GroupRange, TaskRange};
+
+    queue q(ESIMDSelector{}, exception_handler);
+    q.submit([&](cl::sycl::handler &cgh) {
+      auto accA = bufA.get_access(cgh);
+      auto accB = bufB.get_access(cgh);
+      auto accC = bufC.get_access(cgh);
+
+      cgh.parallel_for<class Test>(
+          Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+            using namespace sycl::intel::gpu;
+            auto pA = accA.get_pointer().get();
+            auto pB = accB.get_pointer().get();
+            auto pC = accC.get_pointer().get();
+
+            int i = ndi.get_global_id(0);
+            constexpr int ESIZE = sizeof(int);
+            simd<uint32_t, VL> offsets(0, ESIZE);
+
+            simd<int, VL> va = gather<int, VL>(pA + i * VL, offsets);
+            simd<int, VL> vb = block_load<int, VL>(pB + i * VL);
+            simd<int, VL> vc = va + vb;
+
+            block_store<int, VL>(pC + i * VL, vc);
+          });
+    });
+
+    for (unsigned i = 0; i < Size; ++i) {
+      if (A[i] + B[i] != C[i]) {
+        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
+                  << " + " << B[i] << "\n";
+        return 1;
+      }
+    }
+  }
+
+  std::cout << "Passed\n";
+  return 0;
+}
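To spell out the memory pattern the vadd kernel exercises: each work-item owns one contiguous VL-element slice, gather reads it through per-lane byte offsets (0, 4, 8, ...), and block_load/block_store move the slice as a whole vector. A scalar model of one work-item's slice, for reference only (the function name is illustrative, not part of the test):

// Scalar equivalent of one ESIMD work-item in vadd.cpp: work-item i
// handles elements [i*VL, i*VL + VL). The simd offsets (0, ESIZE) make
// the gather read the same consecutive ints a block_load would.
constexpr unsigned VL = 32;

void vadd_slice(const int *A, const int *B, int *C, unsigned i) {
  for (unsigned lane = 0; lane < VL; ++lane)
    C[i * VL + lane] = A[i * VL + lane] + B[i * VL + lane];
}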
diff --git a/sycl/test/esimd/glob.cpp b/sycl/test/esimd/glob.cpp
new file mode 100644
index 0000000000000..ae752ad5801ea
--- /dev/null
+++ b/sycl/test/esimd/glob.cpp
@@ -0,0 +1,28 @@
+// RUN: %clangxx -fsycl -fsycl-explicit-simd -c -fsycl-device-only -Xclang -emit-llvm %s -o - | \
+// RUN:   FileCheck %s
+
+// This test checks that globals with the register attribute are allowed in
+// ESIMD mode, can be accessed in functions, and that correct LLVM IR is
+// generated (including translation of the register attribute).
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/esimd.hpp>
+#include <iostream>
+
+using namespace cl::sycl;
+using namespace sycl::intel::gpu;
+
+constexpr unsigned VL = 16;
+
+ESIMD_PRIVATE ESIMD_REGISTER(17) simd<int, VL> vc;
+// CHECK-DAG: @vc = {{.+}} <16 x i32> zeroinitializer, align 64 #0
+// CHECK-DAG: attributes #0 = { {{.*}}"VCByteOffset"="17" "VCGlobalVariable" "VCVolatile"{{.*}} }
+
+ESIMD_PRIVATE ESIMD_REGISTER(17 + VL) simd<int, VL> vc1;
+// CHECK-DAG: @vc1 = {{.+}} <16 x i32> zeroinitializer, align 64 #1
+// CHECK-DAG: attributes #1 = { {{.*}}"VCByteOffset"="33" "VCGlobalVariable" "VCVolatile"{{.*}} }
+
+SYCL_EXTERNAL ESIMD_NOINLINE void init_vc(int x) {
+  vc1 = vc + 1;
+  vc = x;
+}
diff --git a/sycl/test/esimd/hw_compile.cpp b/sycl/test/esimd/hw_compile.cpp
new file mode 100644
index 0000000000000..96ffe30fda89c
--- /dev/null
+++ b/sycl/test/esimd/hw_compile.cpp
@@ -0,0 +1,39 @@
+// Basic ESIMD test which checks that the ESIMD invocation syntax compiles.
+// RUN: %clangxx -fsycl -fsycl-explicit-simd -fsycl-device-only -c %s -o %t.bc
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/esimd.hpp>
+#include <iostream>
+
+int main(void) {
+  constexpr unsigned Size = 4;
+  int A[Size] = {1, 2, 3, 4};
+  int B[Size] = {1, 2, 3, 4};
+  int C[Size];
+
+  {
+    cl::sycl::range<1> UnitRange{1};
+    cl::sycl::buffer<int, 1> bufA(A, UnitRange);
+    cl::sycl::buffer<int, 1> bufB(B, UnitRange);
+    cl::sycl::buffer<int, 1> bufC(C, UnitRange);
+
+    cl::sycl::queue().submit([&](cl::sycl::handler &cgh) {
+      auto accA = bufA.get_access(cgh);
+      auto accB = bufB.get_access(cgh);
+      auto accC = bufC.get_access(cgh);
+
+      cgh.parallel_for<class Test>(UnitRange * UnitRange,
+                                   [=](sycl::id<1> i) SYCL_ESIMD_KERNEL {
+                                     // The operations below would normally be
+                                     // represented as a single vector
+                                     // operation through an ESIMD vector.
+                                     accC[i + 0] = accA[i + 0] + accB[i + 0];
+                                     accC[i + 1] = accA[i + 1] + accB[i + 1];
+                                     accC[i + 2] = accA[i + 2] + accB[i + 2];
+                                     accC[i + 3] = accA[i + 3] + accB[i + 3];
+                                   });
+    });
+  }
+
+  return 0;
+}
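The comment inside hw_compile.cpp notes that the four scalar additions would normally be a single ESIMD vector operation. A sketch of that vector form, modeled on the sycl::intel::gpu API used by the other tests in this patch (the function name is hypothetical, and a 4-element block is assumed to satisfy the usual 16-byte granularity of block accesses):

#include <CL/sycl/intel/esimd.hpp> // header path as used in this patch

// Hypothetical vector form of the four scalar adds in hw_compile.cpp.
void vec_add4(int *pA, int *pB, int *pC) {
  using namespace sycl::intel::gpu;
  simd<int, 4> va = block_load<int, 4>(pA); // one 16-byte block read
  simd<int, 4> vb = block_load<int, 4>(pB);
  block_store<int, 4>(pC, va + vb);         // one 16-byte block write
}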
diff --git a/sycl/test/esimd/intrins_trans.cpp b/sycl/test/esimd/intrins_trans.cpp
new file mode 100644
index 0000000000000..8bdd4981b78e9
--- /dev/null
+++ b/sycl/test/esimd/intrins_trans.cpp
@@ -0,0 +1,117 @@
+// RUN: %clangxx -O0 -fsycl -fsycl-explicit-simd -fsycl-device-only -Xclang -emit-llvm %s -o - | \
+// RUN:   FileCheck %s
+
+// Checks ESIMD intrinsic translation.
+// NOTE: must be run with -O0, as the optimizer otherwise removes some of the code.
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/detail/image_ocl_types.hpp>
+#include <CL/sycl/intel/esimd.hpp>
+
+using namespace sycl::intel::gpu;
+
+ESIMD_PRIVATE vector_type_t<int, 32> vc;
+ESIMD_PRIVATE ESIMD_REGISTER(192) simd<int, 16> vg;
+
+SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd<float, 16> foo();
+
+class EsimdFunctor {
+public:
+  void operator()() __attribute__((sycl_explicit_simd)) { foo(); }
+};
+
+template <typename name, typename Func>
+__attribute__((sycl_kernel)) void kernel(Func kernelFunc) {
+  kernelFunc();
+}
+
+void bar() {
+  EsimdFunctor esimdf;
+  kernel<class kernel_esimd>(esimdf);
+}
+
+SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd<float, 16> foo() {
+  // CHECK-LABEL: @_Z3foov
+  constexpr int VL = 32;
+  uint32_t *ptr = 0;
+
+  int x = 0, y = 0, z = 0;
+
+  simd<uint32_t, VL> v1(0, x + z);
+  simd<uint64_t, VL> offsets(0, y);
+  simd<uintptr_t, VL> v_addr(reinterpret_cast<uintptr_t>(ptr));
+  simd<uint16_t, VL> pred;
+  v_addr += offsets;
+
+  __esimd_flat_atomic0<EsimdAtomicOpType::ATOMIC_INC, uint32_t, VL,
+                       CacheHint::None, CacheHint::None>(
+      v_addr.data(), pred.data());
+  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.inc.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)
+
+  __esimd_flat_atomic1<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, VL,
+                       CacheHint::None, CacheHint::None>(
+      v_addr.data(), v1, pred.data());
+  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.add.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)
+  __esimd_flat_atomic2<EsimdAtomicOpType::ATOMIC_CMPXCHG, uint32_t, VL,
+                       CacheHint::None, CacheHint::None>(
+      v_addr.data(), v1, v1, pred.data());
+  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.cmpxchg.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)
+
+  uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
+  simd<uint32_t, VL> v00 =
+      __esimd_flat_block_read_unaligned<uint32_t, VL>(addr);
+  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.block.ld.unaligned.v32i32(i64 %{{[0-9a-zA-Z_.]+}})
+  __esimd_flat_block_write<uint32_t, VL>(addr, v00.data());
+  // CHECK: call void @llvm.genx.svm.block.st.v32i32(i64 %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}})
+
+  simd<uint32_t, VL> v01 =
+      __esimd_flat_read<uint32_t, VL>(v_addr.data(), 0, pred.data());
+  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.gather.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)
+
+  __esimd_flat_write<uint32_t, VL>(v_addr.data(), v01.data(), 0, pred.data());
+  // CHECK: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32i32(<32 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}})
+
+  simd<short, 16> mina(0, 1);
+  simd<short, 16> minc(5);
+  minc = __esimd_smin<short, 16>(mina.data(), minc.data());
+  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i16> @llvm.genx.smin.v16i16.v16i16(<16 x i16> %{{[0-9a-zA-Z_.]+}}, <16 x i16> %{{[0-9a-zA-Z_.]+}})
+
+  simd<float, 1> diva(2.f);
+  simd<float, 1> divb(1.f);
+  diva = __esimd_div_ieee<1>(diva.data(), divb.data());
+  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <1 x float> @llvm.genx.ieee.div.v1f32(<1 x float> %{{[0-9a-zA-Z_.]+}}, <1 x float> %{{[0-9a-zA-Z_.]+}})
+
+  simd<float, 16> a(0.1f);
+  simd<float, 8> b = __esimd_rdregion<float, 16, 8, 0, 8, 1>(a.data(), 0);
+  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x float> @llvm.genx.rdregionf.v8f32.v16f32.i16(<16 x float> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 1, i16 0, i32 0)
+
+  simd<float, 16> c(0.0f);
+
+  using PH = cl::sycl::access::placeholder;
+
+  cl::sycl::accessor<cl::sycl::cl_int4, 2, cl::sycl::access::mode::read,
+                     cl::sycl::access::target::image, PH::false_t>
+      pA;
+  cl::sycl::accessor<cl::sycl::cl_int4, 2, cl::sycl::access::mode::write,
+                     cl::sycl::access::target::image, PH::false_t>
+      pB;
+
+  auto d = __esimd_wrregion<float, 16, 8, 0, 8, 1>(
+      c.data() /*dst*/, b.data() /*src*/, 0 /*offset*/);
+  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x float> @llvm.genx.wrregionf.v16f32.v8f32.i16.v8i1(<16 x float> %{{[0-9a-zA-Z_.]+}}, <8 x float> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 1, i16 0, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+
+  simd<int, 32> va;
+  va = media_block_load<int, 4, 8>(pA, x, y);
+  // CHECK: %[[SI0:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image2d_ro_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
+  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.media.ld.v32i32(i32 0, i32 %[[SI0]], i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}})
+
+  simd<int, 32> vb = va + 1;
+  media_block_store<int, 4, 8>(pB, x, y, vb);
+  // CHECK: %[[SI2:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image2d_wo_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
+  // CHECK: call void @llvm.genx.media.st.v32i32(i32 0, i32 %[[SI2]], i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}})
+
+  auto ee = __esimd_vload<int, 16>((vector_type_t<int, 16> *)(&vg));
+  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* {{.*}})
+  __esimd_vstore<int, 32>(&vc, va.data());
+  // CHECK: store <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> addrspace(4)* {{.*}}
+
+  return d;
+}
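The rdregion/wrregion checks are the least obvious part of intrins_trans.cpp: the trailing i32 0, i32 8, i32 1, i16 0 arguments of the GenX calls are the region parameters (vertical stride, width, horizontal stride, byte offset). A scalar model of the 1D read-region selected above, under those parameters (the helper name is illustrative):

#include <array>

// Models the rdregion above for a 16-element float source with width 8,
// horizontal stride 1, and a byte offset: pick 8 consecutive elements.
std::array<float, 8> rdregion_model(const std::array<float, 16> &Src,
                                    unsigned OffsetBytes) {
  std::array<float, 8> Dst{};
  unsigned Start = OffsetBytes / sizeof(float);
  for (unsigned Lane = 0; Lane < 8; ++Lane)
    Dst[Lane] = Src[Start + Lane * 1 /*stride*/];
  return Dst;
}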
diff --git a/sycl/test/esimd/spirv_intrins_trans.cpp b/sycl/test/esimd/spirv_intrins_trans.cpp
new file mode 100644
index 0000000000000..3327b0c13e117
--- /dev/null
+++ b/sycl/test/esimd/spirv_intrins_trans.cpp
@@ -0,0 +1,205 @@
+// RUN: %clangxx -fsycl -fsycl-explicit-simd -fsycl-device-only -O0 -S -emit-llvm -x c++ %s -o - | FileCheck %s
+// This test checks that all SPIR-V intrinsics are correctly
+// translated into their GenX counterparts (implemented in LowerESIMD.cpp).
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/esimd.hpp>
+
+SYCL_EXTERNAL size_t __spirv_GlobalInvocationId_x();
+SYCL_EXTERNAL size_t __spirv_GlobalInvocationId_y();
+SYCL_EXTERNAL size_t __spirv_GlobalInvocationId_z();
+
+SYCL_EXTERNAL size_t __spirv_GlobalSize_x();
+SYCL_EXTERNAL size_t __spirv_GlobalSize_y();
+SYCL_EXTERNAL size_t __spirv_GlobalSize_z();
+
+SYCL_EXTERNAL size_t __spirv_GlobalOffset_x();
+SYCL_EXTERNAL size_t __spirv_GlobalOffset_y();
+SYCL_EXTERNAL size_t __spirv_GlobalOffset_z();
+
+SYCL_EXTERNAL size_t __spirv_NumWorkgroups_x();
+SYCL_EXTERNAL size_t __spirv_NumWorkgroups_y();
+SYCL_EXTERNAL size_t __spirv_NumWorkgroups_z();
+
+SYCL_EXTERNAL size_t __spirv_WorkgroupSize_x();
+SYCL_EXTERNAL size_t __spirv_WorkgroupSize_y();
+SYCL_EXTERNAL size_t __spirv_WorkgroupSize_z();
+
+SYCL_EXTERNAL size_t __spirv_WorkgroupId_x();
+SYCL_EXTERNAL size_t __spirv_WorkgroupId_y();
+SYCL_EXTERNAL size_t __spirv_WorkgroupId_z();
+
+SYCL_EXTERNAL size_t __spirv_LocalInvocationId_x();
+SYCL_EXTERNAL size_t __spirv_LocalInvocationId_y();
+SYCL_EXTERNAL size_t __spirv_LocalInvocationId_z();
+
+template <typename name, typename Func>
+__attribute__((sycl_kernel)) void kernel(Func kernelFunc) {
+  kernelFunc();
+}
+
+size_t caller() {
+
+  size_t DoNotOpt;
+  cl::sycl::buffer<size_t, 1> buf(&DoNotOpt, 1);
+  cl::sycl::queue().submit([&](cl::sycl::handler &cgh) {
+    auto DoNotOptimize = buf.get_access(cgh);
+
+    kernel<class kernel_GlobalInvocationId_x>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_GlobalInvocationId_x();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_GlobalInvocationId_x
+    // CHECK: [[CALL_ESIMD1:%.*]] = call <3 x i32> @llvm.genx.local.id.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD1]], i32 0
+    // CHECK: [[CALL_ESIMD2:%.*]] = call <3 x i32> @llvm.genx.local.size.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD2]], i32 0
+    // CHECK: {{.*}} call i32 @llvm.genx.group.id.x()
+
+    kernel<class kernel_GlobalInvocationId_y>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_GlobalInvocationId_y();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_GlobalInvocationId_y
+    // CHECK: [[CALL_ESIMD1:%.*]] = call <3 x i32> @llvm.genx.local.id.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD1]], i32 1
+    // CHECK: [[CALL_ESIMD2:%.*]] = call <3 x i32> @llvm.genx.local.size.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD2]], i32 1
+    // CHECK: {{.*}} call i32 @llvm.genx.group.id.y()
+
+    kernel<class kernel_GlobalInvocationId_z>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_GlobalInvocationId_z();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_GlobalInvocationId_z
+    // CHECK: [[CALL_ESIMD1:%.*]] = call <3 x i32> @llvm.genx.local.id.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD1]], i32 2
+    // CHECK: [[CALL_ESIMD2:%.*]] = call <3 x i32> @llvm.genx.local.size.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD2]], i32 2
+    // CHECK: {{.*}} call i32 @llvm.genx.group.id.z()
+
+    kernel<class kernel_GlobalSize_x>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_GlobalSize_x();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_GlobalSize_x
+    // CHECK: [[CALL_ESIMD1:%.*]] = call <3 x i32> @llvm.genx.local.size.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD1]], i32 0
+    // CHECK: [[CALL_ESIMD2:%.*]] = call <3 x i32> @llvm.genx.group.count.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD2]], i32 0
+
+    kernel<class kernel_GlobalSize_y>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_GlobalSize_y();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_GlobalSize_y
+    // CHECK: [[CALL_ESIMD1:%.*]] = call <3 x i32> @llvm.genx.local.size.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD1]], i32 1
+    // CHECK: [[CALL_ESIMD2:%.*]] = call <3 x i32> @llvm.genx.group.count.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD2]], i32 1
+
+    kernel<class kernel_GlobalSize_z>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_GlobalSize_z();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_GlobalSize_z
+    // CHECK: [[CALL_ESIMD1:%.*]] = call <3 x i32> @llvm.genx.local.size.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD1]], i32 2
+    // CHECK: [[CALL_ESIMD2:%.*]] = call <3 x i32> @llvm.genx.group.count.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD2]], i32 2
+
+    kernel<class kernel_GlobalOffset_x>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_GlobalOffset_x();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_GlobalOffset_x
+    // CHECK: store i64 0
+
+    kernel<class kernel_GlobalOffset_y>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_GlobalOffset_y();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_GlobalOffset_y
+    // CHECK: store i64 0
+
+    kernel<class kernel_GlobalOffset_z>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_GlobalOffset_z();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_GlobalOffset_z
+    // CHECK: store i64 0
+
+    kernel<class kernel_NumWorkgroups_x>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_NumWorkgroups_x();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_NumWorkgroups_x
+    // CHECK: [[CALL_ESIMD:%.*]] = call <3 x i32> @llvm.genx.group.count.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD]], i32 0
+
+    kernel<class kernel_NumWorkgroups_y>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_NumWorkgroups_y();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_NumWorkgroups_y
+    // CHECK: [[CALL_ESIMD:%.*]] = call <3 x i32> @llvm.genx.group.count.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD]], i32 1
+
+    kernel<class kernel_NumWorkgroups_z>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_NumWorkgroups_z();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_NumWorkgroups_z
+    // CHECK: [[CALL_ESIMD:%.*]] = call <3 x i32> @llvm.genx.group.count.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD]], i32 2
+
+    kernel<class kernel_WorkgroupSize_x>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_WorkgroupSize_x();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_WorkgroupSize_x
+    // CHECK: [[CALL_ESIMD:%.*]] = call <3 x i32> @llvm.genx.local.size.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD]], i32 0
+
+    kernel<class kernel_WorkgroupSize_y>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_WorkgroupSize_y();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_WorkgroupSize_y
+    // CHECK: [[CALL_ESIMD:%.*]] = call <3 x i32> @llvm.genx.local.size.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD]], i32 1
+
+    kernel<class kernel_WorkgroupSize_z>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_WorkgroupSize_z();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_WorkgroupSize_z
+    // CHECK: [[CALL_ESIMD:%.*]] = call <3 x i32> @llvm.genx.local.size.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD]], i32 2
+
+    kernel<class kernel_WorkgroupId_x>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_WorkgroupId_x();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_WorkgroupId_x
+    // CHECK: {{.*}} call i32 @llvm.genx.group.id.x()
+
+    kernel<class kernel_WorkgroupId_y>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_WorkgroupId_y();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_WorkgroupId_y
+    // CHECK: {{.*}} call i32 @llvm.genx.group.id.y()
+
+    kernel<class kernel_WorkgroupId_z>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_WorkgroupId_z();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_WorkgroupId_z
+    // CHECK: {{.*}} call i32 @llvm.genx.group.id.z()
+
+    kernel<class kernel_LocalInvocationId_x>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_LocalInvocationId_x();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_LocalInvocationId_x
+    // CHECK: [[CALL_ESIMD:%.*]] = call <3 x i32> @llvm.genx.local.id.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD]], i32 0
+
+    kernel<class kernel_LocalInvocationId_y>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_LocalInvocationId_y();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_LocalInvocationId_y
+    // CHECK: [[CALL_ESIMD:%.*]] = call <3 x i32> @llvm.genx.local.id.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD]], i32 1
+
+    kernel<class kernel_LocalInvocationId_z>([=]() SYCL_ESIMD_KERNEL {
+      *DoNotOptimize.get_pointer() = __spirv_LocalInvocationId_z();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_LocalInvocationId_z
+    // CHECK: [[CALL_ESIMD:%.*]] = call <3 x i32> @llvm.genx.local.id.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD]], i32 2
+  });
+  return DoNotOpt;
+}
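All of the CHECK sequences in spirv_intrins_trans.cpp follow from two identities the lowering applies per dimension: a global invocation ID is rebuilt from group ID, local size, and local ID, and a global size is local size times group count (GlobalOffset simply becomes the constant 0). A scalar sketch of that decomposition, with the GenX intrinsics modeled as opaque per-dimension functions (the declarations are illustrative):

#include <cstdint>

// Stand-ins for llvm.genx.group.id.{x,y,z}, llvm.genx.local.size.v3i32,
// llvm.genx.local.id.v3i32, and llvm.genx.group.count.v3i32.
extern uint32_t genx_group_id(int dim);
extern uint32_t genx_local_size(int dim);
extern uint32_t genx_local_id(int dim);
extern uint32_t genx_group_count(int dim);

// __spirv_GlobalInvocationId_{x,y,z} lowers to this combination:
uint64_t global_invocation_id(int dim) {
  return uint64_t(genx_group_id(dim)) * genx_local_size(dim) +
         genx_local_id(dim);
}

// __spirv_GlobalSize_{x,y,z} lowers to this combination:
uint64_t global_size(int dim) {
  return uint64_t(genx_local_size(dim)) * genx_group_count(dim);
}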