Skip to content

[SYCL][ESIMD] Setup compilation pipeline for ESIMD #2134

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jul 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions clang/lib/CodeGen/BackendUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/PassPlugin.h"
#include "llvm/Passes/StandardInstrumentations.h"
#include "llvm/SYCLLowerIR/LowerESIMD.h"
#include "llvm/Support/BuryPointer.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/MemoryBuffer.h"
Expand Down Expand Up @@ -785,6 +786,25 @@ void EmitAssemblyHelper::CreatePasses(legacy::PassManager &MPM,

PMBuilder.populateFunctionPassManager(FPM);
PMBuilder.populateModulePassManager(MPM);

// Customize the tail of the module passes list for the ESIMD extension.
if (LangOpts.SYCLIsDevice && LangOpts.SYCLExplicitSIMD &&
CodeGenOpts.OptimizationLevel != 0) {
MPM.add(createESIMDLowerVecArgPass());
MPM.add(createESIMDLowerLoadStorePass());
MPM.add(createSROAPass());
MPM.add(createEarlyCSEPass(true));
MPM.add(createInstructionCombiningPass());
MPM.add(createDeadCodeEliminationPass());
MPM.add(createFunctionInliningPass(
CodeGenOpts.OptimizationLevel, CodeGenOpts.OptimizeSize,
(!CodeGenOpts.SampleProfileFile.empty() &&
CodeGenOpts.PrepareForThinLTO)));
MPM.add(createSROAPass());
MPM.add(createEarlyCSEPass(true));
MPM.add(createInstructionCombiningPass());
MPM.add(createDeadCodeEliminationPass());
}
}

static void setCommandLineOpts(const CodeGenOptions &CodeGenOpts) {
Expand Down Expand Up @@ -879,6 +899,11 @@ void EmitAssemblyHelper::EmitAssembly(BackendAction Action,
PerFunctionPasses.add(
createTargetTransformInfoWrapperPass(getTargetIRAnalysis()));

// ESIMD extension always requires lowering of certain IR constructs, such as
// ESIMD C++ intrinsics, as the last FE step.
if (LangOpts.SYCLIsDevice && LangOpts.SYCLExplicitSIMD)
PerModulePasses.add(createSYCLLowerESIMDPass());

CreatePasses(PerModulePasses, PerFunctionPasses);

legacy::PassManager CodeGenPasses;
Expand Down
6 changes: 5 additions & 1 deletion clang/lib/Frontend/CompilerInvocation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -816,10 +816,14 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
Args.getLastArg(OPT_emit_llvm_uselists, OPT_no_emit_llvm_uselists))
Opts.EmitLLVMUseLists = A->getOption().getID() == OPT_emit_llvm_uselists;

// ESIMD GPU Back-end requires optimized IR
bool IsSyclESIMD = Args.hasFlag(options::OPT_fsycl_esimd,
options::OPT_fno_sycl_esimd, false);

Opts.DisableLLVMPasses =
Args.hasArg(OPT_disable_llvm_passes) ||
(Args.hasArg(OPT_fsycl_is_device) && Triple.isSPIR() &&
!Args.hasArg(OPT_fsycl_enable_optimizations));
!Args.hasArg(OPT_fsycl_enable_optimizations) && !IsSyclESIMD);
Opts.DisableLifetimeMarkers = Args.hasArg(OPT_disable_lifetimemarkers);

const llvm::Triple::ArchType DebugEntryValueArchs[] = {
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenSYCL/esimd-private-global.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ __attribute__((opencl_private)) __attribute__((register_num(17))) int vc;

SYCL_EXTERNAL void init_vc(int x) {
vc = x;
// CHECK: store i32 %0, i32* @vc
// CHECK: store i32 %{{[0-9a-zA-Z_]+}}, i32* @vc
}
// CHECK: attributes #0 = {{.*"VCByteOffset"="17".*"VCVolatile"}}
2 changes: 1 addition & 1 deletion llvm/lib/SYCLLowerIR/LowerESIMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,7 @@ translateSpirvIntrinsic(CallInst *CI, StringRef SpirvIntrName,
auto translateSpirvIntr = [&SpirvIntrName, &ESIMDToErases,
CI](StringRef SpvIName, auto TranslateFunc) {
if (SpirvIntrName.consume_front(SpvIName)) {
Value *TranslatedV = TranslateFunc(*CI, SpirvIntrName);
Value *TranslatedV = TranslateFunc(*CI, SpirvIntrName.substr(1, 1));
CI->replaceAllUsesWith(TranslatedV);
ESIMDToErases.push_back(CI);
}
Expand Down
11 changes: 11 additions & 0 deletions llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,17 @@ define dso_local spir_kernel void @FUNC_30() !sycl_explicit_simd !1 {
; CHECK-NEXT: ret void
}

define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1 {
; CHECK: define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1
%call = call spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
; CHECK-NEXT: %call.esimd = call <3 x i32> @llvm.genx.local.id.v3i32()
; CHECK-NEXT: %local_id.x = extractelement <3 x i32> %call.esimd, i32 0
; CHECK-NEXT: %local_id.x.cast.ty = zext i32 %local_id.x to i64
Comment on lines +175 to +178
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please, consider following existing design for lowering functions like SPIR-V built-ins to device specific intrinsics by linking corresponding device library with SPIR-V functions implementation (see NVPTX implementation for details).
This will allow you to remove most of LowerESIMD pass (including the buggy part).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem with the NVPTX approach as well as with using the existing clang built-in infra is that it is hard to support the "C++" intrinsics mechanism with those. Once C++ intrinsics are supported in the clang built-in infra - yes, this pass will be simplified.

What buggy part do you refer to?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem with NVPTX approach as well as with using existing clang built-in infra is that it is hard to support the "C++" intrinsics mechanism with those.

I'm not sure why this mechanism is needed. translateSpirvIntrinsic doesn't require it.

BTW, why do we need this in the first place? According to my understanding these SPIR-V built-ins are lowered to valid SPIR-V, which should be supported by the back-end.

Once C++ intrinsics is supported in clang built-in infra - yes, this pass will simplify.

What buggy part do you refer to?

The one fixed by this patch.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. You refer to SPIRV translation part. Agree - this could be done similar to NVPTX.

According to my understanding these SPIR-V built-ins are lowered to valid SPIR-V, which should be supported by the back-end.

Not quite. The vector BE does not support SPIRV intrinsics, as it is not a SIMT BE. But with subgroup size = 1 restriction it is doable really. I thought about this as a future step.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to my understanding these SPIR-V built-ins are lowered to valid SPIR-V, which should be supported by the back-end.

Not quite. The vector BE does not support SPIRV intrinsics, as it is not a SIMT BE. But with subgroup size = 1 restriction it is doable really. I thought about this as a future step.

To be precise, these functions are represented in SPIR-V as a variable with a decoration, and I think it must be supported by any back-end. AFAIK, there are a lot of ESIMD-specific patches landed to the SPIR-V translator and it seems to be the right place to lower this functionality.
A SPIR-V consumer by design is supposed to lower standard SPIR-V instructions to HW specific instructions/intrinsics.
If I understand it correctly ESIMD extension implementation includes these two parts:

  • LLVM pass replaces SPIR-V built-in functions to GEN specific intrinsics in FE/ME.
  • SPIR-V translator is enhanced to preserve undefined GEN specific intrinsics in SPIR-V format.

According to my understanding these are not required if we move lowering of SPIR-V standard instructions to SPIR-V consumer.

NOTE: I'm talking about standard SPIR-V functionality only.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SPIR-V consumer by design is supposed to lower standard SPIR-V instructions to HW specific instructions/intrinsics.

That is true. But ESIMD back-end by design wasn't a "SPIRV consumer" and glue passes are still required to feed it LLVM IR resulting from SPIRV translation. Also ESIMD BE can't consume arbitrary SPIRV with Kernel capability - there are a number of restrictions. E.g. ESIMD BE does not have a concept of workitem, a central one in SIMT world. My point is that there are still design options which need to be discussed with the ESIMD BE devs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But ESIMD back-end by design wasn't a "SPIRV consumer" and glue passes are still required to feed it LLVM IR resulting from SPIRV translation. Also ESIMD BE can't consume arbitrary SPIRV with Kernel capability - there are a number of restrictions. E.g. ESIMD BE does not have a concept of workitem, a central one in SIMT world. My point is that there are still design options which need to be discussed with the ESIMD BE devs.

If SPIR-V consumption is not designed, what is the point in using SPIR-V then?
It looks like using LLVM IR as the exchange format in ESIMD mode would be much easier and wouldn't require so many hacks.

Copy link
Contributor Author

@kbobrovs kbobrovs Jul 22, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If SPIR-V consumption is not designed, what is the point in using SPIR-V then?

IR stability.

doesn't require so much hacks.

There can be parts which can be improved, but I don't see how this justifies calling it hacks.

ret void
; CHECK-NEXT: ret void
}

declare dso_local spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic0ILN2cm3gen14CmAtomicOpTypeE2EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeENS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i16> %1)
declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic1ILN2cm3gen14CmAtomicOpTypeE0EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i16> %2)
declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic2ILN2cm3gen14CmAtomicOpTypeE7EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_S7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i32> %2, <32 x i16> %3)
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/SYCLLowerIR/esimd_subroutine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ entry:
}

; Function Attrs: norecurse nounwind
; CHECK: define linkonce_odr spir_func void @_ZN4simdIiLi16EEC1ERS0_(<16 x i32> addrspace(4)* [[OLDARG0:%[a-zA-Z0-9_]*]], <16 x i32> addrspace(4)* [[OLDARG1:%[a-zA-Z0-9_]*]]) unnamed_addr {{.+}}
; CHECK: define linkonce_odr spir_func void @_ZN4simdIiLi16EEC1ERS0_(<16 x i32> addrspace(4)* [[OLDARG0:%[a-zA-Z0-9_]*]], <16 x i32> addrspace(4)*{{.*}} [[OLDARG1:%[a-zA-Z0-9_]*]]) unnamed_addr {{.+}}
define linkonce_odr spir_func void @_ZN4simdIiLi16EEC1ERS0_(%class._ZTS4simdIiLi16EE.simd addrspace(4)* %this, %class._ZTS4simdIiLi16EE.simd addrspace(4)* align 64 dereferenceable(64) %other) unnamed_addr #0 comdat align 2 {
entry:
; CHECK: [[NEWARG1:%[a-zA-Z0-9_]*]] = bitcast <16 x i32> addrspace(4)* [[OLDARG1]] to {{.+}}
Expand Down
105 changes: 105 additions & 0 deletions sycl/test/basic_tests/esimd/vadd.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// TODO ESIMD enable host device under -fsycl
// RUN: %clangxx -I %sycl_include %s -o %t.out -lsycl
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out

#include <CL/sycl.hpp>
#include <CL/sycl/intel/esimd.hpp>
#include <iostream>

using namespace cl::sycl;

class ESIMDSelector : public device_selector {
  // Device scoring: require a GPU device unless SYCL_DEVICE_TYPE=HOST is
  // set in the environment. Returns 1000 for an acceptable device, -1
  // otherwise (negative scores reject the device).
  virtual int operator()(const device &device) const {
    const char *dev_type = getenv("SYCL_DEVICE_TYPE");
    // No override requested - only a GPU device is acceptable.
    if (!dev_type)
      return device.is_gpu() ? 1000 : -1;
    if (!strcmp(dev_type, "GPU"))
      return device.is_gpu() ? 1000 : -1;
    if (!strcmp(dev_type, "HOST"))
      return device.is_host() ? 1000 : -1;
    // Unrecognized value - report it and reject every device.
    std::cerr << "Supported 'SYCL_DEVICE_TYPE' env var values are 'GPU' and "
                 "'HOST', '"
              << dev_type << "' is not.\n";
    return -1;
  }
};

auto exception_handler = [](exception_list l) {
for (auto ep : l) {
try {
std::rethrow_exception(ep);
} catch (cl::sycl::exception &e0) {
std::cout << "sycl::exception: " << e0.what() << std::endl;
} catch (std::exception &e) {
std::cout << "std::exception: " << e.what() << std::endl;
} catch (...) {
std::cout << "generic exception\n";
}
}
};

// Host side of the ESIMD vector-add test: initializes A and B, runs the
// ESIMD kernel (each work-item adds VL consecutive elements), then checks
// C[i] == A[i] + B[i]. Returns 0 on success and 1 on the first mismatch.
//
// Fix: the verification loop previously ran INSIDE the buffer scope, i.e.
// while bufC still owned the host memory - the results are only guaranteed
// to be written back to C when the buffers are destroyed, so the check must
// happen after the scope closes.
int main(void) {
  constexpr unsigned Size = 256;    // total number of elements
  constexpr unsigned VL = 32;       // elements processed per work-item
  constexpr unsigned GroupSize = 2; // work-items per work-group

  int A[Size];
  int B[Size];
  int C[Size] = {};

  for (unsigned i = 0; i < Size; ++i) {
    A[i] = B[i] = i;
  }

  {
    cl::sycl::buffer<int, 1> bufA(A, Size);
    cl::sycl::buffer<int, 1> bufB(B, Size);
    cl::sycl::buffer<int, 1> bufC(C, Size);

    // Global range: one work-item per VL-element chunk of the arrays.
    cl::sycl::range<1> GroupRange{Size / VL};

    // Local range: work-items per work-group.
    cl::sycl::range<1> TaskRange{GroupSize};

    cl::sycl::nd_range<1> Range{GroupRange, TaskRange};

    queue q(ESIMDSelector{}, exception_handler);
    q.submit([&](cl::sycl::handler &cgh) {
      auto accA = bufA.get_access<cl::sycl::access::mode::read>(cgh);
      auto accB = bufB.get_access<cl::sycl::access::mode::read>(cgh);
      auto accC = bufC.get_access<cl::sycl::access::mode::write>(cgh);

      cgh.parallel_for<class Test>(
          Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
            using namespace sycl::intel::gpu;
            auto pA = accA.get_pointer().get();
            auto pB = accB.get_pointer().get();
            auto pC = accC.get_pointer().get();

            int i = ndi.get_global_id(0);
            constexpr int ESIZE = sizeof(int);
            // Byte offsets 0, ESIZE, 2*ESIZE, ... for the gather below.
            simd<uint32_t, VL> offsets(0, ESIZE);

            simd<int, VL> va = gather<int, VL>(pA + i * VL, offsets);
            simd<int, VL> vb = block_load<int, VL>(pB + i * VL);
            simd<int, VL> vc = va + vb;

            block_store<int, VL>(pC + i * VL, vc);
          });
    });
  } // buffers are destroyed here, writing the results back into C

  for (unsigned i = 0; i < Size; ++i) {
    if (A[i] + B[i] != C[i]) {
      std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
                << " + " << B[i] << "\n";
      return 1;
    }
  }

  std::cout << "Passed\n";
  return 0;
}
28 changes: 28 additions & 0 deletions sycl/test/esimd/glob.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// RUN: %clangxx -fsycl -fsycl-explicit-simd -c -fsycl-device-only -Xclang -emit-llvm %s -o - | \
// RUN: FileCheck %s

// This test checks that globals with register attribute are allowed in ESIMD
// mode, can be accessed in functions and correct LLVM IR is generated
// (including translation of the register attribute)

#include <CL/sycl.hpp>
#include <CL/sycl/intel/esimd.hpp>
#include <iostream>

using namespace cl::sycl;
using namespace sycl::intel::gpu;

constexpr unsigned VL = 16;

ESIMD_PRIVATE ESIMD_REGISTER(17) simd<int, VL> vc;
// CHECK-DAG: @vc = {{.+}} <16 x i32> zeroinitializer, align 64 #0
// CHECK-DAG: attributes #0 = { {{.*}}"VCByteOffset"="17" "VCGlobalVariable" "VCVolatile"{{.*}} }

ESIMD_PRIVATE ESIMD_REGISTER(17 + VL) simd<int, VL> vc1;
// CHECK-DAG: @vc1 = {{.+}} <16 x i32> zeroinitializer, align 64 #1
// CHECK-DAG: attributes #1 = { {{.*}}"VCByteOffset"="33" "VCGlobalVariable" "VCVolatile"{{.*}} }

// Exercises both reading and writing ESIMD register globals from a
// noinline function. Statement order matters: vc1 captures the OLD value
// of vc (plus one) before vc is overwritten with x.
SYCL_EXTERNAL ESIMD_NOINLINE void init_vc(int x) {
  vc1 = vc + 1; // reads vc before the assignment below clobbers it
  vc = x;
}
39 changes: 39 additions & 0 deletions sycl/test/esimd/hw_compile.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Basic ESIMD test which checks that ESIMD invocation syntax can get compiled.
// RUN: %clangxx -fsycl -fsycl-explicit-simd -fsycl-device-only -c %s -o %t.bc

#include <CL/sycl.hpp>
#include <CL/sycl/intel/esimd.hpp>
#include <iostream>

int main(void) {
constexpr unsigned Size = 4;
int A[Size] = {1, 2, 3, 4};
int B[Size] = {1, 2, 3, 4};
int C[Size];

{
cl::sycl::range<1> UnitRange{1};
cl::sycl::buffer<int, 1> bufA(A, UnitRange);
cl::sycl::buffer<int, 1> bufB(B, UnitRange);
cl::sycl::buffer<int, 1> bufC(C, UnitRange);

cl::sycl::queue().submit([&](cl::sycl::handler &cgh) {
auto accA = bufA.get_access<cl::sycl::access::mode::read>(cgh);
auto accB = bufB.get_access<cl::sycl::access::mode::read>(cgh);
auto accC = bufC.get_access<cl::sycl::access::mode::write>(cgh);

cgh.parallel_for<class Test>(UnitRange * UnitRange,
[=](sycl::id<1> i) SYCL_ESIMD_KERNEL {
// those operations below would normally be
// represented as a single vector operation
// through ESIMD vector
accC[i + 0] = accA[i + 0] + accB[i + 0];
accC[i + 1] = accA[i + 1] + accB[i + 1];
accC[i + 2] = accA[i + 2] + accB[i + 2];
accC[i + 3] = accA[i + 3] + accB[i + 3];
});
});
}

return 0;
}
117 changes: 117 additions & 0 deletions sycl/test/esimd/intrins_trans.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// RUN: %clangxx -O0 -fsycl -fsycl-explicit-simd -fsycl-device-only -Xclang -emit-llvm %s -o - | \
// RUN: FileCheck %s

// Checks ESIMD intrinsic translation.
// NOTE: must be run in -O0, as optimizer optimizes away some of the code

#include <CL/sycl.hpp>
#include <CL/sycl/detail/image_ocl_types.hpp>
#include <CL/sycl/intel/esimd.hpp>

using namespace sycl::intel::gpu;

ESIMD_PRIVATE vector_type_t<int, 32> vc;
ESIMD_PRIVATE ESIMD_REGISTER(192) simd<int, 16> vg;

SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd<float, 16> foo();

class EsimdFunctor {
public:
  // Kernel body: marked as explicit-SIMD via the sycl_explicit_simd
  // attribute; simply calls foo() so its intrinsic calls get codegen'ed.
  void operator()() __attribute__((sycl_explicit_simd)) { foo(); }
};

// Emulates the compiler-recognized SYCL kernel entry point: 'name' supplies
// the kernel name type and the functor is simply invoked.
template <typename name, typename Func>
__attribute__((sycl_kernel)) void kernel(Func f) {
  f();
}

// Instantiates the kernel wrapper with the ESIMD functor so that foo()'s
// intrinsic translations are emitted into the module.
void bar() {
  EsimdFunctor f;
  kernel<class kernel_esimd>(f);
}

// Exercises one representative call of each ESIMD C++ intrinsic family;
// the paired FileCheck CHECK lines pin the GenX (llvm.genx.*) intrinsic
// each call must be lowered to. Code lines must stay in lock-step with
// their CHECK patterns - do not reorder.
SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd<float, 16> foo() {
  // CHECK-LABEL: @_Z3foov
  constexpr int VL = 32;
  uint32_t *ptr = 0;

  int x = 0, y = 0, z = 0;

  simd<uint32_t, VL> v1(0, x + z);
  simd<uint64_t, VL> offsets(0, y);
  simd<uintptr_t, VL> v_addr(reinterpret_cast<uintptr_t>(ptr));
  simd<ushort, VL> pred;
  v_addr += offsets;

  // Flat atomics: 0/1/2-operand forms map to genx svm atomic intrinsics.
  __esimd_flat_atomic0<EsimdAtomicOpType::ATOMIC_INC, uint32_t, VL>(
      v_addr.data(), pred.data());
  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.inc.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)

  __esimd_flat_atomic1<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, VL>(
      v_addr.data(), v1, pred.data());
  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.add.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)
  __esimd_flat_atomic2<EsimdAtomicOpType::ATOMIC_CMPXCHG, uint32_t, VL>(
      v_addr.data(), v1, v1, pred.data());
  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.cmpxchg.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)

  // Block (contiguous) loads/stores through a flat address.
  uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
  simd<uint32_t, VL> v00 =
      __esimd_flat_block_read_unaligned<uint32_t, VL>(addr);
  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.block.ld.unaligned.v32i32(i64 %{{[0-9a-zA-Z_.]+}})
  __esimd_flat_block_write<uint32_t, VL>(addr, v00.data());
  // CHECK: call void @llvm.genx.svm.block.st.v32i32(i64 %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}})

  // Predicated scattered read/write (gather/scatter).
  simd<uint32_t, VL> v01 =
      __esimd_flat_read<uint32_t, VL>(v_addr.data(), 0, pred.data());
  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.gather.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)

  __esimd_flat_write<uint32_t, VL>(v_addr.data(), v01.data(), 0, pred.data());
  // CHECK: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32i32(<32 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}})

  // Math intrinsics: integer min and IEEE division.
  simd<short, 16> mina(0, 1);
  simd<short, 16> minc(5);
  minc = __esimd_smin<short, 16>(mina.data(), minc.data());
  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i16> @llvm.genx.smin.v16i16.v16i16(<16 x i16> %{{[0-9a-zA-Z_.]+}}, <16 x i16> %{{[0-9a-zA-Z_.]+}})

  simd<float, 1> diva(2.f);
  simd<float, 1> divb(1.f);
  diva = __esimd_div_ieee<1>(diva.data(), divb.data());
  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <1 x float> @llvm.genx.ieee.div.v1f32(<1 x float> %{{[0-9a-zA-Z_.]+}}, <1 x float> %{{[0-9a-zA-Z_.]+}})

  // Region accesses: read 8 of 16 elements, then write them back.
  simd<float, 16> a(0.1f);
  simd<float, 8> b = __esimd_rdregion<float, 16, 8, 0, 8, 1>(a.data(), 0);
  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x float> @llvm.genx.rdregionf.v8f32.v16f32.i16(<16 x float> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 1, i16 0, i32 0)

  simd<float, 16> c(0.0f);

  using PH = cl::sycl::access::placeholder;

  // Placeholder image accessors used by the media block operations below.
  cl::sycl::accessor<cl::sycl::cl_int4, 2, cl::sycl::access::mode::read,
                     cl::sycl::access::target::image, PH::false_t>
      pA;
  cl::sycl::accessor<cl::sycl::cl_int4, 2, cl::sycl::access::mode::write,
                     cl::sycl::access::target::image, PH::false_t>
      pB;

  auto d = __esimd_wrregion<float, 16 /*ret size*/, 8 /*write size*/,
                            0 /*vstride*/, 8 /*row width*/, 1 /*hstride*/>(
      c.data() /*dst*/, b.data() /*src*/, 0 /*offset*/);
  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x float> @llvm.genx.wrregionf.v16f32.v8f32.i16.v8i1(<16 x float> %{{[0-9a-zA-Z_.]+}}, <8 x float> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 1, i16 0, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)

  // Media block load/store: the image accessor is lowered to a ptrtoint
  // surface index feeding llvm.genx.media.ld/st.
  simd<int, 32> va;
  va = media_block_load<int, 4, 8>(pA, x, y);
  // CHECK: %[[SI0:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image2d_ro_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.media.ld.v32i32(i32 0, i32 %[[SI0]], i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}})

  simd<int, 32> vb = va + 1;
  media_block_store<int, 4, 8>(pB, x, y, vb);
  // CHECK: %[[SI2:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image2d_wo_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
  // CHECK: call void @llvm.genx.media.st.v32i32(i32 0, i32 %[[SI2]], i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}})

  // vload/vstore against the module-scope ESIMD globals vg and vc.
  auto ee = __esimd_vload<int, 16>((vector_type_t<int, 16> *)(&vg));
  // CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* {{.*}})
  __esimd_vstore<int, 32>(&vc, va.data());
  // CHECK: store <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> addrspace(4)* {{.*}}

  return d;
}
Loading