Skip to content

Commit 1455b29

Browse files
seven-mile authored and lanza committed
[CIR][CodeGen] Special treatment of 3-element extended vector load and store (#674)
Continues the work of #613. The original CodeGen treats vec3 as vec4 to get aligned memory access. This PR enables these paths.
1 parent 0b9f59e commit 1455b29

File tree

2 files changed

+94
-3
lines changed

2 files changed

+94
-3
lines changed

clang/lib/CIR/CodeGen/CIRGenExpr.cpp

+36-3
Original file line numberDiff line numberDiff line change
@@ -598,9 +598,25 @@ void CIRGenFunction::buildStoreOfScalar(mlir::Value Value, Address Addr,
598598
return;
599599
}
600600

601+
mlir::Type SrcTy = Value.getType();
601602
if (const auto *ClangVecTy = Ty->getAs<clang::VectorType>()) {
602603
// TODO(CIR): this has fallen out of date with codegen
603604
llvm_unreachable("NYI: Special treatment of 3-element vector store");
605+
// auto VecTy = dyn_cast<mlir::cir::VectorType>(SrcTy);
606+
// if (!CGM.getCodeGenOpts().PreserveVec3Type &&
607+
// ClangVecTy->getNumElements() == 3) {
608+
// // Handle vec3 special.
609+
// if (VecTy && VecTy.getSize() == 3) {
610+
// // Our source is a vec3, do a shuffle vector to make it a vec4.
611+
// Value = builder.createVecShuffle(Value.getLoc(), Value,
612+
// ArrayRef<int64_t>{0, 1, 2, -1});
613+
// SrcTy = mlir::cir::VectorType::get(VecTy.getContext(),
614+
// VecTy.getEltType(), 4);
615+
// }
616+
// if (Addr.getElementType() != SrcTy) {
617+
// Addr = Addr.withElementType(SrcTy);
618+
// }
619+
// }
604620
}
605621

606622
// Update the alloca with more info on initialization.
@@ -773,7 +789,7 @@ void CIRGenFunction::buildStoreThroughExtVectorComponentLValue(RValue Src,
773789
// of the Elts constant array will be one past the size of the vector.
774790
// Ignore the last element here, if it is greater than the mask size.
775791
if (getAccessedFieldNo(NumSrcElts - 1, Elts) == Mask.size())
776-
llvm_unreachable("NYI");
792+
NumSrcElts--;
777793

778794
// modify when what gets shuffled in
779795
for (unsigned i = 0; i != NumSrcElts; ++i)
@@ -2773,13 +2789,30 @@ mlir::Value CIRGenFunction::buildLoadOfScalar(Address Addr, bool Volatile,
27732789
llvm_unreachable("NYI");
27742790
}
27752791

2792+
auto ElemTy = Addr.getElementType();
2793+
27762794
if (const auto *ClangVecTy = Ty->getAs<clang::VectorType>()) {
2795+
// Handle vectors of size 3 like size 4 for better performance.
2796+
const auto VTy = cast<mlir::cir::VectorType>(ElemTy);
2797+
27772798
// TODO(CIR): this has fallen out of sync with codegen
2778-
llvm_unreachable("NYI: Special treatment of 3-element vector load");
2799+
llvm_unreachable("NYI: Special treatment of 3-element vector store");
2800+
// if (!CGM.getCodeGenOpts().PreserveVec3Type &&
2801+
// ClangVecTy->getNumElements() == 3) {
2802+
// auto loc = Addr.getPointer().getLoc();
2803+
// auto vec4Ty =
2804+
// mlir::cir::VectorType::get(VTy.getContext(), VTy.getEltType(), 4);
2805+
// Address Cast = Addr.withElementType(vec4Ty);
2806+
// // Now load value.
2807+
// mlir::Value V = builder.createLoad(loc, Cast);
2808+
2809+
// // Shuffle vector to get vec3.
2810+
// V = builder.createVecShuffle(loc, V, ArrayRef<int64_t>{0, 1, 2});
2811+
// return buildFromMemory(V, Ty);
2812+
// }
27792813
}
27802814

27812815
auto Ptr = Addr.getPointer();
2782-
auto ElemTy = Addr.getElementType();
27832816
if (ElemTy.isa<mlir::cir::VoidType>()) {
27842817
ElemTy = mlir::cir::IntType::get(builder.getContext(), 8, true);
27852818
auto ElemPtrTy = mlir::cir::PointerType::get(builder.getContext(), ElemTy);

clang/test/CIR/CodeGen/vectype-ext.cpp

+58
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
// XFAIL: *
66

77
typedef int vi4 __attribute__((ext_vector_type(4)));
8+
typedef int vi3 __attribute__((ext_vector_type(3)));
89
typedef int vi2 __attribute__((ext_vector_type(2)));
910
typedef double vd2 __attribute__((ext_vector_type(2)));
1011
typedef long vl2 __attribute__((ext_vector_type(2)));
@@ -350,6 +351,10 @@ void test_store() {
350351
// CIR-NEXT: %[[#PVECB:]] = cir.alloca !cir.vector<!s32i x 2>
351352
// LLVM-NEXT: %[[#PVECB:]] = alloca <2 x i32>
352353

354+
vi3 c = {};
355+
// CIR-NEXT: %[[#PVECC:]] = cir.alloca !cir.vector<!s32i x 3>
356+
// LLVM-NEXT: %[[#PVECC:]] = alloca <3 x i32>
357+
353358
a.xy = b;
354359
// CIR: %[[#LOAD4RHS:]] = cir.load %{{[0-9]+}} : !cir.ptr<!cir.vector<!s32i x 2>>, !cir.vector<!s32i x 2>
355360
// CIR-NEXT: %[[#LOAD5LHS:]] = cir.load %{{[0-9]+}} : !cir.ptr<!cir.vector<!s32i x 4>>, !cir.vector<!s32i x 4>
@@ -389,6 +394,35 @@ void test_store() {
389394
// LLVM-NEXT: %[[#RESULT:]] = shufflevector <4 x i32> %[[#VECA]], <4 x i32> %[[#EXTVECB]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
390395
// LLVM-NEXT: store <4 x i32> %[[#RESULT]], ptr %[[#PVECA]], align 16
391396

397+
// OpenCL C Specification 6.3.7. Vector Components
398+
// The suffixes .lo (or .even) and .hi (or .odd) for a 3-component vector type
399+
// operate as if the 3-component vector type is a 4-component vector type with
400+
// the value in the w component undefined.
401+
b = c.hi;
402+
403+
// CIR-NEXT: %[[#VECC:]] = cir.load %[[#PVECC]] : !cir.ptr<!cir.vector<!s32i x 3>>, !cir.vector<!s32i x 3>
404+
// CIR-NEXT: %[[#HIPART:]] = cir.vec.shuffle(%[[#VECC]], %[[#VECC]] : !cir.vector<!s32i x 3>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<!s32i x 2>
405+
// CIR-NEXT: cir.store %[[#HIPART]], %[[#PVECB]] : !cir.vector<!s32i x 2>, !cir.ptr<!cir.vector<!s32i x 2>>
406+
407+
// LLVM-NEXT: %[[#VECC:]] = load <3 x i32>, ptr %[[#PVECC]], align 16
408+
// LLVM-NEXT: %[[#HIPART:]] = shufflevector <3 x i32> %[[#VECC]], <3 x i32> %[[#VECC]], <2 x i32> <i32 2, i32 3>
409+
// LLVM-NEXT: store <2 x i32> %[[#HIPART]], ptr %[[#PVECB]], align 8
410+
411+
// c.hi is c[2, 3], in which 3 should be ignored in CIRGen for store
412+
c.hi = b;
413+
414+
// CIR-NEXT: %[[#VECB:]] = cir.load %[[#PVECB]] : !cir.ptr<!cir.vector<!s32i x 2>>, !cir.vector<!s32i x 2>
415+
// CIR-NEXT: %[[#VECC:]] = cir.load %[[#PVECC]] : !cir.ptr<!cir.vector<!s32i x 3>>, !cir.vector<!s32i x 3>
416+
// CIR-NEXT: %[[#EXTVECB:]] = cir.vec.shuffle(%[[#VECB]], %[[#VECB]] : !cir.vector<!s32i x 2>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<-1> : !s32i] : !cir.vector<!s32i x 3>
417+
// CIR-NEXT: %[[#RESULT:]] = cir.vec.shuffle(%[[#VECC]], %[[#EXTVECB]] : !cir.vector<!s32i x 3>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<3> : !s32i] : !cir.vector<!s32i x 3>
418+
// CIR-NEXT: cir.store %[[#RESULT]], %[[#PVECC]] : !cir.vector<!s32i x 3>, !cir.ptr<!cir.vector<!s32i x 3>>
419+
420+
// LLVM-NEXT: %[[#VECB:]] = load <2 x i32>, ptr %[[#PVECB]], align 8
421+
// LLVM-NEXT: %[[#VECC:]] = load <3 x i32>, ptr %[[#PVECC]], align 16
422+
// LLVM-NEXT: %[[#EXTVECB:]] = shufflevector <2 x i32> %[[#VECB]], <2 x i32> %[[#VECB]], <3 x i32> <i32 0, i32 1, i32 poison>
423+
// LLVM-NEXT: %[[#RESULT:]] = shufflevector <3 x i32> %[[#VECC]], <3 x i32> %[[#EXTVECB]], <3 x i32> <i32 0, i32 1, i32 3>
424+
// LLVM-NEXT: store <3 x i32> %[[#RESULT]], ptr %[[#PVECC]], align 16
425+
392426
}
393427

394428
// CIR: cir.func {{@.*test_build_lvalue.*}}
@@ -453,3 +487,27 @@ void test_build_lvalue() {
453487
// LLVM-NEXT: store i32 %[[#RESULT]], ptr %[[#ALLOCAR]], align 4
454488

455489
}
490+
491+
// CIR: cir.func {{@.*test_vec3.*}}
492+
// LLVM: define void {{@.*test_vec3.*}}
493+
void test_vec3() {
494+
vi3 v = {};
495+
// CIR-NEXT: %[[#PV:]] = cir.alloca !cir.vector<!s32i x 3>, !cir.ptr<!cir.vector<!s32i x 3>>, ["v", init] {alignment = 16 : i64}
496+
// CIR: %[[#VEC4:]] = cir.vec.shuffle(%{{[0-9]+}}, %{{[0-9]+}} : !cir.vector<!s32i x 3>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<-1> : !s32i] : !cir.vector<!s32i x 4>
497+
// CIR-NEXT: %[[#PV4:]] = cir.cast(bitcast, %[[#PV]] : !cir.ptr<!cir.vector<!s32i x 3>>), !cir.ptr<!cir.vector<!s32i x 4>>
498+
// CIR-NEXT: cir.store %[[#VEC4]], %[[#PV4]] : !cir.vector<!s32i x 4>, !cir.ptr<!cir.vector<!s32i x 4>>
499+
500+
// LLVM-NEXT: %[[#PV:]] = alloca <3 x i32>, i64 1, align 16
501+
// LLVM-NEXT: store <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr %[[#PV]], align 16
502+
503+
v + 1;
504+
// CIR-NEXT: %[[#PV4:]] = cir.cast(bitcast, %[[#PV]] : !cir.ptr<!cir.vector<!s32i x 3>>), !cir.ptr<!cir.vector<!s32i x 4>>
505+
// CIR-NEXT: %[[#V4:]] = cir.load %[[#PV4]] : !cir.ptr<!cir.vector<!s32i x 4>>, !cir.vector<!s32i x 4>
506+
// CIR-NEXT: %[[#V3:]] = cir.vec.shuffle(%[[#V4]], %[[#V4]] : !cir.vector<!s32i x 4>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i] : !cir.vector<!s32i x 3>
507+
// CIR: %[[#RES:]] = cir.binop(add, %[[#V3]], %{{[0-9]+}}) : !cir.vector<!s32i x 3>
508+
509+
// LLVM-NEXT: %[[#V4:]] = load <4 x i32>, ptr %[[#PV:]], align 16
510+
// LLVM-NEXT: %[[#V3:]] = shufflevector <4 x i32> %[[#V4]], <4 x i32> %[[#V4]], <3 x i32> <i32 0, i32 1, i32 2>
511+
// LLVM-NEXT: %[[#RES:]] = add <3 x i32> %[[#V3]], splat (i32 1)
512+
513+
}

0 commit comments

Comments
 (0)