[CIR][CodeGen] Special treatment of 3-element extended vector load and store (#674)

seven-mile · lanza · commit 1455b29d4a81 · 2025-03-13T11:30:07.000-07:00
Continue the work of #613 . Original CodeGen treat vec3 as vec4 to get aligned memory access. This PR enable these paths.
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -598,9 +598,25 @@ void CIRGenFunction::buildStoreOfScalar(mlir::Value Value, Address Addr,
     return;
   }
 
+  mlir::Type SrcTy = Value.getType();
   if (const auto *ClangVecTy = Ty->getAs<clang::VectorType>()) {
     // TODO(CIR): this has fallen out of date with codegen
     llvm_unreachable("NYI: Special treatment of 3-element vector store");
+    // auto VecTy = dyn_cast<mlir::cir::VectorType>(SrcTy);
+    // if (!CGM.getCodeGenOpts().PreserveVec3Type &&
+    //     ClangVecTy->getNumElements() == 3) {
+    //   // Handle vec3 special.
+    //   if (VecTy && VecTy.getSize() == 3) {
+    //     // Our source is a vec3, do a shuffle vector to make it a vec4.
+    //     Value = builder.createVecShuffle(Value.getLoc(), Value,
+    //                                      ArrayRef<int64_t>{0, 1, 2, -1});
+    //     SrcTy = mlir::cir::VectorType::get(VecTy.getContext(),
+    //                                        VecTy.getEltType(), 4);
+    //   }
+    //   if (Addr.getElementType() != SrcTy) {
+    //     Addr = Addr.withElementType(SrcTy);
+    //   }
+    // }
   }
 
   // Update the alloca with more info on initialization.
@@ -773,7 +789,7 @@ void CIRGenFunction::buildStoreThroughExtVectorComponentLValue(RValue Src,
       // of the Elts constant array will be one past the size of the vector.
       // Ignore the last element here, if it is greater than the mask size.
       if (getAccessedFieldNo(NumSrcElts - 1, Elts) == Mask.size())
-        llvm_unreachable("NYI");
+        NumSrcElts--;
 
       // modify when what gets shuffled in
       for (unsigned i = 0; i != NumSrcElts; ++i)
@@ -2773,13 +2789,30 @@ mlir::Value CIRGenFunction::buildLoadOfScalar(Address Addr, bool Volatile,
     llvm_unreachable("NYI");
   }
 
+  auto ElemTy = Addr.getElementType();
+
   if (const auto *ClangVecTy = Ty->getAs<clang::VectorType>()) {
+    // Handle vectors of size 3 like size 4 for better performance.
+    const auto VTy = cast<mlir::cir::VectorType>(ElemTy);
+
     // TODO(CIR): this has fallen out of sync with codegen
-    llvm_unreachable("NYI: Special treatment of 3-element vector load");
+    llvm_unreachable("NYI: Special treatment of 3-element vector store");
+    // if (!CGM.getCodeGenOpts().PreserveVec3Type &&
+    //     ClangVecTy->getNumElements() == 3) {
+    //   auto loc = Addr.getPointer().getLoc();
+    //   auto vec4Ty =
+    //       mlir::cir::VectorType::get(VTy.getContext(), VTy.getEltType(), 4);
+    //   Address Cast = Addr.withElementType(vec4Ty);
+    //   // Now load value.
+    //   mlir::Value V = builder.createLoad(loc, Cast);
+
+    //   // Shuffle vector to get vec3.
+    //   V = builder.createVecShuffle(loc, V, ArrayRef<int64_t>{0, 1, 2});
+    //   return buildFromMemory(V, Ty);
+    // }
   }
 
   auto Ptr = Addr.getPointer();
-  auto ElemTy = Addr.getElementType();
   if (ElemTy.isa<mlir::cir::VoidType>()) {
     ElemTy = mlir::cir::IntType::get(builder.getContext(), 8, true);
     auto ElemPtrTy = mlir::cir::PointerType::get(builder.getContext(), ElemTy);
diff --git a/clang/test/CIR/CodeGen/vectype-ext.cpp b/clang/test/CIR/CodeGen/vectype-ext.cpp
@@ -5,6 +5,7 @@
 // XFAIL: *
 
 typedef int vi4 __attribute__((ext_vector_type(4)));
+typedef int vi3 __attribute__((ext_vector_type(3)));
 typedef int vi2 __attribute__((ext_vector_type(2)));
 typedef double vd2 __attribute__((ext_vector_type(2)));
 typedef long vl2 __attribute__((ext_vector_type(2)));
@@ -350,6 +351,10 @@ void test_store() {
   // CIR-NEXT: %[[#PVECB:]] = cir.alloca !cir.vector<!s32i x 2>
   // LLVM-NEXT: %[[#PVECB:]] = alloca <2 x i32>
 
+  vi3 c = {};
+  // CIR-NEXT: %[[#PVECC:]] = cir.alloca !cir.vector<!s32i x 3>
+  // LLVM-NEXT: %[[#PVECC:]] = alloca <3 x i32>
+
   a.xy = b;
   // CIR:      %[[#LOAD4RHS:]] = cir.load %{{[0-9]+}} : !cir.ptr<!cir.vector<!s32i x 2>>, !cir.vector<!s32i x 2>
   // CIR-NEXT: %[[#LOAD5LHS:]] = cir.load %{{[0-9]+}} : !cir.ptr<!cir.vector<!s32i x 4>>, !cir.vector<!s32i x 4>
@@ -389,6 +394,35 @@ void test_store() {
   // LLVM-NEXT: %[[#RESULT:]] = shufflevector <4 x i32> %[[#VECA]], <4 x i32> %[[#EXTVECB]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   // LLVM-NEXT: store <4 x i32> %[[#RESULT]], ptr %[[#PVECA]], align 16
 
+  // OpenCL C Specification 6.3.7. Vector Components
+  // The suffixes .lo (or .even) and .hi (or .odd) for a 3-component vector type
+  // operate as if the 3-component vector type is a 4-component vector type with
+  // the value in the w component undefined.
+  b = c.hi;
+
+  // CIR-NEXT: %[[#VECC:]] = cir.load %[[#PVECC]] : !cir.ptr<!cir.vector<!s32i x 3>>, !cir.vector<!s32i x 3>
+  // CIR-NEXT: %[[#HIPART:]] = cir.vec.shuffle(%[[#VECC]], %[[#VECC]] : !cir.vector<!s32i x 3>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<!s32i x 2>
+  // CIR-NEXT: cir.store %[[#HIPART]], %[[#PVECB]] : !cir.vector<!s32i x 2>, !cir.ptr<!cir.vector<!s32i x 2>>
+
+  // LLVM-NEXT: %[[#VECC:]] = load <3 x i32>, ptr %[[#PVECC]], align 16
+  // LLVM-NEXT: %[[#HIPART:]] = shufflevector <3 x i32> %[[#VECC]], <3 x i32> %[[#VECC]], <2 x i32> <i32 2, i32 3>
+  // LLVM-NEXT: store <2 x i32> %[[#HIPART]], ptr %[[#PVECB]], align 8
+
+  // c.hi is c[2, 3], in which 3 should be ignored in CIRGen for store
+  c.hi = b;
+
+  // CIR-NEXT: %[[#VECB:]] = cir.load %[[#PVECB]] : !cir.ptr<!cir.vector<!s32i x 2>>, !cir.vector<!s32i x 2>
+  // CIR-NEXT: %[[#VECC:]] = cir.load %[[#PVECC]] : !cir.ptr<!cir.vector<!s32i x 3>>, !cir.vector<!s32i x 3>
+  // CIR-NEXT: %[[#EXTVECB:]] = cir.vec.shuffle(%[[#VECB]], %[[#VECB]] : !cir.vector<!s32i x 2>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<-1> : !s32i] : !cir.vector<!s32i x 3>
+  // CIR-NEXT: %[[#RESULT:]] = cir.vec.shuffle(%[[#VECC]], %[[#EXTVECB]] : !cir.vector<!s32i x 3>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<3> : !s32i] : !cir.vector<!s32i x 3>
+  // CIR-NEXT: cir.store %[[#RESULT]], %[[#PVECC]] : !cir.vector<!s32i x 3>, !cir.ptr<!cir.vector<!s32i x 3>>
+
+  // LLVM-NEXT: %[[#VECB:]] = load <2 x i32>, ptr %[[#PVECB]], align 8
+  // LLVM-NEXT: %[[#VECC:]] = load <3 x i32>, ptr %[[#PVECC]], align 16
+  // LLVM-NEXT: %[[#EXTVECB:]] = shufflevector <2 x i32> %[[#VECB]], <2 x i32> %[[#VECB]], <3 x i32> <i32 0, i32 1, i32 poison>
+  // LLVM-NEXT: %[[#RESULT:]] = shufflevector <3 x i32> %[[#VECC]], <3 x i32> %[[#EXTVECB]], <3 x i32> <i32 0, i32 1, i32 3>
+  // LLVM-NEXT: store <3 x i32> %[[#RESULT]], ptr %[[#PVECC]], align 16
+
 }
 
 // CIR: cir.func {{@.*test_build_lvalue.*}}
@@ -453,3 +487,27 @@ void test_build_lvalue() {
   // LLVM-NEXT: store i32 %[[#RESULT]], ptr %[[#ALLOCAR]], align 4
 
 }
+
+// CIR: cir.func {{@.*test_vec3.*}}
+// LLVM: define void {{@.*test_vec3.*}}
+void test_vec3() {
+  vi3 v = {};
+  // CIR-NEXT: %[[#PV:]] = cir.alloca !cir.vector<!s32i x 3>, !cir.ptr<!cir.vector<!s32i x 3>>, ["v", init] {alignment = 16 : i64}
+  // CIR:      %[[#VEC4:]] = cir.vec.shuffle(%{{[0-9]+}}, %{{[0-9]+}} : !cir.vector<!s32i x 3>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<-1> : !s32i] : !cir.vector<!s32i x 4>
+  // CIR-NEXT: %[[#PV4:]] = cir.cast(bitcast, %[[#PV]] : !cir.ptr<!cir.vector<!s32i x 3>>), !cir.ptr<!cir.vector<!s32i x 4>>
+  // CIR-NEXT: cir.store %[[#VEC4]], %[[#PV4]] : !cir.vector<!s32i x 4>, !cir.ptr<!cir.vector<!s32i x 4>>
+
+  // LLVM-NEXT: %[[#PV:]] = alloca <3 x i32>, i64 1, align 16
+  // LLVM-NEXT: store <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr %[[#PV]], align 16
+
+  v + 1;
+  // CIR-NEXT: %[[#PV4:]] = cir.cast(bitcast, %[[#PV]] : !cir.ptr<!cir.vector<!s32i x 3>>), !cir.ptr<!cir.vector<!s32i x 4>>
+  // CIR-NEXT: %[[#V4:]] = cir.load %[[#PV4]] : !cir.ptr<!cir.vector<!s32i x 4>>, !cir.vector<!s32i x 4>
+  // CIR-NEXT: %[[#V3:]] = cir.vec.shuffle(%[[#V4]], %[[#V4]] : !cir.vector<!s32i x 4>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i] : !cir.vector<!s32i x 3>
+  // CIR:      %[[#RES:]] = cir.binop(add, %[[#V3]], %{{[0-9]+}}) : !cir.vector<!s32i x 3>
+
+  // LLVM-NEXT: %[[#V4:]] = load <4 x i32>, ptr %[[#PV:]], align 16
+  // LLVM-NEXT: %[[#V3:]] = shufflevector <4 x i32> %[[#V4]], <4 x i32> %[[#V4]], <3 x i32> <i32 0, i32 1, i32 2>
+  // LLVM-NEXT: %[[#RES:]] = add <3 x i32> %[[#V3]], splat (i32 1)
+
+}