
Commit 0adaf62

ghehglanza authored and committed
[CIR][CIRGen][Builtin][Neon] Lower neon_vtrn and neon_vtrnq (#942)
As the title says. The generated code matches Clang codegen except for a small discrepancy in the GEP it emits. OG generates code like this: `%6 = getelementptr inbounds <4 x i16>, ptr %retval.i, i32 1`, while CIR generates it a bit differently: `%6 = getelementptr <4 x i16>, ptr %retval.i, i64 1`. The pointer-offset difference is likely trivial, since choosing i64 over i32 as the index type appears to be the LLVM dialect's choice. The missing `inbounds` keyword might be an issue: `mlir::cir::PtrStrideOp` currently does not lower to LLVM::GEPOp with the `inbounds` attribute, because `mlir::cir::PtrStrideOp` itself has no `inbounds`. That is probably because there has been no need for it so far, even though we do have an implementation of [`CIRGenFunction::buildCheckedInBoundsGEP`](https://github.com/llvm/clangir/blob/10d6f4b94da7e0181a070f0265d079419d96cf78/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp#L2762). In any case, that issue is out of scope for this PR and should be addressed separately; if we decide it matters, I can open a follow-up PR and likely add an optional attribute to `mlir::cir::PtrStrideOp` to support it.

In addition to the lowering work, this PR also:
1. Does a little refactoring, renaming variables to the desired camelBack style.
2. Changes the neon-misc RUN options to be consistent with the other NEON test files and makes the test cases more concise.
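For readers unfamiliar with the intrinsic family being lowered here, the sketch below (not part of this commit; it assumes an AArch64 target where `arm_neon.h` is available, and the file name is made up) shows what `vtrn_s16` computes at the source level. The values mirror the example documented in the new code comment in the diff.

```cpp
// Hypothetical standalone example; compile for AArch64, e.g.
//   clang++ --target=aarch64-linux-gnu -O2 vtrn_example.cpp
#include <arm_neon.h>
#include <cstdio>

int main() {
  int16x4_t a = {0, 1, 2, 3};
  int16x4_t b = {4, 5, 6, 7};
  // Transposes the two 2x2 matrices formed by (a, b) in parallel.
  int16x4x2_t r = vtrn_s16(a, b);
  // Expected: r.val[0] == {0, 4, 2, 6}, r.val[1] == {1, 5, 3, 7}.
  for (int i = 0; i < 4; ++i)
    printf("%d %d\n", r.val[0][i], r.val[1][i]);
  return 0;
}
```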
1 parent 3cfefd7 commit 0adaf62

File tree

2 files changed: +298 −280 lines changed


clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp

+46 −12
@@ -2968,8 +2968,8 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
     }
   }
 
-  mlir::cir::VectorType Ty = GetNeonType(this, Type);
-  if (!Ty)
+  mlir::cir::VectorType ty = GetNeonType(this, Type);
+  if (!ty)
     return nullptr;
 
   // Not all intrinsics handled by the common case work for AArch64 yet, so only
@@ -2986,7 +2986,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
           buildAArch64TblBuiltinExpr(*this, BuiltinID, E, Ops, Arch))
     return V;
 
-  mlir::cir::VectorType VTy = Ty;
+  mlir::cir::VectorType vTy = ty;
   llvm::SmallVector<mlir::Value, 4> args;
   switch (BuiltinID) {
   default:
@@ -3066,8 +3066,8 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
     // https://developer.arm.com/architectures/instruction-sets/intrinsics/
     return buildNeonCall(
         BuiltinID, *this,
-        {builder.getExtendedElementVectorType(Ty, true), SInt32Ty}, Ops,
-        "llvm.aarch64.neon.sqrshrun", Ty, getLoc(E->getExprLoc()));
+        {builder.getExtendedElementVectorType(ty, true), SInt32Ty}, Ops,
+        "llvm.aarch64.neon.sqrshrun", ty, getLoc(E->getExprLoc()));
   case NEON::BI__builtin_neon_vqshrn_n_v:
     llvm_unreachable("NYI");
   case NEON::BI__builtin_neon_vrshrn_n_v:
@@ -3080,7 +3080,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   case NEON::BI__builtin_neon_vrnda_v:
   case NEON::BI__builtin_neon_vrndaq_v: {
     assert(!MissingFeatures::buildConstrainedFPCall());
-    return buildNeonCall(BuiltinID, *this, {Ty}, Ops, "llvm.round", Ty,
+    return buildNeonCall(BuiltinID, *this, {ty}, Ops, "llvm.round", ty,
                          getLoc(E->getExprLoc()));
   }
   case NEON::BI__builtin_neon_vrndih_f16: {
@@ -3407,20 +3407,20 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   }
   case NEON::BI__builtin_neon_vld1_v:
   case NEON::BI__builtin_neon_vld1q_v: {
-    return builder.createAlignedLoad(Ops[0].getLoc(), VTy, Ops[0],
+    return builder.createAlignedLoad(Ops[0].getLoc(), vTy, Ops[0],
                                      PtrOp0.getAlignment());
   }
   case NEON::BI__builtin_neon_vst1_v:
   case NEON::BI__builtin_neon_vst1q_v: {
-    Ops[1] = builder.createBitcast(Ops[1], VTy);
+    Ops[1] = builder.createBitcast(Ops[1], vTy);
     (void)builder.createAlignedStore(Ops[1].getLoc(), Ops[1], Ops[0],
                                      PtrOp0.getAlignment());
     return Ops[1];
   }
   case NEON::BI__builtin_neon_vld1_lane_v:
   case NEON::BI__builtin_neon_vld1q_lane_v: {
-    Ops[1] = builder.createBitcast(Ops[1], VTy);
-    Ops[0] = builder.createAlignedLoad(Ops[0].getLoc(), VTy.getEltType(),
+    Ops[1] = builder.createBitcast(Ops[1], vTy);
+    Ops[0] = builder.createAlignedLoad(Ops[0].getLoc(), vTy.getEltType(),
                                        Ops[0], PtrOp0.getAlignment());
     return builder.create<mlir::cir::VecInsertOp>(getLoc(E->getExprLoc()),
                                                   Ops[1], Ops[0], Ops[2]);
@@ -3435,7 +3435,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   }
   case NEON::BI__builtin_neon_vst1_lane_v:
   case NEON::BI__builtin_neon_vst1q_lane_v: {
-    Ops[1] = builder.createBitcast(Ops[1], Ty);
+    Ops[1] = builder.createBitcast(Ops[1], ty);
     Ops[1] = builder.create<mlir::cir::VecExtractOp>(Ops[1].getLoc(), Ops[1],
                                                      Ops[2]);
     (void)builder.createAlignedStore(getLoc(E->getExprLoc()), Ops[1], Ops[0],
@@ -3508,7 +3508,41 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   }
   case NEON::BI__builtin_neon_vtrn_v:
   case NEON::BI__builtin_neon_vtrnq_v: {
-    llvm_unreachable("NYI");
+    // This set of neon intrinsics implements SIMD matrix transpose.
+    // The matrix transposed is always 2x2, and these intrinsics transpose
+    // multiple 2x2 matrices in parallel, which is why the result type is
+    // always a 2-D matrix whose last dimension is 2.
+    // For example `vtrn_s16` would have:
+    //   input 1: {0, 1, 2, 3}
+    //   input 2: {4, 5, 6, 7}
+    // This basically represents two 2x2 matrices:
+    //   [ 0, 1 ] and [ 2, 3 ]
+    //   [ 4, 5 ]     [ 6, 7 ]
+    // They should be simultaneously and independently transposed.
+    // Thus, the result is:
+    //   { {0, 4, 2, 6},
+    //     {1, 5, 3, 7} }
+    Ops[1] = builder.createBitcast(Ops[1], ty);
+    Ops[2] = builder.createBitcast(Ops[2], ty);
+    // Adding a bitcast here as Ops[0] might be a void pointer.
+    mlir::Value baseAddr =
+        builder.createBitcast(Ops[0], builder.getPointerTo(ty));
+    mlir::Value sv;
+    mlir::Location loc = getLoc(E->getExprLoc());
+
+    for (unsigned vi = 0; vi != 2; ++vi) {
+      llvm::SmallVector<int64_t, 16> indices;
+      for (unsigned i = 0, e = vTy.getSize(); i != e; i += 2) {
+        indices.push_back(i + vi);
+        indices.push_back(i + e + vi);
+      }
+      mlir::cir::ConstantOp idx = builder.getConstInt(loc, SInt32Ty, vi);
+      mlir::Value addr = builder.create<mlir::cir::PtrStrideOp>(
+          loc, baseAddr.getType(), baseAddr, idx);
+      sv = builder.createVecShuffle(loc, Ops[1], Ops[2], indices);
+      (void)builder.CIRBaseBuilderTy::createStore(loc, sv, addr);
+    }
+    return sv;
   }
   case NEON::BI__builtin_neon_vuzp_v:
   case NEON::BI__builtin_neon_vuzpq_v: {
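As a side note on the index computation in the loop added above: the short standalone sketch below (plain C++, not the builder API, not part of this commit, and with a helper name chosen only for illustration) reproduces the two shuffle masks the loop builds for a 4-element vector such as the one used by `vtrn_s16`.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Mirrors the `indices` loop in the vtrn lowering above for a vector of
// `size` elements. vi == 0 builds the mask for the first result row and
// vi == 1 for the second; indices 0..size-1 select from the first shuffle
// input and size..2*size-1 from the second.
static std::vector<int64_t> trnShuffleMask(unsigned size, unsigned vi) {
  std::vector<int64_t> indices;
  for (unsigned i = 0; i != size; i += 2) {
    indices.push_back(i + vi);
    indices.push_back(i + size + vi);
  }
  return indices;
}

int main() {
  for (unsigned vi = 0; vi != 2; ++vi) {
    for (int64_t idx : trnShuffleMask(4, vi))
      std::printf("%lld ", static_cast<long long>(idx));
    std::printf("\n"); // prints "0 4 2 6" then "1 5 3 7"
  }
  return 0;
}
```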
