Skip to content

Commit 1713a21

Browse files
ghehglanza
authored andcommitted
[CIR][CIRGen][Builtin][Neon] Lower neon_vtrn and neon_vtrnq (#942)
as title. The generated code is the same as Clang codeden except in a small discrepancy when GEP: OG generates code like this: `%6 = getelementptr inbounds <4 x i16>, ptr %retval.i, i32 1` CIR generates a bit differently: `%6 = getelementptr <4 x i16>, ptr %retval.i, i64 1` Ptr offest might be trivial because choosing i64 over i32 as index type seems to be LLVM Dialect's choice. The lack of `inbounds` keyword might be an issue as `mlir::cir::PtrStrideOp` is currently not lowering to LLVM:GEPOp with `inbounds` attribute as `mlir::cir::PtrStrideOp` itself has no `inbounds`. It's probably because there was no need for it though we do have an implementation of [`CIRGenFunction::buildCheckedInBoundsGEP` ](https://github.com/llvm/clangir/blob/10d6f4b94da7e0181a070f0265d079419d96cf78/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp#L2762). Anyway, the issue is not in the scope of this PR and should be addressed in a separate PR. If we think this is an issue, I can create another PR and probably add optional attribute to `mlir::cir::PtrStrideOp` to achieve it. In addition to lowering work, a couple of more works: 1. Did a little refactoring on variable name changing into desired CamelBack case. 2. Changed neon-misc RUN Options to be consistent with other neon test files and make test case more concise.
1 parent e5ced60 commit 1713a21

File tree

2 files changed

+298
-280
lines changed

2 files changed

+298
-280
lines changed

clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp

+46-12
Original file line numberDiff line numberDiff line change
@@ -2973,8 +2973,8 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
29732973
}
29742974
}
29752975

2976-
mlir::cir::VectorType Ty = GetNeonType(this, Type);
2977-
if (!Ty)
2976+
mlir::cir::VectorType ty = GetNeonType(this, Type);
2977+
if (!ty)
29782978
return nullptr;
29792979

29802980
// Not all intrinsics handled by the common case work for AArch64 yet, so only
@@ -2991,7 +2991,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
29912991
buildAArch64TblBuiltinExpr(*this, BuiltinID, E, Ops, Arch))
29922992
return V;
29932993

2994-
mlir::cir::VectorType VTy = Ty;
2994+
mlir::cir::VectorType vTy = ty;
29952995
llvm::SmallVector<mlir::Value, 4> args;
29962996
switch (BuiltinID) {
29972997
default:
@@ -3071,8 +3071,8 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
30713071
// https://developer.arm.com/architectures/instruction-sets/intrinsics/
30723072
return buildNeonCall(
30733073
BuiltinID, *this,
3074-
{builder.getExtendedElementVectorType(Ty, true), SInt32Ty}, Ops,
3075-
"llvm.aarch64.neon.sqrshrun", Ty, getLoc(E->getExprLoc()));
3074+
{builder.getExtendedElementVectorType(ty, true), SInt32Ty}, Ops,
3075+
"llvm.aarch64.neon.sqrshrun", ty, getLoc(E->getExprLoc()));
30763076
case NEON::BI__builtin_neon_vqshrn_n_v:
30773077
llvm_unreachable("NYI");
30783078
case NEON::BI__builtin_neon_vrshrn_n_v:
@@ -3085,7 +3085,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
30853085
case NEON::BI__builtin_neon_vrnda_v:
30863086
case NEON::BI__builtin_neon_vrndaq_v: {
30873087
assert(!MissingFeatures::buildConstrainedFPCall());
3088-
return buildNeonCall(BuiltinID, *this, {Ty}, Ops, "llvm.round", Ty,
3088+
return buildNeonCall(BuiltinID, *this, {ty}, Ops, "llvm.round", ty,
30893089
getLoc(E->getExprLoc()));
30903090
}
30913091
case NEON::BI__builtin_neon_vrndih_f16: {
@@ -3412,20 +3412,20 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
34123412
}
34133413
case NEON::BI__builtin_neon_vld1_v:
34143414
case NEON::BI__builtin_neon_vld1q_v: {
3415-
return builder.createAlignedLoad(Ops[0].getLoc(), VTy, Ops[0],
3415+
return builder.createAlignedLoad(Ops[0].getLoc(), vTy, Ops[0],
34163416
PtrOp0.getAlignment());
34173417
}
34183418
case NEON::BI__builtin_neon_vst1_v:
34193419
case NEON::BI__builtin_neon_vst1q_v: {
3420-
Ops[1] = builder.createBitcast(Ops[1], VTy);
3420+
Ops[1] = builder.createBitcast(Ops[1], vTy);
34213421
(void)builder.createAlignedStore(Ops[1].getLoc(), Ops[1], Ops[0],
34223422
PtrOp0.getAlignment());
34233423
return Ops[1];
34243424
}
34253425
case NEON::BI__builtin_neon_vld1_lane_v:
34263426
case NEON::BI__builtin_neon_vld1q_lane_v: {
3427-
Ops[1] = builder.createBitcast(Ops[1], VTy);
3428-
Ops[0] = builder.createAlignedLoad(Ops[0].getLoc(), VTy.getEltType(),
3427+
Ops[1] = builder.createBitcast(Ops[1], vTy);
3428+
Ops[0] = builder.createAlignedLoad(Ops[0].getLoc(), vTy.getEltType(),
34293429
Ops[0], PtrOp0.getAlignment());
34303430
return builder.create<mlir::cir::VecInsertOp>(getLoc(E->getExprLoc()),
34313431
Ops[1], Ops[0], Ops[2]);
@@ -3440,7 +3440,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
34403440
}
34413441
case NEON::BI__builtin_neon_vst1_lane_v:
34423442
case NEON::BI__builtin_neon_vst1q_lane_v: {
3443-
Ops[1] = builder.createBitcast(Ops[1], Ty);
3443+
Ops[1] = builder.createBitcast(Ops[1], ty);
34443444
Ops[1] = builder.create<mlir::cir::VecExtractOp>(Ops[1].getLoc(), Ops[1],
34453445
Ops[2]);
34463446
(void)builder.createAlignedStore(getLoc(E->getExprLoc()), Ops[1], Ops[0],
@@ -3513,7 +3513,41 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
35133513
}
35143514
case NEON::BI__builtin_neon_vtrn_v:
35153515
case NEON::BI__builtin_neon_vtrnq_v: {
3516-
llvm_unreachable("NYI");
3516+
// This set of neon intrinsics implement SIMD matrix transpose.
3517+
// The matrix transposed is always 2x2, and these intrincis transpose
3518+
// multiple 2x2 matrices in parallel, that is why result type is
3519+
// always 2-D matrix whose last dimension is 2.
3520+
// For example `vtrn_s16` would have:
3521+
// input 1: {0, 1, 2, 3}
3522+
// input 2; {4, 5, 6, 7}
3523+
// This basically represents two 2x2 matrices:
3524+
// [ 0, 1 ] and [ 2, 3]
3525+
// [ 4, 5 ] [ 6, 7]
3526+
// They should be simultaneously and independently transposed.
3527+
// Thus, result is :
3528+
// { {0, 4, 2, 6},
3529+
// {1, 5, 3, 7 } }
3530+
Ops[1] = builder.createBitcast(Ops[1], ty);
3531+
Ops[2] = builder.createBitcast(Ops[2], ty);
3532+
// Adding a bitcast here as Ops[0] might be a void pointer.
3533+
mlir::Value baseAddr =
3534+
builder.createBitcast(Ops[0], builder.getPointerTo(ty));
3535+
mlir::Value sv;
3536+
mlir::Location loc = getLoc(E->getExprLoc());
3537+
3538+
for (unsigned vi = 0; vi != 2; ++vi) {
3539+
llvm::SmallVector<int64_t, 16> indices;
3540+
for (unsigned i = 0, e = vTy.getSize(); i != e; i += 2) {
3541+
indices.push_back(i + vi);
3542+
indices.push_back(i + e + vi);
3543+
}
3544+
mlir::cir::ConstantOp idx = builder.getConstInt(loc, SInt32Ty, vi);
3545+
mlir::Value addr = builder.create<mlir::cir::PtrStrideOp>(
3546+
loc, baseAddr.getType(), baseAddr, idx);
3547+
sv = builder.createVecShuffle(loc, Ops[1], Ops[2], indices);
3548+
(void)builder.CIRBaseBuilderTy::createStore(loc, sv, addr);
3549+
}
3550+
return sv;
35173551
}
35183552
case NEON::BI__builtin_neon_vuzp_v:
35193553
case NEON::BI__builtin_neon_vuzpq_v: {

0 commit comments

Comments
 (0)