@@ -2968,8 +2968,8 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
     }
   }

-  mlir::cir::VectorType Ty = GetNeonType(this, Type);
-  if (!Ty)
+  mlir::cir::VectorType ty = GetNeonType(this, Type);
+  if (!ty)
     return nullptr;

   // Not all intrinsics handled by the common case work for AArch64 yet, so only
@@ -2986,7 +2986,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
           buildAArch64TblBuiltinExpr(*this, BuiltinID, E, Ops, Arch))
     return V;

-  mlir::cir::VectorType VTy = Ty;
+  mlir::cir::VectorType vTy = ty;
   llvm::SmallVector<mlir::Value, 4> args;
   switch (BuiltinID) {
   default:
@@ -3066,8 +3066,8 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
     // https://developer.arm.com/architectures/instruction-sets/intrinsics/
     return buildNeonCall(
         BuiltinID, *this,
-        {builder.getExtendedElementVectorType(Ty, true), SInt32Ty}, Ops,
-        "llvm.aarch64.neon.sqrshrun", Ty, getLoc(E->getExprLoc()));
+        {builder.getExtendedElementVectorType(ty, true), SInt32Ty}, Ops,
+        "llvm.aarch64.neon.sqrshrun", ty, getLoc(E->getExprLoc()));
   case NEON::BI__builtin_neon_vqshrn_n_v:
     llvm_unreachable("NYI");
   case NEON::BI__builtin_neon_vrshrn_n_v:
@@ -3080,7 +3080,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   case NEON::BI__builtin_neon_vrnda_v:
   case NEON::BI__builtin_neon_vrndaq_v: {
     assert(!MissingFeatures::buildConstrainedFPCall());
-    return buildNeonCall(BuiltinID, *this, {Ty}, Ops, "llvm.round", Ty,
+    return buildNeonCall(BuiltinID, *this, {ty}, Ops, "llvm.round", ty,
                          getLoc(E->getExprLoc()));
   }
   case NEON::BI__builtin_neon_vrndih_f16: {
@@ -3407,20 +3407,20 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   }
   case NEON::BI__builtin_neon_vld1_v:
   case NEON::BI__builtin_neon_vld1q_v: {
-    return builder.createAlignedLoad(Ops[0].getLoc(), VTy, Ops[0],
+    return builder.createAlignedLoad(Ops[0].getLoc(), vTy, Ops[0],
                                      PtrOp0.getAlignment());
   }
   case NEON::BI__builtin_neon_vst1_v:
   case NEON::BI__builtin_neon_vst1q_v: {
-    Ops[1] = builder.createBitcast(Ops[1], VTy);
+    Ops[1] = builder.createBitcast(Ops[1], vTy);
     (void)builder.createAlignedStore(Ops[1].getLoc(), Ops[1], Ops[0],
                                      PtrOp0.getAlignment());
     return Ops[1];
   }
   case NEON::BI__builtin_neon_vld1_lane_v:
   case NEON::BI__builtin_neon_vld1q_lane_v: {
-    Ops[1] = builder.createBitcast(Ops[1], VTy);
-    Ops[0] = builder.createAlignedLoad(Ops[0].getLoc(), VTy.getEltType(),
+    Ops[1] = builder.createBitcast(Ops[1], vTy);
+    Ops[0] = builder.createAlignedLoad(Ops[0].getLoc(), vTy.getEltType(),
                                        Ops[0], PtrOp0.getAlignment());
     return builder.create<mlir::cir::VecInsertOp>(getLoc(E->getExprLoc()),
                                                   Ops[1], Ops[0], Ops[2]);
@@ -3435,7 +3435,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   }
   case NEON::BI__builtin_neon_vst1_lane_v:
   case NEON::BI__builtin_neon_vst1q_lane_v: {
-    Ops[1] = builder.createBitcast(Ops[1], Ty);
+    Ops[1] = builder.createBitcast(Ops[1], ty);
     Ops[1] = builder.create<mlir::cir::VecExtractOp>(Ops[1].getLoc(), Ops[1],
                                                      Ops[2]);
     (void)builder.createAlignedStore(getLoc(E->getExprLoc()), Ops[1], Ops[0],
@@ -3508,7 +3508,41 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   }
   case NEON::BI__builtin_neon_vtrn_v:
   case NEON::BI__builtin_neon_vtrnq_v: {
-    llvm_unreachable("NYI");
+    // This set of neon intrinsics implements SIMD matrix transpose.
+    // The matrix transposed is always 2x2, and these intrinsics transpose
+    // multiple 2x2 matrices in parallel, which is why the result type is
+    // always a 2-D matrix whose last dimension is 2.
+    // For example `vtrn_s16` would have:
+    //   input 1: {0, 1, 2, 3}
+    //   input 2: {4, 5, 6, 7}
+    // This basically represents two 2x2 matrices:
+    //   [ 0, 1 ] and [ 2, 3 ]
+    //   [ 4, 5 ]     [ 6, 7 ]
+    // They should be simultaneously and independently transposed.
+    // Thus, the result is:
+    //   { {0, 4, 2, 6},
+    //     {1, 5, 3, 7} }
+    Ops[1] = builder.createBitcast(Ops[1], ty);
+    Ops[2] = builder.createBitcast(Ops[2], ty);
+    // Adding a bitcast here as Ops[0] might be a void pointer.
+    mlir::Value baseAddr =
+        builder.createBitcast(Ops[0], builder.getPointerTo(ty));
+    mlir::Value sv;
+    mlir::Location loc = getLoc(E->getExprLoc());
+
+    for (unsigned vi = 0; vi != 2; ++vi) {
+      llvm::SmallVector<int64_t, 16> indices;
+      for (unsigned i = 0, e = vTy.getSize(); i != e; i += 2) {
+        indices.push_back(i + vi);
+        indices.push_back(i + e + vi);
+      }
+      mlir::cir::ConstantOp idx = builder.getConstInt(loc, PtrDiffTy, vi);
+      mlir::Value addr = builder.create<mlir::cir::PtrStrideOp>(
+          loc, baseAddr.getType(), baseAddr, idx);
+      sv = builder.createVecShuffle(loc, Ops[1], Ops[2], indices);
+      (void)builder.CIRBaseBuilderTy::createStore(loc, sv, addr);
+    }
+    return sv;
   }
   case NEON::BI__builtin_neon_vuzp_v:
   case NEON::BI__builtin_neon_vuzpq_v: {
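Note (not part of the diff): to make the index pattern generated by the new vtrn lowering easier to follow, here is a minimal standalone C++ sketch that reproduces the same shuffle-index loop with plain std::vector values instead of CIR ops. The helper name trnHalf and the use of e for vTy.getSize() are invented for illustration only; it prints the {0, 4, 2, 6} / {1, 5, 3, 7} result described in the code comment.

#include <cstdio>
#include <vector>

// Illustrative only: mirrors the index loop in the vtrn_v lowering above.
// 'vi' selects which of the two result vectors to build, 'e' plays the role
// of vTy.getSize(), and 'concat' stands in for the two shuffle operands.
static std::vector<int> trnHalf(const std::vector<int> &in1,
                                const std::vector<int> &in2, unsigned vi) {
  unsigned e = in1.size();
  std::vector<int> concat(in1);
  concat.insert(concat.end(), in2.begin(), in2.end());
  std::vector<int> out;
  for (unsigned i = 0; i != e; i += 2) {
    out.push_back(concat[i + vi]);     // shuffle index i + vi
    out.push_back(concat[i + e + vi]); // shuffle index i + e + vi
  }
  return out;
}

int main() {
  std::vector<int> a = {0, 1, 2, 3}, b = {4, 5, 6, 7};
  for (unsigned vi = 0; vi != 2; ++vi) {
    for (int v : trnHalf(a, b, vi))
      std::printf("%d ", v);
    std::printf("\n"); // prints "0 4 2 6" then "1 5 3 7", as in the comment
  }
  return 0;
}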