From b00271123e45c8e1fb66e8ec777eb6bb1ee6b311 Mon Sep 17 00:00:00 2001 From: Vikram Date: Wed, 10 Apr 2024 11:53:16 +0000 Subject: [PATCH 1/9] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types --- clang/lib/CodeGen/CGBuiltin.cpp | 4 + clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 4 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 15 +- .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 10 +- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 4 + llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 29 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 190 + llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 3 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 60 + llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 11 +- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 16 +- .../UniformityAnalysis/AMDGPU/intrinsics.ll | 6 +- .../atomic_optimizations_mul_one.ll | 13 +- .../atomic_optimization_split_dt_update.ll | 2 +- .../test/CodeGen/AMDGPU/global-atomic-scan.ll | 48 +- .../AMDGPU/global_atomic_optimizer_fp_rtn.ll | 120 +- .../AMDGPU/global_atomics_iterative_scan.ll | 2 +- .../global_atomics_iterative_scan_fp.ll | 8 +- .../global_atomics_optimizer_fp_no_rtn.ll | 24 +- .../AMDGPU/llvm.amdgcn.readfirstlane.ll | 654 +++- .../AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll | 92 + .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 960 +++++- .../AMDGPU/llvm.amdgcn.readlane.ptr.ll | 105 + .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 3041 ++++++++++++++++- .../AMDGPU/llvm.amdgcn.writelane.ptr.ll | 292 ++ .../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 32 +- 28 files changed, 5485 insertions(+), 265 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll diff --git a/clang/lib/CodeGen/CGBuiltin.cpp 
b/clang/lib/CodeGen/CGBuiltin.cpp index f9ee93049b12d..a275ada586d13 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18479,6 +18479,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType()); return Builder.CreateCall(F, Args); } + case AMDGPU::BI__builtin_amdgcn_readlane: + return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane); + case AMDGPU::BI__builtin_amdgcn_readfirstlane: + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane); case AMDGPU::BI__builtin_amdgcn_div_fixup: case AMDGPU::BI__builtin_amdgcn_div_fixupf: case AMDGPU::BI__builtin_amdgcn_div_fixuph: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index c2ef9ea947e93..83b75f9e2d618 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -306,14 +306,14 @@ void test_ds_bpermute(global int* out, int a, int b) } // CHECK-LABEL: @test_readfirstlane -// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a) +// CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %a) void test_readfirstlane(global int* out, int a) { *out = __builtin_amdgcn_readfirstlane(a); } // CHECK-LABEL: @test_readlane -// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b) +// CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b) void test_readlane(global int* out, int a, int b) { *out = __builtin_amdgcn_readlane(a, b); diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index be8048ca2459c..457566944069e 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2176,26 +2176,23 @@ def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce; def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce; def int_amdgcn_readfirstlane : - ClangBuiltin<"__builtin_amdgcn_readfirstlane">, - Intrinsic<[llvm_i32_ty], 
[llvm_i32_ty], + Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // The lane argument must be uniform across the currently active threads of the // current wave. Otherwise, the result is undefined. def int_amdgcn_readlane : - ClangBuiltin<"__builtin_amdgcn_readlane">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // The value to write and lane select arguments must be uniform across the // currently active threads of the current wave. Otherwise, the result is // undefined. def int_amdgcn_writelane : - ClangBuiltin<"__builtin_amdgcn_writelane">, - Intrinsic<[llvm_i32_ty], [ - llvm_i32_ty, // uniform value to write: returned by the selected lane - llvm_i32_ty, // uniform lane select - llvm_i32_ty // returned by all lanes other than the selected one + Intrinsic<[llvm_any_ty], [ + LLVMMatchType<0>, // uniform value to write: returned by the selected lane + llvm_i32_ty, // uniform lane select + LLVMMatchType<0> // returned by all lanes other than the selected one ], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 1d645002b1fe6..9ac0a52f63fd8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -433,7 +433,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and // combine them with a scalar operation. 
Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty()); V = B.CreateBitCast(V, IntNTy); Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)}); Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)}); @@ -523,10 +523,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); } else { - Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); - Function *WriteLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); + Function *ReadLane = Intrinsic::getDeclaration( + M, Intrinsic::amdgcn_readlane, B.getInt32Ty()); + Function *WriteLane = Intrinsic::getDeclaration( + M, Intrinsic::amdgcn_writelane, B.getInt32Ty()); // On GFX10 all DPP operations are confined to a single row. To get cross- // row operations we have to use permlane or readlane. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index d35a022ad6806..9cd7496ba9f96 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5496,6 +5496,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(LDS) NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD) NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD) + NODE_NAME_CASE(READLANE) + NODE_NAME_CASE(READFIRSTLANE) + NODE_NAME_CASE(WRITELANE) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; NODE_NAME_CASE(LOAD_D16_HI) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 3814b56a4d56a..02c3dcf39e554 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -558,6 +558,10 @@ enum NodeType : unsigned { FPTRUNC_ROUND_UPWARD, FPTRUNC_ROUND_DOWNWARD, + READLANE, + READFIRSTLANE, + WRITELANE, + DUMMY_CHAIN, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, LOAD_D16_HI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 702f6e67c5527..e4f329b200c86 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -342,6 +342,22 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2", def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; +def AMDGPUReadfirstlaneOp : SDTypeProfile<1, 1, [ + SDTCisSameAs<0, 1> +]>; + +def AMDGPUReadlaneOp : SDTypeProfile<1, 2, [ + SDTCisSameAs<0, 1>, SDTCisInt<2> +]>; + +def AMDGPUDWritelaneOp : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3> +]>; + +def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp>; +def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", AMDGPUReadfirstlaneOp>; +def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", 
AMDGPUDWritelaneOp>;
+
 // SI+ export
 def AMDGPUExportOp : SDTypeProfile<0, 8, [
   SDTCisInt<0>, // i8 tgt
@@ -506,3 +522,16 @@ def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc
 def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),
   [(int_amdgcn_perm node:$src0, node:$src1, node:$src2),
    (AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>;
+
+def AMDGPUreadlane : PatFrags<(ops node:$src0, node:$src1),
+  [(int_amdgcn_readlane node:$src0, node:$src1),
+   (AMDGPUreadlane_impl node:$src0, node:$src1)]>;
+
+def AMDGPUreadfirstlane : PatFrags<(ops node:$src),
+  [(int_amdgcn_readfirstlane node:$src),
+   (AMDGPUreadfirstlane_impl node:$src)]>;
+
+def AMDGPUwritelane : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+  [(int_amdgcn_writelane node:$src0, node:$src1, node:$src2),
+   (AMDGPUwritelane_impl node:$src0, node:$src1, node:$src2)]>;
+
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index bd7bf78c4c0bd..6ffc8a20f76fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5387,6 +5387,170 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
   return true;
 }
 
+// Legalize the llvm.amdgcn.readlane, llvm.amdgcn.readfirstlane and
+// llvm.amdgcn.writelane intrinsics for result types other than 32 bits.
+//
+// A 32-bit result is left unchanged. A result narrower than 32 bits is
+// any-extended to S32, run through a single S32 lane op and truncated back.
+// A result whose size is a multiple of 32 bits is split into 32-bit pieces,
+// each piece gets its own S32 lane op, and the pieces are merged back into
+// the original type. Returns false for any other size.
+bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
+                                         MachineInstr &MI,
+                                         Intrinsic::ID IID) const {
+  MachineIRBuilder &B = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *B.getMRI();
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(2).getReg();
+
+  // Emit one S32 lane op of kind IID. Src1 and Src2 are only added as
+  // operands for the intrinsics that take them.
+  auto createLaneOp = [&](Register Src0, Register Src1,
+                          Register Src2) -> Register {
+    auto LaneOp = B.buildIntrinsic(IID, {S32}).addUse(Src0);
+    switch (IID) {
+    case Intrinsic::amdgcn_readfirstlane:
+      return LaneOp.getReg(0);
+    case Intrinsic::amdgcn_readlane:
+      return LaneOp.addUse(Src1).getReg(0);
+    case Intrinsic::amdgcn_writelane:
+      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
+    default:
+      llvm_unreachable("unhandled lane op");
+    }
+  };
+
+  // readlane and writelane take a second source in operand 3; writelane
+  // takes a third one in operand 4.
+  Register Src1, Src2;
+  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
+    Src1 = MI.getOperand(3).getReg();
+    if (IID == Intrinsic::amdgcn_writelane)
+      Src2 = MI.getOperand(4).getReg();
+  }
+
+  LLT Ty = MRI.getType(DstReg);
+  unsigned Size = Ty.getSizeInBits();
+
+  if (Size == 32) {
+    // Already legal
+    return true;
+  }
+
+  if (Size < 32) {
+    // View a non-scalar value as an integer of the same width, then widen it
+    // to 32 bits.
+    auto extendToS32 = [&](Register Src) -> Register {
+      Register SrcCast =
+          MRI.getType(Src).isScalar()
+              ? Src
+              : B.buildBitcast(LLT::scalar(Size), Src).getReg(0);
+      return B.buildAnyExt(S32, SrcCast).getReg(0);
+    };
+
+    Src0 = extendToS32(Src0);
+    if (Src2.isValid())
+      Src2 = extendToS32(Src2);
+
+    Register LaneOpDst = createLaneOp(Src0, Src1, Src2);
+    if (Ty.isScalar())
+      B.buildTrunc(DstReg, LaneOpDst);
+    else {
+      auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOpDst);
+      B.buildBitcast(DstReg, Trunc);
+    }
+
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if ((Size % 32) == 0) {
+    SmallVector<Register, 4> PartialRes;
+    unsigned NumParts = Size / 32;
+    bool IsS16Vec = Ty.isVector() && Ty.getElementType() == S16;
+
+    // Split a source into 32-bit wide pieces. Pointers are converted to
+    // integers of the same width first; vectors of S16 are split into V2S16
+    // pieces.
+    auto unmergeSrc = [&](Register Src) -> MachineInstrBuilder {
+      if (Ty.isPointer()) {
+        // Use the real pointer width rather than assuming 64 bits.
+        auto PtrToInt = B.buildPtrToInt(LLT::scalar(Size), Src);
+        return B.buildUnmerge(S32, PtrToInt);
+      }
+      if (Ty.isPointerVector()) {
+        LLT IntVecTy = Ty.changeElementType(
+            LLT::scalar(Ty.getElementType().getSizeInBits()));
+        auto PtrToInt = B.buildPtrToInt(IntVecTy, Src);
+        return B.buildUnmerge(S32, PtrToInt);
+      }
+      return IsS16Vec ? B.buildUnmerge(V2S16, Src) : B.buildUnmerge(S32, Src);
+    };
+
+    // A V2S16 piece is bitcast to S32 before it is passed to the lane op.
+    auto getS32Part = [&](const MachineInstrBuilder &Parts,
+                          unsigned Idx) -> Register {
+      Register Part = Parts.getReg(Idx);
+      return IsS16Vec ? B.buildBitcast(S32, Part).getReg(0) : Part;
+    };
+
+    MachineInstrBuilder Src0Parts = unmergeSrc(Src0);
+    MachineInstrBuilder Src2Parts;
+    if (Src2.isValid())
+      Src2Parts = unmergeSrc(Src2);
+
+    for (unsigned i = 0; i < NumParts; ++i) {
+      Register Src0Part = getS32Part(Src0Parts, i);
+      Register Src2Part;
+      if (Src2.isValid())
+        Src2Part = getS32Part(Src2Parts, i);
+      PartialRes.push_back(createLaneOp(Src0Part, Src1, Src2Part));
+    }
+
+    if (Ty.isPointerVector()) {
+      unsigned PtrSize = Ty.getElementType().getSizeInBits();
+      SmallVector<Register, 4> PtrElements;
+      if (PtrSize == 32) {
+        // Handle 32 bit pointers
+        for (unsigned i = 0; i < NumParts; i++)
+          PtrElements.push_back(
+              B.buildIntToPtr(Ty.getElementType(), PartialRes[i]).getReg(0));
+      } else {
+        // Handle vectors of pointers wider than 32 bits: NumS32Parts
+        // consecutive S32 results make up one pointer element.
+        SmallVector<Register, 4> PtrParts;
+        unsigned NumS32Parts = PtrSize / 32;
+        unsigned PartIdx = 0;
+        for (unsigned i = 0, j = 1; i < NumParts; i += NumS32Parts, j++) {
+          // Merge S32 components of a pointer element first.
+          for (; PartIdx < (j * NumS32Parts); PartIdx++)
+            PtrParts.push_back(PartialRes[PartIdx]);
+
+          auto MergedPtr =
+              B.buildMergeLikeInstr(LLT::scalar(PtrSize), PtrParts);
+          PtrElements.push_back(
+              B.buildIntToPtr(Ty.getElementType(), MergedPtr).getReg(0));
+          PtrParts.clear();
+        }
+      }
+
+      B.buildMergeLikeInstr(DstReg, PtrElements);
+    } else {
+      if (IsS16Vec) {
+        for (unsigned i = 0; i < NumParts; i++)
+          PartialRes[i] = B.buildBitcast(V2S16, PartialRes[i]).getReg(0);
+      }
+      B.buildMergeLikeInstr(DstReg, PartialRes);
+    }
+
+    MI.eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B) const {
@@ -7330,6 +7494,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     Observer.changedInstr(MI);
     return true;
   }
+  case Intrinsic::amdgcn_readlane:
+  case Intrinsic::amdgcn_writelane:
+  case Intrinsic::amdgcn_readfirstlane:
+    return legalizeLaneOp(Helper, MI, IntrID);
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index e5ba84a74a0f8..40e056154527f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -208,6 +208,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
                             Intrinsic::ID IID) const;
 
+  bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI,
+                      Intrinsic::ID IID) const;
+
   bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
 
   bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0a3a56e9b3a0b..6c23bdf09974b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6086,6 +6086,74 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
       DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
 }
 
+// Lower llvm.amdgcn.readlane, llvm.amdgcn.readfirstlane and
+// llvm.amdgcn.writelane whose result type is not 32 bits wide.
+//
+// Returns an empty SDValue for 32-bit results and for sizes that are neither
+// below 32 bits nor a multiple of 32 bits. Narrower values are bitcast to an
+// integer, any-extended to i32, passed through one i32 lane op and converted
+// back. Multiples of 32 bits are bitcast to a vector of i32, and the vector
+// lane op is unrolled into one i32 lane op per element.
+static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
+                           SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  unsigned ValSize = VT.getSizeInBits();
+  unsigned IntrinsicID = N->getConstantOperandVal(0);
+  SDValue Src0 = N->getOperand(1);
+  SDLoc SL(N);
+
+  // Build the target node matching the operands that are present: all three
+  // for writelane, two for readlane, one for readfirstlane.
+  auto createLaneOp = [&DAG, &SL](SDValue Src0, SDValue Src1, SDValue Src2,
+                                  MVT VT) -> SDValue {
+    if (Src2)
+      return DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2});
+    if (Src1)
+      return DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1});
+    return DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0});
+  };
+
+  // readlane and writelane carry a second source; writelane a third one.
+  SDValue Src1, Src2;
+  if (IntrinsicID == Intrinsic::amdgcn_readlane ||
+      IntrinsicID == Intrinsic::amdgcn_writelane) {
+    Src1 = N->getOperand(2);
+    if (IntrinsicID == Intrinsic::amdgcn_writelane)
+      Src2 = N->getOperand(3);
+  }
+
+  if (ValSize == 32) {
+    // Already legal
+    return SDValue();
+  }
+
+  if (ValSize < 32) {
+    // Only compute the same-width integer type where it is used.
+    MVT IntVT = MVT::getIntegerVT(ValSize);
+    Src0 = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntVT, Src0), SL, MVT::i32);
+    if (Src2)
+      Src2 = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntVT, Src2), SL, MVT::i32);
+
+    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
+    SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
+    return DAG.getBitcast(VT, Trunc);
+  }
+
+  if ((ValSize % 32) == 0) {
+    MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
+    Src0 = DAG.getBitcast(VecVT, Src0);
+    if (Src2)
+      Src2 = DAG.getBitcast(VecVT, Src2);
+
+    // Create the lane op on the whole vector, then scalarize it.
+    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
+    SDValue UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode());
+    return DAG.getBitcast(VT, UnrolledLaneOp);
+  }
+
+  return SDValue();
+}
+
 void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
@@ -8553,6 +8621,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_addrspacecast_nonnull:
     return lowerADDRSPACECAST(Op, DAG);
+  case Intrinsic::amdgcn_readlane:
+  case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_writelane:
+    return lowerLaneOp(*this, Op.getNode(), DAG);
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f9e811f54d05e..628fa438f696a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++
b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3400,7 +3400,7 @@ def : GCNPat< // FIXME: Should also do this for readlane, but tablegen crashes on // the ignored src1. def : GCNPat< - (int_amdgcn_readfirstlane (i32 imm:$src)), + (i32 (AMDGPUreadfirstlane (i32 imm:$src))), (S_MOV_B32 SReg_32:$src) >; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 4a56fad0cd603..9c5d6a7bf6d0b 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -112,7 +112,7 @@ class getVOP1Pat : LetDummies { !if(P.HasOMod, [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$vdst, (node P.Src0RC32:$src0))] + [(set P.DstVT:$vdst, (node (P.Src0VT P.Src0RC32:$src0)))] ) ); } @@ -243,11 +243,16 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> { // FIXME: Specify SchedRW for READFIRSTLANE_B32 // TODO: There is VOP3 encoding also def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE, - getVOP1Pat.ret, 1> { + [], 1> { let isConvergent = 1; } +foreach vt = Reg32Types.types in { + def : GCNPat<(vt (AMDGPUreadfirstlane (vt VRegOrLdsSrc_32:$src0))), + (V_READFIRSTLANE_B32 (vt VRegOrLdsSrc_32:$src0)) + >; +} + let isReMaterializable = 1 in { let SchedRW = [WriteDoubleCvt] in { // OMod clears exceptions when set in this instruction diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index d2af1753d5503..b1df57320cfdd 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -780,14 +780,22 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, // These are special and do not read the exec mask. 
let isConvergent = 1, Uses = [] in { -def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, - [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>; +def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,[]>; let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { -def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, - [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>; +def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []>; } // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in } // End isConvergent = 1 +foreach vt = Reg32Types.types in { + def : GCNPat<(vt (AMDGPUreadlane vt:$src0, i32:$src1)), + (V_READLANE_B32 VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1) + >; + + def : GCNPat<(vt (AMDGPUwritelane vt:$src0, i32:$src1, vt:$src2)), + (V_WRITELANE_B32 SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$src2) + >; +} + let isReMaterializable = 1 in { defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>; defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32, add_ctpop>; diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 26c85e83b53ad..74d2f53d7b317 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -56,9 +56,9 @@ define amdgpu_kernel void @mov_dpp8(ptr addrspace(1) %out, i32 %in) #0 { ret void } -; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2) +; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2) define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 { - %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2) + %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2) store i32 %tmp0, ptr addrspace(1) %out ret void } @@ -237,7 +237,7 @@ declare i32 
@llvm.amdgcn.permlanex16.var(i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1 -declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1 +declare i32 @llvm.amdgcn.writelane.i32(i32, i32, i32) #1 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1 declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll index 220dc70165e87..bdfafa89cd047 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s ; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s @@ -74,7 +73,7 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) { ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ] -; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]]) +; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]]) ; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]] ; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, 
i32 0) ; IR-NEXT: ret void @@ -172,7 +171,7 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) { ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ] -; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]]) +; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]]) ; IR-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]] ; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0) ; IR-NEXT: ret void @@ -273,7 +272,7 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) { ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = and i32 [[TMP5]], 1 ; IR-NEXT: [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]] ; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP16]], i32 0, i32 0, i32 0) @@ -374,7 +373,7 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) { ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ] -; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]]) +; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]]) ; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]] ; IR-NEXT: [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128 ; IR-NEXT: [[ARG_VEC:%.*]] = bitcast i128 [[ARG_INT]] to <4 x i32> @@ -476,7 +475,7 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) { ; IR-NEXT: br label [[TMP11]] ; IR: 11: ; IR-NEXT: [[TMP12:%.*]] = 
phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ] -; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]]) +; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]]) ; IR-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]] ; IR-NEXT: [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128 ; IR-NEXT: [[ARG_VEC:%.*]] = bitcast i128 [[ARG_INT]] to <4 x i32> @@ -581,7 +580,7 @@ define amdgpu_cs void @atomic_ptr_xor_and_format(ptr addrspace(8) inreg %arg) { ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = and i32 [[TMP5]], 1 ; IR-NEXT: [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]] ; IR-NEXT: [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll index c07cd4e493b9a..019f76aa44a87 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll @@ -48,7 +48,7 @@ define amdgpu_kernel void @ham(ptr addrspace(4) %arg) { ; CHECK-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP6]], [[BB7]] ], [ [[TMP16:%.*]], [[COMPUTELOOP]] ] ; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[PHI]], i32 [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[PHI]], i32 [[TMP11]]) ; CHECK-NEXT: [[TMP13]] = add i32 [[ACCUMULATOR]], [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = shl i64 1, [[TMP10]] ; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], -1 diff --git 
a/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll index 6b47f81bccb71..6c61c837881c4 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll @@ -130,7 +130,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 @@ -193,7 +193,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 @@ -251,7 +251,7 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 @@ -310,7 +310,7 @@ define amdgpu_kernel void 
@atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]] ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 @@ -364,7 +364,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] ; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 @@ -421,7 +421,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] ; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 @@ -473,7 +473,7 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: 
[[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] ; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 @@ -526,7 +526,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]] ; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4 @@ -586,7 +586,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 @@ -649,7 +649,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 @@ -707,7 +707,7 @@ 
define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 @@ -766,7 +766,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr ; IR-NEXT: br label [[TMP12]] ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] -; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]]) +; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]] ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4 @@ -820,7 +820,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] @@ -878,7 +878,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 
@llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] @@ -931,7 +931,7 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] @@ -985,7 +985,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] @@ -1040,7 +1040,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] ; IR-NEXT: 
[[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] @@ -1151,7 +1151,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] @@ -1205,7 +1205,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] @@ -1260,7 +1260,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, 
ptr ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] @@ -1318,7 +1318,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] @@ -1371,7 +1371,7 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] @@ -1425,7 +1425,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; IR-NEXT: br label [[TMP9]] ; IR: 9: ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ] -; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 
[[TMP10]]) +; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]]) ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]] ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll index b71728096093c..baaf50377338c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll @@ -29,7 +29,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_uns ; IR: 16: ; IR-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] ; IR-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) +; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) ; IR-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float ; IR-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float ; IR-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] @@ -62,7 +62,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) ; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP28:%.*]] ; IR-ITERATIVE-NEXT: br label [[TMP18]] @@ -76,11 +76,11 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 
@llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) ; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 ; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) ; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float ; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 ; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) ; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float ; IR-ITERATIVE-NEXT: [[TMP29]] = fadd float [[ACCUMULATOR]], [[TMP24]] ; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] @@ -120,7 +120,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco ; IR-DPP-NEXT: [[TMP24:%.*]] = fadd float [[TMP22]], [[TMP23]] ; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) ; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) ; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float ; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) ; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0 @@ -131,7 +131,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_sco ; IR-DPP: 33: ; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ] ; IR-DPP-NEXT: 
[[TMP35:%.*]] = bitcast float [[TMP34]] to i32 -; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) +; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) ; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float ; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) ; IR-DPP-NEXT: [[TMP39:%.*]] = fadd float [[TMP37]], [[TMP38]] @@ -167,7 +167,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un ; IR-ITERATIVE: 16: ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] ; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float ; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] @@ -199,7 +199,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_un ; IR-DPP: 16: ; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] ; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float ; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", 
metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] @@ -232,7 +232,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: br label [[TMP18]] @@ -246,11 +246,11 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 ; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float ; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 ; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 
@llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float ; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] @@ -290,7 +290,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un ; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float ; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0 @@ -301,7 +301,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_un ; IR-DPP: 33: ; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ] ; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32 -; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float ; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP39:%.*]] = call float 
@llvm.experimental.constrained.fadd.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] @@ -337,7 +337,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str ; IR-ITERATIVE: 16: ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] ; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float ; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] @@ -369,7 +369,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str ; IR-DPP: 16: ; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] ; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float ; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] @@ -402,7 +402,7 @@ define amdgpu_ps float 
@global_atomic_fsub_uni_address_div_value_agent_scope_str ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: br label [[TMP18]] @@ -416,11 +416,11 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 ; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float ; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 ; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float ; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata 
!"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] @@ -460,7 +460,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str ; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float ; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0 @@ -471,7 +471,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str ; IR-DPP: 33: ; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ] ; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32 -; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float ; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] @@ -503,7 +503,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_uns ; IR: 12: ; IR-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], 
[ [[TMP11]], [[TMP10]] ] ; IR-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) +; IR-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) ; IR-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float ; IR-NEXT: [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float ; IR-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]] @@ -536,7 +536,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) ; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP28:%.*]]) ; IR-ITERATIVE-NEXT: br label [[TMP18]] @@ -550,11 +550,11 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) ; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 ; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) ; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float ; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 ; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) +; 
IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) ; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float ; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP24]]) ; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] @@ -594,7 +594,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns ; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.minnum.f32(float [[TMP22]], float [[TMP23]]) ; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) ; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) ; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float ; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) ; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0 @@ -605,7 +605,7 @@ define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_uns ; IR-DPP: 33: ; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ] ; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32 -; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) +; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) ; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float ; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) ; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.minnum.f32(float [[TMP37]], float [[TMP38]]) @@ -637,7 +637,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ 
[[TMP11]], [[TMP10]] ] ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]] @@ -665,7 +665,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_uns ; IR-DPP: 12: ; IR-DPP-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] ; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-DPP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float ; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0xFFF0000000000000, float [[VAL]] @@ -698,7 +698,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float ; 
IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: br label [[TMP18]] @@ -712,11 +712,11 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 ; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float ; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 ; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float ; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] @@ -756,7 +756,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns ; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP22]], float [[TMP23]], metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float 
[[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float ; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP28]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0 @@ -767,7 +767,7 @@ define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_uns ; IR-DPP: 33: ; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ] ; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32 -; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float ; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP37]], float [[TMP38]], metadata !"fpexcept.strict") #[[ATTR8]] @@ -803,7 +803,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_st ; IR-ITERATIVE: 16: ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] ; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float ; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call float 
@llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] @@ -835,7 +835,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_st ; IR-DPP: 16: ; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] ; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP18]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float ; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[VAL]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] @@ -868,7 +868,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st ; IR-ITERATIVE: 12: ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP14]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP28:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: br label [[TMP18]] @@ -882,11 +882,11 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 
true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 ; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP22]], i32 [[TMP21]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float ; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 ; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 -; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float ; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] @@ -926,7 +926,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st ; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP26]], i32 63) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float ; IR-DPP-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float 
[[TMP28]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP8]], 0 @@ -937,7 +937,7 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st ; IR-DPP: 33: ; IR-DPP-NEXT: [[TMP34:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP32]], [[TMP31]] ] ; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast float [[TMP34]] to i32 -; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP35]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP35]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float ; IR-DPP-NEXT: [[TMP38:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP39:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP37]], float [[TMP38]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] @@ -1084,8 +1084,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_s ; IR-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 ; IR-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 ; IR-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) -; IR-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) +; IR-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) +; IR-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) ; IR-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 ; IR-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 ; IR-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double @@ -1136,8 +1136,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_ ; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 ; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; 
IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 ; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 ; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double @@ -1174,8 +1174,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_ ; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 ; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 ; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 ; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 ; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double @@ -1226,8 +1226,8 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s ; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 ; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 
@llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 ; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 ; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double @@ -1264,8 +1264,8 @@ define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_s ; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 ; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 ; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 ; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 ; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double @@ -1312,8 +1312,8 @@ define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_s ; IR-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 ; IR-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32 ; IR-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 -; IR-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) -; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) +; IR-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]]) +; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 
[[TMP17]]) ; IR-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 ; IR-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1 ; IR-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double @@ -1360,8 +1360,8 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_ ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 ; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32 ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 ; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1 ; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double @@ -1394,8 +1394,8 @@ define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_ ; IR-DPP-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 ; IR-DPP-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32 ; IR-DPP-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 -; IR-DPP-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP15]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP17]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 ; IR-DPP-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> 
[[TMP20]], i32 [[TMP19]], i32 1 ; IR-DPP-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double @@ -1446,8 +1446,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_ ; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 ; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 ; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 ; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double @@ -1484,8 +1484,8 @@ define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_ ; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 ; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 ; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 -; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]] -; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP21]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 ; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 ; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double diff --git 
a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll index f954560d0f5ca..4b4c99b3cd14c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll @@ -83,7 +83,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, ptr addrspace( ; IR-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP6]], [[ENTRY]] ], [ [[TMP16:%.*]], [[COMPUTELOOP]] ] ; IR-NEXT: [[TMP10:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) ; IR-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 -; IR-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[VALUE]], i32 [[TMP11]]) +; IR-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[VALUE]], i32 [[TMP11]]) ; IR-NEXT: [[TMP13]] = add i32 [[ACCUMULATOR]], [[TMP12]] ; IR-NEXT: [[TMP14:%.*]] = shl i64 1, [[TMP10]] ; IR-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], -1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll index 86e3d9338e078..38823681d1bb5 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll @@ -69,7 +69,7 @@ define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) # ; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) ; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]]) +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP13]], i32 [[TMP12]]) ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float ; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]] ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] @@ 
-107,7 +107,7 @@ define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) # ; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false) ; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]] ; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32 -; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP23]], i32 63) +; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP23]], i32 63) ; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float ; IR-DPP-NEXT: [[TMP26:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) ; IR-DPP-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP6]], 0 @@ -191,7 +191,7 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) # ; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) ; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 ; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]]) +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP13]], i32 [[TMP12]]) ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float ; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]] ; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] @@ -229,7 +229,7 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) # ; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false) ; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]] ; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32 -; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP23]], i32 63) +; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 
[[TMP23]], i32 63) ; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float ; IR-DPP-NEXT: [[TMP26:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP25]]) ; IR-DPP-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP6]], 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll index b9234f47df192..83453354320fe 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll @@ -61,7 +61,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scop ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 ; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) ; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float ; IR-ITERATIVE-NEXT: [[TMP19]] = fadd float [[ACCUMULATOR]], [[TMP18]] ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] @@ -100,7 +100,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scop ; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) ; IR-DPP-NEXT: [[TMP24:%.*]] = fadd float [[TMP22]], [[TMP23]] ; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) +; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) ; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float ; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) ; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq 
i32 [[TMP8]], 0 @@ -196,7 +196,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_uns ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 ; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float ; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] @@ -235,7 +235,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_uns ; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float ; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0 @@ -331,7 +331,7 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) 
#[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 ; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float ; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] @@ -370,7 +370,7 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri ; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float ; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0 @@ -438,7 +438,7 @@ define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsa ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 ; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 
[[TMP16]], i32 [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) ; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float ; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP18]]) ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] @@ -477,7 +477,7 @@ define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsa ; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0x7FF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) ; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.minnum.f32(float [[TMP22]], float [[TMP23]]) ; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) +; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) ; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float ; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) ; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0 @@ -565,7 +565,7 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsa ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 ; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float ; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.maxnum.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] @@ 
-604,7 +604,7 @@ define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsa ; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0xFFF0000000000000, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.maxnum.f32(float [[TMP22]], float [[TMP23]], metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float ; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0 @@ -700,7 +700,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str ; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 ; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float ; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] ; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] @@ -739,7 +739,7 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str ; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float -0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 
15, i1 false) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] ; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32 -; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TMP25]], i32 63) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float ; IR-DPP-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP27]]) #[[ATTR8]] ; IR-DPP-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP8]], 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index 0284f44f5f14d..732489f22c36f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -1,65 +1,387 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s declare i32 @llvm.amdgcn.readfirstlane(i32) #0 +declare i64 @llvm.amdgcn.readfirstlane.i64(i64) #0 +declare double @llvm.amdgcn.readfirstlane.f64(double) #0 -; CHECK-LABEL: {{^}}test_readfirstlane: -; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 -define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) +define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) #1 { +; CHECK-SDAG-LABEL: test_readfirstlane_i32: +; CHECK-SDAG: ; %bb.0: +; 
CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %src) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_readfirstlane_imm: -; CHECK: s_mov_b32 [[SGPR_VAL:s[0-9]]], 32 -; CHECK-NOT: [[SGPR_VAL]] -; CHECK: ; use [[SGPR_VAL]] -define amdgpu_kernel void @test_readfirstlane_imm(ptr addrspace(1) %out) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) +define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) #1 { +; CHECK-SDAG-LABEL: test_readfirstlane_i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s5 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: 
s_setpc_b64 s[30:31] + %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %src) + store i64 %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) #1 { +; CHECK-SDAG-LABEL: test_readfirstlane_f64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s5 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double %src) + store double %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readfirstlane_imm_i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_mov_b32 s0, 32 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s0 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readfirstlane_imm_i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_mov_b32 s0, 32 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s0 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_endpgm + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32) call void asm sideeffect "; use $0", "s"(i32 %readfirstlane) ret void } -; CHECK-LABEL: 
{{^}}test_readfirstlane_imm_fold: -; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32 -; CHECK-NOT: [[VVAL]] -; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] -define amdgpu_kernel void @test_readfirstlane_imm_fold(ptr addrspace(1) %out) #1 { - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) +define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readfirstlane_imm_i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_mov_b64 s[0:1], 32 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[0:1] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readfirstlane_imm_i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_mov_b64 s[0:1], 32 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[0:1] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_endpgm + %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 32) + call void asm sideeffect "; use $0", "s"(i64 %readfirstlane) + ret void +} + +define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readfirstlane_imm_f64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_mov_b32 s0, 0 +; CHECK-SDAG-NEXT: s_mov_b32 s1, 0x40400000 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[0:1] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readfirstlane_imm_f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_mov_b32 s0, 0 +; CHECK-GISEL-NEXT: s_mov_b32 s1, 0x40400000 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[0:1] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_endpgm + %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double 32.0) + call void asm sideeffect "; use $0", "s"(double %readfirstlane) + ret void +} + +define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32: +; CHECK-SDAG: ; %bb.0: +; 
CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 +; CHECK-GISEL-NEXT: s_endpgm + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_readfirstlane_m0: -; CHECK: s_mov_b32 m0, -1 -; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0 -; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] +define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-GISEL-NEXT: s_endpgm + %readfirstlane = call 
i64 @llvm.amdgcn.readfirstlane.i64(i64 32) + store i64 %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 +; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-GISEL-NEXT: s_endpgm + %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double 32.0) + store double %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readfirstlane_m0: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readfirstlane_m0: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: 
;;#ASMSTART +; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 +; CHECK-GISEL-NEXT: s_endpgm %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"() %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_readfirstlane_copy_from_sgpr: -; CHECK: ;;#ASMSTART -; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]] -; CHECK: ;;#ASMEND -; CHECK-NOT: [[SGPR]] -; CHECK-NOT: readfirstlane -; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]] -; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]] -define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 +; CHECK-GISEL-NEXT: s_endpgm %sgpr = call i32 asm "s_mov_b32 
$0, 0", "=s"() - %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr) + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sgpr) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } -; Make sure this doesn't crash. -; CHECK-LABEL: {{^}}test_readfirstlane_fi: -; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 0 +define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-GISEL-NEXT: s_endpgm + %sgpr = call i64 asm "s_mov_b64 $0, 0", "=s"() + %readfirstlane = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %sgpr) + store i64 %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: s_mov_b64 
s[2:3], 0 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-GISEL-NEXT: s_endpgm + %sgpr = call double asm "s_mov_b64 $0, 0", "=s"() + %readfirstlane = call double @llvm.amdgcn.readfirstlane.f64(double %sgpr) + store double %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readfirstlane_fi: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s9 +; CHECK-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-SDAG-NEXT: s_mov_b32 s4, 0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readfirstlane_fi: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_add_u32 s0, s0, s9 +; CHECK-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-GISEL-NEXT: s_mov_b32 s4, 0 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_endpgm %alloca = alloca i32, addrspace(5) %int = ptrtoint ptr addrspace(5) %alloca to i32 %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int) @@ -67,5 +389,269 @@ define amdgpu_kernel void @test_readfirstlane_fi(ptr 
addrspace(1) %out) #1 { ret void } +define void @test_readfirstlane_half(ptr addrspace(1) %out, half %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_half: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_half: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call half @llvm.amdgcn.readfirstlane.f16(half %src) + call void asm sideeffect "; use $0", "s"(half %x) + ret void +} + +define void @test_readfirstlane_float(ptr addrspace(1) %out, float %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_float: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_float: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call float @llvm.amdgcn.readfirstlane.f32(float %src) + call void asm sideeffect "; use $0", "s"(float %x) + ret void +} + +define void @test_readfirstlane_bfloat(ptr addrspace(1) %out, bfloat %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_bfloat: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; 
CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_bfloat: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call bfloat @llvm.amdgcn.readfirstlane.bf16(bfloat %src) + call void asm sideeffect "; use $0", "s"(bfloat %x) + ret void +} + +define void @test_readfirstlane_i16(ptr addrspace(1) %out, i16 %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_i16: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0xffff +; CHECK-SDAG-NEXT: v_and_b32_e32 v0, s4, v0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use v0 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_i16: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call i16 @llvm.amdgcn.readfirstlane.i16(i16 %src) + call void asm sideeffect "; use $0", "s"(i16 %x) + ret void +} + +define void @test_readfirstlane_v2f16(ptr addrspace(1) %out, <2 x half> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v2f16: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v2f16: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> %src) + call void asm sideeffect "; use $0", "s"(<2 x half> %x) + ret void +} + +define void @test_readfirstlane_v2f32(ptr addrspace(1) %out, <2 x float> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v2f32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:5] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v2f32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:5] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <2 x float> @llvm.amdgcn.readfirstlane.v2f32(<2 x float> %src) + call void asm sideeffect "; use $0", "s"(<2 x float> %x) + ret void +} + +define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v7i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:10] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: 
test_readfirstlane_v7i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:10] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <7 x i32> @llvm.amdgcn.readfirstlane.v7i32(<7 x i32> %src) + call void asm sideeffect "; use $0", "s"(<7 x i32> %x) + ret void +} + +define void @test_readfirstlane_p0(ptr addrspace(1) %out, ptr %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_p0: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:5] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_p0: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:5] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call ptr @llvm.amdgcn.readfirstlane.p0(ptr %src) + call void asm sideeffect "; use $0", "s"(ptr %x) + ret void +} + +define void @test_readfirstlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v3p0: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 
s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:9] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v3p0: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:9] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x ptr> @llvm.amdgcn.readfirstlane.v3p0(<3 x ptr> %src) + call void asm sideeffect "; use $0", "s"(<3 x ptr> %x) + ret void +} + +define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v8i16: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:7] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_v8i16: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:7] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x 
= call <8 x i16> @llvm.amdgcn.readfirstlane.v8i16(<8 x i16> %src) + call void asm sideeffect "; use $0", "s"(<8 x i16> %x) + ret void +} + attributes #0 = { nounwind readnone convergent } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll new file mode 100644 index 0000000000000..588f239606f52 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s + +define void @test_readfirstlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_p3: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call ptr addrspace(3) @llvm.amdgcn.readfirstlane.p3(ptr addrspace(3) %src) + call void asm sideeffect "; use $0", "s"(ptr addrspace(3) %x) + ret void +} + +define void @test_readfirstlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v3p3: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readfirstlane.v3p3(<3 x ptr addrspace(3)> %src) + call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(3)> %x) + ret void +} + +define void 
@test_readfirstlane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_p5: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call ptr addrspace(5) @llvm.amdgcn.readfirstlane.p5(ptr addrspace(5) %src) + call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %x) + ret void +} + +define void @test_readfirstlane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v3p5: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x ptr addrspace(5)> @llvm.amdgcn.readfirstlane.v3p5(<3 x ptr addrspace(5)> %src) + call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(5)> %x) + ret void +} + +define void @test_readfirstlane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_p6: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call ptr addrspace(6) @llvm.amdgcn.readfirstlane.p6(ptr addrspace(6) %src) + call void asm sideeffect "; use $0", "s"(ptr addrspace(6) %x) + ret void +} + +define void @test_readfirstlane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v3p6: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x ptr addrspace(6)> @llvm.amdgcn.readfirstlane.v3p6(<3 x ptr addrspace(6)> %src) + call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(6)> %x) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 51465f6bd10ce..71cd3db81addd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -1,82 +1,966 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck --check-prefix=CHECK-GISEL -enable-var-scope %s -declare i32 @llvm.amdgcn.readlane(i32, i32) #0 +declare i32 @llvm.amdgcn.readlane.i32(i32, i32) #0 +declare i64 @llvm.amdgcn.readlane.i64(i64, i32) #0 +declare double @llvm.amdgcn.readlane.f64(double, i32) #0 -; CHECK-LABEL: {{^}}test_readlane_sreg_sreg: -; CHECK-NOT: v_readlane_b32 -define amdgpu_kernel void @test_readlane_sreg_sreg(i32 %src0, i32 %src1) #1 { - %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1) +define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 { +; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s0 +; CHECK-SDAG-NEXT: 
;;#ASMEND +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s0 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_endpgm + %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %src0, i32 %src1) call void asm sideeffect "; use $0", "s"(i32 %readlane) ret void } -; CHECK-LABEL: {{^}}test_readlane_vreg_sreg: -; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -define amdgpu_kernel void @test_readlane_vreg_sreg(i32 %src0, i32 %src1) #1 { +define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 { +; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[0:1] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[0:1] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_endpgm + %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %src0, i32 %src1) + call void asm sideeffect "; use $0", "s"(i64 %readlane) + ret void +} + +define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) #1 { +; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_f64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[0:1] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[0:1] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_endpgm + %readlane = call double @llvm.amdgcn.readlane.f64(double %src0, i32 %src1) + call void asm sideeffect "; use $0", "s"(double %readlane) + ret void +} + +define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 { +; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; def v0 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s0 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dword s0, s[4:5], 0x4 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; def v0 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readlane_b32 s0, v0, s0 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s0 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_endpgm %vgpr = call i32 asm sideeffect "; def $0", "=v"() - %readlane = call i32 @llvm.amdgcn.readlane(i32 %vgpr, i32 %src1) + %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %vgpr, i32 %src1) call void asm sideeffect "; use $0", "s"(i32 %readlane) ret void } -; CHECK-LABEL: {{^}}test_readlane_imm_sreg: -; CHECK-NOT: v_readlane_b32 -define amdgpu_kernel void @test_readlane_imm_sreg(ptr addrspace(1) %out, i32 %src1) #1 { - %readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1) +define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 { +; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-SDAG-NEXT: ;;#ASMSTART 
+; CHECK-SDAG-NEXT: ; def v[0:1] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0 +; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[0:1] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dword s1, s[4:5], 0x8 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; def v[0:1] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readlane_b32 s0, v0, s1 +; CHECK-GISEL-NEXT: v_readlane_b32 s1, v1, s1 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[0:1] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_endpgm + %vgpr = call i64 asm sideeffect "; def $0", "=v"() + %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %vgpr, i32 %src1) + call void asm sideeffect "; use $0", "s"(i64 %readlane) + ret void +} + +define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) #1 { +; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_f64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; def v[0:1] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0 +; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[0:1] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dword s1, s[4:5], 0x8 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; def v[0:1] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readlane_b32 s0, v0, s1 +; CHECK-GISEL-NEXT: v_readlane_b32 s1, v1, s1 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[0:1] 
+; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_endpgm + %vgpr = call double asm sideeffect "; def $0", "=v"() + %readlane = call double @llvm.amdgcn.readlane.f64(double %vgpr, i32 %src1) + call void asm sideeffect "; use $0", "s"(double %readlane) + ret void +} + +define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { +; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 +; CHECK-GISEL-NEXT: s_endpgm + %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 32, i32 %src1) store i32 %readlane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_readlane_vregs: -; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}} -; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]] -define amdgpu_kernel void @test_readlane_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 { +; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; 
CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-GISEL-NEXT: s_endpgm + %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 32, i32 %src1) + store i64 %readlane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 { +; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_imm_sreg_f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 +; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-GISEL-NEXT: s_endpgm + %readlane = call double @llvm.amdgcn.readlane.f64(double 32.0, i32 %src1) + store double %readlane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; CHECK-SDAG-LABEL: test_readlane_vregs_i32: +; CHECK-SDAG: ; %bb.0: +; 
CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v1 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: flat_store_dword v[2:3], v0 +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_vregs_i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 +; CHECK-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid %args = load <2 x i32>, ptr addrspace(1) %gep.in %value = extractelement <2 x i32> %args, i32 0 %lane = extractelement <2 x i32> %args, i32 1 - %readlane = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %lane) + %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %value, i32 %lane) store i32 %readlane, ptr addrspace(1) %out, align 4 ret void } -; 
TODO: m0 should be folded. -; CHECK-LABEL: {{^}}test_readlane_m0_sreg: -; CHECK: s_mov_b32 m0, -1 -; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0 -; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] +define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; CHECK-SDAG-LABEL: test_readlane_vregs_i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-SDAG-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v4, s1 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0 +; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_vregs_i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3 +; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, s3 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: 
v_mov_b32_e32 v2, s0 +; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-GISEL-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid + %args = load <2 x i64>, ptr addrspace(1) %gep.in + %value = extractelement <2 x i64> %args, i32 0 + %lane = extractelement <2 x i64> %args, i32 1 + %lane32 = trunc i64 %lane to i32 + %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %value, i32 %lane32) + store i64 %readlane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; CHECK-SDAG-LABEL: test_readlane_vregs_f64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-SDAG-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v4, s1 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0 +; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_vregs_f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; 
CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3 +; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, s3 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-GISEL-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr <2 x double>, ptr addrspace(1) %in, i32 %tid + %args = load <2 x double>, ptr addrspace(1) %gep.in + %value = extractelement <2 x double> %args, i32 0 + %lane = extractelement <2 x double> %args, i32 1 + %lane_cast = bitcast double %lane to i64 + %lane32 = trunc i64 %lane_cast to i32 + %readlane = call double @llvm.amdgcn.readlane.f64(double %value, i32 %lane32) + store double %readlane, ptr addrspace(1) %out, align 4 + ret void +} + define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src1) #1 { +; CHECK-SDAG-LABEL: test_readlane_m0_sreg: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_m0_sreg: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 +; CHECK-GISEL-NEXT: 
s_endpgm %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"() %readlane = call i32 @llvm.amdgcn.readlane(i32 %m0, i32 %src1) store i32 %readlane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_readlane_vgpr_imm: -; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32 -define amdgpu_kernel void @test_readlane_vgpr_imm(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; def v0 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; def v0 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 +; CHECK-GISEL-NEXT: s_endpgm %vgpr = call i32 asm sideeffect "; def $0", "=v"() - %readlane = call i32 @llvm.amdgcn.readlane(i32 %vgpr, i32 32) #0 + %readlane = call i32 @llvm.amdgcn.readlane.i32(i32 %vgpr, i32 32) #0 store i32 %readlane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_readlane_copy_from_sgpr: -; CHECK: ;;#ASMSTART -; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]] -; CHECK: ;;#ASMEND -; CHECK-NOT: [[SGPR]] -; CHECK-NOT: readlane -; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]] -; CHECK: flat_store_dword 
v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]] -define amdgpu_kernel void @test_readlane_copy_from_sgpr(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; def v[0:1] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 +; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; def v[0:1] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 +; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-GISEL-NEXT: s_endpgm + %vgpr = call i64 asm sideeffect "; def $0", "=v"() + %readlane = call i64 @llvm.amdgcn.readlane.i64(i64 %vgpr, i32 32) #0 + store i64 %readlane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; def v[0:1] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 +; CHECK-SDAG-NEXT: 
v_readlane_b32 s3, v0, 32 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; def v[0:1] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 +; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-GISEL-NEXT: s_endpgm + %vgpr = call double asm sideeffect "; def $0", "=v"() + %readlane = call double @llvm.amdgcn.readlane.f64(double %vgpr, i32 32) #0 + store double %readlane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; 
CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 +; CHECK-GISEL-NEXT: s_endpgm %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"() - %readfirstlane = call i32 @llvm.amdgcn.readlane(i32 %sgpr, i32 7) + %readfirstlane = call i32 @llvm.amdgcn.readlane.i32(i32 %sgpr, i32 7) store i32 %readfirstlane, ptr addrspace(1) %out, align 4 ret void } +define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-GISEL-NEXT: s_endpgm + %sgpr = call i64 asm "s_mov_b64 $0, 0", "=s"() + %readfirstlane = call i64 @llvm.amdgcn.readlane.i64(i64 %sgpr, i32 7) + store i64 %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 { +; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: s_endpgm +; +; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-GISEL-NEXT: s_endpgm + %sgpr = call double asm "s_mov_b64 $0, 0", "=s"() + %readfirstlane = call double @llvm.amdgcn.readlane.f64(double %sgpr, i32 7) + store double %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_readlane_half(ptr addrspace(1) %out, half %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_half: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_half: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ;;#ASMEND +; 
CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call half @llvm.amdgcn.readlane.f16(half %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(half %x) + ret void +} + +define void @test_readlane_float(ptr addrspace(1) %out, float %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_float: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_float: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call float @llvm.amdgcn.readlane.f32(float %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(float %x) + ret void +} + +define void @test_readlane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_bfloat: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_bfloat: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: 
s_setpc_b64 s[30:31] + %x = call bfloat @llvm.amdgcn.readlane.bf16(bfloat %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(bfloat %x) + ret void +} + +define void @test_readlane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_i16: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0xffff +; CHECK-SDAG-NEXT: s_nop 2 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: v_and_b32_e32 v0, s4, v0 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use v0 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_i16: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call i16 @llvm.amdgcn.readlane.i16(i16 %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(i16 %x) + ret void +} + +define void @test_readlane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v2f16: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_v2f16: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; 
CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <2 x half> @llvm.amdgcn.readlane.v2f16(<2 x half> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<2 x half> %x) + ret void +} + +define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v2f32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:5] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_v2f32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v4 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s5 +; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s5 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:5] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <2 x float> @llvm.amdgcn.readlane.v2f32(<2 x float> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<2 x float> %x) + ret void +} + +define void @test_readlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v7i32: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: 
;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:10] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_v7i32: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v9 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s10 +; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s10 +; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s10 +; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s10 +; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s10 +; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s10 +; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s10 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:10] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <7 x i32> @llvm.amdgcn.readlane.v7i32(<7 x i32> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<7 x i32> %x) + ret void +} + +define void @test_readlane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_p0: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:5] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_p0: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v4 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s5 +; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s5 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:5] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call ptr @llvm.amdgcn.readlane.p0(ptr %src, i32 %src1) + call void asm sideeffect "; use $0", 
"s"(ptr %x) + ret void +} + +define void @test_readlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v3p0: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v8 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:9] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_v3p0: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v8 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s9 +; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s9 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:9] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x ptr> @llvm.amdgcn.readlane.v3p0(<3 x ptr> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<3 x ptr> %x) + ret void +} + +define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v8i16: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: 
;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:7] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_v8i16: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v6 +; CHECK-GISEL-NEXT: s_nop 3 +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s7 +; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s7 +; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s7 +; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s7 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; use s[4:7] +; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %x = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<8 x i16> %x) + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll new file mode 100644 index 0000000000000..1b4ee84c75250 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s + +define void @test_readlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_p3: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call ptr addrspace(3) @llvm.amdgcn.readlane.p3(ptr addrspace(3) %src, i32 %src1) + call void asm sideeffect "; use $0", 
"s"(ptr addrspace(3) %x) + ret void +} + +define void @test_readlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v3p3: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readlane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(3)> %x) + ret void +} + +define void @test_readlane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_p5: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call ptr addrspace(5) @llvm.amdgcn.readlane.p5(ptr addrspace(5) %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %x) + ret void +} + +define void @test_readlane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v3p5: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 
s[30:31] + %x = call <3 x ptr addrspace(5)> @llvm.amdgcn.readlane.v3p5(<3 x ptr addrspace(5)> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(5)> %x) + ret void +} + +define void @test_readlane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_p6: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call ptr addrspace(6) @llvm.amdgcn.readlane.p6(ptr addrspace(6) %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(ptr addrspace(6) %x) + ret void +} + +define void @test_readlane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v3p6: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x ptr addrspace(6)> @llvm.amdgcn.readlane.v3p6(<3 x ptr addrspace(6)> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<3 x ptr addrspace(6)> %x) + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 37951669dbe75..d0a865f565eeb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -1,85 +1,3044 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,CIGFX9 %s 
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,CIGFX9 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX802-SDAG %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s + +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX802-GISEL %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX1010-GISEL %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 -global-isel < %s | FileCheck -check-prefixes=GFX1100-GISEL %s declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0 +declare i64 @llvm.amdgcn.writelane.i64(i64, i32, i64) #0 +declare double @llvm.amdgcn.writelane.f64(double, i32, double) #0 -; CHECK-LABEL: {{^}}test_writelane_sreg: -; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0 -; GFX10: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -define amdgpu_kernel void @test_writelane_sreg(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { +define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { +; GFX802-SDAG-LABEL: test_writelane_sreg_i32: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 +; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3 +; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0 +; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_sreg_i32: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s3 +; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_sreg_i32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s3 +; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_sreg_i32: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 +; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0 +; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 
+; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_sreg_i32: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s3 +; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_sreg_i32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s3 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm %oldval = load i32, ptr addrspace(1) %out - %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval) + %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 %oldval) store i32 %writelane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_writelane_imm_sreg: -; CHECK: v_writelane_b32 v{{[0-9]+}}, 32, s{{[0-9]+}} -define amdgpu_kernel void @test_writelane_imm_sreg(ptr addrspace(1) %out, i32 %src1) #1 { +define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { +; GFX802-SDAG-LABEL: test_writelane_sreg_i64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX802-SDAG-NEXT: s_mov_b32 
m0, s6 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_sreg_i64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_sreg_i64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s2 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_sreg_i64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: 
s_load_dword s6, s[4:5], 0x10 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_sreg_i64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_sreg_i64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s2 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm + %oldval = load i64, ptr addrspace(1) %out + %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval) + store i64 %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 { +; GFX802-SDAG-LABEL: test_writelane_sreg_f64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_sreg_f64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_sreg_f64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX1100-SDAG-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s2 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_sreg_f64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_sreg_f64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_sreg_f64: +; 
GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s2 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm + %oldval = load double, ptr addrspace(1) %out + %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval) + store double %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { +; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i32: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3 +; GFX802-SDAG-NEXT: v_writelane_b32 v2, 32, s2 +; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i32: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1010-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 32, s2 +; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s0 +; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i32: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX802-GISEL-NEXT: v_writelane_b32 v2, 32, s2 +; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i32: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 32, s2 +; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 
+; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s0 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm %oldval = load i32, ptr addrspace(1) %out - %writelane = call i32 @llvm.amdgcn.writelane(i32 32, i32 %src1, i32 %oldval) + %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 32, i32 %src1, i32 %oldval) store i32 %writelane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_writelane_vreg_lane: -; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}} -; CHECK: v_writelane_b32 v{{[0-9]+}}, 12, [[LANE]] -define amdgpu_kernel void @test_writelane_vreg_lane(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 { +; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX802-SDAG-NEXT: v_writelane_b32 v1, 0, s4 +; GFX802-SDAG-NEXT: v_writelane_b32 v0, 32, s4 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-SDAG-NEXT: 
s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 0, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 32, s6 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 0, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s4 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX802-GISEL-NEXT: v_writelane_b32 v0, 
32, s4 +; GFX802-GISEL-NEXT: v_writelane_b32 v1, 0, s4 +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 32, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 0, s6 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s4 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 0, s4 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm + %oldval = load i64, ptr addrspace(1) %out + %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 32, i32 %src1, i64 %oldval) + store i64 %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 { +; GFX802-SDAG-LABEL: test_writelane_imm_sreg_f64: +; GFX802-SDAG: ; %bb.0: +; 
GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX802-SDAG-NEXT: v_writelane_b32 v1, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v0, 0, s4 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_f64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX1010-SDAG-NEXT: s_mov_b32 s2, 0x40400000 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s2, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 0, s6 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_f64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-SDAG-NEXT: s_mov_b32 s0, 0x40400000 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX1100-SDAG-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 0, s4 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_imm_sreg_f64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX802-GISEL-NEXT: v_writelane_b32 v0, 0, s4 +; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_f64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX1010-GISEL-NEXT: s_mov_b32 s2, 0x40400000 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 0, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s2, s6 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_f64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: 
s_load_b32 s4, s[0:1], 0x8 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 0, s4 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm + %oldval = load double, ptr addrspace(1) %out + %writelane = call double @llvm.amdgcn.writelane.f64(double 32.0, i32 %src1, double %oldval) + store double %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i32: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: flat_load_dword v0, v[0:1] +; GFX802-SDAG-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-SDAG-NEXT: s_nop 2 +; GFX802-SDAG-NEXT: v_writelane_b32 v2, 12, s2 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i32: +; 
GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dword v0, v0, s[2:3] offset:4 +; GFX1010-SDAG-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 12, s2 +; GFX1010-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 12, s2 +; GFX1100-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i32: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, 4, 
v0 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: flat_load_dword v0, v[0:1] +; GFX802-GISEL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-GISEL-NEXT: s_nop 2 +; GFX802-GISEL-NEXT: v_writelane_b32 v2, 12, s2 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i32: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: global_load_dword v0, v0, s[2:3] offset:4 +; GFX1010-GISEL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 12, s2 +; GFX1010-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_i32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-GISEL-NEXT: 
v_writelane_b32 v1, 12, s2 +; GFX1100-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid %args = load <2 x i32>, ptr addrspace(1) %gep.in %oldval = load i32, ptr addrspace(1) %out %lane = extractelement <2 x i32> %args, i32 1 - %writelane = call i32 @llvm.amdgcn.writelane(i32 12, i32 %lane, i32 %oldval) + %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 12, i32 %lane, i32 %oldval) store i32 %writelane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_writelane_m0_sreg: -; CHECK: s_mov_b32 m0, -1 -; CIGFX9: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 -; CIGFX9: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], m0 -; GFX10: v_writelane_b32 v{{[0-9]+}}, m0, s{{[0-9]+}} -define amdgpu_kernel void @test_writelane_m0_sreg(ptr addrspace(1) %out, i32 %src1) #1 { +define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1] +; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v2 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-SDAG-NEXT: s_nop 2 +; GFX802-SDAG-NEXT: 
v_writelane_b32 v1, 0, s2 +; GFX802-SDAG-NEXT: v_writelane_b32 v0, 12, s2 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dword v0, v0, s[2:3] offset:8 +; GFX1010-SDAG-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 0, s3 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 12, s3 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:8 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 0, s3 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 12, s3 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; 
GFX802-GISEL-LABEL: test_writelane_vreg_lane_i64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v4, s1 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s0 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX802-GISEL-NEXT: s_nop 3 +; GFX802-GISEL-NEXT: v_writelane_b32 v1, 12, s2 +; GFX802-GISEL-NEXT: v_writelane_b32 v2, 0, s2 +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[3:4], v[1:2] +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:8 +; GFX1010-GISEL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 12, s2 +; GFX1010-GISEL-NEXT: v_writelane_b32 v2, 0, s2 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: 
test_writelane_vreg_lane_i64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:8 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 12, s2 +; GFX1100-GISEL-NEXT: v_writelane_b32 v2, 0, s2 +; GFX1100-GISEL-NEXT: global_store_b64 v0, v[1:2], s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid + %args = load <2 x i64>, ptr addrspace(1) %gep.in + %oldval = load i64, ptr addrspace(1) %out + %lane = extractelement <2 x i64> %args, i32 1 + %lane32 = trunc i64 %lane to i32 + %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 12, i32 %lane32, i64 %oldval) + store i64 %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GFX802-SDAG-LABEL: test_writelane_vreg_lane_f64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: 
flat_load_dword v2, v[0:1] +; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v2 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-SDAG-NEXT: s_nop 1 +; GFX802-SDAG-NEXT: v_writelane_b32 v1, s4, m0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: v_writelane_b32 v0, 0, s2 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_f64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dword v0, v0, s[2:3] offset:8 +; GFX1010-SDAG-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX1010-SDAG-NEXT: s_mov_b32 s2, 0x40280000 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s2, s3 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 0, s3 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_f64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:8 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 
v1, s3 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX1100-SDAG-NEXT: s_mov_b32 s2, 0x40280000 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s2, s3 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 0, s3 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_vreg_lane_f64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v4, s1 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s0 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX802-GISEL-NEXT: s_nop 2 +; GFX802-GISEL-NEXT: v_writelane_b32 v1, 0, s2 +; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[3:4], v[1:2] +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_f64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1010-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:8 +; GFX1010-GISEL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX1010-GISEL-NEXT: s_mov_b32 s3, 0x40280000 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 0, s2 +; GFX1010-GISEL-NEXT: v_writelane_b32 v2, s3, s2 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_f64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:8 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX1100-GISEL-NEXT: s_mov_b32 s3, 0x40280000 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 0, s2 +; GFX1100-GISEL-NEXT: v_writelane_b32 v2, s3, s2 +; GFX1100-GISEL-NEXT: global_store_b64 v0, v[1:2], s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr <2 x double>, ptr addrspace(1) %in, i32 %tid + %args = load <2 x double>, ptr addrspace(1) %gep.in + %oldval = load double, ptr addrspace(1) %out + %lane = extractelement <2 x double> %args, i32 1 + %lane_cast = bitcast double %lane to i64 + %lane32 = trunc i64 %lane_cast to i32 + %writelane = call 
double @llvm.amdgcn.writelane.f64(double 12.0, i32 %lane32, double %oldval) + store double %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { +; GFX802-SDAG-LABEL: test_writelane_m0_sreg_i32: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX802-SDAG-NEXT: ;;#ASMSTART +; GFX802-SDAG-NEXT: s_mov_b32 m0, -1 +; GFX802-SDAG-NEXT: ;;#ASMEND +; GFX802-SDAG-NEXT: s_mov_b32 s4, m0 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3 +; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0 +; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_m0_sreg_i32: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-SDAG-NEXT: ;;#ASMSTART +; GFX1010-SDAG-NEXT: s_mov_b32 m0, -1 +; GFX1010-SDAG-NEXT: ;;#ASMEND +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, m0, s2 +; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_m0_sreg_i32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-SDAG-NEXT: ;;#ASMSTART +; GFX1100-SDAG-NEXT: 
s_mov_b32 m0, -1 +; GFX1100-SDAG-NEXT: ;;#ASMEND +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, m0, s0 +; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_m0_sreg_i32: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX802-GISEL-NEXT: ;;#ASMSTART +; GFX802-GISEL-NEXT: s_mov_b32 m0, -1 +; GFX802-GISEL-NEXT: ;;#ASMEND +; GFX802-GISEL-NEXT: s_mov_b32 s4, m0 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0 +; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_m0_sreg_i32: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX1010-GISEL-NEXT: ;;#ASMSTART +; GFX1010-GISEL-NEXT: s_mov_b32 m0, -1 +; GFX1010-GISEL-NEXT: ;;#ASMEND +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, m0, s2 +; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_m0_sreg_i32: +; GFX1100-GISEL: ; 
%bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX1100-GISEL-NEXT: ;;#ASMSTART +; GFX1100-GISEL-NEXT: s_mov_b32 m0, -1 +; GFX1100-GISEL-NEXT: ;;#ASMEND +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, m0, s0 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm %oldval = load i32, ptr addrspace(1) %out %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"() - %writelane = call i32 @llvm.amdgcn.writelane(i32 %m0, i32 %src1, i32 %oldval) + %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %m0, i32 %src1, i32 %oldval) store i32 %writelane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_writelane_imm: -; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32 -define amdgpu_kernel void @test_writelane_imm(ptr addrspace(1) %out, i32 %src0) #1 { +define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %src0) #1 { +; GFX802-SDAG-LABEL: test_writelane_imm_i32: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3 +; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, 32 +; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_imm_i32: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, 32 +; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_imm_i32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, 32 +; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_imm_i32: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, 32 +; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_imm_i32: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, 32 +; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_imm_i32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, 32 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm %oldval = load i32, ptr addrspace(1) %out - %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 32, i32 %oldval) #0 + %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 32, i32 %oldval) #0 store i32 %writelane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_writelane_sreg_oldval: -; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], s{{[0-9]+}} -; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0 -; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}} -define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { - %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval) +define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %src0) #1 { +; GFX802-SDAG-LABEL: test_writelane_imm_i64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 
v3, s1 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, 32 +; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, 32 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_imm_i64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, 32 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, 32 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_imm_i64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, 32 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, 32 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_imm_i64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, 32 +; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, 32 +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_imm_i64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, 32 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, 32 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_imm_i64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, 32 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, 32 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm + %oldval = load i64, ptr addrspace(1) %out + %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 32, i64 %oldval) #0 + store i64 %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double %src0) #1 { +; GFX802-SDAG-LABEL: test_writelane_imm_f64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; 
GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, 32 +; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, 32 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_imm_f64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, 32 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, 32 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_imm_f64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, 32 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, 32 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_imm_f64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; 
GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, 32 +; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, 32 +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_imm_f64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, 32 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, 32 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_imm_f64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, 32 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, 32 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm + %oldval = load double, ptr addrspace(1) %out + %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 32, double %oldval) #0 + store double %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr 
addrspace(1) %out, i32 %src0, i32 %src1) #1 { +; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i32: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s6 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i32: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s3 +; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x8 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s3 +; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i32: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i32: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s3 +; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x8 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s3 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm + %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 %oldval) store i32 %writelane, ptr addrspace(1) %out, align 4 ret void } -; CHECK-LABEL: {{^}}test_writelane_imm_oldval: -; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], 42 -; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0 -; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}} -define amdgpu_kernel void @test_writelane_imm_oldval(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { - %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 42) +define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { +; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i64: +; GFX802-SDAG: 
; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_clause 0x2 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX1010-SDAG-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_clause 0x2 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x18 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s0 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s0 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: 
test_writelane_sreg_oldval_i64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_clause 0x2 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX1010-GISEL-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x2 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x18 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s0 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s0 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm + %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval) + store i64 %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 { +; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_f64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_f64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_clause 0x2 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX1010-SDAG-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_f64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_clause 0x2 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x18 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s0 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s0 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_f64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_f64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_clause 0x2 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX1010-GISEL-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_f64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x2 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; 
GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x18 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s0 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s0 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm + %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval) + store double %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { +; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i32: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 42 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s3 +; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 42 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s3 +; GFX1100-SDAG-NEXT: 
global_store_b32 v1, v0, s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i32: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 42 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s3 +; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 42 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s3 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm + %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 42) store i32 %writelane, ptr addrspace(1) %out, align 4 ret void } +define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { +; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x10 +; 
GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 42 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 42 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s0 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s0 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX802-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 42 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 42 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s0 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s0 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm + %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 42) + store i64 %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 { +; GFX802-SDAG-LABEL: test_writelane_imm_oldval_f64: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-SDAG-NEXT: s_endpgm +; +; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_f64: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-SDAG-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_f64: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s0 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s0 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-SDAG-NEXT: s_nop 0 +; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX802-GISEL-LABEL: test_writelane_imm_oldval_f64: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 +; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX802-GISEL-NEXT: s_endpgm +; +; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_f64: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-GISEL-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_f64: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s0 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s0 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-GISEL-NEXT: s_nop 0 +; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-GISEL-NEXT: s_endpgm + %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double 42.0) + store double %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_half(ptr addrspace(1) %out, half %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_half: +; 
GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_ushort v4, v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_nop 1 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: flat_store_short v[0:1], v4 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_half: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_ushort v4, v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-SDAG-NEXT: global_store_short v[0:1], v4, off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_half: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_u16 v4, v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_half: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: flat_load_ushort v4, v[0:1] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-GISEL-NEXT: flat_store_short v[0:1], v4 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; 
GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_half: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: global_load_ushort v4, v[0:1], off +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-GISEL-NEXT: global_store_short v[0:1], v4, off +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_half: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_u16 v4, v[0:1], off +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load half, ptr addrspace(1) %out + %writelane = call half @llvm.amdgcn.writelane.f16(half %src, i32 %src1, half %oldval) + store half %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_float(ptr addrspace(1) %out, float %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_float: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_nop 1 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_float: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_float: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_float: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: flat_load_dword v4, v[0:1] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v4 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_float: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: global_load_dword v4, v[0:1], off +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-GISEL-NEXT: global_store_dword v[0:1], v4, off +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_float: +; GFX1100-GISEL: ; 
%bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_b32 v4, v[0:1], off +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-GISEL-NEXT: global_store_b32 v[0:1], v4, off +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load float, ptr addrspace(1) %out + %writelane = call float @llvm.amdgcn.writelane.f32(float %src, i32 %src1, float %oldval) + store float %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_bfloat: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_ushort v4, v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_nop 1 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: flat_store_short v[0:1], v4 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_bfloat: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_ushort v4, v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-SDAG-NEXT: global_store_short v[0:1], v4, off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_bfloat: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_u16 v4, v[0:1], off +; GFX1100-SDAG-NEXT: 
v_readfirstlane_b32 s0, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_bfloat: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: flat_load_ushort v4, v[0:1] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-GISEL-NEXT: flat_store_short v[0:1], v4 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_bfloat: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: global_load_ushort v4, v[0:1], off +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-GISEL-NEXT: global_store_short v[0:1], v4, off +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_bfloat: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_u16 v4, v[0:1], off +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load bfloat, ptr addrspace(1) %out + %writelane = call bfloat 
@llvm.amdgcn.writelane.bf16(bfloat %src, i32 %src1, bfloat %oldval) + store bfloat %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_i16: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_ushort v4, v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_nop 1 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: flat_store_short v[0:1], v4 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_i16: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_ushort v4, v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-SDAG-NEXT: global_store_short v[0:1], v4, off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_i16: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_u16 v4, v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_i16: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: flat_load_ushort v4, v[0:1] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; 
GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-GISEL-NEXT: flat_store_short v[0:1], v4 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_i16: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: global_load_ushort v4, v[0:1], off +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-GISEL-NEXT: global_store_short v[0:1], v4, off +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_i16: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_u16 v4, v[0:1], off +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load i16, ptr addrspace(1) %out + %writelane = call i16 @llvm.amdgcn.writelane.i16(i16 %src, i32 %src1, i16 %oldval) + store i16 %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v2f16: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_nop 1 +; GFX802-SDAG-NEXT: 
v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v2f16: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v2f16: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_v2f16: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: flat_load_dword v4, v[0:1] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v4 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_v2f16: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: global_load_dword v4, v[0:1], off +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: 
v_readfirstlane_b32 s5, v3 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-GISEL-NEXT: global_store_dword v[0:1], v4, off +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_v2f16: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_b32 v4, v[0:1], off +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-GISEL-NEXT: global_store_b32 v[0:1], v4, off +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load <2 x half>, ptr addrspace(1) %out + %writelane = call <2 x half> @llvm.amdgcn.writelane.v2f16(<2 x half> %src, i32 %src1, <2 x half> %oldval) + store <2 x half> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_readlane_v2f32: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_dwordx2 v[5:6], v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_nop 0 +; GFX802-SDAG-NEXT: v_writelane_b32 v6, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v5, s5, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[5:6] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_readlane_v2f32: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dwordx2 v[5:6], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX1010-SDAG-NEXT: 
v_readfirstlane_b32 s5, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s4, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v5, s6, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v[0:1], v[5:6], off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_readlane_v2f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b64 v[5:6], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1 +; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_readlane_v2f32: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: flat_load_dwordx2 v[5:6], v[0:1] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v4 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v5, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v6, s6, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[5:6] +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_readlane_v2f32: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: global_load_dwordx2 v[5:6], v[0:1], off +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v4 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, 
v3 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v5, s4, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v6, s6, s5 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v[0:1], v[5:6], off +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_readlane_v2f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_b64 v[5:6], v[0:1], off +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_writelane_b32 v5, s0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v6, s2, s1 +; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[5:6], off +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load <2 x float>, ptr addrspace(1) %out + %writelane = call <2 x float> @llvm.amdgcn.writelane.v2f32(<2 x float> %src, i32 %src1, <2 x float> %oldval) + store <2 x float> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v7i32: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: v_add_u32_e32 v17, vcc, 16, v0 +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[10:13], v[0:1] +; GFX802-SDAG-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: flat_load_dwordx3 v[14:16], v[17:18] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v9 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v8 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v7 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 
s6, v6 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX802-SDAG-NEXT: v_writelane_b32 v13, s7, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v12, s8, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v11, s9, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v10, s10, m0 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v16, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v15, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v14, s6, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[10:13] +; GFX802-SDAG-NEXT: flat_store_dwordx3 v[17:18], v[14:16] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v7i32: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: global_load_dwordx3 v[14:16], v[0:1], off offset:16 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[10:13], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v9 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v5 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v8 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v7 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v6 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1010-SDAG-NEXT: v_writelane_b32 v16, s4, s5 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s8, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s9, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s10, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v10, s11, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v15, s6, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s7, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off +; GFX1010-SDAG-NEXT: global_store_dwordx3 v[0:1], v[14:16], off offset:16 +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v7i32: +; 
GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: global_load_b96 v[14:16], v[0:1], off offset:16 +; GFX1100-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v7 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v6 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s0, s1 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s4, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s5, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s6, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v10, s7, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v15, s2, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s3, s1 +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off +; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[14:16], off offset:16 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_v7i32: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: v_add_u32_e32 v18, vcc, 16, v0 +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[10:13], v[0:1] +; GFX802-GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[14:17], v[18:19] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v9 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; 
GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v6 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v8 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX802-GISEL-NEXT: v_writelane_b32 v10, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v11, s6, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v12, s7, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v13, s8, m0 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v14, s9, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v15, s10, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v16, s11, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[10:13] +; GFX802-GISEL-NEXT: flat_store_dwordx3 v[18:19], v[14:16] +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_v7i32: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[10:13], v[0:1], off +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:16 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v9 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v6 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v7 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s11, v8 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1010-GISEL-NEXT: v_writelane_b32 v10, s4, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s6, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s7, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s8, s5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s9, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v15, s10, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v16, s11, s5 +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], 
v[10:13], off +; GFX1010-GISEL-NEXT: global_store_dwordx3 v[0:1], v[14:16], off offset:16 +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_v7i32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off +; GFX1100-GISEL-NEXT: global_load_b128 v[14:17], v[0:1], off offset:16 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v6 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v7 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s7, v8 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1100-GISEL-NEXT: v_writelane_b32 v10, s0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v11, s2, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s3, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s4, s1 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s5, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v15, s6, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v16, s7, s1 +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off +; GFX1100-GISEL-NEXT: global_store_b96 v[0:1], v[14:16], off offset:16 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load <7 x i32>, ptr addrspace(1) %out + %writelane = call <7 x i32> @llvm.amdgcn.writelane.v7i32(<7 x i32> %src, i32 %src1, <7 x i32> %oldval) + store <7 x i32> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_p0: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_dwordx2 v[5:6], v[0:1] +; 
GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_nop 0 +; GFX802-SDAG-NEXT: v_writelane_b32 v6, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v5, s5, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[5:6] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_p0: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dwordx2 v[5:6], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s4, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v5, s6, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v[0:1], v[5:6], off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_p0: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b64 v[5:6], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1 +; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_p0: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: flat_load_dwordx2 v[5:6], v[0:1] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v4 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; 
GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v5, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v6, s6, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[5:6] +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_p0: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: global_load_dwordx2 v[5:6], v[0:1], off +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v4 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v5, s4, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v6, s6, s5 +; GFX1010-GISEL-NEXT: global_store_dwordx2 v[0:1], v[5:6], off +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_p0: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_b64 v[5:6], v[0:1], off +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_writelane_b32 v5, s0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v6, s2, s1 +; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[5:6], off +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load ptr, ptr addrspace(1) %out + %writelane = call ptr @llvm.amdgcn.writelane.p0(ptr %src, i32 %src1, ptr %oldval) + store ptr %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_v3p0(ptr addrspace(1) %out, <4 x ptr> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v3p0: +; GFX802-SDAG: ; 
%bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: v_add_u32_e32 v19, vcc, 16, v0 +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[11:14], v[0:1] +; GFX802-SDAG-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[15:18], v[19:20] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v10 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v8 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v7 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v6 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX802-SDAG-NEXT: v_writelane_b32 v14, s8, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v13, s9, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v12, s10, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v11, s11, m0 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v18, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v17, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v16, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v15, s7, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[19:20], v[15:18] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v3p0: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:16 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[15:18], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v5 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s12, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 
s4, v9 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v8 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s4, s5 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v18, s9, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v17, s10, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v16, s11, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v15, s12, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s6, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s7, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s8, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[11:14], off offset:16 +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v3p0: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: global_load_b128 v[11:14], v[0:1], off offset:16 +; GFX1100-SDAG-NEXT: global_load_b128 v[15:18], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s8, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v18, s5, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v17, s6, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s7, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v15, s8, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s2, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1 
+; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s4, s1 +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[15:18], off +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[11:14], off offset:16 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_v3p0: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: v_add_u32_e32 v19, vcc, 16, v0 +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[11:14], v[0:1] +; GFX802-GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[15:18], v[19:20] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v10 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v6 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v8 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s12, v9 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX802-GISEL-NEXT: v_writelane_b32 v11, s4, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v12, s6, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v13, s7, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v14, s8, m0 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v15, s9, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v16, s10, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v17, s11, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v18, s12, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[19:20], v[15:18] +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_v3p0: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: s_clause 0x1 +; GFX1010-GISEL-NEXT: 
global_load_dwordx4 v[11:14], v[0:1], off +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:16 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v6 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v7 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s11, v8 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s12, v9 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s4, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s6, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s7, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s8, s5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v15, s9, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v16, s10, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v17, s11, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v18, s12, s5 +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off offset:16 +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_v3p0: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: global_load_b128 v[11:14], v[0:1], off +; GFX1100-GISEL-NEXT: global_load_b128 v[15:18], v[0:1], off offset:16 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v6 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v7 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s7, v8 +; GFX1100-GISEL-NEXT: 
v_readfirstlane_b32 s8, v9 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX1100-GISEL-NEXT: v_writelane_b32 v11, s0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s2, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s3, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s4, s1 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v15, s5, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v16, s6, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v17, s7, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v18, s8, s1 +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[11:14], off +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[15:18], off offset:16 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load <4 x ptr>, ptr addrspace(1) %out + %writelane = call <4 x ptr> @llvm.amdgcn.writelane.v3p0(<4 x ptr> %src, i32 %src1, <4 x ptr> %oldval) + store <4 x ptr> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v8i16: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[7:10], v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v6 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v10, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v9, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v8, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v7, s7, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v8i16: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX1010-SDAG-NEXT: global_load_dwordx4 v[7:10], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v6 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v2 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v10, s4, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v9, s6, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v8, s7, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s8, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v8i16: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b128 v[7:10], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v10, s0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v9, s2, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s3, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s4, s1 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[7:10], off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_v8i16: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: flat_load_dwordx4 v[7:10], v[0:1] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v6 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v7, s4, m0 
+; GFX802-GISEL-NEXT: v_writelane_b32 v8, s6, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v9, s7, m0 +; GFX802-GISEL-NEXT: v_writelane_b32 v10, s8, m0 +; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_v8i16: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: global_load_dwordx4 v[7:10], v[0:1], off +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v6 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v7, s4, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v8, s6, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v9, s7, s5 +; GFX1010-GISEL-NEXT: v_writelane_b32 v10, s8, s5 +; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_v8i16: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_b128 v[7:10], v[0:1], off +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v6 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: v_writelane_b32 v7, s0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v8, s2, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v9, s3, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v10, s4, s1 +; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[7:10], off +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load <8 x i16>, ptr addrspace(1) %out + %writelane = call <8 x i16> 
@llvm.amdgcn.writelane.v8i16(<8 x i16> %src, i32 %src1, <8 x i16> %oldval) + store <8 x i16> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll new file mode 100644 index 0000000000000..afc394627d356 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll @@ -0,0 +1,292 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX802-SDAG %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s + +define void @test_writelane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_p3: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_nop 1 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_p3: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5 +; 
GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_p3: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] + %oldval = load ptr addrspace(3), ptr addrspace(1) %out + %writelane = call ptr addrspace(3) @llvm.amdgcn.writelane.p3(ptr addrspace(3) %src, i32 %src1, ptr addrspace(3) %oldval) + store ptr addrspace(3) %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v3p3: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_dwordx3 v[6:8], v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v8, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v7, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v6, s6, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx3 v[0:1], v[6:8] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v3p3: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dwordx3 v[6:8], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX1010-SDAG-NEXT: 
v_readfirstlane_b32 s6, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v2 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v8, s4, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s6, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s7, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v3p3: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b96 v[6:8], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1 +; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] + %oldval = load <3 x ptr addrspace(3)>, ptr addrspace(1) %out + %writelane = call <3 x ptr addrspace(3)> @llvm.amdgcn.writelane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1, <3 x ptr addrspace(3)> %oldval) + store <3 x ptr addrspace(3)> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_p5: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_nop 1 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: 
flat_store_dword v[0:1], v4 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_p5: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_p5: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] + %oldval = load ptr addrspace(5), ptr addrspace(1) %out + %writelane = call ptr addrspace(5) @llvm.amdgcn.writelane.p5(ptr addrspace(5) %src, i32 %src1, ptr addrspace(5) %oldval) + store ptr addrspace(5) %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v3p5: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_dwordx3 v[6:8], v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v8, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v7, s5, m0 +; 
GFX802-SDAG-NEXT: v_writelane_b32 v6, s6, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx3 v[0:1], v[6:8] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v3p5: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dwordx3 v[6:8], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v2 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v8, s4, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s6, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s7, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v3p5: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b96 v[6:8], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1 +; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] + %oldval = load <3 x ptr addrspace(5)>, ptr addrspace(1) %out + %writelane = call <3 x ptr addrspace(5)> @llvm.amdgcn.writelane.v3p5(<3 x ptr addrspace(5)> %src, i32 %src1, <3 x ptr addrspace(5)> %oldval) + store <3 x ptr addrspace(5)> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define 
void @test_writelane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_p6: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_dword v4, v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_nop 1 +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v4 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_p6: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dword v4, v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-SDAG-NEXT: global_store_dword v[0:1], v4, off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_p6: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b32 v4, v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] + %oldval = load ptr addrspace(6), ptr addrspace(1) %out + %writelane = call ptr addrspace(6) @llvm.amdgcn.writelane.p6(ptr addrspace(6) %src, i32 %src1, ptr addrspace(6) %oldval) + store ptr addrspace(6) %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src, i32 %src1) { +; GFX802-SDAG-LABEL: 
test_writelane_v3p6: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_dwordx3 v[6:8], v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v8, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v7, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v6, s6, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx3 v[0:1], v[6:8] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v3p6: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dwordx3 v[6:8], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v2 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v8, s4, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v7, s6, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s7, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v3p6: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b96 v[6:8], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1 +; 
GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1 +; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] + %oldval = load <3 x ptr addrspace(6)>, ptr addrspace(1) %out + %writelane = call <3 x ptr addrspace(6)> @llvm.amdgcn.writelane.v3p6(<3 x ptr addrspace(6)> %src, i32 %src1, <3 x ptr addrspace(6)> %oldval) + store <3 x ptr addrspace(6)> %writelane, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll index 94c32e3cbe99f..483ea8ad57d1b 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -2714,7 +2714,7 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_kernel void @readfirstlane_constant(i32 %arg) { ; CHECK-LABEL: @readfirstlane_constant( -; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]]) +; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]]) ; CHECK-NEXT: store volatile i32 [[VAR]], ptr undef, align 4 ; CHECK-NEXT: store volatile i32 0, ptr undef, align 4 ; CHECK-NEXT: store volatile i32 123, ptr undef, align 4 @@ -2737,7 +2737,7 @@ define amdgpu_kernel void @readfirstlane_constant(i32 %arg) { define i32 @readfirstlane_idempotent(i32 %arg) { ; CHECK-LABEL: @readfirstlane_idempotent( -; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]]) +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]]) ; CHECK-NEXT: ret i32 [[READ0]] ; %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg) @@ -2748,7 +2748,7 @@ define i32 @readfirstlane_idempotent(i32 %arg) { define i32 @readfirstlane_readlane(i32 %arg) { ; CHECK-LABEL: @readfirstlane_readlane( -; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]]) +; 
CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]]) ; CHECK-NEXT: ret i32 [[READ0]] ; %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg) @@ -2759,10 +2759,10 @@ define i32 @readfirstlane_readlane(i32 %arg) { define i32 @readfirstlane_readfirstlane_different_block(i32 %arg) { ; CHECK-LABEL: @readfirstlane_readfirstlane_different_block( ; CHECK-NEXT: bb0: -; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]]) +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]]) ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]]) +; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[READ0]]) ; CHECK-NEXT: ret i32 [[READ1]] ; bb0: @@ -2777,10 +2777,10 @@ bb1: define i32 @readfirstlane_readlane_different_block(i32 %arg) { ; CHECK-LABEL: @readfirstlane_readlane_different_block( ; CHECK-NEXT: bb0: -; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 0) +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 0) ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]]) +; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[READ0]]) ; CHECK-NEXT: ret i32 [[READ1]] ; bb0: @@ -2800,7 +2800,7 @@ declare i32 @llvm.amdgcn.readlane(i32, i32) define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) { ; CHECK-LABEL: @readlane_constant( -; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 7) +; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 7) ; CHECK-NEXT: store volatile i32 [[VAR]], ptr undef, align 4 ; CHECK-NEXT: store volatile i32 0, ptr undef, align 4 ; CHECK-NEXT: store volatile i32 123, ptr undef, align 4 @@ -2823,7 +2823,7 @@ define amdgpu_kernel void 
@readlane_constant(i32 %arg, i32 %lane) { define i32 @readlane_idempotent(i32 %arg, i32 %lane) { ; CHECK-LABEL: @readlane_idempotent( -; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]]) +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]]) ; CHECK-NEXT: ret i32 [[READ0]] ; %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane) @@ -2833,8 +2833,8 @@ define i32 @readlane_idempotent(i32 %arg, i32 %lane) { define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) { ; CHECK-LABEL: @readlane_idempotent_different_lanes( -; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE0:%.*]]) -; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE1:%.*]]) +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]]) +; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE1:%.*]]) ; CHECK-NEXT: ret i32 [[READ1]] ; %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0) @@ -2844,7 +2844,7 @@ define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1 define i32 @readlane_readfirstlane(i32 %arg) { ; CHECK-LABEL: @readlane_readfirstlane( -; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]]) +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]]) ; CHECK-NEXT: ret i32 [[READ0]] ; %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg) @@ -2855,10 +2855,10 @@ define i32 @readlane_readfirstlane(i32 %arg) { define i32 @readlane_idempotent_different_block(i32 %arg, i32 %lane) { ; CHECK-LABEL: @readlane_idempotent_different_block( ; CHECK-NEXT: bb0: -; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]]) +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]]) ; 
CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE]]) +; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE]]) ; CHECK-NEXT: ret i32 [[READ1]] ; bb0: @@ -2874,10 +2874,10 @@ bb1: define i32 @readlane_readfirstlane_different_block(i32 %arg) { ; CHECK-LABEL: @readlane_readfirstlane_different_block( ; CHECK-NEXT: bb0: -; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]]) +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]]) ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 0) +; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 0) ; CHECK-NEXT: ret i32 [[READ1]] ; bb0: From 881e11673545164b0172c42ffd0fbba47ebe38dd Mon Sep 17 00:00:00 2001 From: Vikram Date: Sat, 18 May 2024 05:28:50 -0400 Subject: [PATCH 2/9] [AMDGPU] Extend permlane16, permlanex16 and permlane64 intrinsic lowering for generic types --- clang/lib/CodeGen/CGBuiltin.cpp | 19 + llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 10 +- .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 6 +- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 3 + llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 25 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 74 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 49 +- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 10 +- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 10 +- .../UniformityAnalysis/AMDGPU/intrinsics.ll | 12 +- .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 2694 +++++++++++++++-- .../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 28 +- llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll | 20 +- 14 files changed, 2671 insertions(+), 292 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 
a275ada586d13..9ce2f5b6c103b 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18479,6 +18479,25 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType()); return Builder.CreateCall(F, Args); } + case AMDGPU::BI__builtin_amdgcn_permlane16: + case AMDGPU::BI__builtin_amdgcn_permlanex16: { + Intrinsic::ID IID; + IID = BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16 + ? Intrinsic::amdgcn_permlane16 + : Intrinsic::amdgcn_permlanex16; + + llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); + llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); + llvm::Value *Src3 = EmitScalarExpr(E->getArg(3)); + llvm::Value *Src4 = EmitScalarExpr(E->getArg(4)); + llvm::Value *Src5 = EmitScalarExpr(E->getArg(5)); + + llvm::Function *F = CGM.getIntrinsic(IID, Src1->getType()); + return Builder.CreateCall(F, {Src0, Src1, Src2, Src3, Src4, Src5}); + } + case AMDGPU::BI__builtin_amdgcn_permlane64: + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_permlane64); case AMDGPU::BI__builtin_amdgcn_readlane: return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane); case AMDGPU::BI__builtin_amdgcn_readfirstlane: diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 457566944069e..63f7f48e82e4a 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2488,15 +2488,15 @@ def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS; // llvm.amdgcn.permlane16 def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree]>; // 
llvm.amdgcn.permlanex16 def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree]>; @@ -2540,7 +2540,7 @@ def int_amdgcn_image_bvh_intersect_ray : // llvm.amdgcn.permlane64 def int_amdgcn_permlane64 : ClangBuiltin<"__builtin_amdgcn_permlane64">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty], + Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; def int_amdgcn_ds_add_gs_reg_rtn : diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 9ac0a52f63fd8..5fec5dafe2acd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -413,7 +413,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, assert(ST->hasPermLaneX16()); V = B.CreateBitCast(V, IntNTy); Value *Permlanex16Call = B.CreateIntrinsic( - Intrinsic::amdgcn_permlanex16, {}, + V->getType(), Intrinsic::amdgcn_permlanex16, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), B.CreateBitCast(Permlanex16Call, AtomicTy)); @@ -425,7 +425,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, // Reduce across the upper and lower 32 lanes. 
V = B.CreateBitCast(V, IntNTy); Value *Permlane64Call = - B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V); + B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V); return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), B.CreateBitCast(Permlane64Call, AtomicTy)); } @@ -481,7 +481,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, assert(ST->hasPermLaneX16()); V = B.CreateBitCast(V, IntNTy); Value *PermX = B.CreateIntrinsic( - Intrinsic::amdgcn_permlanex16, {}, + V->getType(), Intrinsic::amdgcn_permlanex16, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); Value *UpdateDPPCall = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 9cd7496ba9f96..12a3ce486a75d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5499,6 +5499,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(READLANE) NODE_NAME_CASE(READFIRSTLANE) NODE_NAME_CASE(WRITELANE) + NODE_NAME_CASE(PERMLANE16) + NODE_NAME_CASE(PERMLANEX16) + NODE_NAME_CASE(PERMLANE64) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; NODE_NAME_CASE(LOAD_D16_HI) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 02c3dcf39e554..650ad948184d1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -561,6 +561,9 @@ enum NodeType : unsigned { READLANE, READFIRSTLANE, WRITELANE, + PERMLANE16, + PERMLANEX16, + PERMLANE64, DUMMY_CHAIN, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index e4f329b200c86..a82f48950f493 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -354,9 +354,21 @@ def AMDGPUDWritelaneOp : 
SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3> ]>; +def AMDGPUDPermlane16Op : SDTypeProfile<1, 6, [ + SDTCisSameAs<0, 1>, // old + SDTCisSameAs<0, 2>, // src0 + SDTCisInt<3>, // src1 + SDTCisInt<4>, // src2 + SDTCisInt<5>, // i1 fi + SDTCisInt<6> // i1 bound_ctrl +]>; + def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp>; def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", AMDGPUReadfirstlaneOp>; def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", AMDGPUDWritelaneOp>; +def AMDGPUpermlane16_impl : SDNode<"AMDGPUISD::PERMLANE16", AMDGPUDPermlane16Op>; +def AMDGPUpermlanex16_impl : SDNode<"AMDGPUISD::PERMLANEX16", AMDGPUDPermlane16Op>; +def AMDGPUpermlane64_impl : SDNode<"AMDGPUISD::PERMLANE64", AMDGPUReadfirstlaneOp>; // SI+ export def AMDGPUExportOp : SDTypeProfile<0, 8, [ @@ -535,3 +547,16 @@ def AMDGPUwritelane : PatFrags<(ops node:$src0, node:$src1, node:$src2), [(int_amdgcn_writelane node:$src0, node:$src1, node:$src2), (AMDGPUwritelane_impl node:$src0, node:$src1, node:$src2)]>; +def AMDGPUpermlane16 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5), + [(int_amdgcn_permlane16 node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5), + (AMDGPUpermlane16_impl node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5)]>; + +def AMDGPUpermlanex16 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5), + [(int_amdgcn_permlanex16 node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5), + (AMDGPUpermlanex16_impl node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5)]>; + +def AMDGPUpermlane64 : PatFrags<(ops node:$src), + [(int_amdgcn_permlane64 node:$src), + (AMDGPUpermlane64_impl node:$src)]>; + + diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 6ffc8a20f76fa..b28c3521d6336 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5397,25 +5397,42 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, Register DstReg = MI.getOperand(0).getReg(); Register Src0 = MI.getOperand(2).getReg(); + bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || + IID == Intrinsic::amdgcn_permlanex16; + auto createLaneOp = [&](Register Src0, Register Src1, Register Src2) -> Register { auto LaneOp = B.buildIntrinsic(IID, {S32}).addUse(Src0); switch (IID) { case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_permlane64: return LaneOp.getReg(0); case Intrinsic::amdgcn_readlane: return LaneOp.addUse(Src1).getReg(0); case Intrinsic::amdgcn_writelane: return LaneOp.addUse(Src1).addUse(Src2).getReg(0); + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: { + Register Src3 = MI.getOperand(5).getReg(); + int64_t Src4 = MI.getOperand(6).getImm(); + int64_t Src5 = MI.getOperand(7).getImm(); + return LaneOp.addUse(Src1) + .addUse(Src2) + .addUse(Src3) + .addImm(Src4) + .addImm(Src5) + .getReg(0); + } default: llvm_unreachable("unhandled lane op"); } }; Register Src1, Src2; - if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) { + if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || + IsPermLane16) { Src1 = MI.getOperand(3).getReg(); - if (IID == Intrinsic::amdgcn_writelane) { + if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) { Src2 = MI.getOperand(4).getReg(); } } @@ -5433,7 +5450,16 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, ? Src0 : B.buildBitcast(LLT::scalar(Size), Src0).getReg(0); Src0 = B.buildAnyExt(S32, Src0Cast).getReg(0); - if (Src2.isValid()) { + + if (IsPermLane16) { + Register Src1Cast = + MRI.getType(Src1).isScalar() + ?
Src1 + : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0); + Src1 = B.buildAnyExt(LLT::scalar(32), Src1Cast).getReg(0); + } + + if (IID == Intrinsic::amdgcn_writelane) { Register Src2Cast = MRI.getType(Src2).isScalar() ? Src2 @@ -5485,46 +5511,45 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, } break; } - case Intrinsic::amdgcn_readfirstlane: { + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_permlane64: { for (unsigned i = 0; i < NumParts; ++i) { Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0) : Src0Parts.getReg(i); PartialRes.push_back( - (B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, {S32}) - .addUse(Src0) - .getReg(0))); + (B.buildIntrinsic(IID, {S32}).addUse(Src0).getReg(0))); } break; } - case Intrinsic::amdgcn_writelane: { + case Intrinsic::amdgcn_writelane: + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: { Register Src1 = MI.getOperand(3).getReg(); Register Src2 = MI.getOperand(4).getReg(); - MachineInstrBuilder Src2Parts; + + Register SrcX = IsPermLane16 ? Src1 : Src2; + MachineInstrBuilder SrcXParts; if (Ty.isPointer()) { - auto PtrToInt = B.buildPtrToInt(S64, Src2); - Src2Parts = B.buildUnmerge(S32, PtrToInt); + auto PtrToInt = B.buildPtrToInt(S64, SrcX); + SrcXParts = B.buildUnmerge(S32, PtrToInt); } else if (Ty.isPointerVector()) { LLT IntVecTy = Ty.changeElementType( LLT::scalar(Ty.getElementType().getSizeInBits())); - auto PtrToInt = B.buildPtrToInt(IntVecTy, Src2); - Src2Parts = B.buildUnmerge(S32, PtrToInt); + auto PtrToInt = B.buildPtrToInt(IntVecTy, SrcX); + SrcXParts = B.buildUnmerge(S32, PtrToInt); } else - Src2Parts = - IsS16Vec ? B.buildUnmerge(V2S16, Src2) : B.buildUnmerge(S32, Src2); + SrcXParts = + IsS16Vec ? B.buildUnmerge(V2S16, SrcX) : B.buildUnmerge(S32, SrcX); for (unsigned i = 0; i < NumParts; ++i) { Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0) : Src0Parts.getReg(i); - Src2 = IsS16Vec ? 
B.buildBitcast(S32, Src2Parts.getReg(i)).getReg(0) - : Src2Parts.getReg(i); - PartialRes.push_back( - (B.buildIntrinsic(Intrinsic::amdgcn_writelane, {S32}) - .addUse(Src0) - .addUse(Src1) - .addUse(Src2)) - .getReg(0)); + SrcX = IsS16Vec ? B.buildBitcast(S32, SrcXParts.getReg(i)).getReg(0) + : SrcXParts.getReg(i); + PartialRes.push_back(IsPermLane16 ? createLaneOp(Src0, SrcX, Src2) + : createLaneOp(Src0, Src1, SrcX)); } break; @@ -7519,6 +7544,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_readlane: case Intrinsic::amdgcn_writelane: case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: + case Intrinsic::amdgcn_permlane64: return legalizeLaneOp(Helper, MI, IntrID); default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6c23bdf09974b..5d34ed089f65d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6091,22 +6091,39 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, EVT VT = N->getValueType(0); unsigned ValSize = VT.getSizeInBits(); unsigned IntrinsicID = N->getConstantOperandVal(0); + bool IsPermLane16 = IntrinsicID == Intrinsic::amdgcn_permlane16 || + IntrinsicID == Intrinsic::amdgcn_permlanex16; + bool IsPermLane64 = IntrinsicID == Intrinsic::amdgcn_permlane64; SDValue Src0 = N->getOperand(1); SDLoc SL(N); MVT IntVT = MVT::getIntegerVT(ValSize); - auto createLaneOp = [&DAG, &SL](SDValue Src0, SDValue Src1, SDValue Src2, - MVT VT) -> SDValue { - return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2}) - : Src1 ? 
DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1}) - : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0})); + auto createLaneOp = [&](SDValue Src0, SDValue Src1, SDValue Src2, + MVT ValueT) -> SDValue { + if (IsPermLane16 || IsPermLane64) { + if (IsPermLane16) { + SDValue Src3 = N->getOperand(4); + SDValue Src4 = N->getOperand(5); + SDValue Src5 = N->getOperand(6); + return DAG.getNode(IntrinsicID == Intrinsic::amdgcn_permlane16 + ? AMDGPUISD::PERMLANE16 + : AMDGPUISD::PERMLANEX16, + SL, ValueT, {Src0, Src1, Src2, Src3, Src4, Src5}); + } + return DAG.getNode(AMDGPUISD::PERMLANE64, SL, ValueT, {Src0}); + } + + return ( + Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, ValueT, {Src0, Src1, Src2}) + : Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, ValueT, {Src0, Src1}) + : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, ValueT, {Src0})); }; SDValue Src1, Src2; if (IntrinsicID == Intrinsic::amdgcn_readlane || - IntrinsicID == Intrinsic::amdgcn_writelane) { + IntrinsicID == Intrinsic::amdgcn_writelane || IsPermLane16) { Src1 = N->getOperand(2); - if (IntrinsicID == Intrinsic::amdgcn_writelane) + if (IntrinsicID == Intrinsic::amdgcn_writelane || IsPermLane16) Src2 = N->getOperand(3); } @@ -6118,10 +6135,17 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, if (ValSize < 32) { SDValue InitBitCast = DAG.getBitcast(IntVT, Src0); Src0 = DAG.getAnyExtOrTrunc(InitBitCast, SL, MVT::i32); - if (Src2.getNode()) { + + if (IsPermLane16) { + SDValue Src1Cast = DAG.getBitcast(IntVT, Src1); + Src1 = DAG.getAnyExtOrTrunc(Src1Cast, SL, MVT::i32); + } + + if (IntrinsicID == Intrinsic::amdgcn_writelane) { SDValue Src2Cast = DAG.getBitcast(IntVT, Src2); Src2 = DAG.getAnyExtOrTrunc(Src2Cast, SL, MVT::i32); } + SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32); SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT); return DAG.getBitcast(VT, Trunc); @@ -6131,7 +6155,11 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, MVT VecVT = 
MVT::getVectorVT(MVT::i32, ValSize / 32); Src0 = DAG.getBitcast(VecVT, Src0); - if (Src2.getNode()) + if (IsPermLane16) { + Src1 = DAG.getBitcast(VecVT, Src1); + } + + if (IntrinsicID == Intrinsic::amdgcn_writelane) Src2 = DAG.getBitcast(VecVT, Src2); SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT); @@ -8612,6 +8640,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_readlane: case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_writelane: + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: + case Intrinsic::amdgcn_permlane64: return lowerLaneOp(*this, Op.getNode(), DAG); default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 9c5d6a7bf6d0b..33d3da3830f57 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -717,15 +717,19 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1 let SubtargetPredicate = isGFX11Plus in { // Restrict src0 to be VGPR def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS, - getVOP1Pat.ret, - /*VOP1Only=*/ 1>; + [], /*VOP1Only=*/ 1>; defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>; } // End SubtargetPredicate = isGFX11Plus +foreach vt = Reg32Types.types in { + def : GCNPat<(AMDGPUpermlane64 (vt VRegSrc_32:$src0)), + (vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0))) + >; +} + //===----------------------------------------------------------------------===// // Target-specific instruction encodings. 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 616bc7684753e..eee2085eab341 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -838,8 +838,8 @@ def gi_opsel_i1timm : GICustomOperandRenderer<"renderOpSelTImm">, GISDNodeXFormEquiv; class PermlanePat : GCNPat< - (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, + Instruction inst, ValueType vt> : GCNPat< + (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc), (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc), SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in) @@ -864,8 +864,10 @@ let SubtargetPredicate = isGFX10Plus in { defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>; } // End $vdst = $vdst_in, DisableEncoding $vdst_in - def : PermlanePat; - def : PermlanePat; + foreach vt = Reg32Types.types in { + def : PermlanePat; + def : PermlanePat; + } defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile, add>; defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile, sub>; diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 74d2f53d7b317..680c998a4b39f 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -7,16 +7,16 @@ define amdgpu_kernel void @ds_swizzle(ptr addrspace(1) %out, i32 %src) #0 { ret void } -; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 +; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { - %v = call i32 
@llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 store i32 %v, ptr addrspace(1) %out ret void } -; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 +; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0 store i32 %v, ptr addrspace(1) %out ret void } @@ -230,8 +230,8 @@ bb: } declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 -declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1 -declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1 +declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1 +declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.permlane16.var(i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.permlanex16.var(i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 265d64f47bb23..b8dab361346c3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -6,13 +6,16 @@ ; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s ; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | 
FileCheck -check-prefixes=GFX12,GFX12-GISEL %s -declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) +declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) +declare float @llvm.amdgcn.permlane16.f32(float, float, i32, i32, i1, i1) +declare i64 @llvm.amdgcn.permlane16.i64(i64, i64, i32, i32, i1, i1) +declare double @llvm.amdgcn.permlane16.f64(double, double, i32, i32, i1, i1) declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.workitem.id.y() -define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss: +define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -24,7 +27,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss: +; GFX11-LABEL: v_permlane16_b32_vss_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 @@ -38,7 +41,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss: +; GFX12-LABEL: v_permlane16_b32_vss_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 @@ -51,13 +54,252 @@ define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0 ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 
%src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlane16_b32_vii: +define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_vss_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_vss_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void 
@v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: 
v_permlane16_b32_vss_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlane16_b32_vii_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c @@ -69,7 +311,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0 ; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vii: +; GFX11-LABEL: v_permlane16_b32_vii_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c @@ -83,7 +325,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vii: +; GFX12-LABEL: v_permlane16_b32_vii_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -94,326 +336,2057 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0 ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, 
i32 1, i32 2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -; FIXME-GFX10PLUS: It is allowed to have both immediates as literals -define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlane16_b32_vll: +define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float %src0) { +; GFX10-LABEL: v_permlane16_b32_vii_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_movk_i32 s0, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vll: +; GFX11-LABEL: v_permlane16_b32_vii_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vll: +; GFX12-LABEL: v_permlane16_b32_vii_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 
+; GFX12-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlane16_b32_vvv: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] -; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vii_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vii_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: 
v_permlane16_b32 v0, v0, 1, 2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vvv: +; GFX11-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vvv: +; GFX11-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; 
GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vvv: +; GFX12-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vvv: +; GFX12-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vvs: +define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, double %src0) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlane16_b32_vvs: +; GFX10-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vvs: +; GFX11-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vii_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vii_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vii_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; 
GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +; FIXME-GFX10PLUS: It is allowed to have both immediates as literals +define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlane16_b32_vll_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_vll_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_vll_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; 
GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vll_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vll_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vll_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; 
GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vll_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vll_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vll_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float %src0) { +; GFX10-LABEL: v_permlane16_b32_vll_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_vll_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_vll_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: 
v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vll_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vll_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vll_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX11-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vll_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vll_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vll_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 
+; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlane16_b32_vvv_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, 
v0, 10, 10 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX10-GISEL-NEXT: 
global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_vvv_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_vvv_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) 
%out, float %src0) { +; GFX10-LABEL: v_permlane16_b32_vvv_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: 
v_readfirstlane_b32 s3, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, 
float %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_vvv_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: 
v_permlane16_b32 v1, v1, s4, s5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_vvv_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 
0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i32: +; GFX12-GISEL: ; %bb.0: +; 
GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s2 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s2 +; GFX10-GISEL-NEXT: 
v_permlane16_b32 v1, v1, s0, s2 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: 
v_permlane16_b32_vvs_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; 
GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, double %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s2 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s2 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: 
s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; 
GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i32: +; GFX12-SDAG: 
; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: 
v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s0 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s0 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, 
v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlane16(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, double %src0, i32 %src1) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s0 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s0 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 
s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) 
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_fi_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; 
GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_vss_fi_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_vss_fi_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; 
GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; 
GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, float 
%src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_fi_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_vss_fi_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_vss_fi_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vvs: +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: 
v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vvs: +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vvs: +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_bc_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[0,1] +; 
GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_vss_bc_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_vss_bc_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vsv: +define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 -; 
GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 -; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlane16_b32_vsv: +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_vsv: +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_vsv: +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_vsv: +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_vsv: +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store i64 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss_fi: +define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -421,11 +2394,11 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,0] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss_fi: +; GFX11-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 @@ -433,13 +2406,13 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX11-NEXT: v_permlane16_b32 v0, v0, 
s7, s0 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss_fi: +; GFX12-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 @@ -447,18 +2420,115 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %s ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; 
GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store double %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss_bc: +define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: 
v_permlane16_b32_vss_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -466,11 +2536,11 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss_bc: +; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 @@ -478,13 +2548,13 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss_bc: +; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 @@ -492,18 +2562,115 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %s ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 
true) + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_vss_fi_bc: +define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; 
GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: 
v_permlane16_b32_vss_fi_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -515,7 +2682,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vss_fi_bc: +; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 @@ -529,7 +2696,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vss_fi_bc: +; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 @@ -542,8 +2709,105 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 ; 
GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) - store i32 %v, ptr addrspace(1) %out + %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store double %v, ptr addrspace(1) %out ret void } diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll index 483ea8ad57d1b..925e88d041715 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -2933,37 +2933,37 @@ define amdgpu_kernel void @update_dpp_undef_old(ptr addrspace(1) %out, i32 %in1) ; llvm.amdgcn.permlane16 ; -------------------------------------------------------------------- -declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1 immarg, i1 immarg) +declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1 immarg, i1 immarg) define amdgpu_kernel void @permlane16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; CHECK-LABEL: @permlane16( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[RES:%.*]] = call i32 
@llvm.amdgcn.permlane16.i32(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) ; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; - %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + %res = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %res, ptr addrspace(1) %out ret void } define amdgpu_kernel void @permlane16_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; CHECK-LABEL: @permlane16_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) ; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; - %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) + %res = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i32 %res, ptr addrspace(1) %out ret void } define amdgpu_kernel void @permlane16_fetch_invalid_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; CHECK-LABEL: @permlane16_fetch_invalid_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlane16.i32(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) ; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; - %res = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) + %res = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, 
i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i32 %res, ptr addrspace(1) %out ret void } @@ -2972,37 +2972,37 @@ define amdgpu_kernel void @permlane16_fetch_invalid_bound_ctrl(ptr addrspace(1) ; llvm.amdgcn.permlanex16 ; -------------------------------------------------------------------- -declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1 immarg, i1 immarg) +declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1 immarg, i1 immarg) define amdgpu_kernel void @permlanex16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; CHECK-LABEL: @permlanex16( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 false) ; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; - %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + %res = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %res, ptr addrspace(1) %out ret void } define amdgpu_kernel void @permlanex16_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; CHECK-LABEL: @permlanex16_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 false, i1 true) ; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; - %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) + %res = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 
false, i1 true) store i32 %res, ptr addrspace(1) %out ret void } define amdgpu_kernel void @permlanex16_fetch_invalid_bound_ctrl(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; CHECK-LABEL: @permlanex16_fetch_invalid_bound_ctrl( -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 undef, i32 [[SRC0:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]], i1 true, i1 true) ; CHECK-NEXT: store i32 [[RES]], ptr addrspace(1) [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; - %res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) + %res = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i32 %res, ptr addrspace(1) %out ret void } diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll index bb370a6d1dfeb..5a061fc286935 100644 --- a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll +++ b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll @@ -513,31 +513,31 @@ define i32 @test_udot4(i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3) { ret i32 %val } -declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) +declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) define i32 @test_permlane16(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 %arg4) { ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i1 %arg3 - ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlane16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false) - %v1 = call i32 @llvm.amdgcn.permlane16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false) + ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false) + %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %arg0, i32 %arg0, i32 %arg1, i32 
%arg2, i1 %arg3, i1 false) ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i1 %arg4 - ; CHECK-NEXT: call i32 @llvm.amdgcn.permlane16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4) - %v2 = call i32 @llvm.amdgcn.permlane16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4) + ; CHECK-NEXT: call i32 @llvm.amdgcn.permlane16.i32(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4) + %v2 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4) ret i32 %v2 } -declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) +declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) define i32 @test_permlanex16(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 %arg4) { ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i1 %arg3 - ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlanex16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false) - %v1 = call i32 @llvm.amdgcn.permlanex16(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false) + ; CHECK-NEXT: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false) + %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %arg0, i32 %arg0, i32 %arg1, i32 %arg2, i1 %arg3, i1 false) ; CHECK: immarg operand has non-immediate parameter ; CHECK-NEXT: i1 %arg4 - ; CHECK-NEXT: call i32 @llvm.amdgcn.permlanex16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4) - %v2 = call i32 @llvm.amdgcn.permlanex16(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4) + ; CHECK-NEXT: call i32 @llvm.amdgcn.permlanex16.i32(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4) + %v2 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v2, i32 %arg0, i32 %arg1, i32 %arg2, i1 false, i1 %arg4) ret i32 %v2 } From 827d209bd87738da673da40c8e7d9ce870fe9140 Mon Sep 17 00:00:00 2001 From: Vikram Date: Mon, 20 May 2024 07:36:01 -0400 Subject: [PATCH 
3/9] fix builtin handling --- clang/lib/CodeGen/CGBuiltin.cpp | 2 +- clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl | 4 ++-- clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl | 2 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 5 ++--- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 9ce2f5b6c103b..4184074860bcc 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18493,7 +18493,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Value *Src4 = EmitScalarExpr(E->getArg(4)); llvm::Value *Src5 = EmitScalarExpr(E->getArg(5)); - llvm::Function *F = CGM.getIntrinsic(IID, Src1->getType()); + llvm::Function *F = CGM.getIntrinsic(IID, Src0->getType()); return Builder.CreateCall(F, {Src0, Src1, Src2, Src3, Src4, Src5}); } case AMDGPU::BI__builtin_amdgcn_permlane64: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl index 3c40370e7f107..bc4933edeb32e 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl @@ -7,13 +7,13 @@ typedef unsigned int uint; typedef unsigned long ulong; // CHECK-LABEL: @test_permlane16( -// CHECK: call i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) +// CHECK: call i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) { *out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0); } // CHECK-LABEL: @test_permlanex16( -// CHECK: call i32 @llvm.amdgcn.permlanex16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) +// CHECK: call i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false) void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) { *out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0); } diff --git 
a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl index d17ff81e5d43c..2f45ef326eef2 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl @@ -35,7 +35,7 @@ void test_ds_bvh_stack_rtn(global uint2* out, uint addr, uint data, uint4 data1) } // CHECK-LABEL: @test_permlane64( -// CHECK: call i32 @llvm.amdgcn.permlane64(i32 %a) +// CHECK: call i32 @llvm.amdgcn.permlane64.i32(i32 %a) void test_permlane64(global uint* out, uint a) { *out = __builtin_amdgcn_permlane64(a); } diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 63f7f48e82e4a..ba2582588db94 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2487,14 +2487,14 @@ def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS; //===----------------------------------------------------------------------===// // llvm.amdgcn.permlane16 -def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">, +def int_amdgcn_permlane16 : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlanex16 -def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">, +def int_amdgcn_permlanex16 : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, @@ -2539,7 +2539,6 @@ def int_amdgcn_image_bvh_intersect_ray : // llvm.amdgcn.permlane64 def int_amdgcn_permlane64 : - ClangBuiltin<"__builtin_amdgcn_permlane64">, Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; From 6047848af702cc5d781442ee8fb339a9ab567e65 Mon Sep 17 00:00:00 2001 From: Vikram Date: Mon, 27 May 2024 
06:47:09 +0000 Subject: [PATCH 4/9] Review comments --- clang/lib/CodeGen/CGBuiltin.cpp | 9 ++++----- llvm/docs/AMDGPUUsage.rst | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 4184074860bcc..d9ab0051f0d23 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18481,11 +18481,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, } case AMDGPU::BI__builtin_amdgcn_permlane16: case AMDGPU::BI__builtin_amdgcn_permlanex16: { - Intrinsic::ID IID; - IID = BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16 - ? Intrinsic::amdgcn_permlane16 - : Intrinsic::amdgcn_permlanex16; - llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); @@ -18493,6 +18488,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Value *Src4 = EmitScalarExpr(E->getArg(4)); llvm::Value *Src5 = EmitScalarExpr(E->getArg(5)); + Intrinsic::ID IID = BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16 + ? Intrinsic::amdgcn_permlane16 + : Intrinsic::amdgcn_permlanex16; + llvm::Function *F = CGM.getIntrinsic(IID, Src0->getType()); return Builder.CreateCall(F, {Src0, Src1, Src2, Src3, Src4, Src5}); } diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 51969be85648f..97e2aecd614ed 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1190,6 +1190,25 @@ The AMDGPU backend implements the following LLVM IR intrinsics. reduction will be performed using default iterative strategy. Intrinsic is currently only implemented for i32. + llvm.amdgcn.permlane16 Provides direct access to v_permlane16_b32. Performs arbitrary gather-style + operation within a row (16 contiguous lanes) of the second input operand. + The third and fourth inputs must be scalar values. 
These are combined into + a single 64-bit value representing lane selects used to swizzle within each + row. Currently implemented for i16, i32, float, half, bf16, v2i16, v2f16 and + types whose sizes are multiples of 32-bit. + + llvm.amdgcn.permlanex16 Provides direct access to v_permlanex16_b32. Performs arbitrary gather-style + operation across two rows of the second input operand (each row is 16 contiguous + lanes). The third and fourth inputs must be scalar values. These are combined + into a single 64-bit value representing lane selects used to swizzle within each + row. Currently implemented for i16, i32, float, half, bf16, v2i16, v2f16 and types + whose sizes are multiples of 32-bit. + + llvm.amdgcn.permlane64 Provides direct access to v_permlane64_b32. Performs a specific permutation across + lanes of the input operand where the high half and low half of a wave64 are swapped. + Performs no operation in wave32 mode. Currently implemented for i16, i32, float, + half, bf16, v2i16, v2f16 and types whose sizes are multiples of 32-bit. + llvm.amdgcn.udot2 Provides direct access to v_dot2_u32_u16 across targets which support such instructions. This performs unsigned dot product with two v2i16 operands, summed with the third i32 operand. 
The From 8a36f078ea65a267713cfe477f62500fa1964e21 Mon Sep 17 00:00:00 2001 From: Vikram Date: Wed, 29 May 2024 11:54:16 +0000 Subject: [PATCH 5/9] updated test cases, added new pointer/vector tests --- .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 7558 +++++++++++++++-- .../AMDGPU/llvm.amdgcn.permlane.ptr.ll | 694 ++ .../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll | 401 +- .../AMDGPU/llvm.amdgcn.permlane64.ptr.ll | 180 + 4 files changed, 8081 insertions(+), 752 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index b8dab361346c3..1ae22c3eec185 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -6,10 +6,7 @@ ; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s ; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s -declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) -declare float @llvm.amdgcn.permlane16.f32(float, float, i32, i32, i1, i1) -declare i64 @llvm.amdgcn.permlane16.i64(i64, i64, i32, i32, i1, i1) -declare double @llvm.amdgcn.permlane16.f64(double, double, i32, i32, i1, i1) +declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.workitem.id.y() @@ -1752,7 +1749,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i64 
@llvm.amdgcn.permlane16(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out ret void } @@ -2811,8 +2808,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ret void } -define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss: +define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 @@ -2824,7 +2821,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vss: +; GFX11-LABEL: v_permlanex16_b32_vss_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 @@ -2838,7 +2835,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vss: +; GFX12-LABEL: v_permlanex16_b32_vss_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 @@ -2851,1112 +2848,7189 @@ define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 
%src0) { -; GFX10-LABEL: v_permlanex16_b32_vii: +define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vii: +; GFX11-LABEL: v_permlanex16_b32_vss_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vii: +; GFX12-LABEL: v_permlanex16_b32_vss_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out ret void } -; FIXME-GFX10PLUS: It is allowed to have both immediates as literals -define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlanex16_b32_vll: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_movk_i32 s0, 0x1234 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vll: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: 
v_permlanex16_b32_vss_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vll: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlanex16_b32_vvv: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] -; GFX10-NEXT: s_endpgm +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv: +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv: +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 
0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv: +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv: +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, 
i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs: +define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs: +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 
s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs: +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs: +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: 
v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs: +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs: +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 
0 :: v_dual_mov_b32 v0, s2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlanex16_b32_vii_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vii_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: 
v_permlanex16_b32_vii_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv: +define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, float %src0) { +; GFX10-LABEL: v_permlanex16_b32_vii_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vii_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vii_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 -; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv: +; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv: +; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv: +; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: 
v_permlanex16_b32_vsv: +; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv: +; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; 
GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, double %src0) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX11-SDAG-NEXT: 
global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; 
GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_vss_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss_fi: +; FIXME-GFX10PLUS: It is allowed to have both immediates as literals +define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlanex16_b32_vll_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_movk_i32 s0, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vss_fi: +; GFX11-LABEL: v_permlanex16_b32_vll_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: 
v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vss_fi: +; GFX12-LABEL: v_permlanex16_b32_vll_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_vss_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss_bc: +define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, float %src0) { +; GFX10-LABEL: v_permlanex16_b32_vll_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_movk_i32 s0, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; 
GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vss_bc: +; GFX11-LABEL: v_permlanex16_b32_vll_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vss_bc: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,1] 
-; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_tid_tid: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlane16_b32_tid_tid: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: 
v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: v_permlane16_b32_tid_tid: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_undef_tid: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlane16_b32_undef_tid: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 
v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_undef_tid: +; GFX12-LABEL: v_permlanex16_b32_vll_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false) + store float %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid: +define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] +; 
GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; -; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid: +; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid: +; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid: +; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid: +; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid: +; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, 
i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_i_tid_fi: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_permlane16_b32_i_tid_fi: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_i_tid_fi: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 +; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_f64: +; GFX10-GISEL: ; %bb.0: +; 
GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_f64: +; 
GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX10-LABEL: v_permlanex16_b32_vvv_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_i32: +; 
GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, float %src0) { +; GFX10-LABEL: v_permlanex16_b32_vvv_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; 
GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, 
s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void 
@v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vvv_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vvv_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: 
v_readfirstlane_b32 s5, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vvv_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vvv_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + 
%tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; 
GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64 %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s2 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: 
v_permlanex16_b32_vvs_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s2 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX11-GISEL-NEXT: 
v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void 
@v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, double %src0, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s2 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s2 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, 
s1, s0 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; 
GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: 
v_permlanex16_b32 v0, v0, s3, s4 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s0 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s0 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], 
s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; 
GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, double %src0, i32 %src1) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-SDAG-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s0 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s0 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; 
GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_fi_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vss_fi_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vss_fi_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: 
s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_fi_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vss_fi_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vss_fi_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 
op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; 
GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: 
v_permlanex16_b32_vss_fi_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; 
GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_bc_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[0,1] +; GFX10-NEXT: global_store_dword 
v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vss_bc_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vss_bc_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_bc_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vss_bc_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, 
s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vss_bc_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: +; 
GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 
v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 
+; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + 
+define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX10-SDAG: ; %bb.0: +; 
GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX11-GISEL: ; %bb.0: +; 
GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: 
v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, double %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 
+; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_tid_tid_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_tid_tid_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_tid_tid_i32: +; GFX12: ; %bb.0: +; 
GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_tid_tid_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_tid_tid_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_tid_tid_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + 
%tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %v = call float @llvm.amdgcn.permlane16.f32(float %tidx_f32, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; 
GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; 
GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %tidx_i64, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, float %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; 
GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %v = call double @llvm.amdgcn.permlane16.f64(double %tidx_f64, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_undef_tid_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_undef_tid_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_undef_tid_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlane16_b32 
v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_undef_tid_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_undef_tid_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_undef_tid_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float 
%tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, 
ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: 
v_permlane16_b32_undef_tid_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 
v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; 
GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + 
%tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: 
s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %v = call float @llvm.amdgcn.permlane16.f32(float 1234.5, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret 
void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, 
v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 12345, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], 
v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 
v3, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %v = call double @llvm.amdgcn.permlane16.f64(double 1234.5, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_fi_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_i_tid_fi_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 
0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_i_tid_fi_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_fi_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_i_tid_fi_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_i_tid_fi_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: 
v_permlane16_b32_i_tid_fi_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: +; 
GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; 
GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_bc_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; 
+; GFX11-LABEL: v_permlane16_b32_i_tid_bc_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_i_tid_bc_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_bc_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_i_tid_bc_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 
op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_i_tid_bc_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] 
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 true) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 true) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: 
v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlane16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: 
v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 true) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; 
GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: 
v_permlane16_b32_i_tid_fi_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlane16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 true) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_tid_tid_i32: +; GFX10: ; %bb.0: +; 
GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_tid_tid_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_tid_tid_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_tid_tid_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_tid_tid_f32: +; 
GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_tid_tid_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %v = call float @llvm.amdgcn.permlanex16.f32(float %tidx_f32, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %tidx_i64, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 
v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 
v[0:1], v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %v = call double @llvm.amdgcn.permlanex16.f64(double %tidx_f64, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_undef_tid_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_undef_tid_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_undef_tid_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_undef_tid_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_undef_tid_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_undef_tid_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; 
GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v2, v1, 
s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: 
v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, 
s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %v = call float @llvm.amdgcn.permlanex16.f32(float 1234.5, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX10-GISEL-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 12345, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 
0 :: v_dual_mov_b32 v3, 0x40934a00 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; 
GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %v = call double @llvm.amdgcn.permlanex16.f64(double 1234.5, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %undef = freeze i32 poison + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, 
i32 %src1, i32 %src2, i1 true, i1 false) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 false) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: 
v_permlanex16_b32_i_tid_fi_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void 
@v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; 
GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; 
GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 false) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; 
GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_i_tid_bc: +define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_i_tid_bc: +; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_i_tid_bc: +; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) - store i32 %v, ptr addrspace(1) %out + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX11-SDAG: ; 
%bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: 
s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 true) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 
v1, v1, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 false, i1 true) + store double %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc: +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc: +; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc: +; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlane16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) + %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_tid_tid: +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX10: ; %bb.0: 
; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_tid_tid: +; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_tid_tid: +; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlanex16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out + %tidx_f32 = bitcast i32 %tidx to float + %undef = freeze float poison + %v = call float @llvm.amdgcn.permlanex16.f32(float %undef, float %tidx_f32, i32 %src1, i32 %src2, i1 true, i1 true) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %out, 
i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %undef = freeze i64 poison + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %undef, i64 %tidx_i64, i32 %src1, i32 %src2, i1 true, i1 true) + store i64 %v, 
ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; 
GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %undef = freeze double poison + %v = call double @llvm.amdgcn.permlanex16.f64(double %undef, double %tidx_f64, i32 %src1, i32 %src2, i1 true, i1 true) + store double %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_undef_tid: +define void @v_permlane16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_half: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlane16_half: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
v_permlanex16_b32_undef_tid: +; GFX12-LABEL: v_permlane16_half: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call half @llvm.amdgcn.permlane16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store half %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_half: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlanex16_half: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlanex16_half: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 
+; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call half @llvm.amdgcn.permlanex16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store half %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_bfloat: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlane16_bfloat: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlane16_bfloat: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call bfloat @llvm.amdgcn.permlane16.bf16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store bfloat %v, ptr addrspace(1) %out + ret void +} +
+define void @v_permlanex16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_bfloat: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlanex16_bfloat: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlanex16_bfloat: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call bfloat @llvm.amdgcn.permlanex16.bf16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store bfloat %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlane16_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0)
lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlane16_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call i16 @llvm.amdgcn.permlane16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i16 %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlanex16_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlanex16_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; 
GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call i16 @llvm.amdgcn.permlanex16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i16 %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlane16_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlane16_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <2 x half> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: 
v_permlanex16_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlanex16_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: v_permlanex16_b32_undef_tid: +; GFX12-LABEL: v_permlanex16_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlanex16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <2 x half> %v, ptr addrspace(1) %out ret void } -define 
amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid: +define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v2f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlane16_v2f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-GISEL-NEXT: s_endpgm -; -; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; 
GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v2f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] -; GFX11-SDAG-NEXT: s_nop 0 -; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-SDAG-NEXT: s_endpgm +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid: +; GFX11-GISEL-LABEL: v_permlane16_v2f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v5 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_nop 0 -; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-GISEL-NEXT: s_setpc_b64 
s[30:31] ; -; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid: +; GFX12-SDAG-LABEL: v_permlane16_v2f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid: +; GFX12-GISEL-LABEL: v_permlane16_v2f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: 
s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) - store i32 %v, ptr addrspace(1) %out +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <2 x float> %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_i_tid_fi: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v2f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlanex16_v2f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: 
global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v2f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_permlanex16_b32_i_tid_fi: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlanex16_v2f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: v_permlanex16_b32_i_tid_fi: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 
@llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) - store i32 %v, ptr addrspace(1) %out +; GFX12-SDAG-LABEL: v_permlanex16_v2f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlanex16_v2f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlanex16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <2 x float> %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_i_tid_bc: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v7i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v10 +; GFX10-SDAG-NEXT: v_permlane16_b32 v8, v8, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlane16_v7i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v9 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v10 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v8, v8, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v7i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 
s1, v10 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_permlanex16_b32_i_tid_bc: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlane16_v7i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: v_permlanex16_b32_i_tid_bc: -; GFX12: ; 
%bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) - store i32 %v, ptr addrspace(1) %out +; GFX12-SDAG-LABEL: v_permlane16_v7i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlane16_v7i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlane16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <7 x i32> %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v7i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v10 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v8, v8, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; 
GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlanex16_v7i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v9 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v10 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v8, v8, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v7i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; 
GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlanex16_v7i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %undef = freeze i32 poison - %v = call i32 @llvm.amdgcn.permlanex16(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) - store i32 %v, ptr addrspace(1) %out +; GFX12-SDAG-LABEL: v_permlanex16_v7i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlanex16_v7i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlanex16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <7 x i32> %v, ptr addrspace(1) %out + ret 
void +} + +define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v8i16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlane16_v8i16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v8i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlane16_v8i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; 
GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v8i16: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlane16_v8i16: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlane16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <8 x i16> %v, ptr addrspace(1) %out + ret 
void +} + +define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v8i16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_permlanex16_v8i16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v8i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlanex16_v8i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 
s1, v7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v8i16: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: v_permlanex16_v8i16: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlanex16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <8 x i16> %v, ptr 
addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll new file mode 100644 index 0000000000000..bb42834221681 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll @@ -0,0 +1,694 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-SDAG %s +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG %s +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s + +define void @v_permlane16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_p0: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_p0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_p0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: 
s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call ptr @llvm.amdgcn.permlane16.p0(ptr %src0, ptr %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store ptr %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_p0: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_p0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_p0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call ptr @llvm.amdgcn.permlanex16.p0(ptr %src0, ptr %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store ptr %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v3p0: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v8 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v9 +; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v3p0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX12-SDAG-LABEL: v_permlane16_v3p0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x ptr> @llvm.amdgcn.permlane16.v3p0(<3 x ptr> %src0, <3 x ptr> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <3 x ptr> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v3p0: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v8 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v9 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v7, v7, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v6, v6, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v5, v5, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v3p0: +; GFX11-SDAG: ; %bb.0: +; 
GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v3p0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x ptr> @llvm.amdgcn.permlanex16.v3p0(<3 x ptr> %src0, <3 x ptr> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <3 x ptr> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_p3: +; GFX10-SDAG: ; 
%bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_p3: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_p3: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call ptr addrspace(3) @llvm.amdgcn.permlane16.p3(ptr addrspace(3) %src0, ptr addrspace(3) %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store ptr addrspace(3) %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_p3: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: 
v_permlanex16_p3: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_p3: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call ptr addrspace(3) @llvm.amdgcn.permlanex16.p3(ptr addrspace(3) %src0, ptr addrspace(3) %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store ptr addrspace(3) %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v3p3: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[2:4], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v3p3: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v5 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX11-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v3p3: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane16.v3p3(<3 x ptr addrspace(3)> %src0, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v3p3: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[2:4], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v3p3: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v5 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, 
v6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v3p3: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlanex16.v3p3(<3 x ptr addrspace(3)> %src0, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_p5: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_p5: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, 
v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_p5: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call ptr addrspace(5) @llvm.amdgcn.permlane16.p5(ptr addrspace(5) %src0, ptr addrspace(5) %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store ptr addrspace(5) %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_p5: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_p5: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_p5: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call ptr addrspace(5) @llvm.amdgcn.permlanex16.p5(ptr addrspace(5) %src0, ptr addrspace(5) %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store ptr addrspace(5) %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v3p5: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[2:4], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v3p5: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v5 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v3p5: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 +; GFX12-SDAG-NEXT: 
v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane16.v3p5(<3 x ptr addrspace(5)> %src0, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v3p5: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[2:4], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v3p5: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v5 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v3p5: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 
s0, v5 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlanex16.v3p5(<3 x ptr addrspace(5)> %src0, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_p6: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_p6: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_p6: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off 
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call ptr addrspace(6) @llvm.amdgcn.permlane16.p6(ptr addrspace(6) %src0, ptr addrspace(6) %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store ptr addrspace(6) %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_p6: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dword v[0:1], v2, off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_p6: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_p6: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call ptr addrspace(6) @llvm.amdgcn.permlanex16.p6(ptr addrspace(6) %src0, ptr addrspace(6) %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store ptr addrspace(6) %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src0, 
i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlane16_v3p6: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[2:4], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlane16_v3p6: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v5 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlane16_v3p6: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane16.v3p6(<3 x ptr addrspace(6)> %src0, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_v3p6(ptr addrspace(1) %out, <3 x ptr 
addrspace(6)> %src0, i32 %src1, i32 %src2) { +; GFX10-SDAG-LABEL: v_permlanex16_v3p6: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v5 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v4, v4, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v3, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx3 v[0:1], v[2:4], off +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_permlanex16_v3p6: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v5 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: v_permlanex16_v3p6: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] + %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlanex16.v3p6(<3 x ptr addrspace(6)> %src0, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out + ret void +} diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll index b81cb97725648..f653baa7365c7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -5,8 +5,8 @@ declare i32 @llvm.amdgcn.permlane64(i32) declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) { -; GFX11-LABEL: test_s: +define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX11-LABEL: test_s_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c @@ -19,13 +19,98 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) { ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane64(i32 %src0) + %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %src0) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_i(ptr addrspace(1) %out) { -; GFX11-LABEL: test_i: +define amdgpu_kernel void @test_s_f32(ptr addrspace(1) %out, float %src0) { +; GFX11-LABEL: test_s_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane64_b32 v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlane64.f32(float %src0) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX11-SDAG-LABEL: test_s_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: 
v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_s_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %src0) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) { +; GFX11-SDAG-LABEL: test_s_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_s_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane64.f64(double %src0) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_i_i32(ptr addrspace(1) %out) { +; GFX11-LABEL: test_i_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 @@ -36,13 +121,95 @@ define amdgpu_kernel void @test_i(ptr addrspace(1) %out) { ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane64(i32 99) + %v = call i32 @llvm.amdgcn.permlane64.i32(i32 99) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { -; GFX11-SDAG-LABEL: test_v: +define amdgpu_kernel void @test_i_f32(ptr addrspace(1) %out) { +; GFX11-LABEL: test_i_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane64_b32 v0, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlane64.f32(float 1234.5) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_i_i64(ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: test_i_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_i_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 99) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_i_f64(ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: test_i_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x40934a00 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_i_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x40934a00 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane64.f64(double 1234.5) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v_i32(ptr addrspace(1) %out, i32 %src0) #1 { +; GFX11-SDAG-LABEL: test_v_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 @@ -53,7 +220,7 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: test_v: +; GFX11-GISEL-LABEL: test_v_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 @@ -64,7 +231,221 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane64(i32 %tidx) + %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %tidx) store i32 %v, ptr addrspace(1) %out ret void } + +define amdgpu_kernel void @test_v_f32(ptr addrspace(1) %out, float %src0) #1 { +; GFX11-SDAG-LABEL: test_v_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %v = call float @llvm.amdgcn.permlane64.f32(float %tidx_f32) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v_i64(ptr addrspace(1) %out, i64 %src0) #1 { +; GFX11-LABEL: test_v_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_permlane64_b32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlane64_b32 v1, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %tidx_i64) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v_f64(ptr addrspace(1) %out, double %src0) #1 { +; GFX11-SDAG-LABEL: test_v_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %v = call double @llvm.amdgcn.permlane64.f64(double %tidx_f64) + store double %v, ptr addrspace(1) %out + ret void +} + +define void @test_half(ptr addrspace(1) %out, half %src0) { +; GFX11-LABEL: test_half: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_permlane64_b32 v2, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call half @llvm.amdgcn.permlane64.f16(half %src0) + store half %v, ptr addrspace(1) %out + ret void +} + +define void @test_bfloat(ptr addrspace(1) %out, bfloat %src0) { +; GFX11-LABEL: test_bfloat: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_permlane64_b32 v2, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call bfloat @llvm.amdgcn.permlane64.bf16(bfloat %src0) + store bfloat %v, ptr addrspace(1) %out + ret void +} + +define void @test_i16(ptr addrspace(1) %out, i16 %src0) { +; GFX11-LABEL: test_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_permlane64_b32 v2, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 
s[30:31] + %v = call i16 @llvm.amdgcn.permlane64.i16(i16 %src0) + store i16 %v, ptr addrspace(1) %out + ret void +} + +define void @test_v2f16(ptr addrspace(1) %out, <2 x half> %src0) { +; GFX11-LABEL: test_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_permlane64_b32 v2, v2 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane64.v2f16(<2 x half> %src0) + store <2 x half> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v2f32(ptr addrspace(1) %out, <2 x float> %src0) { +; GFX11-SDAG-LABEL: test_v2f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v2f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane64.v2f32(<2 x float> %src0) + store <2 x float> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v7i32(ptr addrspace(1) %out, <7 x i32> %src0) { +; GFX11-SDAG-LABEL: test_v7i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8 +; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 +; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 
v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v7i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 +; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 +; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlane64.v7i32(<7 x i32> %src0) + store <7 x i32> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v8i16(ptr addrspace(1) %out, <8 x i16> %src0) { +; GFX11-SDAG-LABEL: test_v8i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v8i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlane64.v8i16(<8 x i16> %src0) + store <8 x i16> %v, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll new file mode 100644 index 
0000000000000..2070a832e0fcd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll @@ -0,0 +1,180 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG %s + +define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) { +; GFX11-SDAG-LABEL: test_p0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm + %v = call ptr @llvm.amdgcn.permlane64.p0(ptr %src0) + store ptr %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) { +; GFX11-SDAG-LABEL: test_v3p0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x2 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x44 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x54 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v1 +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v8 +; 
GFX11-SDAG-NEXT: v_permlane64_b32 v3, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v7 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm + %v = call <3 x ptr> @llvm.amdgcn.permlane64.v3p0(<3 x ptr> %src0) + store <3 x ptr> %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0) { +; GFX11-SDAG-LABEL: test_p3: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm + %v = call ptr addrspace(3) @llvm.amdgcn.permlane64.p3(ptr addrspace(3) %src0) + store ptr addrspace(3) %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src0) { +; GFX11-SDAG-LABEL: test_v3p3: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT:
v_permlane64_b32 v0, v3 +; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm + %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane64.v3p3(<3 x ptr addrspace(3)> %src0) + store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0) { +; GFX11-SDAG-LABEL: test_p5: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm + %v = call ptr addrspace(5) @llvm.amdgcn.permlane64.p5(ptr addrspace(5) %src0) + store ptr addrspace(5) %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src0) { +; GFX11-SDAG-LABEL: test_v3p5: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 +; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; 
GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm + %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane64.v3p5(<3 x ptr addrspace(5)> %src0) + store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0) { +; GFX11-SDAG-LABEL: test_p6: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm + %v = call ptr addrspace(6) @llvm.amdgcn.permlane64.p6(ptr addrspace(6) %src0) + store ptr addrspace(6) %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src0) { +; GFX11-SDAG-LABEL: test_v3p6: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 +; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm + %v = call <3 x ptr addrspace(6)> 
@llvm.amdgcn.permlane64.v3p6(<3 x ptr addrspace(6)> %src0) + store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out + ret void +} From 5a4c4c4dfbcbe85746947a6cfefa8dd48d793bd2 Mon Sep 17 00:00:00 2001 From: Vikram Date: Fri, 31 May 2024 08:11:42 +0000 Subject: [PATCH 6/9] Take over recent changes from original patch --- llvm/docs/AMDGPUUsage.rst | 13 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 186 +++--------- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 109 +++++-- .../AMDGPU/llvm.amdgcn.readfirstlane.ll | 58 ---- .../AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll | 34 +++ .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 66 ----- .../AMDGPU/llvm.amdgcn.readlane.ptr.ll | 38 +++ .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 277 ------------------ .../AMDGPU/llvm.amdgcn.writelane.ptr.ll | 144 +++++++++ 9 files changed, 347 insertions(+), 578 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 97e2aecd614ed..2d24d9e26dbed 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1194,20 +1194,21 @@ The AMDGPU backend implements the following LLVM IR intrinsics. operation within a row (16 contiguous lanes) of the second input operand. The third and fourth inputs must be scalar values. these are combined into a single 64-bit value representing lane selects used to swizzle within each - row. Currently implemented for i16, i32, float, half, bf16, v2i16, v2f16 and - types whose sizes are multiples of 32-bit. + row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>, + <2 x half>, <2 x bfloat>, i64, double, pointers, and vectors whose total size is a multiple of 32 bits. llvm.amdgcn.permlanex16 Provides direct access to v_permlanex16_b32. Performs arbitrary gather-style operation across two rows of the second input operand (each row is 16 contiguous lanes). The third and fourth inputs must be scalar values. these are combined into a single 64-bit value representing lane selects used to swizzle within each - row.
Currently implemented for i16, i32, float, half, bf16, v2i16, v2f16 and types - whose sizes are multiples of 32-bit. + row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>, + <2 x bfloat>, i64, double, pointers, and vectors whose total size is a multiple of 32 bits. llvm.amdgcn.permlane64 Provides direct access to v_permlane64_b32. Performs a specific permutation across lanes of the input operand where the high half and low half of a wave64 are swapped. - Performs no operation in wave32 mode. Currently implemented for i16, i32, float, - half, bf16, v2i16, v2f16 and types whose sizes are multiples of 32-bit. + Performs no operation in wave32 mode. Currently implemented for i16, i32, float, half, + bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers, and vectors whose + total size is a multiple of 32 bits. llvm.amdgcn.udot2 Provides direct access to v_dot2_u32_u16 across targets which support such instructions. This performs unsigned dot product diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index b28c3521d6336..aa0ea06c6092a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5394,15 +5394,12 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, MachineIRBuilder &B = Helper.MIRBuilder; MachineRegisterInfo &MRI = *B.getMRI(); - Register DstReg = MI.getOperand(0).getReg(); - Register Src0 = MI.getOperand(2).getReg(); - bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || IID == Intrinsic::amdgcn_permlanex16; - auto createLaneOp = [&](Register Src0, Register Src1, - Register Src2) -> Register { - auto LaneOp = B.buildIntrinsic(IID, {S32}).addUse(Src0); + auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1, + Register Src2, LLT VT) -> Register { + auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0); switch (IID) { case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_permlane64: @@ -5428,6 +5425,8 @@
bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, } }; + Register DstReg = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(2).getReg(); Register Src1, Src2; if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || IsPermLane16) { @@ -5446,156 +5445,65 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, } if (Size < 32) { - Register Src0Cast = MRI.getType(Src0).isScalar() - ? Src0 - : B.buildBitcast(LLT::scalar(Size), Src0).getReg(0); - Src0 = B.buildAnyExt(S32, Src0Cast).getReg(0); - - if (IsPermLane16) { - Register Src1Cast = - MRI.getType(Src1).isScalar() - ? Src1 - : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0); - Src1 = B.buildAnyExt(LLT::scalar(32), Src1Cast).getReg(0); - } + Src0 = B.buildAnyExt(S32, Src0).getReg(0); - if (IID == Intrinsic::amdgcn_writelane) { - Register Src2Cast = - MRI.getType(Src2).isScalar() - ? Src2 - : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0); - Src2 = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0); - } + if (IsPermLane16) + Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0); - Register LaneOpDst = createLaneOp(Src0, Src1, Src2); - if (Ty.isScalar()) - B.buildTrunc(DstReg, LaneOpDst); - else { - auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOpDst); - B.buildBitcast(DstReg, Trunc); - } + if (IID == Intrinsic::amdgcn_writelane) + Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0); + Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32); + B.buildTrunc(DstReg, LaneOpDst); MI.eraseFromParent(); return true; } - if ((Size % 32) == 0) { - SmallVector PartialRes; - unsigned NumParts = Size / 32; - auto IsS16Vec = Ty.isVector() && Ty.getElementType() == S16; - MachineInstrBuilder Src0Parts; - - if (Ty.isPointer()) { - auto PtrToInt = B.buildPtrToInt(LLT::scalar(Size), Src0); - Src0Parts = B.buildUnmerge(S32, PtrToInt); - } else if (Ty.isPointerVector()) { - LLT IntVecTy = Ty.changeElementType( - LLT::scalar(Ty.getElementType().getSizeInBits())); - 
auto PtrToInt = B.buildPtrToInt(IntVecTy, Src0); - Src0Parts = B.buildUnmerge(S32, PtrToInt); - } else - Src0Parts = - IsS16Vec ? B.buildUnmerge(V2S16, Src0) : B.buildUnmerge(S32, Src0); + if (Size % 32 != 0) + return false; - switch (IID) { - case Intrinsic::amdgcn_readlane: { - Register Src1 = MI.getOperand(3).getReg(); - for (unsigned i = 0; i < NumParts; ++i) { - Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0) - : Src0Parts.getReg(i); - PartialRes.push_back( - (B.buildIntrinsic(Intrinsic::amdgcn_readlane, {S32}) - .addUse(Src0) - .addUse(Src1)) - .getReg(0)); - } + LLT PartialResTy = S32; + if (Ty.isVector()) { + LLT EltTy = Ty.getElementType(); + switch (EltTy.getSizeInBits()) { + case 16: + PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2)); break; - } - case Intrinsic::amdgcn_readfirstlane: - case Intrinsic::amdgcn_permlane64: { - for (unsigned i = 0; i < NumParts; ++i) { - Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0) - : Src0Parts.getReg(i); - PartialRes.push_back( - (B.buildIntrinsic(IID, {S32}).addUse(Src0).getReg(0))); - } - + case 32: + PartialResTy = EltTy; break; - } - case Intrinsic::amdgcn_writelane: - case Intrinsic::amdgcn_permlane16: - case Intrinsic::amdgcn_permlanex16: { - Register Src1 = MI.getOperand(3).getReg(); - Register Src2 = MI.getOperand(4).getReg(); - - Register SrcX = IsPermLane16 ? Src1 : Src2; - MachineInstrBuilder SrcXParts; - - if (Ty.isPointer()) { - auto PtrToInt = B.buildPtrToInt(S64, SrcX); - SrcXParts = B.buildUnmerge(S32, PtrToInt); - } else if (Ty.isPointerVector()) { - LLT IntVecTy = Ty.changeElementType( - LLT::scalar(Ty.getElementType().getSizeInBits())); - auto PtrToInt = B.buildPtrToInt(IntVecTy, SrcX); - SrcXParts = B.buildUnmerge(S32, PtrToInt); - } else - SrcXParts = - IsS16Vec ? B.buildUnmerge(V2S16, SrcX) : B.buildUnmerge(S32, SrcX); - - for (unsigned i = 0; i < NumParts; ++i) { - Src0 = IsS16Vec ? 
B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0) - : Src0Parts.getReg(i); - SrcX = IsS16Vec ? B.buildBitcast(S32, SrcXParts.getReg(i)).getReg(0) - : SrcXParts.getReg(i); - PartialRes.push_back(IsPermLane16 ? createLaneOp(Src0, SrcX, Src2) - : createLaneOp(Src0, Src1, SrcX)); - } - + default: + // Handle all other cases via S32 pieces; break; } - } + } - if (Ty.isPointerVector()) { - unsigned PtrSize = Ty.getElementType().getSizeInBits(); - SmallVector PtrElements; - if (PtrSize == 32) { - // Handle 32 bit pointers - for (unsigned i = 0; i < NumParts; i++) - PtrElements.push_back( - B.buildIntToPtr(Ty.getElementType(), PartialRes[i]).getReg(0)); - } else { - // Handle legalization of - SmallVector PtrParts; - unsigned NumS32Parts = PtrSize / 32; - unsigned PartIdx = 0; - for (unsigned i = 0, j = 1; i < NumParts; i += NumS32Parts, j++) { - // Merge S32 components of a pointer element first. - for (; PartIdx < (j * NumS32Parts); PartIdx++) - PtrParts.push_back(PartialRes[PartIdx]); - - auto MergedPtr = - B.buildMergeLikeInstr(LLT::scalar(PtrSize), PtrParts); - PtrElements.push_back( - B.buildIntToPtr(Ty.getElementType(), MergedPtr).getReg(0)); - PtrParts.clear(); - } - } + SmallVector PartialRes; + unsigned NumParts = Size / 32; + MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0); + MachineInstrBuilder Src1Parts, Src2Parts; - B.buildMergeLikeInstr(DstReg, PtrElements); - } else { - if (IsS16Vec) { - for (unsigned i = 0; i < NumParts; i++) - PartialRes[i] = B.buildBitcast(V2S16, PartialRes[i]).getReg(0); - } - B.buildMergeLikeInstr(DstReg, PartialRes); - } + if (IsPermLane16) + Src1Parts = B.buildUnmerge(PartialResTy, Src1); - MI.eraseFromParent(); - return true; + if (IID == Intrinsic::amdgcn_writelane) + Src2Parts = B.buildUnmerge(PartialResTy, Src2); + + for (unsigned i = 0; i < NumParts; ++i) { + Src0 = Src0Parts.getReg(i); + + if (IsPermLane16) + Src1 = Src1Parts.getReg(i); + + if (IID == Intrinsic::amdgcn_writelane) + Src2 = 
Src2Parts.getReg(i); + + PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy)); } - return false; + B.buildMergeLikeInstr(DstReg, PartialRes); + MI.eraseFromParent(); + return true; } bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5d34ed089f65d..acd290cc36441 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6093,30 +6093,36 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, unsigned IntrinsicID = N->getConstantOperandVal(0); bool IsPermLane16 = IntrinsicID == Intrinsic::amdgcn_permlane16 || IntrinsicID == Intrinsic::amdgcn_permlanex16; - bool IsPermLane64 = IntrinsicID == Intrinsic::amdgcn_permlane64; SDValue Src0 = N->getOperand(1); SDLoc SL(N); MVT IntVT = MVT::getIntegerVT(ValSize); - auto createLaneOp = [&](SDValue Src0, SDValue Src1, SDValue Src2, - MVT ValueT) -> SDValue { - if (IsPermLane16 || IsPermLane64) { - if (IsPermLane16) { - SDValue Src3 = N->getOperand(4); - SDValue Src4 = N->getOperand(5); - SDValue Src5 = N->getOperand(6); - return DAG.getNode(IntrinsicID == Intrinsic::amdgcn_permlane16 - ? AMDGPUISD::PERMLANE16 - : AMDGPUISD::PERMLANEX16, - SL, ValueT, {Src0, Src1, Src2, Src3, Src4, Src5}); - } - return DAG.getNode(AMDGPUISD::PERMLANE64, SL, ValueT, {Src0}); + auto createLaneOp = [&DAG, &SL, N](SDValue Src0, SDValue Src1, SDValue Src2, + MVT ValueT) -> SDValue { + switch (unsigned IID = N->getConstantOperandVal(0)) { + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_permlane64: + return DAG.getNode(IID == Intrinsic::amdgcn_readfirstlane + ? 
AMDGPUISD::READFIRSTLANE + : AMDGPUISD::PERMLANE64, + SL, ValueT, {Src0}); + case Intrinsic::amdgcn_readlane: + return DAG.getNode(AMDGPUISD::READLANE, SL, ValueT, {Src0, Src1}); + case Intrinsic::amdgcn_writelane: + return DAG.getNode(AMDGPUISD::WRITELANE, SL, ValueT, {Src0, Src1, Src2}); + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: { + SDValue Src3 = N->getOperand(4); + SDValue Src4 = N->getOperand(5); + SDValue Src5 = N->getOperand(6); + return DAG.getNode(IID == Intrinsic::amdgcn_permlane16 + ? AMDGPUISD::PERMLANE16 + : AMDGPUISD::PERMLANEX16, + SL, ValueT, {Src0, Src1, Src2, Src3, Src4, Src5}); + } + default: + llvm_unreachable("unhandled lane op"); } - - return ( - Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, ValueT, {Src0, Src1, Src2}) - : Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, ValueT, {Src0, Src1}) - : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, ValueT, {Src0})); }; SDValue Src1, Src2; @@ -6133,31 +6139,73 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, } if (ValSize < 32) { - SDValue InitBitCast = DAG.getBitcast(IntVT, Src0); - Src0 = DAG.getAnyExtOrTrunc(InitBitCast, SL, MVT::i32); + bool IsFloat = VT.isFloatingPoint(); + Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0, + SL, MVT::i32); if (IsPermLane16) { - SDValue Src1Cast = DAG.getBitcast(IntVT, Src1); - Src1 = DAG.getAnyExtOrTrunc(Src1Cast, SL, MVT::i32); + Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1, + SL, MVT::i32); } if (IntrinsicID == Intrinsic::amdgcn_writelane) { - SDValue Src2Cast = DAG.getBitcast(IntVT, Src2); - Src2 = DAG.getAnyExtOrTrunc(Src2Cast, SL, MVT::i32); + Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2, + SL, MVT::i32); } SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32); SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT); - return DAG.getBitcast(VT, Trunc); + return IsFloat ? 
DAG.getBitcast(VT, Trunc) : Trunc; + } + + if (ValSize % 32 != 0) + return SDValue(); + + if (VT.isVector()) { + switch (MVT::SimpleValueType EltTy = + VT.getVectorElementType().getSimpleVT().SimpleTy) { + case MVT::i32: + case MVT::f32: { + SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT()); + return DAG.UnrollVectorOp(LaneOp.getNode()); + } + case MVT::i16: + case MVT::f16: + case MVT::bf16: { + MVT SubVecVT = MVT::getVectorVT(EltTy, 2); + SmallVector Pieces; + SDValue Src0SubVec, Src1SubVec, Src2SubVec; + for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) { + Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0, + DAG.getConstant(EltIdx, SL, MVT::i32)); + + if (IsPermLane16) + Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1, + DAG.getConstant(EltIdx, SL, MVT::i32)); + + if (IntrinsicID == Intrinsic::amdgcn_writelane) + Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2, + DAG.getConstant(EltIdx, SL, MVT::i32)); + + Pieces.push_back( + IsPermLane16 + ? 
createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT) + : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT)); + EltIdx += 2; + } + return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces); + } + default: + // Handle all other cases by bitcasting to i32 vectors + break; + } } - if ((ValSize % 32) == 0) { MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32); Src0 = DAG.getBitcast(VecVT, Src0); - if (IsPermLane16) { + if (IsPermLane16) Src1 = DAG.getBitcast(VecVT, Src1); - } if (IntrinsicID == Intrinsic::amdgcn_writelane) Src2 = DAG.getBitcast(VecVT, Src2); @@ -6165,9 +6213,6 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT); SDValue UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode()); return DAG.getBitcast(VT, UnrolledLaneOp); - } - - return SDValue(); } void SITargetLowering::ReplaceNodeResults(SDNode *N, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index 732489f22c36f..ed0da0d2a61a2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -566,64 +566,6 @@ define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) { ret void } -define void @test_readfirstlane_p0(ptr addrspace(1) %out, ptr %src) { -; CHECK-SDAG-LABEL: test_readfirstlane_p0: -; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:5] -; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; CHECK-GISEL-LABEL: test_readfirstlane_p0: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: ;;#ASMSTART -; 
CHECK-GISEL-NEXT: ; use s[4:5] -; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] - %x = call ptr @llvm.amdgcn.readfirstlane.p0(ptr %src) - call void asm sideeffect "; use $0", "s"(ptr %x) - ret void -} - -define void @test_readfirstlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) { -; CHECK-SDAG-LABEL: test_readfirstlane_v3p0: -; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:9] -; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; CHECK-GISEL-LABEL: test_readfirstlane_v3p0: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:9] -; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] - %x = call <3 x ptr> @llvm.amdgcn.readfirstlane.v3p0(<3 x ptr> %src) - call void asm sideeffect "; use $0", "s"(<3 x ptr> %x) - ret void -} - define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v8i16: ; CHECK-SDAG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll index 588f239606f52..3882a5f0f9f4f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll @@ -1,6 +1,40 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s +define void @test_readfirstlane_p0(ptr addrspace(1) %out, ptr %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_p0: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:5] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call ptr @llvm.amdgcn.readfirstlane.p0(ptr %src) + call void asm sideeffect "; use $0", "s"(ptr %x) + ret void +} + +define void @test_readfirstlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_v3p0: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:9] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x ptr> @llvm.amdgcn.readfirstlane.v3p0(<3 x ptr> %src) + call void asm sideeffect "; use $0", "s"(<3 x ptr> %x) + ret void +} + define void @test_readfirstlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_p3: ; CHECK-SDAG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 71cd3db81addd..325a39abb588a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -862,72 +862,6 @@ define void @test_readlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src ret void } -define void @test_readlane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) { -; CHECK-SDAG-LABEL: test_readlane_p0: -; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 -; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:5] -; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; CHECK-GISEL-LABEL: test_readlane_p0: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v4 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s5 -; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s5 -; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:5] -; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] - %x = call ptr @llvm.amdgcn.readlane.p0(ptr %src, i32 %src1) - call void asm sideeffect "; use $0", "s"(ptr %x) - ret void -} - -define void @test_readlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) { -; CHECK-SDAG-LABEL: test_readlane_v3p0: -; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v8 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 -; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:9] -; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; CHECK-GISEL-LABEL: test_readlane_v3p0: -; CHECK-GISEL: ; 
%bb.0: -; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v8 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s9 -; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s9 -; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s9 -; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s9 -; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s9 -; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s9 -; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:9] -; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] - %x = call <3 x ptr> @llvm.amdgcn.readlane.v3p0(<3 x ptr> %src, i32 %src1) - call void asm sideeffect "; use $0", "s"(<3 x ptr> %x) - ret void -} - define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) { ; CHECK-SDAG-LABEL: test_readlane_v8i16: ; CHECK-SDAG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll index 1b4ee84c75250..49f8ef391c230 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll @@ -1,6 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s +define void @test_readlane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_p0: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:5] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call ptr @llvm.amdgcn.readlane.p0(ptr %src, i32 %src1) + call void asm sideeffect "; use $0", 
"s"(ptr %x) + ret void +} + +define void @test_readlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readlane_v3p0: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v8 +; CHECK-SDAG-NEXT: s_nop 3 +; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; use s[4:9] +; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] + %x = call <3 x ptr> @llvm.amdgcn.readlane.v3p0(<3 x ptr> %src, i32 %src1) + call void asm sideeffect "; use $0", "s"(<3 x ptr> %x) + ret void +} + define void @test_readlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) { ; CHECK-SDAG-LABEL: test_readlane_p3: ; CHECK-SDAG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index d0a865f565eeb..31f1085dd76ee 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -2651,283 +2651,6 @@ define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %sr ret void } -define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) { -; GFX802-SDAG-LABEL: test_writelane_p0: -; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX802-SDAG-NEXT: flat_load_dwordx2 v[5:6], v[0:1] -; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v4 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 -; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: s_nop 0 -; GFX802-SDAG-NEXT: v_writelane_b32 v6, s4, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v5, s5, m0 -; GFX802-SDAG-NEXT: 
flat_store_dwordx2 v[0:1], v[5:6] -; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1010-SDAG-LABEL: test_writelane_p0: -; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010-SDAG-NEXT: global_load_dwordx2 v[5:6], v[0:1], off -; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v4 -; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v2 -; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s4, s5 -; GFX1010-SDAG-NEXT: v_writelane_b32 v5, s6, s5 -; GFX1010-SDAG-NEXT: global_store_dwordx2 v[0:1], v[5:6], off -; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-SDAG-LABEL: test_writelane_p0: -; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-SDAG-NEXT: global_load_b64 v[5:6], v[0:1], off -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v3 -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4 -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1 -; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1 -; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off -; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX802-GISEL-LABEL: test_writelane_p0: -; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX802-GISEL-NEXT: flat_load_dwordx2 v[5:6], v[0:1] -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v4 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 -; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX802-GISEL-NEXT: v_writelane_b32 v5, s4, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v6, s6, m0 -; GFX802-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[5:6] -; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) -; 
GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1010-GISEL-LABEL: test_writelane_p0: -; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010-GISEL-NEXT: global_load_dwordx2 v[5:6], v[0:1], off -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v4 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-GISEL-NEXT: v_writelane_b32 v5, s4, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v6, s6, s5 -; GFX1010-GISEL-NEXT: global_store_dwordx2 v[0:1], v[5:6], off -; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-GISEL-LABEL: test_writelane_p0: -; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-GISEL-NEXT: global_load_b64 v[5:6], v[0:1], off -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v4 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_writelane_b32 v5, s0, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v6, s2, s1 -; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[5:6], off -; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] - %oldval = load ptr, ptr addrspace(1) %out - %writelane = call ptr @llvm.amdgcn.writelane.p0(ptr %src, i32 %src1, ptr %oldval) - store ptr %writelane, ptr addrspace(1) %out, align 4 - ret void -} - -define void @test_writelane_v3p0(ptr addrspace(1) %out, <4 x ptr> %src, i32 %src1) { -; GFX802-SDAG-LABEL: test_writelane_v3p0: -; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX802-SDAG-NEXT: v_add_u32_e32 v19, vcc, 16, v0 -; GFX802-SDAG-NEXT: flat_load_dwordx4 v[11:14], v[0:1] -; GFX802-SDAG-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc -; GFX802-SDAG-NEXT: flat_load_dwordx4 v[15:18], v[19:20] -; GFX802-SDAG-NEXT: 
v_readfirstlane_b32 m0, v10 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v5 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v4 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v3 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v2 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v8 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v7 -; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v6 -; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX802-SDAG-NEXT: v_writelane_b32 v14, s8, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v13, s9, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v12, s10, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v11, s11, m0 -; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: v_writelane_b32 v18, s4, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v17, s5, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v16, s6, m0 -; GFX802-SDAG-NEXT: v_writelane_b32 v15, s7, m0 -; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[11:14] -; GFX802-SDAG-NEXT: flat_store_dwordx4 v[19:20], v[15:18] -; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1010-SDAG-LABEL: test_writelane_v3p0: -; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:16 -; GFX1010-SDAG-NEXT: global_load_dwordx4 v[15:18], v[0:1], off -; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v10 -; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v5 -; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v4 -; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v3 -; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s12, v2 -; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v9 -; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v8 -; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v7 -; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s4, s5 -; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1010-SDAG-NEXT: 
v_writelane_b32 v18, s9, s5 -; GFX1010-SDAG-NEXT: v_writelane_b32 v17, s10, s5 -; GFX1010-SDAG-NEXT: v_writelane_b32 v16, s11, s5 -; GFX1010-SDAG-NEXT: v_writelane_b32 v15, s12, s5 -; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s6, s5 -; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s7, s5 -; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s8, s5 -; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[15:18], off -; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[11:14], off offset:16 -; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-SDAG-LABEL: test_writelane_v3p0: -; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: global_load_b128 v[11:14], v[0:1], off offset:16 -; GFX1100-SDAG-NEXT: global_load_b128 v[15:18], v[0:1], off -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v10 -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v5 -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v3 -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s8, v2 -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v9 -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v8 -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v6 -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1 -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: v_writelane_b32 v18, s5, s1 -; GFX1100-SDAG-NEXT: v_writelane_b32 v17, s6, s1 -; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s7, s1 -; GFX1100-SDAG-NEXT: v_writelane_b32 v15, s8, s1 -; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s2, s1 -; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1 -; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s4, s1 -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[15:18], off -; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[11:14], off offset:16 -; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX802-GISEL-LABEL: test_writelane_v3p0: 
-; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX802-GISEL-NEXT: v_add_u32_e32 v19, vcc, 16, v0 -; GFX802-GISEL-NEXT: flat_load_dwordx4 v[11:14], v[0:1] -; GFX802-GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc -; GFX802-GISEL-NEXT: flat_load_dwordx4 v[15:18], v[19:20] -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v10 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s7, v4 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s8, v5 -; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s9, v6 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s10, v7 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s11, v8 -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s12, v9 -; GFX802-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX802-GISEL-NEXT: v_writelane_b32 v11, s4, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v12, s6, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v13, s7, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v14, s8, m0 -; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX802-GISEL-NEXT: v_writelane_b32 v15, s9, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v16, s10, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v17, s11, m0 -; GFX802-GISEL-NEXT: v_writelane_b32 v18, s12, m0 -; GFX802-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[11:14] -; GFX802-GISEL-NEXT: flat_store_dwordx4 v[19:20], v[15:18] -; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1010-GISEL-LABEL: test_writelane_v3p0: -; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: global_load_dwordx4 v[11:14], v[0:1], off -; GFX1010-GISEL-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:16 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v10 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s7, 
v4 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s8, v5 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s9, v6 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s10, v7 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s11, v8 -; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s12, v9 -; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX1010-GISEL-NEXT: v_writelane_b32 v11, s4, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v12, s6, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v13, s7, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v14, s8, s5 -; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-GISEL-NEXT: v_writelane_b32 v15, s9, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v16, s10, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v17, s11, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v18, s12, s5 -; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off -; GFX1010-GISEL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off offset:16 -; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-GISEL-LABEL: test_writelane_v3p0: -; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: global_load_b128 v[11:14], v[0:1], off -; GFX1100-GISEL-NEXT: global_load_b128 v[15:18], v[0:1], off offset:16 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v10 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s3, v4 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s4, v5 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s5, v6 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s6, v7 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s7, v8 -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s8, v9 -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX1100-GISEL-NEXT: v_writelane_b32 v11, s0, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v12, s2, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v13, s3, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v14, s4, s1 -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) -; 
GFX1100-GISEL-NEXT: v_writelane_b32 v15, s5, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v16, s6, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v17, s7, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v18, s8, s1 -; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[11:14], off -; GFX1100-GISEL-NEXT: global_store_b128 v[0:1], v[15:18], off offset:16 -; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] - %oldval = load <4 x ptr>, ptr addrspace(1) %out - %writelane = call <4 x ptr> @llvm.amdgcn.writelane.v3p0(<4 x ptr> %src, i32 %src1, <4 x ptr> %oldval) - store <4 x ptr> %writelane, ptr addrspace(1) %out, align 4 - ret void -} - define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1) { ; GFX802-SDAG-LABEL: test_writelane_v8i16: ; GFX802-SDAG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll index afc394627d356..c45c73dd88150 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll @@ -3,6 +3,150 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s +define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_p0: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_dwordx2 v[5:6], v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_nop 0 +; GFX802-SDAG-NEXT: v_writelane_b32 v6, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v5, s5, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[5:6] +; 
GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_p0: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_dwordx2 v[5:6], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v2 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v6, s4, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v5, s6, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx2 v[0:1], v[5:6], off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_p0: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_b64 v[5:6], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1 +; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] + %oldval = load ptr, ptr addrspace(1) %out + %writelane = call ptr @llvm.amdgcn.writelane.p0(ptr %src, i32 %src1, ptr %oldval) + store ptr %writelane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_writelane_v3p0(ptr addrspace(1) %out, <4 x ptr> %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_v3p0: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: v_add_u32_e32 v19, vcc, 16, v0 +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[11:14], v[0:1] +; GFX802-SDAG-NEXT: v_addc_u32_e32 v20, vcc, 0, v1, vcc +; GFX802-SDAG-NEXT: flat_load_dwordx4 v[15:18], v[19:20] +; GFX802-SDAG-NEXT: 
v_readfirstlane_b32 m0, v10 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s8, v5 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s9, v4 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s10, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s11, v2 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v8 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s6, v7 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s7, v6 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX802-SDAG-NEXT: v_writelane_b32 v14, s8, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v13, s9, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v12, s10, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v11, s11, m0 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v18, s4, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v17, s5, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v16, s6, m0 +; GFX802-SDAG-NEXT: v_writelane_b32 v15, s7, m0 +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GFX802-SDAG-NEXT: flat_store_dwordx4 v[19:20], v[15:18] +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_v3p0: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: s_clause 0x1 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:16 +; GFX1010-SDAG-NEXT: global_load_dwordx4 v[15:18], v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s9, v5 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s10, v4 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s12, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s6, v8 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s7, v7 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s8, v6 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1010-SDAG-NEXT: v_writelane_b32 v14, s4, s5 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: 
v_writelane_b32 v18, s9, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v17, s10, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v16, s11, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v15, s12, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v13, s6, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v12, s7, s5 +; GFX1010-SDAG-NEXT: v_writelane_b32 v11, s8, s5 +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GFX1010-SDAG-NEXT: global_store_dwordx4 v[0:1], v[11:14], off offset:16 +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_v3p0: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: global_load_b128 v[11:14], v[0:1], off offset:16 +; GFX1100-SDAG-NEXT: global_load_b128 v[15:18], v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s5, v5 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s6, v4 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s7, v3 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s8, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s4, v6 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: v_writelane_b32 v18, s5, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v17, s6, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s7, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v15, s8, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s2, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v11, s4, s1 +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[15:18], off +; GFX1100-SDAG-NEXT: global_store_b128 v[0:1], v[11:14], off offset:16 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] + %oldval = load <4 x ptr>, ptr addrspace(1) 
%out + %writelane = call <4 x ptr> @llvm.amdgcn.writelane.v3p0(<4 x ptr> %src, i32 %src1, <4 x ptr> %oldval) + store <4 x ptr> %writelane, ptr addrspace(1) %out, align 4 + ret void +} + define void @test_writelane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) { ; GFX802-SDAG-LABEL: test_writelane_p3: ; GFX802-SDAG: ; %bb.0: From 12155f59f4b08de99b05d8744857c68e8adfce85 Mon Sep 17 00:00:00 2001 From: Vikram Date: Fri, 31 May 2024 09:38:25 +0000 Subject: [PATCH 7/9] add helper to emit N-ary builtins --- clang/lib/CodeGen/CGBuiltin.cpp | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index d9ab0051f0d23..a21aa85821375 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -614,6 +614,17 @@ static Value *emitTernaryBuiltin(CodeGenFunction &CGF, return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 }); } +// Emit an intrinsic that has N operands of the same type as its result. +static Value *emitNaryBuiltin(CodeGenFunction &CGF, const CallExpr *E, + unsigned IntrinsicID) { + SmallVector Args; + for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) + Args.push_back(CGF.EmitScalarExpr(E->getArg(i))); + + Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Args[0]->getType()); + return CGF.Builder.CreateCall(F, Args); +} + // Emit an intrinsic that has 1 float or double operand, and 1 integer.
static Value *emitFPIntBuiltin(CodeGenFunction &CGF, const CallExpr *E, @@ -18480,21 +18491,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F, Args); } case AMDGPU::BI__builtin_amdgcn_permlane16: - case AMDGPU::BI__builtin_amdgcn_permlanex16: { - llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); - llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); - llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); - llvm::Value *Src3 = EmitScalarExpr(E->getArg(3)); - llvm::Value *Src4 = EmitScalarExpr(E->getArg(4)); - llvm::Value *Src5 = EmitScalarExpr(E->getArg(5)); - - Intrinsic::ID IID = BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16 - ? Intrinsic::amdgcn_permlane16 - : Intrinsic::amdgcn_permlanex16; - - llvm::Function *F = CGM.getIntrinsic(IID, Src0->getType()); - return Builder.CreateCall(F, {Src0, Src1, Src2, Src3, Src4, Src5}); - } + case AMDGPU::BI__builtin_amdgcn_permlanex16: + return emitNaryBuiltin(*this, E, + BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16 + ? 
Intrinsic::amdgcn_permlane16 + : Intrinsic::amdgcn_permlanex16); case AMDGPU::BI__builtin_amdgcn_permlane64: return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_permlane64); case AMDGPU::BI__builtin_amdgcn_readlane: From 40381ca63182f7986a1bf723128cfdfa6b387557 Mon Sep 17 00:00:00 2001 From: Vikram Date: Mon, 17 Jun 2024 12:09:20 +0000 Subject: [PATCH 8/9] update with latest changes from #89217 --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 6 - llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 7 -- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 54 -------- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 115 +++++++++++++----- llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 4 +- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 4 +- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 8 +- .../AMDGPU/llvm.amdgcn.readfirstlane.ll | 5 +- .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 7 +- 10 files changed, 96 insertions(+), 116 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 47347ada935be..18193d8807597 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5508,12 +5508,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(LDS) NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD) NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD) - NODE_NAME_CASE(READLANE) - NODE_NAME_CASE(READFIRSTLANE) - NODE_NAME_CASE(WRITELANE) - NODE_NAME_CASE(PERMLANE16) - NODE_NAME_CASE(PERMLANEX16) - NODE_NAME_CASE(PERMLANE64) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; NODE_NAME_CASE(LOAD_D16_HI) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 293fa6259fef7..71c4334029b43 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -558,13 +558,6 @@ enum NodeType : unsigned { 
FPTRUNC_ROUND_UPWARD, FPTRUNC_ROUND_DOWNWARD, - READLANE, - READFIRSTLANE, - WRITELANE, - PERMLANE16, - PERMLANEX16, - PERMLANE64, - DUMMY_CHAIN, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, LOAD_D16_HI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index a82f48950f493..702f6e67c5527 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -342,34 +342,6 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2", def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; -def AMDGPUReadfirstlaneOp : SDTypeProfile<1, 1, [ - SDTCisSameAs<0, 1> -]>; - -def AMDGPUReadlaneOp : SDTypeProfile<1, 2, [ - SDTCisSameAs<0, 1>, SDTCisInt<2> -]>; - -def AMDGPUDWritelaneOp : SDTypeProfile<1, 3, [ - SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3> -]>; - -def AMDGPUDPermlane16Op : SDTypeProfile<1, 6, [ - SDTCisSameAs<0, 1>, // old - SDTCisSameAs<0, 2>, // src0 - SDTCisInt<3>, // src1 - SDTCisInt<4>, // src2 - SDTCisInt<5>, // i1 fi - SDTCisInt<6> // i1 bound_ctrl -]>; - -def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp>; -def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", AMDGPUReadfirstlaneOp>; -def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", AMDGPUDWritelaneOp>; -def AMDGPUpermlane16_impl : SDNode<"AMDGPUISD::PERMLANE16", AMDGPUDPermlane16Op>; -def AMDGPUpermlanex16_impl : SDNode<"AMDGPUISD::PERMLANEX16", AMDGPUDPermlane16Op>; -def AMDGPUpermlane64_impl : SDNode<"AMDGPUISD::PERMLANE64", AMDGPUReadfirstlaneOp>; - // SI+ export def AMDGPUExportOp : SDTypeProfile<0, 8, [ SDTCisInt<0>, // i8 tgt @@ -534,29 +506,3 @@ def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2), [(int_amdgcn_perm node:$src0, node:$src1, node:$src2), (AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>; - -def AMDGPUreadlane : PatFrags<(ops 
node:$src0, node:$src1), - [(int_amdgcn_readlane node:$src0, node:$src1), - (AMDGPUreadlane_impl node:$src0, node:$src1)]>; - -def AMDGPUreadfirstlane : PatFrags<(ops node:$src), - [(int_amdgcn_readfirstlane node:$src), - (AMDGPUreadfirstlane_impl node:$src)]>; - -def AMDGPUwritelane : PatFrags<(ops node:$src0, node:$src1, node:$src2), - [(int_amdgcn_writelane node:$src0, node:$src1, node:$src2), - (AMDGPUwritelane_impl node:$src0, node:$src1, node:$src2)]>; - -def AMDGPUpermlane16 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5), - [(int_amdgcn_permlane16 node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5), - (AMDGPUpermlane16_impl node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5)]>; - -def AMDGPUpermlanex16 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5), - [(int_amdgcn_permlanex16 node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5), - (AMDGPUpermlanex16_impl node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5)]>; - -def AMDGPUpermlane64 : PatFrags<(ops node:$src), - [(int_amdgcn_permlane64 node:$src), - (AMDGPUpermlane64_impl node:$src)]>; - - diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d91ab31832070..f55025ec5e08d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6095,46 +6095,55 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); unsigned ValSize = VT.getSizeInBits(); - unsigned IntrinsicID = N->getConstantOperandVal(0); - bool IsPermLane16 = IntrinsicID == Intrinsic::amdgcn_permlane16 || - IntrinsicID == Intrinsic::amdgcn_permlanex16; - SDValue Src0 = N->getOperand(1); + unsigned IID = N->getConstantOperandVal(0); + bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || + IID == Intrinsic::amdgcn_permlanex16; SDLoc 
SL(N); MVT IntVT = MVT::getIntegerVT(ValSize); - auto createLaneOp = [&DAG, &SL, N](SDValue Src0, SDValue Src1, SDValue Src2, - MVT ValueT) -> SDValue { - switch (unsigned IID = N->getConstantOperandVal(0)) { + auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, SDValue Src2, + MVT ValT) -> SDValue { + SmallVector<SDValue> Operands; + switch (IID) { + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: + Operands.push_back(N->getOperand(6)); + Operands.push_back(N->getOperand(5)); + Operands.push_back(N->getOperand(4)); + [[fallthrough]]; + case Intrinsic::amdgcn_writelane: + Operands.push_back(Src2); + [[fallthrough]]; + case Intrinsic::amdgcn_readlane: + Operands.push_back(Src1); + [[fallthrough]]; case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_permlane64: - return DAG.getNode(IID == Intrinsic::amdgcn_readfirstlane - ? AMDGPUISD::READFIRSTLANE - : AMDGPUISD::PERMLANE64, - SL, ValueT, {Src0}); - case Intrinsic::amdgcn_readlane: - return DAG.getNode(AMDGPUISD::READLANE, SL, ValueT, {Src0, Src1}); - case Intrinsic::amdgcn_writelane: - return DAG.getNode(AMDGPUISD::WRITELANE, SL, ValueT, {Src0, Src1, Src2}); - case Intrinsic::amdgcn_permlane16: - case Intrinsic::amdgcn_permlanex16: { - SDValue Src3 = N->getOperand(4); - SDValue Src4 = N->getOperand(5); - SDValue Src5 = N->getOperand(6); - return DAG.getNode(IID == Intrinsic::amdgcn_permlane16 - ? 
AMDGPUISD::PERMLANE16 - : AMDGPUISD::PERMLANEX16, - SL, ValueT, {Src0, Src1, Src2, Src3, Src4, Src5}); - } + Operands.push_back(Src0); + break; default: llvm_unreachable("unhandled lane op"); } + + Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32)); + std::reverse(Operands.begin(), Operands.end()); + + if (SDNode *GL = N->getGluedNode()) { + assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); + GL = GL->getOperand(0).getNode(); + Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue, + SDValue(GL, 0))); + } + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands); }; + SDValue Src0 = N->getOperand(1); SDValue Src1, Src2; - if (IntrinsicID == Intrinsic::amdgcn_readlane || - IntrinsicID == Intrinsic::amdgcn_writelane || IsPermLane16) { + if (IID == Intrinsic::amdgcn_readlane || + IID == Intrinsic::amdgcn_writelane || IsPermLane16) { Src1 = N->getOperand(2); - if (IntrinsicID == Intrinsic::amdgcn_writelane || IsPermLane16) + if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) Src2 = N->getOperand(3); } @@ -6153,7 +6162,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SL, MVT::i32); } - if (IntrinsicID == Intrinsic::amdgcn_writelane) { + if (IID == Intrinsic::amdgcn_writelane) { Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2, SL, MVT::i32); } @@ -6165,6 +6174,46 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, if (ValSize % 32 != 0) return SDValue(); + + auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue { + EVT VT = N->getValueType(0); + unsigned NE = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + SmallVector<SDValue> Scalars; + unsigned NumOperands = N->getNumOperands(); + SmallVector<SDValue> Operands(NumOperands); + SDNode *GL = N->getGluedNode(); + + // only handle convergencectrl_glue + assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); + + for (unsigned i = 0; i != NE; ++i) { + for (unsigned j = 0, e = GL ? 
NumOperands - 1 : NumOperands; j != e; + ++j) { + SDValue Operand = N->getOperand(j); + EVT OperandVT = Operand.getValueType(); + if (OperandVT.isVector()) { + // A vector operand; extract a single element. + EVT OperandEltVT = OperandVT.getVectorElementType(); + Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT, + Operand, DAG.getVectorIdxConstant(i, SL)); + } else { + // A scalar operand; just use it as is. + Operands[j] = Operand; + } + } + + if (GL) + Operands[NumOperands - 1] = + DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue, + SDValue(GL->getOperand(0).getNode(), 0)); + + Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands)); + } + + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE); + return DAG.getBuildVector(VecVT, SL, Scalars); + }; if (VT.isVector()) { switch (MVT::SimpleValueType EltTy = @@ -6172,7 +6221,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, case MVT::i32: case MVT::f32: { SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT()); - return DAG.UnrollVectorOp(LaneOp.getNode()); + return unrollLaneOp(LaneOp.getNode()); } case MVT::i16: case MVT::f16: @@ -6188,7 +6237,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1, DAG.getConstant(EltIdx, SL, MVT::i32)); - if (IntrinsicID == Intrinsic::amdgcn_writelane) + if (IID == Intrinsic::amdgcn_writelane) Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2, DAG.getConstant(EltIdx, SL, MVT::i32)); @@ -6212,11 +6261,11 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, if (IsPermLane16) Src1 = DAG.getBitcast(VecVT, Src1); - if (IntrinsicID == Intrinsic::amdgcn_writelane) + if (IID == Intrinsic::amdgcn_writelane) Src2 = DAG.getBitcast(VecVT, Src2); SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT); - SDValue UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode()); + SDValue UnrolledLaneOp = 
unrollLaneOp(LaneOp.getNode()); return DAG.getBitcast(VT, UnrolledLaneOp); } diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 5941b264fdb21..a5e59fde107ea 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3389,7 +3389,7 @@ def : GCNPat< // FIXME: Should also do this for readlane, but tablegen crashes on // the ignored src1. def : GCNPat< - (i32 (AMDGPUreadfirstlane (i32 imm:$src))), + (i32 (int_amdgcn_readfirstlane (i32 imm:$src))), (S_MOV_B32 SReg_32:$src) >; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 71c83e072b4b0..2c0d61ee4afa1 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -255,7 +255,7 @@ def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE, } foreach vt = Reg32Types.types in { - def : GCNPat<(vt (AMDGPUreadfirstlane (vt VRegOrLdsSrc_32:$src0))), + def : GCNPat<(vt (int_amdgcn_readfirstlane (vt VRegOrLdsSrc_32:$src0))), (V_READFIRSTLANE_B32 (vt VRegOrLdsSrc_32:$src0)) >; } @@ -743,7 +743,7 @@ let SubtargetPredicate = isGFX11Plus in { } // End SubtargetPredicate = isGFX11Plus foreach vt = Reg32Types.types in { - def : GCNPat<(AMDGPUpermlane64 (vt VRegSrc_32:$src0)), + def : GCNPat<(int_amdgcn_permlane64 (vt VRegSrc_32:$src0)), (vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0))) >; } diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 40106314cb0e6..ae4030e509e9c 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -789,11 +789,11 @@ def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []> { } // End isConvergent = 1 foreach vt = Reg32Types.types in { - def : GCNPat<(vt (AMDGPUreadlane vt:$src0, i32:$src1)), + def : GCNPat<(vt (int_amdgcn_readlane vt:$src0, i32:$src1)), (V_READLANE_B32 
VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1) >; - def : GCNPat<(vt (AMDGPUwritelane vt:$src0, i32:$src1, vt:$src2)), + def : GCNPat<(vt (int_amdgcn_writelane vt:$src0, i32:$src1, vt:$src2)), (V_WRITELANE_B32 SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$src2) >; } diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index ff098f8fe50f3..6ba54cca44958 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -841,8 +841,8 @@ def gi_opsel_i1timm : GICustomOperandRenderer<"renderOpSelTImm">, class PermlanePat : GCNPat< - (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2, - timm:$fi, timm:$bc), + (vt (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2, + timm:$fi, timm:$bc)), (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc), SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in) >; @@ -867,8 +867,8 @@ let SubtargetPredicate = isGFX10Plus in { } // End $vdst = $vdst_in, DisableEncoding $vdst_in, IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1 foreach vt = Reg32Types.types in { - def : PermlanePat; - def : PermlanePat; + def : PermlanePat; + def : PermlanePat; } defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile, add>; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index ed0da0d2a61a2..cc6c630ae6466 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -463,10 +463,9 @@ define void @test_readfirstlane_i16(ptr addrspace(1) %out, i16 %src) { ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0xffff -; CHECK-SDAG-NEXT: v_and_b32_e32 v0, s4, v0 +; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 0xffff ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use v0 +; CHECK-SDAG-NEXT: ; use s4 ; 
CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 325a39abb588a..66e1f9396de5a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -743,12 +743,11 @@ define void @test_readlane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) { ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0xffff -; CHECK-SDAG-NEXT: s_nop 2 +; CHECK-SDAG-NEXT: s_nop 3 ; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 -; CHECK-SDAG-NEXT: v_and_b32_e32 v0, s4, v0 +; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 0xffff ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use v0 +; CHECK-SDAG-NEXT: ; use s4 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; From 4e4cdd961e3daea74e2ae5689fea73c0b6685f57 Mon Sep 17 00:00:00 2001 From: Vikram Date: Mon, 17 Jun 2024 08:16:39 -0400 Subject: [PATCH 9/9] clang format --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f55025ec5e08d..1101de206a8fe 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6101,8 +6101,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SDLoc SL(N); MVT IntVT = MVT::getIntegerVT(ValSize); - auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, SDValue Src2, - MVT ValT) -> SDValue { + auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, + SDValue Src2, MVT ValT) -> SDValue { SmallVector<SDValue> Operands; switch (IID) { case Intrinsic::amdgcn_permlane16: @@ -6140,8 +6140,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SDValue Src0 = 
N->getOperand(1); SDValue Src1, Src2; - if (IID == Intrinsic::amdgcn_readlane || - IID == Intrinsic::amdgcn_writelane || IsPermLane16) { + if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || + IsPermLane16) { Src1 = N->getOperand(2); if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) Src2 = N->getOperand(3); @@ -6174,7 +6174,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, if (ValSize % 32 != 0) return SDValue(); - + auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue { EVT VT = N->getValueType(0); unsigned NE = VT.getVectorNumElements();