From 780b9703ec9dbabfe821a0846397c6615bb6dec6 Mon Sep 17 00:00:00 2001 From: Stefan Stipanovic Date: Mon, 12 Feb 2024 13:33:25 +0100 Subject: [PATCH] [AMDGPU][GlobalIsel] Introduce isRegisterClassType to check for legal types, instead of checking bit width. --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 135 +++++++++++------- .../AMDGPU/GlobalISel/bitcast_38_i16.ll | 85 +++++++++++ .../AMDGPU/GlobalISel/extractelement.ll | 126 ++++++++++++++++ .../GlobalISel/legalize-build-vector.mir | 123 ---------------- 4 files changed, 296 insertions(+), 173 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 17ffb7ec988f0..df58f3794cb06 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -239,6 +239,7 @@ static bool isRegisterVectorType(LLT Ty) { EltSize == 128 || EltSize == 256; } +// TODO: replace all uses of isRegisterType with isRegisterClassType static bool isRegisterType(LLT Ty) { if (!isRegisterSize(Ty.getSizeInBits())) return false; @@ -258,6 +259,8 @@ static LegalityPredicate isRegisterType(unsigned TypeIdx) { } // RegisterType that doesn't have a corresponding RegClass. +// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this +// should be removed. static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { LLT Ty = Query.Types[TypeIdx]; @@ -276,6 +279,85 @@ static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { }; } +static const LLT S1 = LLT::scalar(1); +static const LLT S8 = LLT::scalar(8); +static const LLT S16 = LLT::scalar(16); +static const LLT S32 = LLT::scalar(32); +static const LLT S64 = LLT::scalar(64); +static const LLT S96 = LLT::scalar(96); +static const LLT S128 = LLT::scalar(128); +static const LLT S160 = LLT::scalar(160); +static const LLT S224 = LLT::scalar(224); +static const LLT S256 = LLT::scalar(256); +static const LLT S512 = LLT::scalar(512); +static const LLT MaxScalar = LLT::scalar(MaxRegisterSize); + +static const LLT V2S8 = LLT::fixed_vector(2, 8); +static const LLT V2S16 = LLT::fixed_vector(2, 16); +static const LLT V4S16 = LLT::fixed_vector(4, 16); +static const LLT V6S16 = LLT::fixed_vector(6, 16); +static const LLT V8S16 = LLT::fixed_vector(8, 16); +static const LLT V10S16 = LLT::fixed_vector(10, 16); +static const LLT V12S16 = LLT::fixed_vector(12, 16); +static const LLT V16S16 = LLT::fixed_vector(16, 16); + +static const LLT V2S32 = LLT::fixed_vector(2, 32); +static const LLT V3S32 = LLT::fixed_vector(3, 32); +static const LLT V4S32 = LLT::fixed_vector(4, 32); +static const LLT V5S32 = LLT::fixed_vector(5, 32); +static const LLT V6S32 = LLT::fixed_vector(6, 32); +static const LLT V7S32 = LLT::fixed_vector(7, 32); +static const LLT V8S32 = LLT::fixed_vector(8, 32); +static const LLT V9S32 = LLT::fixed_vector(9, 32); +static const LLT V10S32 = LLT::fixed_vector(10, 32); +static const LLT V11S32 = LLT::fixed_vector(11, 32); +static const LLT V12S32 = LLT::fixed_vector(12, 32); +static const LLT V16S32 = LLT::fixed_vector(16, 32); +static const LLT V32S32 = LLT::fixed_vector(32, 32); + +static const LLT V2S64 = LLT::fixed_vector(2, 64); +static const LLT V3S64 = LLT::fixed_vector(3, 64); +static const LLT V4S64 = LLT::fixed_vector(4, 64); +static const LLT V5S64 = LLT::fixed_vector(5, 64); +static const LLT V6S64 = LLT::fixed_vector(6, 64); +static const LLT V7S64 = 
LLT::fixed_vector(7, 64); +static const LLT V8S64 = LLT::fixed_vector(8, 64); +static const LLT V16S64 = LLT::fixed_vector(16, 64); + +static const LLT V2S128 = LLT::fixed_vector(2, 128); +static const LLT V4S128 = LLT::fixed_vector(4, 128); + +static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128, + S160, S224, S256, S512}; + +static std::initializer_list<LLT> AllS16Vectors{ + V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128}; + +static std::initializer_list<LLT> AllS32Vectors = { + V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, + V9S32, V10S32, V11S32, V12S32, V16S32, V32S32}; + +static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64, + V6S64, V7S64, V8S64, V16S64}; + +// Checks whether a type is in the list of legal register types. +static bool isRegisterClassType(LLT Ty) { + if (Ty.isVector() && Ty.getElementType().isPointer()) + Ty = LLT::fixed_vector(Ty.getNumElements(), + LLT::scalar(Ty.getScalarSizeInBits())); + else if (Ty.isPointer()) + Ty = LLT::scalar(Ty.getScalarSizeInBits()); + + return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) || + is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty); +} + +static LegalityPredicate isRegisterClassType(unsigned TypeIdx) { + return [TypeIdx](const LegalityQuery &Query) { + return isRegisterClassType(Query.Types[TypeIdx]); + }; +} + // If we have a truncating store or an extending load with a data size larger // than 32-bits, we need to reduce to a 32-bit type. static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { @@ -578,52 +660,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); }; - const LLT S1 = LLT::scalar(1); - const LLT S8 = LLT::scalar(8); - const LLT S16 = LLT::scalar(16); - const LLT S32 = LLT::scalar(32); - const LLT S64 = LLT::scalar(64); - const LLT S128 = LLT::scalar(128); - const LLT S256 = LLT::scalar(256); - const LLT S512 = LLT::scalar(512); - const LLT MaxScalar = LLT::scalar(MaxRegisterSize); - - const LLT V2S8 = LLT::fixed_vector(2, 8); - const LLT V2S16 = LLT::fixed_vector(2, 16); - const LLT V4S16 = LLT::fixed_vector(4, 16); - - const LLT V2S32 = LLT::fixed_vector(2, 32); - const LLT V3S32 = LLT::fixed_vector(3, 32); - const LLT V4S32 = LLT::fixed_vector(4, 32); - const LLT V5S32 = LLT::fixed_vector(5, 32); - const LLT V6S32 = LLT::fixed_vector(6, 32); - const LLT V7S32 = LLT::fixed_vector(7, 32); - const LLT V8S32 = LLT::fixed_vector(8, 32); - const LLT V9S32 = LLT::fixed_vector(9, 32); - const LLT V10S32 = LLT::fixed_vector(10, 32); - const LLT V11S32 = LLT::fixed_vector(11, 32); - const LLT V12S32 = LLT::fixed_vector(12, 32); - const LLT V13S32 = LLT::fixed_vector(13, 32); - const LLT V14S32 = LLT::fixed_vector(14, 32); - const LLT V15S32 = LLT::fixed_vector(15, 32); - const LLT V16S32 = LLT::fixed_vector(16, 32); - const LLT V32S32 = LLT::fixed_vector(32, 32); - - const LLT V2S64 = LLT::fixed_vector(2, 64); - const LLT V3S64 = LLT::fixed_vector(3, 64); - const LLT V4S64 = LLT::fixed_vector(4, 64); - const LLT V5S64 = LLT::fixed_vector(5, 64); - const LLT V6S64 = LLT::fixed_vector(6, 64); - const LLT V7S64 = LLT::fixed_vector(7, 64); - const LLT V8S64 = LLT::fixed_vector(8, 64); - const LLT V16S64 = LLT::fixed_vector(16, 64); - - std::initializer_list<LLT> AllS32Vectors = - {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, - V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; - std::initializer_list<LLT> AllS64Vectors = - {V2S64, V3S64, 
V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; - const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); @@ -836,10 +872,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); getActionDefinitionsBuilder(G_BITCAST) - // Don't worry about the size constraint. - .legalIf(all(isRegisterType(0), isRegisterType(1))) - .lower(); - + // Don't worry about the size constraint. + .legalIf(all(isRegisterClassType(0), isRegisterClassType(1))) + .lower(); getActionDefinitionsBuilder(G_CONSTANT) .legalFor({S1, S32, S64, S16, GlobalPtr, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll new file mode 100644 index 0000000000000..5bea13af1649a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GPRIDX %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +define void @main(<19 x i32> %arg) { +; GCN-LABEL: main: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: s_mov_b32 s12, s4 +; GCN-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 s13, s4 +; GCN-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_mov_b32 s6, s4 +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s4 +; GCN-NEXT: s_mov_b32 s10, s4 +; GCN-NEXT: s_mov_b32 s11, s4 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NEXT: image_store v[0:3], v[4:5], s[4:11] unorm +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: main: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_mov_b32 s10, s4 +; GFX10-NEXT: s_mov_b32 s11, s4 +; GFX10-NEXT: v_mov_b32_e32 v4, s10 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, s11 +; GFX10-NEXT: s_mov_b32 s5, s4 +; GFX10-NEXT: s_mov_b32 s6, s4 +; GFX10-NEXT: s_mov_b32 s7, s4 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_mov_b32 s9, s4 +; GFX10-NEXT: image_store v[0:3], v[4:5], s[4:11] dim:SQ_RSRC_IMG_2D unorm +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: main: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s6 +; GFX11-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v5, s7 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_setpc_b64 s[30:31] +bb: + %i = bitcast <19 x i32> %arg to <38 x i16> + %i1 = extractelement <38 x i16> %i, i64 0 + %i2 = icmp eq i16 %i1, 0 + %i3 = zext i1 %i2 to i32 + %i4 = bitcast i32 %i3 to float + %i5 = insertelement <4 x float> zeroinitializer, float %i4, i64 0 + call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %i5, i32 0, i32 0, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0) + ret void +} +declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10PLUS: {{.*}} +; GPRIDX: {{.*}} +; MOVREL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index ac153183be642..1e1c90d142a1f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -2626,6 +2626,132 @@ entry: ret double %ext } +define amdgpu_ps double @dyn_extract_v7f64_s_v_bitcast(<14 x float> inreg %userData, i32 %sel) { +; GCN-LABEL: dyn_extract_v7f64_s_v_bitcast: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mov_b32_e32 v4, s5 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: v_mov_b32_e32 v6, s7 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v7, s8 +; GCN-NEXT: v_mov_b32_e32 v8, s9 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v9, s10 +; GCN-NEXT: v_mov_b32_e32 v10, s11 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v11, s12 +; GCN-NEXT: v_mov_b32_e32 v12, s13 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_mov_b32_e32 v13, s14 +; GCN-NEXT: v_mov_b32_e32 v14, s15 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 +; GCN-NEXT: ; kill: def $vgpr15 killed $sgpr2 killed $exec +; GCN-NEXT: ; kill: def $vgpr16 killed $sgpr3 killed $exec +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v7f64_s_v_bitcast: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: s_mov_b32 s0, s14 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo 
+; GFX10-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s15, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_extract_v7f64_s_v_bitcast: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: s_mov_b32 s0, s14 +; GFX11-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s15, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: ; return to shader part epilog +entry: + %bc = bitcast <14 x float> %userData to <7 x double> + %ext = extractelement <7 x double> %bc, i32 %sel + ret double %ext +} + +define amdgpu_ps i64 @dyn_extract_v7i64_s_v_bitcast(<14 x i32> inreg %userData, i32 %sel) { +; GCN-LABEL: dyn_extract_v7i64_s_v_bitcast: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s0, s10 +; GCN-NEXT: s_mov_b32 s1, s11 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: dyn_extract_v7i64_s_v_bitcast: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s0, s10 +; GFX10PLUS-NEXT: s_mov_b32 s1, s11 +; GFX10PLUS-NEXT: ; return to shader part epilog +entry: + %.bc = bitcast <14 x i32> %userData to <7 x i64> + %ext = extractelement <7 x i64> %.bc, i32 4 + ret i64 %ext +} + define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel) { ; GCN-LABEL: dyn_extract_v7f64_s_v: ; GCN: ; %bb.0: ; %entry diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir index 10766b0f79d81..25652b69afa92 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir @@ -299,129 +299,6 @@ body: | S_NOP 0, implicit %12 ... --- -name: legal_v13s32 -body: | - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12 - ; CHECK-LABEL: name: legal_v13s32 - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<13 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32) - ; CHECK-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<13 x s32>) - %0:_(s32) = COPY $vgpr0 - %1:_(s32) = COPY $vgpr1 - %2:_(s32) = COPY $vgpr2 - %3:_(s32) = COPY $vgpr3 - %4:_(s32) = COPY $vgpr4 - %5:_(s32) = COPY $vgpr5 - %6:_(s32) = COPY $vgpr6 - %7:_(s32) = COPY $vgpr7 - %8:_(s32) = COPY $vgpr8 - %9:_(s32) = COPY $vgpr9 - %10:_(s32) = COPY $vgpr10 - %11:_(s32) = COPY $vgpr11 - %12:_(s32) = COPY $vgpr12 - %13:_(<13 x s32>) = G_BUILD_VECTOR %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12 - S_NOP 0, implicit %13 -... 
---- -name: legal_v14s32 -body: | - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13 - ; CHECK-LABEL: name: legal_v14s32 - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<14 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32) - ; CHECK-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<14 x s32>) - %0:_(s32) = COPY $vgpr0 - %1:_(s32) = COPY $vgpr1 - %2:_(s32) = COPY $vgpr2 - %3:_(s32) = COPY $vgpr3 - %4:_(s32) = COPY $vgpr4 - %5:_(s32) = COPY $vgpr5 - %6:_(s32) = COPY $vgpr6 - %7:_(s32) = COPY $vgpr7 - %8:_(s32) = COPY $vgpr8 - %9:_(s32) = COPY $vgpr9 - %10:_(s32) = COPY $vgpr10 - %11:_(s32) = COPY $vgpr11 - %12:_(s32) = COPY $vgpr12 - %13:_(s32) = COPY $vgpr13 - %14:_(<14 x s32>) = G_BUILD_VECTOR %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13 - S_NOP 0, implicit %14 -... 
---- -name: legal_v15s32 -body: | - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14 - ; CHECK-LABEL: name: legal_v15s32 - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<15 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32) - ; CHECK-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<15 x s32>) - %0:_(s32) = COPY $vgpr0 - %1:_(s32) = COPY $vgpr1 - %2:_(s32) = COPY $vgpr2 - %3:_(s32) = COPY $vgpr3 - %4:_(s32) = COPY $vgpr4 - %5:_(s32) = COPY $vgpr5 - %6:_(s32) = COPY $vgpr6 - %7:_(s32) = COPY $vgpr7 - %8:_(s32) = COPY $vgpr8 - %9:_(s32) = COPY $vgpr9 - %10:_(s32) = COPY $vgpr10 - %11:_(s32) = COPY $vgpr11 - %12:_(s32) = COPY $vgpr12 - %13:_(s32) = COPY $vgpr13 - %14:_(s32) = COPY $vgpr14 - %15:_(<15 x s32>) = G_BUILD_VECTOR %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14 - S_NOP 0, implicit %15 -... ---- name: legal_v16s32 body: | bb.0: