
Commit 0f19020

Stefan Stipanovic authored and committed
[AMDGPU][GlobalIsel] Introduce isRegType to check for legal types, instead of checking bit width.
Make v13s32, v14s32, v15s32 and v7s64 illegal for bitcast first.
1 parent 77c43e1 commit 0f19020
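To make the intent of the change concrete, here is a rough, self-contained C++ sketch (not the actual LLVM code; the helper names and the 1024-bit register budget are assumptions made for illustration) contrasting a pure bit-width check, which lets any 32-bit-multiple vector such as v13s32 through, with a type-set membership check in the spirit of the new isRegType:

```cpp
// Hedged sketch only: legalByBitWidth and legalByTypeSet are hypothetical
// names, not the functions in AMDGPULegalizerInfo.cpp.
#include <algorithm>
#include <initializer_list>

struct SimpleTy {
  unsigned NumElts;
  unsigned EltBits;
};

// Old-style check: any type whose total size is a multiple of 32 bits and
// fits in an assumed register budget passes, so v13s32/v14s32/v15s32 slip
// through even though no register class matches them.
static bool legalByBitWidth(SimpleTy Ty, unsigned MaxRegisterSize = 1024) {
  unsigned Bits = Ty.NumElts * Ty.EltBits;
  return Bits % 32 == 0 && Bits <= MaxRegisterSize;
}

// New-style check: only explicitly enumerated types are legal, mirroring how
// isRegType tests membership in AllS32Vectors and the other sets below.
static bool legalByTypeSet(SimpleTy Ty) {
  std::initializer_list<unsigned> AllS32VectorCounts = {2, 3, 4,  5,  6,  7, 8,
                                                        9, 10, 11, 12, 16, 32};
  return Ty.EltBits == 32 &&
         std::find(AllS32VectorCounts.begin(), AllS32VectorCounts.end(),
                   Ty.NumElts) != AllS32VectorCounts.end();
}

int main() {
  SimpleTy V13S32{13, 32};
  // legalByBitWidth(V13S32) is true, legalByTypeSet(V13S32) is false:
  // the commit makes v13s32 (and v14s32, v15s32, v7s64) illegal for bitcast.
  return legalByBitWidth(V13S32) && !legalByTypeSet(V13S32) ? 0 : 1;
}
```

Under a scheme like this, v13s32, v14s32, v15s32, and v7s64 fall out of the legal set, so G_BITCAST involving them must be lowered instead of kept legal, which is what the new extractelement.ll tests below exercise.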

3 files changed: +273 −186 lines

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 111 additions & 63 deletions
@@ -272,6 +272,104 @@ static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
   };
 }
 
+static const LLT S1 = LLT::scalar(1);
+static const LLT S8 = LLT::scalar(8);
+static const LLT S16 = LLT::scalar(16);
+static const LLT S32 = LLT::scalar(32);
+static const LLT S64 = LLT::scalar(64);
+static const LLT S96 = LLT::scalar(96);
+static const LLT S128 = LLT::scalar(128);
+static const LLT S160 = LLT::scalar(160);
+static const LLT S224 = LLT::scalar(224);
+static const LLT S256 = LLT::scalar(256);
+static const LLT S512 = LLT::scalar(512);
+static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
+
+static const LLT V2S8 = LLT::fixed_vector(2, 8);
+static const LLT V2S16 = LLT::fixed_vector(2, 16);
+static const LLT V4S16 = LLT::fixed_vector(4, 16);
+static const LLT V6S16 = LLT::fixed_vector(6, 16);
+static const LLT V8S16 = LLT::fixed_vector(8, 16);
+static const LLT V10S16 = LLT::fixed_vector(10, 16);
+static const LLT V12S16 = LLT::fixed_vector(12, 16);
+static const LLT V16S16 = LLT::fixed_vector(16, 16);
+
+static const LLT V2S32 = LLT::fixed_vector(2, 32);
+static const LLT V3S32 = LLT::fixed_vector(3, 32);
+static const LLT V4S32 = LLT::fixed_vector(4, 32);
+static const LLT V5S32 = LLT::fixed_vector(5, 32);
+static const LLT V6S32 = LLT::fixed_vector(6, 32);
+static const LLT V7S32 = LLT::fixed_vector(7, 32);
+static const LLT V8S32 = LLT::fixed_vector(8, 32);
+static const LLT V9S32 = LLT::fixed_vector(9, 32);
+static const LLT V10S32 = LLT::fixed_vector(10, 32);
+static const LLT V11S32 = LLT::fixed_vector(11, 32);
+static const LLT V12S32 = LLT::fixed_vector(12, 32);
+static const LLT V16S32 = LLT::fixed_vector(16, 32);
+static const LLT V32S32 = LLT::fixed_vector(32, 32);
+
+static const LLT V2S64 = LLT::fixed_vector(2, 64);
+static const LLT V3S64 = LLT::fixed_vector(3, 64);
+static const LLT V4S64 = LLT::fixed_vector(4, 64);
+static const LLT V5S64 = LLT::fixed_vector(5, 64);
+static const LLT V6S64 = LLT::fixed_vector(6, 64);
+static const LLT V7S64 = LLT::fixed_vector(7, 64);
+static const LLT V8S64 = LLT::fixed_vector(8, 64);
+static const LLT V16S64 = LLT::fixed_vector(16, 64);
+
+static const LLT V2S128 = LLT::fixed_vector(2, 128);
+static const LLT V4S128 = LLT::fixed_vector(4, 128);
+
+static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
+                                                    S160, S224, S256, S512};
+
+static std::initializer_list<LLT> AllS16Vectors{
+    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
+
+static std::initializer_list<LLT> AllS32Vectors = {
+    V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
+    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
+
+static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
+                                                   V6S64, V7S64, V8S64, V16S64};
+
+static bool typeInSet(LLT Ty, std::initializer_list<LLT> TypesInit) {
+  SmallVector<LLT, 4> Types = TypesInit;
+  return llvm::is_contained(Types, Ty);
+}
+
+static LLT GetAddrSpacePtr(unsigned AS, const GCNTargetMachine &TM) {
+  return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
+}
+
+static bool isRegType(LLT Ty, const GCNTargetMachine &TM) {
+  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS, TM);
+  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS, TM);
+  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS, TM);
+
+  // TODO: list all possible ptr vectors
+  const LLT V2FlatPtr = LLT::fixed_vector(2, FlatPtr);
+  const LLT V3LocalPtr = LLT::fixed_vector(3, LocalPtr);
+  const LLT V5LocalPtr = LLT::fixed_vector(5, LocalPtr);
+  const LLT V16LocalPtr = LLT::fixed_vector(16, LocalPtr);
+  const LLT V2GlobalPtr = LLT::fixed_vector(2, GlobalPtr);
+  const LLT V4GlobalPtr = LLT::fixed_vector(4, GlobalPtr);
+
+  std::initializer_list<LLT> AllPtrTypes{V2FlatPtr, V3LocalPtr, V5LocalPtr,
+                                         V16LocalPtr, V2GlobalPtr, V4GlobalPtr};
+
+  return typeInSet(Ty, AllS32Vectors) || typeInSet(Ty, AllS64Vectors) ||
+         typeInSet(Ty, AllScalarTypes) || typeInSet(Ty, AllS16Vectors) ||
+         typeInSet(Ty, AllPtrTypes) || Ty.isPointer();
+}
+
+static LegalityPredicate isRegType(unsigned TypeIdx,
+                                   const GCNTargetMachine &TM) {
+  return [TypeIdx, &TM](const LegalityQuery &Query) {
+    return isRegType(Query.Types[TypeIdx], TM);
+  };
+}
+
 // If we have a truncating store or an extending load with a data size larger
 // than 32-bits, we need to reduce to a 32-bit type.
 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
@@ -570,65 +668,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
   : ST(ST_) {
   using namespace TargetOpcode;
 
-  auto GetAddrSpacePtr = [&TM](unsigned AS) {
-    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
-  };
-
-  const LLT S1 = LLT::scalar(1);
-  const LLT S8 = LLT::scalar(8);
-  const LLT S16 = LLT::scalar(16);
-  const LLT S32 = LLT::scalar(32);
-  const LLT S64 = LLT::scalar(64);
-  const LLT S128 = LLT::scalar(128);
-  const LLT S256 = LLT::scalar(256);
-  const LLT S512 = LLT::scalar(512);
-  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
-
-  const LLT V2S8 = LLT::fixed_vector(2, 8);
-  const LLT V2S16 = LLT::fixed_vector(2, 16);
-  const LLT V4S16 = LLT::fixed_vector(4, 16);
-
-  const LLT V2S32 = LLT::fixed_vector(2, 32);
-  const LLT V3S32 = LLT::fixed_vector(3, 32);
-  const LLT V4S32 = LLT::fixed_vector(4, 32);
-  const LLT V5S32 = LLT::fixed_vector(5, 32);
-  const LLT V6S32 = LLT::fixed_vector(6, 32);
-  const LLT V7S32 = LLT::fixed_vector(7, 32);
-  const LLT V8S32 = LLT::fixed_vector(8, 32);
-  const LLT V9S32 = LLT::fixed_vector(9, 32);
-  const LLT V10S32 = LLT::fixed_vector(10, 32);
-  const LLT V11S32 = LLT::fixed_vector(11, 32);
-  const LLT V12S32 = LLT::fixed_vector(12, 32);
-  const LLT V13S32 = LLT::fixed_vector(13, 32);
-  const LLT V14S32 = LLT::fixed_vector(14, 32);
-  const LLT V15S32 = LLT::fixed_vector(15, 32);
-  const LLT V16S32 = LLT::fixed_vector(16, 32);
-  const LLT V32S32 = LLT::fixed_vector(32, 32);
-
-  const LLT V2S64 = LLT::fixed_vector(2, 64);
-  const LLT V3S64 = LLT::fixed_vector(3, 64);
-  const LLT V4S64 = LLT::fixed_vector(4, 64);
-  const LLT V5S64 = LLT::fixed_vector(5, 64);
-  const LLT V6S64 = LLT::fixed_vector(6, 64);
-  const LLT V7S64 = LLT::fixed_vector(7, 64);
-  const LLT V8S64 = LLT::fixed_vector(8, 64);
-  const LLT V16S64 = LLT::fixed_vector(16, 64);
-
-  std::initializer_list<LLT> AllS32Vectors =
-    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
-     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
-  std::initializer_list<LLT> AllS64Vectors =
-    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
-
-  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
-  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
-  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
-  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
-  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
-  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
-  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
-  const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
-  const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
+  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS, TM);
+  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS, TM);
+  const LLT Constant32Ptr =
+      GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT, TM);
+  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS, TM);
+  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS, TM);
+  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS, TM);
+  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS, TM);
+  const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER, TM);
+  const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE, TM);
 
   const LLT CodePtr = FlatPtr;
 
@@ -810,10 +859,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     .scalarize(0);
 
   getActionDefinitionsBuilder(G_BITCAST)
-    // Don't worry about the size constraint.
-    .legalIf(all(isRegisterType(0), isRegisterType(1)))
-    .lower();
-
+      // Don't worry about the size constraint.
+      .legalIf(all(isRegType(0, TM), isRegType(1, TM)))
+      .lower();
 
   getActionDefinitionsBuilder(G_CONSTANT)
     .legalFor({S1, S32, S64, S16, GlobalPtr,
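For readers less familiar with GlobalISel legality predicates, the sketch below shows, with simplified stand-ins (FakeLLT, FakeQuery, FakePredicate, and allOf are invented for illustration and are not LLVM's LegalityQuery or all()), how a predicate factory in the style of isRegType(TypeIdx, TM) can be combined so that a rule applies only when every operand type passes:

```cpp
// Hedged, self-contained sketch of predicate composition; not LLVM code.
#include <functional>
#include <vector>

struct FakeLLT { unsigned SizeInBits; };
struct FakeQuery { std::vector<FakeLLT> Types; };
using FakePredicate = std::function<bool(const FakeQuery &)>;

// Combine two predicates so a rule fires only when both match,
// analogous to all(isRegType(0, TM), isRegType(1, TM)) above.
static FakePredicate allOf(FakePredicate A, FakePredicate B) {
  return [=](const FakeQuery &Q) { return A(Q) && B(Q); };
}

// A per-type-index predicate factory, mirroring the isRegType(TypeIdx, TM)
// overload that captures the type index in a lambda.
static FakePredicate typeIs32Bit(unsigned TypeIdx) {
  return [TypeIdx](const FakeQuery &Q) {
    return Q.Types[TypeIdx].SizeInBits == 32;
  };
}

int main() {
  FakeQuery Q{{FakeLLT{32}, FakeLLT{32}}};
  // Both operand types are 32 bits wide, so the combined predicate holds.
  return allOf(typeIs32Bit(0), typeIs32Bit(1))(Q) ? 0 : 1;
}
```

In the rule above, all(isRegType(0, TM), isRegType(1, TM)) plays the role of allOf here, gating the legality of G_BITCAST on both the destination and the source type.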

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll

Lines changed: 162 additions & 0 deletions
@@ -2714,6 +2714,168 @@ entry:
   ret double %ext
 }
 
+define amdgpu_ps double @dyn_extract_v7f64_s_v_bitcast(<14 x float> inreg %userData, i32 %sel) {
+; GCN-LABEL: dyn_extract_v7f64_s_v_bitcast:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: v_mov_b32_e32 v5, s4
+; GCN-NEXT: v_mov_b32_e32 v6, s5
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: v_mov_b32_e32 v7, s6
+; GCN-NEXT: v_mov_b32_e32 v8, s7
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v9, s8
+; GCN-NEXT: v_mov_b32_e32 v10, s9
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT: v_mov_b32_e32 v11, s12
+; GCN-NEXT: v_mov_b32_e32 v12, s13
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT: v_mov_b32_e32 v13, s14
+; GCN-NEXT: v_mov_b32_e32 v14, s15
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
+; GCN-NEXT: ; kill: def $vgpr15 killed $sgpr14 killed $exec
+; GCN-NEXT: ; kill: def $vgpr16 killed $sgpr15 killed $exec
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s1, v1
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: dyn_extract_v7f64_s_v_bitcast:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s19, s5
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: v_mov_b32_e32 v2, s19
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
+; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: s_mov_b32 s9, s11
+; GFX10-NEXT: s_mov_b32 s10, s12
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; GFX10-NEXT: s_mov_b32 s11, s13
+; GFX10-NEXT: s_mov_b32 s12, s14
+; GFX10-NEXT: s_mov_b32 s13, s15
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s15, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: dyn_extract_v7f64_s_v_bitcast:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s19, s5
+; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s19
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: s_mov_b32 s1, s3
+; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s5, s7
+; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
+; GFX11-NEXT: s_mov_b32 s7, s9
+; GFX11-NEXT: s_mov_b32 s8, s10
+; GFX11-NEXT: s_mov_b32 s9, s11
+; GFX11-NEXT: s_mov_b32 s10, s12
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; GFX11-NEXT: s_mov_b32 s11, s13
+; GFX11-NEXT: s_mov_b32 s12, s14
+; GFX11-NEXT: s_mov_b32 s13, s15
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s15, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+entry:
+  %bc = bitcast <14 x float> %userData to <7 x double>
+  %ext = extractelement <7 x double> %bc, i32 %sel
+  ret double %ext
+}
+
+define amdgpu_ps i64 @dyn_extract_v7i64_s_v_bitcast(<14 x i32> inreg %userData, i32 %sel) {
+; GCN-LABEL: dyn_extract_v7i64_s_v_bitcast:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s0, s10
+; GCN-NEXT: s_mov_b32 s1, s11
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: dyn_extract_v7i64_s_v_bitcast:
+; GFX10PLUS: ; %bb.0: ; %entry
+; GFX10PLUS-NEXT: s_mov_b32 s0, s10
+; GFX10PLUS-NEXT: s_mov_b32 s1, s11
+; GFX10PLUS-NEXT: ; return to shader part epilog
+entry:
+  %.bc = bitcast <14 x i32> %userData to <7 x i64>
+  %ext = extractelement <7 x i64> %.bc, i32 4
+  ret i64 %ext
+}
+
 define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel) {
 ; GCN-LABEL: dyn_extract_v7f64_s_v:
 ; GCN: ; %bb.0: ; %entry
