diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 9b62d6067be39..828532dcffb7d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -831,6 +831,13 @@ class CombinerHelper {
   /// Combine ors.
   bool matchOr(MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  /// trunc (binop X, C) --> binop (trunc X, trunc C).
+  bool matchNarrowBinop(const MachineInstr &TruncMI,
+                        const MachineInstr &BinopMI, BuildFnTy &MatchInfo);
+
+  /// trunc (G_CONSTANT C) --> G_CONSTANT (C truncated to the result width).
+  bool matchCastOfInteger(const MachineInstr &CastMI, APInt &MatchInfo);
+
   /// Combine addos.
   bool matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo);
 
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 525cc815e73ce..a595a51d7b01f 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1867,6 +1867,33 @@ class buildvector_of_opcode<Instruction castOpcode> : GICombineRule <
 
 def buildvector_of_truncate : buildvector_of_opcode<G_TRUNC>;
 
+// narrow binop.
+// trunc (binop X, C) --> binop (trunc X, trunc C)
+class narrow_binop_opcode<Instruction binopOpcode> : GICombineRule <
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (G_CONSTANT $const, $imm),
+         (binopOpcode $binop, $x, $const):$Binop,
+         (G_TRUNC $root, $binop):$Trunc,
+   [{ return Helper.matchNarrowBinop(*${Trunc}, *${Binop}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${Trunc}, ${matchinfo}); }])>;
+
+def narrow_binop_add : narrow_binop_opcode<G_ADD>;
+def narrow_binop_sub : narrow_binop_opcode<G_SUB>;
+def narrow_binop_mul : narrow_binop_opcode<G_MUL>;
+def narrow_binop_and : narrow_binop_opcode<G_AND>;
+def narrow_binop_or : narrow_binop_opcode<G_OR>;
+def narrow_binop_xor : narrow_binop_opcode<G_XOR>;
+
+// Cast of integer: trunc (G_CONSTANT C) --> G_CONSTANT (trunc C).
+class integer_of_opcode<Instruction castOpcode> : GICombineRule <
+  (defs root:$root, apint_matchinfo:$matchinfo),
+  (match (G_CONSTANT $int, $imm),
+         (castOpcode $root, $int):$Cast,
+   [{ return Helper.matchCastOfInteger(*${Cast}, ${matchinfo}); }]),
+  (apply [{ Helper.replaceInstWithConstant(*${Cast}, ${matchinfo}); }])>;
+
+def integer_of_truncate : integer_of_opcode<G_TRUNC>;
+
 def cast_combines: GICombineGroup<[
   truncate_of_zext,
   truncate_of_sext,
@@ -1881,7 +1908,14 @@ def cast_combines: GICombineGroup<[
   anyext_of_anyext,
   anyext_of_zext,
   anyext_of_sext,
-  buildvector_of_truncate
+  buildvector_of_truncate,
+  narrow_binop_add,
+  narrow_binop_sub,
+  narrow_binop_mul,
+  narrow_binop_and,
+  narrow_binop_or,
+  narrow_binop_xor,
+  integer_of_truncate
 ]>;
 
 
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
index 8714fdabf6549..30557e6a2304e 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
@@ -313,3 +313,54 @@ bool CombinerHelper::matchCastOfBuildVector(const MachineInstr &CastMI,
 
   return true;
 }
+
+bool CombinerHelper::matchNarrowBinop(const MachineInstr &TruncMI,
+                                      const MachineInstr &BinopMI,
+                                      BuildFnTy &MatchInfo) {
+  const GTrunc *Trunc = cast<GTrunc>(&TruncMI);
+  const GBinOp *BinOp = cast<GBinOp>(&BinopMI);
+
+  // Only narrow when the truncate is the sole non-debug user of the wide
+  // binop; otherwise the wide binop would have to stay around as well.
+  if (!MRI.hasOneNonDBGUse(BinOp->getReg(0)))
+    return false;
+
+  Register Dst = Trunc->getReg(0);
+  LLT DstTy = MRI.getType(Dst);
+
+  // Is narrow binop legal?
+  if (!isLegalOrBeforeLegalizer({BinOp->getOpcode(), {DstTy}}))
+    return false;
+
+  // Rebuild the binop in the truncated type on truncated operands.
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto LHS = B.buildTrunc(DstTy, BinOp->getLHSReg());
+    auto RHS = B.buildTrunc(DstTy, BinOp->getRHSReg());
+    B.buildInstr(BinOp->getOpcode(), {Dst}, {LHS, RHS});
+  };
+
+  return true;
+}
+
+bool CombinerHelper::matchCastOfInteger(const MachineInstr &CastMI,
+                                        APInt &MatchInfo) {
+  const GExtOrTruncOp *Cast = cast<GExtOrTruncOp>(&CastMI);
+
+  // The matching rule only fires when the cast source is a G_CONSTANT.
+  APInt Input = getIConstantFromReg(Cast->getSrcReg(), MRI);
+
+  LLT DstTy = MRI.getType(Cast->getReg(0));
+
+  if (!isConstantLegalOrBeforeLegalizer(DstTy))
+    return false;
+
+  switch (Cast->getOpcode()) {
+  case TargetOpcode::G_TRUNC: {
+    MatchInfo = Input.trunc(DstTy.getScalarSizeInBits());
+    return true;
+  }
+  default:
+    // Only G_TRUNC is folded here.
+    return false;
+  }
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir
new file mode 100644
index 0000000000000..f207e9c149a47
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir
@@ -0,0 +1,136 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s --check-prefixes=CHECK
+
+---
+name: test_combine_trunc_xor_i64
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_xor_i64
+    ; CHECK: %lhs:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %small:_(s32) = G_XOR [[TRUNC]], [[C]]
+    ; CHECK-NEXT: $w0 = COPY %small(s32)
+    %lhs:_(s64) = COPY $x0
+    %rhs:_(s64) = G_CONSTANT i64 5
+    %res:_(s64) = G_XOR %lhs, %rhs
+    %small:_(s32) = G_TRUNC %res(s64)
+    $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_add_i64
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_add_i64
+    ; CHECK: %lhs:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %small:_(s32) = G_ADD [[TRUNC]], [[C]]
+    ; CHECK-NEXT: $w0 = COPY %small(s32)
+    %lhs:_(s64) = COPY $x0
+    %rhs:_(s64) = G_CONSTANT i64 5 ; constant operand, expected to be narrowed to i32 5
+    %res:_(s64) = G_ADD %lhs, %rhs ; wide add whose only user is the truncate below
+    %small:_(s32) = G_TRUNC %res(s64) ; expected to be rewritten as a 32-bit G_ADD
+    $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_mul_i64
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_mul_i64
+    ; CHECK: %lhs:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %small:_(s32) = G_MUL [[TRUNC]], [[C]]
+    ; CHECK-NEXT: $w0 = COPY %small(s32)
+    %lhs:_(s64) = COPY $x0
+    %rhs:_(s64) = G_CONSTANT i64 5 ; constant operand, expected to be narrowed to i32 5
+    %res:_(s64) = G_MUL %lhs, %rhs ; wide multiply whose only user is the truncate below
+    %small:_(s32) = G_TRUNC %res(s64) ; expected to be rewritten as a 32-bit G_MUL
+    $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_and_i64
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_and_i64
+    ; CHECK: %lhs:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %small:_(s32) = G_AND [[TRUNC]], [[C]]
+    ; CHECK-NEXT: $w0 = COPY %small(s32)
+    %lhs:_(s64) = COPY $x0
+    %rhs:_(s64) = G_CONSTANT i64 5 ; constant operand, expected to be narrowed to i32 5
+    %res:_(s64) = G_AND %lhs, %rhs ; wide and whose only user is the truncate below
+    %small:_(s32) = G_TRUNC %res(s64) ; expected to be rewritten as a 32-bit G_AND
+    $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_or_i64
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_or_i64
+    ; CHECK: %lhs:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %small:_(s32) = G_OR [[TRUNC]], [[C]]
+    ; CHECK-NEXT: $w0 = COPY %small(s32)
+    %lhs:_(s64) = COPY $x0
+    %rhs:_(s64) = G_CONSTANT i64 5 ; constant operand, expected to be narrowed to i32 5
+    %res:_(s64) = G_OR %lhs, %rhs ; wide or whose only user is the truncate below
+    %small:_(s32) = G_TRUNC %res(s64) ; expected to be rewritten as a 32-bit G_OR
+    $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_sub_i128
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_sub_i128
+    ; CHECK: %lhs:_(s128) = COPY $q0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s128)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: %small:_(s32) = G_SUB [[TRUNC]], [[C]]
+    ; CHECK-NEXT: $w0 = COPY %small(s32)
+    %lhs:_(s128) = COPY $q0
+    %rhs:_(s128) = G_CONSTANT i128 5 ; constant operand, expected to be narrowed to i32 5
+    %res:_(s128) = G_SUB %lhs, %rhs ; 128-bit subtract whose only user is the truncate below
+    %small:_(s32) = G_TRUNC %res(s128) ; expected to be rewritten as a 32-bit G_SUB
+    $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_sub_i128_multi_use
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_trunc_sub_i128_multi_use
+    ; CHECK: %lhs:_(s128) = COPY $q0
+    ; CHECK-NEXT: %rhs:_(s128) = G_CONSTANT i128 5
+    ; CHECK-NEXT: %res:_(s128) = G_SUB %lhs, %rhs
+    ; CHECK-NEXT: %small:_(s32) = G_TRUNC %res(s128)
+    ; CHECK-NEXT: $q0 = COPY %res(s128)
+    ; CHECK-NEXT: $w0 = COPY %small(s32)
+    %lhs:_(s128) = COPY $q0
+    %rhs:_(s128) = G_CONSTANT i128 5
+    %res:_(s128) = G_SUB %lhs, %rhs ; used by both the truncate and the $q0 copy
+    %small:_(s32) = G_TRUNC %res(s128) ; expected to stay as is because %res has a second user
+    $q0 = COPY %res(s128) ; second use of %res
+    $w0 = COPY %small(s32)
+...
+--- +name: test_combine_trunc_xor_vector_pattern_did_not_match +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_xor_vector_pattern_did_not_match + ; CHECK: %arg1:_(s64) = COPY $x0 + ; CHECK-NEXT: %arg2:_(s64) = COPY $x0 + ; CHECK-NEXT: %lhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + ; CHECK-NEXT: %rhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + ; CHECK-NEXT: %res:_(<2 x s64>) = G_XOR %lhs, %rhs + ; CHECK-NEXT: %small:_(<2 x s16>) = G_TRUNC %res(<2 x s64>) + ; CHECK-NEXT: $w0 = COPY %small(<2 x s16>) + %arg1:_(s64) = COPY $x0 + %arg2:_(s64) = COPY $x0 + %lhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + %rhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + %res:_(<2 x s64>) = G_XOR %lhs, %rhs + %small:_(<2 x s16>) = G_TRUNC %res(<2 x s64>) + $w0 = COPY %small(<2 x s16>) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir index fee5afd3ddbb2..9ed1e2d9eee3b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir @@ -224,10 +224,10 @@ body: | ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) ; CHECK-NEXT: G_STORE [[C]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1) - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64) - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; CHECK-NEXT: G_STORE [[TRUNC]](s16), [[PTR_ADD1]](p0) :: (store (s16) into %ir.dst + 16, align 1) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 16448 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; CHECK-NEXT: G_STORE [[C2]](s16), [[PTR_ADD1]](p0) :: (store (s16) into %ir.dst + 16, align 1) ; CHECK-NEXT: RET_ReallyLR %0:_(p0) = 
COPY $x0 %1:_(s8) = G_CONSTANT i8 64 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir index e51d9bd13163b..a87ff305d1535 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir @@ -8,9 +8,8 @@ tracksRegLiveness: true body: | bb.1: ; CHECK-LABEL: name: test - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) - ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: $w0 = COPY [[C]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(s16) = G_CONSTANT i16 0 %2:_(s1) = G_CONSTANT i1 true @@ -41,9 +40,7 @@ body: | bb.1: ; CHECK-LABEL: name: test_inverted_div_rem ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32) - ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s8) - ; CHECK-NEXT: $w0 = COPY [[SEXT]](s32) + ; CHECK-NEXT: $w0 = COPY [[C]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(s16) = G_CONSTANT i16 0 %2:_(s1) = G_CONSTANT i1 true diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir index e4f11dfa9e027..d6135d86022be 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir @@ -193,10 +193,10 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = 
G_AMDGPU_CVT_F32_UBYTE0 [[TRUNC]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_CONSTANT i64 255 @@ -216,10 +216,10 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[TRUNC]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_CONSTANT i64 255 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir index 3b914df7f8f8a..3423af64162e5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir @@ -12,9 +12,11 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s32) = COPY $vgpr0 - ; GCN-NEXT: %c3FFF:_(s32) = G_CONSTANT i32 16383 - ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %c3FFF - ; GCN-NEXT: $vgpr0 = COPY %low_bits(s32) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383 + ; GCN-NEXT: %trunc:_(s16) = G_AND [[TRUNC]], [[C]] + ; GCN-NEXT: 
%zext:_(s32) = G_ZEXT %trunc(s16) + ; GCN-NEXT: $vgpr0 = COPY %zext(s32) %var:_(s32) = COPY $vgpr0 %c3FFF:_(s32) = G_CONSTANT i32 16383 %low_bits:_(s32) = G_AND %var, %c3FFF @@ -34,10 +36,8 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s32) = COPY $vgpr0 - ; GCN-NEXT: %cFFFFF:_(s32) = G_CONSTANT i32 1048575 - ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %cFFFFF - ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s32) - ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32) + ; GCN-NEXT: %zext:_(s32) = G_ZEXT [[TRUNC]](s16) ; GCN-NEXT: $vgpr0 = COPY %zext(s32) %var:_(s32) = COPY $vgpr0 %cFFFFF:_(s32) = G_CONSTANT i32 1048575 @@ -58,9 +58,9 @@ body: | ; GCN: liveins: $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s64) = COPY $vgpr0_vgpr1 - ; GCN-NEXT: %c3FFF:_(s64) = G_CONSTANT i64 16383 - ; GCN-NEXT: %low_bits:_(s64) = G_AND %var, %c3FFF - ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s64) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s64) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383 + ; GCN-NEXT: %trunc:_(s16) = G_AND [[TRUNC]], [[C]] ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16) ; GCN-NEXT: $vgpr0 = COPY %zext(s32) %var:_(s64) = COPY $vgpr0_vgpr1 @@ -82,9 +82,9 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s32) = COPY $vgpr0 - ; GCN-NEXT: %c3FFF:_(s32) = G_CONSTANT i32 16383 - ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %c3FFF - ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s32) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383 + ; GCN-NEXT: %trunc:_(s16) = G_AND [[TRUNC]], [[C]] ; GCN-NEXT: %zext:_(s64) = G_ZEXT %trunc(s16) ; GCN-NEXT: $vgpr0_vgpr1 = COPY %zext(s64) %var:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index 966a481b6594d..bb7bc0447aea0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -238,13 +238,12 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0 ; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_xor_b32_e32 v3, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 ; GFX10-NEXT: ; implicit-def: $vgpr3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 ; GFX10-NEXT: s_cbranch_vccnz .LBB4_4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index afffebea451a0..3bd3486ec261d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -350,10 +350,12 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX8-LABEL: s_fshl_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_and_b32 s3, s2, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -362,10 +364,12 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX9-LABEL: 
s_fshl_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_and_b32 s3, s2, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -377,7 +381,9 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX10-NEXT: s_and_b32 s3, s2, 7 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -389,7 +395,9 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX11-NEXT: s_and_b32 s3, s2, 7 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_lshl_b32 s0, s0, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -416,11 +424,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, 1 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -429,11 +437,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; 
GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -441,11 +449,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX10-LABEL: v_fshl_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v3, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -454,12 +462,12 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX11-LABEL: v_fshl_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v3, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1 @@ -692,22 +700,26 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; 
GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 ; GFX8-NEXT: s_and_b32 s6, s2, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshl_b32 s0, s0, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, s4, 0xff +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s3, 7, s5 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_andn2_b32 s2, 7, s5 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_lshr_b32 s2, s2, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff @@ -719,22 +731,26 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 ; GFX9-NEXT: s_and_b32 s6, s2, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshl_b32 s0, s0, s6 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s3, s4, 0xff +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_andn2_b32 s3, 7, s5 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_andn2_b32 s2, 7, s5 -; 
GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s0, s0, 0xff @@ -745,21 +761,25 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX10-LABEL: s_fshl_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 -; GFX10-NEXT: s_lshr_b32 s5, s2, 8 +; GFX10-NEXT: s_and_b32 s5, s2, 7 +; GFX10-NEXT: s_lshr_b32 s6, s2, 8 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: s_and_b32 s6, s2, 7 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_and_b32 s5, s6, 7 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_andn2_b32 s6, 7, s6 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s6, s5, 7 -; GFX10-NEXT: s_andn2_b32 s5, 7, s5 -; GFX10-NEXT: s_lshr_b32 s4, s4, 1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_lshr_b32 s4, s4, 1 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s3, s3, s5 +; GFX10-NEXT: s_lshr_b32 s4, s4, s6 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -772,21 +792,25 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX11-LABEL: s_fshl_v2i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s4, s1, 8 -; GFX11-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-NEXT: s_and_b32 s5, s2, 7 +; GFX11-NEXT: s_lshr_b32 s6, s2, 8 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_and_b32 s6, s2, 7 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 ; GFX11-NEXT: s_and_b32 s1, 
s1, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s0, s5 +; GFX11-NEXT: s_and_b32 s5, s6, 7 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_and_not1_b32 s6, 7, s6 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, s6 -; GFX11-NEXT: s_and_b32 s6, s5, 7 -; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 -; GFX11-NEXT: s_lshr_b32 s4, s4, 1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_lshr_b32 s4, s4, 1 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_lshl_b32 s3, s3, s6 -; GFX11-NEXT: s_lshr_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshl_b32 s3, s3, s5 +; GFX11-NEXT: s_lshr_b32 s4, s4, s6 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_or_b32 s2, s3, s4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -837,20 +861,20 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, 1 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 -; GFX8-NEXT: v_not_b32_e32 v2, v5 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v1, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: 
v_and_b32_e32 v3, 7, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v3, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -863,20 +887,20 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 1 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 -; GFX9-NEXT: v_not_b32_e32 v2, v5 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v1, v3 -; GFX9-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX9-NEXT: v_lshrrev_b16_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -886,24 +910,24 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX10-LABEL: v_fshl_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX10-NEXT: v_not_b32_e32 v7, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: 
v_not_b32_e32 v6, v3 -; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_lshrrev_b16 v4, 1, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v3, v3, v5 +; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_lshlrev_b16 v4, v4, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: v_lshrrev_b16 v4, v6, v4 +; GFX10-NEXT: v_lshrrev_b16 v3, v6, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -913,26 +937,26 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX11-LABEL: v_fshl_v2i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX11-NEXT: v_not_b32_e32 v7, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX11-NEXT: v_lshrrev_b16 v4, 1, v4 ; GFX11-NEXT: 
v_lshrrev_b16 v1, 1, v1 -; GFX11-NEXT: v_lshlrev_b16 v3, v3, v5 +; GFX11-NEXT: v_lshrrev_b16 v3, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX11-NEXT: v_lshlrev_b16 v4, v4, v5 ; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b16 v4, v6, v4 +; GFX11-NEXT: v_lshrrev_b16 v3, v6, v3 ; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 @@ -1002,13 +1026,15 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s2, 24 ; GFX8-NEXT: s_and_b32 s12, s2, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 @@ -1016,29 +1042,35 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, s6, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, s6, 0xff +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s3, 7, s9 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: 
s_andn2_b32 s2, 7, s9 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_lshr_b32 s2, s2, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s2, s10, 7 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s3, s7, 0xff ; GFX8-NEXT: s_lshl_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s4, s7, 0xff +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_andn2_b32 s4, 7, s10 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NEXT: s_andn2_b32 s3, 7, s10 -; GFX8-NEXT: s_lshr_b32 s4, s4, 1 -; GFX8-NEXT: s_lshr_b32 s3, s4, s3 +; GFX8-NEXT: s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: s_and_b32 s3, s11, 7 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_andn2_b32 s4, 7, s11 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s3, s5, s3 -; GFX8-NEXT: s_lshr_b32 s5, s8, 1 +; GFX8-NEXT: s_andn2_b32 s5, 7, s11 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_lshr_b32 s4, s8, 1 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_lshr_b32 s4, s5, s4 +; GFX8-NEXT: s_lshr_b32 s4, s4, s5 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_or_b32 s3, s3, s4 @@ -1055,13 +1087,15 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 ; GFX9-NEXT: s_lshr_b32 s11, s2, 24 ; GFX9-NEXT: s_and_b32 s12, s2, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 @@ -1069,29 +1103,35 @@ define 
amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, s6, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s3, s6, 0xff +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_andn2_b32 s3, 7, s9 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_andn2_b32 s2, 7, s9 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s10, 7 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_and_b32 s3, s7, 0xff ; GFX9-NEXT: s_lshl_b32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s4, s7, 0xff +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_andn2_b32 s4, 7, s10 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX9-NEXT: s_andn2_b32 s3, 7, s10 -; GFX9-NEXT: s_lshr_b32 s4, s4, 1 -; GFX9-NEXT: s_lshr_b32 s3, s4, s3 +; GFX9-NEXT: s_lshr_b32 s3, s3, s4 ; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: s_and_b32 s3, s11, 7 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_andn2_b32 s4, 7, s11 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshl_b32 s3, s5, s3 -; GFX9-NEXT: s_lshr_b32 s5, s8, 1 +; GFX9-NEXT: s_andn2_b32 s5, 7, s11 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_lshr_b32 s4, s8, 1 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshr_b32 s4, s5, s4 +; GFX9-NEXT: s_lshr_b32 s4, s4, s5 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s2, 0xff ; GFX9-NEXT: s_or_b32 s3, s3, s4 @@ -1108,48 +1148,56 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: 
s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_and_b32 s11, s2, 7 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_and_b32 s12, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_andn2_b32 s12, 7, s2 +; GFX10-NEXT: s_and_b32 s11, 0xffff, s11 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s2, s6, 0xff -; GFX10-NEXT: s_and_b32 s6, s9, 7 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_andn2_b32 s9, 7, s9 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s12 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s2, s2, s9 +; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s11 +; GFX10-NEXT: s_lshr_b32 s1, s1, s12 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s7, 0xff -; GFX10-NEXT: s_and_b32 s3, s10, 7 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_andn2_b32 s6, 7, s10 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_and_b32 s1, s9, 7 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_andn2_b32 s9, 7, s9 +; GFX10-NEXT: s_lshr_b32 s10, s2, 16 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshr_b32 s6, s6, 1 +; GFX10-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX10-NEXT: s_lshl_b32 s1, s3, s1 +; GFX10-NEXT: s_lshr_b32 s3, s6, s9 +; GFX10-NEXT: s_and_b32 s6, s10, 7 +; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s6 +; GFX10-NEXT: s_and_b32 s6, s7, 0xff +; GFX10-NEXT: s_lshr_b32 s2, s2, 24 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_and_b32 s4, s11, 7 -; GFX10-NEXT: s_andn2_b32 s6, 7, s11 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX10-NEXT: s_andn2_b32 s6, 7, 
s10 +; GFX10-NEXT: s_lshr_b32 s4, s4, 1 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_and_b32 s7, s2, 7 +; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshr_b32 s4, s4, s6 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s7 ; GFX10-NEXT: s_lshr_b32 s7, s8, 1 -; GFX10-NEXT: s_lshl_b32 s4, s5, s4 -; GFX10-NEXT: s_lshr_b32 s5, s7, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s5, s5, s6 +; GFX10-NEXT: s_lshr_b32 s2, s7, s2 +; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_or_b32 s3, s4, s5 +; GFX10-NEXT: s_or_b32 s2, s5, s2 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_and_b32 s3, s3, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s1, s3, 16 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -1161,48 +1209,56 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX11-NEXT: s_lshr_b32 s7, s1, 16 ; GFX11-NEXT: s_lshr_b32 s8, s1, 24 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-NEXT: s_and_b32 s11, s2, 7 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshr_b32 s10, s2, 16 -; GFX11-NEXT: s_lshr_b32 s11, s2, 24 -; GFX11-NEXT: s_and_b32 s12, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_not1_b32 s12, 7, s2 +; GFX11-NEXT: s_and_b32 s11, 0xffff, s11 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s1, s1, s2 -; GFX11-NEXT: s_and_b32 s2, s6, 0xff -; GFX11-NEXT: s_and_b32 s6, s9, 7 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 
; GFX11-NEXT: s_lshr_b32 s5, s0, 24 -; GFX11-NEXT: s_lshl_b32 s0, s0, s12 -; GFX11-NEXT: s_lshl_b32 s3, s3, s6 -; GFX11-NEXT: s_lshr_b32 s2, s2, s9 +; GFX11-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, s11 +; GFX11-NEXT: s_lshr_b32 s1, s1, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s3, s2 -; GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: s_and_b32 s3, s10, 7 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_and_b32 s1, s9, 7 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 +; GFX11-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshr_b32 s6, s6, 1 +; GFX11-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX11-NEXT: s_lshl_b32 s1, s3, s1 +; GFX11-NEXT: s_lshr_b32 s3, s6, s9 +; GFX11-NEXT: s_and_b32 s6, s10, 7 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s6 +; GFX11-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-NEXT: s_lshr_b32 s2, s2, 24 ; GFX11-NEXT: s_lshl_b32 s3, s4, s3 -; GFX11-NEXT: s_lshr_b32 s2, s2, s6 -; GFX11-NEXT: s_and_b32 s4, s11, 7 -; GFX11-NEXT: s_and_not1_b32 s6, 7, s11 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 +; GFX11-NEXT: s_lshr_b32 s4, s4, 1 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_and_b32 s7, s2, 7 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshr_b32 s4, s4, s6 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s7 ; GFX11-NEXT: s_lshr_b32 s7, s8, 1 -; GFX11-NEXT: s_lshl_b32 s4, s5, s4 -; GFX11-NEXT: s_lshr_b32 s5, s7, s6 -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshl_b32 s5, s5, s6 +; GFX11-NEXT: s_lshr_b32 s2, s7, s2 +; GFX11-NEXT: s_or_b32 s3, s3, s4 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: s_or_b32 s2, s5, s2 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-NEXT: 
s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s1, s2, 16 -; GFX11-NEXT: s_and_b32 s2, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s3, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_lshl_b32 s1, s2, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1271,37 +1327,38 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-LABEL: v_fshl_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_not_b32_e32 v7, v2 -; GFX8-NEXT: v_mov_b32_e32 v9, 1 +; GFX8-NEXT: v_mov_b32_e32 v8, 1 +; GFX8-NEXT: v_xor_b32_e32 v10, -1, v2 ; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, v6, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v10 +; GFX8-NEXT: v_lshrrev_b16_e32 v9, v10, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX8-NEXT: v_and_b32_e32 v9, 7, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v9, v3 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v8, 0xff +; GFX8-NEXT: v_mov_b32_e32 
v7, 0xff ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 7 +; GFX8-NEXT: v_mov_b32_e32 v9, -1 ; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v8 +; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7 +; GFX8-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v10, v7 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 @@ -1320,46 +1377,47 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-LABEL: v_fshl_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_not_b32_e32 v7, v2 -; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; 
GFX9-NEXT: v_mov_b32_e32 v8, 1 +; GFX9-NEXT: v_xor_b32_e32 v10, -1, v2 ; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v0 -; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10 +; GFX9-NEXT: v_lshrrev_b16_e32 v9, v10, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX9-NEXT: v_not_b32_e32 v5, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 7, v5 +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, v9, v3 ; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4 -; GFX9-NEXT: v_mov_b32_e32 v8, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 0xff ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, 7 +; GFX9-NEXT: v_mov_b32_e32 v10, -1 ; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_xor_b32_sdwa v11, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: 
v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshrrev_b16_e32 v10, 1, v10 +; GFX9-NEXT: v_xor_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v9, 1, v9 +; GFX9-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10 +; GFX9-NEXT: v_lshrrev_b16_e32 v9, v11, v9 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 -; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v9 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v5 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_and_or_b32 v1, v6, v8, v1 +; GFX9-NEXT: v_and_or_b32 v1, v6, v7, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1368,41 +1426,42 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-LABEL: v_fshl_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 0xff, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 
8, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v1 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_not_b32_e32 v12, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_lshrrev_b16 v9, 1, v11 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v12 -; GFX10-NEXT: v_mov_b32_e32 v12, 0xff -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX10-NEXT: v_lshlrev_b16 v3, v7, v3 -; GFX10-NEXT: v_mov_b32_e32 v7, 7 -; GFX10-NEXT: v_not_b32_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_not_b32_sdwa v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_not_b32_e32 v8, v2 -; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_sdwa v14, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 +; GFX10-NEXT: v_lshrrev_b16 v8, 1, v9 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v10 +; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3 +; GFX10-NEXT: v_mov_b32_e32 v10, 0xff +; GFX10-NEXT: v_mov_b32_e32 v11, -1 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX10-NEXT: v_mov_b32_e32 v13, 7 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_sdwa v10, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_sdwa v11, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: 
v_lshrrev_b16 v7, 1, v7 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_sdwa v14, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v7, 7, v12 -; GFX10-NEXT: v_lshrrev_b16 v10, 1, v10 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_lshrrev_b16 v6, v11, v6 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7 ; GFX10-NEXT: v_lshlrev_b16 v4, v14, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v13, v1 +; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 ; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 -; GFX10-NEXT: v_lshrrev_b16 v5, v7, v10 -; GFX10-NEXT: v_lshrrev_b16 v7, v8, v9 +; GFX10-NEXT: v_lshrrev_b16 v5, v11, v12 +; GFX10-NEXT: v_lshrrev_b16 v7, v9, v8 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 @@ -1426,7 +1485,7 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_not_b32_e32 v13, v9 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v1 @@ -1434,22 +1493,22 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-NEXT: v_lshlrev_b16 v3, v9, v3 -; GFX11-NEXT: v_not_b32_e32 v9, v10 +; GFX11-NEXT: v_xor_b32_e32 v9, -1, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6 -; GFX11-NEXT: 
v_not_b32_e32 v13, v11 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v11 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX11-NEXT: v_and_b32_e32 v12, 7, v2 -; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX11-NEXT: v_lshrrev_b16 v7, 1, v7 +; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX11-NEXT: v_lshrrev_b16 v8, 1, v8 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX11-NEXT: v_lshlrev_b16 v4, v10, v4 ; GFX11-NEXT: v_lshrrev_b16 v6, v9, v7 @@ -5087,23 +5146,48 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) } define amdgpu_ps i64 @s_fshl_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt) { -; GCN-LABEL: s_fshl_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GCN-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_fshl_i64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX6-NEXT: s_not_b32 s4, s4 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX8-NEXT: s_not_b32 s4, s4 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i64: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX9-NEXT: s_not_b32 s4, s4 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_not_b32 s5, s4 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[4:5] ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_not_b32 s5, s4 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: ; return to shader part epilog @@ -5181,8 +5265,8 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 +; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 @@ -5194,8 +5278,8 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] @@ -5207,8 +5291,8 @@ define i64 @v_fshl_i64(i64 %lhs, 
i64 %rhs, i64 %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] @@ -5362,36 +5446,36 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v1, 63, v0 ; GFX6-NEXT: v_not_b32_e32 v0, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v1 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX6-NEXT: v_lshr_b64 v[2:3], s[0:1], v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX6-NEXT: v_lshr_b64 v[3:4], s[0:1], v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i64_ssv: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v1, 63, v0 ; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1] +; GFX8-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i64_ssv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v1, 63, v0 ; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX9-NEXT: 
v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1] +; GFX9-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i64_ssv: @@ -5429,10 +5513,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX6-LABEL: v_fshl_i64_svs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s2 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX6-NEXT: s_andn2_b32 s3, 63, s2 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s3 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -5440,10 +5523,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX8-LABEL: v_fshl_i64_svs: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX8-NEXT: s_andn2_b32 s3, 63, s2 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -5451,10 +5533,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX9-LABEL: v_fshl_i64_svs: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_andn2_b32 s3, 63, s2 +; 
GFX9-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX9-NEXT: ; return to shader part epilog @@ -5462,10 +5543,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX10-LABEL: v_fshl_i64_svs: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_andn2_b64 s[4:5], 63, s[2:3] -; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], 63 +; GFX10-NEXT: s_andn2_b32 s3, 63, s2 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -5473,13 +5553,12 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX11-LABEL: v_fshl_i64_svs: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[2:3] -; GFX11-NEXT: s_and_b64 s[2:3], s[2:3], 63 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_and_not1_b32 s3, 63, s2 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) @@ -5490,10 +5569,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg %amt) { ; GFX6-LABEL: v_fshl_i64_vss: ; GFX6: ; 
%bb.0: -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s4 +; GFX6-NEXT: s_and_b32 s3, s2, 63 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s3 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX6-NEXT: s_not_b32 s2, s2 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5501,10 +5580,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX8-LABEL: v_fshl_i64_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX8-NEXT: s_and_b32 s3, s2, 63 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX8-NEXT: s_not_b32 s2, s2 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5512,10 +5591,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX9-LABEL: v_fshl_i64_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX9-NEXT: s_and_b32 s3, s2, 63 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX9-NEXT: s_not_b32 s2, s2 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5523,10 +5602,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX10-LABEL: v_fshl_i64_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX10-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: s_and_b32 s3, s2, 63 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] +; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: 
s_lshr_b64 s[0:1], s[0:1], s2 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5534,10 +5613,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX11-LABEL: v_fshl_i64_vss: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX11-NEXT: s_and_not1_b64 s[2:3], 63, s[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX11-NEXT: s_and_b32 s3, s2, 63 ; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] +; GFX11-NEXT: s_not_b32 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -5553,80 +5632,70 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg define amdgpu_ps <2 x i64> @s_fshl_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs, <2 x i64> inreg %amt) { ; GFX6-LABEL: s_fshl_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX6-NEXT: s_not_b32 s8, s8 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX6-NEXT: s_not_b32 s6, s10 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX8-NEXT: 
s_lshl_b64 s[0:1], s[0:1], s12 +; GFX8-NEXT: s_not_b32 s8, s8 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX8-NEXT: s_not_b32 s6, s10 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX9-NEXT: s_not_b32 s8, s8 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX9-NEXT: s_not_b32 s6, s10 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX10-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX10-NEXT: s_not_b32 s9, s8 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX10-NEXT: s_and_b64 s[8:9], s[10:11], 63 -; GFX10-NEXT: s_andn2_b64 s[10:11], 63, s[10:11] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 +; GFX10-NEXT: 
s_not_b32 s8, s10 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s9 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s8 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_v2i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX11-NEXT: s_and_not1_b64 s[8:9], 63, s[8:9] ; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX11-NEXT: s_not_b32 s9, s8 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX11-NEXT: s_and_b64 s[8:9], s[10:11], 63 -; GFX11-NEXT: s_and_not1_b64 s[10:11], 63, s[10:11] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 +; GFX11-NEXT: s_not_b32 s8, s10 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s9 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s8 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX11-NEXT: ; return to shader part epilog @@ -5639,18 +5708,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], 1 +; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 -; GFX6-NEXT: v_not_b32_e32 v8, v10 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], 1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v8 +; GFX6-NEXT: v_not_b32_e32 v4, v10 +; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; 
GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v7 @@ -5660,18 +5729,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] -; GFX8-NEXT: v_not_b32_e32 v8, v10 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v8, v[6:7] +; GFX8-NEXT: v_not_b32_e32 v4, v10 +; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v7 @@ -5681,18 +5750,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: v_not_b32_e32 v8, v10 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v8, v[6:7] +; GFX9-NEXT: v_not_b32_e32 v4, v10 +; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, 
v[6:7] ; GFX9-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v7 @@ -5750,231 +5819,236 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: s_fshl_i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX6-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX6-NEXT: s_sub_i32 s9, s10, 64 -; GFX6-NEXT: s_sub_i32 s11, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_and_b32 s9, s8, 0x7f +; GFX6-NEXT: s_sub_i32 s11, s9, 64 +; GFX6-NEXT: s_sub_i32 s14, 64, s9 +; GFX6-NEXT: s_cmp_lt_u32 s9, 64 ; GFX6-NEXT: s_cselect_b32 s18, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 -; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; GFX6-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s9, 0 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[14:15], s[0:1], s14 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[2:3], s8 +; GFX6-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 +; GFX6-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_mov_b32 s12, 0 +; GFX6-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s9, 0 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX6-NEXT: s_lshl_b32 s13, s6, 31 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX6-NEXT: s_lshl_b32 s11, s6, 31 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX6-NEXT: 
s_sub_i32 s12, s8, 64 -; GFX6-NEXT: s_sub_i32 s10, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX6-NEXT: s_not_b32 s9, s8 +; GFX6-NEXT: s_sub_i32 s14, s6, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s6 +; GFX6-NEXT: s_cmp_lt_u32 s6, 64 +; GFX6-NEXT: s_cselect_b32 s15, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], s9 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s9 ; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX6-NEXT: s_cmp_lg_u32 s15, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 +; GFX6-NEXT: s_cmp_lg_u32 s15, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 -; GFX6-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] +; GFX6-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX8-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX8-NEXT: s_sub_i32 s9, s10, 64 -; GFX8-NEXT: s_sub_i32 s11, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 s10, 64 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_and_b32 s9, s8, 0x7f +; GFX8-NEXT: s_sub_i32 s11, s9, 64 +; GFX8-NEXT: s_sub_i32 s14, 64, s9 +; GFX8-NEXT: s_cmp_lt_u32 s9, 64 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 -; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; 
GFX8-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s9, 0 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[14:15], s[0:1], s14 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[2:3], s8 +; GFX8-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 +; GFX8-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX8-NEXT: s_cmp_lg_u32 s18, 0 -; GFX8-NEXT: s_mov_b32 s12, 0 +; GFX8-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s9, 0 +; GFX8-NEXT: s_mov_b32 s10, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX8-NEXT: s_lshl_b32 s13, s6, 31 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX8-NEXT: s_lshl_b32 s11, s6, 31 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX8-NEXT: s_sub_i32 s12, s8, 64 -; GFX8-NEXT: s_sub_i32 s10, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX8-NEXT: s_not_b32 s9, s8 +; GFX8-NEXT: s_sub_i32 s14, s6, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s6 +; GFX8-NEXT: s_cmp_lt_u32 s6, 64 +; GFX8-NEXT: s_cselect_b32 s15, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[4:5], s9 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s9 ; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], 
s[8:9], s[4:5] ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] +; GFX8-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX9-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX9-NEXT: s_sub_i32 s9, s10, 64 -; GFX9-NEXT: s_sub_i32 s11, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_and_b32 s9, s8, 0x7f +; GFX9-NEXT: s_sub_i32 s11, s9, 64 +; GFX9-NEXT: s_sub_i32 s14, 64, s9 +; GFX9-NEXT: s_cmp_lt_u32 s9, 64 ; GFX9-NEXT: s_cselect_b32 s18, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 -; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; GFX9-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s9, 0 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[0:1], s14 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[2:3], s8 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 +; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_mov_b32 s12, 0 +; GFX9-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: s_mov_b32 s10, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX9-NEXT: s_lshl_b32 s13, s6, 31 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX9-NEXT: 
s_lshl_b32 s11, s6, 31 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX9-NEXT: s_sub_i32 s12, s8, 64 -; GFX9-NEXT: s_sub_i32 s10, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX9-NEXT: s_not_b32 s9, s8 +; GFX9-NEXT: s_sub_i32 s14, s6, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s6 +; GFX9-NEXT: s_cmp_lt_u32 s6, 64 +; GFX9-NEXT: s_cselect_b32 s15, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], s9 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s9 ; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 -; GFX9-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] +; GFX9-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX10-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX10-NEXT: s_sub_i32 s9, s10, 64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 -; GFX10-NEXT: s_mov_b32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_and_b32 s9, s8, 0x7f +; GFX10-NEXT: s_mov_b32 s10, 0 +; GFX10-NEXT: s_sub_i32 s11, s9, 64 +; GFX10-NEXT: s_sub_i32 s12, 64, s9 +; GFX10-NEXT: s_cmp_lt_u32 s9, 64 ; GFX10-NEXT: s_cselect_b32 
s18, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[14:15], s[0:1], s11 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[2:3], s10 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 -; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s9, 0 +; GFX10-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[0:1], s12 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 +; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 +; GFX10-NEXT: s_cselect_b64 s[14:15], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX10-NEXT: s_lshl_b32 s13, s6, 31 +; GFX10-NEXT: s_lshl_b32 s11, s6, 31 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] -; GFX10-NEXT: s_sub_i32 s14, s8, 64 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_not_b32 s10, s8 +; GFX10-NEXT: s_sub_i32 s12, s6, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s6 +; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_cselect_b32 s13, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX10-NEXT: s_lshl_b64 s[12:13], s[4:5], s9 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 -; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s10 +; GFX10-NEXT: 
s_lshl_b64 s[8:9], s[4:5], s8 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s15, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 -; GFX10-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 +; GFX10-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[8:9], 0x7f, s[8:9] -; GFX11-NEXT: s_sub_i32 s9, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 -; GFX11-NEXT: s_mov_b32 s12, 0 -; GFX11-NEXT: s_cselect_b32 s13, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_and_b32 s9, s8, 0x7f +; GFX11-NEXT: s_mov_b32 s10, 0 +; GFX11-NEXT: s_sub_i32 s11, s9, 64 +; GFX11-NEXT: s_sub_i32 s12, 64, s9 +; GFX11-NEXT: s_cmp_lt_u32 s9, 64 ; GFX11-NEXT: s_cselect_b32 s18, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[14:15], s[0:1], s11 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[2:3], s10 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 -; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s9, 0 +; GFX11-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[12:13], s[0:1], s12 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 +; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX11-NEXT: s_cmp_lg_u32 
s18, 0 +; GFX11-NEXT: s_cselect_b64 s[14:15], s[16:17], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX11-NEXT: s_lshl_b32 s13, s6, 31 +; GFX11-NEXT: s_lshl_b32 s11, s6, 31 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] -; GFX11-NEXT: s_sub_i32 s14, s8, 64 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s15, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_and_not1_b32 s6, 0x7f, s8 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX11-NEXT: s_not_b32 s10, s8 +; GFX11-NEXT: s_sub_i32 s12, s6, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s6 +; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_cselect_b32 s13, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: s_cselect_b32 s16, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX11-NEXT: s_lshl_b64 s[12:13], s[4:5], s9 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 -; GFX11-NEXT: s_cmp_lg_u32 s15, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s10 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX11-NEXT: s_cmp_lg_u32 s16, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s15, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 -; GFX11-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 +; GFX11-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 
%lhs, i128 %rhs, i128 %amt) @@ -5985,143 +6059,143 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v14 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v14 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v8 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v14 -; GFX6-NEXT: v_lshl_b64 v[12:13], v[0:1], v14 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v15 +; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15 +; GFX6-NEXT: v_lshr_b64 v[9:10], v[0:1], v9 +; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15 +; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v15 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v16 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX6-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GFX6-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX6-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc ; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v6 +; GFX6-NEXT: v_not_b32_e32 v4, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[6:7], 1 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v15 -; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, 64, v15 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v15 +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v4 +; 
GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v14 +; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v14 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v6 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v15 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v14 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v14 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v15 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v11, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v14 -; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v14 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v14, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1] +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v15 +; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15 +; GFX8-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] -; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 ; 
GFX8-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX8-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v6 +; GFX8-NEXT: v_not_b32_e32 v4, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v15 -; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, 64, v15 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v4 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v14 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v15, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; 
GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v11, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, 64, v14 -; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v14 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v14, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v9, 64, v15 +; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15 +; GFX9-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX9-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v10, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v13, v8, v3, vcc +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v9, v3, vcc ; GFX9-NEXT: 
v_lshrrev_b64 v[2:3], 1, v[6:7] +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v4 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 31, v1 -; GFX9-NEXT: v_sub_u32_e32 v6, 64, v15 -; GFX9-NEXT: v_subrev_u32_e32 v14, 64, v15 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v6, 64, v14 +; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v15, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v11, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX9-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX9-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6129,15 +6203,15 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX10-NEXT: v_not_b32_e32 v8, v8 +; GFX10-NEXT: v_not_b32_e32 v10, v8 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_lshrrev_b64 v[12:13], 1, v[6:7] -; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v18 +; GFX10-NEXT: v_and_b32_e32 v19, 
0x7f, v10 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 31, v5 ; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v11, v[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] @@ -6175,43 +6249,43 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX11-LABEL: v_fshl_i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX11-NEXT: v_not_b32_e32 v8, v8 +; GFX11-NEXT: v_not_b32_e32 v10, v8 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_lshrrev_b64 v[12:13], 1, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 -; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v11, 64, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v10 ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] +; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 ; GFX11-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v11, v[0:1] ; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] ; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 ; GFX11-NEXT: v_or_b32_e32 v10, v10, v8 ; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 ; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[12:13] 
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[12:13] -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo ; GFX11-NEXT: v_or_b32_e32 v14, v14, v16 ; GFX11-NEXT: v_or_b32_e32 v15, v15, v17 -; GFX11-NEXT: v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11 ; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[12:13] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 +; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v6 :: v_dual_cndmask_b32 v7, 0, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v15, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v5, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s0 @@ -6229,173 +6303,173 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v6 -; GFX6-NEXT: v_lshr_b64 
v[0:1], s[0:1], v0 -; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v6 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v6 -; GFX6-NEXT: v_lshl_b64 v[4:5], s[0:1], v6 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 +; GFX6-NEXT: v_lshr_b64 v[1:2], s[0:1], v1 +; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7 +; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7 +; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v7 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v8 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v8 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: v_mov_b32_e32 v4, s3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX6-NEXT: s_lshl_b32 s9, s6, 31 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v7 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v7 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v10 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v10 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v2 -; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v7 +; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v10 
; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v11 -; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v7 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX6-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v6 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v6 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, s[0:1] -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 +; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, s[0:1] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX8-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX8-NEXT: v_not_b32_e32 
v0, v0 ; GFX8-NEXT: s_mov_b32 s8, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX8-NEXT: s_lshl_b32 s9, s6, 31 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v7 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v10 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v7 +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v10 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: 
v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v6 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v6 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, s[0:1] -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 +; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] +; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, s[0:1] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX9-NEXT: s_lshl_b32 s9, s6, 31 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v2, 
vcc +; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v7 +; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v10 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX9-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i128_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0 -; GFX10-NEXT: v_not_b32_e32 v0, v0 +; GFX10-NEXT: v_not_b32_e32 v2, v0 ; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX10-NEXT: s_lshl_b32 s9, s6, 31 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; 
GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v12 +; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] ; GFX10-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v3, s[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] @@ -6434,58 +6508,52 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX11-LABEL: v_fshl_i128_ssv: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0 -; GFX11-NEXT: v_not_b32_e32 v0, v0 +; GFX11-NEXT: v_not_b32_e32 v2, v0 ; GFX11-NEXT: s_mov_b32 s8, 0 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX11-NEXT: s_lshl_b32 s9, s6, 31 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v2 ; GFX11-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] -; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v4, 0, v4 :: v_dual_cndmask_b32 v5, 0, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v12 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v13 +; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], 
v13, s[8:9] +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v3, s[0:1] +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v1 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo -; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v7, v7, v9 -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v8, s2, s4 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v10, s3, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s9, s1 -; GFX11-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v6, v2 ; GFX11-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -6495,43 +6563,43 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshl_i128_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX6-NEXT: s_sub_i32 s5, s6, 64 -; GFX6-NEXT: s_sub_i32 s7, 64, s6 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 +; GFX6-NEXT: s_and_b32 s5, s4, 0x7f +; GFX6-NEXT: s_sub_i32 s12, s5, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s5 +; GFX6-NEXT: s_cmp_lt_u32 s5, 64 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s7 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX6-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s5, 0 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX6-NEXT: 
s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: s_sub_i32 s0, s4, 64 -; GFX6-NEXT: s_sub_i32 s1, 64, s4 +; GFX6-NEXT: s_sub_i32 s1, s0, 64 +; GFX6-NEXT: s_sub_i32 s4, 64, s0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX6-NEXT: s_cmp_lt_u32 s4, 64 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: s_cselect_b32 s5, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, 0 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s4 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1 -; GFX6-NEXT: s_cselect_b32 s6, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s4 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1 ; GFX6-NEXT: s_and_b32 s0, 1, s5 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s6 +; GFX6-NEXT: s_and_b32 s0, 1, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6539,51 +6607,51 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX6-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX6-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX6-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i128_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: 
s_and_b64 s[6:7], s[4:5], 0x7f -; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX8-NEXT: s_sub_i32 s5, s6, 64 -; GFX8-NEXT: s_sub_i32 s7, 64, s6 -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 +; GFX8-NEXT: s_and_b32 s5, s4, 0x7f +; GFX8-NEXT: s_sub_i32 s12, s5, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s5 +; GFX8-NEXT: s_cmp_lt_u32 s5, 64 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s7 -; GFX8-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX8-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s5, 0 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s5, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: s_sub_i32 s0, s4, 64 -; GFX8-NEXT: s_sub_i32 s1, 64, s4 +; GFX8-NEXT: s_sub_i32 s1, s0, 64 +; GFX8-NEXT: s_sub_i32 s4, 64, s0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: s_cmp_lt_u32 s4, 64 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: s_cselect_b32 s5, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], 
s4, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] ; GFX8-NEXT: s_and_b32 s0, 1, s5 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s6 +; GFX8-NEXT: s_and_b32 s0, 1, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6591,50 +6659,50 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX8-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX8-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX8-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i128_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX9-NEXT: s_sub_i32 s5, s6, 64 -; GFX9-NEXT: s_sub_i32 s7, 64, s6 -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: s_and_b32 s5, s4, 0x7f +; GFX9-NEXT: s_sub_i32 s12, s5, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s5 +; GFX9-NEXT: s_cmp_lt_u32 s5, 64 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s7 -; GFX9-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX9-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; 
GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 31, v1 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: s_sub_i32 s0, s4, 64 -; GFX9-NEXT: s_sub_i32 s1, 64, s4 -; GFX9-NEXT: s_cmp_lt_u32 s4, 64 +; GFX9-NEXT: s_sub_i32 s1, s0, 64 +; GFX9-NEXT: s_sub_i32 s4, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: s_cselect_b32 s5, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] ; GFX9-NEXT: s_and_b32 s0, 1, s5 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_and_b32 s0, 1, s6 +; GFX9-NEXT: s_and_b32 s0, 1, s8 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6642,50 +6710,50 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; 
GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX9-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i128_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX10-NEXT: s_sub_i32 s5, s6, 64 -; GFX10-NEXT: s_sub_i32 s7, 64, s6 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_and_b32 s5, s4, 0x7f ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_sub_i32 s12, s5, 64 +; GFX10-NEXT: s_sub_i32 s6, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s7 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 -; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 -; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 31, v1 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s4 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s4 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; 
GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s0, 64, s4 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX10-NEXT: s_sub_i32 s0, s4, 64 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_andn2_b32 s0, 0x7f, s4 +; GFX10-NEXT: s_sub_i32 s1, 64, s0 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX10-NEXT: s_sub_i32 s1, s0, 64 +; GFX10-NEXT: s_cmp_lt_u32 s0, 64 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3] +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: s_and_b32 s0, 1, s1 +; GFX10-NEXT: s_and_b32 s1, 1, s4 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] ; GFX10-NEXT: s_and_b32 s0, 1, s5 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo @@ -6695,62 +6763,62 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX10-NEXT: v_or_b32_e32 v0, s6, v0 -; GFX10-NEXT: v_or_b32_e32 v1, s7, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshl_i128_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] -; GFX11-NEXT: s_sub_i32 s5, s6, 64 -; GFX11-NEXT: s_sub_i32 s7, 64, s6 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_and_b32 s5, s4, 0x7f ; GFX11-NEXT: v_lshrrev_b64 v[0:1], 1, 
v[0:1] -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11-NEXT: s_sub_i32 s12, s5, 64 +; GFX11-NEXT: s_sub_i32 s6, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: s_cselect_b32 s13, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[0:1], s7 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 -; GFX11-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 -; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 31, v1 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s4 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s4 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX11-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s5, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s0, 64, s4 +; GFX11-NEXT: s_and_not1_b32 s0, 0x7f, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX11-NEXT: s_sub_i32 s0, s4, 64 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_sub_i32 s1, 64, s0 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX11-NEXT: s_sub_i32 s1, s0, 64 +; GFX11-NEXT: s_cmp_lt_u32 s0, 64 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3] +; GFX11-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; 
GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-NEXT: s_and_b32 s0, 1, s1 +; GFX11-NEXT: s_and_b32 s1, 1, s4 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] ; GFX11-NEXT: s_and_b32 s0, 1, s5 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX11-NEXT: v_or_b32_e32 v0, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v1, s7, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX11-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -6760,25 +6828,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshl_i128_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_and_b64 s[6:7], s[4:5], 0x7f -; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX6-NEXT: s_sub_i32 s5, s6, 64 -; GFX6-NEXT: s_sub_i32 s7, 64, s6 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 +; GFX6-NEXT: s_and_b32 s5, s4, 0x7f +; GFX6-NEXT: s_sub_i32 s7, s5, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s5 +; GFX6-NEXT: s_cmp_lt_u32 s5, 64 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 -; GFX6-NEXT: s_mov_b32 s8, 0 +; GFX6-NEXT: s_cmp_eq_u32 s5, 0 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s10, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s7 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s6 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s5 -; GFX6-NEXT: s_and_b32 s5, 1, s9 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s8 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s5 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s7 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: s_lshl_b32 s9, s2, 31 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s6 +; GFX6-NEXT: s_lshl_b32 s7, s2, 31 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s5 +; GFX6-NEXT: s_and_b32 s5, 1, s9 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: s_not_b32 s6, s4 +; GFX6-NEXT: s_andn2_b32 s4, 0x7f, s4 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX6-NEXT: s_and_b32 s5, 1, s10 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_sub_i32 s10, s4, 64 ; GFX6-NEXT: s_sub_i32 s8, 64, s4 @@ -6793,19 +6862,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; 
GFX6-NEXT: s_cmp_lg_u32 s11, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6814,25 +6883,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX8-LABEL: v_fshl_i128_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX8-NEXT: s_sub_i32 s5, s6, 64 -; GFX8-NEXT: s_sub_i32 s7, 64, s6 -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 +; GFX8-NEXT: s_and_b32 s5, s4, 0x7f +; GFX8-NEXT: s_sub_i32 s7, s5, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s5 +; GFX8-NEXT: s_cmp_lt_u32 s5, 64 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 -; GFX8-NEXT: s_mov_b32 s8, 0 +; GFX8-NEXT: s_cmp_eq_u32 s5, 0 +; GFX8-NEXT: s_mov_b32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s10, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX8-NEXT: s_and_b32 s5, 1, s9 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s7, v[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: s_lshl_b32 s9, s2, 31 -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX8-NEXT: s_lshl_b32 s7, s2, 31 +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] +; GFX8-NEXT: s_and_b32 s5, 1, s9 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_not_b32 s6, s4 +; GFX8-NEXT: s_andn2_b32 s4, 0x7f, s4 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX8-NEXT: s_and_b32 s5, 1, s10 -; GFX8-NEXT: 
s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_sub_i32 s10, s4, 64 ; GFX8-NEXT: s_sub_i32 s8, 64, s4 @@ -6847,19 +6917,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6868,25 +6938,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX9-LABEL: v_fshl_i128_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX9-NEXT: s_sub_i32 s5, s6, 64 -; GFX9-NEXT: s_sub_i32 s7, 64, s6 -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 +; GFX9-NEXT: s_and_b32 s5, s4, 0x7f +; GFX9-NEXT: s_sub_i32 s7, s5, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s5 +; GFX9-NEXT: s_cmp_lt_u32 s5, 64 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 -; GFX9-NEXT: s_mov_b32 s8, 0 +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 +; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: s_cselect_b32 s10, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 
v[4:5], s7, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX9-NEXT: s_and_b32 s5, 1, s9 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s7, v[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: s_lshl_b32 s9, s2, 31 -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX9-NEXT: s_lshl_b32 s7, s2, 31 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] +; GFX9-NEXT: s_and_b32 s5, 1, s9 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_not_b32 s6, s4 +; GFX9-NEXT: s_andn2_b32 s4, 0x7f, s4 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX9-NEXT: s_and_b32 s5, 1, s10 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_sub_i32 s10, s4, 64 ; GFX9-NEXT: s_sub_i32 s8, 64, s4 @@ -6901,19 +6972,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX9-NEXT: v_or_b32_e32 
v2, s2, v2 @@ -6922,53 +6993,54 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX10-LABEL: v_fshl_i128_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX10-NEXT: s_sub_i32 s5, s6, 64 -; GFX10-NEXT: s_sub_i32 s7, 64, s6 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_and_b32 s5, s4, 0x7f +; GFX10-NEXT: s_sub_i32 s6, s5, 64 +; GFX10-NEXT: s_sub_i32 s7, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; GFX10-NEXT: s_and_b32 s6, 1, s8 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_lshl_b32 s7, s2, 31 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX10-NEXT: s_and_b32 s5, 1, s9 +; GFX10-NEXT: s_and_b32 s5, 1, s8 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_sub_i32 s10, s4, 64 -; GFX10-NEXT: s_sub_i32 s8, 64, s4 +; GFX10-NEXT: s_andn2_b32 s6, 0x7f, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: s_and_b32 s5, 1, s9 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_not_b32 s8, s4 +; GFX10-NEXT: s_sub_i32 s10, s6, 64 +; GFX10-NEXT: s_sub_i32 s7, 64, s6 +; GFX10-NEXT: s_cmp_lt_u32 s6, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; 
GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 +; GFX10-NEXT: s_lshl_b64 s[6:7], s[2:3], s7 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo -; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 @@ -6976,50 +7048,52 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX11-LABEL: v_fshl_i128_vss: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] -; GFX11-NEXT: s_sub_i32 s5, s6, 64 -; GFX11-NEXT: s_sub_i32 s7, 64, s6 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_and_b32 s5, s4, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s6, s5, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX11-NEXT: s_cselect_b32 s8, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 -; 
GFX11-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX11-NEXT: s_cselect_b32 s9, 1, 0 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; GFX11-NEXT: s_and_b32 s6, 1, s8 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] ; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX11-NEXT: s_lshl_b32 s7, s2, 31 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX11-NEXT: s_and_b32 s5, 1, s9 +; GFX11-NEXT: s_and_b32 s5, 1, s8 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_sub_i32 s10, s4, 64 -; GFX11-NEXT: s_sub_i32 s8, 64, s4 +; GFX11-NEXT: s_and_not1_b32 s6, 0x7f, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: s_and_b32 s5, 1, s9 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_not_b32 s8, s4 +; GFX11-NEXT: s_sub_i32 s10, s6, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s6 +; GFX11-NEXT: s_cmp_lt_u32 s6, 64 ; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 ; GFX11-NEXT: s_cselect_b32 s11, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_cselect_b32 s12, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 +; GFX11-NEXT: s_lshl_b64 s[6:7], s[2:3], s7 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; 
GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 ; GFX11-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7152,40 +7226,41 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) { ; GFX6-LABEL: s_fshl_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX6-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX6-NEXT: s_sub_i32 s17, s18, 64 -; GFX6-NEXT: s_sub_i32 s19, 64, s18 -; GFX6-NEXT: s_cmp_lt_u32 s18, 64 -; GFX6-NEXT: s_cselect_b32 s23, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, 0 +; GFX6-NEXT: s_and_b32 s17, s16, 0x7f +; GFX6-NEXT: s_sub_i32 s19, s17, 64 +; GFX6-NEXT: s_sub_i32 s21, 64, s17 +; GFX6-NEXT: s_cmp_lt_u32 s17, 64 ; GFX6-NEXT: s_cselect_b32 s28, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 -; GFX6-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 -; GFX6-NEXT: s_lshl_b64 s[18:19], s[2:3], s18 -; GFX6-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX6-NEXT: s_cmp_lg_u32 s23, 0 -; GFX6-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s17, 0 +; GFX6-NEXT: s_cselect_b32 s17, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[24:25], s[0:1], s21 +; GFX6-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 +; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 +; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX6-NEXT: s_lshl_b64 s[0:1], 
s[0:1], s19 ; GFX6-NEXT: s_cmp_lg_u32 s28, 0 -; GFX6-NEXT: s_mov_b32 s22, 0 +; GFX6-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_mov_b32 s18, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX6-NEXT: s_lshl_b32 s23, s10, 31 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX6-NEXT: s_lshl_b32 s19, s10, 31 ; GFX6-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX6-NEXT: s_sub_i32 s23, s16, 64 -; GFX6-NEXT: s_sub_i32 s18, 64, s16 -; GFX6-NEXT: s_cmp_lt_u32 s16, 64 +; GFX6-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX6-NEXT: s_not_b32 s17, s16 +; GFX6-NEXT: s_sub_i32 s19, s10, 64 +; GFX6-NEXT: s_sub_i32 s21, 64, s10 +; GFX6-NEXT: s_cmp_lt_u32 s10, 64 ; GFX6-NEXT: s_cselect_b32 s26, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, 0 +; GFX6-NEXT: s_cmp_eq_u32 s10, 0 ; GFX6-NEXT: s_cselect_b32 s27, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX6-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 -; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s17 +; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s17 +; GFX6-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 +; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s19 ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 @@ -7193,86 +7268,88 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX6-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] -; GFX6-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX6-NEXT: s_sub_i32 s11, s8, 64 -; 
GFX6-NEXT: s_sub_i32 s9, 64, s8 +; GFX6-NEXT: s_and_b32 s8, s20, 0x7f +; GFX6-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX6-NEXT: s_sub_i32 s19, s8, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: s_cselect_b32 s21, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX6-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX6-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] -; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cselect_b32 s22, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[6:7], s20 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX6-NEXT: s_cmp_lg_u32 s21, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX6-NEXT: s_cmp_lg_u32 s22, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX6-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX6-NEXT: s_lshl_b32 s23, s14, 31 -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX6-NEXT: s_sub_i32 s18, s10, 64 -; GFX6-NEXT: s_sub_i32 s14, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 +; GFX6-NEXT: s_lshl_b32 s19, s14, 31 +; GFX6-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX6-NEXT: s_not_b32 s14, s20 +; GFX6-NEXT: s_sub_i32 s18, s12, 64 +; GFX6-NEXT: s_sub_i32 s16, 64, s12 +; GFX6-NEXT: s_cmp_lt_u32 s12, 64 ; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s12, 0 ; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[12:13], 
s[8:9], s10 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 +; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s14 +; GFX6-NEXT: s_lshr_b64 s[14:15], s[4:5], s14 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 +; GFX6-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 -; GFX6-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] -; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 +; GFX6-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX8-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX8-NEXT: s_sub_i32 s17, s18, 64 -; GFX8-NEXT: s_sub_i32 s19, 64, s18 -; GFX8-NEXT: s_cmp_lt_u32 s18, 64 -; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s18, 0 +; GFX8-NEXT: s_and_b32 s17, s16, 0x7f +; GFX8-NEXT: s_sub_i32 s19, s17, 64 +; GFX8-NEXT: s_sub_i32 s21, 64, s17 +; GFX8-NEXT: s_cmp_lt_u32 s17, 64 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 -; GFX8-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 -; GFX8-NEXT: s_lshl_b64 s[18:19], s[2:3], s18 -; GFX8-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX8-NEXT: s_cmp_lg_u32 s23, 0 -; GFX8-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX8-NEXT: 
s_cmp_eq_u32 s17, 0 +; GFX8-NEXT: s_cselect_b32 s17, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[24:25], s[0:1], s21 +; GFX8-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 +; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 +; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX8-NEXT: s_cmp_lg_u32 s28, 0 -; GFX8-NEXT: s_mov_b32 s22, 0 +; GFX8-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s17, 0 +; GFX8-NEXT: s_mov_b32 s18, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX8-NEXT: s_lshl_b32 s23, s10, 31 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX8-NEXT: s_lshl_b32 s19, s10, 31 ; GFX8-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX8-NEXT: s_sub_i32 s23, s16, 64 -; GFX8-NEXT: s_sub_i32 s18, 64, s16 -; GFX8-NEXT: s_cmp_lt_u32 s16, 64 +; GFX8-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX8-NEXT: s_not_b32 s17, s16 +; GFX8-NEXT: s_sub_i32 s19, s10, 64 +; GFX8-NEXT: s_sub_i32 s21, 64, s10 +; GFX8-NEXT: s_cmp_lt_u32 s10, 64 ; GFX8-NEXT: s_cselect_b32 s26, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s16, 0 +; GFX8-NEXT: s_cmp_eq_u32 s10, 0 ; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX8-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 -; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s17 +; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s17 +; GFX8-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 +; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s19 ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 @@ -7280,86 +7357,88 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX8-NEXT: s_cmp_lg_u32 s26, 
0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX8-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] -; GFX8-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX8-NEXT: s_sub_i32 s11, s8, 64 -; GFX8-NEXT: s_sub_i32 s9, 64, s8 +; GFX8-NEXT: s_and_b32 s8, s20, 0x7f +; GFX8-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX8-NEXT: s_sub_i32 s19, s8, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX8-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX8-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX8-NEXT: s_cmp_lg_u32 s20, 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cselect_b32 s22, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[6:7], s20 +; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX8-NEXT: s_cmp_lg_u32 s22, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX8-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX8-NEXT: s_lshl_b32 s23, s14, 31 -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX8-NEXT: s_sub_i32 s18, s10, 64 -; GFX8-NEXT: s_sub_i32 s14, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 s10, 64 +; GFX8-NEXT: s_lshl_b32 s19, s14, 31 +; GFX8-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX8-NEXT: s_not_b32 s14, s20 +; 
GFX8-NEXT: s_sub_i32 s18, s12, 64 +; GFX8-NEXT: s_sub_i32 s16, 64, s12 +; GFX8-NEXT: s_cmp_lt_u32 s12, 64 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s12, 0 ; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 +; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s14 +; GFX8-NEXT: s_lshr_b64 s[14:15], s[4:5], s14 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 +; GFX8-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s20, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 -; GFX8-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] -; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 +; GFX8-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX9-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX9-NEXT: s_sub_i32 s17, s18, 64 -; GFX9-NEXT: s_sub_i32 s19, 64, s18 -; GFX9-NEXT: s_cmp_lt_u32 s18, 64 -; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s18, 0 +; GFX9-NEXT: s_and_b32 s17, s16, 0x7f +; GFX9-NEXT: s_sub_i32 s19, s17, 64 +; GFX9-NEXT: s_sub_i32 s21, 64, s17 +; GFX9-NEXT: s_cmp_lt_u32 s17, 64 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 -; GFX9-NEXT: 
s_lshl_b64 s[18:19], s[2:3], s18 -; GFX9-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX9-NEXT: s_cmp_lg_u32 s23, 0 -; GFX9-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s17, 0 +; GFX9-NEXT: s_cselect_b32 s17, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[0:1], s21 +; GFX9-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 +; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 +; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0 -; GFX9-NEXT: s_mov_b32 s22, 0 +; GFX9-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_mov_b32 s18, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX9-NEXT: s_lshl_b32 s23, s10, 31 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX9-NEXT: s_lshl_b32 s19, s10, 31 ; GFX9-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX9-NEXT: s_sub_i32 s23, s16, 64 -; GFX9-NEXT: s_sub_i32 s18, 64, s16 -; GFX9-NEXT: s_cmp_lt_u32 s16, 64 +; GFX9-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX9-NEXT: s_not_b32 s17, s16 +; GFX9-NEXT: s_sub_i32 s19, s10, 64 +; GFX9-NEXT: s_sub_i32 s21, 64, s10 +; GFX9-NEXT: s_cmp_lt_u32 s10, 64 ; GFX9-NEXT: s_cselect_b32 s26, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s16, 0 +; GFX9-NEXT: s_cmp_eq_u32 s10, 0 ; GFX9-NEXT: s_cselect_b32 s27, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX9-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 -; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s17 +; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s17 +; GFX9-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 +; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX9-NEXT: 
s_lshr_b64 s[8:9], s[8:9], s19 ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 @@ -7367,222 +7446,227 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX9-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] -; GFX9-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX9-NEXT: s_sub_i32 s11, s8, 64 -; GFX9-NEXT: s_sub_i32 s9, 64, s8 +; GFX9-NEXT: s_and_b32 s8, s20, 0x7f +; GFX9-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX9-NEXT: s_sub_i32 s19, s8, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX9-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX9-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s22, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[6:7], s20 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX9-NEXT: s_lshl_b32 s23, s14, 31 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX9-NEXT: s_sub_i32 s18, s10, 64 -; 
GFX9-NEXT: s_sub_i32 s14, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 +; GFX9-NEXT: s_lshl_b32 s19, s14, 31 +; GFX9-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX9-NEXT: s_not_b32 s14, s20 +; GFX9-NEXT: s_sub_i32 s18, s12, 64 +; GFX9-NEXT: s_sub_i32 s16, 64, s12 +; GFX9-NEXT: s_cmp_lt_u32 s12, 64 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s12, 0 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s14 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[4:5], s14 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 +; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 -; GFX9-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] -; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 +; GFX9-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX10-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX10-NEXT: s_sub_i32 s17, s18, 64 -; GFX10-NEXT: s_sub_i32 s19, 64, s18 -; GFX10-NEXT: s_cmp_lt_u32 s18, 64 -; GFX10-NEXT: s_mov_b32 s22, 0 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; 
GFX10-NEXT: s_cmp_eq_u32 s18, 0 +; GFX10-NEXT: s_and_b32 s17, s16, 0x7f +; GFX10-NEXT: s_mov_b32 s18, 0 +; GFX10-NEXT: s_sub_i32 s19, s17, 64 +; GFX10-NEXT: s_sub_i32 s21, 64, s17 +; GFX10-NEXT: s_cmp_lt_u32 s17, 64 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s19 -; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s18 -; GFX10-NEXT: s_lshl_b64 s[18:19], s[0:1], s18 -; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX10-NEXT: s_cmp_lg_u32 s23, 0 -; GFX10-NEXT: s_cselect_b64 s[18:19], s[18:19], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s17, 0 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s21 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s16 +; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s16 +; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX10-NEXT: s_lshl_b32 s23, s10, 31 +; GFX10-NEXT: s_lshl_b32 s19, s10, 31 ; GFX10-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] -; GFX10-NEXT: s_sub_i32 s23, s16, 64 -; GFX10-NEXT: s_sub_i32 s17, 64, s16 -; GFX10-NEXT: s_cmp_lt_u32 s16, 64 +; GFX10-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX10-NEXT: s_not_b32 s19, s16 +; GFX10-NEXT: s_sub_i32 s21, s10, 64 +; GFX10-NEXT: s_sub_i32 s16, 64, s10 +; GFX10-NEXT: s_cmp_lt_u32 s10, 64 ; GFX10-NEXT: s_cselect_b32 s26, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s16, 0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 0 ; GFX10-NEXT: s_cselect_b32 s27, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 -; GFX10-NEXT: 
s_lshr_b64 s[16:17], s[8:9], s16 -; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] -; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s19 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[8:9], s16 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[8:9], s19 +; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s21 ; GFX10-NEXT: s_cmp_lg_u32 s26, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s26, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 -; GFX10-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 +; GFX10-NEXT: s_and_b32 s10, s20, 0x7f +; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX10-NEXT: s_or_b64 s[0:1], s[18:19], s[0:1] -; GFX10-NEXT: s_sub_i32 s11, s8, 64 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_sub_i32 s19, s10, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s10 +; GFX10-NEXT: s_cmp_lt_u32 s10, 64 ; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[16:17], s[4:5], s9 -; GFX10-NEXT: s_lshl_b64 s[18:19], s[6:7], s8 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] +; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s20 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[4:5], s20 +; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX10-NEXT: 
s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s22, 0 ; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX10-NEXT: s_lshl_b32 s23, s14, 31 -; GFX10-NEXT: s_lshr_b64 s[12:13], s[14:15], 1 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX10-NEXT: s_sub_i32 s18, s10, 64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 +; GFX10-NEXT: s_lshl_b32 s19, s14, 31 +; GFX10-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX10-NEXT: s_not_b32 s16, s20 +; GFX10-NEXT: s_sub_i32 s18, s12, 64 +; GFX10-NEXT: s_sub_i32 s14, 64, s12 +; GFX10-NEXT: s_cmp_lt_u32 s12, 64 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[12:13], s11 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[12:13], s10 -; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX10-NEXT: s_lshr_b64 s[12:13], s[12:13], s18 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[4:5], s16 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13] +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX10-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX10-NEXT: 
s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_v2i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[16:17], 0x7f, s[16:17] -; GFX11-NEXT: s_sub_i32 s17, s18, 64 -; GFX11-NEXT: s_sub_i32 s19, 64, s18 -; GFX11-NEXT: s_cmp_lt_u32 s18, 64 -; GFX11-NEXT: s_mov_b32 s22, 0 -; GFX11-NEXT: s_cselect_b32 s23, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s18, 0 +; GFX11-NEXT: s_and_b32 s17, s16, 0x7f +; GFX11-NEXT: s_mov_b32 s18, 0 +; GFX11-NEXT: s_sub_i32 s19, s17, 64 +; GFX11-NEXT: s_sub_i32 s21, 64, s17 +; GFX11-NEXT: s_cmp_lt_u32 s17, 64 ; GFX11-NEXT: s_cselect_b32 s28, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s19 -; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s18 -; GFX11-NEXT: s_lshl_b64 s[18:19], s[0:1], s18 -; GFX11-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX11-NEXT: s_cmp_lg_u32 s23, 0 -; GFX11-NEXT: s_cselect_b64 s[18:19], s[18:19], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s17, 0 +; GFX11-NEXT: s_cselect_b32 s17, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s21 +; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s16 +; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s16 +; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX11-NEXT: s_lshl_b32 s23, s10, 31 +; GFX11-NEXT: s_lshl_b32 s19, s10, 31 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] -; GFX11-NEXT: s_sub_i32 s23, s16, 64 -; GFX11-NEXT: s_sub_i32 s17, 64, s16 -; GFX11-NEXT: s_cmp_lt_u32 s16, 64 +; GFX11-NEXT: 
s_and_not1_b32 s10, 0x7f, s16 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX11-NEXT: s_not_b32 s19, s16 +; GFX11-NEXT: s_sub_i32 s21, s10, 64 +; GFX11-NEXT: s_sub_i32 s16, 64, s10 +; GFX11-NEXT: s_cmp_lt_u32 s10, 64 ; GFX11-NEXT: s_cselect_b32 s26, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s16, 0 +; GFX11-NEXT: s_cmp_eq_u32 s10, 0 ; GFX11-NEXT: s_cselect_b32 s27, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 -; GFX11-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 -; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] -; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s19 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[8:9], s16 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[8:9], s19 +; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s21 ; GFX11-NEXT: s_cmp_lg_u32 s26, 0 ; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s27, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s26, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 -; GFX11-NEXT: s_and_not1_b64 s[10:11], 0x7f, s[20:21] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 +; GFX11-NEXT: s_and_b32 s10, s20, 0x7f +; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX11-NEXT: s_or_b64 s[0:1], s[18:19], s[0:1] -; GFX11-NEXT: s_sub_i32 s11, s8, 64 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_sub_i32 s19, s10, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s10 +; GFX11-NEXT: s_cmp_lt_u32 s10, 64 ; GFX11-NEXT: s_cselect_b32 s21, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s9 -; GFX11-NEXT: s_lshl_b64 s[18:19], s[6:7], s8 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX11-NEXT: s_lshl_b64 
s[4:5], s[4:5], s11 -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] +; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_cselect_b32 s22, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s20 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[4:5], s20 +; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 ; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX11-NEXT: s_lshl_b32 s23, s14, 31 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[14:15], 1 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX11-NEXT: s_sub_i32 s18, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 +; GFX11-NEXT: s_lshl_b32 s19, s14, 31 +; GFX11-NEXT: s_and_not1_b32 s12, 0x7f, s20 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX11-NEXT: s_not_b32 s16, s20 +; GFX11-NEXT: s_sub_i32 s18, s12, 64 +; GFX11-NEXT: s_sub_i32 s14, 64, s12 +; GFX11-NEXT: s_cmp_lt_u32 s12, 64 ; GFX11-NEXT: s_cselect_b32 s19, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_cmp_eq_u32 s12, 0 ; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[12:13], s11 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[12:13], s10 -; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX11-NEXT: s_lshr_b64 s[12:13], s[12:13], s18 +; GFX11-NEXT: s_lshr_b64 s[12:13], s[4:5], s16 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: 
s_cselect_b64 s[12:13], s[14:15], s[12:13] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13] +; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX11-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) ret <2 x i128> %result @@ -7592,56 +7676,54 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-LABEL: v_fshl_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX6-NEXT: v_not_b32_e32 v16, v16 -; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v23 -; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v23 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[0:1], v16 -; GFX6-NEXT: v_lshl_b64 v[18:19], v[2:3], v23 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], v23 +; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v19 +; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v19 +; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 +; GFX6-NEXT: v_lshl_b64 v[23:24], v[0:1], v19 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v25 -; GFX6-NEXT: v_or_b32_e32 v16, v16, v18 -; GFX6-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; GFX6-NEXT: 
v_cndmask_b32_e32 v21, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc +; GFX6-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX6-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc ; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX6-NEXT: v_not_b32_e32 v8, v16 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v24 -; GFX6-NEXT: v_subrev_i32_e32 v23, vcc, 64, v24 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v24 +; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v8 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v23 +; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v23 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v23 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v10 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v24 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v23 +; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v23 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v24 ; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v0, v18, v0 -; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX6-NEXT: v_not_b32_e32 v8, v20 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, 
v9, vcc +; GFX6-NEXT: v_or_b32_e32 v2, v18, v2 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX6-NEXT: v_or_b32_e32 v1, v19, v1 -; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v18 -; GFX6-NEXT: v_subrev_i32_e32 v20, vcc, 64, v18 +; GFX6-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 ; GFX6-NEXT: v_lshl_b64 v[16:17], v[4:5], v18 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v20 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v19 ; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 @@ -7651,88 +7733,88 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; GFX6-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc ; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14 +; GFX6-NEXT: v_not_b32_e32 v8, v20 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v19 -; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, 64, v19 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v19 +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v14 +; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v14 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10 -; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v19 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v14 +; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v14 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v15 ; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 
vcc, 64, v19 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX6-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v22, v1 ; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX6-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX6-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX8-NEXT: v_not_b32_e32 v16, v16 -; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v23 -; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v23 -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v16, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[18:19], v23, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v19 +; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v19 +; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] -; GFX8-NEXT: v_or_b32_e32 v16, v16, v18 -; GFX8-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; 
GFX8-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc +; GFX8-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX8-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX8-NEXT: v_not_b32_e32 v8, v16 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v24 -; GFX8-NEXT: v_subrev_u32_e32 v23, vcc, 64, v24 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v8 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v23 +; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v23 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v24, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v23, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v0, v18, v0 -; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX8-NEXT: v_not_b32_e32 v8, v20 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; 
GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX8-NEXT: v_or_b32_e32 v2, v18, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX8-NEXT: v_or_b32_e32 v1, v19, v1 -; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v18 -; GFX8-NEXT: v_subrev_u32_e32 v20, vcc, 64, v18 +; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] ; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v20, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5] ; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 @@ -7742,87 +7824,87 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14 +; GFX8-NEXT: v_not_b32_e32 v8, v20 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v19 -; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, 64, v19 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v19, v[4:5] +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v14 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[12:13], v19, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v8, 
v8, v10 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX8-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v22, v1 ; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX8-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX8-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX8-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX9-NEXT: v_not_b32_e32 v16, v16 -; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX9-NEXT: v_sub_u32_e32 v16, 64, v23 -; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v23 -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v16, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[18:19], v23, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX9-NEXT: v_sub_u32_e32 v17, 64, v19 +; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v19 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v16, v16, v18 -; GFX9-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v16, v1, v17, 
vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc +; GFX9-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX9-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, v1, v18, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v22, v16, v3, vcc +; GFX9-NEXT: v_not_b32_e32 v8, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v17, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] +; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v8 ; GFX9-NEXT: v_lshl_or_b32 v1, v10, 31, v1 -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v24 -; GFX9-NEXT: v_subrev_u32_e32 v23, 64, v24 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v23 +; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v23 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v24, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v23, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_or_b32_e32 v2, v18, 
v2 ; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX9-NEXT: v_not_b32_e32 v8, v20 -; GFX9-NEXT: v_or_b32_e32 v1, v19, v1 -; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v18 -; GFX9-NEXT: v_subrev_u32_e32 v20, 64, v18 +; GFX9-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v20, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5] ; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 @@ -7833,89 +7915,91 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v20, v8, v7, vcc -; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v19, v8, v7, vcc +; GFX9-NEXT: v_not_b32_e32 v8, v20 ; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5 -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v19 -; GFX9-NEXT: v_subrev_u32_e32 v14, 64, v19 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v19, v[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v14 +; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], v19, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 ; 
GFX9-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX9-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX9-NEXT: v_or_b32_e32 v1, v22, v1 ; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX9-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX10-NEXT: v_not_b32_e32 v16, v16 +; GFX10-NEXT: v_not_b32_e32 v21, v16 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v27 -; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v16 +; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v21 ; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] ; GFX10-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] -; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 -; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] +; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 ; GFX10-NEXT: v_or_b32_e32 v18, v16, v18 ; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 ; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v29, 
v[0:1] ; GFX10-NEXT: v_or_b32_e32 v19, v17, v19 -; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v28 -; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v23, v23, v25 ; GFX10-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v0, v24, v26 -; GFX10-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v23, s4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 +; GFX10-NEXT: v_or_b32_e32 v24, v24, v26 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v28 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v27 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v28 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s4 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v23, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v17, v24, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v23, v19, v3, s4 ; GFX10-NEXT: v_and_b32_e32 v24, 0x7f, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v23, v19, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v16, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v17, v9, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v9, s5 +; GFX10-NEXT: v_not_b32_e32 v16, v20 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX10-NEXT: v_not_b32_e32 v3, v20 ; GFX10-NEXT: v_or_b32_e32 v1, v22, v8 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] -; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v24 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v24 +; GFX10-NEXT: v_and_b32_e32 v22, 0x7f, v16 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 -; 
GFX10-NEXT: v_and_b32_e32 v22, 0x7f, v3 ; GFX10-NEXT: v_lshlrev_b64 v[12:13], v24, v[6:7] -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v11, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] ; GFX10-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] ; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v22 -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 ; GFX10-NEXT: v_or_b32_e32 v12, v10, v12 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v22 @@ -7953,88 +8037,87 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX11-NEXT: v_not_b32_e32 v16, v16 +; GFX11-NEXT: v_not_b32_e32 v21, v16 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 -; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v21 +; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v21, 0, v21 :: v_dual_cndmask_b32 v22, 0, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v17, 64, v27 ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] -; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX11-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 -; GFX11-NEXT: v_lshrrev_b64 v[23:24], 
v28, v[8:9] +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v28 -; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 ; GFX11-NEXT: v_or_b32_e32 v18, v16, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v19, v1, v19 :: v_dual_cndmask_b32 v18, v0, v18 +; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 -; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 -; GFX11-NEXT: v_or_b32_e32 v23, v23, v25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v28 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] +; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v0, v24, v26 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v28 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v23, v23, v25 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v26 +; GFX11-NEXT: v_dual_cndmask_b32 v25, 0, v1 :: v_dual_cndmask_b32 v16, v16, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 
v10, v17, v24, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v23, v19, v3, s0 ; GFX11-NEXT: v_and_b32_e32 v24, 0x7f, v20 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v23, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s0 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v23, v19, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v16, v8, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v17, v9, s1 -; GFX11-NEXT: v_sub_nc_u32_e32 v11, 64, v24 -; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v8, v10, v9, s1 +; GFX11-NEXT: v_not_b32_e32 v16, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc_lo +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 ; GFX11-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-NEXT: v_not_b32_e32 v3, v20 ; GFX11-NEXT: v_or_b32_e32 v1, v22, v8 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_and_b32_e32 v22, 0x7f, v16 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v11, v[4:5] ; GFX11-NEXT: v_lshlrev_b64 v[12:13], v24, v[6:7] ; GFX11-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 -; GFX11-NEXT: v_and_b32_e32 v22, 0x7f, v3 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] ; GFX11-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] -; GFX11-NEXT: v_or_b32_e32 v12, v10, v12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] -; GFX11-NEXT: v_or_b32_e32 v5, v11, v13 -; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v22 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v22 +; 
GFX11-NEXT: v_or_b32_e32 v12, v10, v12 ; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v22 ; GFX11-NEXT: v_lshrrev_b64 v[18:19], v22, v[8:9] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v22 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v3, v12, vcc_lo ; GFX11-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GFX11-NEXT: v_or_b32_e32 v5, v11, v13 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15] -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[3:4], v22, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v22 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v24 ; GFX11-NEXT: v_or_b32_e32 v16, v18, v20 ; GFX11-NEXT: v_or_b32_e32 v18, v19, v21 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v3, v12 :: v_dual_cndmask_b32 v5, v4, v5 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], v22, v[14:15] ; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v16, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v24 ; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v18, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v10, v8, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v8, v11, v9, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v3, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v4, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 8538dcabca924..58304d2072d7f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -347,49 +347,57 @@ define 
amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX8-LABEL: s_fshr_i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_andn2_b32 s3, 7, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s3, s2, 7 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_lshr_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i8: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_andn2_b32 s3, 7, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s3, s2, 7 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, s3 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i8: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s3, 7, s2 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_and_b32 s3, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s2, s2, 7 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i8: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s3, 7, s2 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: 
s_and_b32 s3, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s2, s2, 7 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, s3 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -414,33 +422,33 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX8-LABEL: v_fshr_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v3, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 @@ -451,9 +459,9 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX11-LABEL: v_fshr_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v3, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 @@ -687,25 +695,29 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX8-LABEL: s_fshr_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 -; GFX8-NEXT: s_and_b32 s6, s2, 7 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_andn2_b32 s6, 7, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_andn2_b32 s2, 7, s5 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s1, s1, s6 -; GFX8-NEXT: s_lshl_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; 
GFX8-NEXT: s_and_b32 s1, s5, 7 +; GFX8-NEXT: s_lshl_b32 s1, s3, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s5, 7 +; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_lshr_b32 s1, s3, s1 -; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 @@ -714,25 +726,29 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX9-LABEL: s_fshr_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 -; GFX9-NEXT: s_and_b32 s6, s2, 7 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_andn2_b32 s6, 7, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_andn2_b32 s2, 7, s5 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 -; GFX9-NEXT: s_lshr_b32 s1, s1, s6 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s5, 7 +; GFX9-NEXT: s_lshl_b32 s1, s3, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s5, 7 +; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_lshr_b32 s1, s3, s1 -; GFX9-NEXT: s_or_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s0, 
s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 @@ -741,24 +757,28 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX10-LABEL: s_fshr_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s4, s1, 8 +; GFX10-NEXT: s_andn2_b32 s5, 7, s2 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s5, s2, 8 -; GFX10-NEXT: s_and_b32 s6, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshr_b32 s4, s1, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_lshr_b32 s6, s2, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_andn2_b32 s5, 7, s6 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_and_b32 s6, s6, 7 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s2, s5, 7 -; GFX10-NEXT: s_andn2_b32 s5, 7, s5 +; GFX10-NEXT: s_and_b32 s2, s2, 7 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_lshl_b32 s3, s3, s5 -; GFX10-NEXT: s_lshr_b32 s2, s4, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_lshr_b32 s4, s4, s6 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_and_b32 s1, s2, 0xff ; GFX10-NEXT: s_and_b32 s0, s0, 0xff @@ -768,24 +788,28 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX11-LABEL: s_fshr_v2i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s4, s1, 8 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s2 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s5, s2, 8 -; GFX11-NEXT: s_and_b32 s6, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshr_b32 s4, s1, 8 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_lshr_b32 s6, s2, 8 
+; GFX11-NEXT: s_lshl_b32 s0, s0, s5 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s6 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_and_b32 s6, s6, 7 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_and_b32 s2, s5, 7 -; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 +; GFX11-NEXT: s_and_b32 s2, s2, 7 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_lshl_b32 s3, s3, s5 -; GFX11-NEXT: s_lshr_b32 s2, s4, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s6 -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_lshr_b32 s4, s4, s6 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s2, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xff @@ -832,23 +856,23 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX8-LABEL: v_fshr_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_not_b32_e32 v2, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 
7, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v5 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -857,23 +881,23 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX9-LABEL: v_fshr_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 +; GFX9-NEXT: v_xor_b32_e32 v6, -1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_not_b32_e32 v2, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v6, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v3 -; GFX9-NEXT: 
v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v5 +; GFX9-NEXT: v_lshrrev_b16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -885,20 +909,20 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: v_not_b32_e32 v7, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5 -; GFX10-NEXT: v_lshlrev_b16 v4, v7, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v6, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0 ; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -912,22 +936,22 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; 
GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshrrev_b16 v3, v3, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b16 v4, v6, v4 -; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1 +; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX11-NEXT: v_lshlrev_b16 v0, v7, v0 ; GFX11-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 @@ -997,50 +1021,58 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX8-LABEL: s_fshr_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s3, s0, 8 -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_lshr_b32 s5, s0, 24 ; GFX8-NEXT: s_lshr_b32 s6, s1, 8 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s2, 24 -; GFX8-NEXT: s_and_b32 s12, s2, 7 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_andn2_b32 s12, 7, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: 
s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_lshr_b32 s5, s0, 24 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s12 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s1, s1, s12 -; GFX8-NEXT: s_lshl_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s9, 7 +; GFX8-NEXT: s_lshl_b32 s1, s3, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s9, 7 +; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_lshr_b32 s1, s3, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 -; GFX8-NEXT: s_lshl_b32 s4, s4, 1 -; GFX8-NEXT: s_lshl_b32 s3, s4, s3 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s2, s4, 1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s3, s10, 7 ; GFX8-NEXT: s_and_b32 s4, s7, 0xff -; GFX8-NEXT: s_or_b32 s1, s2, s1 -; GFX8-NEXT: s_and_b32 s2, s10, 7 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NEXT: s_lshr_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_or_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s11, 7 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshr_b32 s3, s4, s3 ; GFX8-NEXT: s_andn2_b32 s4, 7, s11 -; GFX8-NEXT: s_lshl_b32 s5, s5, 1 +; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: s_lshl_b32 s3, s5, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshl_b32 s3, s3, s4 +; GFX8-NEXT: s_and_b32 s4, s11, 7 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_lshl_b32 s4, s5, s4 -; GFX8-NEXT: s_lshr_b32 s3, s8, s3 +; GFX8-NEXT: s_lshr_b32 
s4, s8, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NEXT: s_or_b32 s3, s4, s3 +; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s3, 0xff @@ -1050,50 +1082,58 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX9-LABEL: s_fshr_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-NEXT: s_lshr_b32 s6, s1, 8 ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 ; GFX9-NEXT: s_lshr_b32 s11, s2, 24 -; GFX9-NEXT: s_and_b32 s12, s2, 7 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_andn2_b32 s12, 7, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s5, s0, 24 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s12 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 -; GFX9-NEXT: s_lshr_b32 s1, s1, s12 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s9, 7 +; GFX9-NEXT: s_lshl_b32 s1, s3, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s9, 7 +; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_lshr_b32 s1, s3, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 -; GFX9-NEXT: s_lshl_b32 s4, s4, 1 -; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: 
s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s2, s4, 1 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_lshl_b32 s2, s2, s3 +; GFX9-NEXT: s_and_b32 s3, s10, 7 ; GFX9-NEXT: s_and_b32 s4, s7, 0xff -; GFX9-NEXT: s_or_b32 s1, s2, s1 -; GFX9-NEXT: s_and_b32 s2, s10, 7 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX9-NEXT: s_lshr_b32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_or_b32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s3, s11, 7 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_lshr_b32 s3, s4, s3 ; GFX9-NEXT: s_andn2_b32 s4, 7, s11 -; GFX9-NEXT: s_lshl_b32 s5, s5, 1 +; GFX9-NEXT: s_or_b32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s3, s5, 1 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_and_b32 s4, s11, 7 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshl_b32 s4, s5, s4 -; GFX9-NEXT: s_lshr_b32 s3, s8, s3 +; GFX9-NEXT: s_lshr_b32 s4, s8, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s2, 0xff -; GFX9-NEXT: s_or_b32 s3, s4, s3 +; GFX9-NEXT: s_or_b32 s3, s3, s4 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s3, 0xff @@ -1104,43 +1144,51 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-LABEL: s_fshr_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s0, 24 ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 ; GFX10-NEXT: s_lshr_b32 s9, s2, 8 ; GFX10-NEXT: s_lshr_b32 s10, s2, 16 ; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_and_b32 s12, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_andn2_b32 s12, 7, s2 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s6, s6, 0xff +; GFX10-NEXT: s_and_b32 s2, 
s2, 7 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s2, s9, 7 -; GFX10-NEXT: s_andn2_b32 s9, 7, s9 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_andn2_b32 s2, 7, s9 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff +; GFX10-NEXT: s_and_b32 s9, s9, 7 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX10-NEXT: s_lshr_b32 s1, s1, s12 -; GFX10-NEXT: s_lshl_b32 s3, s3, s9 -; GFX10-NEXT: s_lshr_b32 s2, s6, s2 -; GFX10-NEXT: s_and_b32 s6, s7, 0xff +; GFX10-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX10-NEXT: s_lshl_b32 s0, s0, s12 +; GFX10-NEXT: s_lshl_b32 s2, s3, s2 +; GFX10-NEXT: s_lshr_b32 s3, s6, s9 ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s10, 7 -; GFX10-NEXT: s_andn2_b32 s3, 7, s10 -; GFX10-NEXT: s_lshl_b32 s4, s4, 1 +; GFX10-NEXT: s_or_b32 s1, s2, s3 +; GFX10-NEXT: s_andn2_b32 s2, 7, s10 +; GFX10-NEXT: s_lshl_b32 s3, s4, 1 +; GFX10-NEXT: s_and_b32 s4, s7, 0xff +; GFX10-NEXT: s_and_b32 s6, s10, 7 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s2, s6, s2 -; GFX10-NEXT: s_andn2_b32 s4, 7, s11 -; GFX10-NEXT: s_lshl_b32 s5, s5, 1 +; GFX10-NEXT: s_lshl_b32 s2, s3, s2 +; GFX10-NEXT: s_lshr_b32 s3, s4, s6 +; GFX10-NEXT: s_lshl_b32 s4, s5, 1 +; GFX10-NEXT: s_andn2_b32 s5, 7, s11 ; GFX10-NEXT: s_and_b32 s6, s11, 7 -; GFX10-NEXT: s_lshl_b32 s4, s5, s4 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_lshl_b32 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s5, s8, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: 
s_or_b32 s2, s2, s3 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff @@ -1157,43 +1205,51 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX11-LABEL: s_fshr_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-NEXT: s_lshr_b32 s5, s0, 24 ; GFX11-NEXT: s_lshr_b32 s7, s1, 16 ; GFX11-NEXT: s_lshr_b32 s8, s1, 24 ; GFX11-NEXT: s_lshr_b32 s9, s2, 8 ; GFX11-NEXT: s_lshr_b32 s10, s2, 16 ; GFX11-NEXT: s_lshr_b32 s11, s2, 24 -; GFX11-NEXT: s_and_b32 s12, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_not1_b32 s12, 7, s2 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_and_b32 s2, s2, 7 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_and_b32 s2, s9, 7 -; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s9 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_and_b32 s9, s9, 7 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s0, 24 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_lshr_b32 s1, s1, s12 -; GFX11-NEXT: s_lshl_b32 s3, s3, s9 -; GFX11-NEXT: s_lshr_b32 s2, s6, s2 -; GFX11-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX11-NEXT: s_lshl_b32 s0, s0, s12 +; GFX11-NEXT: s_lshl_b32 s2, s3, s2 +; GFX11-NEXT: s_lshr_b32 s3, s6, s9 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s3, s2 -; GFX11-NEXT: s_and_b32 s2, s10, 7 -; GFX11-NEXT: s_and_not1_b32 s3, 7, s10 -; GFX11-NEXT: s_lshl_b32 s4, s4, 1 +; 
GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s10 +; GFX11-NEXT: s_lshl_b32 s3, s4, 1 +; GFX11-NEXT: s_and_b32 s4, s7, 0xff +; GFX11-NEXT: s_and_b32 s6, s10, 7 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_lshl_b32 s3, s4, s3 -; GFX11-NEXT: s_lshr_b32 s2, s6, s2 -; GFX11-NEXT: s_and_not1_b32 s4, 7, s11 -; GFX11-NEXT: s_lshl_b32 s5, s5, 1 +; GFX11-NEXT: s_lshl_b32 s2, s3, s2 +; GFX11-NEXT: s_lshr_b32 s3, s4, s6 +; GFX11-NEXT: s_lshl_b32 s4, s5, 1 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s11 ; GFX11-NEXT: s_and_b32 s6, s11, 7 -; GFX11-NEXT: s_lshl_b32 s4, s5, s4 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_lshl_b32 s4, s4, s5 ; GFX11-NEXT: s_lshr_b32 s5, s8, s6 -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_or_b32 s3, s4, s5 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff @@ -1272,40 +1328,41 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-LABEL: v_fshr_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_not_b32_e32 v7, v2 -; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v8, 1, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v7, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v8 -; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX8-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_or_b32_e32 v6, 
v6, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v5, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v7, -1 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, 7 +; GFX8-NEXT: v_mov_b32_e32 v4, 1 +; GFX8-NEXT: v_xor_b32_sdwa v9, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX8-NEXT: v_mov_b32_e32 v8, 0xff -; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v9, 1 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v5, v9, v5 +; GFX8-NEXT: v_mov_b32_e32 v9, 7 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_xor_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v10 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v8 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v8, v10, v8 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1321,40 +1378,41 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-LABEL: v_fshr_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_not_b32_e32 v7, v2 -; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v8, 1, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, v7, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8 -; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX9-NEXT: 
v_not_b32_e32 v5, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, v5, v3 -; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v7, -1 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 7 -; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-NEXT: v_mov_b32_e32 v4, 1 +; GFX9-NEXT: v_xor_b32_sdwa v9, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX9-NEXT: v_mov_b32_e32 v8, 0xff -; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v10 -; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v10 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; 
GFX9-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v9, 7 +; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_xor_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v11, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v10, v10, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v10 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1371,45 +1429,46 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX10-NEXT: v_not_b32_e32 v8, v2 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, -1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v10, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_mov_b32_e32 v3, 7 ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, 
v1 +; GFX10-NEXT: v_mov_b32_e32 v14, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_not_b32_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_not_b32_sdwa v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v9 ; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 -; GFX10-NEXT: v_mov_b32_e32 v10, 0xff -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX10-NEXT: v_xor_b32_sdwa v9, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v10, 7 +; GFX10-NEXT: v_xor_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v12, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v13, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX10-NEXT: v_and_b32_sdwa v15, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v10, 7, v14 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX10-NEXT: v_and_b32_sdwa v15, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b16 v7, 1, v7 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b16 v3, v5, v9 -; GFX10-NEXT: v_lshlrev_b16 v5, v8, v6 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v5, v5, v8 +; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6 ; GFX10-NEXT: v_lshrrev_b16 v1, v15, v1 -; GFX10-NEXT: 
v_lshlrev_b16 v6, v10, v7 +; GFX10-NEXT: v_lshlrev_b16 v3, v3, v7 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v11 ; GFX10-NEXT: v_lshrrev_b16 v7, v12, v13 -; GFX10-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v4, 8 -; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, 8 +; GFX10-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v3 @@ -1427,29 +1486,29 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 24, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_not_b32_e32 v12, v7 +; GFX11-NEXT: v_xor_b32_e32 v12, -1, v7 ; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX11-NEXT: v_lshlrev_b16 v3, 1, v3 -; GFX11-NEXT: v_not_b32_e32 v14, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 7, v12 +; GFX11-NEXT: v_xor_b32_e32 v14, -1, v11 ; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6 -; GFX11-NEXT: v_not_b32_e32 v7, v13 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v13 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX11-NEXT: v_not_b32_e32 v10, v2 +; GFX11-NEXT: v_xor_b32_e32 v10, -1, v2 ; GFX11-NEXT: v_lshlrev_b16 v3, v12, v3 -; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX11-NEXT: v_and_b32_e32 v12, 7, v14 ; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v12, 7, v14 +; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX11-NEXT: 
v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX11-NEXT: v_lshlrev_b16 v4, v12, v4 @@ -5112,51 +5171,46 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) define amdgpu_ps i64 @s_fshr_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt) { ; GFX6-LABEL: s_fshr_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX6-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX6-NEXT: s_not_b32 s5, s4 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX8-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX8-NEXT: s_not_b32 s5, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX9-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX9-NEXT: s_not_b32 s5, s4 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX9-NEXT: s_lshr_b64 s[2:3], 
s[2:3], s4 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_andn2_b64 s[6:7], 63, s[4:5] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], 63 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GFX10-NEXT: s_not_b32 s5, s4 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_not1_b64 s[6:7], 63, s[4:5] ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_and_b64 s[4:5], s[4:5], 63 -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GFX11-NEXT: s_not_b32 s5, s4 ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: ; return to shader part epilog @@ -5233,12 +5287,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX6-LABEL: v_fshr_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: v_not_b32_e32 v5, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v5 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5246,12 +5300,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX8-LABEL: v_fshr_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, 
v[0:1] +; GFX8-NEXT: v_not_b32_e32 v5, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5259,12 +5313,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX9-LABEL: v_fshr_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5410,38 +5464,38 @@ define i64 @v_fshr_i64_48(i64 %lhs, i64 %rhs) { define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 %amt) { ; GFX6-LABEL: v_fshr_i64_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v0 -; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX6-NEXT: v_not_b32_e32 v1, v0 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v0 -; GFX6-NEXT: v_lshr_b64 v[2:3], s[2:3], v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1 +; GFX6-NEXT: v_lshr_b64 v[3:4], s[2:3], v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i64_ssv: ; 
GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX8-NEXT: v_not_b32_e32 v1, v0 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3] +; GFX8-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i64_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_not_b32_e32 v1, v0 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3] +; GFX9-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i64_ssv: @@ -5478,43 +5532,43 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 define amdgpu_ps <2 x float> @v_fshr_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg %amt) { ; GFX6-LABEL: v_fshr_i64_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX6-NEXT: s_not_b32 s3, s2 +; GFX6-NEXT: s_and_b32 s2, s2, 63 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s4 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s2 +; GFX6-NEXT: s_lshl_b64 
s[0:1], s[0:1], s3 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i64_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX8-NEXT: s_not_b32 s3, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 63 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i64_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX9-NEXT: s_not_b32 s3, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 63 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i64_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX10-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: s_and_b32 s3, s2, 63 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5522,10 +5576,10 @@ define amdgpu_ps <2 x float> @v_fshr_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; ; GFX11-LABEL: v_fshr_i64_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX11-NEXT: s_and_not1_b64 s[2:3], 63, s[2:3] -; GFX11-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; 
GFX11-NEXT: s_and_b32 s3, s2, 63 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX11-NEXT: s_not_b32 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -5542,10 +5596,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX6-LABEL: v_fshr_i64_vss: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s2 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX6-NEXT: s_andn2_b32 s3, 63, s2 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s3 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -5553,10 +5606,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX8-LABEL: v_fshr_i64_vss: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s2, v[0:1] -; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX8-NEXT: s_andn2_b32 s3, 63, s2 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -5564,10 +5616,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX9-LABEL: v_fshr_i64_vss: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], s2, v[0:1] -; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_andn2_b32 s3, 63, s2 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 
s3, v[0:1] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX9-NEXT: ; return to shader part epilog @@ -5575,10 +5626,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX10-LABEL: v_fshr_i64_vss: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_andn2_b64 s[4:5], 63, s[2:3] -; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], 63 +; GFX10-NEXT: s_andn2_b32 s3, 63, s2 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -5586,13 +5636,12 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX11-LABEL: v_fshr_i64_vss: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[2:3] -; GFX11-NEXT: s_and_b64 s[2:3], s[2:3], 63 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_and_not1_b32 s3, 63, s2 ; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt) @@ -5603,63 +5652,55 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg define amdgpu_ps <2 x i64> @s_fshr_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs, <2 x i64> inreg %amt) { ; GFX6-LABEL: s_fshr_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_and_b64 s[12:13], s[8:9], 63 -; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX6-NEXT: s_not_b32 s9, s8 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; GFX6-NEXT: s_not_b32 s4, s10 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX8-NEXT: s_not_b32 s9, s8 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; GFX8-NEXT: s_not_b32 s4, s10 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX9-NEXT: s_not_b32 s9, s8 +; GFX9-NEXT: s_lshl_b64 s[0:1], 
s[0:1], s9 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; GFX9-NEXT: s_not_b32 s4, s10 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_andn2_b64 s[12:13], 63, s[8:9] -; GFX10-NEXT: s_and_b64 s[8:9], s[8:9], 63 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX10-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] +; GFX10-NEXT: s_not_b32 s9, s8 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_and_b64 s[10:11], s[10:11], 63 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX10-NEXT: s_not_b32 s9, s10 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s9 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] @@ -5667,15 +5708,13 @@ define amdgpu_ps <2 x i64> @s_fshr_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg % ; ; GFX11-LABEL: s_fshr_v2i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_not1_b64 s[12:13], 63, s[8:9] -; GFX11-NEXT: s_and_b64 s[8:9], s[8:9], 63 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX11-NEXT: s_and_not1_b64 s[8:9], 63, s[10:11] +; GFX11-NEXT: s_not_b32 s9, s8 ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_and_b64 s[10:11], s[10:11], 63 -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX11-NEXT: s_not_b32 
s9, s10 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s9 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] @@ -5688,18 +5727,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX6-LABEL: v_fshr_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: v_not_b32_e32 v9, v8 +; GFX6-NEXT: v_and_b32_e32 v9, 63, v9 ; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v9 -; GFX6-NEXT: v_not_b32_e32 v8, v10 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_not_b32_e32 v4, v10 +; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v8 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5709,18 +5748,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX8-LABEL: v_fshr_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: v_not_b32_e32 v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 63, v9 ; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] -; GFX8-NEXT: v_not_b32_e32 v8, v10 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 
v[2:3], 1, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_not_b32_e32 v4, v10 +; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] ; GFX8-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5730,18 +5769,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX9-LABEL: v_fshr_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_not_b32_e32 v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 63, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] -; GFX9-NEXT: v_not_b32_e32 v8, v10 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v10 +; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] ; GFX9-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3] ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5800,231 +5839,237 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: s_fshr_i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX6-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX6-NEXT: s_lshl_b64 s[12:13], s[0:1], 1 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; 
GFX6-NEXT: s_lshr_b32 s0, s1, 31 ; GFX6-NEXT: s_mov_b32 s1, 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX6-NEXT: s_sub_i32 s11, s8, 64 -; GFX6-NEXT: s_sub_i32 s9, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s8 +; GFX6-NEXT: s_not_b32 s9, s8 +; GFX6-NEXT: s_sub_i32 s16, s2, 64 +; GFX6-NEXT: s_sub_i32 s12, 64, s2 +; GFX6-NEXT: s_cmp_lt_u32 s2, 64 ; GFX6-NEXT: s_cselect_b32 s17, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s8 -; GFX6-NEXT: s_lshr_b64 s[14:15], s[12:13], s9 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX6-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GFX6-NEXT: s_lshl_b64 s[12:13], s[12:13], s11 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13] +; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_cselect_b32 s18, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s12 +; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s9 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[10:11], s9 +; GFX6-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX6-NEXT: s_lshl_b64 s[10:11], s[10:11], s16 ; GFX6-NEXT: s_cmp_lg_u32 s17, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: s_sub_i32 s14, s10, 64 -; GFX6-NEXT: s_sub_i32 s12, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11] +; GFX6-NEXT: s_and_b32 s0, s8, 0x7f +; GFX6-NEXT: s_sub_i32 s14, s0, 64 +; GFX6-NEXT: s_sub_i32 s12, 64, s0 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: s_cselect_b32 s15, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s8 +; GFX6-NEXT: 
s_lshr_b64 s[8:9], s[4:5], s8 ; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 -; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX6-NEXT: s_cmp_lg_u32 s15, 0 -; GFX6-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_cmp_lg_u32 s15, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX6-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] +; GFX6-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX8-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX8-NEXT: s_lshl_b64 s[12:13], s[0:1], 1 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s0, s1, 31 ; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX8-NEXT: s_sub_i32 s11, s8, 64 -; GFX8-NEXT: s_sub_i32 s9, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s8 +; GFX8-NEXT: s_not_b32 s9, s8 +; GFX8-NEXT: s_sub_i32 s16, s2, 64 +; GFX8-NEXT: s_sub_i32 s12, 64, s2 +; GFX8-NEXT: s_cmp_lt_u32 s2, 64 ; GFX8-NEXT: s_cselect_b32 s17, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[12:13], s8 -; GFX8-NEXT: s_lshr_b64 s[14:15], s[12:13], s9 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX8-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GFX8-NEXT: s_lshl_b64 s[12:13], s[12:13], s11 -; GFX8-NEXT: s_cmp_lg_u32 s16, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13] +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_cselect_b32 s18, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s12 +; GFX8-NEXT: s_lshl_b64 s[14:15], 
s[0:1], s9 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], s9 +; GFX8-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX8-NEXT: s_lshl_b64 s[10:11], s[10:11], s16 ; GFX8-NEXT: s_cmp_lg_u32 s17, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX8-NEXT: s_sub_i32 s14, s10, 64 -; GFX8-NEXT: s_sub_i32 s12, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 s10, 64 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] +; GFX8-NEXT: s_cmp_lg_u32 s18, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11] +; GFX8-NEXT: s_and_b32 s0, s8, 0x7f +; GFX8-NEXT: s_sub_i32 s14, s0, 64 +; GFX8-NEXT: s_sub_i32 s12, 64, s0 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: s_cselect_b32 s15, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s8 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 ; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 -; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX8-NEXT: s_cmp_lg_u32 s15, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX8-NEXT: s_cmp_lg_u32 s15, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX8-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] +; GFX8-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX9-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX9-NEXT: s_lshl_b64 s[12:13], s[0:1], 1 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 31 ; GFX9-NEXT: 
s_mov_b32 s1, 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_sub_i32 s11, s8, 64 -; GFX9-NEXT: s_sub_i32 s9, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s8 +; GFX9-NEXT: s_not_b32 s9, s8 +; GFX9-NEXT: s_sub_i32 s16, s2, 64 +; GFX9-NEXT: s_sub_i32 s12, 64, s2 +; GFX9-NEXT: s_cmp_lt_u32 s2, 64 ; GFX9-NEXT: s_cselect_b32 s17, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[12:13], s8 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[12:13], s9 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX9-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GFX9-NEXT: s_lshl_b64 s[12:13], s[12:13], s11 -; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13] +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s18, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s12 +; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s9 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], s9 +; GFX9-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[10:11], s16 ; GFX9-NEXT: s_cmp_lg_u32 s17, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX9-NEXT: s_sub_i32 s14, s10, 64 -; GFX9-NEXT: s_sub_i32 s12, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11] +; GFX9-NEXT: s_and_b32 s0, s8, 0x7f +; GFX9-NEXT: s_sub_i32 s14, s0, 64 +; GFX9-NEXT: s_sub_i32 s12, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: s_cselect_b32 s15, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s8 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 ; GFX9-NEXT: 
s_lshl_b64 s[12:13], s[6:7], s12 -; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX9-NEXT: s_cmp_lg_u32 s15, 0 -; GFX9-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_cmp_lg_u32 s15, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX9-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] +; GFX9-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX10-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s12, s1, 31 -; GFX10-NEXT: s_mov_b32 s13, 0 +; GFX10-NEXT: s_lshr_b32 s10, s1, 31 +; GFX10-NEXT: s_mov_b32 s11, 0 +; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] -; GFX10-NEXT: s_sub_i32 s11, s8, 64 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] +; GFX10-NEXT: s_not_b32 s14, s8 +; GFX10-NEXT: s_sub_i32 s16, s9, 64 +; GFX10-NEXT: s_sub_i32 s10, 64, s9 +; GFX10-NEXT: s_cmp_lt_u32 s9, 64 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[12:13], s[0:1], s9 -; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 -; GFX10-NEXT: s_cmp_lg_u32 s16, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s9, 0 +; GFX10-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10-NEXT: s_lshr_b64 
s[10:11], s[0:1], s10 +; GFX10-NEXT: s_lshl_b64 s[12:13], s[2:3], s14 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[0:1], s14 +; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s16 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 +; GFX10-NEXT: s_cselect_b64 s[12:13], s[14:15], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s14, s10, 64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 +; GFX10-NEXT: s_and_b32 s0, s8, 0x7f +; GFX10-NEXT: s_sub_i32 s14, s0, 64 +; GFX10-NEXT: s_sub_i32 s9, 64, s0 +; GFX10-NEXT: s_cmp_lt_u32 s0, 64 ; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s10 -; GFX10-NEXT: s_lshl_b64 s[12:13], s[6:7], s11 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[6:7], s10 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s8 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s9 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX10-NEXT: s_cmp_lg_u32 s15, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s15, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 -; GFX10-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 +; GFX10-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[8:9], 0x7f, s[8:9] ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshr_b32 s12, s1, 31 -; GFX11-NEXT: 
s_mov_b32 s13, 0 +; GFX11-NEXT: s_lshr_b32 s10, s1, 31 +; GFX11-NEXT: s_mov_b32 s11, 0 +; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] -; GFX11-NEXT: s_sub_i32 s11, s8, 64 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s16, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] +; GFX11-NEXT: s_not_b32 s14, s8 +; GFX11-NEXT: s_sub_i32 s16, s9, 64 +; GFX11-NEXT: s_sub_i32 s10, 64, s9 +; GFX11-NEXT: s_cmp_lt_u32 s9, 64 ; GFX11-NEXT: s_cselect_b32 s17, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[0:1], s9 -; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 -; GFX11-NEXT: s_cmp_lg_u32 s16, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s9, 0 +; GFX11-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s10 +; GFX11-NEXT: s_lshl_b64 s[12:13], s[2:3], s14 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[0:1], s14 +; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s16 ; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_cselect_b64 s[12:13], s[14:15], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s14, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 +; GFX11-NEXT: s_and_b32 s0, s8, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s14, s0, 64 +; GFX11-NEXT: s_sub_i32 s9, 64, s0 +; GFX11-NEXT: s_cmp_lt_u32 s0, 64 ; GFX11-NEXT: s_cselect_b32 s15, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cselect_b32 s16, 1, 0 -; 
GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], s10 -; GFX11-NEXT: s_lshl_b64 s[12:13], s[6:7], s11 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[6:7], s10 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], s8 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s9 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX11-NEXT: s_cmp_lg_u32 s15, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] ; GFX11-NEXT: s_cmp_lg_u32 s16, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s15, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 -; GFX11-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 +; GFX11-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) @@ -6035,29 +6080,29 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-LABEL: v_fshr_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], 1 +; GFX6-NEXT: v_lshl_b64 v[9:10], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX6-NEXT: v_not_b32_e32 v0, v8 +; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v15 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v0 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v15 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[9:10], v0 +; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15 -; GFX6-NEXT: v_lshl_b64 v[12:13], v[8:9], v15 -; GFX6-NEXT: v_or_b32_e32 v10, v0, v10 -; GFX6-NEXT: v_or_b32_e32 v11, v1, v11 -; GFX6-NEXT: v_lshl_b64 v[0:1], 
v[8:9], v16 +; GFX6-NEXT: v_lshl_b64 v[13:14], v[9:10], v15 +; GFX6-NEXT: v_or_b32_e32 v11, v0, v11 +; GFX6-NEXT: v_or_b32_e32 v12, v1, v12 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[9:10], v16 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 -; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 -; GFX6-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX6-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v14 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], v14 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[6:7], v2 @@ -6074,38 +6119,38 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v13, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v12, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 
+; GFX8-NEXT: v_not_b32_e32 v0, v8 +; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v15 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10] +; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] ; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15 -; GFX8-NEXT: v_lshlrev_b64 v[12:13], v15, v[8:9] -; GFX8-NEXT: v_or_b32_e32 v10, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v11, v1, v11 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[9:10] +; GFX8-NEXT: v_or_b32_e32 v11, v0, v11 +; GFX8-NEXT: v_or_b32_e32 v12, v1, v12 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[9:10] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v14 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] @@ -6122,39 +6167,39 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v13, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v11, v3 +; 
GFX8-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v12, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v8 +; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v15 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10] +; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] ; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15 -; GFX9-NEXT: v_lshlrev_b64 v[12:13], v15, v[8:9] -; GFX9-NEXT: v_or_b32_e32 v10, v0, v10 -; GFX9-NEXT: v_or_b32_e32 v11, v1, v11 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[9:10] +; GFX9-NEXT: v_or_b32_e32 v11, v0, v11 +; GFX9-NEXT: v_or_b32_e32 v12, v1, v12 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[9:10] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, 64, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, 
v1, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] ; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 @@ -6170,10 +6215,10 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX9-NEXT: v_or_b32_e32 v1, v13, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX9-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v12, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_i128: @@ -6282,158 +6327,158 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshr_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s9, 0 -; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 +; GFX6-NEXT: v_not_b32_e32 v1, v0 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_lshr_b32 s8, s1, 31 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v7 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[10:11], v0 -; GFX6-NEXT: v_lshl_b64 v[2:3], s[0:1], v7 +; GFX6-NEXT: s_lshr_b32 s0, s1, 31 +; GFX6-NEXT: s_mov_b32 s1, 0 +; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 +; GFX6-NEXT: v_lshr_b64 v[1:2], s[8:9], v1 +; GFX6-NEXT: v_lshl_b64 v[3:4], s[0:1], v7 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7 -; GFX6-NEXT: v_lshl_b64 v[4:5], s[10:11], v7 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX6-NEXT: v_lshl_b64 v[5:6], s[8:9], v7 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshl_b64 v[0:1], 
s[10:11], v8 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[8:9], v8 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: v_mov_b32_e32 v4, s1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v6 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v6 +; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v10 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v10 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[6:7], v2 -; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v6 +; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v10 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], s[6:7], v11 -; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v6 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX6-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX6-NEXT: 
v_or_b32_e32 v1, v9, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: s_mov_b32 s9, 0 -; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 +; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_lshr_b32 s8, s1, 31 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v7 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1] +; GFX8-NEXT: s_lshr_b32 s0, s1, 31 +; GFX8-NEXT: s_mov_b32 s1, 0 +; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 +; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v7, s[10:11] -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9] ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, s[10:11] +; GFX8-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, s[8:9] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v6 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v6, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v10 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v10, s[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v6 +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v10 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v6, s[6:7] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, s[6:7] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s9, 0 -; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_lshr_b32 s8, s1, 31 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX9-NEXT: 
s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v7 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s1, 31 +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 +; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1] ; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v7, s[10:11] -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9] ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, s[10:11] +; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, s[8:9] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v6, s[4:5] +; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7] -; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v6 +; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v10 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 
v[0:1], v11, s[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v6, s[6:7] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, s[6:7] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX9-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i128_ssv: @@ -6543,40 +6588,41 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshr_i128_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s0, s1, 31 ; GFX6-NEXT: s_mov_b32 s1, 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX6-NEXT: s_sub_i32 s7, s4, 64 -; GFX6-NEXT: s_sub_i32 s5, 64, s4 -; GFX6-NEXT: s_cmp_lt_u32 s4, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s4 +; GFX6-NEXT: s_not_b32 s5, s4 +; GFX6-NEXT: s_sub_i32 s12, s2, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s2 +; GFX6-NEXT: s_cmp_lt_u32 s2, 64 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[8:9], s4 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], 
s5 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX6-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s7 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_cselect_b32 s14, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], s5 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[6:7], s5 +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s12 ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5] -; GFX6-NEXT: s_sub_i32 s0, s6, 64 -; GFX6-NEXT: s_sub_i32 s1, 64, s6 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 -; GFX6-NEXT: s_cselect_b32 s7, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s6 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] +; GFX6-NEXT: s_and_b32 s0, s4, 0x7f +; GFX6-NEXT: s_sub_i32 s1, s0, 64 +; GFX6-NEXT: s_sub_i32 s4, 64, s0 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4 ; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s6 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s0 -; GFX6-NEXT: s_and_b32 s0, 1, s7 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1 +; GFX6-NEXT: s_and_b32 s0, 1, s5 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6590,46 +6636,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX6-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX6-NEXT: 
v_or_b32_e32 v1, s3, v1 -; GFX6-NEXT: v_or_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_or_b32_e32 v3, s5, v3 +; GFX6-NEXT: v_or_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_or_b32_e32 v3, s7, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i128_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s0, s1, 31 ; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX8-NEXT: s_sub_i32 s7, s4, 64 -; GFX8-NEXT: s_sub_i32 s5, 64, s4 -; GFX8-NEXT: s_cmp_lt_u32 s4, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s4 +; GFX8-NEXT: s_not_b32 s5, s4 +; GFX8-NEXT: s_sub_i32 s12, s2, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s2 +; GFX8-NEXT: s_cmp_lt_u32 s2, 64 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[8:9], s4 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX8-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s7 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_cselect_b32 s14, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], s5 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s5 +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], s12 ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5] -; GFX8-NEXT: s_sub_i32 s0, s6, 64 -; GFX8-NEXT: s_sub_i32 s1, 64, s6 -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 -; GFX8-NEXT: s_cselect_b32 s7, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; 
GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] +; GFX8-NEXT: s_cmp_lg_u32 s14, 0 +; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] +; GFX8-NEXT: s_and_b32 s0, s4, 0x7f +; GFX8-NEXT: s_sub_i32 s1, s0, 64 +; GFX8-NEXT: s_sub_i32 s4, 64, s0 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] ; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], s6, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] -; GFX8-NEXT: s_and_b32 s0, 1, s7 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s5 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6643,46 +6690,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX8-NEXT: v_or_b32_e32 v2, s4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, s5, v3 +; GFX8-NEXT: v_or_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_or_b32_e32 v3, s7, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i128_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 31 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_sub_i32 s7, s4, 64 -; GFX9-NEXT: s_sub_i32 s5, 64, s4 -; GFX9-NEXT: s_cmp_lt_u32 s4, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s4 +; GFX9-NEXT: s_not_b32 s5, s4 +; GFX9-NEXT: s_sub_i32 s12, s2, 64 +; GFX9-NEXT: s_sub_i32 s8, 
64, s2 +; GFX9-NEXT: s_cmp_lt_u32 s2, 64 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[8:9], s4 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX9-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s7 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s14, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s5 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], s5 +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], s12 ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5] -; GFX9-NEXT: s_sub_i32 s0, s6, 64 -; GFX9-NEXT: s_sub_i32 s1, 64, s6 -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 -; GFX9-NEXT: s_cselect_b32 s7, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] +; GFX9-NEXT: s_cmp_lg_u32 s14, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] +; GFX9-NEXT: s_and_b32 s0, s4, 0x7f +; GFX9-NEXT: s_sub_i32 s1, s0, 64 +; GFX9-NEXT: s_sub_i32 s4, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] ; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], s6, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] -; GFX9-NEXT: s_and_b32 s0, 1, s7 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s5 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6696,50 +6744,51 
@@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX9-NEXT: v_or_b32_e32 v2, s4, v2 -; GFX9-NEXT: v_or_b32_e32 v3, s5, v3 +; GFX9-NEXT: v_or_b32_e32 v2, s6, v2 +; GFX9-NEXT: v_or_b32_e32 v3, s7, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i128_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s8, s1, 31 -; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: s_lshr_b32 s6, s1, 31 +; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_sub_i32 s7, s4, 64 -; GFX10-NEXT: s_sub_i32 s5, 64, s4 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX10-NEXT: s_not_b32 s10, s4 +; GFX10-NEXT: s_sub_i32 s12, s5, 64 +; GFX10-NEXT: s_sub_i32 s6, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s5 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s10 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX10-NEXT: 
s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s0, 64, s6 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX10-NEXT: s_sub_i32 s0, s6, 64 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_and_b32 s0, s4, 0x7f +; GFX10-NEXT: s_sub_i32 s1, 64, s0 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX10-NEXT: s_sub_i32 s1, s0, 64 +; GFX10-NEXT: s_cmp_lt_u32 s0, 64 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3] +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX10-NEXT: s_cselect_b32 s7, 1, 0 -; GFX10-NEXT: s_and_b32 s0, 1, s1 +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_and_b32 s1, 1, s4 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s0, 1, s7 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], s6, v[2:3] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX10-NEXT: s_and_b32 s0, 1, s5 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo @@ -6749,64 +6798,65 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX10-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshr_i128_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; 
GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshr_b32 s8, s1, 31 -; GFX11-NEXT: s_mov_b32 s9, 0 +; GFX11-NEXT: s_lshr_b32 s6, s1, 31 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX11-NEXT: s_sub_i32 s7, s4, 64 -; GFX11-NEXT: s_sub_i32 s5, 64, s4 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 -; GFX11-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX11-NEXT: s_not_b32 s10, s4 +; GFX11-NEXT: s_sub_i32 s12, s5, 64 +; GFX11-NEXT: s_sub_i32 s6, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: s_cselect_b32 s13, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[0:1], s5 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 -; GFX11-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s10 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s5, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s0, 64, s6 +; GFX11-NEXT: s_and_b32 s0, s4, 0x7f ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX11-NEXT: s_sub_i32 s0, s6, 64 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; 
GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11-NEXT: s_sub_i32 s1, 64, s0 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX11-NEXT: s_sub_i32 s1, s0, 64 +; GFX11-NEXT: s_cmp_lt_u32 s0, 64 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3] +; GFX11-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX11-NEXT: s_cselect_b32 s7, 1, 0 -; GFX11-NEXT: s_and_b32 s0, 1, s1 +; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: s_and_b32 s1, 1, s4 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX11-NEXT: s_and_b32 s0, 1, s7 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], s6, v[2:3] +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX11-NEXT: s_and_b32 s0, 1, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX11-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, 
s8, v0 +; GFX11-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -6816,51 +6866,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshr_i128_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: s_sub_i32 s5, s4, 64 -; GFX6-NEXT: s_sub_i32 s7, 64, s4 ; GFX6-NEXT: v_lshl_b64 v[4:5], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GFX6-NEXT: s_cmp_lt_u32 s4, 64 +; GFX6-NEXT: s_andn2_b32 s5, 0x7f, s4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX6-NEXT: s_sub_i32 s6, s5, 64 +; GFX6-NEXT: s_sub_i32 s7, 64, s5 +; GFX6-NEXT: s_cmp_lt_u32 s5, 64 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s7 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s5 ; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_cmp_eq_u32 s5, 0 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s7 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], s4 -; GFX6-NEXT: s_and_b32 s4, 1, s8 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX6-NEXT: s_and_b32 s4, 1, s9 -; GFX6-NEXT: s_sub_i32 s10, s6, 64 -; GFX6-NEXT: s_sub_i32 s8, 64, s6 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], s5 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX6-NEXT: v_or_b32_e32 v7, v1, v7 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[4:5], s5 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 -; GFX6-NEXT: s_cselect_b32 s11, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[4:5], s6 +; GFX6-NEXT: s_and_b32 s5, 1, s8 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX6-NEXT: s_and_b32 s5, 1, s9 ; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, 
0, v9, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX6-NEXT: s_and_b32 s5, s4, 0x7f +; GFX6-NEXT: s_sub_i32 s10, s5, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s5 +; GFX6-NEXT: s_cmp_lt_u32 s5, 64 +; GFX6-NEXT: s_cselect_b32 s11, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s5, 0 ; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v4 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v5 ; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6869,51 +6919,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX8-LABEL: v_fshr_i128_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: s_sub_i32 s5, s4, 64 -; GFX8-NEXT: s_sub_i32 s7, 64, s4 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GFX8-NEXT: s_cmp_lt_u32 s4, 64 +; GFX8-NEXT: s_andn2_b32 s5, 0x7f, s4 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX8-NEXT: s_sub_i32 s6, s5, 64 +; GFX8-NEXT: s_sub_i32 s7, 64, s5 +; GFX8-NEXT: s_cmp_lt_u32 
s5, 64 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cmp_eq_u32 s5, 0 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[8:9], s4, v[4:5] -; GFX8-NEXT: s_and_b32 s4, 1, s8 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: s_and_b32 s4, 1, s9 -; GFX8-NEXT: s_sub_i32 s10, s6, 64 -; GFX8-NEXT: s_sub_i32 s8, 64, s6 +; GFX8-NEXT: v_lshlrev_b64 v[8:9], s5, v[4:5] ; GFX8-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX8-NEXT: v_or_b32_e32 v7, v1, v7 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5] -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 -; GFX8-NEXT: s_cselect_b32 s11, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s6, v[4:5] +; GFX8-NEXT: s_and_b32 s5, 1, s8 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX8-NEXT: s_and_b32 s5, 1, s9 ; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX8-NEXT: s_and_b32 s5, s4, 0x7f +; GFX8-NEXT: s_sub_i32 s10, s5, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s5 +; GFX8-NEXT: s_cmp_lt_u32 s5, 64 +; GFX8-NEXT: s_cselect_b32 s11, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s5, 0 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], 
s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v4 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6922,51 +6972,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX9-LABEL: v_fshr_i128_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: s_sub_i32 s5, s4, 64 -; GFX9-NEXT: s_sub_i32 s7, 64, s4 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GFX9-NEXT: s_cmp_lt_u32 s4, 64 +; GFX9-NEXT: s_andn2_b32 s5, 0x7f, s4 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-NEXT: s_sub_i32 s6, s5, 64 +; GFX9-NEXT: s_sub_i32 s7, 64, s5 +; GFX9-NEXT: s_cmp_lt_u32 s5, 64 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[8:9], s4, v[4:5] -; GFX9-NEXT: s_and_b32 s4, 1, s8 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: s_and_b32 s4, 1, s9 -; GFX9-NEXT: s_sub_i32 s10, s6, 64 -; GFX9-NEXT: s_sub_i32 s8, 64, s6 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], s5, v[4:5] ; GFX9-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX9-NEXT: v_or_b32_e32 v7, v1, v7 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5] -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 -; GFX9-NEXT: s_cselect_b32 s11, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s6, v[4:5] +; GFX9-NEXT: s_and_b32 s5, 1, s8 +; 
GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX9-NEXT: s_and_b32 s5, 1, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX9-NEXT: s_and_b32 s5, s4, 0x7f +; GFX9-NEXT: s_sub_i32 s10, s5, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s5 +; GFX9-NEXT: s_cmp_lt_u32 s5, 64 +; GFX9-NEXT: s_cselect_b32 s11, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v4 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6978,49 +7028,49 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX10-NEXT: s_sub_i32 s7, 64, s4 +; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4 +; GFX10-NEXT: s_sub_i32 s6, s5, 64 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX10-NEXT: 
s_sub_i32 s5, s4, 64 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: s_sub_i32 s7, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], s4, v[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: s_and_b32 s4, 1, s8 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX10-NEXT: s_and_b32 s5, 1, s8 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX10-NEXT: s_and_b32 s5, s4, 0x7f ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: s_and_b32 s4, 1, s9 -; GFX10-NEXT: s_sub_i32 s10, s6, 64 -; GFX10-NEXT: s_sub_i32 s7, 64, s6 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_and_b32 s6, 1, s9 +; GFX10-NEXT: s_sub_i32 s10, s5, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s7 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[2:3], s6 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: s_lshr_b64 
s[2:3], s[2:3], s10 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo -; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7031,47 +7081,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s7, 64, s4 +; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_sub_i32 s6, s5, 64 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX11-NEXT: s_sub_i32 s5, s4, 64 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX11-NEXT: s_cselect_b32 s8, 1, 0 -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], s4, v[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX11-NEXT: s_cselect_b32 s9, 1, 0 -; GFX11-NEXT: s_and_b32 s4, 1, s8 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX11-NEXT: s_and_b32 s5, 1, s8 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] +; 
GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX11-NEXT: s_and_b32 s5, s4, 0x7f ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX11-NEXT: s_and_b32 s4, 1, s9 -; GFX11-NEXT: s_sub_i32 s10, s6, 64 -; GFX11-NEXT: s_sub_i32 s7, 64, s6 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_and_b32 s6, 1, s9 +; GFX11-NEXT: s_sub_i32 s10, s5, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 ; GFX11-NEXT: s_cselect_b32 s11, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 ; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s7 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[2:3], s6 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX11-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7209,435 +7259,447 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) { define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> 
inreg %amt) { ; GFX6-LABEL: s_fshr_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX6-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_lshr_b32 s24, s1, 31 -; GFX6-NEXT: s_mov_b32 s25, 0 -; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] -; GFX6-NEXT: s_sub_i32 s19, s16, 64 -; GFX6-NEXT: s_sub_i32 s17, 64, s16 -; GFX6-NEXT: s_cmp_lt_u32 s16, 64 -; GFX6-NEXT: s_cselect_b32 s24, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, 0 +; GFX6-NEXT: s_lshr_b32 s22, s1, 31 +; GFX6-NEXT: s_mov_b32 s23, 0 +; GFX6-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] +; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX6-NEXT: s_not_b32 s17, s16 +; GFX6-NEXT: s_sub_i32 s21, s2, 64 +; GFX6-NEXT: s_sub_i32 s22, 64, s2 +; GFX6-NEXT: s_cmp_lt_u32 s2, 64 ; GFX6-NEXT: s_cselect_b32 s28, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 -; GFX6-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX6-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX6-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 -; GFX6-NEXT: s_cmp_lg_u32 s24, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_cselect_b32 s29, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 +; GFX6-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 +; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 ; GFX6-NEXT: s_cmp_lg_u32 s28, 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX6-NEXT: s_sub_i32 s24, s18, 64 -; GFX6-NEXT: s_sub_i32 s22, 64, s18 -; GFX6-NEXT: s_cmp_lt_u32 s18, 64 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX6-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] +; GFX6-NEXT: s_cmp_lg_u32 s29, 0 +; GFX6-NEXT: s_cselect_b64 s[18:19], s[0:1], 
s[18:19] +; GFX6-NEXT: s_and_b32 s0, s16, 0x7f +; GFX6-NEXT: s_sub_i32 s21, s0, 64 +; GFX6-NEXT: s_sub_i32 s22, 64, s0 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: s_cselect_b32 s26, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, 0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s27, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 -; GFX6-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 -; GFX6-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 -; GFX6-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX6-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s22 +; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX6-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX6-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX6-NEXT: s_lshr_b32 s24, s5, 31 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] -; GFX6-NEXT: s_sub_i32 s9, s10, 64 -; GFX6-NEXT: s_sub_i32 s11, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 -; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_lshr_b32 s22, s5, 31 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 +; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] +; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s20 +; GFX6-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX6-NEXT: s_not_b32 s16, s20 +; GFX6-NEXT: s_sub_i32 s18, s6, 64 +; GFX6-NEXT: s_sub_i32 
s10, 64, s6 +; GFX6-NEXT: s_cmp_lt_u32 s6, 64 +; GFX6-NEXT: s_cselect_b32 s19, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s21, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 -; GFX6-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX6-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] -; GFX6-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s21, 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX6-NEXT: s_sub_i32 s18, s8, 64 -; GFX6-NEXT: s_sub_i32 s16, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] +; GFX6-NEXT: s_and_b32 s4, s20, 0x7f +; GFX6-NEXT: s_sub_i32 s18, s4, 64 +; GFX6-NEXT: s_sub_i32 s16, 64, s4 +; GFX6-NEXT: s_cmp_lt_u32 s4, 64 ; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX6-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_cselect_b32 s21, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[12:13], s20 ; GFX6-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX6-NEXT: 
s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX6-NEXT: s_cmp_lg_u32 s21, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] -; GFX6-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] +; GFX6-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX8-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_lshr_b32 s24, s1, 31 -; GFX8-NEXT: s_mov_b32 s25, 0 -; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] -; GFX8-NEXT: s_sub_i32 s19, s16, 64 -; GFX8-NEXT: s_sub_i32 s17, 64, s16 -; GFX8-NEXT: s_cmp_lt_u32 s16, 64 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s16, 0 +; GFX8-NEXT: s_lshr_b32 s22, s1, 31 +; GFX8-NEXT: s_mov_b32 s23, 0 +; GFX8-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] +; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX8-NEXT: s_not_b32 s17, s16 +; GFX8-NEXT: s_sub_i32 s21, s2, 64 +; GFX8-NEXT: s_sub_i32 s22, 64, s2 +; GFX8-NEXT: s_cmp_lt_u32 s2, 64 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 -; GFX8-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX8-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX8-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 -; GFX8-NEXT: s_cmp_lg_u32 s24, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_cselect_b32 s29, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 +; GFX8-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 +; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], 
s[26:27] +; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 ; GFX8-NEXT: s_cmp_lg_u32 s28, 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX8-NEXT: s_sub_i32 s24, s18, 64 -; GFX8-NEXT: s_sub_i32 s22, 64, s18 -; GFX8-NEXT: s_cmp_lt_u32 s18, 64 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] +; GFX8-NEXT: s_cmp_lg_u32 s29, 0 +; GFX8-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19] +; GFX8-NEXT: s_and_b32 s0, s16, 0x7f +; GFX8-NEXT: s_sub_i32 s21, s0, 64 +; GFX8-NEXT: s_sub_i32 s22, 64, s0 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: s_cselect_b32 s26, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s18, 0 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 -; GFX8-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 -; GFX8-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 -; GFX8-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX8-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX8-NEXT: s_lshl_b64 s[24:25], s[10:11], s22 +; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX8-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX8-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX8-NEXT: s_lshr_b32 s24, s5, 31 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] -; GFX8-NEXT: s_sub_i32 s9, s10, 64 -; GFX8-NEXT: s_sub_i32 s11, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 
s10, 64 -; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_lshr_b32 s22, s5, 31 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 +; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] +; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s20 +; GFX8-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX8-NEXT: s_not_b32 s16, s20 +; GFX8-NEXT: s_sub_i32 s18, s6, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s6 +; GFX8-NEXT: s_cmp_lt_u32 s6, 64 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 -; GFX8-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX8-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] -; GFX8-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX8-NEXT: s_cmp_lg_u32 s20, 0 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 +; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX8-NEXT: s_sub_i32 s18, s8, 64 -; GFX8-NEXT: s_sub_i32 s16, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] +; GFX8-NEXT: s_and_b32 s4, s20, 0x7f +; GFX8-NEXT: s_sub_i32 s18, s4, 64 +; GFX8-NEXT: s_sub_i32 s16, 64, s4 +; GFX8-NEXT: s_cmp_lt_u32 s4, 64 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 -; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cselect_b32 s21, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[12:13], 
s20 ; GFX8-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] -; GFX8-NEXT: s_cmp_lg_u32 s20, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX8-NEXT: s_cmp_lg_u32 s21, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] -; GFX8-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] +; GFX8-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX9-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_lshr_b32 s24, s1, 31 -; GFX9-NEXT: s_mov_b32 s25, 0 -; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] -; GFX9-NEXT: s_sub_i32 s19, s16, 64 -; GFX9-NEXT: s_sub_i32 s17, 64, s16 -; GFX9-NEXT: s_cmp_lt_u32 s16, 64 -; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s16, 0 +; GFX9-NEXT: s_lshr_b32 s22, s1, 31 +; GFX9-NEXT: s_mov_b32 s23, 0 +; GFX9-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] +; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX9-NEXT: s_not_b32 s17, s16 +; GFX9-NEXT: s_sub_i32 s21, s2, 64 +; GFX9-NEXT: s_sub_i32 s22, 64, s2 +; GFX9-NEXT: s_cmp_lt_u32 s2, 64 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX9-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; 
GFX9-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 -; GFX9-NEXT: s_cmp_lg_u32 s24, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s29, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 +; GFX9-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 +; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX9-NEXT: s_sub_i32 s24, s18, 64 -; GFX9-NEXT: s_sub_i32 s22, 64, s18 -; GFX9-NEXT: s_cmp_lt_u32 s18, 64 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] +; GFX9-NEXT: s_cmp_lg_u32 s29, 0 +; GFX9-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19] +; GFX9-NEXT: s_and_b32 s0, s16, 0x7f +; GFX9-NEXT: s_sub_i32 s21, s0, 64 +; GFX9-NEXT: s_sub_i32 s22, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: s_cselect_b32 s26, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s18, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cselect_b32 s27, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 -; GFX9-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 -; GFX9-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 -; GFX9-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX9-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s22 +; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX9-NEXT: s_or_b64 
s[0:1], s[2:3], s[8:9] -; GFX9-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX9-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX9-NEXT: s_lshr_b32 s24, s5, 31 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] -; GFX9-NEXT: s_sub_i32 s9, s10, 64 -; GFX9-NEXT: s_sub_i32 s11, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 -; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_lshr_b32 s22, s5, 31 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 +; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] +; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s20 +; GFX9-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX9-NEXT: s_not_b32 s16, s20 +; GFX9-NEXT: s_sub_i32 s18, s6, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s6 +; GFX9-NEXT: s_cmp_lt_u32 s6, 64 +; GFX9-NEXT: s_cselect_b32 s19, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 -; GFX9-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX9-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] -; GFX9-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX9-NEXT: s_sub_i32 s18, s8, 64 -; GFX9-NEXT: s_sub_i32 s16, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] +; GFX9-NEXT: 
s_and_b32 s4, s20, 0x7f +; GFX9-NEXT: s_sub_i32 s18, s4, 64 +; GFX9-NEXT: s_sub_i32 s16, 64, s4 +; GFX9-NEXT: s_cmp_lt_u32 s4, 64 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 -; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s21, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[12:13], s20 ; GFX9-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] -; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] -; GFX9-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] +; GFX9-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX10-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s22, s1, 31 -; GFX10-NEXT: s_mov_b32 s23, 0 +; GFX10-NEXT: s_lshr_b32 s18, s1, 31 +; GFX10-NEXT: s_mov_b32 s19, 0 +; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[22:23] -; GFX10-NEXT: s_sub_i32 s19, s16, 64 -; GFX10-NEXT: s_sub_i32 s17, 64, s16 -; GFX10-NEXT: s_cmp_lt_u32 s16, 64 -; GFX10-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s16, 0 +; GFX10-NEXT: 
s_or_b64 s[2:3], s[2:3], s[18:19] +; GFX10-NEXT: s_not_b32 s18, s16 +; GFX10-NEXT: s_sub_i32 s21, s17, 64 +; GFX10-NEXT: s_sub_i32 s22, 64, s17 +; GFX10-NEXT: s_cmp_lt_u32 s17, 64 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s17 -; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 -; GFX10-NEXT: s_cmp_lg_u32 s22, 0 -; GFX10-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s17, 0 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s22 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s18 +; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s18 +; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s21 ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s22, s18, 64 -; GFX10-NEXT: s_sub_i32 s19, 64, s18 -; GFX10-NEXT: s_cmp_lt_u32 s18, 64 +; GFX10-NEXT: s_and_b32 s0, s16, 0x7f +; GFX10-NEXT: s_sub_i32 s18, s0, 64 +; GFX10-NEXT: s_sub_i32 s17, 64, s0 +; GFX10-NEXT: s_cmp_lt_u32 s0, 64 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cselect_b32 s26, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s18, 0 -; GFX10-NEXT: s_cselect_b32 s27, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s18 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[10:11], s19 -; GFX10-NEXT: s_lshr_b64 s[18:19], s[10:11], s18 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] -; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s22 -; GFX10-NEXT: s_cmp_lg_u32 s26, 0 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s16 +; GFX10-NEXT: s_lshl_b64 s[22:23], s[10:11], s17 +; GFX10-NEXT: s_lshr_b64 s[16:17], 
s[10:11], s16 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_cmp_lg_u32 s27, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s26, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[18:19], 0 -; GFX10-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] +; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: s_lshr_b32 s22, s5, 31 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX10-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] +; GFX10-NEXT: s_lshr_b32 s18, s5, 31 +; GFX10-NEXT: s_andn2_b32 s8, 0x7f, s20 +; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[22:23] -; GFX10-NEXT: s_sub_i32 s9, s10, 64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 -; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[16:17], s[4:5], s11 -; GFX10-NEXT: s_lshl_b64 s[18:19], s[6:7], s10 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s21, 0 -; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] +; GFX10-NEXT: s_not_b32 s16, s20 ; GFX10-NEXT: s_sub_i32 s18, s8, 64 ; GFX10-NEXT: s_sub_i32 s9, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; 
GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], s8 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[14:15], s9 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], s8 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17] -; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s9 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s16 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s18 +; GFX10-NEXT: s_cmp_lg_u32 s19, 0 +; GFX10-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX10-NEXT: s_and_b32 s4, s20, 0x7f +; GFX10-NEXT: s_sub_i32 s18, s4, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s4 +; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: s_cselect_b32 s19, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], s20 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[14:15], s8 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[14:15], s20 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], s18 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15] -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5] ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX10-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_v2i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[16:17], 0x7f, s[16:17] ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshr_b32 s22, 
s1, 31 -; GFX11-NEXT: s_mov_b32 s23, 0 +; GFX11-NEXT: s_lshr_b32 s18, s1, 31 +; GFX11-NEXT: s_mov_b32 s19, 0 +; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[22:23] -; GFX11-NEXT: s_sub_i32 s19, s16, 64 -; GFX11-NEXT: s_sub_i32 s17, 64, s16 -; GFX11-NEXT: s_cmp_lt_u32 s16, 64 -; GFX11-NEXT: s_cselect_b32 s22, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s16, 0 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19] +; GFX11-NEXT: s_not_b32 s18, s16 +; GFX11-NEXT: s_sub_i32 s21, s17, 64 +; GFX11-NEXT: s_sub_i32 s22, 64, s17 +; GFX11-NEXT: s_cmp_lt_u32 s17, 64 ; GFX11-NEXT: s_cselect_b32 s28, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s17 -; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX11-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 -; GFX11-NEXT: s_cmp_lg_u32 s22, 0 -; GFX11-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s17, 0 +; GFX11-NEXT: s_cselect_b32 s17, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s22 +; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s18 +; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s18 +; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s21 ; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s22, s18, 64 -; GFX11-NEXT: s_sub_i32 s19, 64, s18 -; GFX11-NEXT: s_cmp_lt_u32 s18, 64 +; GFX11-NEXT: s_and_b32 s0, s16, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s18, s0, 64 +; GFX11-NEXT: s_sub_i32 s17, 64, s0 +; GFX11-NEXT: s_cmp_lt_u32 s0, 64 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cselect_b32 
s26, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s18, 0 -; GFX11-NEXT: s_cselect_b32 s27, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s18 -; GFX11-NEXT: s_lshl_b64 s[24:25], s[10:11], s19 -; GFX11-NEXT: s_lshr_b64 s[18:19], s[10:11], s18 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] -; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s22 -; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s16 +; GFX11-NEXT: s_lshl_b64 s[22:23], s[10:11], s17 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[10:11], s16 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX11-NEXT: s_cmp_lg_u32 s27, 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s26, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[18:19], 0 -; GFX11-NEXT: s_and_not1_b64 s[10:11], 0x7f, s[20:21] +; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX11-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX11-NEXT: s_lshr_b32 s22, s5, 31 ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX11-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] +; GFX11-NEXT: s_lshr_b32 s18, s5, 31 +; GFX11-NEXT: s_and_not1_b32 s8, 0x7f, s20 +; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[22:23] -; GFX11-NEXT: s_sub_i32 s9, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 -; GFX11-NEXT: s_cselect_b32 s21, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s11 -; GFX11-NEXT: s_lshl_b64 s[18:19], s[6:7], s10 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX11-NEXT: s_cmp_lg_u32 
s20, 0 -; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s21, 0 -; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] +; GFX11-NEXT: s_not_b32 s16, s20 ; GFX11-NEXT: s_sub_i32 s18, s8, 64 ; GFX11-NEXT: s_sub_i32 s9, 64, s8 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 ; GFX11-NEXT: s_cselect_b32 s19, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s8, 0 -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], s8 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[14:15], s9 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], s8 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17] -; GFX11-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s9 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s16 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s18 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX11-NEXT: s_and_b32 s4, s20, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s18, s4, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s4 +; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: s_cselect_b32 s19, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], s20 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[14:15], s8 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[14:15], s20 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], s18 ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15] -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 ; 
GFX11-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5] ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: ; return to shader part epilog @@ -7649,68 +7711,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-LABEL: v_fshr_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX6-NEXT: v_not_b32_e32 v16, v16 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], 1 +; GFX6-NEXT: v_lshl_b64 v[17:18], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v24 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[16:17], v0 -; GFX6-NEXT: v_lshl_b64 v[18:19], v[2:3], v24 -; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v24 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[16:17], v24 -; GFX6-NEXT: v_or_b32_e32 v18, v0, v18 -; GFX6-NEXT: v_or_b32_e32 v19, v1, v19 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[16:17], v25 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX6-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX6-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v23 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v23 +; GFX6-NEXT: v_not_b32_e32 v0, v16 +; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], v0 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 +; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v19 +; GFX6-NEXT: v_lshl_b64 v[23:24], 
v[17:18], v19 +; GFX6-NEXT: v_or_b32_e32 v21, v0, v21 +; GFX6-NEXT: v_or_b32_e32 v22, v1, v22 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v25 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX6-NEXT: v_and_b32_e32 v22, 0x7f, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v22 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v22 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[10:11], v2 -; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v23 +; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v22 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v23 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX6-NEXT: v_not_b32_e32 v8, v20 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v22 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v19 +; GFX6-NEXT: v_not_b32_e32 v4, v20 +; GFX6-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v4 +; GFX6-NEXT: 
v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v18 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v19 -; GFX6-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX6-NEXT: v_subrev_i32_e32 v20, vcc, 64, v19 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v19 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 +; GFX6-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v18 ; GFX6-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX6-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v20 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v19 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 ; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX6-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v18 @@ -7729,8 +7791,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX6-NEXT: v_or_b32_e32 v3, v21, v3 ; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX6-NEXT: v_or_b32_e32 v6, v10, v6 @@ -7740,68 +7802,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-LABEL: v_fshr_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX8-NEXT: v_not_b32_e32 v16, v16 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], 1, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v24 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[16:17] -; GFX8-NEXT: v_lshlrev_b64 v[18:19], v24, v[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v24 -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v24, v[16:17] -; GFX8-NEXT: v_or_b32_e32 v18, v0, v18 -; GFX8-NEXT: v_or_b32_e32 v19, v1, v19 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[16:17] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v23 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v23, v[8:9] +; GFX8-NEXT: v_not_b32_e32 v0, v16 +; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v19 +; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] +; GFX8-NEXT: v_or_b32_e32 v21, v0, v21 +; GFX8-NEXT: v_or_b32_e32 v22, v1, v22 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX8-NEXT: v_and_b32_e32 v22, 0x7f, v16 +; 
GFX8-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v22 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v23 +; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v22 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX8-NEXT: v_not_b32_e32 v8, v20 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 ; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX8-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11] ; GFX8-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v19 +; GFX8-NEXT: v_not_b32_e32 v4, v20 +; GFX8-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v18 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v19, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX8-NEXT: v_subrev_u32_e32 v20, vcc, 64, v19 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] 
+; GFX8-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] ; GFX8-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX8-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v20, v[8:9] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 ; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v18 @@ -7820,8 +7882,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v21, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX8-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX8-NEXT: v_or_b32_e32 v6, v10, v6 @@ -7831,68 +7893,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-LABEL: v_fshr_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX9-NEXT: v_not_b32_e32 v16, v16 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v24 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 
v0, v[16:17] -; GFX9-NEXT: v_lshlrev_b64 v[18:19], v24, v[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v24 -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v24, v[16:17] -; GFX9-NEXT: v_or_b32_e32 v18, v0, v18 -; GFX9-NEXT: v_or_b32_e32 v19, v1, v19 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[16:17] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v23, v[8:9] +; GFX9-NEXT: v_not_b32_e32 v0, v16 +; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v19 +; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] +; GFX9-NEXT: v_or_b32_e32 v21, v0, v21 +; GFX9-NEXT: v_or_b32_e32 v22, v1, v22 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_and_b32_e32 v22, 0x7f, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v23 +; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v22 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX9-NEXT: v_lshrrev_b64 
v[16:17], v23, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX9-NEXT: v_not_b32_e32 v8, v20 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, 64, v19 +; GFX9-NEXT: v_not_b32_e32 v4, v20 +; GFX9-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11] +; GFX9-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v4 +; GFX9-NEXT: v_sub_u32_e32 v4, 64, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v19, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX9-NEXT: v_subrev_u32_e32 v20, 64, v19 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] +; GFX9-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX9-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v20, v[8:9] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; 
GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc ; GFX9-NEXT: v_sub_u32_e32 v6, 64, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc @@ -7911,8 +7973,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX9-NEXT: v_or_b32_e32 v3, v21, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX9-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v10, v6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 404e726246f4d..81abe91b283f9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -2787,52 +2787,51 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-LABEL: v_sdiv_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v1, v5 +; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v5 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v7, 0xffffff, v0 +; CGP-NEXT: v_rcp_f32_e32 v7, v3 ; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v5 -; CGP-NEXT: v_rcp_f32_e32 v8, 
v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 -; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v1 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v4, 0 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v0 +; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v0 +; CGP-NEXT: v_mov_b32_e32 v0, v4 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v0, 0 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v6 +; CGP-NEXT: v_mul_lo_u32 v4, v1, v5 +; CGP-NEXT: v_mul_lo_u32 v0, v0, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v1 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v4, v1, v8, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mov_b32_e32 v5, v1 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v1, v5, v3 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v5 -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v5 ; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v6, v0 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v7, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 -; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v4 ; CGP-NEXT: v_mov_b32_e32 v7, v1 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v5 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_mul_lo_u32 
v8, v7, v6 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v7 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v2, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v3 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; CGP-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 3729f1cc2b12d..183f2edbf9035 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -563,18 +563,21 @@ define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s0, 0x3fff ; GFX8-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_i32_zext_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i32_zext_i16: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x3fff ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog %and = and i16 %x, 16383 %ext = zext i16 %and to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 5b94e71ecf52e..cfac0c2fa56aa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -3286,45 +3286,45 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-LABEL: v_srem_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v1, v5 +; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v5 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v7, 0xffffff, v0 +; CGP-NEXT: v_rcp_f32_e32 v7, v3 ; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v5 -; CGP-NEXT: v_rcp_f32_e32 v8, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 -; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v1 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v4, 0 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v0 +; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v0 +; CGP-NEXT: v_mov_b32_e32 v0, v4 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v0, 0 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v6 +; CGP-NEXT: v_mul_lo_u32 v0, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v4, v1, v5 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v7, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v0, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v3, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v1, v6 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index e31d8e95bd608..1ee521b3dedac 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -2147,26 +2147,26 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; CGP-NEXT: 
v_and_b32_e32 v1, 0xffffff, v2 -; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v3 -; CGP-NEXT: v_rcp_f32_e32 v4, v2 +; CGP-NEXT: v_rcp_f32_e32 v4, v1 ; CGP-NEXT: v_rcp_f32_e32 v5, v3 ; CGP-NEXT: v_mul_f32_e32 v4, v0, v4 -; CGP-NEXT: v_mul_f32_e32 v5, v1, v5 +; CGP-NEXT: v_mul_f32_e32 v5, v2, v5 ; CGP-NEXT: v_trunc_f32_e32 v4, v4 ; CGP-NEXT: v_trunc_f32_e32 v5, v5 -; CGP-NEXT: v_mad_f32 v0, -v4, v2, v0 +; CGP-NEXT: v_mad_f32 v0, -v4, v1, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_mad_f32 v1, -v5, v3, v1 +; CGP-NEXT: v_mad_f32 v2, -v5, v3, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v2 +; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v1 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, v3 +; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, v3 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index f30b278b3e611..a7e5ce3d21619 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -2561,12 +2561,12 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 -; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 -; 
CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 ; CGP-NEXT: v_rcp_f32_e32 v8, v5 ; CGP-NEXT: v_rcp_f32_e32 v9, v7 @@ -2584,10 +2584,10 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mul_lo_u32 v2, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v1, v4, v1 ; CGP-NEXT: v_mul_lo_u32 v3, v5, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v2, v3 ; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v1 ; CGP-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll index 9ea9fa91e4f92..1b35a89ad7f93 100644 --- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll +++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -278,7 +278,6 @@ define amdgpu_ps i64 @s_csh_64_0(i64 inreg %a, i64 inreg %b) { ; ; GISEL-LABEL: s_csh_64_0: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], 63 ; GISEL-NEXT: s_lshl_b64 s[4:5], s[0:1], s2 ; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2 ; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 @@ -310,7 +309,6 @@ define amdgpu_ps i64 @s_csh_64_1(i64 inreg %a, i64 inreg %b) { ; ; GISEL-LABEL: s_csh_64_1: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], 0xff ; GISEL-NEXT: s_lshl_b64 s[4:5], s[0:1], s2 ; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2 ; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index a0b549711f339..93e14a205f05d 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1592,7 +1592,7 @@ define amdgpu_kernel void 
@v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 +; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 24 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] @@ -1837,7 +1837,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 25, v1 +; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 25 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0