-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[InstSimplify] Fold xor using implied conditions #75609
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu
Author: Yingwei Zheng (dtcxzyw)
Changes: This patch folds `xor X, Y` into `true` if we prove `X` is not equal to `Y`. Fixes #70928.
Full diff: https://github.com/llvm/llvm-project/pull/75609.diff
3 Files Affected:
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 2a45acf63aa2ca..26ae9b472ff3d6 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -2563,6 +2563,22 @@ static Value *simplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
if (Value *V = simplifyByDomEq(Instruction::Xor, Op0, Op1, Q, MaxRecurse))
return V;
+ if (Op0->getType()->isIntOrIntVectorTy(1)) {
+ bool InvalidTable[2][2] = {};
+ if (std::optional<bool> Implied = isImpliedCondition(Op0, Op1, Q.DL, false))
+ InvalidTable[0][!*Implied] = true;
+ if (std::optional<bool> Implied = isImpliedCondition(Op0, Op1, Q.DL, true))
+ InvalidTable[1][!*Implied] = true;
+ if (std::optional<bool> Implied = isImpliedCondition(Op1, Op0, Q.DL, false))
+ InvalidTable[!*Implied][0] = true;
+ if (std::optional<bool> Implied = isImpliedCondition(Op1, Op0, Q.DL, true))
+ InvalidTable[!*Implied][1] = true;
+
+ if (InvalidTable[0][0] && InvalidTable[1][1])
+ return ConstantInt::getTrue(Op0->getType());
+ // NOTE: There would be no benefit to handle other cases.
+ }
+
return nullptr;
}
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index c3ac778f82e049..2b723662fb1cc2 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -237,33 +237,14 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) {
}
define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) {
-; GFX1032-LABEL: test_vop3_cmp_i32_sop_xor:
-; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v1
-; GFX1032-NEXT: v_cmp_gt_i32_e64 s0, 1, v1
-; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, s0
-; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX1032-NEXT: s_endpgm
-;
-; GFX1064-LABEL: test_vop3_cmp_i32_sop_xor:
-; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1
-; GFX1064-NEXT: v_cmp_gt_i32_e64 s[0:1], 1, v1
-; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, s[0:1]
-; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX1064-NEXT: s_endpgm
+; GCN-LABEL: test_vop3_cmp_i32_sop_xor:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
%load = load i32, ptr addrspace(1) %gep, align 4
diff --git a/llvm/test/Transforms/InstCombine/xor-icmps.ll b/llvm/test/Transforms/InstCombine/xor-icmps.ll
index c85993ea9a7e0d..8ef5465d32e845 100644
--- a/llvm/test/Transforms/InstCombine/xor-icmps.ll
+++ b/llvm/test/Transforms/InstCombine/xor-icmps.ll
@@ -171,3 +171,80 @@ define i1 @xor_icmp_ptr(ptr %c, ptr %d) {
ret i1 %xor
}
+; Tests from PR70928
+define i1 @xor_icmp_true1(i32 %x, i32 %y) {
+; CHECK-LABEL: @xor_icmp_true1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret i1 true
+;
+entry:
+ %add = add nsw i32 %y, 1
+ %cmp1 = icmp sgt i32 %x, %y
+ %cmp2 = icmp slt i32 %x, %add
+ %xor = xor i1 %cmp1, %cmp2
+ ret i1 %xor
+}
+
+define i1 @xor_icmp_true2(i32 %x, i32 %y) {
+; CHECK-LABEL: @xor_icmp_true2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret i1 true
+;
+entry:
+ %sub = add nsw i32 %y, -1
+ %cmp1 = icmp slt i32 %x, %y
+ %cmp2 = icmp sgt i32 %x, %sub
+ %xor = xor i1 %cmp1, %cmp2
+ ret i1 %xor
+}
+
+define i1 @xor_icmp_true3(i32 %a) {
+; CHECK-LABEL: @xor_icmp_true3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret i1 true
+;
+entry:
+ %cmp = icmp sgt i32 %a, 5
+ %cmp1 = icmp slt i32 %a, 6
+ %cmp3 = xor i1 %cmp, %cmp1
+ ret i1 %cmp3
+}
+
+define i1 @xor_icmp_true4(i32 %a) {
+; CHECK-LABEL: @xor_icmp_true4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret i1 true
+;
+entry:
+ %cmp = icmp slt i32 %a, 5
+ %cmp1 = icmp sgt i32 %a, 4
+ %cmp3 = xor i1 %cmp, %cmp1
+ ret i1 %cmp3
+}
+
+define i1 @xor_icmp_true4_commuted(i32 %a) {
+; CHECK-LABEL: @xor_icmp_true4_commuted(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret i1 true
+;
+entry:
+ %cmp = icmp slt i32 %a, 5
+ %cmp1 = icmp sgt i32 %a, 4
+ %cmp3 = xor i1 %cmp1, %cmp
+ ret i1 %cmp3
+}
+
+define i1 @xor_icmp_failed_to_imply(i32 %a) {
+; CHECK-LABEL: @xor_icmp_failed_to_imply(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 7
+; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[A]], 4
+; CHECK-NEXT: [[CMP3:%.*]] = xor i1 [[CMP]], [[CMP1]]
+; CHECK-NEXT: ret i1 [[CMP3]]
+;
+entry:
+ %cmp = icmp slt i32 %a, 7
+ %cmp1 = icmp sgt i32 %a, 4
+ %cmp3 = xor i1 %cmp, %cmp1
+ ret i1 %cmp3
+}
|
In terms of high-level approach for the motivating case, I wonder whether it may make sense to implement an exact set-xor operation on ConstantRange and then handle this in foldAndOrOfICmpsUsingRanges. This would allow handling cases where the xor can be folded into a new icmp, rather than just folded to true. |
This patch folds `xor X, Y` into `true` if we prove `X` is not equal to `Y`.
Alive2: https://alive2.llvm.org/ce/z/Kin_R7
This can be improved by handling other cases (e.g., `InvalidTable[0][1] && InvalidTable[1][0] --> false`), but I don't see the benefit of that.
Compile-time impact: http://llvm-compile-time-tracker.com/compare.php?from=f505a2f8f2d8cf6d17713c3a4703a1cd35f32d66&to=1d0d7fa017a337da4c24bb697fd9a97bcf9001f7&stat=instructions:u
Fixes #70928.