llvm · Pierre-vh · Nov 30, 2023 · Nov 21, 2023 · Nov 28, 2023 · Nov 28, 2023
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1993,12 +1993,14 @@ def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">,
   [IntrNoMem, IntrSpeculatable]
 >;
 
-def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
+// mul24 intrinsics can return i32 or i64.
+// When returning i64, they're lowered to a mul24/mulhi24 pair.
+def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_anyint_ty],
   [llvm_i32_ty, llvm_i32_ty],
   [IntrNoMem, IntrSpeculatable]
 >;
 
-def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
+def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_anyint_ty],
   [llvm_i32_ty, llvm_i32_ty],
   [IntrNoMem, IntrSpeculatable]
 >;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -624,34 +624,6 @@ static Value *insertValues(IRBuilder<> &Builder,
   return NewVal;
 }
 
-// Returns 24-bit or 48-bit (as per `NumBits` and `Size`) mul of `LHS` and
-// `RHS`. `NumBits` is the number of KnownBits of the result and `Size` is the
-// width of the original destination.
-static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
-                       unsigned Size, unsigned NumBits, bool IsSigned) {
-  if (Size <= 32 || NumBits <= 32) {
-    Intrinsic::ID ID =
-        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
-    return Builder.CreateIntrinsic(ID, {}, {LHS, RHS});
-  }
-
-  assert(NumBits <= 48);
-
-  Intrinsic::ID LoID =
-      IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
-  Intrinsic::ID HiID =
-      IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24;
-
-  Value *Lo = Builder.CreateIntrinsic(LoID, {}, {LHS, RHS});
-  Value *Hi = Builder.CreateIntrinsic(HiID, {}, {LHS, RHS});
-
-  IntegerType *I64Ty = Builder.getInt64Ty();
-  Lo = Builder.CreateZExtOrTrunc(Lo, I64Ty);
-  Hi = Builder.CreateZExtOrTrunc(Hi, I64Ty);
-
-  return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32));
-}
-
 bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
   if (I.getOpcode() != Instruction::Mul)
     return false;
@@ -691,26 +663,20 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
   extractValues(Builder, RHSVals, RHS);
 
   IntegerType *I32Ty = Builder.getInt32Ty();
-  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
-    Value *LHS, *RHS;
-    if (IsSigned) {
-      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
-      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
-    } else {
-      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
-      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
-    }
+  IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
+  Type *DstTy = LHSVals[0]->getType();
 
-    Value *Result =
-        getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned);
-
-    if (IsSigned) {
-      ResultVals.push_back(
-          Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType()));
-    } else {
-      ResultVals.push_back(
-          Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType()));
-    }
+  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
+    Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
+                          : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
+    Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
+                          : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
+    Intrinsic::ID ID =
+        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
+    Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
+    Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
+                      : Builder.CreateZExtOrTrunc(Result, DstTy);
+    ResultVals.push_back(Result);
   }
 
   Value *NewVal = insertValues(Builder, Ty, ResultVals);

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -281,11 +281,15 @@ def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>;
 def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>;
 
 // Signed and unsigned 24-bit multiply. The highest 8-bits are ignore
-// when performing the multiply. The result is a 32-bit value.
-def AMDGPUmul_u24_impl : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
+// when performing the multiply. The result is a 32 or 64 bit value.
+def AMDGPUMul24Op : SDTypeProfile<1, 2, [
+  SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>
+]>;
+
+def AMDGPUmul_u24_impl : SDNode<"AMDGPUISD::MUL_U24", AMDGPUMul24Op,
   [SDNPCommutative, SDNPAssociative]
 >;
-def AMDGPUmul_i24_impl : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
+def AMDGPUmul_i24_impl : SDNode<"AMDGPUISD::MUL_I24", AMDGPUMul24Op,
   [SDNPCommutative, SDNPAssociative]
 >;
 

diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -862,6 +862,17 @@ def :  divergent_i64_BinOp <and, V_AND_B32_e64>;
 def :  divergent_i64_BinOp <or,  V_OR_B32_e64>;
 def :  divergent_i64_BinOp <xor, V_XOR_B32_e64>;
 
+// mul24 w/ 64 bit output.
+class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
+  (i64 (Op i32:$src0, i32:$src1)),
+  (REG_SEQUENCE VReg_64,
+    (InstLo $src0, $src1), sub0,
+    (InstHi $src0, $src1), sub1)
+>;
+
+def : mul24_64_Pat<AMDGPUmul_i24, V_MUL_I32_I24_e64, V_MUL_HI_I32_I24_e64>;
+def : mul24_64_Pat<AMDGPUmul_u24, V_MUL_U32_U24_e64, V_MUL_HI_U32_U24_e64>;
+
 //===----------------------------------------------------------------------===//
 // 16-Bit Operand Instructions
 //===----------------------------------------------------------------------===//