llvm · mariusz-sikora-at-amd · Jan 24, 2024 · Jan 15, 2024 · Jan 17, 2024 · Jan 18, 2024
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -8257,15 +8257,6 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
     ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
   }
 
-  if (isCvt_F32_Fp8_Bf8_e64(Opc) && Opc != AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 &&
-      Opc != AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12) {
-    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I++]);
-    Op.addRegOrImmWithFPInputModsOperands(Inst, 1); // src0
-    // Add dummy src1
-    Inst.addOperand(MCOperand::createImm(0));
-    Inst.addOperand(MCOperand::createReg(0));
-  }
-
   for (unsigned E = Operands.size(); I != E; ++I) {
     AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
     if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
@@ -8818,13 +8809,6 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
       Fi = Op.getImm();
     } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
       Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
-      if (isCvt_F32_Fp8_Bf8_e64(Inst.getOpcode()) &&
-          Inst.getOpcode() != AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 &&
-          Inst.getOpcode() != AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12) {
-        // Add dummy src1
-        Inst.addOperand(MCOperand::createImm(0));
-        Inst.addOperand(MCOperand::createReg(0));
-      }
     } else if (Op.isReg()) {
       Op.addRegOperands(Inst, 1);
     } else if (Op.isImm() &&

diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1305,9 +1305,17 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
                                    const MCSubtargetInfo &STI,
                                    raw_ostream &O) {
   unsigned Opc = MI->getOpcode();
-  if (isPermlane16(Opc) || (isCvt_F32_Fp8_Bf8_e64(Opc) &&
-                            Opc != AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 &&
-                            Opc != AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12)) {
+  if (isCvt_F32_Fp8_Bf8_e64(Opc)) {
+    auto SrcMod =
+        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+    unsigned Mod = MI->getOperand(SrcMod).getImm();
+    unsigned Index0 = !!(Mod & SISrcMods::OP_SEL_0);
+    unsigned Index1 = !!(Mod & SISrcMods::OP_SEL_1);
+    if (Index0 || Index1)
+      O << " op_sel:[" << Index0 << ',' << Index1 << ']';
+    return;
+  }
+  if (isPermlane16(Opc)) {
     auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
     auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
     unsigned FI = !!(MI->getOperand(FIN).getImm() & SISrcMods::OP_SEL_0);

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2282,6 +2282,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
   field bit IsSingle = 0;
   field bit IsWMMA = 0;
 
+  field bit IsFP8 = 0;
+
   field bit HasDst = !ne(DstVT.Value, untyped.Value);
   field bit HasDst32 = HasDst;
   field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case

diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -641,26 +641,16 @@ let SubtargetPredicate = isGFX9Only in {
 
 // Similar to VOPProfile_Base_CVT_F32_F8, but for VOP3 instructions.
 def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F <v2f32, i32> {
-  let InsVOP3OpSel = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0,
-                          clampmod:$clamp, omod:$omod, op_sel0:$op_sel);
-
   let HasOpSel = 1;
   let HasExtVOP3DPP = 0;
 }
 
-def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, i32, untyped]> {
-  let InsVOP3OpSel = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0,
-                          Src1Mod:$src1_modifiers, Src1RC64:$src1,
-                          clampmod:$clamp, omod:$omod, op_sel0:$op_sel);
-  let AsmVOP3OpSel = !subst(", $src1_modifiers", "", getAsmVOP3OpSel<2, 0, 0, 1, 1, 0>.ret);
-
+def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, untyped, untyped]> {
   let HasOpSel = 1;
   let HasExtDPP = 1;
   let HasExtVOP3DPP = 1;
-
+  let IsFP8 = 1;
   let Src1VOP3DPP = Src1RC64;
-  let AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3OpSel>.ret;
-  let AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3OpSel>.ret;
 }
 
 let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 0,
@@ -675,9 +665,10 @@ class Cvt_F32_F8_Pat_OpSel<SDPatternOperator node, bits<2> index,
     VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat<
     (f32 (node i32:$src, index)),
     !if (index,
-         (inst_e64 !if(index{0}, SRCMODS.OP_SEL_0, 0), $src,
-                   !if(index{1}, SRCMODS.OP_SEL_0, 0), (i32 0),
-                   0, 0, 0),
+         (inst_e64 !if(index{0},
+                         !if(index{1}, 12 /*SRCMODS.OP_SEL_0 | SRCMODS.OP_SEL_1*/, SRCMODS.OP_SEL_0),
+                         !if(index{1}, SRCMODS.OP_SEL_1, 0)),
+                    $src, 0, 0, 0),
          (inst_e32 $src))
 >;
 

diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -305,6 +305,11 @@ class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
 
 class VOP3OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>;
 
+class VOP3FP8OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
+  let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0);
+  let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0);
+}
+
 class VOP3DotOpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11_gfx12<op, p>{
   let Inst{11} = ?;
   let Inst{12} = ?;
@@ -738,7 +743,7 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
   let Inst{10}    = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
   // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
   let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?);
-  let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, 0),?);
+  let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?);
   let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?);
   let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?);
   let Inst{15}    = !if(P.HasClamp, clamp, 0);
@@ -1406,14 +1411,20 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
   defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
   let AsmString = asmName # ps.AsmOperands,
       IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
-  if ps.Pfl.HasOpSel then
-    def _e64#Gen.Suffix :
-      VOP3_Real_Gen<ps, Gen>,
-      VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
-  if !not(ps.Pfl.HasOpSel) then
-    def _e64#Gen.Suffix :
-      VOP3_Real_Gen<ps, Gen>,
-      VOP3e_gfx11_gfx12<op, ps.Pfl>;
+    if ps.Pfl.IsFP8 then {
+      def _e64#Gen.Suffix :
+        VOP3_Real_Gen<ps, Gen>,
+        VOP3FP8OpSel_gfx11_gfx12<op, ps.Pfl>;
+    } else {
+      if ps.Pfl.HasOpSel then
+        def _e64#Gen.Suffix :
+          VOP3_Real_Gen<ps, Gen>,
+          VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
+      if !not(ps.Pfl.HasOpSel) then
+        def _e64#Gen.Suffix :
+          VOP3_Real_Gen<ps, Gen>,
+          VOP3e_gfx11_gfx12<op, ps.Pfl>;
+    }
   }
   def Gen.Suffix#"_VOP3_alias" : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[Gen.AssemblerPredicate]>, LetDummies;
 }

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir
@@ -35,12 +35,12 @@ body:             |
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64_dpp [[DEF]], 8, [[COPY]], 4, 0, 0, 0, 0, 228, 15, 15, 1, implicit $mode, implicit $exec
+    ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64_dpp [[DEF]], 8, [[COPY]], 0, 0, 0, 228, 15, 15, 1, implicit $mode, implicit $exec
     ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_dpp]]
     ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec
-    %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 4, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 0, 0, 0, implicit $mode, implicit $exec
     $vgpr0 = COPY %2
     SI_RETURN_TO_EPILOG $vgpr0
 
@@ -57,12 +57,12 @@ body:             |
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64_dpp [[DEF]], 4, [[COPY]], 4, 0, 0, 0, 0, 228, 15, 15, 1, implicit $mode, implicit $exec
+    ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64_dpp [[DEF]], 12, [[COPY]], 0, 0, 0, 228, 15, 15, 1, implicit $mode, implicit $exec
     ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_dpp]]
     ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec
-    %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 4, killed %1, 4, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed %1, 0, 0, 0, implicit $mode, implicit $exec
     $vgpr0 = COPY %2
     SI_RETURN_TO_EPILOG $vgpr0