[AArch64] Use pattern to select bf16 fpextend (llvm#137212)

john-brawn-arm · GeorgeARM · commit 1e56e59d0edb · 2025-05-07T16:30:06.000+01:00
Currently bf16 fpextend is lowered to a vector shift. Instead leave it
as fpextend and have an instruction selection pattern which selects to a
shift later. Doing this means that DAGCombiner patterns for fpextend
will be applied, leading to better codegen. It also means that in some
situations we use a mov instruction where we previously have a dup
instruction, but I don't think this makes any difference.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -766,13 +766,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(Op, MVT::v8bf16, Expand);
   }
 
-  // For bf16, fpextend is custom lowered to be optionally expanded into shifts.
-  setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
+  // fpextend from f16 or bf16 to f32 is legal
+  setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
+  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Legal);
+  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f32, Legal);
+  // fpextend from bf16 to f64 needs to be split into two fpextends
   setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
-  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Custom);
-  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
   setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
-  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f32, Custom);
 
   auto LegalizeNarrowFP = [this](MVT ScalarVT) {
     for (auto Op : {
@@ -4559,33 +4560,6 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
     return SDValue();
   }
 
-  if (VT.getScalarType() == MVT::f32) {
-    // FP16->FP32 extends are legal for v32 and v4f32.
-    if (Op0VT.getScalarType() == MVT::f16)
-      return Op;
-    if (Op0VT.getScalarType() == MVT::bf16) {
-      SDLoc DL(Op);
-      EVT IVT = VT.changeTypeToInteger();
-      if (!Op0VT.isVector()) {
-        Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4bf16, Op0);
-        IVT = MVT::v4i32;
-      }
-
-      EVT Op0IVT = Op0.getValueType().changeTypeToInteger();
-      SDValue Ext =
-          DAG.getNode(ISD::ANY_EXTEND, DL, IVT, DAG.getBitcast(Op0IVT, Op0));
-      SDValue Shift =
-          DAG.getNode(ISD::SHL, DL, IVT, Ext, DAG.getConstant(16, DL, IVT));
-      if (!Op0VT.isVector())
-        Shift = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Shift,
-                            DAG.getConstant(0, DL, MVT::i64));
-      Shift = DAG.getBitcast(VT, Shift);
-      return IsStrict ? DAG.getMergeValues({Shift, Op.getOperand(0)}, DL)
-                      : Shift;
-    }
-    return SDValue();
-  }
-
   assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
   return SDValue();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8513,6 +8513,26 @@ def : InstAlias<"uxtl2 $dst.2d, $src1.4s",
                 (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
 }
 
+// fpextend from bf16 to f32 is just a shift left by 16
+let Predicates = [HasNEON] in {
+def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))),
+          (f32 (EXTRACT_SUBREG
+            (v4i32 (SHLLv4i16 (v4i16 (SUBREG_TO_REG (i64 0), (bf16 FPR16:$Rn), hsub)))),
+            ssub))>;
+def : Pat<(v4f32 (any_fpextend (v4bf16 V64:$Rn))),
+          (SHLLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (any_fpextend (extract_high_v8bf16 (v8bf16 V128:$Rn)))),
+          (SHLLv8i16 V128:$Rn)>;
+}
+// Fallback pattern for when we don't have NEON
+def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))),
+          (f32 (COPY_TO_REGCLASS
+            (i32 (UBFMWri (COPY_TO_REGCLASS
+                            (f32 (SUBREG_TO_REG (i32 0), (bf16 FPR16:$Rn), hsub)),
+                            GPR32),
+                          (i64 16), (i64 15))),
+            FPR32))>;
+
 def abs_f16 :
   OutPatFrag<(ops node:$Rn),
              (EXTRACT_SUBREG (f32 (COPY_TO_REGCLASS
diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
@@ -155,9 +155,7 @@ entry:
 define i32 @fptosi_bf(bfloat %a) nounwind ssp {
 ; CHECK-LABEL: fptosi_bf:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s1, s0
-; CHECK-NEXT:    // implicit-def: $d0
-; CHECK-NEXT:    fmov s0, s1
+; CHECK-NEXT:    // kill: def $d0 killed $h0
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fcvtzs w0, s0
@@ -171,9 +169,7 @@ entry:
 define i32 @fptoui_sbf(bfloat %a) nounwind ssp {
 ; CHECK-LABEL: fptoui_sbf:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s1, s0
-; CHECK-NEXT:    // implicit-def: $d0
-; CHECK-NEXT:    fmov s0, s1
+; CHECK-NEXT:    // kill: def $d0 killed $h0
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fcvtzu w0, s0
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
@@ -641,15 +641,15 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
 ; NOLSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NOLSE-NEXT:    dup v1.4h, v0.h[1]
+; NOLSE-NEXT:    mov h1, v0.h[1]
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
 ; NOLSE-NEXT:    shll v0.4s, v0.4h, #16
 ; NOLSE-NEXT:    shll v1.4s, v1.4h, #16
 ; NOLSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxr w9, [x0]
 ; NOLSE-NEXT:    fmov s2, w9
-; NOLSE-NEXT:    dup v3.4h, v2.h[1]
+; NOLSE-NEXT:    mov h3, v2.h[1]
 ; NOLSE-NEXT:    shll v2.4s, v2.4h, #16
 ; NOLSE-NEXT:    fmaxnm s2, s2, s0
 ; NOLSE-NEXT:    shll v3.4s, v3.4h, #16
@@ -677,14 +677,14 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
 ; LSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; LSE-NEXT:    dup v1.4h, v0.h[1]
+; LSE-NEXT:    mov h1, v0.h[1]
 ; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr s0, [x0]
 ; LSE-NEXT:    shll v1.4s, v1.4h, #16
 ; LSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    dup v3.4h, v0.h[1]
+; LSE-NEXT:    mov h3, v0.h[1]
 ; LSE-NEXT:    shll v4.4s, v0.4h, #16
 ; LSE-NEXT:    fmaxnm s4, s4, s2
 ; LSE-NEXT:    shll v3.4s, v3.4h, #16
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
@@ -641,15 +641,15 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
 ; NOLSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NOLSE-NEXT:    dup v1.4h, v0.h[1]
+; NOLSE-NEXT:    mov h1, v0.h[1]
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
 ; NOLSE-NEXT:    shll v0.4s, v0.4h, #16
 ; NOLSE-NEXT:    shll v1.4s, v1.4h, #16
 ; NOLSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxr w9, [x0]
 ; NOLSE-NEXT:    fmov s2, w9
-; NOLSE-NEXT:    dup v3.4h, v2.h[1]
+; NOLSE-NEXT:    mov h3, v2.h[1]
 ; NOLSE-NEXT:    shll v2.4s, v2.4h, #16
 ; NOLSE-NEXT:    fminnm s2, s2, s0
 ; NOLSE-NEXT:    shll v3.4s, v3.4h, #16
@@ -677,14 +677,14 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
 ; LSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; LSE-NEXT:    dup v1.4h, v0.h[1]
+; LSE-NEXT:    mov h1, v0.h[1]
 ; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr s0, [x0]
 ; LSE-NEXT:    shll v1.4s, v1.4h, #16
 ; LSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    dup v3.4h, v0.h[1]
+; LSE-NEXT:    mov h3, v0.h[1]
 ; LSE-NEXT:    shll v4.4s, v0.4h, #16
 ; LSE-NEXT:    fminnm s4, s4, s2
 ; LSE-NEXT:    shll v3.4s, v3.4h, #16
diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
@@ -202,16 +202,13 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fmadd:
 ; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $d2
 ; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
 ; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $d2
 ; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-BF16-NEXT:    fmul s0, s0, s1
-; CHECK-BF16-NEXT:    shll v1.4s, v2.4h, #16
-; CHECK-BF16-NEXT:    bfcvt h0, s0
-; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-BF16-NEXT:    fadd s0, s0, s1
+; CHECK-BF16-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-BF16-NEXT:    fmadd s0, s0, s1, s2
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %mul = fmul fast bfloat %a, %b
@@ -1996,13 +1993,11 @@ define bfloat @test_copysign_f64(bfloat %a, double %b) #0 {
 define float @test_copysign_extended(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_copysign_extended:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    movi v2.4s, #16
 ; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    ushl v0.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
@@ -2013,16 +2008,12 @@ define float @test_copysign_extended(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-SD-LABEL: test_copysign_extended:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    movi v2.4s, #16
 ; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    ushl v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-SD-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-SD-NEXT:    bif v0.16b, v1.16b, v2.16b
-; CHECK-SD-NEXT:    bfcvt h0, s0
+; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-SD-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-SD-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/bf16_fast_math.ll b/llvm/test/CodeGen/AArch64/bf16_fast_math.ll
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll