Initial experiments (with integer regs for fp16).

JonPsson1 · JonPsson1 · commit d12b868bd9be · 2024-10-29T11:00:12.000+01:00
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
@@ -91,11 +91,23 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo {
                       "-v128:64-a:8:16-n32:64");
     }
     MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 128;
+
+    // True if the backend supports operations on the half LLVM IR type.
+    HasLegalHalfType = false;
+    // Allow half arguments and return values.
+    HalfArgsAndReturns = true;
+    // Support _Float16.
+    HasFloat16 = true;
+
     HasStrictFP = true;
   }
 
   unsigned getMinGlobalAlign(uint64_t Size, bool HasNonWeakDef) const override;
 
+  bool useFP16ConversionIntrinsics() const override {
+    return false;
+  }
+
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override;
 
diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp
@@ -185,6 +185,8 @@ bool SystemZABIInfo::isFPArgumentType(QualType Ty) const {
 
   if (const BuiltinType *BT = Ty->getAs<BuiltinType>())
     switch (BT->getKind()) {
+//  case BuiltinType::Half:     // __fp16    Support __fp16??
+    case BuiltinType::Float16:  // _Float16
     case BuiltinType::Float:
     case BuiltinType::Double:
       return true;
@@ -277,7 +279,8 @@ RValue SystemZABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
   } else {
     if (AI.getCoerceToType())
       ArgTy = AI.getCoerceToType();
-    InFPRs = (!IsSoftFloatABI && (ArgTy->isFloatTy() || ArgTy->isDoubleTy()));
+    InFPRs = (!IsSoftFloatABI &&
+              (ArgTy->isHalfTy() || ArgTy->isFloatTy() || ArgTy->isDoubleTy()));
     IsVector = ArgTy->isVectorTy();
     UnpaddedSize = TyInfo.Width;
     DirectAlign = TyInfo.Align;
@@ -446,10 +449,11 @@ ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const {
 
     // The structure is passed as an unextended integer, a float, or a double.
     if (isFPArgumentType(SingleElementTy)) {
-      assert(Size == 32 || Size == 64);
+      assert(Size == 16 || Size == 32 || Size == 64);
       return ABIArgInfo::getDirect(
-          Size == 32 ? llvm::Type::getFloatTy(getVMContext())
-                     : llvm::Type::getDoubleTy(getVMContext()));
+          Size == 16 ? llvm::Type::getHalfTy(getVMContext())
+                     : Size == 32 ? llvm::Type::getFloatTy(getVMContext())
+                                  : llvm::Type::getDoubleTy(getVMContext()));
     } else {
       llvm::IntegerType *PassTy = llvm::IntegerType::get(getVMContext(), Size);
       return Size <= 32 ? ABIArgInfo::getNoExtend(PassTy)
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
@@ -16548,7 +16548,7 @@ ExprResult Sema::BuildVAArgExpr(SourceLocation BuiltinLoc,
           PromoteType = QualType();
       }
     }
-    if (TInfo->getType()->isSpecificBuiltinType(BuiltinType::Float))
+    if (TInfo->getType()->isFloat16Type() || TInfo->getType()->isFloat32Type())
       PromoteType = Context.DoubleTy;
     if (!PromoteType.isNull())
       DiagRuntimeBehavior(TInfo->getTypeLoc().getBeginLoc(), E,
diff --git a/clang/test/CodeGen/SystemZ/fexcess-precision.c b/clang/test/CodeGen/SystemZ/fexcess-precision.c
@@ -0,0 +1,85 @@
+// RUN: %clang_cc1 -triple s390x-linux-gnu \
+// RUN: -ffloat16-excess-precision=standard -emit-llvm -o - %s \
+// RUN: | FileCheck %s -check-prefix=STANDARD
+
+// RUN: %clang_cc1 -triple s390x-linux-gnu \
+// RUN: -ffloat16-excess-precision=none -emit-llvm -o - %s \
+// RUN: | FileCheck %s -check-prefix=NONE
+
+// RUN: %clang_cc1 -triple s390x-linux-gnu \
+// RUN: -ffloat16-excess-precision=fast -emit-llvm -o - %s \
+// RUN: | FileCheck %s -check-prefix=FAST
+
+_Float16 f(_Float16 a, _Float16 b, _Float16 c, _Float16 d) {
+    return a * b + c * d;
+}
+
+// STANDARD-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// STANDARD-NEXT:  entry:
+// STANDARD-NEXT:    %a.addr = alloca half, align 2
+// STANDARD-NEXT:    %b.addr = alloca half, align 2
+// STANDARD-NEXT:    %c.addr = alloca half, align 2
+// STANDARD-NEXT:    %d.addr = alloca half, align 2
+// STANDARD-NEXT:    store half %a, ptr %a.addr, align 2
+// STANDARD-NEXT:    store half %b, ptr %b.addr, align 2
+// STANDARD-NEXT:    store half %c, ptr %c.addr, align 2
+// STANDARD-NEXT:    store half %d, ptr %d.addr, align 2
+// STANDARD-NEXT:    %0 = load half, ptr %a.addr, align 2
+// STANDARD-NEXT:    %ext = fpext half %0 to float
+// STANDARD-NEXT:    %1 = load half, ptr %b.addr, align 2
+// STANDARD-NEXT:    %ext1 = fpext half %1 to float
+// STANDARD-NEXT:    %mul = fmul float %ext, %ext1
+// STANDARD-NEXT:    %2 = load half, ptr %c.addr, align 2
+// STANDARD-NEXT:    %ext2 = fpext half %2 to float
+// STANDARD-NEXT:    %3 = load half, ptr %d.addr, align 2
+// STANDARD-NEXT:    %ext3 = fpext half %3 to float
+// STANDARD-NEXT:    %mul4 = fmul float %ext2, %ext3
+// STANDARD-NEXT:    %add = fadd float %mul, %mul4
+// STANDARD-NEXT:    %unpromotion = fptrunc float %add to half
+// STANDARD-NEXT:    ret half %unpromotion
+// STANDARD-NEXT:  }
+
+// NONE-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// NONE-NEXT:  entry:
+// NONE-NEXT:    %a.addr = alloca half, align 2
+// NONE-NEXT:    %b.addr = alloca half, align 2
+// NONE-NEXT:    %c.addr = alloca half, align 2
+// NONE-NEXT:    %d.addr = alloca half, align 2
+// NONE-NEXT:    store half %a, ptr %a.addr, align 2
+// NONE-NEXT:    store half %b, ptr %b.addr, align 2
+// NONE-NEXT:    store half %c, ptr %c.addr, align 2
+// NONE-NEXT:    store half %d, ptr %d.addr, align 2
+// NONE-NEXT:    %0 = load half, ptr %a.addr, align 2
+// NONE-NEXT:    %1 = load half, ptr %b.addr, align 2
+// NONE-NEXT:    %mul = fmul half %0, %1
+// NONE-NEXT:    %2 = load half, ptr %c.addr, align 2
+// NONE-NEXT:    %3 = load half, ptr %d.addr, align 2
+// NONE-NEXT:    %mul1 = fmul half %2, %3
+// NONE-NEXT:    %add = fadd half %mul, %mul1
+// NONE-NEXT:    ret half %add
+// NONE-NEXT:  }
+
+// FAST-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
+// FAST-NEXT:  entry:
+// FAST-NEXT:    %a.addr = alloca half, align 2
+// FAST-NEXT:    %b.addr = alloca half, align 2
+// FAST-NEXT:    %c.addr = alloca half, align 2
+// FAST-NEXT:    %d.addr = alloca half, align 2
+// FAST-NEXT:    store half %a, ptr %a.addr, align 2
+// FAST-NEXT:    store half %b, ptr %b.addr, align 2
+// FAST-NEXT:    store half %c, ptr %c.addr, align 2
+// FAST-NEXT:    store half %d, ptr %d.addr, align 2
+// FAST-NEXT:    %0 = load half, ptr %a.addr, align 2
+// FAST-NEXT:    %ext = fpext half %0 to float
+// FAST-NEXT:    %1 = load half, ptr %b.addr, align 2
+// FAST-NEXT:    %ext1 = fpext half %1 to float
+// FAST-NEXT:    %mul = fmul float %ext, %ext1
+// FAST-NEXT:    %2 = load half, ptr %c.addr, align 2
+// FAST-NEXT:    %ext2 = fpext half %2 to float
+// FAST-NEXT:    %3 = load half, ptr %d.addr, align 2
+// FAST-NEXT:    %ext3 = fpext half %3 to float
+// FAST-NEXT:    %mul4 = fmul float %ext2, %ext3
+// FAST-NEXT:    %add = fadd float %mul, %mul4
+// FAST-NEXT:    %unpromotion = fptrunc float %add to half
+// FAST-NEXT:    ret half %unpromotion
+// FAST-NEXT:  }
diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c
@@ -45,6 +45,9 @@ long long pass_longlong(long long arg) { return arg; }
 __int128 pass_int128(__int128 arg) { return arg; }
 // CHECK-LABEL: define{{.*}} void @pass_int128(ptr dead_on_unwind noalias writable sret(i128) align 8 %{{.*}}, ptr %0)
 
+_Float16 pass__Float16(_Float16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} half @pass__Float16(half %{{.*}})
+
 float pass_float(float arg) { return arg; }
 // CHECK-LABEL: define{{.*}} float @pass_float(float %{{.*}})
 
@@ -72,6 +75,9 @@ _Complex long pass_complex_long(_Complex long arg) { return arg; }
 _Complex long long pass_complex_longlong(_Complex long long arg) { return arg; }
 // CHECK-LABEL: define{{.*}} void @pass_complex_longlong(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr %{{.*}}arg)
 
+_Complex _Float16 pass_complex__Float16(_Complex _Float16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_complex__Float16(ptr dead_on_unwind noalias writable sret({ half, half }) align 2 %{{.*}}, ptr %{{.*}}arg)
+
 _Complex float pass_complex_float(_Complex float arg) { return arg; }
 // CHECK-LABEL: define{{.*}} void @pass_complex_float(ptr dead_on_unwind noalias writable sret({ float, float }) align 4 %{{.*}}, ptr %{{.*}}arg)
 
@@ -123,6 +129,11 @@ struct agg_16byte pass_agg_16byte(struct agg_16byte arg) { return arg; }
 
 // Float-like aggregate types
 
+struct agg__Float16 { _Float16 a; };
+struct agg__Float16 pass_agg__Float16(struct agg__Float16 arg) { return arg; }
+// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, half %{{.*}})
+// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, i16 noext %{{.*}})
+
 struct agg_float { float a; };
 struct agg_float pass_agg_float(struct agg_float arg) { return arg; }
 // HARD-FLOAT-LABEL: define{{.*}} void @pass_agg_float(ptr dead_on_unwind noalias writable sret(%struct.agg_float) align 4 %{{.*}}, float %{{.*}})
@@ -137,6 +148,11 @@ struct agg_longdouble { long double a; };
 struct agg_longdouble pass_agg_longdouble(struct agg_longdouble arg) { return arg; }
 // CHECK-LABEL: define{{.*}} void @pass_agg_longdouble(ptr dead_on_unwind noalias writable sret(%struct.agg_longdouble) align 8 %{{.*}}, ptr %{{.*}})
 
+struct agg__Float16_a8 { _Float16 a __attribute__((aligned (8))); };
+struct agg__Float16_a8 pass_agg__Float16_a8(struct agg__Float16_a8 arg) { return arg; }
+// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, double %{{.*}})
+// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, i64 %{{.*}})
+
 struct agg_float_a8 { float a __attribute__((aligned (8))); };
 struct agg_float_a8 pass_agg_float_a8(struct agg_float_a8 arg) { return arg; }
 // HARD-FLOAT-LABEL: define{{.*}} void @pass_agg_float_a8(ptr dead_on_unwind noalias writable sret(%struct.agg_float_a8) align 8 %{{.*}}, double %{{.*}})
@@ -164,6 +180,10 @@ struct agg_nofloat3 pass_agg_nofloat3(struct agg_nofloat3 arg) { return arg; }
 
 // Union types likewise are *not* float-like aggregate types
 
+union union__Float16 { _Float16 a; };
+union union__Float16 pass_union__Float16(union union__Float16 arg) { return arg; }
+// CHECK-LABEL: define{{.*}} void @pass_union__Float16(ptr dead_on_unwind noalias writable sret(%union.union__Float16) align 2 %{{.*}}, i16 noext %{{.*}})
+
 union union_float { float a; };
 union union_float pass_union_float(union union_float arg) { return arg; }
 // CHECK-LABEL: define{{.*}} void @pass_union_float(ptr dead_on_unwind noalias writable sret(%union.union_float) align 4 %{{.*}}, i32 noext %{{.*}})
@@ -441,6 +461,30 @@ struct agg_8byte va_agg_8byte(__builtin_va_list l) { return __builtin_va_arg(l,
 // CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
 // CHECK: ret void
 
+struct agg__Float16 va_agg__Float16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg__Float16); }
+// CHECK-LABEL: define{{.*}} void @va_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, ptr %{{.*}}
+// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
+// SOFT-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 0
+// CHECK: [[REG_COUNT:%[^ ]+]] = load i64, ptr [[REG_COUNT_PTR]]
+// HARD-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 4
+// SOFT-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5
+// CHECK: br i1 [[FITS_IN_REGS]],
+// CHECK: [[SCALED_REG_COUNT:%[^ ]+]] = mul i64 [[REG_COUNT]], 8
+// HARD-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 128
+// SOFT-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 22
+// CHECK: [[REG_SAVE_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 3
+// CHECK: [[REG_SAVE_AREA:%[^ ]+]] = load ptr, ptr [[REG_SAVE_AREA_PTR:[^ ]+]]
+// CHECK: [[RAW_REG_ADDR:%[^ ]+]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i64 [[REG_OFFSET]]
+// CHECK: [[REG_COUNT1:%[^ ]+]] = add i64 [[REG_COUNT]], 1
+// CHECK: store i64 [[REG_COUNT1]], ptr [[REG_COUNT_PTR]]
+// CHECK: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 2
+// CHECK: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load ptr, ptr [[OVERFLOW_ARG_AREA_PTR]]
+// CHECK: [[RAW_MEM_ADDR:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 6
+// CHECK: [[OVERFLOW_ARG_AREA2:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 8
+// CHECK: store ptr [[OVERFLOW_ARG_AREA2]], ptr [[OVERFLOW_ARG_AREA_PTR]]
+// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
+// CHECK: ret void
+
 struct agg_float va_agg_float(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_float); }
 // CHECK-LABEL: define{{.*}} void @va_agg_float(ptr dead_on_unwind noalias writable sret(%struct.agg_float) align 4 %{{.*}}, ptr %{{.*}}
 // HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -50,6 +50,7 @@ def RetCC_SystemZ_ELF : CallingConv<[
   // other floating-point argument registers available for code that
   // doesn't care about the ABI.  All floating-point argument registers
   // are call-clobbered, so we can use all of them here.
+  CCIfType<[f16], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
   CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
   CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
 
@@ -115,6 +116,7 @@ def CC_SystemZ_ELF : CallingConv<[
   CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
 
   // The first 4 float and double arguments are passed in even registers F0-F6.
+  CCIfType<[f16], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
   CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
   CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
 
@@ -138,7 +140,7 @@ def CC_SystemZ_ELF : CallingConv<[
              CCAssignToStack<16, 8>>>,
 
   // Other arguments are passed in 8-byte-aligned 8-byte stack slots.
-  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+  CCIfType<[i32, i64, f16, f32, f64], CCAssignToStack<8, 8>>
 ]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -711,6 +711,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::BITCAST, MVT::f32, Custom);
   }
 
+  // Expand FP16 <=> FP32 conversions to libcalls and handle FP16 loads and
+  // stores in GPRs.
+  setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+
   // VASTART and VACOPY need to deal with the SystemZ-specific varargs
   // structure, but VAEND is a no-op.
   setOperationAction(ISD::VASTART, MVT::Other, Custom);
@@ -784,6 +791,20 @@ bool SystemZTargetLowering::useSoftFloat() const {
   return Subtarget.hasSoftFloat();
 }
 
+MVT SystemZTargetLowering::getRegisterTypeForCallingConv(
+      LLVMContext &Context, CallingConv::ID CC,
+      EVT VT) const {
+  // 128-bit single-element vector types are passed like other vectors,
+  // not like their element type.
+  if (VT.isVector() && VT.getSizeInBits() == 128 &&
+      VT.getVectorNumElements() == 1)
+    return MVT::v16i8;
+  // Keep f16 so that they can be recognized and handled.
+  if (VT == MVT::f16)
+    return MVT::f16;
+  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+}
+
 EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
                                               LLVMContext &, EVT VT) const {
   if (!VT.isVector())
@@ -1602,6 +1623,15 @@ bool SystemZTargetLowering::splitValueIntoRegisterParts(
     return true;
   }
 
+  // Convert f16 to f32 (Out-arg).
+  if (PartVT == MVT::f16) {
+    assert(NumParts == 1 && "");
+    SDValue I16Val = DAG.getBitcast(MVT::i16, Val);
+    SDValue I32Val = DAG.getAnyExtOrTrunc(I16Val, DL, MVT::i32);
+    Parts[0] = DAG.getBitcast(MVT::f32, I32Val);
+    return true;
+  }
+
   return false;
 }
 
@@ -1617,6 +1647,18 @@ SDValue SystemZTargetLowering::joinRegisterPartsIntoValue(
   return SDValue();
 }
 
+// F32Val holds a f16 value in f32, return it as an f16 (In-arg). The
+// CopyFromReg was made into an f32 as required as FP32 registers are used
+// for arguments, now convert it to f16.
+static SDValue convertF32ToF16(SDValue F32Val, SelectionDAG &DAG,
+                               const SDLoc &DL) {
+  assert(F32Val->getOpcode() == ISD::CopyFromReg &&
+         "Only expecting to handle f16 with CopyFromReg here.");
+  SDValue I32Val = DAG.getBitcast(MVT::i32, F32Val);
+  SDValue I16Val = DAG.getAnyExtOrTrunc(I32Val, DL, MVT::i16);
+  return DAG.getBitcast(MVT::f16, I16Val);
+}
+
 SDValue SystemZTargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -1656,6 +1698,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
         NumFixedGPRs += 1;
         RC = &SystemZ::GR64BitRegClass;
         break;
+      case MVT::f16:
       case MVT::f32:
         NumFixedFPRs += 1;
         RC = &SystemZ::FP32BitRegClass;
@@ -1680,7 +1723,11 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
 
       Register VReg = MRI.createVirtualRegister(RC);
       MRI.addLiveIn(VA.getLocReg(), VReg);
-      ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
+      // Special handling is needed for f16.
+      MVT ArgVT = VA.getLocVT() == MVT::f16 ? MVT::f32 : VA.getLocVT();
+      ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, ArgVT);
+      if (VA.getLocVT() == MVT::f16)
+        ArgValue = convertF32ToF16(ArgValue, DAG, DL);
     } else {
       assert(VA.isMemLoc() && "Argument not register or memory");
 
@@ -1700,9 +1747,12 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
       // from this parameter.  Unpromoted ints and floats are
       // passed as right-justified 8-byte values.
       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
-      if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
+      if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32 ||
+          VA.getLocVT() == MVT::f16) {
+        unsigned SlotOffs = VA.getLocVT() == MVT::f16 ? 6 : 4;
         FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
-                          DAG.getIntPtrConstant(4, DL));
+                          DAG.getIntPtrConstant(SlotOffs, DL));
+      }
       ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
                              MachinePointerInfo::getFixedStack(MF, FI));
     }
@@ -2121,10 +2171,14 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
   // Copy all of the result registers out of their specified physreg.
   for (CCValAssign &VA : RetLocs) {
     // Copy the value out, gluing the copy to the end of the call sequence.
+    // Special handling is needed for f16.
+    MVT ArgVT = VA.getLocVT() == MVT::f16 ? MVT::f32 : VA.getLocVT();
     SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
-                                          VA.getLocVT(), Glue);
+                                          ArgVT, Glue);
     Chain = RetValue.getValue(1);
     Glue = RetValue.getValue(2);
+    if (VA.getLocVT() == MVT::f16)
+      RetValue = convertF32ToF16(RetValue, DAG, DL);
 
     // Convert the value of the return register into the value that's
     // being returned.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
diff --git a/llvm/test/CodeGen/SystemZ/fp-half.ll b/llvm/test/CodeGen/SystemZ/fp-half.ll

Original file line number	Diff line number	Diff line change
`@@ -16548,7 +16548,7 @@ ExprResult Sema::BuildVAArgExpr(SourceLocation BuiltinLoc,`
`16548`	`16548`	`PromoteType = QualType();`
`16549`	`16549`	`}`
`16550`	`16550`	`}`
`16551`		`- if (TInfo->getType()->isSpecificBuiltinType(BuiltinType::Float))`
	`16551`	`+ if (TInfo->getType()->isFloat16Type() \|\| TInfo->getType()->isFloat32Type())`
`16552`	`16552`	`PromoteType = Context.DoubleTy;`
`16553`	`16553`	`if (!PromoteType.isNull())`
`16554`	`16554`	`DiagRuntimeBehavior(TInfo->getTypeLoc().getBeginLoc(), E,`