flambda-backend: Float32 min/max/rounding intrinsics (#2684)

TheNumbat · web-flow · commit e48d83cf8827 · 2024-06-13T12:07:01.000-04:00
diff --git a/otherlibs/stdlib_beta/float32.ml b/otherlibs/stdlib_beta/float32.ml
@@ -246,6 +246,16 @@ let[@inline] max (x : t) (y : t) =
   else if is_nan y then y
   else x
 
+module With_weird_nan_behavior = struct
+  external min : t -> t -> t
+    = "caml_sse_float32_min_bytecode" "caml_sse_float32_min"
+    [@@noalloc] [@@unboxed] [@@builtin]
+
+  external max : t -> t -> t
+    = "caml_sse_float32_max_bytecode" "caml_sse_float32_max"
+    [@@noalloc] [@@unboxed] [@@builtin]
+end
+
 let[@inline] min_max (x : t) (y : t) =
   if is_nan x || is_nan y then (nan, nan)
   else if y > x || ((not (sign_bit y)) && sign_bit x) then (x, y)
@@ -267,6 +277,24 @@ let[@inline] min_max_num (x : t) (y : t) =
   else if y > x || ((not (sign_bit y)) && sign_bit x) then (x, y)
   else (y, x)
 
+external iround_half_to_even : t -> int64
+  = "caml_sse_cast_float32_int64_bytecode" "caml_sse_cast_float32_int64"
+  [@@noalloc] [@@unboxed] [@@builtin]
+
+external round_intrinsic : (int[@untagged]) -> (t[@unboxed]) -> (t[@unboxed])
+  = "caml_sse41_float32_round_bytecode" "caml_sse41_float32_round"
+  [@@noalloc] [@@builtin]
+
+(* On amd64, these constants also imply _MM_FROUND_NO_EXC (suppress exceptions). *)
+let round_neg_inf = 0x9
+let round_pos_inf = 0xA
+let round_zero = 0xB
+let round_current_mode = 0xC
+let[@inline] round_half_to_even x = round_intrinsic round_current_mode x
+let[@inline] round_down x = round_intrinsic round_neg_inf x
+let[@inline] round_up x = round_intrinsic round_pos_inf x
+let[@inline] round_towards_zero x = round_intrinsic round_zero x
+
 external seeded_hash_param : int -> int -> int -> 'a -> int = "caml_hash_exn"
   [@@noalloc]
 
diff --git a/otherlibs/stdlib_beta/float32.mli b/otherlibs/stdlib_beta/float32.mli
@@ -374,17 +374,14 @@ external erfc : t -> t = "caml_erfc_float32_bytecode" "erfcf"
 external trunc : t -> t = "caml_trunc_float32_bytecode" "truncf"
   [@@unboxed] [@@noalloc]
 (** [trunc x] rounds [x] to the nearest integer whose absolute value is
-   less than or equal to [x]. *)
+    less than or equal to [x]. *)
 
 external round : t -> t = "caml_round_float32_bytecode" "roundf"
   [@@unboxed] [@@noalloc]
 (** [round x] rounds [x] to the nearest integer with ties (fractional
-   values of 0.5s) rounded away from zero, regardless of the current
-   rounding direction.  If [x] is an integer, [+0.s], [-0.s], [nan], or
-   infinite, [x] itself is returned.
-
-   On 64-bit mingw-w64, this function may be emulated owing to a bug in the
-   C runtime library (CRT) on this platform. *)
+    values of 0.5s) rounded away from zero, regardless of the current
+    rounding direction.  If [x] is an integer, [+0.s], [-0.s], [nan], or
+    infinite, [x] itself is returned. *)
 
 external ceil : t -> t = "caml_ceil_float32_bytecode" "ceilf"
   [@@unboxed] [@@noalloc]
@@ -461,6 +458,24 @@ val max : t -> t -> t
 (** [max x y] returns the maximum of [x] and [y].  It returns [nan]
    when [x] or [y] is [nan].  Moreover [max (-0.s) (+0.s) = +0.s] *)
 
+module With_weird_nan_behavior : sig
+  external min : t -> t -> t
+    = "caml_sse_float32_min_bytecode" "caml_sse_float32_min"
+    [@@noalloc] [@@unboxed] [@@builtin]
+  (** [min x y] returns the minimum of [x] and [y].
+      If either [x] or [y] is [nan], [y] is returned.
+      If both [x] and [y] equal zero, [y] is returned.
+      The amd64 flambda-backend compiler translates this call to MINSS. *)
+
+  external max : t -> t -> t
+    = "caml_sse_float32_max_bytecode" "caml_sse_float32_max"
+    [@@noalloc] [@@unboxed] [@@builtin]
+  (** [max x y] returns the maximum of [x] and [y].
+      If either [x] or [y] is [nan], [y] is returned.
+      If both [x] and [y] equal zero, [y] is returned.
+      The amd64 flambda-backend compiler translates this call to MAXSS. *)
+end
+
 val min_max : t -> t -> t * t
 (** [min_max x y] is [(min x y, max x y)], just more efficient. *)
 
@@ -479,6 +494,34 @@ val min_max_num : t -> t -> t * t
    efficient.  Note that in particular [min_max_num x nan = (x, x)]
    and [min_max_num nan y = (y, y)]. *)
 
+external iround_half_to_even : t -> int64
+  = "caml_sse_cast_float32_int64_bytecode" "caml_sse_cast_float32_int64"
+  [@@noalloc] [@@unboxed] [@@builtin]
+(** Rounds a [float32] to an [int64] using the current rounding mode. The default
+    rounding mode is "round half to even", and we expect that no program will
+    change the rounding mode.
+    If the argument is NaN or infinite or if the rounded value cannot be
+    represented, then the result is unspecified.
+    The amd64 flambda-backend compiler translates this call to CVTSS2SI. *)
+
+val round_half_to_even : t -> t
+(** Rounds a [float32] to an integer [float32] using the current rounding
+    mode.  The default rounding mode is "round half to even", and we
+    expect that no program will change the rounding mode.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS. *)
+
+val round_down : t -> t
+(** Rounds a [float32] down to the next integer [float32] toward negative infinity.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS.*)
+
+val round_up : t -> t
+(** Rounds a [float32] up to the next integer [float32] toward positive infinity.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS.*)
+
+val round_towards_zero : t -> t
+(** Rounds a [float32] to the next integer [float32] toward zero.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS.*)
+
 val seeded_hash : int -> t -> int
 (** A seeded hash function for floats, with the same output value as
     {!Hashtbl.seeded_hash}. This function allows this module to be passed as
diff --git a/otherlibs/stdlib_beta/float32_u.ml b/otherlibs/stdlib_beta/float32_u.ml
@@ -184,6 +184,22 @@ let[@inline always] min x y = of_float32 (Float32.min (to_float32 x) (to_float32
 
 let[@inline always] max x y = of_float32 (Float32.max (to_float32 x) (to_float32 y))
 
+module With_weird_nan_behavior = struct
+  let[@inline always] min x y = of_float32 (Float32.With_weird_nan_behavior.min (to_float32 x) (to_float32 y))
+
+  let[@inline always] max x y = of_float32 (Float32.With_weird_nan_behavior.max (to_float32 x) (to_float32 y))
+end
+
 let[@inline always] min_num x y = of_float32 (Float32.min_num (to_float32 x) (to_float32 y))
 
 let[@inline always] max_num x y = of_float32 (Float32.max_num (to_float32 x) (to_float32 y))
+
+let iround_half_to_even x = unbox_int64 (Float32.iround_half_to_even (to_float32 x))
+
+let round_half_to_even x = of_float32 (Float32.round_half_to_even (to_float32 x))
+
+let round_down x = of_float32 (Float32.round_down (to_float32 x))
+
+let round_up x = of_float32 (Float32.round_up (to_float32 x))
+
+let round_towards_zero x = of_float32 (Float32.round_towards_zero (to_float32 x))
diff --git a/otherlibs/stdlib_beta/float32_u.mli b/otherlibs/stdlib_beta/float32_u.mli
@@ -364,6 +364,20 @@ val max : t -> t -> t
 (** [max x y] returns the maximum of [x] and [y].  It returns [nan]
     when [x] or [y] is [nan].  Moreover [max #-0.s #+0.s = #+0.s] *)
 
+module With_weird_nan_behavior : sig
+    val min : t -> t -> t
+    (** [min x y] returns the minimum of [x] and [y].
+        If either [x] or [y] is [nan], [y] is returned.
+        If both [x] and [y] equal zero, [y] is returned.
+        The amd64 flambda-backend compiler translates this call to MINSS. *)
+
+    val max : t -> t -> t
+    (** [max x y] returns the maximum of [x] and [y].
+        If either [x] or [y] is [nan], [y] is returned.
+        If both [x] and [y] equal zero, [y] is returned.
+        The amd64 flambda-backend compiler translates this call to MAXSS. *)
+end
+
 val min_num : t -> t -> t
 (** [min_num x y] returns the minimum of [x] and [y] treating [nan] as
     missing values.  If both [x] and [y] are [nan], [nan] is returned.
@@ -374,5 +388,31 @@ val max_num : t -> t -> t
     missing values.  If both [x] and [y] are [nan] [nan] is returned.
     Moreover [max_num #-0.s #+0.s = #+0.s] *)
 
+val iround_half_to_even : t -> int64#
+(** Rounds a [float32#] to an [int64#] using the current rounding mode. The default
+    rounding mode is "round half to even", and we expect that no program will
+    change the rounding mode.
+    If the argument is NaN or infinite or if the rounded value cannot be
+    represented, then the result is unspecified.
+    The amd64 flambda-backend compiler translates this call to CVTSS2SI. *)
+
+val round_half_to_even : t -> t
+(** Rounds a [float32#] to an integer [float32#] using the current rounding
+    mode.  The default rounding mode is "round half to even", and we
+    expect that no program will change the rounding mode.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS. *)
+
+val round_down : t -> t
+(** Rounds a [float32#] down to the next integer [float32#] toward negative infinity.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS.*)
+
+val round_up : t -> t
+(** Rounds a [float32#] up to the next integer [float32#] toward positive infinity.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS.*)
+
+val round_towards_zero : t -> t
+(** Rounds a [float32#] to the next integer [float32#] toward zero.
+    The amd64 flambda-backend compiler translates this call to ROUNDSS.*)
+
 (* CR layouts v5: add back hash when we deal with the ad-hoc polymorphic
    functions. *)
diff --git a/runtime/float32.c b/runtime/float32.c
@@ -309,6 +309,51 @@ CAMLprim value caml_ldexp_float32_bytecode(value f, value i)
   return caml_copy_float32(caml_ldexp_float32(Float32_val(f), Int_val(i)));
 }
 
+float caml_sse_float32_min(float x, float y) {
+  return x < y ? x : y;
+}
+
+CAMLprim value caml_sse_float32_min_bytecode(value x, value y) {
+  return Float32_val(x) < Float32_val(y) ? x : y;
+}
+
+float caml_sse_float32_max(float x, float y) {
+  return x > y ? x : y;
+}
+
+CAMLprim value caml_sse_float32_max_bytecode(value x, value y) {
+  return Float32_val(x) > Float32_val(y) ? x : y;
+}
+
+int64_t caml_sse_cast_float32_int64(float f)
+{
+  return llrintf(f);
+}
+
+CAMLprim value caml_sse_cast_float32_int64_bytecode(value f)
+{
+  return caml_copy_int64(caml_sse_cast_float32_int64(Float32_val(f)));
+}
+
+#define ROUND_NEG_INF 0x9
+#define ROUND_POS_INF 0xA
+#define ROUND_ZERO 0xB
+#define ROUND_CURRENT 0xC
+
+float caml_sse41_float32_round(int mode, float f) {
+  switch(mode) {
+  case ROUND_NEG_INF: return floorf(f);
+  case ROUND_POS_INF: return ceilf(f);
+  case ROUND_ZERO:    return truncf(f);
+  case ROUND_CURRENT: return rintf(f);
+  default: caml_fatal_error("Unknown rounding mode.");
+  }
+}
+
+CAMLprim value caml_sse41_float32_round_bytecode(value mode, value f) {
+  return caml_copy_float32(caml_sse41_float32_round(Int_val(mode), Float32_val(f)));
+}
+
 enum { FP_normal, FP_subnormal, FP_zero, FP_infinite, FP_nan };
 
 value caml_classify_float32(float vf)
diff --git a/runtime4/float32.c b/runtime4/float32.c
@@ -309,6 +309,51 @@ CAMLprim value caml_ldexp_float32_bytecode(value f, value i)
   return caml_copy_float32(caml_ldexp_float32(Float32_val(f), Int_val(i)));
 }
 
+float caml_sse_float32_min(float x, float y) {
+  return x < y ? x : y;
+}
+
+CAMLprim value caml_sse_float32_min_bytecode(value x, value y) {
+  return Float32_val(x) < Float32_val(y) ? x : y;
+}
+
+float caml_sse_float32_max(float x, float y) {
+  return x > y ? x : y;
+}
+
+CAMLprim value caml_sse_float32_max_bytecode(value x, value y) {
+  return Float32_val(x) > Float32_val(y) ? x : y;
+}
+
+int64_t caml_sse_cast_float32_int64(float f)
+{
+  return llrintf(f);
+}
+
+CAMLprim value caml_sse_cast_float32_int64_bytecode(value f)
+{
+  return caml_copy_int64(caml_sse_cast_float32_int64(Float32_val(f)));
+}
+
+#define ROUND_NEG_INF 0x9
+#define ROUND_POS_INF 0xA
+#define ROUND_ZERO 0xB
+#define ROUND_CURRENT 0xC
+
+float caml_sse41_float32_round(int mode, float f) {
+  switch(mode) {
+  case ROUND_NEG_INF: return floorf(f);
+  case ROUND_POS_INF: return ceilf(f);
+  case ROUND_ZERO:    return truncf(f);
+  case ROUND_CURRENT: return rintf(f);
+  default: caml_fatal_error("Unknown rounding mode.");
+  }
+}
+
+CAMLprim value caml_sse41_float32_round_bytecode(value mode, value f) {
+  return caml_copy_float32(caml_sse41_float32_round(Int_val(mode), Float32_val(f)));
+}
+
 enum { FP_normal, FP_subnormal, FP_zero, FP_infinite, FP_nan };
 
 value caml_classify_float32(float vf)