oxcaml
diff --git a/backend/amd64/emit.mlp b/backend/amd64/emit.mlp
Lines changed: 4 additions & 1 deletion
diff --git a/backend/amd64/proc.ml b/backend/amd64/proc.ml
Lines changed: 1 addition & 1 deletion
diff --git a/backend/amd64/regalloc_stack_operands.ml b/backend/amd64/regalloc_stack_operands.ml
Lines changed: 2 additions & 1 deletion
diff --git a/backend/amd64/reload.ml b/backend/amd64/reload.ml
Lines changed: 2 additions & 1 deletion
diff --git a/backend/amd64/selection.ml b/backend/amd64/selection.ml
Lines changed: 6 additions & 0 deletions
diff --git a/backend/amd64/simd.ml b/backend/amd64/simd.ml
Lines changed: 8 additions & 3 deletions
diff --git a/backend/amd64/simd_proc.ml b/backend/amd64/simd_proc.ml
Lines changed: 1 addition & 1 deletion
diff --git a/backend/arm64/emit.mlp b/backend/arm64/emit.mlp
Lines changed: 4 additions & 2 deletions
diff --git a/backend/arm64/proc.ml b/backend/arm64/proc.ml
Lines changed: 3 additions & 1 deletion
diff --git a/backend/cfg/cfg.ml b/backend/cfg/cfg.ml
Lines changed: 1 addition & 0 deletions
diff --git a/backend/cmm.ml b/backend/cmm.ml
Lines changed: 7 additions & 13 deletions
diff --git a/backend/cmm.mli b/backend/cmm.mli
Lines changed: 3 additions & 0 deletions
@@ -1046,7 +1046,8 @@ let emit_simd_instr op i =
   | SSE High_64_to_low_64 -> I.movhlps (arg i 1) (res i 0)
   | SSE Low_64_to_high_64 -> I.movlhps (arg i 1) (res i 0)
   | SSE Interleave_high_32 -> I.unpckhps (arg i 1) (res i 0)
-  | SSE Interleave_low_32 -> I.unpcklps (arg i 1) (res i 0)
+  | SSE (Interleave_low_32 | Interleave_low_32_regs) ->
+    I.unpcklps (arg i 1) (res i 0)
   | SSE Movemask_32 -> I.movmskps (arg i 0) (res i 0)
   | SSE (Shuffle_32 n) -> I.shufps (X86_dsl.int n) (arg i 1) (res i 0)
   | SSE2 Max_scalar_f64 -> I.maxsd (arg i 1) (res i 0)
@@ -1568,6 +1569,8 @@ let emit_instr ~first ~fallthrough i =
       instr_for_floatop width floatop (arg i 1) (res i 0)
   | Lop(Iintofvalue | Ivalueofint | Ivectorcast Bits128) ->
       move i.arg.(0) i.res.(0)
+  | Lop(Iscalarcast Float32_as_float) ->
+      I.movss (arg i 0) (res i 0)
   | Lop(Iscalarcast (Float_of_int Float64)) ->
       I.cvtsi2sd  (arg i 0)  (res i 0)
   | Lop(Iscalarcast (Float_to_int Float64)) ->
 
@@ -796,7 +796,7 @@ let operation_supported = function
   | Cbswap _
   | Cclz _ | Cctz _
   | Ccmpi _ | Caddv | Cadda | Ccmpa _
-  | Cnegf _ | Cabsf _ | Caddf _ | Csubf _ | Cmulf _ | Cdivf _
+  | Cnegf _ | Cabsf _ | Caddf _ | Csubf _ | Cmulf _ | Cdivf _ | Cpackf32
   | Cvalueofint | Cintofvalue
   | Ccmpf _
   | Craise _
 
@@ -192,7 +192,8 @@ let basic (map : spilled_map) (instr : Cfg.basic Cfg.instruction) =
     May_still_have_spilled_registers
   | Op (Scalarcast (Float_of_int (Float32 | Float64) |
                     Float_to_int (Float32 | Float64) |
-                    Float_of_float32 | Float_to_float32) |
+                    Float_of_float32 | Float_to_float32 |
+                    Float32_as_float) |
                     Vectorcast _) ->
     may_use_stack_operand_for_only_argument map instr ~has_result:true
   | Op (Const_symbol _) ->
 
@@ -160,7 +160,8 @@ method! reload_operation op arg res =
         (arg', [|r|])
   | Iscalarcast (Float_of_int (Float32 | Float64) |
                  Float_to_int (Float32 | Float64) |
-                 Float_of_float32 | Float_to_float32) ->
+                 Float_of_float32 | Float_to_float32 |
+                 Float32_as_float) ->
     (* Result must be in register, but argument can be on stack *)
     (arg, (if stackp res.(0) then [| self#makereg res.(0) |] else res))
   | Iscalarcast (V128_to_scalar (Float64x2) | V128_of_scalar (Float64x2)) ->
 
@@ -298,6 +298,12 @@ method! select_operation op args dbg =
       self#select_floatarith true width Imulf Ifloatmul args
   | Cdivf width ->
       self#select_floatarith false width Idivf Ifloatdiv args
+  | Cpackf32 ->
+      (* Both arguments must be in registers. If the second argument were a
+         float stack slot, the resulting UNPCKLPS instruction would still be
+         checked as an access to a full 128-bit memory location, even though
+         it only loads 64 bits. *)
+      Ispecific (Isimd (SSE Interleave_low_32_regs)), args
   (* Special cases overriding C implementations (regardless of [@@builtin]). *)
   | Cextcall { func = ("sqrt" as func); _ }
   | Cextcall { func = ("caml_int64_bits_of_float_unboxed" as func); _ }
 
@@ -82,6 +82,7 @@ type sse_operation =
   | Low_64_to_high_64
   | Interleave_high_32
   | Interleave_low_32
+  | Interleave_low_32_regs
   | Movemask_32
   | Shuffle_32 of int
 
@@ -305,14 +306,15 @@ let equal_operation_sse l r =
   | Low_64_to_high_64, Low_64_to_high_64
   | Interleave_high_32, Interleave_high_32
   | Interleave_low_32, Interleave_low_32
+  | Interleave_low_32_regs, Interleave_low_32_regs
   | Movemask_32, Movemask_32 ->
     true
   | Cmp_f32 l, Cmp_f32 r when float_condition_equal l r -> true
   | Shuffle_32 l, Shuffle_32 r when Int.equal l r -> true
   | ( ( Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32 | Rcp_f32
       | Sqrt_f32 | Rsqrt_f32 | High_64_to_low_64 | Low_64_to_high_64
-      | Interleave_high_32 | Interleave_low_32 | Movemask_32 | Cmp_f32 _
-      | Shuffle_32 _ ),
+      | Interleave_high_32 | Interleave_low_32_regs | Interleave_low_32
+      | Movemask_32 | Cmp_f32 _ | Shuffle_32 _ ),
       _ ) ->
     false
 
@@ -637,6 +639,8 @@ let print_operation_sse printreg op ppf arg =
     fprintf ppf "interleave_high_32 %a %a" printreg arg.(0) printreg arg.(1)
   | Interleave_low_32 ->
     fprintf ppf "interleave_low_32 %a %a" printreg arg.(0) printreg arg.(1)
+  | Interleave_low_32_regs ->
+    fprintf ppf "interleave_low_32_regs %a %a" printreg arg.(0) printreg arg.(1)
 
 let print_operation_sse2 printreg op ppf arg =
   match op with
@@ -922,7 +926,8 @@ let class_of_operation_bmi2 = function Deposit_64 | Extract_64 -> Pure
 let class_of_operation_sse = function
   | Cmp_f32 _ | Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32
   | Rcp_f32 | Sqrt_f32 | Rsqrt_f32 | High_64_to_low_64 | Low_64_to_high_64
-  | Interleave_high_32 | Interleave_low_32 | Movemask_32 | Shuffle_32 _ ->
+  | Interleave_high_32 | Interleave_low_32 | Interleave_low_32_regs
+  | Movemask_32 | Shuffle_32 _ ->
     Pure
 
 let class_of_operation_sse2 = function
 
@@ -43,7 +43,7 @@ let register_behavior_sse = function
   | Interleave_low_32 | Interleave_high_32 | Shuffle_32 _ ->
     R_RM_to_fst
   | Rcp_f32 | Sqrt_f32 | Rsqrt_f32 -> RM_to_R
-  | High_64_to_low_64 | Low_64_to_high_64 -> R_R_to_fst
+  | Interleave_low_32_regs | High_64_to_low_64 | Low_64_to_high_64 -> R_R_to_fst
   | Movemask_32 -> R_to_R
 
 let register_behavior_sse2 = function
 
@@ -554,7 +554,8 @@ module BR = Branch_relaxation.Make (struct
     | Lop (Ivectorcast _) -> 1
     | Lop (Iscalarcast (Float_of_int Float64 | Float_to_int Float64)) -> 1
     | Lop (Iscalarcast (Float_of_int Float32 | Float_to_int Float32 |
-                        Float_of_float32 | Float_to_float32)) ->
+                        Float_of_float32 | Float_to_float32 |
+                        Float32_as_float)) ->
       (* CR mslater: (float32) arm64 *)
       Misc.fatal_error "float32 is not supported on this architecture"
     | Lop (Iscalarcast (V128_of_scalar _ | V128_to_scalar _)) ->
@@ -784,7 +785,8 @@ let emit_instr i =
     | Lop(Iscalarcast (Float_of_int Float64)) ->
         `	scvtf	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}\n`
      | Lop (Iscalarcast (Float_of_int Float32 | Float_to_int Float32 |
-                         Float_of_float32 | Float_to_float32)) ->
+                         Float_of_float32 | Float_to_float32 |
+                         Float32_as_float)) ->
         (* CR mslater: (float32) arm64 *)
         Misc.fatal_error "float32 not supported on this architecture"
     | Lop(Iscalarcast (V128_of_scalar _ | V128_to_scalar _) | Ivectorcast _) ->
 
@@ -489,9 +489,11 @@ let operation_supported = function
   (* CR mslater: (float32) arm64 *)
   | Cnegf Float32 | Cabsf Float32 | Caddf Float32
   | Csubf Float32 | Cmulf Float32 | Cdivf Float32
+  | Cpackf32
   | Cvectorcast _ | Cscalarcast (Float_of_float32 | Float_to_float32 |
                                  Float_to_int Float32 | Float_of_int Float32 |
-                                 V128_of_scalar _ | V128_to_scalar _)
+                                 V128_of_scalar _ | V128_to_scalar _ |
+                                 Float32_as_float)
     -> false   (* Not implemented *)
   | Cbswap _
   | Capply _ | Cextcall _ | Cload _ | Calloc _ | Cstore _
 
@@ -287,6 +287,7 @@ let dump_op ppf = function
   | Valueofint -> Format.fprintf ppf "valueofint"
   | Intofvalue -> Format.fprintf ppf "intofvalue"
   | Vectorcast Bits128 -> Format.fprintf ppf "vec128->vec128"
+  | Scalarcast Float32_as_float -> Format.fprintf ppf "float32 as float"
   | Scalarcast (Float_of_int Float64) -> Format.fprintf ppf "int->float"
   | Scalarcast (Float_to_int Float64) -> Format.fprintf ppf "float->int"
   | Scalarcast (Float_of_int Float32) -> Format.fprintf ppf "int->float32"
 
@@ -207,6 +207,7 @@ type vector_cast =
   | Bits128
 
 type scalar_cast =
+  | Float32_as_float
   | Float_to_int of float_width
   | Float_of_int of float_width
   | Float_to_float32
@@ -248,6 +249,7 @@ type operation =
   | Cnegf of float_width | Cabsf of float_width
   | Caddf of float_width | Csubf of float_width
   | Cmulf of float_width | Cdivf of float_width
+  | Cpackf32
   | Cvalueofint | Cintofvalue
   | Cvectorcast of vector_cast
   | Cscalarcast of scalar_cast
@@ -563,25 +565,17 @@ let equal_float_width left right =
 
 let equal_scalar_cast left right =
   match left, right with
+  | Float32_as_float, Float32_as_float -> true
   | Float_to_float32, Float_to_float32 -> true
   | Float_of_float32, Float_of_float32 -> true
   | Float_to_int f1, Float_to_int f2 -> equal_float_width f1 f2
   | Float_of_int f1, Float_of_int f2 -> equal_float_width f1 f2
   | V128_to_scalar v1, V128_to_scalar v2 -> Primitive.equal_vec128_type v1 v2
   | V128_of_scalar v1, V128_of_scalar v2 -> Primitive.equal_vec128_type v1 v2
-  | Float_to_float32, (Float_of_float32 | Float_to_int _ | Float_of_int _ |
-                       V128_to_scalar _ | V128_of_scalar _)
-  | Float_of_float32, (Float_to_float32 | Float_to_int _ | Float_of_int _ |
-                       V128_to_scalar _ | V128_of_scalar _)
-  | Float_to_int _, (Float_of_float32 | Float_to_float32 | Float_of_int _ |
-                       V128_to_scalar _ | V128_of_scalar _)
-  | Float_of_int _, (Float_of_float32 | Float_to_float32 | Float_to_int _ |
-                       V128_to_scalar _ | V128_of_scalar _)
-  | V128_to_scalar _, (Float_of_float32 | Float_to_float32 | Float_to_int _ |
-                       Float_of_int _ | V128_of_scalar _)
-  | V128_of_scalar _, (Float_of_float32 | Float_to_float32 | Float_to_int _ |
-                       Float_of_int _ | V128_to_scalar _)
-    -> false
+  | (Float32_as_float |
+     Float_to_float32 | Float_of_float32 |
+     Float_to_int _ | Float_of_int _ |
+     V128_to_scalar _ | V128_of_scalar _), _ -> false
 
 let equal_float_comparison left right =
   match left, right with
 
@@ -182,6 +182,8 @@ type vector_cast =
   | Bits128
 
 type scalar_cast =
+  (* CR mslater: move all bit-casts into a reinterpret_cast type *)
+  | Float32_as_float
   | Float_to_int of float_width
   | Float_of_int of float_width
   | Float_to_float32
@@ -228,6 +230,7 @@ type operation =
   | Cnegf of float_width | Cabsf of float_width
   | Caddf of float_width | Csubf of float_width
   | Cmulf of float_width | Cdivf of float_width
+  | Cpackf32
   | Cvalueofint | Cintofvalue
   | Cvectorcast of vector_cast
   | Cscalarcast of scalar_cast