ocaml-flambda · TheNumbat · Jul 9, 2024 · Jun 14, 2024 · Jun 14, 2024 · Jun 18, 2024
diff --git a/backend/amd64/emit.mlp b/backend/amd64/emit.mlp
@@ -1016,12 +1016,8 @@ let emit_static_cast (cast : Cmm.static_cast) i =
     I.movq (arg i 0) (res i 0)
   | Scalar_of_v128 Int32x4 -> I.movd (arg i 0) (res32 i 0)
   | V128_of_scalar Int32x4 -> I.movd (arg32 i 0) (res i 0)
-  | V128_of_scalar Float32x4 ->
-    (* CR mslater: (SIMD) remove cvt once we have unboxed float32 *)
-    I.cvtsd2ss (arg i 0) (res i 0)
-  | Scalar_of_v128 Float32x4 ->
-    (* CR mslater: (SIMD) remove cvt once we have unboxed float32 *)
-    I.cvtss2sd (arg i 0) (res i 0)
+  | V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4 ->
+    if distinct then I.movss (arg i 0) (res i 0)
   | Scalar_of_v128 Int16x8 ->
     (* [movw] and [movzx] cannot operate on vector registers.
        We must zero extend as the result is an untagged positive int.

diff --git a/backend/amd64/regalloc_stack_operands.ml b/backend/amd64/regalloc_stack_operands.ml
@@ -179,11 +179,9 @@ let basic (map : spilled_map) (instr : Cfg.basic Cfg.instruction) =
     | R_to_RM -> may_use_stack_operand_for_result map instr ~num_args:1
     | RM_to_R -> may_use_stack_operand_for_only_argument map instr ~has_result:true)
   | Op (Reinterpret_cast (Float_of_float32 | Float32_of_float | V128_of_v128))
-  | Op (Static_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2)) ->
-    unary_operation_argument_or_result_on_stack map instr
+  | Op (Static_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2))
   | Op (Static_cast (V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4)) ->
-    (* CR mslater: (SIMD) replace once we have unboxed float32 *)
-    may_use_stack_operand_for_only_argument map instr ~has_result:true
+    unary_operation_argument_or_result_on_stack map instr
   | Op (Reinterpret_cast (Float_of_int64 | Float32_of_int32))
   | Op (Static_cast (V128_of_scalar (Int64x2 | Int32x4 | Int16x8 | Int8x16))) ->
     may_use_stack_operand_for_only_argument map instr ~has_result:true

diff --git a/backend/amd64/reload.ml b/backend/amd64/reload.ml
@@ -164,16 +164,13 @@ method! reload_operation op arg res =
     (* Result must be in register, but argument can be on stack *)
     (arg, (if stackp res.(0) then [| self#makereg res.(0) |] else res))
   | Ireinterpret_cast (Float_of_float32 |  Float32_of_float | V128_of_v128)
-  | Istatic_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2) ->
+  | Istatic_cast (V128_of_scalar Float64x2 | Scalar_of_v128 Float64x2)
+  | Istatic_cast (V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4) ->
     (* These are just moves; either the argument or result may be on the stack. *)
     begin match stackp arg.(0), stackp res.(0) with
     | true, true -> ([| self#makereg arg.(0) |], res)
     | _ -> (arg, res)
     end
-  | Istatic_cast (V128_of_scalar Float32x4 | Scalar_of_v128 Float32x4) ->
-    (* These do additional logic requiring the result to be a register.
-       CR mslater: (SIMD) replace once we have unboxed float32 *)
-    (arg, [| self#makereg res.(0) |])
   | Ireinterpret_cast (Float_of_int64 | Float32_of_int32)
   | Istatic_cast (V128_of_scalar (Int64x2 | Int32x4 | Int16x8 | Int8x16)) ->
     (* Int -> Vec regs need the result to be a register. *)

diff --git a/backend/amd64/simd_proc.ml b/backend/amd64/simd_proc.ml
@@ -99,7 +99,7 @@ let register_behavior_sse41 = function
   | Blendv_8 | Blendv_32 | Blendv_64 -> R_RM_xmm0_to_fst
   | Extract_i64 _ | Extract_i32 _ -> R_to_RM
   | Extract_i8 _ | Extract_i16 _ ->
-    (* CR mslater: (SIMD): replace once we have int8/int16/float32 *)
+    (* CR mslater: (SIMD): replace once we have int8/int16 *)
     R_to_R
 
 let register_behavior_sse42 = function

diff --git a/backend/cmm_builtins.ml b/backend/cmm_builtins.ml
@@ -195,30 +195,45 @@ let bigstring_atomic_add size (arg1, arg2, arg3) dbg =
 let bigstring_atomic_sub size (arg1, arg2, arg3) dbg =
   bigstring_atomic_add size (arg1, arg2, neg_int arg3 dbg) dbg
 
-(* Assumes unboxed float64 *)
-let rec const_float_args n args name =
+(* [const_args_gen ~extract ~type_name n args name] walks [args] and returns,
+   in order, the values that [extract] recovers from each of them. It expects
+   exactly [n] arguments: [bad_immediate] is called for builtin [name] when
+   there are fewer or more than [n], or when [extract] returns [None] for one
+   of them. [type_name] is only used in the error messages. *)
+let rec const_args_gen ~extract ~type_name n args name =
+  match n, args with
+  | 0, [] -> []
-  | n, Cconst_float (f, _) :: args -> f :: const_float_args (n - 1) args name
-  | _ -> bad_immediate "Did not find constant float arguments for %s" name
+  | 0, _ :: _ ->
+    (* Without this case the recursion would carry on with a negative count
+       and eventually report a negative number of missing arguments. *)
+    bad_immediate "Too many constant %s arguments for %s" type_name name
+  | _, [] ->
+    bad_immediate "Missing %d constant %s argument(s) for %s" n type_name name
+  | n, arg :: args -> (
+    match extract arg with
+    | Some value ->
+      value :: const_args_gen ~extract ~type_name (n - 1) args name
+    | None ->
+      bad_immediate "Did not find constant %s arguments for %s" type_name name)
+
+(* Collects the constant float32 arguments of a builtin; only
+   [Cconst_float32] arguments are accepted. Assumes unboxed float32. *)
+let const_float32_args =
+  let float32_of_const = function
+    | Cconst_float32 (f, _) -> Some f
+    | _ -> None
+  in
+  const_args_gen ~extract:float32_of_const ~type_name:"float32"
+
+(* Collects the constant float arguments of a builtin; only [Cconst_float]
+   arguments are accepted. Assumes unboxed float64. *)
+let const_float_args =
+  let float_of_const = function
+    | Cconst_float (f, _) -> Some f
+    | _ -> None
+  in
+  const_args_gen ~extract:float_of_const ~type_name:"float"
 
 (* Assumes untagged int or unboxed int32, always representable by int63 *)
-let rec const_int_args n args name =
-  match n, args with
-  | 0, [] -> []
-  | n, Cconst_int (i, _) :: args -> i :: const_int_args (n - 1) args name
-  | _ -> bad_immediate "Did not find constant int arguments for %s" name
+let const_int_args =
+  (* Only [Cconst_int] arguments are accepted as constants here. *)
+  let int_of_const = function Cconst_int (i, _) -> Some i | _ -> None in
+  const_args_gen ~extract:int_of_const ~type_name:"int"
 
 (* Assumes unboxed int64: no tag, comes as Cconst_int when representable by
    int63, otherwise we get Cconst_natint *)
-let rec const_int64_args n args name =
-  match n, args with
-  | 0, [] -> []
-  | n, Cconst_int (i, _) :: args ->
-    Int64.of_int i :: const_int64_args (n - 1) args name
-  | n, Cconst_natint (i, _) :: args ->
-    Int64.of_nativeint i :: const_int64_args (n - 1) args name
-  | _ -> bad_immediate "Did not find constant int64 arguments for %s" name
+let const_int64_args =
+  (* Either constant form is widened to [Int64.t]; anything else is rejected. *)
+  let int64_of_const = function
+    | Cconst_int (i, _) -> Some (Int64.of_int i)
+    | Cconst_natint (i, _) -> Some (Int64.of_nativeint i)
+    | _ -> None
+  in
+  const_args_gen ~extract:int64_of_const ~type_name:"int64"
 
 let int64_of_int8 i =
   (* CR mslater: (SIMD) replace once we have unboxed int8 *)
@@ -238,7 +253,6 @@ let int64_of_int32 i =
   Int64.of_int i |> Int64.logand 0xffffffffL
 
 let int64_of_float32 f =
-  (* CR mslater: (SIMD) replace once we have unboxed float32 *)
   Int32.bits_of_float f |> Int64.of_int32 |> Int64.logand 0xffffffffL
 
 let pack_int32s i0 i1 = Int64.(logor (shift_left i1 32) i0)
@@ -272,12 +286,10 @@ let transl_vec128_builtin name args dbg _typ_res =
   | "caml_float64x2_low_to_float" ->
     let op = Cstatic_cast (Scalar_of_v128 Float64x2) in
     if_operation_supported op ~f:(fun () -> Cop (op, args, dbg))
-  | "caml_float32x4_low_of_float" ->
-    (* CR mslater: (SIMD) replace once we have unboxed float32 *)
+  | "caml_float32x4_low_of_float32" ->
     let op = Cstatic_cast (V128_of_scalar Float32x4) in
     if_operation_supported op ~f:(fun () -> Cop (op, args, dbg))
-  | "caml_float32x4_low_to_float" ->
-    (* CR mslater: (SIMD) replace once we have unboxed float32 *)
+  | "caml_float32x4_low_to_float32" ->
     let op = Cstatic_cast (Scalar_of_v128 Float32x4) in
     if_operation_supported op ~f:(fun () -> Cop (op, args, dbg))
   | "caml_int64x2_low_of_int64" ->
@@ -310,15 +322,13 @@ let transl_vec128_builtin name args dbg _typ_res =
     if_operation_supported op ~f:(fun () -> Cop (op, args, dbg))
   (* Constants *)
   | "caml_float32x4_const1" ->
-    (* CR mslater: (SIMD) replace once we have unboxed float32 *)
-    let f = const_float_args 1 args name |> List.hd in
+    let f = const_float32_args 1 args name |> List.hd in
     let i = int64_of_float32 f in
     let i = pack_int32s i i in
     Some (Cconst_vec128 ({ low = i; high = i }, dbg))
   | "caml_float32x4_const4" ->
-    (* CR mslater: (SIMD) replace once we have unboxed float32 *)
     let i0, i1, i2, i3 =
-      match const_float_args 4 args name |> List.map int64_of_float32 with
+      match const_float32_args 4 args name |> List.map int64_of_float32 with
       | [i0; i1; i2; i3] -> i0, i1, i2, i3
       | _ -> assert false
     in

diff --git a/backend/selectgen.ml b/backend/selectgen.ml
@@ -186,9 +186,8 @@ let oper_result_type = function
   | Cstatic_cast (Float32_of_float | Float_of_int Float32) -> typ_float32
   | Cstatic_cast (Int_of_float (Float64 | Float32)) -> typ_int
   | Cstatic_cast (V128_of_scalar _) -> typ_vec128
-  | Cstatic_cast (Scalar_of_v128 (Float64x2 | Float32x4)) ->
-    (* CR mslater: (SIMD) replace once we have unboxed float32 *)
-    typ_float
+  | Cstatic_cast (Scalar_of_v128 Float64x2) -> typ_float
+  | Cstatic_cast (Scalar_of_v128 Float32x4) -> typ_float32
   | Cstatic_cast (Scalar_of_v128 (Int8x16 | Int16x8 | Int32x4 | Int64x2)) -> typ_int
   | Craise _ -> typ_void
   | Cprobe _ -> typ_void

diff --git a/middle_end/flambda2/from_lambda/closure_conversion.ml b/middle_end/flambda2/from_lambda/closure_conversion.ml
@@ -939,13 +939,14 @@ let close_primitive acc env ~let_bound_ids_with_kinds named
       | Pbigstring_set_32 _ | Pbigstring_set_f32 _ | Pbigstring_set_64 _
       | Pbigstring_set_128 _ | Pfloatarray_load_128 _ | Pfloat_array_load_128 _
       | Pint_array_load_128 _ | Punboxed_float_array_load_128 _
-      | Punboxed_int32_array_load_128 _ | Punboxed_int64_array_load_128 _
-      | Punboxed_nativeint_array_load_128 _ | Pfloatarray_set_128 _
-      | Pfloat_array_set_128 _ | Pint_array_set_128 _
-      | Punboxed_float_array_set_128 _ | Punboxed_int32_array_set_128 _
-      | Punboxed_int64_array_set_128 _ | Punboxed_nativeint_array_set_128 _
-      | Pctconst _ | Pbswap16 | Pbbswap _ | Pint_as_pointer _ | Popaque _
-      | Pprobe_is_enabled _ | Pobj_dup | Pobj_magic _ | Punbox_float _
+      | Punboxed_float32_array_load_128 _ | Punboxed_int32_array_load_128 _
+      | Punboxed_int64_array_load_128 _ | Punboxed_nativeint_array_load_128 _
+      | Pfloatarray_set_128 _ | Pfloat_array_set_128 _ | Pint_array_set_128 _
+      | Punboxed_float_array_set_128 _ | Punboxed_float32_array_set_128 _
+      | Punboxed_int32_array_set_128 _ | Punboxed_int64_array_set_128 _
+      | Punboxed_nativeint_array_set_128 _ | Pctconst _ | Pbswap16 | Pbbswap _
+      | Pint_as_pointer _ | Popaque _ | Pprobe_is_enabled _ | Pobj_dup
+      | Pobj_magic _ | Punbox_float _
       | Pbox_float (_, _)
       | Punbox_int _ | Pbox_int _ | Pmake_unboxed_product _
       | Punboxed_product_field _ | Pget_header _ | Prunstack | Pperform

diff --git a/middle_end/flambda2/from_lambda/lambda_to_flambda.ml b/middle_end/flambda2/from_lambda/lambda_to_flambda.ml
@@ -641,13 +641,15 @@ let primitive_can_raise (prim : Lambda.primitive) =
   | Pfloat_array_load_128 { unsafe = false; _ }
   | Pint_array_load_128 { unsafe = false; _ }
   | Punboxed_float_array_load_128 { unsafe = false; _ }
+  | Punboxed_float32_array_load_128 { unsafe = false; _ }
   | Punboxed_int32_array_load_128 { unsafe = false; _ }
   | Punboxed_int64_array_load_128 { unsafe = false; _ }
   | Punboxed_nativeint_array_load_128 { unsafe = false; _ }
   | Pfloatarray_set_128 { unsafe = false; _ }
   | Pfloat_array_set_128 { unsafe = false; _ }
   | Pint_array_set_128 { unsafe = false; _ }
   | Punboxed_float_array_set_128 { unsafe = false; _ }
+  | Punboxed_float32_array_set_128 { unsafe = false; _ }
   | Punboxed_int32_array_set_128 { unsafe = false; _ }
   | Punboxed_int64_array_set_128 { unsafe = false; _ }
   | Punboxed_nativeint_array_set_128 { unsafe = false; _ }
@@ -736,13 +738,15 @@ let primitive_can_raise (prim : Lambda.primitive) =
   | Pfloat_array_load_128 { unsafe = true; _ }
   | Pint_array_load_128 { unsafe = true; _ }
   | Punboxed_float_array_load_128 { unsafe = true; _ }
+  | Punboxed_float32_array_load_128 { unsafe = true; _ }
   | Punboxed_int32_array_load_128 { unsafe = true; _ }
   | Punboxed_int64_array_load_128 { unsafe = true; _ }
   | Punboxed_nativeint_array_load_128 { unsafe = true; _ }
   | Pfloatarray_set_128 { unsafe = true; _ }
   | Pfloat_array_set_128 { unsafe = true; _ }
   | Pint_array_set_128 { unsafe = true; _ }
   | Punboxed_float_array_set_128 { unsafe = true; _ }
+  | Punboxed_float32_array_set_128 { unsafe = true; _ }
   | Punboxed_int32_array_set_128 { unsafe = true; _ }
   | Punboxed_int64_array_set_128 { unsafe = true; _ }
   | Punboxed_nativeint_array_set_128 { unsafe = true; _ }

diff --git a/middle_end/flambda2/from_lambda/lambda_to_flambda_primitives.ml b/middle_end/flambda2/from_lambda/lambda_to_flambda_primitives.ml
@@ -1845,6 +1845,9 @@ let convert_lprim ~big_endian (prim : L.primitive) (args : Simple.t list list)
   | Punboxed_float_array_load_128 { unsafe; mode }, [[array]; [index]] ->
     [ array_like_load_128 ~dbg ~size_int ~current_region ~unsafe ~mode
         Naked_floats array index ]
+  | Punboxed_float32_array_load_128 { unsafe; mode }, [[array]; [index]] ->
+    [ array_like_load_128 ~dbg ~size_int ~current_region ~unsafe ~mode
+        Naked_float32s array index ]
   | Pint_array_load_128 { unsafe; mode }, [[array]; [index]] ->
     if Targetint.size <> 64
     then Misc.fatal_error "[Pint_array_load_128]: immediates must be 64 bits.";
@@ -1871,6 +1874,10 @@ let convert_lprim ~big_endian (prim : L.primitive) (args : Simple.t list list)
   | Punboxed_float_array_set_128 { unsafe }, [[array]; [index]; [new_value]] ->
     [ array_like_set_128 ~dbg ~size_int ~unsafe Naked_floats array index
         new_value ]
+  | Punboxed_float32_array_set_128 { unsafe }, [[array]; [index]; [new_value]]
+    ->
+    [ array_like_set_128 ~dbg ~size_int ~unsafe Naked_float32s array index
+        new_value ]
   | Pint_array_set_128 { unsafe }, [[array]; [index]; [new_value]] ->
     if Targetint.size <> 64
     then Misc.fatal_error "[Pint_array_set_128]: immediates must be 64 bits.";
@@ -1985,8 +1992,9 @@ let convert_lprim ~big_endian (prim : L.primitive) (args : Simple.t list list)
       | Pbigstring_load_16 _ | Pbigstring_load_32 _ | Pbigstring_load_f32 _
       | Pbigstring_load_64 _ | Pbigstring_load_128 _ | Pfloatarray_load_128 _
       | Pfloat_array_load_128 _ | Pint_array_load_128 _
-      | Punboxed_float_array_load_128 _ | Punboxed_int32_array_load_128 _
-      | Punboxed_int64_array_load_128 _ | Punboxed_nativeint_array_load_128 _
+      | Punboxed_float_array_load_128 _ | Punboxed_float32_array_load_128 _
+      | Punboxed_int32_array_load_128 _ | Punboxed_int64_array_load_128 _
+      | Punboxed_nativeint_array_load_128 _
       | Parrayrefu
           ( ( Pgenarray_ref _ | Paddrarray_ref | Pintarray_ref
             | Pfloatarray_ref _ | Punboxedfloatarray_ref _
@@ -2023,9 +2031,9 @@ let convert_lprim ~big_endian (prim : L.primitive) (args : Simple.t list list)
       | Pbytes_set_128 _ | Pbigstring_set_16 _ | Pbigstring_set_32 _
       | Pbigstring_set_f32 _ | Pbigstring_set_64 _ | Pbigstring_set_128 _
       | Pfloatarray_set_128 _ | Pfloat_array_set_128 _ | Pint_array_set_128 _
-      | Punboxed_float_array_set_128 _ | Punboxed_int32_array_set_128 _
-      | Punboxed_int64_array_set_128 _ | Punboxed_nativeint_array_set_128 _
-      | Patomic_cas ),
+      | Punboxed_float_array_set_128 _ | Punboxed_float32_array_set_128 _
+      | Punboxed_int32_array_set_128 _ | Punboxed_int64_array_set_128 _
+      | Punboxed_nativeint_array_set_128 _ | Patomic_cas ),
       ( []
       | [_]
       | [_; _]

diff --git a/ocaml/bytecomp/bytegen.ml b/ocaml/bytecomp/bytegen.ml
@@ -150,11 +150,13 @@ let preserve_tailcall_for_prim = function
   | Pbigstring_load_64 _ | Pbigstring_load_128 _
   | Pbigstring_set_16 _ | Pbigstring_set_32 _ | Pbigstring_set_f32 _
   | Pfloatarray_load_128 _ | Pfloat_array_load_128 _ | Pint_array_load_128 _
-  | Punboxed_float_array_load_128 _ | Punboxed_int32_array_load_128 _
-  | Punboxed_int64_array_load_128 _ | Punboxed_nativeint_array_load_128 _
+  | Punboxed_float_array_load_128 _ | Punboxed_float32_array_load_128 _
+  | Punboxed_int32_array_load_128 _ | Punboxed_int64_array_load_128 _
+  | Punboxed_nativeint_array_load_128 _
   | Pfloatarray_set_128 _ | Pfloat_array_set_128 _ | Pint_array_set_128 _
-  | Punboxed_float_array_set_128 _ | Punboxed_int32_array_set_128 _
-  | Punboxed_int64_array_set_128 _ | Punboxed_nativeint_array_set_128 _
+  | Punboxed_float_array_set_128 _ | Punboxed_float32_array_set_128 _
+  | Punboxed_int32_array_set_128 _ | Punboxed_int64_array_set_128 _
+  | Punboxed_nativeint_array_set_128 _
   | Pbigstring_set_64 _ | Pbigstring_set_128 _
   | Pprobe_is_enabled _ | Pobj_dup
   | Pctconst _ | Pbswap16 | Pbbswap _ | Pint_as_pointer _
@@ -573,11 +575,13 @@ let comp_primitive stack_info p sz args =
   | Pstring_load_128 _ | Pbytes_load_128 _ | Pbytes_set_128 _
   | Pbigstring_load_128 _ | Pbigstring_set_128 _
   | Pfloatarray_load_128 _ | Pfloat_array_load_128 _ | Pint_array_load_128 _
-  | Punboxed_float_array_load_128 _ | Punboxed_int32_array_load_128 _
-  | Punboxed_int64_array_load_128 _ | Punboxed_nativeint_array_load_128 _
+  | Punboxed_float_array_load_128 _ | Punboxed_float32_array_load_128 _
+  | Punboxed_int32_array_load_128 _ | Punboxed_int64_array_load_128 _
+  | Punboxed_nativeint_array_load_128 _
   | Pfloatarray_set_128 _ | Pfloat_array_set_128 _ | Pint_array_set_128 _
-  | Punboxed_float_array_set_128 _ | Punboxed_int32_array_set_128 _
-  | Punboxed_int64_array_set_128 _ | Punboxed_nativeint_array_set_128 _ ->
+  | Punboxed_float_array_set_128 _ | Punboxed_float32_array_set_128 _
+  | Punboxed_int32_array_set_128 _ | Punboxed_int64_array_set_128 _
+  | Punboxed_nativeint_array_set_128 _ ->
     fatal_error "128-bit load/store is not supported in bytecode mode."
   (* The cases below are handled in [comp_expr] before the [comp_primitive] call
      (in the order in which they appear below),
@@ -1218,3 +1222,4 @@ let compile_phrase expr =
   let init_code = comp_block empty_env expr 1 [Kreturn 1] in
   let fun_code = comp_remainder [] in
   (init_code, fun_code))
+
diff --git a/ocaml/lambda/lambda.ml b/ocaml/lambda/lambda.ml
@@ -270,13 +270,15 @@ type primitive =
   | Pfloat_array_load_128 of { unsafe : bool; mode : alloc_mode }
   | Pint_array_load_128 of { unsafe : bool; mode : alloc_mode }
   | Punboxed_float_array_load_128 of { unsafe : bool; mode : alloc_mode }
+  | Punboxed_float32_array_load_128 of { unsafe : bool; mode : alloc_mode }
   | Punboxed_int32_array_load_128 of { unsafe : bool; mode : alloc_mode }
   | Punboxed_int64_array_load_128 of { unsafe : bool; mode : alloc_mode }
   | Punboxed_nativeint_array_load_128 of { unsafe : bool; mode : alloc_mode }
   | Pfloatarray_set_128 of { unsafe : bool }
   | Pfloat_array_set_128 of { unsafe : bool }
   | Pint_array_set_128 of { unsafe : bool }
   | Punboxed_float_array_set_128 of { unsafe : bool }
+  | Punboxed_float32_array_set_128 of { unsafe : bool }
   | Punboxed_int32_array_set_128 of { unsafe : bool }
   | Punboxed_int64_array_set_128 of { unsafe : bool }
   | Punboxed_nativeint_array_set_128 of { unsafe : bool }
@@ -1764,6 +1766,7 @@ let primitive_may_allocate : primitive -> alloc_mode option = function
   | Pfloat_array_load_128 { mode = m; _ }
   | Pint_array_load_128 { mode = m; _ }
   | Punboxed_float_array_load_128 { mode = m; _ }
+  | Punboxed_float32_array_load_128 { mode = m; _ }
   | Punboxed_int32_array_load_128 { mode = m; _ }
   | Punboxed_int64_array_load_128 { mode = m; _ }
   | Punboxed_nativeint_array_load_128 { mode = m; _ }
@@ -1782,8 +1785,9 @@ let primitive_may_allocate : primitive -> alloc_mode option = function
   | Pbigstring_set_16 _ | Pbigstring_set_32 _ | Pbigstring_set_f32 _
   | Pbigstring_set_64 _ | Pbigstring_set_128 _
   | Pfloatarray_set_128 _ | Pfloat_array_set_128 _ | Pint_array_set_128 _
-  | Punboxed_float_array_set_128 _ | Punboxed_int32_array_set_128 _
-  | Punboxed_int64_array_set_128 _ | Punboxed_nativeint_array_set_128 _ -> None
+  | Punboxed_float_array_set_128 _ | Punboxed_float32_array_set_128 _
+  | Punboxed_int32_array_set_128 _ | Punboxed_int64_array_set_128 _
+  | Punboxed_nativeint_array_set_128 _ -> None
   | Pctconst _ -> None
   | Pbswap16 -> None
   | Pbbswap (_, m) -> Some m
@@ -1874,8 +1878,9 @@ let primitive_result_layout (p : primitive) =
   | Pbytes_set_128 _ | Pbigstring_set_16 _ | Pbigstring_set_32 _ | Pbigstring_set_f32 _
   | Pbigstring_set_64 _ | Pbigstring_set_128 _
   | Pfloatarray_set_128 _ | Pfloat_array_set_128 _ | Pint_array_set_128 _
-  | Punboxed_float_array_set_128 _ | Punboxed_int32_array_set_128 _
-  | Punboxed_int64_array_set_128 _ | Punboxed_nativeint_array_set_128 _
+  | Punboxed_float_array_set_128 _ | Punboxed_float32_array_set_128 _
+  | Punboxed_int32_array_set_128 _ | Punboxed_int64_array_set_128 _
+  | Punboxed_nativeint_array_set_128 _
     -> layout_unit
   | Pgetglobal _ | Psetglobal _ | Pgetpredef _ -> layout_module_field
   | Pmakeblock _ | Pmakefloatblock _ | Pmakearray _ | Pduprecord _
@@ -1941,6 +1946,8 @@ let primitive_result_layout (p : primitive) =
   | Pfloatarray_load_128 _ | Pfloat_array_load_128 _
   | Punboxed_float_array_load_128 _ ->
     layout_boxed_vector (Pvec128 Float64x2)
+  | Punboxed_float32_array_load_128 _ ->
+    layout_boxed_vector (Pvec128 Float32x4)
   | Pint_array_load_128 _ | Punboxed_int64_array_load_128 _
   | Punboxed_nativeint_array_load_128 _ ->
     (* 128-bit types are only supported in the x86_64 backend, so we may