@@ -185,6 +185,22 @@ let emit_call_gc gc =
185
185
`{emit_label gc.gc_lbl}: bl {emit_symbol "caml_call_gc"}\n`;
186
186
`{emit_label gc.gc_frame_lbl}: b {emit_label gc.gc_return_lbl}\n`
187
187
188
+ (* Record calls to local stack reallocation *)
189
+
190
+ type local_realloc_call =
191
+ { lr_lbl: label;
192
+ lr_return_lbl: label;
193
+ lr_dbg: Debuginfo.t
194
+ }
195
+
196
+ let local_realloc_sites = ref ([] : local_realloc_call list)
197
+
198
+ let emit_local_realloc lr =
199
+ `{emit_label lr.lr_lbl}:\n`;
200
+ ` {emit_debug_info lr.lr_dbg}\n`;
201
+ ` bl {emit_symbol "caml_call_local_realloc"}\n`;
202
+ ` b {emit_label lr.lr_return_lbl}\n`
203
+
188
204
(* Record calls to caml_ml_array_bound_error.
189
205
In debug mode, we maintain one call to caml_ml_array_bound_error
190
206
per bound check site. Otherwise, we can share a single call. *)
@@ -397,7 +413,7 @@ let num_call_gc_and_check_bound_points instr =
397
413
let rec loop instr ((call_gc, check_bound) as totals) =
398
414
match instr.desc with
399
415
| Lend -> totals
400
- | Lop (Ialloc _ ) when !fastcode_flag ->
416
+ | Lop (Ialloc {mode = Alloc_heap; _} ) when !fastcode_flag ->
401
417
loop instr.next (call_gc + 1, check_bound)
402
418
| Lop (Ipoll _) ->
403
419
loop instr.next (call_gc + 1, check_bound)
@@ -523,8 +539,7 @@ module BR = Branch_relaxation.Make (struct
523
539
| _ -> 1 + num_instructions_for_intconst (Nativeint.of_int num_bytes)
524
540
end
525
541
| Lop (Icsel _) -> 4
526
- | Lop (Ibeginregion | Iendregion) ->
527
- Misc.fatal_error "Local allocations not supported on this architecture"
542
+ | Lop (Ibeginregion | Iendregion) -> 1
528
543
| Lop (Iintop (Icomp _)) -> 2
529
544
| Lop (Icompf _) -> 2
530
545
| Lop (Iintop_imm (Icomp _, _)) -> 2
@@ -602,44 +617,67 @@ end)
602
617
603
618
(* Output the assembly code for allocation. *)
604
619
605
- let assembly_code_for_allocation i ~n ~far ~dbginfo =
606
- let lbl_frame =
607
- record_frame_label i.live (Dbg_alloc dbginfo)
608
- in
609
- if !fastcode_flag then begin
610
- let lbl_after_alloc = new_label() in
611
- let lbl_call_gc = new_label() in
612
- (* n is at most Max_young_whsize * 8, i.e. currently 0x808,
613
- so it is reasonable to assume n < 0x1_000. This makes
614
- the generated code simpler. *)
615
- assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
616
- let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
617
- ` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`;
618
- ` sub {emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int n}\n`;
619
- ` cmp {emit_reg reg_alloc_ptr}, {emit_reg reg_tmp1}\n`;
620
- if not far then begin
621
- ` b.lo {emit_label lbl_call_gc}\n`
622
- end else begin
623
- let lbl = new_label () in
624
- ` b.cs {emit_label lbl}\n`;
625
- ` b {emit_label lbl_call_gc}\n`;
626
- `{emit_label lbl}:\n`
627
- end;
628
- `{emit_label lbl_after_alloc}:`;
629
- ` add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`;
630
- call_gc_sites :=
631
- { gc_lbl = lbl_call_gc;
632
- gc_return_lbl = lbl_after_alloc;
633
- gc_frame_lbl = lbl_frame } :: !call_gc_sites
620
+ let assembly_code_for_allocation i ~local ~n ~far ~dbginfo =
621
+ if local then begin
622
+ let r = i.res.(0) in
623
+ let domain_local_sp_offset = Domainstate.(idx_of_field Domain_local_sp) * 8 in
624
+ let domain_local_limit_offset = Domainstate.(idx_of_field Domain_local_limit) * 8 in
625
+ let domain_local_top_offset = Domainstate.(idx_of_field Domain_local_top) * 8 in
626
+ ` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_limit_offset}]\n`;
627
+ ` ldr {emit_reg r}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_sp_offset}]\n`;
628
+ ` sub {emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
629
+ ` str {emit_reg r}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_sp_offset}]\n`;
630
+ ` cmp {emit_reg r}, {emit_reg reg_tmp1}\n`;
631
+ let lbl_call = new_label () in
632
+ ` b.le {emit_label lbl_call}\n`;
633
+ let lbl_after_alloc = new_label () in
634
+ `{emit_label lbl_after_alloc}:\n`;
635
+ ` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_top_offset}]\n`;
636
+ ` add {emit_reg r}, {emit_reg r}, {emit_reg reg_tmp1}\n`;
637
+ ` add {emit_reg r}, {emit_reg r}, #{emit_int 8}\n`;
638
+ local_realloc_sites :=
639
+ { lr_lbl = lbl_call;
640
+ lr_dbg = i.dbg;
641
+ lr_return_lbl = lbl_after_alloc } :: !local_realloc_sites
634
642
end else begin
635
- begin match n with
636
- | 16 -> ` bl {emit_symbol "caml_alloc1"}\n`
637
- | 24 -> ` bl {emit_symbol "caml_alloc2"}\n`
638
- | 32 -> ` bl {emit_symbol "caml_alloc3"}\n`
639
- | _ -> emit_intconst reg_x8 (Nativeint.of_int n);
640
- ` bl {emit_symbol "caml_allocN"}\n`
641
- end;
642
- `{emit_label lbl_frame}: add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`
643
+ let lbl_frame =
644
+ record_frame_label i.live (Dbg_alloc dbginfo)
645
+ in
646
+ if !fastcode_flag then begin
647
+ let lbl_after_alloc = new_label() in
648
+ let lbl_call_gc = new_label() in
649
+ (* n is at most Max_young_whsize * 8, i.e. currently 0x808,
650
+ so it is reasonable to assume n < 0x1_000. This makes
651
+ the generated code simpler. *)
652
+ assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
653
+ let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
654
+ ` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`;
655
+ ` sub {emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int n}\n`;
656
+ ` cmp {emit_reg reg_alloc_ptr}, {emit_reg reg_tmp1}\n`;
657
+ if not far then begin
658
+ ` b.lo {emit_label lbl_call_gc}\n`
659
+ end else begin
660
+ let lbl = new_label () in
661
+ ` b.cs {emit_label lbl}\n`;
662
+ ` b {emit_label lbl_call_gc}\n`;
663
+ `{emit_label lbl}:\n`
664
+ end;
665
+ `{emit_label lbl_after_alloc}:`;
666
+ ` add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`;
667
+ call_gc_sites :=
668
+ { gc_lbl = lbl_call_gc;
669
+ gc_return_lbl = lbl_after_alloc;
670
+ gc_frame_lbl = lbl_frame } :: !call_gc_sites
671
+ end else begin
672
+ begin match n with
673
+ | 16 -> ` bl {emit_symbol "caml_alloc1"}\n`
674
+ | 24 -> ` bl {emit_symbol "caml_alloc2"}\n`
675
+ | 32 -> ` bl {emit_symbol "caml_alloc3"}\n`
676
+ | _ -> emit_intconst reg_x8 (Nativeint.of_int n);
677
+ ` bl {emit_symbol "caml_allocN"}\n`
678
+ end;
679
+ `{emit_label lbl_frame}: add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`
680
+ end
643
681
end
644
682
645
683
let assembly_code_for_poll i ~far ~return_label =
@@ -852,11 +890,17 @@ let emit_instr i =
852
890
| Onetwentyeight -> fatal_error "arm64: got 128 bit memory chunk"
853
891
end
854
892
| Lop(Ialloc { bytes = n; dbginfo; mode = Alloc_heap }) ->
855
- assembly_code_for_allocation i ~n ~far:false ~dbginfo
893
+ assembly_code_for_allocation i ~n ~local:false ~ far:false ~dbginfo
856
894
| Lop(Ispecific (Ifar_alloc { bytes = n; dbginfo })) ->
857
- assembly_code_for_allocation i ~n ~far:true ~dbginfo
858
- | Lop(Ialloc { mode = Alloc_local } | Ibeginregion | Iendregion) ->
859
- Misc.fatal_error "Local allocations not supported on this architecture"
895
+ assembly_code_for_allocation i ~n ~local:false ~far:true ~dbginfo
896
+ | Lop(Ialloc { bytes = n; dbginfo; mode = Alloc_local }) ->
897
+ assembly_code_for_allocation i ~n ~local:true ~far:false ~dbginfo
898
+ | Lop(Ibeginregion) ->
899
+ let offset = Domainstate.(idx_of_field Domain_local_sp) * 8 in
900
+ ` ldr {emit_reg i.res.(0)}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`
901
+ | Lop(Iendregion) ->
902
+ let offset = Domainstate.(idx_of_field Domain_local_sp) * 8 in
903
+ ` str {emit_reg i.arg.(0)}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`
860
904
| Lop(Ipoll { return_label }) ->
861
905
assembly_code_for_poll i ~far:false ~return_label
862
906
| Lop(Ispecific (Ifar_poll { return_label })) ->
@@ -1137,6 +1181,7 @@ let fundecl fundecl =
1137
1181
float_literals := [];
1138
1182
stack_offset := 0;
1139
1183
call_gc_sites := [];
1184
+ local_realloc_sites := [];
1140
1185
bound_error_sites := [];
1141
1186
for i = 0 to Proc.num_stack_slot_classes - 1 do
1142
1187
num_stack_slots.(i) <- fundecl.fun_num_stack_slots.(i);
@@ -1160,6 +1205,7 @@ let fundecl fundecl =
1160
1205
BR.relax fundecl.fun_body ~max_out_of_line_code_offset;
1161
1206
emit_all fundecl.fun_body;
1162
1207
List.iter emit_call_gc !call_gc_sites;
1208
+ List.iter emit_local_realloc !local_realloc_sites;
1163
1209
List.iter emit_call_bound_error !bound_error_sites;
1164
1210
assert (List.length !call_gc_sites = num_call_gc);
1165
1211
assert (List.length !bound_error_sites = num_check_bound);
0 commit comments