Skip to content

Commit 0694909

Browse files
committed
Implement local allocation for arm64
This patch implements local allocation by adding code emission for local allocation, regions and assembly routine for calling stack relocation.
1 parent b24a074 commit 0694909

File tree

3 files changed

+187
-51
lines changed

3 files changed

+187
-51
lines changed

backend/arm64/emit.mlp

Lines changed: 90 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,22 @@ let emit_call_gc gc =
185185
`{emit_label gc.gc_lbl}: bl {emit_symbol "caml_call_gc"}\n`;
186186
`{emit_label gc.gc_frame_lbl}: b {emit_label gc.gc_return_lbl}\n`
187187

188+
(* Record calls to local stack reallocation *)
189+
190+
type local_realloc_call =
191+
{ lr_lbl: label;
192+
lr_return_lbl: label;
193+
lr_dbg: Debuginfo.t
194+
}
195+
196+
let local_realloc_sites = ref ([] : local_realloc_call list)
197+
198+
let emit_local_realloc lr =
199+
`{emit_label lr.lr_lbl}:\n`;
200+
` {emit_debug_info lr.lr_dbg}\n`;
201+
` bl {emit_symbol "caml_call_local_realloc"}\n`;
202+
` b {emit_label lr.lr_return_lbl}\n`
203+
188204
(* Record calls to caml_ml_array_bound_error.
189205
In debug mode, we maintain one call to caml_ml_array_bound_error
190206
per bound check site. Otherwise, we can share a single call. *)
@@ -397,7 +413,7 @@ let num_call_gc_and_check_bound_points instr =
397413
let rec loop instr ((call_gc, check_bound) as totals) =
398414
match instr.desc with
399415
| Lend -> totals
400-
| Lop (Ialloc _) when !fastcode_flag ->
416+
| Lop (Ialloc {mode = Alloc_heap; _}) when !fastcode_flag ->
401417
loop instr.next (call_gc + 1, check_bound)
402418
| Lop (Ipoll _) ->
403419
loop instr.next (call_gc + 1, check_bound)
@@ -523,8 +539,7 @@ module BR = Branch_relaxation.Make (struct
523539
| _ -> 1 + num_instructions_for_intconst (Nativeint.of_int num_bytes)
524540
end
525541
| Lop (Icsel _) -> 4
526-
| Lop (Ibeginregion | Iendregion) ->
527-
Misc.fatal_error "Local allocations not supported on this architecture"
542+
| Lop (Ibeginregion | Iendregion) -> 1
528543
| Lop (Iintop (Icomp _)) -> 2
529544
| Lop (Icompf _) -> 2
530545
| Lop (Iintop_imm (Icomp _, _)) -> 2
@@ -602,44 +617,67 @@ end)
602617

603618
(* Output the assembly code for allocation. *)
604619

605-
let assembly_code_for_allocation i ~n ~far ~dbginfo =
606-
let lbl_frame =
607-
record_frame_label i.live (Dbg_alloc dbginfo)
608-
in
609-
if !fastcode_flag then begin
610-
let lbl_after_alloc = new_label() in
611-
let lbl_call_gc = new_label() in
612-
(* n is at most Max_young_whsize * 8, i.e. currently 0x808,
613-
so it is reasonable to assume n < 0x1_000. This makes
614-
the generated code simpler. *)
615-
assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
616-
let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
617-
` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`;
618-
` sub {emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int n}\n`;
619-
` cmp {emit_reg reg_alloc_ptr}, {emit_reg reg_tmp1}\n`;
620-
if not far then begin
621-
` b.lo {emit_label lbl_call_gc}\n`
622-
end else begin
623-
let lbl = new_label () in
624-
` b.cs {emit_label lbl}\n`;
625-
` b {emit_label lbl_call_gc}\n`;
626-
`{emit_label lbl}:\n`
627-
end;
628-
`{emit_label lbl_after_alloc}:`;
629-
` add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`;
630-
call_gc_sites :=
631-
{ gc_lbl = lbl_call_gc;
632-
gc_return_lbl = lbl_after_alloc;
633-
gc_frame_lbl = lbl_frame } :: !call_gc_sites
620+
let assembly_code_for_allocation i ~local ~n ~far ~dbginfo =
621+
if local then begin
622+
let r = i.res.(0) in
623+
let domain_local_sp_offset = Domainstate.(idx_of_field Domain_local_sp) * 8 in
624+
let domain_local_limit_offset = Domainstate.(idx_of_field Domain_local_limit) * 8 in
625+
let domain_local_top_offset = Domainstate.(idx_of_field Domain_local_top) * 8 in
626+
` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_limit_offset}]\n`;
627+
` ldr {emit_reg r}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_sp_offset}]\n`;
628+
` sub {emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
629+
` str {emit_reg r}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_sp_offset}]\n`;
630+
` cmp {emit_reg r}, {emit_reg reg_tmp1}\n`;
631+
let lbl_call = new_label () in
632+
` b.le {emit_label lbl_call}\n`;
633+
let lbl_after_alloc = new_label () in
634+
`{emit_label lbl_after_alloc}:\n`;
635+
` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_top_offset}]\n`;
636+
` add {emit_reg r}, {emit_reg r}, {emit_reg reg_tmp1}\n`;
637+
` add {emit_reg r}, {emit_reg r}, #{emit_int 8}\n`;
638+
local_realloc_sites :=
639+
{ lr_lbl = lbl_call;
640+
lr_dbg = i.dbg;
641+
lr_return_lbl = lbl_after_alloc } :: !local_realloc_sites
634642
end else begin
635-
begin match n with
636-
| 16 -> ` bl {emit_symbol "caml_alloc1"}\n`
637-
| 24 -> ` bl {emit_symbol "caml_alloc2"}\n`
638-
| 32 -> ` bl {emit_symbol "caml_alloc3"}\n`
639-
| _ -> emit_intconst reg_x8 (Nativeint.of_int n);
640-
` bl {emit_symbol "caml_allocN"}\n`
641-
end;
642-
`{emit_label lbl_frame}: add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`
643+
let lbl_frame =
644+
record_frame_label i.live (Dbg_alloc dbginfo)
645+
in
646+
if !fastcode_flag then begin
647+
let lbl_after_alloc = new_label() in
648+
let lbl_call_gc = new_label() in
649+
(* n is at most Max_young_whsize * 8, i.e. currently 0x808,
650+
so it is reasonable to assume n < 0x1_000. This makes
651+
the generated code simpler. *)
652+
assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
653+
let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
654+
` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`;
655+
` sub {emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int n}\n`;
656+
` cmp {emit_reg reg_alloc_ptr}, {emit_reg reg_tmp1}\n`;
657+
if not far then begin
658+
` b.lo {emit_label lbl_call_gc}\n`
659+
end else begin
660+
let lbl = new_label () in
661+
` b.cs {emit_label lbl}\n`;
662+
` b {emit_label lbl_call_gc}\n`;
663+
`{emit_label lbl}:\n`
664+
end;
665+
`{emit_label lbl_after_alloc}:`;
666+
` add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`;
667+
call_gc_sites :=
668+
{ gc_lbl = lbl_call_gc;
669+
gc_return_lbl = lbl_after_alloc;
670+
gc_frame_lbl = lbl_frame } :: !call_gc_sites
671+
end else begin
672+
begin match n with
673+
| 16 -> ` bl {emit_symbol "caml_alloc1"}\n`
674+
| 24 -> ` bl {emit_symbol "caml_alloc2"}\n`
675+
| 32 -> ` bl {emit_symbol "caml_alloc3"}\n`
676+
| _ -> emit_intconst reg_x8 (Nativeint.of_int n);
677+
` bl {emit_symbol "caml_allocN"}\n`
678+
end;
679+
`{emit_label lbl_frame}: add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`
680+
end
643681
end
644682

645683
let assembly_code_for_poll i ~far ~return_label =
@@ -852,11 +890,17 @@ let emit_instr i =
852890
| Onetwentyeight -> fatal_error "arm64: got 128 bit memory chunk"
853891
end
854892
| Lop(Ialloc { bytes = n; dbginfo; mode = Alloc_heap }) ->
855-
assembly_code_for_allocation i ~n ~far:false ~dbginfo
893+
assembly_code_for_allocation i ~n ~local:false ~far:false ~dbginfo
856894
| Lop(Ispecific (Ifar_alloc { bytes = n; dbginfo })) ->
857-
assembly_code_for_allocation i ~n ~far:true ~dbginfo
858-
| Lop(Ialloc { mode = Alloc_local } | Ibeginregion | Iendregion) ->
859-
Misc.fatal_error "Local allocations not supported on this architecture"
895+
assembly_code_for_allocation i ~n ~local:false ~far:true ~dbginfo
896+
| Lop(Ialloc { bytes = n; dbginfo; mode = Alloc_local }) ->
897+
assembly_code_for_allocation i ~n ~local:true ~far:false ~dbginfo
898+
| Lop(Ibeginregion) ->
899+
let offset = Domainstate.(idx_of_field Domain_local_sp) * 8 in
900+
` ldr {emit_reg i.res.(0)}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`
901+
| Lop(Iendregion) ->
902+
let offset = Domainstate.(idx_of_field Domain_local_sp) * 8 in
903+
` str {emit_reg i.arg.(0)}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`
860904
| Lop(Ipoll { return_label }) ->
861905
assembly_code_for_poll i ~far:false ~return_label
862906
| Lop(Ispecific (Ifar_poll { return_label })) ->
@@ -1137,6 +1181,7 @@ let fundecl fundecl =
11371181
float_literals := [];
11381182
stack_offset := 0;
11391183
call_gc_sites := [];
1184+
local_realloc_sites := [];
11401185
bound_error_sites := [];
11411186
for i = 0 to Proc.num_stack_slot_classes - 1 do
11421187
num_stack_slots.(i) <- fundecl.fun_num_stack_slots.(i);
@@ -1160,6 +1205,7 @@ let fundecl fundecl =
11601205
BR.relax fundecl.fun_body ~max_out_of_line_code_offset;
11611206
emit_all fundecl.fun_body;
11621207
List.iter emit_call_gc !call_gc_sites;
1208+
List.iter emit_local_realloc !local_realloc_sites;
11631209
List.iter emit_call_bound_error !bound_error_sites;
11641210
assert (List.length !call_gc_sites = num_call_gc);
11651211
assert (List.length !bound_error_sites = num_check_bound);

ocaml/asmcomp/arm64/emit.mlp

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -476,8 +476,7 @@ module BR = Branch_relaxation.Make (struct
476476
| 16 | 24 | 32 -> 1
477477
| _ -> 1 + num_instructions_for_intconst (Nativeint.of_int num_bytes)
478478
end
479-
| Lop (Ibeginregion | Iendregion) ->
480-
Misc.fatal_error "Local allocations not supported on this architecture"
479+
| Lop (Ibeginregion | Iendregion) -> 1
481480
| Lop (Iintop (Icomp _)) -> 2
482481
| Lop (Iintop_imm (Icomp _, _)) -> 2
483482
| Lop (Iintop (Icheckbound)) -> 2
@@ -552,7 +551,7 @@ end)
552551

553552
(* Output the assembly code for allocation. *)
554553

555-
let assembly_code_for_allocation env i ~n ~far ~dbginfo =
554+
let assembly_code_for_allocation env i ~local ~n ~far ~dbginfo =
556555
let lbl_frame =
557556
record_frame_label env i.live (Dbg_alloc dbginfo)
558557
in
@@ -776,11 +775,17 @@ let emit_instr env i =
776775
` str {emit_reg src}, {emit_addressing addr base}\n`
777776
end
778777
| Lop(Ialloc { bytes = n; dbginfo; mode = Alloc_heap }) ->
779-
assembly_code_for_allocation env i ~n ~far:false ~dbginfo
778+
assembly_code_for_allocation env i ~n ~local:false ~far:false ~dbginfo
780779
| Lop(Ispecific (Ifar_alloc { bytes = n; dbginfo })) ->
781-
assembly_code_for_allocation env i ~n ~far:true ~dbginfo
782-
| Lop(Ialloc { mode = Alloc_local } | Ibeginregion | Iendregion) ->
783-
Misc.fatal_error "Local allocations not supported on this architecture"
780+
assembly_code_for_allocation env i ~n ~local:false ~far:true ~dbginfo
781+
| Lop(Ialloc { bytes = n; dbginfo; mode = Alloc_local }) ->
782+
assembly_code_for_allocation env i ~n ~local:false ~far:false ~dbginfo
783+
| Lop(Ibeginregion) ->
784+
let offset = Domainstate.(idx_of_field Domain_local_sp) * 8 in
785+
` ldr {emit_reg i.res.(0)}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`
786+
| Lop(Iendregion) ->
787+
let offset = Domainstate.(idx_of_field Domain_local_sp) * 8 in
788+
` str {emit_reg i.res.(0)}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`
784789
| Lop(Ipoll { return_label }) ->
785790
assembly_code_for_poll env i ~far:false ~return_label
786791
| Lop(Ispecific (Ifar_poll { return_label })) ->

ocaml/runtime/arm64.S

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,91 @@ FUNCTION(caml_allocN)
280280
CFI_ENDPROC
281281
END_FUNCTION(caml_allocN)
282282

283+
FUNCTION(caml_call_local_realloc)
284+
L(caml_call_local_realloc):
285+
CFI_STARTPROC
286+
/* Set up stack space, saving return address and frame pointer */
287+
/* Store return address and frame pointer */
288+
/* (2 RA/GP, 24 allocatable int regs, 24 caller-saved float regs) * 8 */
289+
CFI_OFFSET(29,-400)
290+
CFI_OFFSET(30,-392)
291+
stp x29, x30, [sp,-400]! /* pre-indexing stp */
292+
CFI_ADJUST(400)
293+
add x29, sp, #0
294+
295+
/* Save allocatable integer registers on the stack, using order in proc.ml */
296+
stp x0, x1, [sp, 16]
297+
stp x2, x3, [sp, 32]
298+
stp x4, x5, [sp, 48]
299+
stp x6, x7, [sp, 64]
300+
stp x8, x9, [sp, 80]
301+
stp x10, x11, [sp, 96]
302+
stp x12, x13, [sp, 112]
303+
stp x14, x15, [sp, 128]
304+
stp x19, x20, [sp, 144]
305+
stp x21, x22, [sp, 160]
306+
stp x23, x24, [sp, 176]
307+
str x25, [sp, 192]
308+
309+
/* Save caller saved floating-point registers on the stack */
310+
stp d0, d1, [sp, 208]
311+
stp d2, d3, [sp, 224]
312+
stp d4, d5, [sp, 240]
313+
stp d6, d7, [sp, 256]
314+
stp d16, d17, [sp, 272]
315+
stp d18, d19, [sp, 288]
316+
stp d20, d21, [sp, 304]
317+
stp d22, d23, [sp, 320]
318+
stp d24, d25, [sp, 336]
319+
stp d26, d27, [sp, 352]
320+
stp d28, d29, [sp, 368]
321+
stp d30, d31, [sp, 384]
322+
323+
/* Store pointer to saved integer registers in Caml_state->gc_regs */
324+
add TMP, sp, #16
325+
str TMP, Caml_state(gc_regs)
326+
327+
/* Save current allocation pointer for debugging purposes */
328+
str ALLOC_PTR, Caml_state(young_ptr)
329+
330+
/* Call the realloc function */
331+
bl G(caml_local_realloc)
332+
333+
/* Restore registers */
334+
ldp x0, x1, [sp, 16]
335+
ldp x2, x3, [sp, 32]
336+
ldp x4, x5, [sp, 48]
337+
ldp x6, x7, [sp, 64]
338+
ldp x8, x9, [sp, 80]
339+
ldp x10, x11, [sp, 96]
340+
ldp x12, x13, [sp, 112]
341+
ldp x14, x15, [sp, 128]
342+
ldp x19, x20, [sp, 144]
343+
ldp x21, x22, [sp, 160]
344+
ldp x23, x24, [sp, 176]
345+
ldr x25, [sp, 192]
346+
ldp d0, d1, [sp, 208]
347+
ldp d2, d3, [sp, 224]
348+
ldp d4, d5, [sp, 240]
349+
ldp d6, d7, [sp, 256]
350+
ldp d16, d17, [sp, 272]
351+
ldp d18, d19, [sp, 288]
352+
ldp d20, d21, [sp, 304]
353+
ldp d22, d23, [sp, 320]
354+
ldp d24, d25, [sp, 336]
355+
ldp d26, d27, [sp, 352]
356+
ldp d28, d29, [sp, 368]
357+
ldp d30, d31, [sp, 384]
358+
359+
/* Reload new allocation pointer */
360+
ldr ALLOC_PTR, Caml_state(young_ptr)
361+
362+
/* Free stack space and return to caller */
363+
ldp x29, x30, [sp], 400
364+
ret
365+
CFI_ENDPROC
366+
END_FUNCTION(caml_call_local_realloc)
367+
283368
/* Call a C function from OCaml */
284369
/* Function to call is in ADDITIONAL_ARG */
285370

0 commit comments

Comments
 (0)