Skip to content

Enable locals for arm64 #2442

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 92 additions & 44 deletions backend/arm64/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,22 @@ let emit_call_gc gc =
`{emit_label gc.gc_lbl}: bl {emit_symbol "caml_call_gc"}\n`;
`{emit_label gc.gc_frame_lbl}: b {emit_label gc.gc_return_lbl}\n`

(* Record calls to local stack reallocation *)

type local_realloc_call =
{ lr_lbl: label;
lr_return_lbl: label;
lr_dbg: Debuginfo.t
}

let local_realloc_sites = ref ([] : local_realloc_call list)

let emit_local_realloc lr =
`{emit_label lr.lr_lbl}:\n`;
` {emit_debug_info lr.lr_dbg}\n`;
` bl {emit_symbol "caml_call_local_realloc"}\n`;
` b {emit_label lr.lr_return_lbl}\n`

(* Names of various instructions *)

let name_for_comparison = function
Expand Down Expand Up @@ -374,7 +390,7 @@ let num_call_gc_points instr =
let rec loop instr call_gc =
match instr.desc with
| Lend -> call_gc
| Lop (Ialloc _) when !fastcode_flag ->
| Lop (Ialloc { mode = Alloc_heap; _ }) when !fastcode_flag ->
loop instr.next (call_gc + 1)
| Lop (Ipoll _) ->
loop instr.next (call_gc + 1)
Expand Down Expand Up @@ -480,6 +496,7 @@ module BR = Branch_relaxation.Make (struct
| _ -> 0
and single = match memory_chunk with Single -> 2 | _ -> 1 in
based + barrier + single
| Lop (Ialloc { mode = Alloc_local; _ }) -> 9
| Lop (Ialloc _) when !fastcode_flag -> 5
| Lop (Ispecific (Ifar_alloc _)) when !fastcode_flag -> 6
| Lop (Ipoll _) -> 3
Expand All @@ -491,8 +508,7 @@ module BR = Branch_relaxation.Make (struct
| _ -> 1 + num_instructions_for_intconst (Nativeint.of_int num_bytes)
end
| Lop (Icsel _) -> 4
| Lop (Ibeginregion | Iendregion) ->
Misc.fatal_error "Local allocations not supported on this architecture"
| Lop (Ibeginregion | Iendregion) -> 1
| Lop (Iintop (Icomp _)) -> 2
| Lop (Ifloatop (Icompf _)) -> 2
| Lop (Iintop_imm (Icomp _, _)) -> 2
Expand Down Expand Up @@ -569,44 +585,68 @@ let name_for_float_comparison = function

(* Output the assembly code for allocation. *)

let assembly_code_for_allocation i ~n ~far ~dbginfo =
let lbl_frame =
record_frame_label i.live (Dbg_alloc dbginfo)
in
if !fastcode_flag then begin
let lbl_after_alloc = new_label() in
let lbl_call_gc = new_label() in
(* n is at most Max_young_whsize * 8, i.e. currently 0x808,
so it is reasonable to assume n < 0x1_000. This makes
the generated code simpler. *)
assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`;
` sub {emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int n}\n`;
` cmp {emit_reg reg_alloc_ptr}, {emit_reg reg_tmp1}\n`;
if not far then begin
` b.lo {emit_label lbl_call_gc}\n`
end else begin
let lbl = new_label () in
` b.cs {emit_label lbl}\n`;
` b {emit_label lbl_call_gc}\n`;
`{emit_label lbl}:\n`
end;
`{emit_label lbl_after_alloc}:`;
` add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`;
call_gc_sites :=
{ gc_lbl = lbl_call_gc;
gc_return_lbl = lbl_after_alloc;
gc_frame_lbl = lbl_frame } :: !call_gc_sites
let assembly_code_for_allocation i ~local ~n ~far ~dbginfo =
if local then begin
let r = i.res.(0) in
let module DS = Domainstate in
let domain_local_sp_offset = DS.(idx_of_field Domain_local_sp) * 8 in
let domain_local_limit_offset = DS.(idx_of_field Domain_local_limit) * 8 in
let domain_local_top_offset = DS.(idx_of_field Domain_local_top) * 8 in
` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_limit_offset}]\n`;
` ldr {emit_reg r}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_sp_offset}]\n`;
` sub {emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
` str {emit_reg r}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_sp_offset}]\n`;
` cmp {emit_reg r}, {emit_reg reg_tmp1}\n`;
let lbl_call = new_label () in
` b.lt {emit_label lbl_call}\n`;
let lbl_after_alloc = new_label () in
`{emit_label lbl_after_alloc}:\n`;
` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_top_offset}]\n`;
` add {emit_reg r}, {emit_reg r}, {emit_reg reg_tmp1}\n`;
` add {emit_reg r}, {emit_reg r}, #{emit_int 8}\n`;
local_realloc_sites :=
{ lr_lbl = lbl_call;
lr_dbg = i.dbg;
lr_return_lbl = lbl_after_alloc } :: !local_realloc_sites
end else begin
begin match n with
| 16 -> ` bl {emit_symbol "caml_alloc1"}\n`
| 24 -> ` bl {emit_symbol "caml_alloc2"}\n`
| 32 -> ` bl {emit_symbol "caml_alloc3"}\n`
| _ -> emit_intconst reg_x8 (Nativeint.of_int n);
` bl {emit_symbol "caml_allocN"}\n`
end;
`{emit_label lbl_frame}: add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`
let lbl_frame =
record_frame_label i.live (Dbg_alloc dbginfo)
in
if !fastcode_flag then begin
let lbl_after_alloc = new_label() in
let lbl_call_gc = new_label() in
(* n is at most Max_young_whsize * 8, i.e. currently 0x808,
so it is reasonable to assume n < 0x1_000. This makes
the generated code simpler. *)
assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`;
` sub {emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int n}\n`;
` cmp {emit_reg reg_alloc_ptr}, {emit_reg reg_tmp1}\n`;
if not far then begin
` b.lo {emit_label lbl_call_gc}\n`
end else begin
let lbl = new_label () in
` b.cs {emit_label lbl}\n`;
` b {emit_label lbl_call_gc}\n`;
`{emit_label lbl}:\n`
end;
`{emit_label lbl_after_alloc}:`;
` add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`;
call_gc_sites :=
{ gc_lbl = lbl_call_gc;
gc_return_lbl = lbl_after_alloc;
gc_frame_lbl = lbl_frame } :: !call_gc_sites
end else begin
begin match n with
| 16 -> ` bl {emit_symbol "caml_alloc1"}\n`
| 24 -> ` bl {emit_symbol "caml_alloc2"}\n`
| 32 -> ` bl {emit_symbol "caml_alloc3"}\n`
| _ -> emit_intconst reg_x8 (Nativeint.of_int n);
` bl {emit_symbol "caml_allocN"}\n`
end;
`{emit_label lbl_frame}: add {emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`
end
end

let assembly_code_for_poll i ~far ~return_label =
Expand Down Expand Up @@ -851,11 +891,17 @@ let emit_instr i =
fatal_error "arm64: got 128 bit memory chunk"
end
| Lop(Ialloc { bytes = n; dbginfo; mode = Alloc_heap }) ->
assembly_code_for_allocation i ~n ~far:false ~dbginfo
assembly_code_for_allocation i ~n ~local:false ~far:false ~dbginfo
| Lop(Ispecific (Ifar_alloc { bytes = n; dbginfo })) ->
assembly_code_for_allocation i ~n ~far:true ~dbginfo
| Lop(Ialloc { mode = Alloc_local } | Ibeginregion | Iendregion) ->
Misc.fatal_error "Local allocations not supported on this architecture"
assembly_code_for_allocation i ~n ~local:false ~far:true ~dbginfo
| Lop(Ialloc { bytes = n; dbginfo; mode = Alloc_local }) ->
assembly_code_for_allocation i ~n ~local:true ~far:false ~dbginfo
| Lop(Ibeginregion) ->
let offset = Domainstate.(idx_of_field Domain_local_sp) * 8 in
` ldr {emit_reg i.res.(0)}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`
| Lop(Iendregion) ->
let offset = Domainstate.(idx_of_field Domain_local_sp) * 8 in
` str {emit_reg i.arg.(0)}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`
| Lop(Ipoll { return_label }) ->
assembly_code_for_poll i ~far:false ~return_label
| Lop(Ispecific (Ifar_poll { return_label })) ->
Expand Down Expand Up @@ -1113,6 +1159,7 @@ let fundecl fundecl =
float_literals := [];
stack_offset := 0;
call_gc_sites := [];
local_realloc_sites := [];
for i = 0 to Proc.num_stack_slot_classes - 1 do
num_stack_slots.(i) <- fundecl.fun_num_stack_slots.(i);
done;
Expand Down Expand Up @@ -1172,6 +1219,7 @@ let fundecl fundecl =

emit_all fundecl.fun_body;
List.iter emit_call_gc !call_gc_sites;
List.iter emit_local_realloc !local_realloc_sites;
assert (List.length !call_gc_sites = num_call_gc);
(match fun_end_label with
| None -> ()
Expand Down
23 changes: 23 additions & 0 deletions ocaml/runtime/arm64.S
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,29 @@ FUNCTION(caml_allocN)
CFI_ENDPROC
END_FUNCTION(caml_allocN)

/* Reallocate the locals stack. This is like caml_call_gc, above. */
FUNCTION(caml_call_local_realloc)
CFI_STARTPROC
L(caml_call_local_realloc):
/* Save return address and frame pointer */
CFI_OFFSET(29, -16)
CFI_OFFSET(30, -8)
stp x29, x30, [sp, -16]!
CFI_ADJUST(16)
add x29, sp, #0
/* Store all registers (including ALLOC_PTR & TRAP_PTR) */
SAVE_ALL_REGS
SWITCH_OCAML_TO_C
/* Call the runtime to reallocate the local stack */
bl G(caml_local_realloc)
SWITCH_C_TO_OCAML
RESTORE_ALL_REGS
/* Free stack space and return to caller */
ldp x29, x30, [sp], 16
ret
CFI_ENDPROC
END_FUNCTION(caml_call_gc)

/* Call a C function from OCaml */
/* Function to call is in ADDITIONAL_ARG */

Expand Down
85 changes: 85 additions & 0 deletions ocaml/runtime4/arm64.S
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,91 @@ FUNCTION(caml_allocN)
CFI_ENDPROC
END_FUNCTION(caml_allocN)

FUNCTION(caml_call_local_realloc)
L(caml_call_local_realloc):
CFI_STARTPROC
/* Set up stack space, saving return address and frame pointer */
/* Store return address and frame pointer */
/* (2 RA/GP, 24 allocatable int regs, 24 caller-saved float regs) * 8 */
CFI_OFFSET(29,-400)
CFI_OFFSET(30,-392)
stp x29, x30, [sp,-400]! /* pre-indexing stp */
CFI_ADJUST(400)
add x29, sp, #0

/* Save allocatable integer registers on the stack, using order in proc.ml */
stp x0, x1, [sp, 16]
stp x2, x3, [sp, 32]
stp x4, x5, [sp, 48]
stp x6, x7, [sp, 64]
stp x8, x9, [sp, 80]
stp x10, x11, [sp, 96]
stp x12, x13, [sp, 112]
stp x14, x15, [sp, 128]
stp x19, x20, [sp, 144]
stp x21, x22, [sp, 160]
stp x23, x24, [sp, 176]
str x25, [sp, 192]

/* Save caller saved floating-point registers on the stack */
stp d0, d1, [sp, 208]
stp d2, d3, [sp, 224]
stp d4, d5, [sp, 240]
stp d6, d7, [sp, 256]
stp d16, d17, [sp, 272]
stp d18, d19, [sp, 288]
stp d20, d21, [sp, 304]
stp d22, d23, [sp, 320]
stp d24, d25, [sp, 336]
stp d26, d27, [sp, 352]
stp d28, d29, [sp, 368]
stp d30, d31, [sp, 384]

/* Store pointer to saved integer registers in Caml_state->gc_regs */
add TMP, sp, #16
str TMP, Caml_state(gc_regs)

/* Save current allocation pointer for debugging purposes */
str ALLOC_PTR, Caml_state(young_ptr)

/* Call the realloc function */
bl G(caml_local_realloc)

/* Restore registers */
ldp x0, x1, [sp, 16]
ldp x2, x3, [sp, 32]
ldp x4, x5, [sp, 48]
ldp x6, x7, [sp, 64]
ldp x8, x9, [sp, 80]
ldp x10, x11, [sp, 96]
ldp x12, x13, [sp, 112]
ldp x14, x15, [sp, 128]
ldp x19, x20, [sp, 144]
ldp x21, x22, [sp, 160]
ldp x23, x24, [sp, 176]
ldr x25, [sp, 192]
ldp d0, d1, [sp, 208]
ldp d2, d3, [sp, 224]
ldp d4, d5, [sp, 240]
ldp d6, d7, [sp, 256]
ldp d16, d17, [sp, 272]
ldp d18, d19, [sp, 288]
ldp d20, d21, [sp, 304]
ldp d22, d23, [sp, 320]
ldp d24, d25, [sp, 336]
ldp d26, d27, [sp, 352]
ldp d28, d29, [sp, 368]
ldp d30, d31, [sp, 384]

/* Reload new allocation pointer */
ldr ALLOC_PTR, Caml_state(young_ptr)

/* Free stack space and return to caller */
ldp x29, x30, [sp], 400
ret
CFI_ENDPROC
END_FUNCTION(caml_call_local_realloc)

/* Call a C function from OCaml */
/* Function to call is in ADDITIONAL_ARG */

Expand Down