From d01e6f4b336bd4d275c8d8ceba64015bb9a761d0 Mon Sep 17 00:00:00 2001
From: Xavier Leroy <xavier.leroy@college-de-france.fr>
Date: Thu, 3 Sep 2020 17:31:15 +0200
Subject: [PATCH] Do not cache young_limit in a processor register (upstream PR
 9876)

On target architectures with 32 or more registers,
a register was used to cache the value of the young_limit field
of the domain state.  This reduced the size and execution time
of the code for inlined allocations.

However, this usage is problematic with respect to polling for signals
and to inter-domain communication in Multicore OCaml, because it is
often not possible to change the value of the register when we change
young_limit.  So, the change to young_limit doesn't take effect
immediately, only when the register is reloaded from young_limit.

  - Remove the caching of young_limit in a register from the
    ARM64, PowerPC and RISC-V ports.

  - Recycle the former "young limit" register, giving one more allocatable register

    Now that we have an unused callee-save register on ARM64, PowerPC, and RISC-V,
    make it available for register allocation.

  - Assorted cleanups in runtime/*.S and in asmcomp/*/proc.ml

  - ARM64: wrong register pressure limits for Iextcall
    There are only 7 callee-save integer registers (x19 to x25), not 10.
---
 backend/arm64/emit.mlp        |  21 ++++---
 backend/arm64/proc.ml         |  30 +++++-----
 backend/power/emit.mlp        |  48 ++++++++-------
 backend/power/proc.ml         |  19 +++---
 backend/riscv/emit.mlp        |   9 +--
 backend/riscv/proc.ml         |  40 +++++++------
 ocaml/asmcomp/arm64/emit.mlp  |  21 ++++---
 ocaml/asmcomp/arm64/proc.ml   |  30 +++++-----
 ocaml/asmcomp/power/emit.mlp  |  48 ++++++++-------
 ocaml/asmcomp/power/proc.ml   |  19 +++---
 ocaml/asmcomp/riscv/emit.mlp  |   9 +--
 ocaml/asmcomp/riscv/proc.ml   |  40 +++++++------
 ocaml/runtime/arm64.S         |  29 +++++----
 ocaml/runtime/power.S         |  15 ++---
 ocaml/runtime/riscv.S         | 108 ++++++++++++++++------------------
 ocaml/runtime/signals_nat.c   |   7 ---
 ocaml/runtime/signals_osdep.h |   6 --
 17 files changed, 247 insertions(+), 252 deletions(-)

diff --git a/backend/arm64/emit.mlp b/backend/arm64/emit.mlp
index d4b0680898b..210a69f75de 100644
--- a/backend/arm64/emit.mlp
+++ b/backend/arm64/emit.mlp
@@ -33,12 +33,11 @@ let fastcode_flag = ref true
 
 (* Names for special regs *)
 
-let reg_domain_state_ptr = phys_reg 22
-let reg_trap_ptr = phys_reg 23
-let reg_alloc_ptr = phys_reg 24
-let reg_alloc_limit = phys_reg 25
-let reg_tmp1 = phys_reg 26
-let reg_x8 = phys_reg 8
+let reg_domain_state_ptr = phys_reg 25 (* x28 *)
+let reg_trap_ptr = phys_reg 23 (* x26 *)
+let reg_alloc_ptr = phys_reg 24 (* x27 *)
+let reg_tmp1 = phys_reg 26 (* x16 *)
+let reg_x8 = phys_reg 8 (* x8 *)
 
 (* Output a label *)
 
@@ -504,10 +503,8 @@ module BR = Branch_relaxation.Make (struct
     | Lop (Iload (size, addr)) | Lop (Istore (size, addr, _)) ->
       let based = match addr with Iindexed _ -> 0 | Ibased _ -> 1 in
       based + begin match size with Single -> 2 | _ -> 1 end
-    | Lop (Ialloc {bytes = num_bytes}) when !fastcode_flag ->
-      if num_bytes <= 0xFFF then 4 else 5
-    | Lop (Ispecific (Ifar_alloc {bytes = num_bytes})) when !fastcode_flag ->
-      if num_bytes <= 0xFFF then 5 else 6
+    | Lop (Ialloc _) when !fastcode_flag -> 5
+    | Lop (Ispecific (Ifar_alloc _)) when !fastcode_flag -> 6
     | Lop (Ialloc { bytes = num_bytes; _ })
     | Lop (Ispecific (Ifar_alloc { bytes = num_bytes; _ })) ->
       begin match num_bytes with
@@ -597,8 +594,10 @@ let assembly_code_for_allocation i ~n ~far ~dbginfo =
        so it is reasonable to assume n < 0x1_000.  This makes
        the generated code simpler. *)
     assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
+    let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
+    `	ldr	{emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`;
     `	sub	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int n}\n`;
-    `	cmp	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_limit}\n`;
+    `	cmp	{emit_reg reg_alloc_ptr}, {emit_reg reg_tmp1}\n`;
     if not far then begin
       `	b.lo	{emit_label lbl_call_gc}\n`
     end else begin
diff --git a/backend/arm64/proc.ml b/backend/arm64/proc.ml
index 03500512b7d..0a093c56f4b 100644
--- a/backend/arm64/proc.ml
+++ b/backend/arm64/proc.ml
@@ -34,11 +34,10 @@ let word_addressed = false
     x0 - x15              general purpose (caller-save)
     x16, x17              temporaries (used by call veeners)
     x18                   platform register (reserved)
-    x19 - x24             general purpose (callee-save)
-    x25                   domain state pointer
+    x19 - x25             general purpose (callee-save)
     x26                   trap pointer
     x27                   alloc pointer
-    x28                   alloc limit
+    x28                   domain state pointer
     x29                   frame pointer
     x30                   return address
     sp / xzr              stack pointer / zero register
@@ -49,10 +48,11 @@ let word_addressed = false
 *)
 
 let int_reg_name =
-  [| "x0";  "x1";  "x2";  "x3";  "x4";  "x5";  "x6";  "x7";
-     "x8";  "x9";  "x10"; "x11"; "x12"; "x13"; "x14"; "x15";
-     "x19"; "x20"; "x21"; "x22"; "x23"; "x24";
-     "x25"; "x26"; "x27"; "x28"; "x16"; "x17" |]
+  [| "x0";  "x1";  "x2";  "x3";  "x4";  "x5";  "x6";  "x7";  (* 0 - 7 *)
+     "x8";  "x9";  "x10"; "x11"; "x12"; "x13"; "x14"; "x15"; (* 8 - 15 *)
+     "x19"; "x20"; "x21"; "x22"; "x23"; "x24"; "x25";        (* 16 - 22 *)
+     "x26"; "x27"; "x28";                                    (* 23 - 25 *)
+     "x16"; "x17" |]                                         (* 26 - 27 *)
 
 let float_reg_name =
   [| "d0";  "d1";  "d2";  "d3";  "d4";  "d5";  "d6";  "d7";
@@ -68,7 +68,7 @@ let register_class r =
   | Float -> 1
 
 let num_available_registers =
-  [| 22; 32 |] (* first 22 int regs allocatable; all float regs allocatable *)
+  [| 23; 32 |] (* first 23 int regs allocatable; all float regs allocatable *)
 
 let first_available_register =
   [| 0; 100 |]
@@ -270,16 +270,16 @@ let destroyed_at_reloadretaddr = [| |]
 (* Maximal register pressure *)
 
 let safe_register_pressure = function
-  | Iextcall _ -> 8
-  | Ialloc _ -> 24
-  | _ -> 25
+  | Iextcall _ -> 7
+  | Ialloc _ -> 22
+  | _ -> 23
 
 let max_register_pressure = function
-  | Iextcall _ -> [| 10; 8 |]
-  | Ialloc _ -> [| 24; 32 |]
+  | Iextcall _ -> [| 7; 8 |]  (* 7 integer callee-saves, 8 FP callee-saves *)
+  | Ialloc _ -> [| 22; 32 |]
   | Iintoffloat | Ifloatofint
-  | Iload(Single, _) | Istore(Single, _, _) -> [| 25; 31 |]
-  | _ -> [| 25; 32 |]
+  | Iload(Single, _) | Istore(Single, _, _) -> [| 23; 31 |]
+  | _ -> [| 23; 32 |]
 
 (* Pure operations (without any side effect besides updating their result
    registers). *)
diff --git a/backend/power/emit.mlp b/backend/power/emit.mlp
index 40f9a104c48..d921b728d34 100644
--- a/backend/power/emit.mlp
+++ b/backend/power/emit.mlp
@@ -508,8 +508,8 @@ module BR = Branch_relaxation.Make (struct
       then load_store_size addr + 1
       else load_store_size addr
     | Lop(Istore(_chunk, addr, _)) -> load_store_size addr
-    | Lop(Ialloc _) -> 4
-    | Lop(Ispecific(Ialloc_far _)) -> 5
+    | Lop(Ialloc _) -> 5
+    | Lop(Ispecific(Ialloc_far _)) -> 6
     | Lop(Iintop Imod) -> 3
     | Lop(Iintop(Icomp _)) -> 4
     | Lop(Icompf _) -> 5
@@ -550,6 +550,26 @@ module BR = Branch_relaxation.Make (struct
   let relax_intop_imm_checkbound ~bound:_ = assert false
 end)
 
+(* Assembly code for inlined allocation *)
+
+let emit_alloc i bytes dbginfo far =
+  if !call_gc_label = 0 then call_gc_label := new_label ();
+  let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
+  `	{emit_string lg}	0, {emit_int offset}(30)\n`;
+  `	addi    31, 31, {emit_int(-bytes)}\n`;
+  `	{emit_string cmplg}	31, 0\n`;
+  if not far then begin
+    `	bltl	{emit_label !call_gc_label}\n`;
+    record_frame i.live (Dbg_alloc dbginfo);
+    `	addi	{emit_reg i.res.(0)}, 31, {emit_int size_addr}\n`
+  end else begin
+    let lbl = new_label() in
+    `	bge	{emit_label lbl}\n`;
+    `	bl	{emit_label !call_gc_label}\n`;
+    record_frame i.live (Dbg_alloc dbginfo);
+    `{emit_label lbl}:	addi	{emit_reg i.res.(0)}, 31, {emit_int size_addr}\n`
+  end
+
 (* Output the assembly code for an instruction *)
 
 let emit_instr i =
@@ -782,22 +802,10 @@ let emit_instr i =
           | Single -> "stfs"
           | Double -> "stfd" in
         emit_load_store storeinstr addr i.arg 1 i.arg.(0)
-    | Lop(Ialloc { bytes = n; dbginfo }) ->
-        if !call_gc_label = 0 then call_gc_label := new_label ();
-        `	addi    31, 31, {emit_int(-n)}\n`;
-        `	{emit_string cmplg}	31, 30\n`;
-        `	bltl	{emit_label !call_gc_label}\n`;
-        record_frame i.live (Dbg_alloc dbginfo);
-        `	addi	{emit_reg i.res.(0)}, 31, {emit_int size_addr}\n`;
-    | Lop(Ispecific(Ialloc_far { bytes = n; dbginfo })) ->
-        if !call_gc_label = 0 then call_gc_label := new_label ();
-        let lbl = new_label() in
-        `	addi    31, 31, {emit_int(-n)}\n`;
-        `	{emit_string cmplg}	31, 30\n`;
-        `	bge	{emit_label lbl}\n`;
-        `	bl	{emit_label !call_gc_label}\n`;
-        record_frame i.live (Dbg_alloc dbginfo);
-        `{emit_label lbl}:	addi	{emit_reg i.res.(0)}, 31, {emit_int size_addr}\n`
+    | Lop(Ialloc { bytes; dbginfo }) ->
+        emit_alloc i bytes dbginfo false
+    | Lop(Ispecific(Ialloc_far { bytes; dbginfo })) ->
+        emit_alloc i bytes dbginfo true
     | Lop(Iintop Isub) ->               (* subfc has swapped arguments *)
         `	subfc	{emit_reg i.res.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(0)}\n`
     | Lop(Iintop Imod) ->
@@ -1009,8 +1017,8 @@ let emit_instr i =
               Domainstate.(idx_of_field Domain_backtrace_pos)
             in
             begin match abi with
-            | ELF32 -> `	stw	0, {emit_int (backtrace_pos * 8)}(28)\n`
-            | _ -> `	std	0, {emit_int (backtrace_pos * 8)}(28)\n`
+            | ELF32 -> `	stw	0, {emit_int (backtrace_pos * 8)}(30)\n`
+            | _ -> `	std	0, {emit_int (backtrace_pos * 8)}(30)\n`
             end;
             emit_call "caml_raise_exn";
             record_frame Reg.Set.empty (Dbg_raise i.dbg);
diff --git a/backend/power/proc.ml b/backend/power/proc.ml
index 02e21b993cd..0de9daa9fed 100644
--- a/backend/power/proc.ml
+++ b/backend/power/proc.ml
@@ -35,10 +35,9 @@ let word_addressed = false
     3 - 10              function arguments and results
     11 - 12             temporaries
     13                  pointer to small data area
-    14 - 27             general purpose, preserved by C
-    28                  domain state pointer
+    14 - 28             general purpose, preserved by C
     29                  trap pointer
-    30                  allocation limit
+    30                  domain state pointer
     31                  allocation pointer
   Floating-point register map:
     0                   temporary
@@ -47,9 +46,9 @@ let word_addressed = false
 *)
 
 let int_reg_name =
-  [| "3"; "4"; "5"; "6"; "7"; "8"; "9"; "10";
-     "14"; "15"; "16"; "17"; "18"; "19"; "20"; "21";
-     "22"; "23"; "24"; "25"; "26"; "27" |]
+  [| "3"; "4"; "5"; "6"; "7"; "8"; "9"; "10";           (* 0 - 7 *)
+     "14"; "15"; "16"; "17"; "18"; "19"; "20"; "21";    (* 8 - 15 *)
+     "22"; "23"; "24"; "25"; "26"; "27"; "28" |]        (* 16 - 22 *)
 
 let float_reg_name =
   [| "1"; "2"; "3"; "4"; "5"; "6"; "7"; "8";
@@ -64,7 +63,7 @@ let register_class r =
   | Val | Int | Addr -> 0
   | Float -> 1
 
-let num_available_registers = [| 22; 31 |]
+let num_available_registers = [| 23; 31 |]
 
 let first_available_register = [| 0; 100 |]
 
@@ -76,7 +75,7 @@ let rotate_registers = true
 (* Representation of hard registers by pseudo-registers *)
 
 let hard_int_reg =
-  let v = Array.make 22 Reg.dummy in
-  for i = 0 to 21 do v.(i) <- Reg.at_location Int (Reg i) done; v
+  let v = Array.make 23 Reg.dummy in
+  for i = 0 to 22 do v.(i) <- Reg.at_location Int (Reg i) done; v
 
 let hard_float_reg =
@@ -315,11 +314,11 @@ let destroyed_at_reloadretaddr = [| phys_reg 11 |]
 
 let safe_register_pressure = function
     Iextcall _ -> 14
-  | _ -> 22
+  | _ -> 23
 
 let max_register_pressure = function
     Iextcall _ -> [| 14; 18 |]
-  | _ -> [| 22; 30 |]
+  | _ -> [| 23; 30 |]
 
 (* Pure operations (without any side effect besides updating their result
    registers). *)
diff --git a/backend/riscv/emit.mlp b/backend/riscv/emit.mlp
index 809633a4cfb..1fd616c8cd6 100644
--- a/backend/riscv/emit.mlp
+++ b/backend/riscv/emit.mlp
@@ -82,12 +82,11 @@ let rodata_space =
 
 (* Names for special regs *)
 
-let reg_tmp = phys_reg 22
+let reg_tmp = phys_reg 23
 let reg_t2 = phys_reg 16
-let reg_domain_state_ptr = phys_reg 23
+let reg_domain_state_ptr = phys_reg 26
 let reg_trap = phys_reg 24
 let reg_alloc_ptr = phys_reg 25
-let reg_alloc_lim = phys_reg 26
 
 (* Output a pseudo-register *)
 
@@ -392,13 +391,15 @@ let emit_instr i =
       let lbl_after_alloc = new_label () in
       let lbl_call_gc = new_label () in
       let n = -bytes in
+      let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
       if is_immediate n then
         `	addi	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, {emit_int n}\n`
       else begin
         `	li	{emit_reg reg_tmp}, {emit_int n}\n`;
         `	add	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, {emit_reg reg_tmp}\n`
       end;
-      `	bltu	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_lim}, {emit_label lbl_call_gc}\n`;
+      `	ld	{emit_reg reg_tmp}, {emit_int offset}({emit_reg reg_domain_state_ptr})\n`;
+      `	bltu	{emit_reg reg_alloc_ptr}, {emit_reg reg_tmp}, {emit_label lbl_call_gc}\n`;
       `{emit_label lbl_after_alloc}:\n`;
       `	addi	{emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, {emit_int size_addr}\n`;
       call_gc_sites :=
diff --git a/backend/riscv/proc.ml b/backend/riscv/proc.ml
index a9f970eb714..ce72ee4b864 100644
--- a/backend/riscv/proc.ml
+++ b/backend/riscv/proc.ml
@@ -37,12 +37,12 @@ let word_addressed = false
     a0-a7        0-7       arguments/results
     s2-s9        8-15      arguments/results (preserved by C)
     t2-t6        16-20     temporary
-    t0           21        temporary
-    t1           22        temporary (used by code generator)
-    s0           23        domain pointer (preserved by C)
+    s0           21        general purpose (preserved by C)
+    t0           22        temporary
+    t1           23        temporary (used by code generator)
     s1           24        trap pointer (preserved by C)
     s10          25        allocation pointer (preserved by C)
-    s11          26        allocation limit (preserved by C)
+    s11          26        domain pointer (preserved by C)
 
   Floating-point register map
   ---------------------------
@@ -66,11 +66,12 @@ let word_addressed = false
 *)
 
 let int_reg_name =
-  [| "a0"; "a1"; "a2"; "a3"; "a4"; "a5"; "a6"; "a7";
-     "s2"; "s3"; "s4"; "s5"; "s6"; "s7"; "s8"; "s9";
-     "t2"; "t3"; "t4"; "t5"; "t6";
-     "t0"; "t1";
-     "s0"; "s1"; "s10"; "s11" |]
+  [| "a0"; "a1"; "a2"; "a3"; "a4"; "a5"; "a6"; "a7";  (* 0 - 7 *)
+     "s2"; "s3"; "s4"; "s5"; "s6"; "s7"; "s8"; "s9";  (* 8 - 15 *)
+     "t2"; "t3"; "t4"; "t5"; "t6";                    (* 16 - 20 *)
+     "s0";                                            (* 21 *)
+     "t0"; "t1";                                      (* 22 - 23 *)
+     "s1"; "s10"; "s11" |]                            (* 24 - 26 *)
 
 let float_reg_name =
   [| "ft0"; "ft1"; "ft2"; "ft3"; "ft4"; "ft5"; "ft6"; "ft7";
@@ -86,7 +87,7 @@ let register_class r =
   | Val | Int | Addr -> 0
   | Float -> 1
 
-let num_available_registers = [| 22; 32 |]
+let num_available_registers = [| 23; 32 |]
 
 let first_available_register = [| 0; 100 |]
 
@@ -235,13 +236,13 @@ let regs_are_volatile _ = false
 let destroyed_at_c_call =
   (* s0-s11 and fs0-fs11 are callee-save *)
   Array.of_list(List.map phys_reg
-    [0; 1; 2; 3; 4; 5; 6; 7; 16; 17; 18; 19; 20; 21;
+    [0; 1; 2; 3; 4; 5; 6; 7; 16; 17; 18; 19; 20; 22;
      100; 101; 102; 103; 104; 105; 106; 107; 110; 111; 112; 113; 114; 115; 116;
      117; 128; 129; 130; 131])
 
 let destroyed_at_alloc =
-  (* t0-t3 are used for PLT stubs *)
-  if !Clflags.dlcode then Array.map phys_reg [|16; 17; 18; 19; 20; 21|]
+  (* t0-t6 are used for PLT stubs *)
+  if !Clflags.dlcode then Array.map phys_reg [|16; 17; 18; 19; 20; 22|]
   else [| |]
 
 let destroyed_at_oper = function
@@ -249,7 +250,7 @@ let destroyed_at_oper = function
   | Iop(Iextcall{alloc = false; _}) -> destroyed_at_c_call
   | Iop(Ialloc _) -> destroyed_at_alloc
   | Iop(Istore(Single, _, _)) -> [| phys_reg 100 |]
-  | Iswitch _ -> [| phys_reg 21 |]
+  | Iswitch _ -> [| phys_reg 22 |]  (* t0 *)
   | _ -> [||]
 
 let destroyed_at_raise = all_phys_regs
@@ -259,12 +260,12 @@ let destroyed_at_reloadretaddr = [| |]
 (* Maximal register pressure *)
 
 let safe_register_pressure = function
-  | Iextcall _ -> 15
-  | _ -> 22
+  | Iextcall _ -> 9
+  | _ -> 23
 
 let max_register_pressure = function
-  | Iextcall _ -> [| 15; 18 |]
-  | _ -> [| 22; 30 |]
+  | Iextcall _ -> [| 9; 12 |]
+  | _ -> [| 23; 30 |]
 
 (* Pure operations (without any side effect besides updating their result
    registers). *)
@@ -293,8 +294,9 @@ let int_dwarf_reg_numbers =
   [| 10; 11; 12; 13; 14; 15; 16; 17;
      18; 19; 20; 21; 22; 23; 24; 25;
      7; 28; 29; 30; 31;
+     8;
      5; 6;
-     8; 9; 26; 27;
+     9; 26; 27;
   |]
 
 let float_dwarf_reg_numbers =
diff --git a/ocaml/asmcomp/arm64/emit.mlp b/ocaml/asmcomp/arm64/emit.mlp
index 7d01a8b047d..23bec3fde56 100644
--- a/ocaml/asmcomp/arm64/emit.mlp
+++ b/ocaml/asmcomp/arm64/emit.mlp
@@ -33,12 +33,11 @@ let fastcode_flag = ref true
 
 (* Names for special regs *)
 
-let reg_domain_state_ptr = phys_reg 22
-let reg_trap_ptr = phys_reg 23
-let reg_alloc_ptr = phys_reg 24
-let reg_alloc_limit = phys_reg 25
-let reg_tmp1 = phys_reg 26
-let reg_x8 = phys_reg 8
+let reg_domain_state_ptr = phys_reg 25 (* x28 *)
+let reg_trap_ptr = phys_reg 23 (* x26 *)
+let reg_alloc_ptr = phys_reg 24 (* x27 *)
+let reg_tmp1 = phys_reg 26 (* x16 *)
+let reg_x8 = phys_reg 8 (* x8 *)
 
 (* Output a label *)
 
@@ -504,10 +503,8 @@ module BR = Branch_relaxation.Make (struct
     | Lop (Iload (size, addr)) | Lop (Istore (size, addr, _)) ->
       let based = match addr with Iindexed _ -> 0 | Ibased _ -> 1 in
       based + begin match size with Single -> 2 | _ -> 1 end
-    | Lop (Ialloc {bytes = num_bytes}) when !fastcode_flag ->
-      if num_bytes <= 0xFFF then 4 else 5
-    | Lop (Ispecific (Ifar_alloc {bytes = num_bytes})) when !fastcode_flag ->
-      if num_bytes <= 0xFFF then 5 else 6
+    | Lop (Ialloc _) when !fastcode_flag -> 5
+    | Lop (Ispecific (Ifar_alloc _)) when !fastcode_flag -> 6
     | Lop (Ialloc { bytes = num_bytes; _ })
     | Lop (Ispecific (Ifar_alloc { bytes = num_bytes; _ })) ->
       begin match num_bytes with
@@ -593,8 +590,10 @@ let assembly_code_for_allocation i ~n ~far ~dbginfo =
        so it is reasonable to assume n < 0x1_000.  This makes
        the generated code simpler. *)
     assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
+    let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
+    `	ldr	{emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`;
     `	sub	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int n}\n`;
-    `	cmp	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_limit}\n`;
+    `	cmp	{emit_reg reg_alloc_ptr}, {emit_reg reg_tmp1}\n`;
     if not far then begin
       `	b.lo	{emit_label lbl_call_gc}\n`
     end else begin
diff --git a/ocaml/asmcomp/arm64/proc.ml b/ocaml/asmcomp/arm64/proc.ml
index 7635181a0a6..ac849a46cbb 100644
--- a/ocaml/asmcomp/arm64/proc.ml
+++ b/ocaml/asmcomp/arm64/proc.ml
@@ -33,11 +33,10 @@ let word_addressed = false
     x0 - x15              general purpose (caller-save)
     x16, x17              temporaries (used by call veeners)
     x18                   platform register (reserved)
-    x19 - x24             general purpose (callee-save)
-    x25                   domain state pointer
+    x19 - x25             general purpose (callee-save)
     x26                   trap pointer
     x27                   alloc pointer
-    x28                   alloc limit
+    x28                   domain state pointer
     x29                   frame pointer
     x30                   return address
     sp / xzr              stack pointer / zero register
@@ -48,10 +47,11 @@ let word_addressed = false
 *)
 
 let int_reg_name =
-  [| "x0";  "x1";  "x2";  "x3";  "x4";  "x5";  "x6";  "x7";
-     "x8";  "x9";  "x10"; "x11"; "x12"; "x13"; "x14"; "x15";
-     "x19"; "x20"; "x21"; "x22"; "x23"; "x24";
-     "x25"; "x26"; "x27"; "x28"; "x16"; "x17" |]
+  [| "x0";  "x1";  "x2";  "x3";  "x4";  "x5";  "x6";  "x7";  (* 0 - 7 *)
+     "x8";  "x9";  "x10"; "x11"; "x12"; "x13"; "x14"; "x15"; (* 8 - 15 *)
+     "x19"; "x20"; "x21"; "x22"; "x23"; "x24"; "x25";        (* 16 - 22 *)
+     "x26"; "x27"; "x28";                                    (* 23 - 25 *)
+     "x16"; "x17" |]                                         (* 26 - 27 *)
 
 let float_reg_name =
   [| "d0";  "d1";  "d2";  "d3";  "d4";  "d5";  "d6";  "d7";
@@ -67,7 +67,7 @@ let register_class r =
   | Float -> 1
 
 let num_available_registers =
-  [| 22; 32 |] (* first 22 int regs allocatable; all float regs allocatable *)
+  [| 23; 32 |] (* first 23 int regs allocatable; all float regs allocatable *)
 
 let first_available_register =
   [| 0; 100 |]
@@ -269,16 +269,16 @@ let destroyed_at_reloadretaddr = [| |]
 (* Maximal register pressure *)
 
 let safe_register_pressure = function
-  | Iextcall _ -> 8
-  | Ialloc _ -> 24
-  | _ -> 25
+  | Iextcall _ -> 7
+  | Ialloc _ -> 22
+  | _ -> 23
 
 let max_register_pressure = function
-  | Iextcall _ -> [| 10; 8 |]
-  | Ialloc _ -> [| 24; 32 |]
+  | Iextcall _ -> [| 7; 8 |]  (* 7 integer callee-saves, 8 FP callee-saves *)
+  | Ialloc _ -> [| 22; 32 |]
   | Iintoffloat | Ifloatofint
-  | Iload(Single, _) | Istore(Single, _, _) -> [| 25; 31 |]
-  | _ -> [| 25; 32 |]
+  | Iload(Single, _) | Istore(Single, _, _) -> [| 23; 31 |]
+  | _ -> [| 23; 32 |]
 
 (* Pure operations (without any side effect besides updating their result
    registers). *)
diff --git a/ocaml/asmcomp/power/emit.mlp b/ocaml/asmcomp/power/emit.mlp
index 20d744a625d..681a7659b95 100644
--- a/ocaml/asmcomp/power/emit.mlp
+++ b/ocaml/asmcomp/power/emit.mlp
@@ -486,8 +486,8 @@ module BR = Branch_relaxation.Make (struct
       then load_store_size addr + 1
       else load_store_size addr
     | Lop(Istore(_chunk, addr, _)) -> load_store_size addr
-    | Lop(Ialloc _) -> 4
-    | Lop(Ispecific(Ialloc_far _)) -> 5
+    | Lop(Ialloc _) -> 5
+    | Lop(Ispecific(Ialloc_far _)) -> 6
     | Lop(Iintop Imod) -> 3
     | Lop(Iintop(Icomp _)) -> 4
     | Lop(Iintop _) -> 1
@@ -524,6 +524,26 @@ module BR = Branch_relaxation.Make (struct
   let relax_intop_imm_checkbound ~bound:_ = assert false
 end)
 
+(* Assembly code for inlined allocation *)
+
+let emit_alloc i bytes dbginfo far =
+  if !call_gc_label = 0 then call_gc_label := new_label ();
+  let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
+  `	{emit_string lg}	0, {emit_int offset}(30)\n`;
+  `	addi    31, 31, {emit_int(-bytes)}\n`;
+  `	{emit_string cmplg}	31, 0\n`;
+  if not far then begin
+    `	bltl	{emit_label !call_gc_label}\n`;
+    record_frame i.live (Dbg_alloc dbginfo);
+    `	addi	{emit_reg i.res.(0)}, 31, {emit_int size_addr}\n`
+  end else begin
+    let lbl = new_label() in
+    `	bge	{emit_label lbl}\n`;
+    `	bl	{emit_label !call_gc_label}\n`;
+    record_frame i.live (Dbg_alloc dbginfo);
+    `{emit_label lbl}:	addi	{emit_reg i.res.(0)}, 31, {emit_int size_addr}\n`
+  end
+
 (* Output the assembly code for an instruction *)
 
 let emit_instr i =
@@ -754,22 +774,10 @@ let emit_instr i =
           | Single -> "stfs"
           | Double -> "stfd" in
         emit_load_store storeinstr addr i.arg 1 i.arg.(0)
-    | Lop(Ialloc { bytes = n; dbginfo }) ->
-        if !call_gc_label = 0 then call_gc_label := new_label ();
-        `	addi    31, 31, {emit_int(-n)}\n`;
-        `	{emit_string cmplg}	31, 30\n`;
-        `	bltl	{emit_label !call_gc_label}\n`;
-        record_frame i.live (Dbg_alloc dbginfo);
-        `	addi	{emit_reg i.res.(0)}, 31, {emit_int size_addr}\n`;
-    | Lop(Ispecific(Ialloc_far { bytes = n; dbginfo })) ->
-        if !call_gc_label = 0 then call_gc_label := new_label ();
-        let lbl = new_label() in
-        `	addi    31, 31, {emit_int(-n)}\n`;
-        `	{emit_string cmplg}	31, 30\n`;
-        `	bge	{emit_label lbl}\n`;
-        `	bl	{emit_label !call_gc_label}\n`;
-        record_frame i.live (Dbg_alloc dbginfo);
-        `{emit_label lbl}:	addi	{emit_reg i.res.(0)}, 31, {emit_int size_addr}\n`
+    | Lop(Ialloc { bytes; dbginfo }) ->
+        emit_alloc i bytes dbginfo false
+    | Lop(Ispecific(Ialloc_far { bytes; dbginfo })) ->
+        emit_alloc i bytes dbginfo true
     | Lop(Iintop Isub) ->               (* subfc has swapped arguments *)
         `	subfc	{emit_reg i.res.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(0)}\n`
     | Lop(Iintop Imod) ->
@@ -985,8 +993,8 @@ let emit_instr i =
               Domainstate.(idx_of_field Domain_backtrace_pos)
             in
             begin match abi with
-            | ELF32 -> `	stw	0, {emit_int (backtrace_pos * 8)}(28)\n`
-            | _ -> `	std	0, {emit_int (backtrace_pos * 8)}(28)\n`
+            | ELF32 -> `	stw	0, {emit_int (backtrace_pos * 8)}(30)\n`
+            | _ -> `	std	0, {emit_int (backtrace_pos * 8)}(30)\n`
             end;
             emit_call "caml_raise_exn";
             record_frame Reg.Set.empty (Dbg_raise i.dbg);
diff --git a/ocaml/asmcomp/power/proc.ml b/ocaml/asmcomp/power/proc.ml
index eec140db38f..24f95ff62b8 100644
--- a/ocaml/asmcomp/power/proc.ml
+++ b/ocaml/asmcomp/power/proc.ml
@@ -34,10 +34,9 @@ let word_addressed = false
     3 - 10              function arguments and results
     11 - 12             temporaries
     13                  pointer to small data area
-    14 - 27             general purpose, preserved by C
-    28                  domain state pointer
+    14 - 28             general purpose, preserved by C
     29                  trap pointer
-    30                  allocation limit
+    30                  domain state pointer
     31                  allocation pointer
   Floating-point register map:
     0                   temporary
@@ -46,9 +45,9 @@ let word_addressed = false
 *)
 
 let int_reg_name =
-  [| "3"; "4"; "5"; "6"; "7"; "8"; "9"; "10";
-     "14"; "15"; "16"; "17"; "18"; "19"; "20"; "21";
-     "22"; "23"; "24"; "25"; "26"; "27" |]
+  [| "3"; "4"; "5"; "6"; "7"; "8"; "9"; "10";           (* 0 - 7 *)
+     "14"; "15"; "16"; "17"; "18"; "19"; "20"; "21";    (* 8 - 15 *)
+     "22"; "23"; "24"; "25"; "26"; "27"; "28" |]        (* 16 - 22 *)
 
 let float_reg_name =
   [| "1"; "2"; "3"; "4"; "5"; "6"; "7"; "8";
@@ -63,7 +62,7 @@ let register_class r =
   | Val | Int | Addr -> 0
   | Float -> 1
 
-let num_available_registers = [| 22; 31 |]
+let num_available_registers = [| 23; 31 |]
 
 let first_available_register = [| 0; 100 |]
 
@@ -75,7 +74,7 @@ let rotate_registers = true
 (* Representation of hard registers by pseudo-registers *)
 
 let hard_int_reg =
-  let v = Array.make 22 Reg.dummy in
-  for i = 0 to 21 do v.(i) <- Reg.at_location Int (Reg i) done; v
+  let v = Array.make 23 Reg.dummy in
+  for i = 0 to 22 do v.(i) <- Reg.at_location Int (Reg i) done; v
 
 let hard_float_reg =
@@ -314,11 +313,11 @@ let destroyed_at_reloadretaddr = [| phys_reg 11 |]
 
 let safe_register_pressure = function
     Iextcall _ -> 14
-  | _ -> 22
+  | _ -> 23
 
 let max_register_pressure = function
     Iextcall _ -> [| 14; 18 |]
-  | _ -> [| 22; 30 |]
+  | _ -> [| 23; 30 |]
 
 (* Pure operations (without any side effect besides updating their result
    registers). *)
diff --git a/ocaml/asmcomp/riscv/emit.mlp b/ocaml/asmcomp/riscv/emit.mlp
index d8e694cf214..2f8582148d8 100644
--- a/ocaml/asmcomp/riscv/emit.mlp
+++ b/ocaml/asmcomp/riscv/emit.mlp
@@ -82,12 +82,11 @@ let rodata_space =
 
 (* Names for special regs *)
 
-let reg_tmp = phys_reg 22
+let reg_tmp = phys_reg 23
 let reg_t2 = phys_reg 16
-let reg_domain_state_ptr = phys_reg 23
+let reg_domain_state_ptr = phys_reg 26
 let reg_trap = phys_reg 24
 let reg_alloc_ptr = phys_reg 25
-let reg_alloc_lim = phys_reg 26
 
 (* Output a pseudo-register *)
 
@@ -374,13 +373,15 @@ let emit_instr i =
       let lbl_after_alloc = new_label () in
       let lbl_call_gc = new_label () in
       let n = -bytes in
+      let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
       if is_immediate n then
         `	addi	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, {emit_int n}\n`
       else begin
         `	li	{emit_reg reg_tmp}, {emit_int n}\n`;
         `	add	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, {emit_reg reg_tmp}\n`
       end;
-      `	bltu	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_lim}, {emit_label lbl_call_gc}\n`;
+      `	ld	{emit_reg reg_tmp}, {emit_int offset}({emit_reg reg_domain_state_ptr})\n`;
+      `	bltu	{emit_reg reg_alloc_ptr}, {emit_reg reg_tmp}, {emit_label lbl_call_gc}\n`;
       `{emit_label lbl_after_alloc}:\n`;
       `	addi	{emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, {emit_int size_addr}\n`;
       call_gc_sites :=
diff --git a/ocaml/asmcomp/riscv/proc.ml b/ocaml/asmcomp/riscv/proc.ml
index 4e30e02bf03..1b460b8e0ab 100644
--- a/ocaml/asmcomp/riscv/proc.ml
+++ b/ocaml/asmcomp/riscv/proc.ml
@@ -36,12 +36,12 @@ let word_addressed = false
     a0-a7        0-7       arguments/results
     s2-s9        8-15      arguments/results (preserved by C)
     t2-t6        16-20     temporary
-    t0           21        temporary
-    t1           22        temporary (used by code generator)
-    s0           23        domain pointer (preserved by C)
+    s0           21        general purpose (preserved by C)
+    t0           22        temporary
+    t1           23        temporary (used by code generator)
     s1           24        trap pointer (preserved by C)
     s10          25        allocation pointer (preserved by C)
-    s11          26        allocation limit (preserved by C)
+    s11          26        domain pointer (preserved by C)
 
   Floating-point register map
   ---------------------------
@@ -65,11 +65,12 @@ let word_addressed = false
 *)
 
 let int_reg_name =
-  [| "a0"; "a1"; "a2"; "a3"; "a4"; "a5"; "a6"; "a7";
-     "s2"; "s3"; "s4"; "s5"; "s6"; "s7"; "s8"; "s9";
-     "t2"; "t3"; "t4"; "t5"; "t6";
-     "t0"; "t1";
-     "s0"; "s1"; "s10"; "s11" |]
+  [| "a0"; "a1"; "a2"; "a3"; "a4"; "a5"; "a6"; "a7";  (* 0 - 7 *)
+     "s2"; "s3"; "s4"; "s5"; "s6"; "s7"; "s8"; "s9";  (* 8 - 15 *)
+     "t2"; "t3"; "t4"; "t5"; "t6";                    (* 16 - 20 *)
+     "s0";                                            (* 21 *)
+     "t0"; "t1";                                      (* 22 - 23 *)
+     "s1"; "s10"; "s11" |]                            (* 24 - 26 *)
 
 let float_reg_name =
   [| "ft0"; "ft1"; "ft2"; "ft3"; "ft4"; "ft5"; "ft6"; "ft7";
@@ -85,7 +86,7 @@ let register_class r =
   | Val | Int | Addr -> 0
   | Float -> 1
 
-let num_available_registers = [| 22; 32 |]
+let num_available_registers = [| 23; 32 |]
 
 let first_available_register = [| 0; 100 |]
 
@@ -234,13 +235,13 @@ let regs_are_volatile _ = false
 let destroyed_at_c_call =
   (* s0-s11 and fs0-fs11 are callee-save *)
   Array.of_list(List.map phys_reg
-    [0; 1; 2; 3; 4; 5; 6; 7; 16; 17; 18; 19; 20; 21;
+    [0; 1; 2; 3; 4; 5; 6; 7; 16; 17; 18; 19; 20; 22;
      100; 101; 102; 103; 104; 105; 106; 107; 110; 111; 112; 113; 114; 115; 116;
      117; 128; 129; 130; 131])
 
 let destroyed_at_alloc =
-  (* t0-t3 are used for PLT stubs *)
-  if !Clflags.dlcode then Array.map phys_reg [|16; 17; 18; 19; 20; 21|]
+  (* t0-t6 are used for PLT stubs *)
+  if !Clflags.dlcode then Array.map phys_reg [|16; 17; 18; 19; 20; 22|]
   else [| |]
 
 let destroyed_at_oper = function
@@ -248,7 +249,7 @@ let destroyed_at_oper = function
   | Iop(Iextcall{alloc = false; _}) -> destroyed_at_c_call
   | Iop(Ialloc _) -> destroyed_at_alloc
   | Iop(Istore(Single, _, _)) -> [| phys_reg 100 |]
-  | Iswitch _ -> [| phys_reg 21 |]
+  | Iswitch _ -> [| phys_reg 22 |]  (* t0 *)
   | _ -> [||]
 
 let destroyed_at_raise = all_phys_regs
@@ -258,12 +259,12 @@ let destroyed_at_reloadretaddr = [| |]
 (* Maximal register pressure *)
 
 let safe_register_pressure = function
-  | Iextcall _ -> 15
-  | _ -> 22
+  | Iextcall _ -> 9
+  | _ -> 23
 
 let max_register_pressure = function
-  | Iextcall _ -> [| 15; 18 |]
-  | _ -> [| 22; 30 |]
+  | Iextcall _ -> [| 9; 12 |]
+  | _ -> [| 23; 30 |]
 
 (* Pure operations (without any side effect besides updating their result
    registers). *)
@@ -292,8 +293,9 @@ let int_dwarf_reg_numbers =
   [| 10; 11; 12; 13; 14; 15; 16; 17;
      18; 19; 20; 21; 22; 23; 24; 25;
      7; 28; 29; 30; 31;
+     8;
      5; 6;
-     8; 9; 26; 27;
+     9; 26; 27;
   |]
 
 let float_dwarf_reg_numbers =
diff --git a/ocaml/runtime/arm64.S b/ocaml/runtime/arm64.S
index 30092c8d584..e9d2c12b292 100644
--- a/ocaml/runtime/arm64.S
+++ b/ocaml/runtime/arm64.S
@@ -20,10 +20,9 @@
 
 /* Special registers */
 
-#define DOMAIN_STATE_PTR x25
+#define DOMAIN_STATE_PTR x28
 #define TRAP_PTR x26
 #define ALLOC_PTR x27
-#define ALLOC_LIMIT x28
 #define ADDITIONAL_ARG x8
 #define TMP x16
 #define TMP2 x17
@@ -64,7 +63,7 @@
 #include "../runtime/caml/domain_state.tbl"
 #undef DOMAIN_STATE
 
-#define Caml_state(var) [x25, 8*domain_field_caml_##var]
+#define Caml_state(var) [DOMAIN_STATE_PTR, 8*domain_field_caml_##var]
 
 /* Globals and labels */
 #if defined(SYS_macosx)
@@ -233,9 +232,8 @@ L(caml_call_gc):
         ldp     d26, d27, [sp, 352]
         ldp     d28, d29, [sp, 368]
         ldp     d30, d31, [sp, 384]
-    /* Reload new allocation pointer and allocation limit */
+    /* Reload new allocation pointer */
         ldr     ALLOC_PTR, Caml_state(young_ptr)
-        ldr     ALLOC_LIMIT, Caml_state(young_limit)
     /* Free stack space and return to caller */
         ldp     x29, x30, [sp], 400
         ret
@@ -244,8 +242,9 @@ L(caml_call_gc):
 
 FUNCTION(caml_alloc1)
         CFI_STARTPROC
+        ldr     TMP, Caml_state(young_limit)
         sub     ALLOC_PTR, ALLOC_PTR, #16
-        cmp     ALLOC_PTR, ALLOC_LIMIT
+        cmp     ALLOC_PTR, TMP
         b.lo    L(caml_call_gc)
         ret
         CFI_ENDPROC
@@ -253,8 +252,9 @@ FUNCTION(caml_alloc1)
 
 FUNCTION(caml_alloc2)
         CFI_STARTPROC
+        ldr     TMP, Caml_state(young_limit)
         sub     ALLOC_PTR, ALLOC_PTR, #24
-        cmp     ALLOC_PTR, ALLOC_LIMIT
+        cmp     ALLOC_PTR, TMP
         b.lo    L(caml_call_gc)
         ret
         CFI_ENDPROC
@@ -262,8 +262,9 @@ FUNCTION(caml_alloc2)
 
 FUNCTION(caml_alloc3)
         CFI_STARTPROC
+        ldr     TMP, Caml_state(young_limit)
         sub     ALLOC_PTR, ALLOC_PTR, #32
-        cmp     ALLOC_PTR, ALLOC_LIMIT
+        cmp     ALLOC_PTR, TMP
         b.lo    L(caml_call_gc)
         ret
         CFI_ENDPROC
@@ -271,8 +272,9 @@ FUNCTION(caml_alloc3)
 
 FUNCTION(caml_allocN)
         CFI_STARTPROC
+        ldr     TMP, Caml_state(young_limit)
         sub     ALLOC_PTR, ALLOC_PTR, ADDITIONAL_ARG
-        cmp     ALLOC_PTR, ALLOC_LIMIT
+        cmp     ALLOC_PTR, TMP
         b.lo    L(caml_call_gc)
         ret
         CFI_ENDPROC
@@ -295,9 +297,8 @@ FUNCTION(caml_c_call)
         str     TRAP_PTR, Caml_state(exception_pointer)
     /* Call the function */
         blr     ADDITIONAL_ARG
-    /* Reload alloc ptr and alloc limit */
+    /* Reload alloc ptr */
         ldr     ALLOC_PTR, Caml_state(young_ptr)
-        ldr     ALLOC_LIMIT, Caml_state(young_limit)
     /* Return */
         ret     x19
         CFI_ENDPROC
@@ -346,9 +347,8 @@ L(jump_to_caml):
         stp     x8, x9, [sp, -16]!
         CFI_ADJUST(16)
         add     TRAP_PTR, sp, #0
-    /* Reload allocation pointers */
+    /* Reload allocation pointer */
         ldr     ALLOC_PTR, Caml_state(young_ptr)
-        ldr     ALLOC_LIMIT, Caml_state(young_limit)
     /* Call the OCaml code */
         blr     TMP2
 L(caml_retaddr):
@@ -431,10 +431,9 @@ FUNCTION(caml_raise_exception)
         mov     DOMAIN_STATE_PTR, C_ARG_1
     /* Load the exception bucket */
         mov     x0, C_ARG_2
-    /* Reload trap ptr, alloc ptr and alloc limit */
+    /* Reload trap ptr and alloc ptr */
         ldr     TRAP_PTR, Caml_state(exception_pointer)
         ldr     ALLOC_PTR, Caml_state(young_ptr)
-        ldr     ALLOC_LIMIT, Caml_state(young_limit)
     /* Test if backtrace is active */
         ldr     TMP, Caml_state(backtrace_active)
         cbnz    TMP, 2f
diff --git a/ocaml/runtime/power.S b/ocaml/runtime/power.S
index 1933a10ed99..4ca1a145381 100644
--- a/ocaml/runtime/power.S
+++ b/ocaml/runtime/power.S
@@ -23,9 +23,8 @@
 #define C_CALL_FUN 25
 #define C_CALL_TOC 26
 #define C_CALL_RET_ADDR 27
-#define DOMAIN_STATE_PTR 28
 #define TRAP_PTR 29
-#define ALLOC_LIMIT 30
+#define DOMAIN_STATE_PTR 30
 #define ALLOC_PTR 31
 
 #if defined(MODEL_ppc64) || defined(MODEL_ppc64le)
@@ -149,7 +148,7 @@
 #include "../runtime/caml/domain_state.tbl"
 #undef DOMAIN_STATE
 
-#define Caml_state(var) 8*domain_field_caml_##var(28)
+#define Caml_state(var) 8*domain_field_caml_##var(DOMAIN_STATE_PTR)
 
 #if defined(MODEL_ppc64)
         .section ".opd","aw"
@@ -241,9 +240,8 @@ FUNCTION(caml_call_gc)
 #if defined(MODEL_ppc64) || defined(MODEL_ppc64le)
         nop
 #endif
-    /* Reload new allocation pointer and allocation limit */
+    /* Reload new allocation pointer */
         lg      ALLOC_PTR, Caml_state(young_ptr)
-        lg      ALLOC_LIMIT, Caml_state(young_limit)
     /* Restore all regs used by the code generator */
         addi    11, 1, 8*32 + PARAM_SAVE_AREA + RESERVED_STACK - WORD
         lgu     3, WORD(11)
@@ -349,9 +347,8 @@ FUNCTION(caml_c_call)
 #endif
     /* Restore return address (in 27, preserved by the C function) */
         mtlr    C_CALL_RET_ADDR
-    /* Reload allocation pointer and allocation limit*/
+    /* Reload allocation pointer */
         lg      ALLOC_PTR, Caml_state(young_ptr)
-        lg      ALLOC_LIMIT, Caml_state(young_limit)
     /* Return to caller */
         blr
         .cfi_endproc
@@ -401,7 +398,6 @@ FUNCTION(caml_raise_exception)
     /* Reload OCaml global registers */
         lg      1, Caml_state(exception_pointer)
         lg      ALLOC_PTR, Caml_state(young_ptr)
-        lg      ALLOC_LIMIT, Caml_state(young_limit)
     /* Pop trap frame */
         lg      0, TRAP_HANDLER_OFFSET(1)
         mtctr   0
@@ -505,9 +501,8 @@ FUNCTION(caml_start_program)
         lg      11, Caml_state(exception_pointer)
         stg     11, TRAP_PREVIOUS_OFFSET(1)
         mr      TRAP_PTR, 1
-    /* Reload allocation pointers */
+    /* Reload allocation pointer */
         lg      ALLOC_PTR, Caml_state(young_ptr)
-        lg      ALLOC_LIMIT, Caml_state(young_limit)
     /* Call the OCaml code (address in r12) */
 #if defined(MODEL_ppc)
         mtctr   12
diff --git a/ocaml/runtime/riscv.S b/ocaml/runtime/riscv.S
index d3a5a794bd2..4e195f27a11 100644
--- a/ocaml/runtime/riscv.S
+++ b/ocaml/runtime/riscv.S
@@ -17,10 +17,9 @@
 /* Must be preprocessed by cpp */
 
 #define ARG_DOMAIN_STATE_PTR t0
-#define DOMAIN_STATE_PTR s0
+#define DOMAIN_STATE_PTR s11
 #define TRAP_PTR s1
 #define ALLOC_PTR s10
-#define ALLOC_LIMIT s11
 #define TMP t1
 #define ARG t2
 
@@ -34,7 +33,7 @@
 #include "../runtime/caml/domain_state.tbl"
 #undef DOMAIN_STATE
 
-#define Caml_state(var) (8*domain_field_caml_##var)(s0)
+#define Caml_state(var) (8*domain_field_caml_##var)(DOMAIN_STATE_PTR)
 
 #define FUNCTION(name) \
         .align 2; \
@@ -63,11 +62,11 @@ FUNCTION(caml_call_gc)
         /* Record lowest stack address */
         STORE   sp, Caml_state(bottom_of_stack)
         /* Set up stack space, saving return address */
-        /* (1 reg for RA, 1 reg for FP, 22 allocatable int regs,
+        /* (1 reg for RA, 1 reg for FP, 23 allocatable int regs,
             20 caller-save float regs) * 8 */
-        addi    sp, sp, -0x160
+        /* plus one 8-byte padding slot for alignment: 46 * 8 = 0x170 */
+        addi    sp, sp, -0x170
         STORE   ra, 0x8(sp)
-        STORE   s0, 0x0(sp)
         /* Save allocatable integer registers on the stack,
            in the order given in proc.ml */
         STORE   a0, 0x10(sp)
@@ -91,29 +90,30 @@ FUNCTION(caml_call_gc)
         STORE   t4, 0xa0(sp)
         STORE   t5, 0xa8(sp)
         STORE   t6, 0xb0(sp)
-        STORE   t0, 0xb8(sp)
+        STORE   s0, 0xb8(sp)
+        STORE   t0, 0xc0(sp)
         /* Save caller-save floating-point registers on the stack
            (callee-saves are preserved by caml_garbage_collection) */
-        fsd     ft0, 0xc0(sp)
-        fsd     ft1, 0xc8(sp)
-        fsd     ft2, 0xd0(sp)
-        fsd     ft3, 0xd8(sp)
-        fsd     ft4, 0xe0(sp)
-        fsd     ft5, 0xe8(sp)
-        fsd     ft6, 0xf0(sp)
-        fsd     ft7, 0xf8(sp)
-        fsd     fa0, 0x100(sp)
-        fsd     fa1, 0x108(sp)
-        fsd     fa2, 0x110(sp)
-        fsd     fa3, 0x118(sp)
-        fsd     fa4, 0x120(sp)
-        fsd     fa5, 0x128(sp)
-        fsd     fa6, 0x130(sp)
-        fsd     fa7, 0x138(sp)
-        fsd     ft8, 0x140(sp)
-        fsd     ft9, 0x148(sp)
-        fsd     ft10, 0x150(sp)
-        fsd     ft11, 0x158(sp)
+        fsd     ft0, 0xd0(sp)
+        fsd     ft1, 0xd8(sp)
+        fsd     ft2, 0xe0(sp)
+        fsd     ft3, 0xe8(sp)
+        fsd     ft4, 0xf0(sp)
+        fsd     ft5, 0xf8(sp)
+        fsd     ft6, 0x100(sp)
+        fsd     ft7, 0x108(sp)
+        fsd     fa0, 0x110(sp)
+        fsd     fa1, 0x118(sp)
+        fsd     fa2, 0x120(sp)
+        fsd     fa3, 0x128(sp)
+        fsd     fa4, 0x130(sp)
+        fsd     fa5, 0x138(sp)
+        fsd     fa6, 0x140(sp)
+        fsd     fa7, 0x148(sp)
+        fsd     ft8, 0x150(sp)
+        fsd     ft9, 0x158(sp)
+        fsd     ft10, 0x160(sp)
+        fsd     ft11, 0x168(sp)
         /* Store pointer to saved integer registers in caml_gc_regs */
         addi    TMP, sp, 0x10
         STORE   TMP, Caml_state(gc_regs)
@@ -145,34 +145,33 @@ FUNCTION(caml_call_gc)
         LOAD    t4, 0xa0(sp)
         LOAD    t5, 0xa8(sp)
         LOAD    t6, 0xb0(sp)
-        LOAD    t0, 0xb8(sp)
-        fld     ft0, 0xc0(sp)
-        fld     ft1, 0xc8(sp)
-        fld     ft2, 0xd0(sp)
-        fld     ft3, 0xd8(sp)
-        fld     ft4, 0xe0(sp)
-        fld     ft5, 0xe8(sp)
-        fld     ft6, 0xf0(sp)
-        fld     ft7, 0xf8(sp)
-        fld     fa0, 0x100(sp)
-        fld     fa1, 0x108(sp)
-        fld     fa2, 0x110(sp)
-        fld     fa3, 0x118(sp)
-        fld     fa4, 0x120(sp)
-        fld     fa5, 0x128(sp)
-        fld     fa6, 0x130(sp)
-        fld     fa7, 0x138(sp)
-        fld     ft8, 0x140(sp)
-        fld     ft9, 0x148(sp)
-        fld     ft10, 0x150(sp)
-        fld     ft11, 0x158(sp)
-        /* Reload new allocation pointer and allocation limit */
+        LOAD    s0, 0xb8(sp)
+        LOAD    t0, 0xc0(sp)
+        fld     ft0, 0xd0(sp)
+        fld     ft1, 0xd8(sp)
+        fld     ft2, 0xe0(sp)
+        fld     ft3, 0xe8(sp)
+        fld     ft4, 0xf0(sp)
+        fld     ft5, 0xf8(sp)
+        fld     ft6, 0x100(sp)
+        fld     ft7, 0x108(sp)
+        fld     fa0, 0x110(sp)
+        fld     fa1, 0x118(sp)
+        fld     fa2, 0x120(sp)
+        fld     fa3, 0x128(sp)
+        fld     fa4, 0x130(sp)
+        fld     fa5, 0x138(sp)
+        fld     fa6, 0x140(sp)
+        fld     fa7, 0x148(sp)
+        fld     ft8, 0x150(sp)
+        fld     ft9, 0x158(sp)
+        fld     ft10, 0x160(sp)
+        fld     ft11, 0x168(sp)
+        /* Reload new allocation pointer */
         LOAD    ALLOC_PTR, Caml_state(young_ptr)
-        LOAD    ALLOC_LIMIT, Caml_state(young_limit)
         /* Free stack space and return to caller */
         LOAD    ra, 0x8(sp)
-        LOAD    s0, 0x0(sp)
-        addi    sp, sp, 0x160
+        addi    sp, sp, 0x170
         ret
         .size   caml_call_gc, .-caml_call_gc
 
@@ -190,9 +189,8 @@ FUNCTION(caml_c_call)
         STORE   TRAP_PTR, Caml_state(exception_pointer)
         /* Call the function */
         jalr    ARG
-        /* Reload alloc ptr and alloc limit */
+        /* Reload alloc ptr */
         LOAD    ALLOC_PTR, Caml_state(young_ptr)
-        LOAD    ALLOC_LIMIT, Caml_state(young_limit)
         /* Return */
         jr      s2
         .size   caml_c_call, .-caml_c_call
@@ -231,7 +229,6 @@ FUNCTION(caml_raise_exception)
         mv      a0, a1
         LOAD    TRAP_PTR, Caml_state(exception_pointer)
         LOAD    ALLOC_PTR, Caml_state(young_ptr)
-        LOAD    ALLOC_LIMIT, Caml_state(young_limit)
         LOAD    TMP, Caml_state(backtrace_active)
         bnez    TMP, 2f
 1:      /* Cut stack at current trap handler */
@@ -304,7 +301,6 @@ FUNCTION(caml_start_program)
         STORE   TMP, 8(sp)
         mv      TRAP_PTR, sp
         LOAD    ALLOC_PTR, Caml_state(young_ptr)
-        LOAD    ALLOC_LIMIT, Caml_state(young_limit)
         STORE   x0, Caml_state(last_return_address)
         jalr    ARG
 .Lcaml_retaddr:         /* pop trap frame, restoring caml_exception_pointer */
diff --git a/ocaml/runtime/signals_nat.c b/ocaml/runtime/signals_nat.c
index 8b64ab45263..1be1b45d420 100644
--- a/ocaml/runtime/signals_nat.c
+++ b/ocaml/runtime/signals_nat.c
@@ -99,13 +99,6 @@ DECLARE_SIGNAL_HANDLER(handle_signal)
 #endif
   if (sig < 0 || sig >= NSIG) return;
   caml_record_signal(sig);
-  /* Some ports cache [Caml_state->young_limit] in a register.
-     Use the signal context to modify that register too, but only if
-     we are inside OCaml code (not inside C code). */
-#if defined(CONTEXT_PC) && defined(CONTEXT_YOUNG_LIMIT)
-  if (caml_find_code_fragment_by_pc((char *) CONTEXT_PC) != NULL)
-    CONTEXT_YOUNG_LIMIT = (context_reg) Caml_state->young_limit;
-#endif
   errno = saved_errno;
 }
 
diff --git a/ocaml/runtime/signals_osdep.h b/ocaml/runtime/signals_osdep.h
index 5b23bbf93ae..1fd7101d5a1 100644
--- a/ocaml/runtime/signals_osdep.h
+++ b/ocaml/runtime/signals_osdep.h
@@ -315,7 +315,6 @@
   #define CONTEXT_STATE (CONTEXT_MCONTEXT->CONTEXT_REG(ss))
   #define CONTEXT_PC (CONTEXT_STATE.CONTEXT_REG(srr0))
   #define CONTEXT_EXCEPTION_POINTER (CONTEXT_STATE.CONTEXT_REG(r29))
-  #define CONTEXT_YOUNG_LIMIT (CONTEXT_STATE.CONTEXT_REG(r30))
   #define CONTEXT_YOUNG_PTR (CONTEXT_STATE.CONTEXT_REG(r31))
   #define CONTEXT_SP (CONTEXT_STATE.CONTEXT_REG(r1))
   #define CONTEXT_FAULTING_ADDRESS ((char *) info->si_addr)
@@ -334,7 +333,6 @@
   typedef unsigned long context_reg;
   #define CONTEXT_PC (context->regs->nip)
   #define CONTEXT_EXCEPTION_POINTER (context->regs->gpr[29])
-  #define CONTEXT_YOUNG_LIMIT (context->regs->gpr[30])
   #define CONTEXT_YOUNG_PTR (context->regs->gpr[31])
   #define CONTEXT_SP (context->regs->gpr[1])
 
@@ -352,7 +350,6 @@
   typedef unsigned long context_reg;
   #define CONTEXT_PC (context->uc_mcontext.gp_regs[32])
   #define CONTEXT_EXCEPTION_POINTER (context->uc_mcontext.gp_regs[29])
-  #define CONTEXT_YOUNG_LIMIT (context->uc_mcontext.gp_regs[30])
   #define CONTEXT_YOUNG_PTR (context->uc_mcontext.gp_regs[31])
   #define CONTEXT_SP (context->uc_mcontext.gp_regs[1])
   #define CONTEXT_FAULTING_ADDRESS ((char *) info->si_addr)
@@ -372,7 +369,6 @@
   typedef long context_reg;
   #define CONTEXT_PC (_UC_MACHINE_PC(context))
   #define CONTEXT_EXCEPTION_POINTER (context->uc_mcontext.__gregs[_REG_R29])
-  #define CONTEXT_YOUNG_LIMIT (context->uc_mcontext.__gregs[_REG_R30])
   #define CONTEXT_YOUNG_PTR (context->uc_mcontext.__gregs[_REG_R31])
   #define CONTEXT_SP (_UC_MACHINE_SP(context))
   #define CONTEXT_FAULTING_ADDRESS ((char *) info->si_addr)
@@ -393,7 +389,6 @@
   typedef unsigned long context_reg;
   #define CONTEXT_PC (context->sc_frame.srr0)
   #define CONTEXT_EXCEPTION_POINTER (context->sc_frame.fixreg[29])
-  #define CONTEXT_YOUNG_LIMIT (context->sc_frame.fixreg[30])
   #define CONTEXT_YOUNG_PTR (context->sc_frame.fixreg[31])
   #define CONTEXT_SP (context->sc_frame.fixreg[1])
 
@@ -410,7 +405,6 @@
   typedef unsigned long context_reg;
   #define CONTEXT_PC (context->uc_mcontext.psw.addr)
   #define CONTEXT_EXCEPTION_POINTER (context->uc_mcontext.gregs[13])
-  #define CONTEXT_YOUNG_LIMIT (context->uc_mcontext.gregs[10])
   #define CONTEXT_YOUNG_PTR (context->uc_mcontext.gregs[11])
   #define CONTEXT_SP (context->uc_mcontext.gregs[15])
   #define CONTEXT_FAULTING_ADDRESS ((char *) info->si_addr)