Skip to content

Commit 395cc30

Browse files
authored
Ports upstream #12212 (#2118)
1 parent f513da6 commit 395cc30

File tree

10 files changed

+109
-13
lines changed

10 files changed

+109
-13
lines changed

ocaml/runtime/alloc.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,3 +396,13 @@ CAMLexport value caml_alloc_some(value v)
396396
Field(some, 0) = v;
397397
CAMLreturn(some);
398398
}
399+
400+
CAMLprim value caml_atomic_make_contended(value v)
401+
{
402+
CAMLparam1(v);
403+
const mlsize_t sz = Wosize_bhsize(Cache_line_bsize);
404+
value res = caml_alloc_shr(sz, 0);
405+
caml_initialize(&Field(res, 0), v);
406+
for (mlsize_t i = 1; i < sz; i++) Field(res, i) = Val_unit;
407+
CAMLreturn(res);
408+
}

ocaml/runtime/caml/config.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,4 +272,17 @@ typedef uint64_t uintnat;
272272
/* Default size of runtime_events ringbuffers, in words, in powers of two */
273273
#define Default_runtime_events_log_wsize 16
274274

275+
/* Assumed size of cache line. This value can be bigger than the actual L1
276+
cache line size. Atomics allocated with aligned constructor are
277+
memory-aligned this value to avoid false sharing of cache line. */
278+
#if defined(TARGET_s390x)
279+
#define Cache_line_bsize 256
280+
#elif defined(TARGET_arm64) || defined(TARGET_power)
281+
#define Cache_line_bsize 128
282+
#elif defined(TARGET_amd64) || defined(TARGET_riscv)
283+
#define Cache_line_bsize 64
284+
#elif (!defined(NATIVE_CODE))
285+
#define Cache_line_bsize 64
286+
#endif
287+
275288
#endif /* CAML_CONFIG_H */

ocaml/runtime/caml/sizeclasses.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@
44
#define SIZECLASS_MAX 128
55
#define NUM_SIZECLASSES 32
66
static const unsigned int wsize_sizeclass[NUM_SIZECLASSES] =
7-
{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 23, 26, 29, 33, 37, 42,
7+
{ 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 17, 19, 22, 25, 28, 32, 33, 37, 42,
88
47, 53, 59, 65, 73, 81, 89, 99, 108, 118, 128 };
99
static const unsigned char wastage_sizeclass[NUM_SIZECLASSES] =
10-
{ 0, 0, 0, 0, 2, 0, 4, 4, 6, 2, 0, 4, 12, 6, 12, 21, 10, 3, 0, 22, 18, 3, 11,
10+
{ 0, 0, 0, 0, 2, 0, 4, 4, 2, 0, 4, 12, 12, 7, 0, 17, 4, 28, 0, 22, 18, 3, 11,
1111
21, 62, 4, 42, 87, 33, 96, 80, 124 };
1212
static const unsigned char sizeclass_wsize[SIZECLASS_MAX + 1] =
13-
{ 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
14-
15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20,
13+
{ 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 13, 13, 14,
14+
14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 17, 18, 19, 19, 19, 19, 20, 20,
1515
20, 20, 20, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23,
1616
23, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
1717
26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28,

ocaml/runtime/shared_heap.c

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,6 @@ typedef struct pool {
5353
sizeclass sz;
5454
} pool;
5555
CAML_STATIC_ASSERT(sizeof(pool) == Bsize_wsize(POOL_HEADER_WSIZE));
56-
#define POOL_HEADER_SZ sizeof(pool)
57-
5856
#define POOL_SLAB_WOFFSET(sz) (POOL_HEADER_WSIZE + wastage_sizeclass[sz])
5957
#define POOL_FIRST_BLOCK(p, sz) ((header_t*)(p) + POOL_SLAB_WOFFSET(sz))
6058
#define POOL_END(p) ((header_t*)(p) + POOL_WSIZE)
@@ -238,7 +236,7 @@ static void calc_pool_stats(pool* a, sizeclass sz, struct heap_stats* s)
238236
header_t* p = POOL_FIRST_BLOCK(a, sz);
239237
header_t* end = POOL_END(a);
240238
mlsize_t wh = wsize_sizeclass[sz];
241-
s->pool_frag_words += Wsize_bsize(POOL_HEADER_SZ);
239+
s->pool_frag_words += POOL_SLAB_WOFFSET(sz);
242240

243241
while (p + wh <= end) {
244242
header_t hd = (header_t)atomic_load_relaxed((atomic_uintnat*)p);
@@ -250,8 +248,7 @@ static void calc_pool_stats(pool* a, sizeclass sz, struct heap_stats* s)
250248

251249
p += wh;
252250
}
253-
CAMLassert(end - p == wastage_sizeclass[sz]);
254-
s->pool_frag_words += end - p;
251+
CAMLassert(end == p);
255252
s->pool_words += POOL_WSIZE;
256253
}
257254

@@ -283,6 +280,8 @@ Caml_inline void pool_initialize(pool* r,
283280
#endif
284281
p += wh;
285282
}
283+
CAMLassert(p == end);
284+
CAMLassert((uintptr_t)end % Cache_line_bsize == 0);
286285
r->next_obj = (value*)(p - wh);
287286
}
288287

@@ -1311,7 +1310,7 @@ static void verify_pool(pool* a, sizeclass sz, struct mem_stats* s) {
13111310
header_t* p = POOL_FIRST_BLOCK(a, sz);
13121311
header_t* end = POOL_END(a);
13131312
mlsize_t wh = wsize_sizeclass[sz];
1314-
s->overhead += Wsize_bsize(POOL_HEADER_SZ);
1313+
s->overhead += POOL_SLAB_WOFFSET(sz);
13151314

13161315
while (p + wh <= end) {
13171316
header_t hd = (header_t)*p;
@@ -1325,8 +1324,7 @@ static void verify_pool(pool* a, sizeclass sz, struct mem_stats* s) {
13251324
}
13261325
p += wh;
13271326
}
1328-
CAMLassert(end - p == wastage_sizeclass[sz]);
1329-
s->overhead += end - p;
1327+
CAMLassert(end == p);
13301328
s->alloced += POOL_WSIZE;
13311329
}
13321330
}

ocaml/runtime4/alloc.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,3 +289,8 @@ CAMLexport value caml_alloc_some(value v)
289289
Field(some, 0) = v;
290290
CAMLreturn(some);
291291
}
292+
293+
CAMLprim value caml_atomic_make_contended(value v)
294+
{
295+
caml_fatal_error("Atomic.make_contended is not supported by runtime4");
296+
}

ocaml/stdlib/atomic.ml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
type !'a t
1616

1717
external make : 'a -> 'a t = "%makemutable"
18+
external make_contended : 'a -> 'a t = "caml_atomic_make_contended"
1819
external get : 'a t -> 'a = "%atomic_load"
1920
external exchange : 'a t -> 'a -> 'a = "%atomic_exchange"
2021
external compare_and_set : 'a t -> 'a -> 'a -> bool = "%atomic_cas"

ocaml/stdlib/atomic.mli

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,20 @@ type !'a t
2929
(** Create an atomic reference. *)
3030
val make : 'a -> 'a t
3131

32+
(** Create an atomic reference that is alone on a cache line. It occupies 4-16x
33+
the memory of one allocated with [make v].
34+
The primary purpose is to prevent false-sharing and the resulting
35+
performance degradation. When a CPU performs an atomic operation, it
36+
temporarily takes ownership of an entire cache line that contains the
37+
atomic reference. If multiple atomic references share the same cache line,
38+
modifying these disjoint memory regions simultaneously becomes impossible,
39+
which can create a bottleneck. Hence, as a general guideline, if an atomic
40+
reference is experiencing contention, assigning it its own cache line may
41+
enhance performance.
42+
43+
CR ocaml 5 all-runtime5: does not support runtime4 *)
44+
val make_contended : 'a -> 'a t
45+
3246
(** Get the current value of the atomic reference. *)
3347
val get : 'a t -> 'a
3448

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
(* TEST
2+
modules = "aligned_alloc_stubs.c"
3+
* runtime4
4+
** skip
5+
* runtime5
6+
** native
7+
*)
8+
9+
external is_aligned : 'a Atomic.t -> bool = "caml_atomic_is_aligned"
10+
let test_is_aligned () =
11+
let l = List.init 100 Atomic.make in
12+
let all_aligned =
13+
List.for_all is_aligned l
14+
in
15+
assert (not all_aligned)
16+
;;
17+
18+
let test_make_contended () =
19+
let l = List.init 100 Atomic.make_contended in
20+
List.iteri (fun i atomic ->
21+
assert (Atomic.get atomic == i);
22+
assert (is_aligned atomic)) l
23+
;;
24+
25+
let () =
26+
test_is_aligned ();
27+
test_make_contended ();
28+
;;
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
#include "caml/alloc.h"
4+
5+
CAMLprim value caml_atomic_is_aligned(value val)
6+
{
7+
if ((uintptr_t)Hp_val(val) % Cache_line_bsize == 0) {
8+
return Val_true;
9+
} else {
10+
return Val_false;
11+
}
12+
}

ocaml/tools/gen_sizeclasses.ml

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,27 @@
1515
let overhead block slot obj =
1616
1. -. float_of_int((block / slot) * obj) /. float_of_int block
1717

18-
let max_overhead = 0.10
18+
let max_overhead = 0.101
19+
20+
(*
21+
Prevention of false sharing requires certain sizeclasses to be present. This
22+
ensures they are generated.
23+
Runtime has a constructor for atomics (`caml_atomic_make_contended`), which
24+
aligns them with cache lines to avoid false sharing. The implementation
25+
relies on the fact that pools are cache-aligned by design and slots of
26+
appropriate size maintain this property. To be precise, slots whose size is a
27+
multiple of cache line are laid out in such a way, that their boundaries
28+
coincide with boundaries between cache lines.
29+
*)
30+
let required_for_contended_atomic = function
31+
| 16 | 32 -> true
32+
| _ -> false
1933

2034
let rec blocksizes block slot = function
2135
| 0 -> []
2236
| obj ->
2337
if overhead block slot obj > max_overhead
38+
|| required_for_contended_atomic obj
2439
then
2540
if overhead block obj obj < max_overhead then
2641
obj :: blocksizes block obj (obj - 1)

0 commit comments

Comments
 (0)