From 79b2cf479c29ebea827a40b1d3f1a44b96490ce1 Mon Sep 17 00:00:00 2001
From: Sam Gross <colesbury@gmail.com>
Date: Mon, 10 Feb 2020 17:06:15 -0500
Subject: [PATCH 01/24] Add mimalloc v2.0.9

Modified src/alloc.c to remove include of alloc-override.c and not
compile new handler.

Ignore mimalloc symbols for now.

Did not include the following files:

 - include/mimalloc-new-delete.h
 - include/mimalloc-override.h
 - src/alloc-override-osx.c
 - src/alloc-override.c
 - src/static.c
 - src/region.c
---
 Include/mimalloc/mimalloc-atomic.h     |  338 +++++
 Include/mimalloc/mimalloc-internal.h   | 1116 ++++++++++++++++
 Include/mimalloc/mimalloc-track.h      |   62 +
 Include/mimalloc/mimalloc-types.h      |  609 +++++++++
 Include/mimalloc/mimalloc.h            |  557 ++++++++
 Makefile.pre.in                        |   34 +-
 Objects/mimalloc/alloc-aligned.c       |  306 +++++
 Objects/mimalloc/alloc-posix.c         |  184 +++
 Objects/mimalloc/alloc.c               | 1029 +++++++++++++++
 Objects/mimalloc/arena.c               |  536 ++++++++
 Objects/mimalloc/bitmap.c              |  414 ++++++
 Objects/mimalloc/bitmap.h              |  111 ++
 Objects/mimalloc/heap.c                |  602 +++++++++
 Objects/mimalloc/init.c                |  716 +++++++++++
 Objects/mimalloc/options.c             |  642 ++++++++++
 Objects/mimalloc/os.c                  | 1479 +++++++++++++++++++++
 Objects/mimalloc/page-queue.c          |  332 +++++
 Objects/mimalloc/page.c                |  926 ++++++++++++++
 Objects/mimalloc/random.c              |  404 ++++++
 Objects/mimalloc/region.c              |  516 ++++++++
 Objects/mimalloc/segment-cache.c       |  409 ++++++
 Objects/mimalloc/segment.c             | 1623 ++++++++++++++++++++++++
 Objects/mimalloc/stats.c               |  618 +++++++++
 PCbuild/_freeze_module.vcxproj         |   13 +
 PCbuild/_freeze_module.vcxproj.filters |   39 +
 PCbuild/pyproject.props                |    2 +-
 PCbuild/pythoncore.vcxproj             |   18 +
 PCbuild/pythoncore.vcxproj.filters     |   60 +
 Tools/build/smelly.py                  |    2 +-
 configure                              |  225 ++--
 configure.ac                           |  118 +-
 pyconfig.h.in                          |    3 +
 32 files changed, 13907 insertions(+), 136 deletions(-)
 create mode 100644 Include/mimalloc/mimalloc-atomic.h
 create mode 100644 Include/mimalloc/mimalloc-internal.h
 create mode 100644 Include/mimalloc/mimalloc-track.h
 create mode 100644 Include/mimalloc/mimalloc-types.h
 create mode 100644 Include/mimalloc/mimalloc.h
 create mode 100644 Objects/mimalloc/alloc-aligned.c
 create mode 100644 Objects/mimalloc/alloc-posix.c
 create mode 100644 Objects/mimalloc/alloc.c
 create mode 100644 Objects/mimalloc/arena.c
 create mode 100644 Objects/mimalloc/bitmap.c
 create mode 100644 Objects/mimalloc/bitmap.h
 create mode 100644 Objects/mimalloc/heap.c
 create mode 100644 Objects/mimalloc/init.c
 create mode 100644 Objects/mimalloc/options.c
 create mode 100644 Objects/mimalloc/os.c
 create mode 100644 Objects/mimalloc/page-queue.c
 create mode 100644 Objects/mimalloc/page.c
 create mode 100644 Objects/mimalloc/random.c
 create mode 100644 Objects/mimalloc/region.c
 create mode 100644 Objects/mimalloc/segment-cache.c
 create mode 100644 Objects/mimalloc/segment.c
 create mode 100644 Objects/mimalloc/stats.c

diff --git a/Include/mimalloc/mimalloc-atomic.h b/Include/mimalloc/mimalloc-atomic.h
new file mode 100644
index 00000000000000..c66f80493321ee
--- /dev/null
+++ b/Include/mimalloc/mimalloc-atomic.h
@@ -0,0 +1,338 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_ATOMIC_H
+#define MIMALLOC_ATOMIC_H
+
+// --------------------------------------------------------------------------------------------
+// Atomics
+// We need to be portable between C, C++, and MSVC.
+// We base the primitives on the C/C++ atomics and create a mimimal wrapper for MSVC in C compilation mode.
+// This is why we try to use only `uintptr_t` and `<type>*` as atomic types.
+// To gain better insight in the range of used atomics, we use explicitly named memory order operations
+// instead of passing the memory order as a parameter.
+// -----------------------------------------------------------------------------------------------
+
+#if defined(__cplusplus)
+// Use C++ atomics
+#include <atomic>
+#define  _Atomic(tp)            std::atomic<tp>
+#define  mi_atomic(name)        std::atomic_##name
+#define  mi_memory_order(name)  std::memory_order_##name
+#if !defined(ATOMIC_VAR_INIT) || (__cplusplus >= 202002L) // c++20, see issue #571
+ #define MI_ATOMIC_VAR_INIT(x)  x
+#else
+ #define MI_ATOMIC_VAR_INIT(x)  ATOMIC_VAR_INIT(x)
+#endif
+#elif defined(_MSC_VER)
+// Use MSVC C wrapper for C11 atomics
+#define  _Atomic(tp)            tp
+#define  MI_ATOMIC_VAR_INIT(x)  x
+#define  mi_atomic(name)        mi_atomic_##name
+#define  mi_memory_order(name)  mi_memory_order_##name
+#else
+// Use C11 atomics
+#include <stdatomic.h>
+#define  mi_atomic(name)        atomic_##name
+#define  mi_memory_order(name)  memory_order_##name
+#define  MI_ATOMIC_VAR_INIT(x)  ATOMIC_VAR_INIT(x)
+#endif
+
+// Various defines for all used memory orders in mimalloc
+#define mi_atomic_cas_weak(p,expected,desired,mem_success,mem_fail)  \
+  mi_atomic(compare_exchange_weak_explicit)(p,expected,desired,mem_success,mem_fail)
+
+#define mi_atomic_cas_strong(p,expected,desired,mem_success,mem_fail)  \
+  mi_atomic(compare_exchange_strong_explicit)(p,expected,desired,mem_success,mem_fail)
+
+#define mi_atomic_load_acquire(p)                mi_atomic(load_explicit)(p,mi_memory_order(acquire))
+#define mi_atomic_load_relaxed(p)                mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
+#define mi_atomic_store_release(p,x)             mi_atomic(store_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_store_relaxed(p,x)             mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_exchange_release(p,x)          mi_atomic(exchange_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_exchange_acq_rel(p,x)          mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_cas_weak_release(p,exp,des)    mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
+#define mi_atomic_cas_weak_acq_rel(p,exp,des)    mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
+#define mi_atomic_cas_strong_release(p,exp,des)  mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
+#define mi_atomic_cas_strong_acq_rel(p,exp,des)  mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
+
+#define mi_atomic_add_relaxed(p,x)               mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_sub_relaxed(p,x)               mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_add_acq_rel(p,x)               mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_sub_acq_rel(p,x)               mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_and_acq_rel(p,x)               mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_or_acq_rel(p,x)                mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel))
+
+#define mi_atomic_increment_relaxed(p)           mi_atomic_add_relaxed(p,(uintptr_t)1)
+#define mi_atomic_decrement_relaxed(p)           mi_atomic_sub_relaxed(p,(uintptr_t)1)
+#define mi_atomic_increment_acq_rel(p)           mi_atomic_add_acq_rel(p,(uintptr_t)1)
+#define mi_atomic_decrement_acq_rel(p)           mi_atomic_sub_acq_rel(p,(uintptr_t)1)
+
+static inline void mi_atomic_yield(void);
+static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add);
+static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
+
+
+#if defined(__cplusplus) || !defined(_MSC_VER)
+
+// In C++/C11 atomics we have polymorphic atomics so can use the typed `ptr` variants (where `tp` is the type of atomic value)
+// We use these macros so we can provide a typed wrapper in MSVC in C compilation mode as well
+#define mi_atomic_load_ptr_acquire(tp,p)                mi_atomic_load_acquire(p)
+#define mi_atomic_load_ptr_relaxed(tp,p)                mi_atomic_load_relaxed(p)
+
+// In C++ we need to add casts to help resolve templates if NULL is passed
+#if defined(__cplusplus)
+#define mi_atomic_store_ptr_release(tp,p,x)             mi_atomic_store_release(p,(tp*)x)
+#define mi_atomic_store_ptr_relaxed(tp,p,x)             mi_atomic_store_relaxed(p,(tp*)x)
+#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release(p,exp,(tp*)des)
+#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel(p,exp,(tp*)des)
+#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release(p,exp,(tp*)des)
+#define mi_atomic_exchange_ptr_release(tp,p,x)          mi_atomic_exchange_release(p,(tp*)x)
+#define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          mi_atomic_exchange_acq_rel(p,(tp*)x)
+#else
+#define mi_atomic_store_ptr_release(tp,p,x)             mi_atomic_store_release(p,x)
+#define mi_atomic_store_ptr_relaxed(tp,p,x)             mi_atomic_store_relaxed(p,x)
+#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release(p,exp,des)
+#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel(p,exp,des)
+#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release(p,exp,des)
+#define mi_atomic_exchange_ptr_release(tp,p,x)          mi_atomic_exchange_release(p,x)
+#define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          mi_atomic_exchange_acq_rel(p,x)
+#endif
+
+// These are used by the statistics
+static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) {
+  return mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
+}
+static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) {
+  int64_t current = mi_atomic_load_relaxed((_Atomic(int64_t)*)p);
+  while (current < x && !mi_atomic_cas_weak_release((_Atomic(int64_t)*)p, &current, x)) { /* nothing */ };
+}
+
+// Used by timers
+#define mi_atomic_loadi64_acquire(p)    mi_atomic(load_explicit)(p,mi_memory_order(acquire))
+#define mi_atomic_loadi64_relaxed(p)    mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
+#define mi_atomic_storei64_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_storei64_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
+
+
+
+#elif defined(_MSC_VER)
+
+// MSVC C compilation wrapper that uses Interlocked operations to model C11 atomics.
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <intrin.h>
+#ifdef _WIN64
+typedef LONG64   msc_intptr_t;
+#define MI_64(f) f##64
+#else
+typedef LONG     msc_intptr_t;
+#define MI_64(f) f
+#endif
+
+typedef enum mi_memory_order_e {
+  mi_memory_order_relaxed,
+  mi_memory_order_consume,
+  mi_memory_order_acquire,
+  mi_memory_order_release,
+  mi_memory_order_acq_rel,
+  mi_memory_order_seq_cst
+} mi_memory_order;
+
+static inline uintptr_t mi_atomic_fetch_add_explicit(_Atomic(uintptr_t)*p, uintptr_t add, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add);
+}
+static inline uintptr_t mi_atomic_fetch_sub_explicit(_Atomic(uintptr_t)*p, uintptr_t sub, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, -((msc_intptr_t)sub));
+}
+static inline uintptr_t mi_atomic_fetch_and_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
+}
+static inline uintptr_t mi_atomic_fetch_or_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
+}
+static inline bool mi_atomic_compare_exchange_strong_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) {
+  (void)(mo1); (void)(mo2);
+  uintptr_t read = (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)(*expected));
+  if (read == *expected) {
+    return true;
+  }
+  else {
+    *expected = read;
+    return false;
+  }
+}
+static inline bool mi_atomic_compare_exchange_weak_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) {
+  return mi_atomic_compare_exchange_strong_explicit(p, expected, desired, mo1, mo2);
+}
+static inline uintptr_t mi_atomic_exchange_explicit(_Atomic(uintptr_t)*p, uintptr_t exchange, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange);
+}
+static inline void mi_atomic_thread_fence(mi_memory_order mo) {
+  (void)(mo);
+  _Atomic(uintptr_t) x = 0;
+  mi_atomic_exchange_explicit(&x, 1, mo);
+}
+static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_memory_order mo) {
+  (void)(mo);
+#if defined(_M_IX86) || defined(_M_X64)
+  return *p;
+#else
+  uintptr_t x = *p;
+  if (mo > mi_memory_order_relaxed) {
+    while (!mi_atomic_compare_exchange_weak_explicit(p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ };
+  }
+  return x;
+#endif
+}
+static inline void mi_atomic_store_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
+  (void)(mo);
+#if defined(_M_IX86) || defined(_M_X64)
+  *p = x;
+#else
+  mi_atomic_exchange_explicit(p, x, mo);
+#endif
+}
+static inline int64_t mi_atomic_loadi64_explicit(_Atomic(int64_t)*p, mi_memory_order mo) {
+  (void)(mo);
+#if defined(_M_X64)
+  return *p;
+#else
+  int64_t old = *p;
+  int64_t x = old;
+  while ((old = InterlockedCompareExchange64(p, x, old)) != x) {
+    x = old;
+  }
+  return x;
+#endif
+}
+static inline void mi_atomic_storei64_explicit(_Atomic(int64_t)*p, int64_t x, mi_memory_order mo) {
+  (void)(mo);
+#if defined(x_M_IX86) || defined(_M_X64)
+  *p = x;
+#else
+  InterlockedExchange64(p, x);
+#endif
+}
+
+// These are used by the statistics
+static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)*p, int64_t add) {
+#ifdef _WIN64
+  return (int64_t)mi_atomic_addi((int64_t*)p, add);
+#else
+  int64_t current;
+  int64_t sum;
+  do {
+    current = *p;
+    sum = current + add;
+  } while (_InterlockedCompareExchange64(p, sum, current) != current);
+  return current;
+#endif
+}
+static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t x) {
+  int64_t current;
+  do {
+    current = *p;
+  } while (current < x && _InterlockedCompareExchange64(p, x, current) != current);
+}
+
+// The pointer macros cast to `uintptr_t`.
+#define mi_atomic_load_ptr_acquire(tp,p)                (tp*)mi_atomic_load_acquire((_Atomic(uintptr_t)*)(p))
+#define mi_atomic_load_ptr_relaxed(tp,p)                (tp*)mi_atomic_load_relaxed((_Atomic(uintptr_t)*)(p))
+#define mi_atomic_store_ptr_release(tp,p,x)             mi_atomic_store_release((_Atomic(uintptr_t)*)(p),(uintptr_t)(x))
+#define mi_atomic_store_ptr_relaxed(tp,p,x)             mi_atomic_store_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)(x))
+#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
+#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
+#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
+#define mi_atomic_exchange_ptr_release(tp,p,x)          (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
+#define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
+
+#define mi_atomic_loadi64_acquire(p)    mi_atomic(loadi64_explicit)(p,mi_memory_order(acquire))
+#define mi_atomic_loadi64_relaxed(p)    mi_atomic(loadi64_explicit)(p,mi_memory_order(relaxed))
+#define mi_atomic_storei64_release(p,x) mi_atomic(storei64_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_storei64_relaxed(p,x) mi_atomic(storei64_explicit)(p,x,mi_memory_order(relaxed))
+
+
+#endif
+
+
+// Atomically add a signed value; returns the previous value.
+static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add) {
+  return (intptr_t)mi_atomic_add_acq_rel((_Atomic(uintptr_t)*)p, (uintptr_t)add);
+}
+
+// Atomically subtract a signed value; returns the previous value.
+static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) {
+  return (intptr_t)mi_atomic_addi(p, -sub);
+}
+
+// Yield
+#if defined(__cplusplus)
+#include <thread>
+static inline void mi_atomic_yield(void) {
+  std::this_thread::yield();
+}
+#elif defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+static inline void mi_atomic_yield(void) {
+  YieldProcessor();
+}
+#elif defined(__SSE2__)
+#include <emmintrin.h>
+static inline void mi_atomic_yield(void) {
+  _mm_pause();
+}
+#elif (defined(__GNUC__) || defined(__clang__)) && \
+      (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__armel__) || defined(__ARMEL__) || \
+       defined(__aarch64__) || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__))
+#if defined(__x86_64__) || defined(__i386__)
+static inline void mi_atomic_yield(void) {
+  __asm__ volatile ("pause" ::: "memory");
+}
+#elif defined(__aarch64__)
+static inline void mi_atomic_yield(void) {
+  __asm__ volatile("wfe");
+}
+#elif (defined(__arm__) && __ARM_ARCH__ >= 7)
+static inline void mi_atomic_yield(void) {
+  __asm__ volatile("yield" ::: "memory");
+}
+#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
+static inline void mi_atomic_yield(void) {
+  __asm__ __volatile__ ("or 27,27,27" ::: "memory");
+}
+#elif defined(__armel__) || defined(__ARMEL__)
+static inline void mi_atomic_yield(void) {
+  __asm__ volatile ("nop" ::: "memory");
+}
+#endif
+#elif defined(__sun)
+// Fallback for other archs
+#include <synch.h>
+static inline void mi_atomic_yield(void) {
+  smt_pause();
+}
+#elif defined(__wasi__)
+#include <sched.h>
+static inline void mi_atomic_yield(void) {
+  sched_yield();
+}
+#else
+#include <unistd.h>
+static inline void mi_atomic_yield(void) {
+  sleep(0);
+}
+#endif
+
+
+#endif // __MIMALLOC_ATOMIC_H
diff --git a/Include/mimalloc/mimalloc-internal.h b/Include/mimalloc/mimalloc-internal.h
new file mode 100644
index 00000000000000..a68e69662c741d
--- /dev/null
+++ b/Include/mimalloc/mimalloc-internal.h
@@ -0,0 +1,1116 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_INTERNAL_H
+#define MIMALLOC_INTERNAL_H
+
+#include "mimalloc-types.h"
+#include "mimalloc-track.h"
+
+#if (MI_DEBUG>0)
+#define mi_trace_message(...)  _mi_trace_message(__VA_ARGS__)
+#else
+#define mi_trace_message(...)
+#endif
+
+#define MI_CACHE_LINE          64
+#if defined(_MSC_VER)
+#pragma warning(disable:4127)   // suppress constant conditional warning (due to MI_SECURE paths)
+#pragma warning(disable:26812)  // unscoped enum warning
+#define mi_decl_noinline        __declspec(noinline)
+#define mi_decl_thread          __declspec(thread)
+#define mi_decl_cache_align     __declspec(align(MI_CACHE_LINE))
+#elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc
+#define mi_decl_noinline        __attribute__((noinline))
+#define mi_decl_thread          __thread
+#define mi_decl_cache_align     __attribute__((aligned(MI_CACHE_LINE)))
+#else
+#define mi_decl_noinline
+#define mi_decl_thread          __thread        // hope for the best :-)
+#define mi_decl_cache_align
+#endif
+
+#if defined(__EMSCRIPTEN__) && !defined(__wasi__)
+#define __wasi__
+#endif
+
+#if defined(__cplusplus)
+#define mi_decl_externc       extern "C"
+#else
+#define mi_decl_externc
+#endif
+
+#if !defined(_WIN32) && !defined(__wasi__)
+#define  MI_USE_PTHREADS
+#include <pthread.h>
+#endif
+
+// "options.c"
+void       _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message);
+void       _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...);
+void       _mi_warning_message(const char* fmt, ...);
+void       _mi_verbose_message(const char* fmt, ...);
+void       _mi_trace_message(const char* fmt, ...);
+void       _mi_options_init(void);
+void       _mi_error_message(int err, const char* fmt, ...);
+
+// random.c
+void       _mi_random_init(mi_random_ctx_t* ctx);
+void       _mi_random_init_weak(mi_random_ctx_t* ctx);
+void       _mi_random_reinit_if_weak(mi_random_ctx_t * ctx);
+void       _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx);
+uintptr_t  _mi_random_next(mi_random_ctx_t* ctx);
+uintptr_t  _mi_heap_random_next(mi_heap_t* heap);
+uintptr_t  _mi_os_random_weak(uintptr_t extra_seed);
+static inline uintptr_t _mi_random_shuffle(uintptr_t x);
+
+// init.c
+extern mi_decl_cache_align mi_stats_t       _mi_stats_main;
+extern mi_decl_cache_align const mi_page_t  _mi_page_empty;
+bool       _mi_is_main_thread(void);
+size_t     _mi_current_thread_count(void);
+bool       _mi_preloading(void);  // true while the C runtime is not ready
+
+// os.c
+size_t     _mi_os_page_size(void);
+void       _mi_os_init(void);                                      // called from process init
+void*      _mi_os_alloc(size_t size, mi_stats_t* stats);           // to allocate thread local data
+void       _mi_os_free(void* p, size_t size, mi_stats_t* stats);   // to free thread local data
+
+bool       _mi_os_protect(void* addr, size_t size);
+bool       _mi_os_unprotect(void* addr, size_t size);
+bool       _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* stats);
+bool       _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
+bool       _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
+// bool       _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+size_t     _mi_os_good_alloc_size(size_t size);
+bool       _mi_os_has_overcommit(void);
+bool       _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats);
+
+void*      _mi_os_alloc_aligned_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool* large, mi_stats_t* tld_stats);
+void       _mi_os_free_aligned(void* p, size_t size, size_t alignment, size_t align_offset, bool was_committed, mi_stats_t* tld_stats);
+
+// arena.c
+void*      _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld);
+void*      _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld);
+void       _mi_arena_free(void* p, size_t size, size_t alignment, size_t align_offset, size_t memid, bool all_committed, mi_stats_t* stats);
+mi_arena_id_t _mi_arena_id_none(void);
+bool       _mi_arena_memid_is_suitable(size_t memid, mi_arena_id_t req_arena_id);
+
+// "segment-cache.c"
+void*      _mi_segment_cache_pop(size_t size, mi_commit_mask_t* commit_mask, mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld);
+bool       _mi_segment_cache_push(void* start, size_t size, size_t memid, const mi_commit_mask_t* commit_mask, const mi_commit_mask_t* decommit_mask, bool is_large, bool is_pinned, mi_os_tld_t* tld);
+void       _mi_segment_cache_collect(bool force, mi_os_tld_t* tld);
+void       _mi_segment_cache_free_all(mi_os_tld_t* tld);
+void       _mi_segment_map_allocated_at(const mi_segment_t* segment);
+void       _mi_segment_map_freed_at(const mi_segment_t* segment);
+
+// "segment.c"
+mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld);
+void       _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld);
+void       _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld);
+bool       _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld);
+void       _mi_segment_thread_collect(mi_segments_tld_t* tld);
+
+#if MI_HUGE_PAGE_ABANDON
+void       _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block);
+#else
+void       _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block);
+#endif
+
+uint8_t*   _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); // page start for any page
+void       _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld);
+void       _mi_abandoned_await_readers(void);
+void       _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld);
+
+
+
+// "page.c"
+void*      _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment)  mi_attr_noexcept mi_attr_malloc;
+
+void       _mi_page_retire(mi_page_t* page) mi_attr_noexcept;                  // free the page if there are no other pages with many free blocks
+void       _mi_page_unfull(mi_page_t* page);
+void       _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force);   // free the page
+void       _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq);            // abandon the page, to be picked up by another thread...
+void       _mi_heap_delayed_free_all(mi_heap_t* heap);
+bool       _mi_heap_delayed_free_partial(mi_heap_t* heap);
+void       _mi_heap_collect_retired(mi_heap_t* heap, bool force);
+
+void       _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never);
+bool       _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never);
+size_t     _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append);
+void       _mi_deferred_free(mi_heap_t* heap, bool force);
+
+void       _mi_page_free_collect(mi_page_t* page,bool force);
+void       _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page);   // callback from segments
+
+size_t     _mi_bin_size(uint8_t bin);           // for stats
+uint8_t    _mi_bin(size_t size);                // for stats
+
+// "heap.c"
+void       _mi_heap_destroy_pages(mi_heap_t* heap);
+void       _mi_heap_collect_abandon(mi_heap_t* heap);
+void       _mi_heap_set_default_direct(mi_heap_t* heap);
+bool       _mi_heap_memid_is_suitable(mi_heap_t* heap, size_t memid);
+void       _mi_heap_destroy_all(void);
+
+// "stats.c"
+void       _mi_stats_done(mi_stats_t* stats);
+
+mi_msecs_t  _mi_clock_now(void);
+mi_msecs_t  _mi_clock_end(mi_msecs_t start);
+mi_msecs_t  _mi_clock_start(void);
+
+// "alloc.c"
+void*       _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept;  // called from `_mi_malloc_generic`
+void*       _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept;
+void*       _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept;     // called from `_mi_heap_malloc_aligned`
+void*       _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept;
+mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p);
+bool        _mi_free_delayed_block(mi_block_t* block);
+void        _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept;  // for runtime integration
+
+#if MI_DEBUG>1
+bool        _mi_page_is_valid(mi_page_t* page);
+#endif
+
+
+// ------------------------------------------------------
+// Branches
+// ------------------------------------------------------
+
+#if defined(__GNUC__) || defined(__clang__)
+#define mi_unlikely(x)     (__builtin_expect(!!(x),false))
+#define mi_likely(x)       (__builtin_expect(!!(x),true))
+#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
+#define mi_unlikely(x)     (x) [[unlikely]]
+#define mi_likely(x)       (x) [[likely]]
+#else
+#define mi_unlikely(x)     (x)
+#define mi_likely(x)       (x)
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x)  0
+#endif
+
+
+/* -----------------------------------------------------------
+  Error codes passed to `_mi_fatal_error`
+  All are recoverable but EFAULT is a serious error and aborts by default in secure mode.
+  For portability define undefined error codes using common Unix codes:
+  <https://www-numi.fnal.gov/offline_software/srt_public_context/WebDocs/Errors/unix_system_errors.html>
+----------------------------------------------------------- */
+#include <errno.h>
+#ifndef EAGAIN         // double free
+#define EAGAIN (11)
+#endif
+#ifndef ENOMEM         // out of memory
+#define ENOMEM (12)
+#endif
+#ifndef EFAULT         // corrupted free-list or meta-data
+#define EFAULT (14)
+#endif
+#ifndef EINVAL         // trying to free an invalid pointer
+#define EINVAL (22)
+#endif
+#ifndef EOVERFLOW      // count*size overflow
+#define EOVERFLOW (75)
+#endif
+
+
+/* -----------------------------------------------------------
+  Inlined definitions
+----------------------------------------------------------- */
+#define MI_UNUSED(x)     (void)(x)
+#if (MI_DEBUG>0)
+#define MI_UNUSED_RELEASE(x)
+#else
+#define MI_UNUSED_RELEASE(x)  MI_UNUSED(x)
+#endif
+
+#define MI_INIT4(x)   x(),x(),x(),x()
+#define MI_INIT8(x)   MI_INIT4(x),MI_INIT4(x)
+#define MI_INIT16(x)  MI_INIT8(x),MI_INIT8(x)
+#define MI_INIT32(x)  MI_INIT16(x),MI_INIT16(x)
+#define MI_INIT64(x)  MI_INIT32(x),MI_INIT32(x)
+#define MI_INIT128(x) MI_INIT64(x),MI_INIT64(x)
+#define MI_INIT256(x) MI_INIT128(x),MI_INIT128(x)
+
+
+// Is `x` a power of two? (0 is considered a power of two)
+static inline bool _mi_is_power_of_two(uintptr_t x) {
+  return ((x & (x - 1)) == 0);
+}
+
+// Is a pointer aligned?
+static inline bool _mi_is_aligned(void* p, size_t alignment) {
+  mi_assert_internal(alignment != 0);
+  return (((uintptr_t)p % alignment) == 0);
+}
+
+// Align upwards
+static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
+  mi_assert_internal(alignment != 0);
+  uintptr_t mask = alignment - 1;
+  if ((alignment & mask) == 0) {  // power of two?
+    return ((sz + mask) & ~mask);
+  }
+  else {
+    return (((sz + mask)/alignment)*alignment);
+  }
+}
+
+// Align downwards
+static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) {
+  mi_assert_internal(alignment != 0);
+  uintptr_t mask = alignment - 1;
+  if ((alignment & mask) == 0) { // power of two?
+    return (sz & ~mask);
+  }
+  else {
+    return ((sz / alignment) * alignment);
+  }
+}
+
+// Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`.
+static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) {
+  mi_assert_internal(divider != 0);
+  return (divider == 0 ? size : ((size + divider - 1) / divider));
+}
+
+// Is memory zero initialized?
+static inline bool mi_mem_is_zero(void* p, size_t size) {
+  for (size_t i = 0; i < size; i++) {
+    if (((uint8_t*)p)[i] != 0) return false;
+  }
+  return true;
+}
+
+
+// Align a byte size to a size in _machine words_,
+// i.e. byte size == `wsize*sizeof(void*)`.
+static inline size_t _mi_wsize_from_size(size_t size) {
+  mi_assert_internal(size <= SIZE_MAX - sizeof(uintptr_t));
+  return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
+}
+
+// Overflow detecting multiply
+#if __has_builtin(__builtin_umul_overflow) || (defined(__GNUC__) && (__GNUC__ >= 5))
+#include <limits.h>      // UINT_MAX, ULONG_MAX
+#if defined(_CLOCK_T)    // for Illumos
+#undef _CLOCK_T
+#endif
+static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
+  #if (SIZE_MAX == ULONG_MAX)
+    return __builtin_umull_overflow(count, size, (unsigned long *)total);
+  #elif (SIZE_MAX == UINT_MAX)
+    return __builtin_umul_overflow(count, size, (unsigned int *)total);
+  #else
+    return __builtin_umulll_overflow(count, size, (unsigned long long *)total);
+  #endif
+}
+#else /* __builtin_umul_overflow is unavailable */
+static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
+  #define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t)))  // sqrt(SIZE_MAX)
+  *total = count * size;
+  // note: gcc/clang optimize this to directly check the overflow flag
+  return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW) && size > 0 && (SIZE_MAX / size) < count);
+}
+#endif
+
+// Safe multiply `count*size` into `total`; return `true` on overflow.
+static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* total) {
+  if (count==1) {  // quick check for the case where count is one (common for C++ allocators)
+    *total = size;
+    return false;
+  }
+  else if mi_unlikely(mi_mul_overflow(count, size, total)) {
+    #if MI_DEBUG > 0
+    _mi_error_message(EOVERFLOW, "allocation request is too large (%zu * %zu bytes)\n", count, size);
+    #endif
+    *total = SIZE_MAX;
+    return true;
+  }
+  else return false;
+}
+
+
+/* ----------------------------------------------------------------------------------------
+The thread local default heap: `_mi_get_default_heap` returns the thread local heap.
+On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a
+__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures
+that the storage will always be available (allocated on the thread stacks).
+On some platforms though we cannot use that when overriding `malloc` since the underlying
+TLS implementation (or the loader) will call itself `malloc` on a first access and recurse.
+We try to circumvent this in an efficient way:
+- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the
+           loader itself calls `malloc` even before the modules are initialized.
+- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
+- DragonFly: defaults are working but seem slow compared to freeBSD (see PR #323)
+------------------------------------------------------------------------------------------- */
+
+extern const mi_heap_t _mi_heap_empty;  // read-only empty heap, initial value of the thread local default heap
+extern bool _mi_process_is_initialized;
+mi_heap_t*  _mi_heap_main_get(void);    // statically allocated main backing heap
+
+#if defined(MI_MALLOC_OVERRIDE)
+#if defined(__APPLE__) // macOS
+#define MI_TLS_SLOT               89  // seems unused?
+// #define MI_TLS_RECURSE_GUARD 1
+// other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
+// see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
+#elif defined(__OpenBSD__)
+// use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16)
+// see <https://github.com/openbsd/src/blob/master/lib/libc/include/thread_private.h#L371>
+#define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 4*sizeof(void*) + 24)
+// #elif defined(__DragonFly__)
+// #warning "mimalloc is not working correctly on DragonFly yet."
+// #define MI_TLS_PTHREAD_SLOT_OFS   (4 + 1*sizeof(void*))  // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
+#elif defined(__ANDROID__)
+// See issue #381
+#define MI_TLS_PTHREAD
+#endif
+#endif
+
+#if defined(MI_TLS_SLOT)
+static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept;   // forward declaration
+#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
+  pthread_t self = pthread_self();
+  #if defined(__DragonFly__)
+  if (self==NULL) {
+    mi_heap_t* pheap_main = _mi_heap_main_get();
+    return &pheap_main;
+  }
+  #endif
+  return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
+}
+#elif defined(MI_TLS_PTHREAD)
+extern pthread_key_t _mi_heap_default_key;
+#endif
+
+// Default heap to allocate from (if not using TLS- or pthread slots).
+// Do not use this directly but use through `mi_heap_get_default()` (or the unchecked `mi_get_default_heap`).
+// This thread local variable is only used when neither MI_TLS_SLOT, MI_TLS_PTHREAD, or MI_TLS_PTHREAD_SLOT_OFS are defined.
+// However, on the Apple M1 we do use the address of this variable as the unique thread-id (issue #356).
+extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
+
+static inline mi_heap_t* mi_get_default_heap(void) {
+#if defined(MI_TLS_SLOT)
+  mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT);
+  if mi_unlikely(heap == NULL) {
+    #ifdef __GNUC__
+    __asm(""); // prevent conditional load of the address of _mi_heap_empty
+    #endif
+    heap = (mi_heap_t*)&_mi_heap_empty;
+  }
+  return heap;
+#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+  mi_heap_t* heap = *mi_tls_pthread_heap_slot();
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#elif defined(MI_TLS_PTHREAD)
+  mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#else
+  #if defined(MI_TLS_RECURSE_GUARD)
+  if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get();
+  #endif
+  return _mi_heap_default;
+#endif
+}
+
+static inline bool mi_heap_is_default(const mi_heap_t* heap) {
+  return (heap == mi_get_default_heap());
+}
+
+static inline bool mi_heap_is_backing(const mi_heap_t* heap) {
+  return (heap->tld->heap_backing == heap);
+}
+
+static inline bool mi_heap_is_initialized(mi_heap_t* heap) {
+  mi_assert_internal(heap != NULL);
+  return (heap != &_mi_heap_empty);
+}
+
+static inline uintptr_t _mi_ptr_cookie(const void* p) {
+  extern mi_heap_t _mi_heap_main;
+  mi_assert_internal(_mi_heap_main.cookie != 0);
+  return ((uintptr_t)p ^ _mi_heap_main.cookie);
+}
+
+/* -----------------------------------------------------------
+  Pages
+----------------------------------------------------------- */
+
+static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) {
+  mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE));
+  const size_t idx = _mi_wsize_from_size(size);
+  mi_assert_internal(idx < MI_PAGES_DIRECT);
+  return heap->pages_free_direct[idx];
+}
+
+// Get the page belonging to a certain size class
+static inline mi_page_t* _mi_get_free_small_page(size_t size) {
+  return _mi_heap_get_free_small_page(mi_get_default_heap(), size);
+}
+
+// Segment that contains the pointer
+// Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a huge segment > MI_SEGMENT_SIZE),
+// and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it;
+// therefore we align one byte before `p`.
+static inline mi_segment_t* _mi_ptr_segment(const void* p) {
+  mi_assert_internal(p != NULL);
+  return (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK);
+}
+
+static inline mi_page_t* mi_slice_to_page(mi_slice_t* s) {
+  mi_assert_internal(s->slice_offset== 0 && s->slice_count > 0);
+  return (mi_page_t*)(s);
+}
+
+static inline mi_slice_t* mi_page_to_slice(mi_page_t* p) {
+  mi_assert_internal(p->slice_offset== 0 && p->slice_count > 0);
+  return (mi_slice_t*)(p);
+}
+
+// Segment belonging to a page
+static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
+  mi_segment_t* segment = _mi_ptr_segment(page); 
+  mi_assert_internal(segment == NULL || ((mi_slice_t*)page >= segment->slices && (mi_slice_t*)page < segment->slices + segment->slice_entries));
+  return segment;
+}
+
+static inline mi_slice_t* mi_slice_first(const mi_slice_t* slice) {
+  mi_slice_t* start = (mi_slice_t*)((uint8_t*)slice - slice->slice_offset);
+  mi_assert_internal(start >= _mi_ptr_segment(slice)->slices);
+  mi_assert_internal(start->slice_offset == 0);
+  mi_assert_internal(start + start->slice_count > slice);
+  return start;
+}
+
+// Get the page containing the pointer (performance critical as it is called in mi_free)
+static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
+  mi_assert_internal(p > (void*)segment);
+  ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;
+  mi_assert_internal(diff > 0 && diff <= (ptrdiff_t)MI_SEGMENT_SIZE);
+  size_t idx = (size_t)diff >> MI_SEGMENT_SLICE_SHIFT;
+  mi_assert_internal(idx <= segment->slice_entries);
+  mi_slice_t* slice0 = (mi_slice_t*)&segment->slices[idx];
+  mi_slice_t* slice = mi_slice_first(slice0);  // adjust to the block that holds the page data
+  mi_assert_internal(slice->slice_offset == 0);
+  mi_assert_internal(slice >= segment->slices && slice < segment->slices + segment->slice_entries);
+  return mi_slice_to_page(slice);
+}
+
+// Quick page start for initialized pages
+static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) {
+  return _mi_segment_page_start(segment, page, page_size);
+}
+
+// Get the page containing the pointer
+static inline mi_page_t* _mi_ptr_page(void* p) {
+  return _mi_segment_page_of(_mi_ptr_segment(p), p);
+}
+
+// Get the block size of a page (special case for huge objects)
+static inline size_t mi_page_block_size(const mi_page_t* page) {
+  const size_t bsize = page->xblock_size;
+  mi_assert_internal(bsize > 0);
+  if mi_likely(bsize < MI_HUGE_BLOCK_SIZE) {
+    return bsize;
+  }
+  else {
+    size_t psize;
+    _mi_segment_page_start(_mi_page_segment(page), page, &psize);
+    return psize;
+  }
+}
+
+static inline bool mi_page_is_huge(const mi_page_t* page) {
+  return (_mi_page_segment(page)->kind == MI_SEGMENT_HUGE);
+}
+
+// Get the usable block size of a page without fixed padding.
+// This may still include internal padding due to alignment and rounding up size classes.
+static inline size_t mi_page_usable_block_size(const mi_page_t* page) {
+  return mi_page_block_size(page) - MI_PADDING_SIZE;
+}
+
+// size of a segment
+static inline size_t mi_segment_size(mi_segment_t* segment) {
+  return segment->segment_slices * MI_SEGMENT_SLICE_SIZE;
+}
+
+static inline uint8_t* mi_segment_end(mi_segment_t* segment) {
+  return (uint8_t*)segment + mi_segment_size(segment);
+}
+
+// Thread free access
+static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) {
+  return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3);
+}
+
+static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) {
+  return (mi_delayed_t)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & 3);
+}
+
+// Heap access
+static inline mi_heap_t* mi_page_heap(const mi_page_t* page) {
+  return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap));
+}
+
+static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) {
+  mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING);
+  mi_atomic_store_release(&page->xheap,(uintptr_t)heap);
+}
+
+// Thread free flag helpers
+static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) {
+  return (mi_block_t*)(tf & ~0x03);
+}
+static inline mi_delayed_t mi_tf_delayed(mi_thread_free_t tf) {
+  return (mi_delayed_t)(tf & 0x03);
+}
+static inline mi_thread_free_t mi_tf_make(mi_block_t* block, mi_delayed_t delayed) {
+  return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed);
+}
+static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) {
+  return mi_tf_make(mi_tf_block(tf),delayed);
+}
+static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) {
+  return mi_tf_make(block, mi_tf_delayed(tf));
+}
+
+// are all blocks in a page freed?
+// note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`.
+static inline bool mi_page_all_free(const mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  return (page->used == 0);
+}
+
+// are there any available blocks?
+static inline bool mi_page_has_any_available(const mi_page_t* page) {
+  mi_assert_internal(page != NULL && page->reserved > 0);
+  return (page->used < page->reserved || (mi_page_thread_free(page) != NULL));
+}
+
+// are there immediately available blocks, i.e. blocks available on the free list.
+static inline bool mi_page_immediate_available(const mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  return (page->free != NULL);
+}
+
+// is more than 7/8th of a page in use?
+static inline bool mi_page_mostly_used(const mi_page_t* page) {
+  if (page==NULL) return true;
+  uint16_t frac = page->reserved / 8U;
+  return (page->reserved - page->used <= frac);
+}
+
+static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) {
+  return &((mi_heap_t*)heap)->pages[_mi_bin(size)];
+}
+
+
+
+//-----------------------------------------------------------
+// Page flags
+//-----------------------------------------------------------
+static inline bool mi_page_is_in_full(const mi_page_t* page) {
+  return page->flags.x.in_full;
+}
+
+static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) {
+  page->flags.x.in_full = in_full;
+}
+
+static inline bool mi_page_has_aligned(const mi_page_t* page) {
+  return page->flags.x.has_aligned;
+}
+
+static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
+  page->flags.x.has_aligned = has_aligned;
+}
+
+
+/* -------------------------------------------------------------------
+Encoding/Decoding the free list next pointers
+
+This is to protect against buffer overflow exploits where the
+free list is mutated. Many hardened allocators xor the next pointer `p`
+with a secret key `k1`, as `p^k1`. This prevents overwriting with known
+values but might be still too weak: if the attacker can guess
+the pointer `p` this  can reveal `k1` (since `p^k1^p == k1`).
+Moreover, if multiple blocks can be read as well, the attacker can
+xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot
+about the pointers (and subsequently `k1`).
+
+Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<<<k1)+k1`.
+Since these operations are not associative, the above approaches do not
+work so well any more even if the `p` can be guesstimated. For example,
+for the read case we can subtract two entries to discard the `+k1` term,
+but that leads to `((p1^k2)<<<k1) - ((p2^k2)<<<k1)` at best.
+We include the left-rotation since xor and addition are otherwise linear
+in the lowest bit. Finally, both keys are unique per page which reduces
+the re-use of keys by a large factor.
+
+We also pass a separate `null` value to be used as `NULL` or otherwise
+`(k2<<<k1)+k1` would appear (too) often as a sentinel value.
+------------------------------------------------------------------- */
+
+static inline bool mi_is_in_same_segment(const void* p, const void* q) {
+  return (_mi_ptr_segment(p) == _mi_ptr_segment(q));
+}
+
+static inline bool mi_is_in_same_page(const void* p, const void* q) {
+  mi_segment_t* segment = _mi_ptr_segment(p);
+  if (_mi_ptr_segment(q) != segment) return false;
+  // assume q may be invalid // return (_mi_segment_page_of(segment, p) == _mi_segment_page_of(segment, q));
+  mi_page_t* page = _mi_segment_page_of(segment, p);
+  size_t psize;
+  uint8_t* start = _mi_segment_page_start(segment, page, &psize);
+  return (start <= (uint8_t*)q && (uint8_t*)q < start + psize);
+}
+
+static inline uintptr_t mi_rotl(uintptr_t x, uintptr_t shift) {
+  shift %= MI_INTPTR_BITS;
+  return (shift==0 ? x : ((x << shift) | (x >> (MI_INTPTR_BITS - shift))));
+}
+static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) {
+  shift %= MI_INTPTR_BITS;
+  return (shift==0 ? x : ((x >> shift) | (x << (MI_INTPTR_BITS - shift))));
+}
+
+static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) {
+  void* p = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]);
+  return (p==null ? NULL : p);
+}
+
+static inline mi_encoded_t mi_ptr_encode(const void* null, const void* p, const uintptr_t* keys) {
+  uintptr_t x = (uintptr_t)(p==NULL ? null : p);
+  return mi_rotl(x ^ keys[1], keys[0]) + keys[0];
+}
+
+static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, const uintptr_t* keys ) {
+  mi_track_mem_defined(block,sizeof(mi_block_t));
+  mi_block_t* next;
+  #ifdef MI_ENCODE_FREELIST
+  next = (mi_block_t*)mi_ptr_decode(null, block->next, keys);
+  #else
+  MI_UNUSED(keys); MI_UNUSED(null);
+  next = (mi_block_t*)block->next;
+  #endif
+  mi_track_mem_noaccess(block,sizeof(mi_block_t));
+  return next;
+}
+
+static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, const uintptr_t* keys) {
+  mi_track_mem_undefined(block,sizeof(mi_block_t));
+  #ifdef MI_ENCODE_FREELIST
+  block->next = mi_ptr_encode(null, next, keys);
+  #else
+  MI_UNUSED(keys); MI_UNUSED(null);
+  block->next = (mi_encoded_t)next;
+  #endif
+  mi_track_mem_noaccess(block,sizeof(mi_block_t));
+}
+
+static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) {
+  #ifdef MI_ENCODE_FREELIST
+  mi_block_t* next = mi_block_nextx(page,block,page->keys);
+  // check for free list corruption: is `next` at least in the same page?
+  // TODO: check if `next` is `page->block_size` aligned?
+  if mi_unlikely(next!=NULL && !mi_is_in_same_page(block, next)) {
+    _mi_error_message(EFAULT, "corrupted free list entry of size %zub at %p: value 0x%zx\n", mi_page_block_size(page), block, (uintptr_t)next);
+    next = NULL;
+  }
+  return next;
+  #else
+  MI_UNUSED(page);
+  return mi_block_nextx(page,block,NULL);
+  #endif
+}
+
+static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) {
+  #ifdef MI_ENCODE_FREELIST
+  mi_block_set_nextx(page,block,next, page->keys);
+  #else
+  MI_UNUSED(page);
+  mi_block_set_nextx(page,block,next,NULL);
+  #endif
+}
+
+
+// -------------------------------------------------------------------
+// commit mask
+// -------------------------------------------------------------------
+
+static inline void mi_commit_mask_create_empty(mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    cm->mask[i] = 0;
+  }
+}
+
+static inline void mi_commit_mask_create_full(mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    cm->mask[i] = ~((size_t)0);
+  }
+}
+
+static inline bool mi_commit_mask_is_empty(const mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    if (cm->mask[i] != 0) return false;
+  }
+  return true;
+}
+
+static inline bool mi_commit_mask_is_full(const mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    if (cm->mask[i] != ~((size_t)0)) return false;
+  }
+  return true;
+}
+
+// defined in `segment.c`:
+size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total);
+size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx);
+
+#define mi_commit_mask_foreach(cm,idx,count) \
+  idx = 0; \
+  while ((count = _mi_commit_mask_next_run(cm,&idx)) > 0) { 
+        
+#define mi_commit_mask_foreach_end() \
+    idx += count; \
+  }
+      
+
+
+
+// -------------------------------------------------------------------
+// Fast "random" shuffle
+// -------------------------------------------------------------------
+
+static inline uintptr_t _mi_random_shuffle(uintptr_t x) {
+  if (x==0) { x = 17; }   // ensure we don't get stuck in generating zeros
+#if (MI_INTPTR_SIZE==8)
+  // by Sebastiano Vigna, see: <http://xoshiro.di.unimi.it/splitmix64.c>
+  x ^= x >> 30;
+  x *= 0xbf58476d1ce4e5b9UL;
+  x ^= x >> 27;
+  x *= 0x94d049bb133111ebUL;
+  x ^= x >> 31;
+#elif (MI_INTPTR_SIZE==4)
+  // by Chris Wellons, see: <https://nullprogram.com/blog/2018/07/31/>
+  x ^= x >> 16;
+  x *= 0x7feb352dUL;
+  x ^= x >> 15;
+  x *= 0x846ca68bUL;
+  x ^= x >> 16;
+#endif
+  return x;
+}
+
+// -------------------------------------------------------------------
+// Optimize numa node access for the common case (= one node)
+// -------------------------------------------------------------------
+
+int    _mi_os_numa_node_get(mi_os_tld_t* tld);
+size_t _mi_os_numa_node_count_get(void);
+
+extern _Atomic(size_t) _mi_numa_node_count;
+static inline int _mi_os_numa_node(mi_os_tld_t* tld) {
+  if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; }
+  else return _mi_os_numa_node_get(tld);
+}
+static inline size_t _mi_os_numa_node_count(void) {
+  const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count);
+  if mi_likely(count > 0) { return count; }
+  else return _mi_os_numa_node_count_get();
+}
+
+
+// -------------------------------------------------------------------
+// Getting the thread id should be performant as it is called in the
+// fast path of `_mi_free` and we specialize for various platforms.
+// We only require _mi_threadid() to return a unique id for each thread.
+// -------------------------------------------------------------------
+#if defined(_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
+  // Windows: works on Intel and ARM in both 32- and 64-bit
+  return (uintptr_t)NtCurrentTeb();
+}
+
+// We use assembly for a fast thread id on the main platforms. The TLS layout depends on
+// both the OS and libc implementation so we use specific tests for each main platform.
+// If you test on another platform and it works please send a PR :-)
+// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
+#elif defined(__GNUC__) && ( \
+           (defined(__GLIBC__)   && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
+        || (defined(__APPLE__)   && (defined(__x86_64__) || defined(__aarch64__))) \
+        || (defined(__BIONIC__)  && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
+        || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
+        || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
+      )
+
+static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept {
+  void* res;
+  const size_t ofs = (slot*sizeof(void*));
+  #if defined(__i386__)
+    __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86 32-bit always uses GS
+  #elif defined(__APPLE__) && defined(__x86_64__)
+    __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 macOSX uses GS
+  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
+    __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x32 ABI
+  #elif defined(__x86_64__)
+    __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 Linux, BSD uses FS
+  #elif defined(__arm__)
+    void** tcb; MI_UNUSED(ofs);
+    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+    res = tcb[slot];
+  #elif defined(__aarch64__)
+    void** tcb; MI_UNUSED(ofs);
+    #if defined(__APPLE__) // M1, issue #343
+    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
+    #else
+    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+    #endif
+    res = tcb[slot];
+  #endif
+  return res;
+}
+
+// setting a tls slot is only used on macOS for now
+static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
+  const size_t ofs = (slot*sizeof(void*));
+  #if defined(__i386__)
+    __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // 32-bit always uses GS
+  #elif defined(__APPLE__) && defined(__x86_64__)
+    __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 macOS uses GS
+  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
+    __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x32 ABI
+  #elif defined(__x86_64__)
+    __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 Linux, BSD uses FS
+  #elif defined(__arm__)
+    void** tcb; MI_UNUSED(ofs);
+    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+    tcb[slot] = value;
+  #elif defined(__aarch64__)
+    void** tcb; MI_UNUSED(ofs);
+    #if defined(__APPLE__) // M1, issue #343
+    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
+    #else
+    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+    #endif
+    tcb[slot] = value;
+  #endif
+}
+
+static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
+  #if defined(__BIONIC__)
+    // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
+    // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
+    return (uintptr_t)mi_tls_slot(1);
+  #else
+    // in all our other targets, slot 0 is the thread id
+    // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h
+    // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36
+    return (uintptr_t)mi_tls_slot(0);
+  #endif
+}
+
+#else
+
+// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms).
+static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
+  return (uintptr_t)&_mi_heap_default;
+}
+
+#endif
+
+
+// -----------------------------------------------------------------------
+// Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero)
+// -----------------------------------------------------------------------
+
+#if defined(__GNUC__)
+
+#include <limits.h>       // LONG_MAX
+#define MI_HAVE_FAST_BITSCAN
+static inline size_t mi_clz(uintptr_t x) {
+  if (x==0) return MI_INTPTR_BITS;
+#if (INTPTR_MAX == LONG_MAX)
+  return __builtin_clzl(x);
+#else
+  return __builtin_clzll(x);
+#endif
+}
+static inline size_t mi_ctz(uintptr_t x) {
+  if (x==0) return MI_INTPTR_BITS;
+#if (INTPTR_MAX == LONG_MAX)
+  return __builtin_ctzl(x);
+#else
+  return __builtin_ctzll(x);
+#endif
+}
+
+#elif defined(_MSC_VER)
+
+#include <limits.h>       // LONG_MAX
+#define MI_HAVE_FAST_BITSCAN
+static inline size_t mi_clz(uintptr_t x) {
+  if (x==0) return MI_INTPTR_BITS;
+  unsigned long idx;
+#if (INTPTR_MAX == LONG_MAX)
+  _BitScanReverse(&idx, x);
+#else
+  _BitScanReverse64(&idx, x);
+#endif
+  return ((MI_INTPTR_BITS - 1) - idx);
+}
+static inline size_t mi_ctz(uintptr_t x) {
+  if (x==0) return MI_INTPTR_BITS;
+  unsigned long idx;
+#if (INTPTR_MAX == LONG_MAX)
+  _BitScanForward(&idx, x);
+#else
+  _BitScanForward64(&idx, x);
+#endif
+  return idx;
+}
+
+#else
+static inline size_t mi_ctz32(uint32_t x) {
+  // de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
+  static const unsigned char debruijn[32] = {
+    0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+    31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+  };
+  if (x==0) return 32;
+  return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27];
+}
+static inline size_t mi_clz32(uint32_t x) {
+  // de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
+  static const uint8_t debruijn[32] = {
+    31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1,
+    23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0
+  };
+  if (x==0) return 32;
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27];
+}
+
+static inline size_t mi_clz(uintptr_t x) {
+  if (x==0) return MI_INTPTR_BITS;
+#if (MI_INTPTR_BITS <= 32)
+  return mi_clz32((uint32_t)x);
+#else
+  size_t count = mi_clz32((uint32_t)(x >> 32));
+  if (count < 32) return count;
+  return (32 + mi_clz32((uint32_t)x));
+#endif
+}
+static inline size_t mi_ctz(uintptr_t x) {
+  if (x==0) return MI_INTPTR_BITS;
+#if (MI_INTPTR_BITS <= 32)
+  return mi_ctz32((uint32_t)x);
+#else
+  size_t count = mi_ctz32((uint32_t)x);
+  if (count < 32) return count;
+  return (32 + mi_ctz32((uint32_t)(x>>32)));
+#endif
+}
+
+#endif
+
+// "bit scan reverse": Return index of the highest bit (or MI_INTPTR_BITS if `x` is zero)
+static inline size_t mi_bsr(uintptr_t x) {
+  return (x==0 ? MI_INTPTR_BITS : MI_INTPTR_BITS - 1 - mi_clz(x));
+}
+
+
+// ---------------------------------------------------------------------------------
+// Provide our own `_mi_memcpy` for potential performance optimizations.
+//
+// For now, only on Windows with msvc/clang-cl we optimize to `rep movsb` if
+// we happen to run on x86/x64 cpu's that have "fast short rep movsb" (FSRM) support
+// (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017). See also issue #201 and pr #253.
+// ---------------------------------------------------------------------------------
+
+#if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
+#include <intrin.h>
+#include <string.h>
+extern bool _mi_cpu_has_fsrm;
+static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
+  if (_mi_cpu_has_fsrm) {
+    __movsb((unsigned char*)dst, (const unsigned char*)src, n);
+  }
+  else {
+    memcpy(dst, src, n);
+  }
+}
+static inline void _mi_memzero(void* dst, size_t n) {
+  if (_mi_cpu_has_fsrm) {
+    __stosb((unsigned char*)dst, 0, n);
+  }
+  else {
+    memset(dst, 0, n);
+  }
+}
+#else
+#include <string.h>
+static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
+  memcpy(dst, src, n);
+}
+static inline void _mi_memzero(void* dst, size_t n) {
+  memset(dst, 0, n);
+}
+#endif
+
+
+// -------------------------------------------------------------------------------
+// The `_mi_memcpy_aligned` can be used if the pointers are machine-word aligned
+// This is used for example in `mi_realloc`.
+// -------------------------------------------------------------------------------
+
+#if (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__)
+// On GCC/CLang we provide a hint that the pointers are word aligned.
+#include <string.h>
+static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
+  mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
+  void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE);
+  const void* asrc = __builtin_assume_aligned(src, MI_INTPTR_SIZE);
+  _mi_memcpy(adst, asrc, n);
+}
+
+static inline void _mi_memzero_aligned(void* dst, size_t n) {
+  mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0);
+  void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE);
+  _mi_memzero(adst, n);
+}
+#else
+// Default fallback on `_mi_memcpy`
+static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
+  mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
+  _mi_memcpy(dst, src, n);
+}
+
+static inline void _mi_memzero_aligned(void* dst, size_t n) {
+  mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0);
+  _mi_memzero(dst, n);
+}
+#endif
+
+
+#endif
diff --git a/Include/mimalloc/mimalloc-track.h b/Include/mimalloc/mimalloc-track.h
new file mode 100644
index 00000000000000..f60d7acd0b8fcd
--- /dev/null
+++ b/Include/mimalloc/mimalloc-track.h
@@ -0,0 +1,62 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_TRACK_H
+#define MIMALLOC_TRACK_H
+
+// ------------------------------------------------------
+// Track memory ranges with macros for tools like Valgrind
+// address sanitizer, or other memory checkers.
+// ------------------------------------------------------
+
+#if MI_VALGRIND
+
+#define MI_TRACK_ENABLED 1
+#define MI_TRACK_TOOL    "valgrind"
+
+#include <valgrind/valgrind.h>
+#include <valgrind/memcheck.h>
+
+#define mi_track_malloc(p,size,zero)        VALGRIND_MALLOCLIKE_BLOCK(p,size,MI_PADDING_SIZE /*red zone*/,zero)
+#define mi_track_resize(p,oldsize,newsize)  VALGRIND_RESIZEINPLACE_BLOCK(p,oldsize,newsize,MI_PADDING_SIZE /*red zone*/)
+#define mi_track_free(p)                    VALGRIND_FREELIKE_BLOCK(p,MI_PADDING_SIZE /*red zone*/)
+#define mi_track_free_size(p,_size)         mi_track_free(p)
+#define mi_track_mem_defined(p,size)        VALGRIND_MAKE_MEM_DEFINED(p,size)
+#define mi_track_mem_undefined(p,size)      VALGRIND_MAKE_MEM_UNDEFINED(p,size)
+#define mi_track_mem_noaccess(p,size)       VALGRIND_MAKE_MEM_NOACCESS(p,size)
+
+#elif MI_ASAN
+
+#define MI_TRACK_ENABLED 1
+#define MI_TRACK_TOOL    "asan"
+
+#include <sanitizer/asan_interface.h>
+
+#define mi_track_malloc(p,size,zero)        ASAN_UNPOISON_MEMORY_REGION(p,size)
+#define mi_track_resize(p,oldsize,newsize)  ASAN_POISON_MEMORY_REGION(p,oldsize); ASAN_UNPOISON_MEMORY_REGION(p,newsize)
+#define mi_track_free(p)                    ASAN_POISON_MEMORY_REGION(p,mi_usable_size(p))
+#define mi_track_free_size(p,size)          ASAN_POISON_MEMORY_REGION(p,size)
+#define mi_track_mem_defined(p,size)        ASAN_UNPOISON_MEMORY_REGION(p,size)
+#define mi_track_mem_undefined(p,size)      ASAN_UNPOISON_MEMORY_REGION(p,size)
+#define mi_track_mem_noaccess(p,size)       ASAN_POISON_MEMORY_REGION(p,size)
+
+#else
+
+#define MI_TRACK_ENABLED 0
+#define MI_TRACK_TOOL    "none"
+
+#define mi_track_malloc(p,size,zero)
+#define mi_track_resize(p,oldsize,newsize)
+#define mi_track_free(p)
+#define mi_track_free_size(p,_size)
+#define mi_track_mem_defined(p,size)
+#define mi_track_mem_undefined(p,size)
+#define mi_track_mem_noaccess(p,size)
+
+#endif
+
+#endif
diff --git a/Include/mimalloc/mimalloc-types.h b/Include/mimalloc/mimalloc-types.h
new file mode 100644
index 00000000000000..f3af528e524b95
--- /dev/null
+++ b/Include/mimalloc/mimalloc-types.h
@@ -0,0 +1,609 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_TYPES_H
+#define MIMALLOC_TYPES_H
+
+#include <stddef.h>   // ptrdiff_t
+#include <stdint.h>   // uintptr_t, uint16_t, etc
+#include "mimalloc-atomic.h"  // _Atomic
+
+#ifdef _MSC_VER
+#pragma warning(disable:4214) // bitfield is not int
+#endif
+
+// Minimal alignment necessary. On most platforms 16 bytes are needed
+// due to SSE registers for example. This must be at least `sizeof(void*)`
+#ifndef MI_MAX_ALIGN_SIZE
+#define MI_MAX_ALIGN_SIZE  16   // sizeof(max_align_t)
+#endif
+
+// ------------------------------------------------------
+// Variants
+// ------------------------------------------------------
+
+// Define NDEBUG in the release version to disable assertions.
+// #define NDEBUG
+
+// Define MI_VALGRIND to enable valgrind support
+// #define MI_VALGRIND 1
+
+// Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance).
+// #define MI_STAT 1
+
+// Define MI_SECURE to enable security mitigations
+// #define MI_SECURE 1  // guard page around metadata
+// #define MI_SECURE 2  // guard page around each mimalloc page
+// #define MI_SECURE 3  // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free)
+// #define MI_SECURE 4  // checks for double free. (may be more expensive)
+
+#if !defined(MI_SECURE)
+#define MI_SECURE 0
+#endif
+
+// Define MI_DEBUG for debug mode
+// #define MI_DEBUG 1  // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free.
+// #define MI_DEBUG 2  // + internal assertion checks
+// #define MI_DEBUG 3  // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON)
+#if !defined(MI_DEBUG)
+#if !defined(NDEBUG) || defined(_DEBUG)
+#define MI_DEBUG 2
+#else
+#define MI_DEBUG 0
+#endif
+#endif
+
+// Reserve extra padding at the end of each block to be more resilient against heap block overflows.
+// The padding can detect byte-precise buffer overflow on free.
+#if !defined(MI_PADDING) && (MI_DEBUG>=1 || MI_VALGRIND)
+#define MI_PADDING  1
+#endif
+
+
+// Encoded free lists allow detection of corrupted free lists
+// and can detect buffer overflows, modify after free, and double `free`s.
+#if (MI_SECURE>=3 || MI_DEBUG>=1)
+#define MI_ENCODE_FREELIST  1
+#endif
+
+
+// We used to abandon huge pages but to eagerly deallocate if freed from another thread,
+// but that makes it not possible to visit them during a heap walk or include them in a
+// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks if freed from
+// another thread so most memory is available until it gets properly freed by the owning thread.
+// #define MI_HUGE_PAGE_ABANDON 1
+
+
+// ------------------------------------------------------
+// Platform specific values
+// ------------------------------------------------------
+
+// ------------------------------------------------------
+// Size of a pointer.
+// We assume that `sizeof(void*)==sizeof(intptr_t)`
+// and it holds for all platforms we know of.
+//
+// However, the C standard only requires that:
+//  p == (void*)((intptr_t)p))
+// but we also need:
+//  i == (intptr_t)((void*)i)
+// or otherwise one might define an intptr_t type that is larger than a pointer...
+// ------------------------------------------------------
+
+#if INTPTR_MAX > INT64_MAX
+# define MI_INTPTR_SHIFT (4)  // assume 128-bit  (as on arm CHERI for example)
+#elif INTPTR_MAX == INT64_MAX
+# define MI_INTPTR_SHIFT (3)
+#elif INTPTR_MAX == INT32_MAX
+# define MI_INTPTR_SHIFT (2)
+#else
+#error platform pointers must be 32, 64, or 128 bits
+#endif
+
+#if SIZE_MAX == UINT64_MAX
+# define MI_SIZE_SHIFT (3)
+typedef int64_t  mi_ssize_t;
+#elif SIZE_MAX == UINT32_MAX
+# define MI_SIZE_SHIFT (2)
+typedef int32_t  mi_ssize_t;
+#else
+#error platform objects must be 32 or 64 bits
+#endif
+
+#if (SIZE_MAX/2) > LONG_MAX
+# define MI_ZU(x)  x##ULL
+# define MI_ZI(x)  x##LL
+#else
+# define MI_ZU(x)  x##UL
+# define MI_ZI(x)  x##L
+#endif
+
+#define MI_INTPTR_SIZE  (1<<MI_INTPTR_SHIFT)
+#define MI_INTPTR_BITS  (MI_INTPTR_SIZE*8)
+
+#define MI_SIZE_SIZE  (1<<MI_SIZE_SHIFT)
+#define MI_SIZE_BITS  (MI_SIZE_SIZE*8)
+
+#define MI_KiB     (MI_ZU(1024))
+#define MI_MiB     (MI_KiB*MI_KiB)
+#define MI_GiB     (MI_MiB*MI_KiB)
+
+
+// ------------------------------------------------------
+// Main internal data-structures
+// ------------------------------------------------------
+
+// Main tuning parameters for segment and page sizes
+// Sizes for 64-bit (usually divide by two for 32-bit)
+#define MI_SEGMENT_SLICE_SHIFT            (13 + MI_INTPTR_SHIFT)         // 64KiB  (32KiB on 32-bit)
+
+#if MI_INTPTR_SIZE > 4
+#define MI_SEGMENT_SHIFT                  ( 9 + MI_SEGMENT_SLICE_SHIFT)  // 32MiB
+#else
+#define MI_SEGMENT_SHIFT                  ( 7 + MI_SEGMENT_SLICE_SHIFT)  // 4MiB on 32-bit
+#endif
+
+#define MI_SMALL_PAGE_SHIFT               (MI_SEGMENT_SLICE_SHIFT)       // 64KiB
+#define MI_MEDIUM_PAGE_SHIFT              ( 3 + MI_SMALL_PAGE_SHIFT)     // 512KiB
+
+
+// Derived constants
+#define MI_SEGMENT_SIZE                   (MI_ZU(1)<<MI_SEGMENT_SHIFT)
+#define MI_SEGMENT_ALIGN                  MI_SEGMENT_SIZE
+#define MI_SEGMENT_MASK                   (MI_SEGMENT_ALIGN - 1)
+#define MI_SEGMENT_SLICE_SIZE             (MI_ZU(1)<< MI_SEGMENT_SLICE_SHIFT)
+#define MI_SLICES_PER_SEGMENT             (MI_SEGMENT_SIZE / MI_SEGMENT_SLICE_SIZE) // 1024
+
+#define MI_SMALL_PAGE_SIZE                (MI_ZU(1)<<MI_SMALL_PAGE_SHIFT)
+#define MI_MEDIUM_PAGE_SIZE               (MI_ZU(1)<<MI_MEDIUM_PAGE_SHIFT)
+
+#define MI_SMALL_OBJ_SIZE_MAX             (MI_SMALL_PAGE_SIZE/4)   // 8KiB on 64-bit
+#define MI_MEDIUM_OBJ_SIZE_MAX            (MI_MEDIUM_PAGE_SIZE/4)  // 128KiB on 64-bit
+#define MI_MEDIUM_OBJ_WSIZE_MAX           (MI_MEDIUM_OBJ_SIZE_MAX/MI_INTPTR_SIZE)   
+#define MI_LARGE_OBJ_SIZE_MAX             (MI_SEGMENT_SIZE/2)      // 32MiB on 64-bit
+#define MI_LARGE_OBJ_WSIZE_MAX            (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
+
+// Maximum number of size classes. (spaced exponentially in 12.5% increments)
+#define MI_BIN_HUGE  (73U)
+
+#if (MI_MEDIUM_OBJ_WSIZE_MAX >= 655360)
+#error "mimalloc internal: define more bins"
+#endif
+
+// Maximum slice offset (15)
+#define MI_MAX_SLICE_OFFSET               ((MI_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1)
+
+// Used as a special value to encode block sizes in 32 bits.
+#define MI_HUGE_BLOCK_SIZE                ((uint32_t)(2*MI_GiB))
+
+// blocks up to this size are always allocated aligned
+#define MI_MAX_ALIGN_GUARANTEE            (8*MI_MAX_ALIGN_SIZE)  
+
+// Alignments over MI_ALIGNMENT_MAX are allocated in dedicated huge page segments 
+#define MI_ALIGNMENT_MAX                  (MI_SEGMENT_SIZE >> 1)  
+
+
+// ------------------------------------------------------
+// Mimalloc pages contain allocated blocks
+// ------------------------------------------------------
+
+// The free lists use encoded next fields
+// (Only actually encodes when MI_ENCODED_FREELIST is defined.)
+typedef uintptr_t  mi_encoded_t;
+
+// thread id's
+typedef size_t     mi_threadid_t;
+
+// free lists contain blocks
+typedef struct mi_block_s {
+  mi_encoded_t next;
+} mi_block_t;
+
+
+// The delayed flags are used for efficient multi-threaded free-ing
+typedef enum mi_delayed_e {
+  MI_USE_DELAYED_FREE   = 0, // push on the owning heap thread delayed list
+  MI_DELAYED_FREEING    = 1, // temporary: another thread is accessing the owning heap
+  MI_NO_DELAYED_FREE    = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list
+  MI_NEVER_DELAYED_FREE = 3  // sticky, only resets on page reclaim
+} mi_delayed_t;
+
+
+// The `in_full` and `has_aligned` page flags are put in a union to efficiently
+// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
+#if !MI_TSAN
+typedef union mi_page_flags_s {
+  uint8_t full_aligned;
+  struct {
+    uint8_t in_full : 1;
+    uint8_t has_aligned : 1;
+  } x;
+} mi_page_flags_t;
+#else
+// under thread sanitizer, use a byte for each flag to suppress warning, issue #130
+typedef union mi_page_flags_s {
+  uint16_t full_aligned;
+  struct {
+    uint8_t in_full;
+    uint8_t has_aligned;
+  } x;
+} mi_page_flags_t;
+#endif
+
+// Thread free list.
+// We use the bottom 2 bits of the pointer for mi_delayed_t flags
+typedef uintptr_t mi_thread_free_t;
+
+// A page contains blocks of one specific size (`block_size`).
+// Each page has three list of free blocks:
+// `free` for blocks that can be allocated,
+// `local_free` for freed blocks that are not yet available to `mi_malloc`
+// `thread_free` for freed blocks by other threads
+// The `local_free` and `thread_free` lists are migrated to the `free` list
+// when it is exhausted. The separate `local_free` list is necessary to
+// implement a monotonic heartbeat. The `thread_free` list is needed for
+// avoiding atomic operations in the common case.
+//
+//
+// `used - |thread_free|` == actual blocks that are in use (alive)
+// `used - |thread_free| + |free| + |local_free| == capacity`
+//
+// We don't count `freed` (as |free|) but use `used` to reduce
+// the number of memory accesses in the `mi_page_all_free` function(s).
+//
+// Notes:
+// - Access is optimized for `mi_free` and `mi_page_alloc` (in `alloc.c`)
+// - Using `uint16_t` does not seem to slow things down
+// - The size is 8 words on 64-bit which helps the page index calculations
+//   (and 10 words on 32-bit, and encoded free lists add 2 words. Sizes 10
+//    and 12 are still good for address calculation)
+// - To limit the structure size, the `xblock_size` is 32-bits only; for
+//   blocks > MI_HUGE_BLOCK_SIZE the size is determined from the segment page size
+// - `thread_free` uses the bottom bits as a delayed-free flags to optimize
+//   concurrent frees where only the first concurrent free adds to the owning
+//   heap `thread_delayed_free` list (see `alloc.c:mi_free_block_mt`).
+//   The invariant is that no-delayed-free is only set if there is
+//   at least one block that will be added, or as already been added, to
+//   the owning heap `thread_delayed_free` list. This guarantees that pages
+//   will be freed correctly even if only other threads free blocks.
+typedef struct mi_page_s {
+  // "owned" by the segment
+  uint32_t              slice_count;       // slices in this page (0 if not a page)
+  uint32_t              slice_offset;      // distance from the actual page data slice (0 if a page)
+  uint8_t               is_reset : 1;      // `true` if the page memory was reset
+  uint8_t               is_committed : 1;  // `true` if the page virtual memory is committed
+  uint8_t               is_zero_init : 1;  // `true` if the page was zero initialized
+
+  // layout like this to optimize access in `mi_malloc` and `mi_free`
+  uint16_t              capacity;          // number of blocks committed, must be the first field, see `segment.c:page_clear`
+  uint16_t              reserved;          // number of blocks reserved in memory
+  mi_page_flags_t       flags;             // `in_full` and `has_aligned` flags (8 bits)
+  uint8_t               is_zero : 1;       // `true` if the blocks in the free list are zero initialized
+  uint8_t               retire_expire : 7; // expiration count for retired blocks
+
+  mi_block_t*           free;              // list of available free blocks (`malloc` allocates from this list)
+  uint32_t              used;              // number of blocks in use (including blocks in `local_free` and `thread_free`)
+  uint32_t              xblock_size;       // size available in each block (always `>0`)
+  mi_block_t*           local_free;        // list of deferred free blocks by this thread (migrates to `free`)
+
+  #ifdef MI_ENCODE_FREELIST
+  uintptr_t             keys[2];           // two random keys to encode the free lists (see `_mi_block_next`)
+  #endif
+
+  _Atomic(mi_thread_free_t) xthread_free;  // list of deferred free blocks freed by other threads
+  _Atomic(uintptr_t)        xheap;
+
+  struct mi_page_s*     next;              // next page owned by this thread with the same `block_size`
+  struct mi_page_s*     prev;              // previous page owned by this thread with the same `block_size`
+
+  // 64-bit 9 words, 32-bit 12 words, (+2 for secure)
+  #if MI_INTPTR_SIZE==8
+  uintptr_t padding[1];
+  #endif
+} mi_page_t;
+
+
+
+typedef enum mi_page_kind_e {
+  MI_PAGE_SMALL,    // small blocks go into 64KiB pages inside a segment
+  MI_PAGE_MEDIUM,   // medium blocks go into medium pages inside a segment
+  MI_PAGE_LARGE,    // larger blocks go into a page of just one block
+  MI_PAGE_HUGE,     // huge blocks (> 16 MiB) are put into a single page in a single segment.
+} mi_page_kind_t;
+
+typedef enum mi_segment_kind_e {
+  MI_SEGMENT_NORMAL, // MI_SEGMENT_SIZE size with pages inside.
+  MI_SEGMENT_HUGE,   // > MI_LARGE_SIZE_MAX segment with just one huge page inside.
+} mi_segment_kind_t;
+
+// ------------------------------------------------------
+// A segment holds a commit mask where a bit is set if
+// the corresponding MI_COMMIT_SIZE area is committed.
+// The MI_COMMIT_SIZE must be a multiple of the slice
+// size. If it is equal we have the most fine grained 
+// decommit (but setting it higher can be more efficient).
+// The MI_MINIMAL_COMMIT_SIZE is the minimal amount that will
+// be committed in one go which can be set higher than
+// MI_COMMIT_SIZE for efficiency (while the decommit mask
+// is still tracked in fine-grained MI_COMMIT_SIZE chunks)
+// ------------------------------------------------------
+
+#define MI_MINIMAL_COMMIT_SIZE      (16*MI_SEGMENT_SLICE_SIZE)           // 1MiB
+#define MI_COMMIT_SIZE              (MI_SEGMENT_SLICE_SIZE)              // 64KiB
+#define MI_COMMIT_MASK_BITS         (MI_SEGMENT_SIZE / MI_COMMIT_SIZE)  
+#define MI_COMMIT_MASK_FIELD_BITS    MI_SIZE_BITS
+#define MI_COMMIT_MASK_FIELD_COUNT  (MI_COMMIT_MASK_BITS / MI_COMMIT_MASK_FIELD_BITS)
+
+#if (MI_COMMIT_MASK_BITS != (MI_COMMIT_MASK_FIELD_COUNT * MI_COMMIT_MASK_FIELD_BITS))
+#error "the segment size must be exactly divisible by the (commit size * size_t bits)"
+#endif
+
+typedef struct mi_commit_mask_s {
+  size_t mask[MI_COMMIT_MASK_FIELD_COUNT];
+} mi_commit_mask_t;
+
+typedef mi_page_t  mi_slice_t;
+typedef int64_t    mi_msecs_t;
+
+
+// Segments are large allocated memory blocks (8mb on 64 bit) from
+// the OS. Inside segments we allocated fixed size _pages_ that
+// contain blocks.
+typedef struct mi_segment_s {
+  size_t            memid;              // memory id for arena allocation
+  bool              mem_is_pinned;      // `true` if we cannot decommit/reset/protect in this memory (i.e. when allocated using large OS pages)    
+  bool              mem_is_large;       // in large/huge os pages?
+  bool              mem_is_committed;   // `true` if the whole segment is eagerly committed
+  size_t            mem_alignment;      // page alignment for huge pages (only used for alignment > MI_ALIGNMENT_MAX)
+  size_t            mem_align_offset;   // offset for huge page alignment (only used for alignment > MI_ALIGNMENT_MAX)
+
+  bool              allow_decommit;     
+  mi_msecs_t        decommit_expire;
+  mi_commit_mask_t  decommit_mask;
+  mi_commit_mask_t  commit_mask;
+
+  _Atomic(struct mi_segment_s*) abandoned_next;
+
+  // from here is zero initialized
+  struct mi_segment_s* next;            // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`)
+  
+  size_t            abandoned;          // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
+  size_t            abandoned_visits;   // count how often this segment is visited in the abandoned list (to force reclaim it it is too long)
+  size_t            used;               // count of pages in use
+  uintptr_t         cookie;             // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie`  
+
+  size_t            segment_slices;      // for huge segments this may be different from `MI_SLICES_PER_SEGMENT`
+  size_t            segment_info_slices; // initial slices we are using segment info and possible guard pages.
+
+  // layout like this to optimize access in `mi_free`
+  mi_segment_kind_t kind;
+  size_t            slice_entries;       // entries in the `slices` array, at most `MI_SLICES_PER_SEGMENT`
+  _Atomic(mi_threadid_t) thread_id;      // unique id of the thread owning this segment
+
+  mi_slice_t        slices[MI_SLICES_PER_SEGMENT+1];  // one more for huge blocks with large alignment
+} mi_segment_t;
+
+
+// ------------------------------------------------------
+// Heaps
+// Provide first-class heaps to allocate from.
+// A heap just owns a set of pages for allocation and
+// can only be allocate/reallocate from the thread that created it.
+// Freeing blocks can be done from any thread though.
+// Per thread, the segments are shared among its heaps.
+// Per thread, there is always a default heap that is
+// used for allocation; it is initialized to statically
+// point to an empty heap to avoid initialization checks
+// in the fast path.
+// ------------------------------------------------------
+
+// Thread local data
+typedef struct mi_tld_s mi_tld_t;
+
+// Pages of a certain block size are held in a queue.
+typedef struct mi_page_queue_s {
+  mi_page_t* first;
+  mi_page_t* last;
+  size_t     block_size;
+} mi_page_queue_t;
+
+#define MI_BIN_FULL  (MI_BIN_HUGE+1)
+
+// Random context
+typedef struct mi_random_cxt_s {
+  uint32_t input[16];
+  uint32_t output[16];
+  int      output_available;
+  bool     weak;
+} mi_random_ctx_t;
+
+
+// In debug mode there is a padding structure at the end of the blocks to check for buffer overflows
+#if (MI_PADDING)
+typedef struct mi_padding_s {
+  uint32_t canary; // encoded block value to check validity of the padding (in case of overflow)
+  uint32_t delta;  // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes)
+} mi_padding_t;
+#define MI_PADDING_SIZE   (sizeof(mi_padding_t))
+#define MI_PADDING_WSIZE  ((MI_PADDING_SIZE + MI_INTPTR_SIZE - 1) / MI_INTPTR_SIZE)
+#else
+#define MI_PADDING_SIZE   0
+#define MI_PADDING_WSIZE  0
+#endif
+
+#define MI_PAGES_DIRECT   (MI_SMALL_WSIZE_MAX + MI_PADDING_WSIZE + 1)
+
+
+// A heap owns a set of pages.
+struct mi_heap_s {
+  mi_tld_t*             tld;
+  mi_page_t*            pages_free_direct[MI_PAGES_DIRECT];  // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
+  mi_page_queue_t       pages[MI_BIN_FULL + 1];              // queue of pages for each size class (or "bin")
+  _Atomic(mi_block_t*)  thread_delayed_free;
+  mi_threadid_t         thread_id;                           // thread this heap belongs too
+  mi_arena_id_t         arena_id;                            // arena id if the heap belongs to a specific arena (or 0)  
+  uintptr_t             cookie;                              // random cookie to verify pointers (see `_mi_ptr_cookie`)
+  uintptr_t             keys[2];                             // two random keys used to encode the `thread_delayed_free` list
+  mi_random_ctx_t       random;                              // random number context used for secure allocation
+  size_t                page_count;                          // total number of pages in the `pages` queues.
+  size_t                page_retired_min;                    // smallest retired index (retired pages are fully free, but still in the page queues)
+  size_t                page_retired_max;                    // largest retired index into the `pages` array.
+  mi_heap_t*            next;                                // list of heaps per thread
+  bool                  no_reclaim;                          // `true` if this heap should not reclaim abandoned pages
+};
+
+
+
+// ------------------------------------------------------
+// Debug
+// ------------------------------------------------------
+
+#if !defined(MI_DEBUG_UNINIT)
+#define MI_DEBUG_UNINIT     (0xD0)
+#endif
+#if !defined(MI_DEBUG_FREED)
+#define MI_DEBUG_FREED      (0xDF)
+#endif
+#if !defined(MI_DEBUG_PADDING)
+#define MI_DEBUG_PADDING    (0xDE)
+#endif
+
+#if (MI_DEBUG)
+// use our own assertion to print without memory allocation
+void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func );
+#define mi_assert(expr)     ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__))
+#else
+#define mi_assert(x)
+#endif
+
+#if (MI_DEBUG>1)
+#define mi_assert_internal    mi_assert
+#else
+#define mi_assert_internal(x)
+#endif
+
+#if (MI_DEBUG>2)
+#define mi_assert_expensive   mi_assert
+#else
+#define mi_assert_expensive(x)
+#endif
+
+// ------------------------------------------------------
+// Statistics
+// ------------------------------------------------------
+
+#ifndef MI_STAT
+#if (MI_DEBUG>0)
+#define MI_STAT 2
+#else
+#define MI_STAT 0
+#endif
+#endif
+
+typedef struct mi_stat_count_s {
+  int64_t allocated;
+  int64_t freed;
+  int64_t peak;
+  int64_t current;
+} mi_stat_count_t;
+
+typedef struct mi_stat_counter_s {
+  int64_t total;
+  int64_t count;
+} mi_stat_counter_t;
+
+typedef struct mi_stats_s {
+  mi_stat_count_t segments;
+  mi_stat_count_t pages;
+  mi_stat_count_t reserved;
+  mi_stat_count_t committed;
+  mi_stat_count_t reset;
+  mi_stat_count_t page_committed;
+  mi_stat_count_t segments_abandoned;
+  mi_stat_count_t pages_abandoned;
+  mi_stat_count_t threads;
+  mi_stat_count_t normal;
+  mi_stat_count_t huge;
+  mi_stat_count_t large;
+  mi_stat_count_t malloc;
+  mi_stat_count_t segments_cache;
+  mi_stat_counter_t pages_extended;
+  mi_stat_counter_t mmap_calls;
+  mi_stat_counter_t commit_calls;
+  mi_stat_counter_t page_no_retire;
+  mi_stat_counter_t searches;
+  mi_stat_counter_t normal_count;
+  mi_stat_counter_t huge_count;
+  mi_stat_counter_t large_count;
+#if MI_STAT>1
+  mi_stat_count_t normal_bins[MI_BIN_HUGE+1];
+#endif
+} mi_stats_t;
+
+
+void _mi_stat_increase(mi_stat_count_t* stat, size_t amount);
+void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
+void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
+
+#if (MI_STAT)
+#define mi_stat_increase(stat,amount)         _mi_stat_increase( &(stat), amount)
+#define mi_stat_decrease(stat,amount)         _mi_stat_decrease( &(stat), amount)
+#define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount)
+#else
+#define mi_stat_increase(stat,amount)         (void)0
+#define mi_stat_decrease(stat,amount)         (void)0
+#define mi_stat_counter_increase(stat,amount) (void)0
+#endif
+
+#define mi_heap_stat_counter_increase(heap,stat,amount)  mi_stat_counter_increase( (heap)->tld->stats.stat, amount)
+#define mi_heap_stat_increase(heap,stat,amount)  mi_stat_increase( (heap)->tld->stats.stat, amount)
+#define mi_heap_stat_decrease(heap,stat,amount)  mi_stat_decrease( (heap)->tld->stats.stat, amount)
+
+// ------------------------------------------------------
+// Thread Local data
+// ------------------------------------------------------
+
+// A "span" is is an available range of slices. The span queues keep
+// track of slice spans of at most the given `slice_count` (but more than the previous size class).
+typedef struct mi_span_queue_s {
+  mi_slice_t* first;
+  mi_slice_t* last;
+  size_t      slice_count;
+} mi_span_queue_t;
+
+#define MI_SEGMENT_BIN_MAX (35)     // 35 == mi_segment_bin(MI_SLICES_PER_SEGMENT)
+
+// OS thread local data
+typedef struct mi_os_tld_s {
+  size_t                region_idx;   // start point for next allocation
+  mi_stats_t*           stats;        // points to tld stats
+} mi_os_tld_t;
+
+
+// Segments thread local data
+typedef struct mi_segments_tld_s {
+  mi_span_queue_t     spans[MI_SEGMENT_BIN_MAX+1];  // free slice spans inside segments
+  size_t              count;        // current number of segments;
+  size_t              peak_count;   // peak number of segments
+  size_t              current_size; // current size of all segments
+  size_t              peak_size;    // peak size of all segments
+  mi_stats_t*         stats;        // points to tld stats
+  mi_os_tld_t*        os;           // points to os stats
+} mi_segments_tld_t;
+
+// Thread local data
+struct mi_tld_s {
+  unsigned long long  heartbeat;     // monotonic heartbeat count
+  bool                recurse;       // true if deferred was called; used to prevent infinite recursion.
+  mi_heap_t*          heap_backing;  // backing heap of this thread (cannot be deleted)
+  mi_heap_t*          heaps;         // list of heaps in this thread (so we can abandon all when the thread terminates)
+  mi_segments_tld_t   segments;      // segment tld
+  mi_os_tld_t         os;            // os tld
+  mi_stats_t          stats;         // statistics
+};
+
+#endif
diff --git a/Include/mimalloc/mimalloc.h b/Include/mimalloc/mimalloc.h
new file mode 100644
index 00000000000000..9b72fbfda7411a
--- /dev/null
+++ b/Include/mimalloc/mimalloc.h
@@ -0,0 +1,557 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_H
+#define MIMALLOC_H
+
+#define MI_MALLOC_VERSION 209   // major + 2 digits minor
+
+// ------------------------------------------------------
+// Compiler specific attributes
+// ------------------------------------------------------
+
+#ifdef __cplusplus
+  #if (__cplusplus >= 201103L) || (_MSC_VER > 1900)  // C++11
+    #define mi_attr_noexcept   noexcept
+  #else
+    #define mi_attr_noexcept   throw()
+  #endif
+#else
+  #define mi_attr_noexcept
+#endif
+
+#if defined(__cplusplus) && (__cplusplus >= 201703)
+  #define mi_decl_nodiscard    [[nodiscard]]
+#elif (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__)  // includes clang, icc, and clang-cl
+  #define mi_decl_nodiscard    __attribute__((warn_unused_result))
+#elif defined(_HAS_NODISCARD)
+  #define mi_decl_nodiscard    _NODISCARD
+#elif (_MSC_VER >= 1700)
+  #define mi_decl_nodiscard    _Check_return_
+#else
+  #define mi_decl_nodiscard
+#endif
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+  #if !defined(MI_SHARED_LIB)
+    #define mi_decl_export
+  #elif defined(MI_SHARED_LIB_EXPORT)
+    #define mi_decl_export              __declspec(dllexport)
+  #else
+    #define mi_decl_export              __declspec(dllimport)
+  #endif
+  #if defined(__MINGW32__)
+    #define mi_decl_restrict
+    #define mi_attr_malloc              __attribute__((malloc))
+  #else
+    #if (_MSC_VER >= 1900) && !defined(__EDG__)
+      #define mi_decl_restrict          __declspec(allocator) __declspec(restrict)
+    #else
+      #define mi_decl_restrict          __declspec(restrict)
+    #endif
+    #define mi_attr_malloc
+  #endif
+  #define mi_cdecl                      __cdecl
+  #define mi_attr_alloc_size(s)
+  #define mi_attr_alloc_size2(s1,s2)
+  #define mi_attr_alloc_align(p)
+#elif defined(__GNUC__)                 // includes clang and icc
+  #if defined(MI_SHARED_LIB) && defined(MI_SHARED_LIB_EXPORT)
+    #define mi_decl_export              __attribute__((visibility("default")))
+  #else
+    #define mi_decl_export
+  #endif
+  #define mi_cdecl                      // leads to warnings... __attribute__((cdecl))
+  #define mi_decl_restrict
+  #define mi_attr_malloc                __attribute__((malloc))
+  #if (defined(__clang_major__) && (__clang_major__ < 4)) || (__GNUC__ < 5)
+    #define mi_attr_alloc_size(s)
+    #define mi_attr_alloc_size2(s1,s2)
+    #define mi_attr_alloc_align(p)
+  #elif defined(__INTEL_COMPILER)
+    #define mi_attr_alloc_size(s)       __attribute__((alloc_size(s)))
+    #define mi_attr_alloc_size2(s1,s2)  __attribute__((alloc_size(s1,s2)))
+    #define mi_attr_alloc_align(p)
+  #else
+    #define mi_attr_alloc_size(s)       __attribute__((alloc_size(s)))
+    #define mi_attr_alloc_size2(s1,s2)  __attribute__((alloc_size(s1,s2)))
+    #define mi_attr_alloc_align(p)      __attribute__((alloc_align(p)))
+  #endif
+#else
+  #define mi_cdecl
+  #define mi_decl_export
+  #define mi_decl_restrict
+  #define mi_attr_malloc
+  #define mi_attr_alloc_size(s)
+  #define mi_attr_alloc_size2(s1,s2)
+  #define mi_attr_alloc_align(p)
+#endif
+
+// ------------------------------------------------------
+// Includes
+// ------------------------------------------------------
+
+#include <stddef.h>     // size_t
+#include <stdbool.h>    // bool
+#include <stdint.h>     // INTPTR_MAX
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ------------------------------------------------------
+// Standard malloc interface
+// ------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc(size_t size)  mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc(size_t count, size_t size)  mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2);
+mi_decl_nodiscard mi_decl_export void* mi_realloc(void* p, size_t newsize)      mi_attr_noexcept mi_attr_alloc_size(2);
+mi_decl_export void* mi_expand(void* p, size_t newsize)                         mi_attr_noexcept mi_attr_alloc_size(2);
+
+mi_decl_export void mi_free(void* p) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept mi_attr_malloc;
+
+// ------------------------------------------------------
+// Extended functionality
+// ------------------------------------------------------
+#define MI_SMALL_WSIZE_MAX  (128)
+#define MI_SMALL_SIZE_MAX   (MI_SMALL_WSIZE_MAX*sizeof(void*))
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc(size_t size)       mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2);
+mi_decl_nodiscard mi_decl_export void* mi_reallocn(void* p, size_t count, size_t size)        mi_attr_noexcept mi_attr_alloc_size2(2,3);
+mi_decl_nodiscard mi_decl_export void* mi_reallocf(void* p, size_t newsize)                   mi_attr_noexcept mi_attr_alloc_size(2);
+
+mi_decl_nodiscard mi_decl_export size_t mi_usable_size(const void* p) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size)     mi_attr_noexcept;
+
+
+// ------------------------------------------------------
+// Internals
+// ------------------------------------------------------
+
+typedef void (mi_cdecl mi_deferred_free_fun)(bool force, unsigned long long heartbeat, void* arg);
+mi_decl_export void mi_register_deferred_free(mi_deferred_free_fun* deferred_free, void* arg) mi_attr_noexcept;
+
+typedef void (mi_cdecl mi_output_fun)(const char* msg, void* arg);
+mi_decl_export void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept;
+
+typedef void (mi_cdecl mi_error_fun)(int err, void* arg);
+mi_decl_export void mi_register_error(mi_error_fun* fun, void* arg);
+
+mi_decl_export void mi_collect(bool force)    mi_attr_noexcept;
+mi_decl_export int  mi_version(void)          mi_attr_noexcept;
+mi_decl_export void mi_stats_reset(void)      mi_attr_noexcept;
+mi_decl_export void mi_stats_merge(void)      mi_attr_noexcept;
+mi_decl_export void mi_stats_print(void* out) mi_attr_noexcept;  // backward compatibility: `out` is ignored and should be NULL
+mi_decl_export void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
+
+mi_decl_export void mi_process_init(void)     mi_attr_noexcept;
+mi_decl_export void mi_thread_init(void)      mi_attr_noexcept;
+mi_decl_export void mi_thread_done(void)      mi_attr_noexcept;
+mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
+
+mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs,
+                                    size_t* current_rss, size_t* peak_rss,
+                                    size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept;
+
+// -------------------------------------------------------------------------------------
+// Aligned allocation
+// Note that `alignment` always follows `size` for consistency with unaligned
+// allocation, but unfortunately this differs from `posix_memalign` and `aligned_alloc`.
+// -------------------------------------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2);
+mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2);
+
+
+// -------------------------------------------------------------------------------------
+// Heaps: first-class, but can only allocate from the same thread that created it.
+// -------------------------------------------------------------------------------------
+
+struct mi_heap_s;
+typedef struct mi_heap_s mi_heap_t;
+
+mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new(void);
+mi_decl_export void       mi_heap_delete(mi_heap_t* heap);
+mi_decl_export void       mi_heap_destroy(mi_heap_t* heap);
+mi_decl_export mi_heap_t* mi_heap_set_default(mi_heap_t* heap);
+mi_decl_export mi_heap_t* mi_heap_get_default(void);
+mi_decl_export mi_heap_t* mi_heap_get_backing(void);
+mi_decl_export void       mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept;
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+
+mi_decl_nodiscard mi_decl_export void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize)              mi_attr_noexcept mi_attr_alloc_size(3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size)  mi_attr_noexcept mi_attr_alloc_size2(3,4);
+mi_decl_nodiscard mi_decl_export void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize)             mi_attr_noexcept mi_attr_alloc_size(3);
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s)            mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept mi_attr_malloc;
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3) mi_attr_alloc_align(4);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(3) mi_attr_alloc_align(4);
+mi_decl_nodiscard mi_decl_export void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(3);
+
+
+// --------------------------------------------------------------------------------
+// Zero initialized re-allocation.
+// Only valid on memory that was originally allocated with zero initialization too.
+// e.g. `mi_calloc`, `mi_zalloc`, `mi_zalloc_aligned` etc.
+// see <https://github.com/microsoft/mimalloc/issues/63#issuecomment-508272992>
+// --------------------------------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_export void* mi_rezalloc(void* p, size_t newsize)                mi_attr_noexcept mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export void* mi_recalloc(void* p, size_t newcount, size_t size)  mi_attr_noexcept mi_attr_alloc_size2(2,3);
+
+mi_decl_nodiscard mi_decl_export void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(2,3) mi_attr_alloc_align(4);
+mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(2,3);
+
+mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize)                mi_attr_noexcept mi_attr_alloc_size(3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t newcount, size_t size)  mi_attr_noexcept mi_attr_alloc_size2(3,4);
+
+mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(3) mi_attr_alloc_align(4);
+mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(3,4) mi_attr_alloc_align(5);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(3,4);
+
+
+// ------------------------------------------------------
+// Analysis
+// ------------------------------------------------------
+
+mi_decl_export bool mi_heap_contains_block(mi_heap_t* heap, const void* p);
+mi_decl_export bool mi_heap_check_owned(mi_heap_t* heap, const void* p);
+mi_decl_export bool mi_check_owned(const void* p);
+
+// An area of heap space contains blocks of a single size.
+typedef struct mi_heap_area_s {
+  void*  blocks;      // start of the area containing heap blocks
+  size_t reserved;    // bytes reserved for this area (virtual)
+  size_t committed;   // current available bytes for this area
+  size_t used;        // number of allocated blocks
+  size_t block_size;  // size in bytes of each block
+  size_t full_block_size; // size in bytes of a full block including padding and metadata.
+} mi_heap_area_t;
+
+typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg);
+
+mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block_visit_fun* visitor, void* arg);
+
+// Experimental
+mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export bool mi_is_redirected(void) mi_attr_noexcept;
+
+mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept;
+mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept;
+
+mi_decl_export int  mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept;
+mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept;
+
+mi_decl_export void mi_debug_show_arenas(void) mi_attr_noexcept;
+
+// Experimental: heaps associated with specific memory arena's
+typedef int mi_arena_id_t;
+mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size);
+mi_decl_export int   mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
+mi_decl_export int   mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
+mi_decl_export bool  mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
+
+#if MI_MALLOC_VERSION >= 200
+// Create a heap that only allocates in the specified arena
+mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id);
+#endif
+
+// deprecated
+mi_decl_export int  mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept;
+
+
+// ------------------------------------------------------
+// Convenience
+// ------------------------------------------------------
+
+#define mi_malloc_tp(tp)                ((tp*)mi_malloc(sizeof(tp)))
+#define mi_zalloc_tp(tp)                ((tp*)mi_zalloc(sizeof(tp)))
+#define mi_calloc_tp(tp,n)              ((tp*)mi_calloc(n,sizeof(tp)))
+#define mi_mallocn_tp(tp,n)             ((tp*)mi_mallocn(n,sizeof(tp)))
+#define mi_reallocn_tp(p,tp,n)          ((tp*)mi_reallocn(p,n,sizeof(tp)))
+#define mi_recalloc_tp(p,tp,n)          ((tp*)mi_recalloc(p,n,sizeof(tp)))
+
+#define mi_heap_malloc_tp(hp,tp)        ((tp*)mi_heap_malloc(hp,sizeof(tp)))
+#define mi_heap_zalloc_tp(hp,tp)        ((tp*)mi_heap_zalloc(hp,sizeof(tp)))
+#define mi_heap_calloc_tp(hp,tp,n)      ((tp*)mi_heap_calloc(hp,n,sizeof(tp)))
+#define mi_heap_mallocn_tp(hp,tp,n)     ((tp*)mi_heap_mallocn(hp,n,sizeof(tp)))
+#define mi_heap_reallocn_tp(hp,p,tp,n)  ((tp*)mi_heap_reallocn(hp,p,n,sizeof(tp)))
+#define mi_heap_recalloc_tp(hp,p,tp,n)  ((tp*)mi_heap_recalloc(hp,p,n,sizeof(tp)))
+
+
+// ------------------------------------------------------
+// Options
+// ------------------------------------------------------
+
+typedef enum mi_option_e {
+  // stable options
+  mi_option_show_errors,
+  mi_option_show_stats,
+  mi_option_verbose,
+  // some of the following options are experimental
+  // (deprecated options are kept for binary backward compatibility with v1.x versions)
+  mi_option_eager_commit,
+  mi_option_deprecated_eager_region_commit,
+  mi_option_deprecated_reset_decommits,
+  mi_option_large_os_pages,           // use large (2MiB) OS pages, implies eager commit
+  mi_option_reserve_huge_os_pages,    // reserve N huge OS pages (1GiB) at startup
+  mi_option_reserve_huge_os_pages_at, // reserve huge OS pages at a specific NUMA node
+  mi_option_reserve_os_memory,        // reserve specified amount of OS memory at startup
+  mi_option_deprecated_segment_cache,
+  mi_option_page_reset,
+  mi_option_abandoned_page_decommit,
+  mi_option_deprecated_segment_reset,
+  mi_option_eager_commit_delay,
+  mi_option_decommit_delay,
+  mi_option_use_numa_nodes,           // 0 = use available numa nodes, otherwise use at most N nodes.
+  mi_option_limit_os_alloc,           // 1 = do not use OS memory for allocation (but only reserved arenas)
+  mi_option_os_tag,
+  mi_option_max_errors,
+  mi_option_max_warnings,
+  mi_option_max_segment_reclaim,
+  mi_option_allow_decommit,
+  mi_option_segment_decommit_delay,  
+  mi_option_decommit_extend_delay,
+  mi_option_destroy_on_exit,          
+  _mi_option_last
+} mi_option_t;
+
+
+mi_decl_nodiscard mi_decl_export bool mi_option_is_enabled(mi_option_t option);
+mi_decl_export void mi_option_enable(mi_option_t option);
+mi_decl_export void mi_option_disable(mi_option_t option);
+mi_decl_export void mi_option_set_enabled(mi_option_t option, bool enable);
+mi_decl_export void mi_option_set_enabled_default(mi_option_t option, bool enable);
+
+mi_decl_nodiscard mi_decl_export long mi_option_get(mi_option_t option);
+mi_decl_nodiscard mi_decl_export long mi_option_get_clamp(mi_option_t option, long min, long max);
+mi_decl_export void mi_option_set(mi_option_t option, long value);
+mi_decl_export void mi_option_set_default(mi_option_t option, long value);
+
+
+// -------------------------------------------------------------------------------------------------------
+// "mi" prefixed implementations of various posix, Unix, Windows, and C++ allocation functions.
+// (This can be convenient when providing overrides of these functions as done in `mimalloc-override.h`.)
+// note: we use `mi_cfree` as "checked free" and it checks if the pointer is in our heap before free-ing.
+// -------------------------------------------------------------------------------------------------------
+
+mi_decl_export void  mi_cfree(void* p) mi_attr_noexcept;
+mi_decl_export void* mi__expand(void* p, size_t newsize) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_malloc_size(const void* p)        mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_malloc_good_size(size_t size)     mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept;
+
+mi_decl_export int mi_posix_memalign(void** p, size_t alignment, size_t size)   mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_valloc(size_t size)  mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1);
+
+mi_decl_nodiscard mi_decl_export void* mi_reallocarray(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3);
+mi_decl_nodiscard mi_decl_export int   mi_reallocarr(void* p, size_t count, size_t size) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept;
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict unsigned char*  mi_mbsdup(const unsigned char* s)  mi_attr_noexcept mi_attr_malloc;
+mi_decl_export int mi_dupenv_s(char** buf, size_t* size, const char* name)                      mi_attr_noexcept;
+mi_decl_export int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept;
+
+mi_decl_export void mi_free_size(void* p, size_t size)                           mi_attr_noexcept;
+mi_decl_export void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept;
+mi_decl_export void mi_free_aligned(void* p, size_t alignment)                   mi_attr_noexcept;
+
+// The `mi_new` wrappers implement C++ semantics on out-of-memory instead of directly returning `NULL`.
+// (and call `std::get_new_handler` and potentially raise a `std::bad_alloc` exception).
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new(size_t size)                   mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_nothrow(size_t size)           mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_n(size_t count, size_t size)   mi_attr_malloc mi_attr_alloc_size2(1, 2);
+mi_decl_nodiscard mi_decl_export void* mi_new_realloc(void* p, size_t newsize)                mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, size_t size) mi_attr_alloc_size2(2, 3);
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size_t size)                mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(2, 3);
+
+#ifdef __cplusplus
+}
+#endif
+
+// ---------------------------------------------------------------------------------------------
+// Implement the C++ std::allocator interface for use in STL containers.
+// (note: see `mimalloc-new-delete.h` for overriding the new/delete operators globally)
+// ---------------------------------------------------------------------------------------------
+#ifdef __cplusplus
+
+#include <cstddef>     // std::size_t
+#include <cstdint>     // PTRDIFF_MAX
+#if (__cplusplus >= 201103L) || (_MSC_VER > 1900)  // C++11
+#include <type_traits> // std::true_type
+#include <utility>     // std::forward
+#endif
+
+template<class T> struct _mi_stl_allocator_common {
+  typedef T                 value_type;
+  typedef std::size_t       size_type;
+  typedef std::ptrdiff_t    difference_type;
+  typedef value_type&       reference;
+  typedef value_type const& const_reference;
+  typedef value_type*       pointer;
+  typedef value_type const* const_pointer;
+
+  #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900))  // C++11
+  using propagate_on_container_copy_assignment = std::true_type;
+  using propagate_on_container_move_assignment = std::true_type;
+  using propagate_on_container_swap            = std::true_type;
+  template <class U, class ...Args> void construct(U* p, Args&& ...args) { ::new(p) U(std::forward<Args>(args)...); }
+  template <class U> void destroy(U* p) mi_attr_noexcept { p->~U(); }
+  #else
+  void construct(pointer p, value_type const& val) { ::new(p) value_type(val); }
+  void destroy(pointer p) { p->~value_type(); }
+  #endif
+
+  size_type     max_size() const mi_attr_noexcept { return (PTRDIFF_MAX/sizeof(value_type)); }
+  pointer       address(reference x) const        { return &x; }
+  const_pointer address(const_reference x) const  { return &x; }
+};
+
+template<class T> struct mi_stl_allocator : public _mi_stl_allocator_common<T> {
+  using typename _mi_stl_allocator_common<T>::size_type;
+  using typename _mi_stl_allocator_common<T>::value_type;
+  using typename _mi_stl_allocator_common<T>::pointer;
+  template <class U> struct rebind { typedef mi_stl_allocator<U> other; };
+
+  mi_stl_allocator()                                             mi_attr_noexcept = default;
+  mi_stl_allocator(const mi_stl_allocator&)                      mi_attr_noexcept = default;
+  template<class U> mi_stl_allocator(const mi_stl_allocator<U>&) mi_attr_noexcept { }
+  mi_stl_allocator  select_on_container_copy_construction() const { return *this; }
+  void              deallocate(T* p, size_type) { mi_free(p); }
+
+  #if (__cplusplus >= 201703L)  // C++17
+  mi_decl_nodiscard T* allocate(size_type count) { return static_cast<T*>(mi_new_n(count, sizeof(T))); }
+  mi_decl_nodiscard T* allocate(size_type count, const void*) { return allocate(count); }
+  #else
+  mi_decl_nodiscard pointer allocate(size_type count, const void* = 0) { return static_cast<pointer>(mi_new_n(count, sizeof(value_type))); }
+  #endif
+
+  #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900))  // C++11
+  using is_always_equal = std::true_type;
+  #endif
+};
+
+template<class T1,class T2> bool operator==(const mi_stl_allocator<T1>& , const mi_stl_allocator<T2>& ) mi_attr_noexcept { return true; }
+template<class T1,class T2> bool operator!=(const mi_stl_allocator<T1>& , const mi_stl_allocator<T2>& ) mi_attr_noexcept { return false; }
+
+
+#if (__cplusplus >= 201103L) || (_MSC_VER > 1900)  // C++11
+#include <memory>      // std::shared_ptr
+
+// Common base class for STL allocators in a specific heap
+template<class T, bool destroy> struct _mi_heap_stl_allocator_common : public _mi_stl_allocator_common<T> {
+  using typename _mi_stl_allocator_common<T>::size_type;
+  using typename _mi_stl_allocator_common<T>::value_type;
+  using typename _mi_stl_allocator_common<T>::pointer;
+
+  _mi_heap_stl_allocator_common(mi_heap_t* hp) : heap(hp) { }    /* will not delete nor destroy the passed in heap */
+
+  #if (__cplusplus >= 201703L)  // C++17
+  mi_decl_nodiscard T* allocate(size_type count) { return static_cast<T*>(mi_heap_alloc_new_n(this->heap.get(), count, sizeof(T))); }
+  mi_decl_nodiscard T* allocate(size_type count, const void*) { return allocate(count); }
+  #else
+  mi_decl_nodiscard pointer allocate(size_type count, const void* = 0) { return static_cast<pointer>(mi_heap_alloc_new_n(this->heap.get(), count, sizeof(value_type))); }
+  #endif
+
+  #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900))  // C++11
+  using is_always_equal = std::false_type;
+  #endif
+
+  void collect(bool force) { mi_heap_collect(this->heap.get(), force); }
+  template<class U> bool is_equal(const _mi_heap_stl_allocator_common<U, destroy>& x) const { return (this->heap == x.heap); }
+
+protected:
+  std::shared_ptr<mi_heap_t> heap;
+  template<class U, bool D> friend struct _mi_heap_stl_allocator_common;
+  
+  _mi_heap_stl_allocator_common() {
+    mi_heap_t* hp = mi_heap_new();
+    this->heap.reset(hp, (destroy ? &heap_destroy : &heap_delete));  /* calls heap_delete/destroy when the refcount drops to zero */
+  }
+  _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common& x) mi_attr_noexcept : heap(x.heap) { }
+  template<class U> _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common<U, destroy>& x) mi_attr_noexcept : heap(x.heap) { }
+
+private:
+  static void heap_delete(mi_heap_t* hp)  { if (hp != NULL) { mi_heap_delete(hp); } }
+  static void heap_destroy(mi_heap_t* hp) { if (hp != NULL) { mi_heap_destroy(hp); } }
+};
+
+// STL allocator allocation in a specific heap
+template<class T> struct mi_heap_stl_allocator : public _mi_heap_stl_allocator_common<T, false> {
+  using typename _mi_heap_stl_allocator_common<T, false>::size_type;
+  mi_heap_stl_allocator() : _mi_heap_stl_allocator_common<T, false>() { } // creates fresh heap that is deleted when the destructor is called
+  mi_heap_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common<T, false>(hp) { }  // no delete nor destroy on the passed in heap 
+  template<class U> mi_heap_stl_allocator(const mi_heap_stl_allocator<U>& x) mi_attr_noexcept : _mi_heap_stl_allocator_common<T, false>(x) { }
+
+  mi_heap_stl_allocator select_on_container_copy_construction() const { return *this; }
+  void deallocate(T* p, size_type) { mi_free(p); }
+  template<class U> struct rebind { typedef mi_heap_stl_allocator<U> other; };
+};
+
+template<class T1, class T2> bool operator==(const mi_heap_stl_allocator<T1>& x, const mi_heap_stl_allocator<T2>& y) mi_attr_noexcept { return (x.is_equal(y)); }
+template<class T1, class T2> bool operator!=(const mi_heap_stl_allocator<T1>& x, const mi_heap_stl_allocator<T2>& y) mi_attr_noexcept { return (!x.is_equal(y)); }
+
+
+// STL allocator allocation in a specific heap, where `free` does nothing and
+// the heap is destroyed in one go on destruction -- use with care!
+template<class T> struct mi_heap_destroy_stl_allocator : public _mi_heap_stl_allocator_common<T, true> {
+  using typename _mi_heap_stl_allocator_common<T, true>::size_type;
+  mi_heap_destroy_stl_allocator() : _mi_heap_stl_allocator_common<T, true>() { } // creates fresh heap that is destroyed when the destructor is called
+  mi_heap_destroy_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common<T, true>(hp) { }  // no delete nor destroy on the passed in heap 
+  template<class U> mi_heap_destroy_stl_allocator(const mi_heap_destroy_stl_allocator<U>& x) mi_attr_noexcept : _mi_heap_stl_allocator_common<T, true>(x) { }
+
+  mi_heap_destroy_stl_allocator select_on_container_copy_construction() const { return *this; }
+  void deallocate(T*, size_type) { /* do nothing as we destroy the heap on destruct. */ }
+  template<class U> struct rebind { typedef mi_heap_destroy_stl_allocator<U> other; };
+};
+
+template<class T1, class T2> bool operator==(const mi_heap_destroy_stl_allocator<T1>& x, const mi_heap_destroy_stl_allocator<T2>& y) mi_attr_noexcept { return (x.is_equal(y)); }
+template<class T1, class T2> bool operator!=(const mi_heap_destroy_stl_allocator<T1>& x, const mi_heap_destroy_stl_allocator<T2>& y) mi_attr_noexcept { return (!x.is_equal(y)); }
+
+#endif // C++11
+
+#endif // __cplusplus
+
+#endif
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 9eb89dea50eac5..2186c51e623944 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -100,7 +100,7 @@ CONFIGURE_LDFLAGS=	@LDFLAGS@
 # command line to append to these values without stomping the pre-set
 # values.
 PY_CFLAGS=	$(BASECFLAGS) $(OPT) $(CONFIGURE_CFLAGS) $(CFLAGS) $(EXTRA_CFLAGS)
-PY_CFLAGS_NODIST=$(CONFIGURE_CFLAGS_NODIST) $(CFLAGS_NODIST) -I$(srcdir)/Include/internal
+PY_CFLAGS_NODIST=$(CONFIGURE_CFLAGS_NODIST) $(CFLAGS_NODIST) -I$(srcdir)/Include/internal -I$(srcdir)/Include/mimalloc
 # Both CPPFLAGS and LDFLAGS need to contain the shell's value for setup.py to
 # be able to build extension modules using the directories specified in the
 # environment variables
@@ -336,6 +336,34 @@ IO_OBJS=	\
 		Modules/_io/bytesio.o \
 		Modules/_io/stringio.o
 
+
+##########################################################################
+# mimalloc
+
+MIMALLOC_HEADERS= \
+		$(srcdir)/Include/internal/pycore_mimalloc.h \
+		$(srcdir)/Include/internal/mimalloc/mimalloc-atomic.h \
+		$(srcdir)/Include/internal/mimalloc/mimalloc.h \
+		$(srcdir)/Include/internal/mimalloc/mimalloc-internal.h \
+		$(srcdir)/Include/internal/mimalloc/mimalloc-types.h
+
+MIMALLOC_OBJS= \
+		Objects/mimalloc/alloc-aligned.o \
+		Objects/mimalloc/alloc-posix.o \
+		Objects/mimalloc/alloc.o \
+		Objects/mimalloc/arena.o \
+		Objects/mimalloc/bitmap.o \
+		Objects/mimalloc/heap.o \
+		Objects/mimalloc/init.o \
+		Objects/mimalloc/options.o \
+		Objects/mimalloc/os.o \
+		Objects/mimalloc/page.o \
+		Objects/mimalloc/random.o \
+		Objects/mimalloc/segment.o \
+		Objects/mimalloc/segment-cache.o \
+		Objects/mimalloc/stats.o
+
+
 ##########################################################################
 # Parser
 
@@ -506,6 +534,7 @@ OBJECT_OBJS=	\
 		Objects/unicodectype.o \
 		Objects/unionobject.o \
 		Objects/weakrefobject.o \
+		@MIMALLOC_OBJS@ \
 		@PERF_TRAMPOLINE_OBJ@
 
 ##########################################################################
@@ -1538,6 +1567,7 @@ Objects/unicodeobject.o: $(srcdir)/Objects/unicodeobject.c $(UNICODE_DEPS)
 Objects/dictobject.o: $(srcdir)/Objects/stringlib/eq.h
 Objects/setobject.o: $(srcdir)/Objects/stringlib/eq.h
 
+Objects/mimalloc/page.o: $(srcdir)/Objects/mimalloc/page-queue.c
 
 .PHONY: regen-cases
 regen-cases:
@@ -1749,7 +1779,7 @@ PYTHON_HEADERS= \
 		$(srcdir)/Include/cpython/unicodeobject.h \
 		$(srcdir)/Include/cpython/warnings.h \
 		$(srcdir)/Include/cpython/weakrefobject.h \
-		\
+		@MIMALLOC_HEADERS@ \
 		$(srcdir)/Include/internal/pycore_abstract.h \
 		$(srcdir)/Include/internal/pycore_asdl.h \
 		$(srcdir)/Include/internal/pycore_ast.h \
diff --git a/Objects/mimalloc/alloc-aligned.c b/Objects/mimalloc/alloc-aligned.c
new file mode 100644
index 00000000000000..9fe82890fa2f55
--- /dev/null
+++ b/Objects/mimalloc/alloc-aligned.c
@@ -0,0 +1,306 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+
+#include <string.h>  // memset
+
+// ------------------------------------------------------
+// Aligned Allocation
+// ------------------------------------------------------
+
+// Fallback primitive aligned allocation -- split out for better codegen
+static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept
+{
+  mi_assert_internal(size <= PTRDIFF_MAX);
+  mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment));
+
+  const uintptr_t align_mask = alignment - 1;  // for any x, `(x & align_mask) == (x % alignment)`
+  const size_t padsize = size + MI_PADDING_SIZE;
+
+  // use regular allocation if it is guaranteed to fit the alignment constraints
+  if (offset==0 && alignment<=padsize && padsize<=MI_MAX_ALIGN_GUARANTEE && (padsize&align_mask)==0) {
+    void* p = _mi_heap_malloc_zero(heap, size, zero);
+    mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0);
+    return p;
+  }
+
+  void* p;
+  size_t oversize;
+  if mi_unlikely(alignment > MI_ALIGNMENT_MAX) {
+    // use OS allocation for very large alignment and allocate inside a huge page (dedicated segment with 1 page)
+    // This can support alignments >= MI_SEGMENT_SIZE by ensuring the object can be aligned at a point in the
+    // first (and single) page such that the segment info is `MI_SEGMENT_SIZE` bytes before it (so it can be found by aligning the pointer down)
+    if mi_unlikely(offset != 0) {
+      // todo: cannot support offset alignment for very large alignments yet
+      #if MI_DEBUG > 0
+      _mi_error_message(EOVERFLOW, "aligned allocation with a very large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset);
+      #endif
+      return NULL;
+    }
+    oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size);
+    p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block
+    // zero afterwards as only the area from the aligned_p may be committed!
+    if (p == NULL) return NULL;
+  }
+  else {
+    // otherwise over-allocate
+    oversize = size + alignment - 1;
+    p = _mi_heap_malloc_zero(heap, oversize, zero);
+    if (p == NULL) return NULL;
+  }
+
+  // .. and align within the allocation
+  const uintptr_t poffset = ((uintptr_t)p + offset) & align_mask;
+  const uintptr_t adjust  = (poffset == 0 ? 0 : alignment - poffset);
+  mi_assert_internal(adjust < alignment);
+  void* aligned_p = (void*)((uintptr_t)p + adjust);
+  if (aligned_p != p) {
+    mi_page_set_has_aligned(_mi_ptr_page(p), true);
+  }
+
+  mi_assert_internal(mi_page_usable_block_size(_mi_ptr_page(p)) >= adjust + size);
+  mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_segment(aligned_p), _mi_ptr_page(aligned_p), aligned_p));
+  mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0);
+  mi_assert_internal(mi_page_usable_block_size(_mi_ptr_page(p)) >= adjust + size);
+
+  // now zero the block if needed
+  if (zero && alignment > MI_ALIGNMENT_MAX) {
+    const ptrdiff_t diff = (uint8_t*)aligned_p - (uint8_t*)p;
+    const ptrdiff_t zsize = mi_page_usable_block_size(_mi_ptr_page(p)) - diff - MI_PADDING_SIZE;
+    if (zsize > 0) { _mi_memzero(aligned_p, zsize); }
+  }
+
+  #if MI_TRACK_ENABLED
+  if (p != aligned_p) {
+    mi_track_free_size(p, oversize);
+    mi_track_malloc(aligned_p, size, zero);
+  }
+  else {
+    mi_track_resize(aligned_p, oversize, size);
+  }
+  #endif
+  return aligned_p;
+}
+
+// Primitive aligned allocation
+static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept
+{
+  // note: we don't require `size > offset`, we just guarantee that the address at offset is aligned regardless of the allocated size.
+  mi_assert(alignment > 0);
+  if mi_unlikely(alignment == 0 || !_mi_is_power_of_two(alignment)) { // require power-of-two (see <https://en.cppreference.com/w/c/memory/aligned_alloc>)
+    #if MI_DEBUG > 0
+    _mi_error_message(EOVERFLOW, "aligned allocation requires the alignment to be a power-of-two (size %zu, alignment %zu)\n", size, alignment);
+    #endif
+    return NULL;
+  }
+  /*
+  if mi_unlikely(alignment > MI_ALIGNMENT_MAX) {  // we cannot align at a boundary larger than this (or otherwise we cannot find segment headers)
+    #if MI_DEBUG > 0
+    _mi_error_message(EOVERFLOW, "aligned allocation has a maximum alignment of %zu (size %zu, alignment %zu)\n", MI_ALIGNMENT_MAX, size, alignment);
+    #endif
+    return NULL;
+  }
+  */
+  if mi_unlikely(size > PTRDIFF_MAX) {          // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
+    #if MI_DEBUG > 0
+    _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment);
+    #endif
+    return NULL;
+  }
+  const uintptr_t align_mask = alignment-1;       // for any x, `(x & align_mask) == (x % alignment)`
+  const size_t padsize = size + MI_PADDING_SIZE;  // note: cannot overflow due to earlier size > PTRDIFF_MAX check
+
+  // try first if there happens to be a small block available with just the right alignment
+  if mi_likely(padsize <= MI_SMALL_SIZE_MAX && alignment <= padsize) {
+    mi_page_t* page = _mi_heap_get_free_small_page(heap, padsize);
+    const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0;
+    if mi_likely(page->free != NULL && is_aligned)
+    {
+      #if MI_STAT>1
+      mi_heap_stat_increase(heap, malloc, size);
+      #endif
+      void* p = _mi_page_malloc(heap, page, padsize, zero); // TODO: inline _mi_page_malloc
+      mi_assert_internal(p != NULL);
+      mi_assert_internal(((uintptr_t)p + offset) % alignment == 0);
+      mi_track_malloc(p,size,zero);
+      return p;
+    }
+  }
+  // fallback
+  return mi_heap_malloc_zero_aligned_at_fallback(heap, size, alignment, offset, zero);
+}
+
+
+// ------------------------------------------------------
+// Optimized mi_heap_malloc_aligned / mi_malloc_aligned
+// ------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, false);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
+  #if !MI_PADDING
+  // without padding, any small sized allocation is naturally aligned (see also `_mi_segment_page_start`)
+  if (!_mi_is_power_of_two(alignment)) return NULL;
+  if mi_likely(_mi_is_power_of_two(size) && size >= alignment && size <= MI_SMALL_SIZE_MAX)
+  #else
+  // with padding, we can only guarantee this for fixed alignments
+  if mi_likely((alignment == sizeof(void*) || (alignment == MI_MAX_ALIGN_SIZE && size > (MI_MAX_ALIGN_SIZE/2)))
+                && size <= MI_SMALL_SIZE_MAX)
+  #endif
+  {
+    // fast path for common alignment and size
+    return mi_heap_malloc_small(heap, size);
+  }
+  else {
+    return mi_heap_malloc_aligned_at(heap, size, alignment, 0);
+  }
+}
+
+// ------------------------------------------------------
+// Aligned Allocation
+// ------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, true);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_heap_zalloc_aligned_at(heap, size, alignment, 0);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(count, size, &total)) return NULL;
+  return mi_heap_zalloc_aligned_at(heap, total, alignment, offset);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_heap_calloc_aligned_at(heap,count,size,alignment,0);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_malloc_aligned_at(mi_get_default_heap(), size, alignment, offset);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_heap_malloc_aligned(mi_get_default_heap(), size, alignment);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_zalloc_aligned_at(mi_get_default_heap(), size, alignment, offset);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_heap_zalloc_aligned(mi_get_default_heap(), size, alignment);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_calloc_aligned_at(mi_get_default_heap(), count, size, alignment, offset);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_heap_calloc_aligned(mi_get_default_heap(), count, size, alignment);
+}
+
+
+// ------------------------------------------------------
+// Aligned re-allocation
+// ------------------------------------------------------
+
+static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset, bool zero) mi_attr_noexcept {
+  mi_assert(alignment > 0);
+  if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero);
+  if (p == NULL) return mi_heap_malloc_zero_aligned_at(heap,newsize,alignment,offset,zero);
+  size_t size = mi_usable_size(p);
+  if (newsize <= size && newsize >= (size - (size / 2))
+      && (((uintptr_t)p + offset) % alignment) == 0) {
+    return p;  // reallocation still fits, is aligned and not more than 50% waste
+  }
+  else {
+    void* newp = mi_heap_malloc_aligned_at(heap,newsize,alignment,offset);
+    if (newp != NULL) {
+      if (zero && newsize > size) {
+        const mi_page_t* page = _mi_ptr_page(newp);
+        if (page->is_zero) {
+          // already zero initialized
+          mi_assert_expensive(mi_mem_is_zero(newp,newsize));
+        }
+        else {
+          // also set last word in the previous allocation to zero to ensure any padding is zero-initialized
+          size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
+          memset((uint8_t*)newp + start, 0, newsize - start);
+        }
+      }
+      _mi_memcpy_aligned(newp, p, (newsize > size ? size : newsize));
+      mi_free(p); // only free if successful
+    }
+    return newp;
+  }
+}
+
+static void* mi_heap_realloc_zero_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, bool zero) mi_attr_noexcept {
+  mi_assert(alignment > 0);
+  if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero);
+  size_t offset = ((uintptr_t)p % alignment); // use offset of previous allocation (p can be NULL)
+  return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,zero);
+}
+
+mi_decl_nodiscard void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,false);
+}
+
+mi_decl_nodiscard void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+  return mi_heap_realloc_zero_aligned(heap,p,newsize,alignment,false);
+}
+
+mi_decl_nodiscard void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_realloc_zero_aligned_at(heap, p, newsize, alignment, offset, true);
+}
+
+mi_decl_nodiscard void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+  return mi_heap_realloc_zero_aligned(heap, p, newsize, alignment, true);
+}
+
+mi_decl_nodiscard void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(newcount, size, &total)) return NULL;
+  return mi_heap_rezalloc_aligned_at(heap, p, total, alignment, offset);
+}
+
+mi_decl_nodiscard void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(newcount, size, &total)) return NULL;
+  return mi_heap_rezalloc_aligned(heap, p, total, alignment);
+}
+
+mi_decl_nodiscard void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_realloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset);
+}
+
+mi_decl_nodiscard void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+  return mi_heap_realloc_aligned(mi_get_default_heap(), p, newsize, alignment);
+}
+
+mi_decl_nodiscard void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_rezalloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset);
+}
+
+mi_decl_nodiscard void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+  return mi_heap_rezalloc_aligned(mi_get_default_heap(), p, newsize, alignment);
+}
+
+mi_decl_nodiscard void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_recalloc_aligned_at(mi_get_default_heap(), p, newcount, size, alignment, offset);
+}
+
+mi_decl_nodiscard void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_heap_recalloc_aligned(mi_get_default_heap(), p, newcount, size, alignment);
+}
diff --git a/Objects/mimalloc/alloc-posix.c b/Objects/mimalloc/alloc-posix.c
new file mode 100644
index 00000000000000..e6505f2907de22
--- /dev/null
+++ b/Objects/mimalloc/alloc-posix.c
@@ -0,0 +1,184 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// ------------------------------------------------------------------------
+// mi prefixed publi definitions of various Posix, Unix, and C++ functions
+// for convenience and used when overriding these functions.
+// ------------------------------------------------------------------------
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+
+// ------------------------------------------------------
+// Posix & Unix functions definitions
+// ------------------------------------------------------
+
+#include <errno.h>
+#include <string.h>  // memset
+#include <stdlib.h>  // getenv
+
+#ifdef _MSC_VER
+#pragma warning(disable:4996)  // getenv _wgetenv
+#endif
+
+#ifndef EINVAL
+#define EINVAL 22
+#endif
+#ifndef ENOMEM
+#define ENOMEM 12
+#endif
+
+
+mi_decl_nodiscard size_t mi_malloc_size(const void* p) mi_attr_noexcept {
+  // if (!mi_is_in_heap_region(p)) return 0;
+  return mi_usable_size(p);
+}
+
+mi_decl_nodiscard size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept {
+  // if (!mi_is_in_heap_region(p)) return 0;
+  return mi_usable_size(p);
+}
+
+mi_decl_nodiscard size_t mi_malloc_good_size(size_t size) mi_attr_noexcept {
+  return mi_good_size(size);
+}
+
+void mi_cfree(void* p) mi_attr_noexcept {
+  if (mi_is_in_heap_region(p)) {
+    mi_free(p);
+  }
+}
+
+int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept {
+  // Note: The spec dictates we should not modify `*p` on an error. (issue#27)
+  // <http://man7.org/linux/man-pages/man3/posix_memalign.3.html>
+  if (p == NULL) return EINVAL;
+  if (alignment % sizeof(void*) != 0) return EINVAL;                   // natural alignment
+  if (alignment==0 || !_mi_is_power_of_two(alignment)) return EINVAL;  // not a power of 2
+  void* q = mi_malloc_aligned(size, alignment);
+  if (q==NULL && size != 0) return ENOMEM;
+  mi_assert_internal(((uintptr_t)q % alignment) == 0);
+  *p = q;
+  return 0;
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept {
+  void* p = mi_malloc_aligned(size, alignment);
+  mi_assert_internal(((uintptr_t)p % alignment) == 0);
+  return p;
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_valloc(size_t size) mi_attr_noexcept {
+  return mi_memalign( _mi_os_page_size(), size );
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept {
+  size_t psize = _mi_os_page_size();
+  if (size >= SIZE_MAX - psize) return NULL; // overflow
+  size_t asize = _mi_align_up(size, psize);
+  return mi_malloc_aligned(asize, psize);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept {
+  // C11 requires the size to be an integral multiple of the alignment, see <https://en.cppreference.com/w/c/memory/aligned_alloc>.
+  // unfortunately, it turns out quite some programs pass a size that is not an integral multiple so skip this check..
+  /* if mi_unlikely((size & (alignment - 1)) != 0) { // C11 requires alignment>0 && integral multiple, see <https://en.cppreference.com/w/c/memory/aligned_alloc>
+      #if MI_DEBUG > 0
+      _mi_error_message(EOVERFLOW, "(mi_)aligned_alloc requires the size to be an integral multiple of the alignment (size %zu, alignment %zu)\n", size, alignment);
+      #endif
+      return NULL;
+    }
+  */
+  // C11 also requires alignment to be a power-of-two (and > 0) which is checked in mi_malloc_aligned
+  void* p = mi_malloc_aligned(size, alignment);
+  mi_assert_internal(((uintptr_t)p % alignment) == 0);
+  return p;
+}
+
+mi_decl_nodiscard void* mi_reallocarray( void* p, size_t count, size_t size ) mi_attr_noexcept {  // BSD
+  void* newp = mi_reallocn(p,count,size);
+  if (newp==NULL) { errno = ENOMEM; }
+  return newp;
+}
+
+mi_decl_nodiscard int mi_reallocarr( void* p, size_t count, size_t size ) mi_attr_noexcept { // NetBSD
+  mi_assert(p != NULL);
+  if (p == NULL) {
+    errno = EINVAL;
+    return EINVAL;
+  }
+  void** op = (void**)p;
+  void* newp = mi_reallocarray(*op, count, size);
+  if mi_unlikely(newp == NULL) { return errno; }
+  *op = newp;
+  return 0;
+}
+
+void* mi__expand(void* p, size_t newsize) mi_attr_noexcept {  // Microsoft
+  void* res = mi_expand(p, newsize);
+  if (res == NULL) { errno = ENOMEM; }
+  return res;
+}
+
+mi_decl_nodiscard mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept {
+  if (s==NULL) return NULL;
+  size_t len;
+  for(len = 0; s[len] != 0; len++) { }
+  size_t size = (len+1)*sizeof(unsigned short);
+  unsigned short* p = (unsigned short*)mi_malloc(size);
+  if (p != NULL) {
+    _mi_memcpy(p,s,size);
+  }
+  return p;
+}
+
+mi_decl_nodiscard mi_decl_restrict unsigned char* mi_mbsdup(const unsigned char* s)  mi_attr_noexcept {
+  return (unsigned char*)mi_strdup((const char*)s);
+}
+
+int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept {
+  if (buf==NULL || name==NULL) return EINVAL;
+  if (size != NULL) *size = 0;
+  char* p = getenv(name);        // mscver warning 4996
+  if (p==NULL) {
+    *buf = NULL;
+  }
+  else {
+    *buf = mi_strdup(p);
+    if (*buf==NULL) return ENOMEM;
+    if (size != NULL) *size = strlen(p);
+  }
+  return 0;
+}
+
+int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept {
+  if (buf==NULL || name==NULL) return EINVAL;
+  if (size != NULL) *size = 0;
+#if !defined(_WIN32) || (defined(WINAPI_FAMILY) && (WINAPI_FAMILY != WINAPI_FAMILY_DESKTOP_APP))
+  // not supported
+  *buf = NULL;
+  return EINVAL;
+#else
+  unsigned short* p = (unsigned short*)_wgetenv((const wchar_t*)name);  // msvc warning 4996
+  if (p==NULL) {
+    *buf = NULL;
+  }
+  else {
+    *buf = mi_wcsdup(p);
+    if (*buf==NULL) return ENOMEM;
+    if (size != NULL) *size = wcslen((const wchar_t*)p);
+  }
+  return 0;
+#endif
+}
+
+mi_decl_nodiscard void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { // Microsoft
+  return mi_recalloc_aligned_at(p, newcount, size, alignment, offset);
+}
+
+mi_decl_nodiscard void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { // Microsoft
+  return mi_recalloc_aligned(p, newcount, size, alignment);
+}
diff --git a/Objects/mimalloc/alloc.c b/Objects/mimalloc/alloc.c
new file mode 100644
index 00000000000000..342a4bdf017681
--- /dev/null
+++ b/Objects/mimalloc/alloc.c
@@ -0,0 +1,1029 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#ifndef _DEFAULT_SOURCE
+#define _DEFAULT_SOURCE   // for realpath() on Linux
+#endif
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+
+#include <string.h>  // memset, strlen
+#include <stdlib.h>  // malloc, exit
+
+// ------------------------------------------------------
+// Allocation
+// ------------------------------------------------------
+
+// Fast allocation in a page: just pop from the free list.
+// Fall back to generic allocation only if the list is empty.
+extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept {
+  mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size);
+  mi_block_t* const block = page->free;
+  if mi_unlikely(block == NULL) {
+    return _mi_malloc_generic(heap, size, zero, 0);
+  }
+  mi_assert_internal(block != NULL && _mi_ptr_page(block) == page);
+  // pop from the free list
+  page->used++;
+  page->free = mi_block_next(page, block);
+  mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page);
+
+  // allow use of the block internally
+  // note: when tracking we need to avoid ever touching the MI_PADDING since
+  // that is tracked by valgrind etc. as non-accessible (through the red-zone, see `mimalloc-track.h`)
+  mi_track_mem_undefined(block, mi_page_usable_block_size(page));
+
+  // zero the block? note: we need to zero the full block size (issue #63)
+  if mi_unlikely(zero) {
+    mi_assert_internal(page->xblock_size != 0); // do not call with zero'ing for huge blocks (see _mi_malloc_generic)
+    const size_t zsize = (page->is_zero ? sizeof(block->next) + MI_PADDING_SIZE : page->xblock_size);
+    _mi_memzero_aligned(block, zsize - MI_PADDING_SIZE);
+  }
+
+#if (MI_DEBUG>0) && !MI_TRACK_ENABLED
+  if (!page->is_zero && !zero && !mi_page_is_huge(page)) {
+    memset(block, MI_DEBUG_UNINIT, mi_page_usable_block_size(page));
+  }
+#elif (MI_SECURE!=0)
+  if (!zero) { block->next = 0; } // don't leak internal data
+#endif
+
+#if (MI_STAT>0)
+  const size_t bsize = mi_page_usable_block_size(page);
+  if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) {
+    mi_heap_stat_increase(heap, normal, bsize);
+    mi_heap_stat_counter_increase(heap, normal_count, 1);
+#if (MI_STAT>1)
+    const size_t bin = _mi_bin(bsize);
+    mi_heap_stat_increase(heap, normal_bins[bin], 1);
+#endif
+  }
+#endif
+
+#if (MI_PADDING > 0) && defined(MI_ENCODE_FREELIST) && !MI_TRACK_ENABLED
+  mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page));
+  ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE));
+  #if (MI_DEBUG>1)
+  mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta));
+  mi_track_mem_defined(padding,sizeof(mi_padding_t));  // note: re-enable since mi_page_usable_block_size may set noaccess
+  #endif
+  padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys));
+  padding->delta  = (uint32_t)(delta);
+  if (!mi_page_is_huge(page)) {
+    uint8_t* fill = (uint8_t*)padding - delta;
+    const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes
+    for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; }
+  }
+#endif
+
+  return block;
+}
+
+static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept {
+  mi_assert(heap != NULL);
+  #if MI_DEBUG
+  const uintptr_t tid = _mi_thread_id();
+  mi_assert(heap->thread_id == 0 || heap->thread_id == tid); // heaps are thread local
+  #endif
+  mi_assert(size <= MI_SMALL_SIZE_MAX);
+#if (MI_PADDING)
+  if (size == 0) {
+    size = sizeof(void*);
+  }
+#endif
+  mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE);
+  void* p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE, zero);
+  mi_assert_internal(p == NULL || mi_usable_size(p) >= size);
+#if MI_STAT>1
+  if (p != NULL) {
+    if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); }
+    mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
+  }
+#endif
+  mi_track_malloc(p,size,zero);
+  return p;
+}
+
+// allocate a small block
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+  return mi_heap_malloc_small_zero(heap, size, false);
+}
+
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept {
+  return mi_heap_malloc_small(mi_get_default_heap(), size);
+}
+
+// The main allocation function
+extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept {
+  if mi_likely(size <= MI_SMALL_SIZE_MAX) {
+    mi_assert_internal(huge_alignment == 0);
+    return mi_heap_malloc_small_zero(heap, size, zero);
+  }
+  else {
+    mi_assert(heap!=NULL);
+    mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id());   // heaps are thread local
+    void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment);  // note: size can overflow but it is detected in malloc_generic
+    mi_assert_internal(p == NULL || mi_usable_size(p) >= size);
+    #if MI_STAT>1
+    if (p != NULL) {
+      if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); }
+      mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
+    }
+    #endif
+    mi_track_malloc(p,size,zero);
+    return p;
+  }
+}
+
+extern inline void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept {
+  return _mi_heap_malloc_zero_ex(heap, size, zero, 0);
+}
+
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+  return _mi_heap_malloc_zero(heap, size, false);
+}
+
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept {
+  return mi_heap_malloc(mi_get_default_heap(), size);
+}
+
+// zero initialized small block
+mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept {
+  return mi_heap_malloc_small_zero(mi_get_default_heap(), size, true);
+}
+
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+  return _mi_heap_malloc_zero(heap, size, true);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept {
+  return mi_heap_zalloc(mi_get_default_heap(),size);
+}
+
+
+// ------------------------------------------------------
+// Check for double free in secure and debug mode
+// This is somewhat expensive so only enabled for secure mode 4
+// ------------------------------------------------------
+
+#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0))
+// linear check if the free list contains a specific element
+static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) {
+  while (list != NULL) {
+    if (elem==list) return true;
+    list = mi_block_next(page, list);
+  }
+  return false;
+}
+
+static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) {
+  // The decoded value is in the same page (or NULL).
+  // Walk the free lists to verify positively if it is already freed
+  if (mi_list_contains(page, page->free, block) ||
+      mi_list_contains(page, page->local_free, block) ||
+      mi_list_contains(page, mi_page_thread_free(page), block))
+  {
+    _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page));
+    return true;
+  }
+  return false;
+}
+
+#define mi_track_page(page,access)  { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); }
+
+static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
+  bool is_double_free = false;
+  mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field
+  if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 &&  // quick check: aligned pointer?
+      (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL?
+  {
+    // Suspicous: decoded value a in block is in the same page (or NULL) -- maybe a double free?
+    // (continue in separate function to improve code generation)
+    is_double_free = mi_check_is_double_freex(page, block);
+  }
+  return is_double_free;
+}
+#else
+static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(page);
+  MI_UNUSED(block);
+  return false;
+}
+#endif
+
+// ---------------------------------------------------------------------------
+// Check for heap block overflow by setting up padding at the end of the block
+// ---------------------------------------------------------------------------
+
+#if (MI_PADDING>0) && defined(MI_ENCODE_FREELIST) && !MI_TRACK_ENABLED
+static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) {
+  *bsize = mi_page_usable_block_size(page);
+  const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize);
+  mi_track_mem_defined(padding,sizeof(mi_padding_t));
+  *delta = padding->delta;
+  uint32_t canary = padding->canary;
+  uintptr_t keys[2];
+  keys[0] = page->keys[0];
+  keys[1] = page->keys[1];
+  bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize);
+  mi_track_mem_noaccess(padding,sizeof(mi_padding_t));
+  return ok;
+}
+
+// Return the exact usable size of a block.
+static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
+  size_t bsize;
+  size_t delta;
+  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
+  mi_assert_internal(ok); mi_assert_internal(delta <= bsize);
+  return (ok ? bsize - delta : 0);
+}
+
+static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) {
+  size_t bsize;
+  size_t delta;
+  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
+  *size = *wrong = bsize;
+  if (!ok) return false;
+  mi_assert_internal(bsize >= delta);
+  *size = bsize - delta;
+  if (!mi_page_is_huge(page)) {
+    uint8_t* fill = (uint8_t*)block + bsize - delta;
+    const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes
+    mi_track_mem_defined(fill, maxpad);
+    for (size_t i = 0; i < maxpad; i++) {
+      if (fill[i] != MI_DEBUG_PADDING) {
+        *wrong = bsize - delta + i;
+        ok = false;
+        break;
+      }
+    }
+    mi_track_mem_noaccess(fill, maxpad);
+  }
+  return ok;
+}
+
+static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
+  size_t size;
+  size_t wrong;
+  if (!mi_verify_padding(page,block,&size,&wrong)) {
+    _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong );
+  }
+}
+
+// When a non-thread-local block is freed, it becomes part of the thread delayed free
+// list that is freed later by the owning heap. If the exact usable size is too small to
+// contain the pointer for the delayed list, then shrink the padding (by decreasing delta)
+// so it will later not trigger an overflow error in `mi_free_block`.
+static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
+  size_t bsize;
+  size_t delta;
+  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
+  mi_assert_internal(ok);
+  if (!ok || (bsize - delta) >= min_size) return;  // usually already enough space
+  mi_assert_internal(bsize >= min_size);
+  if (bsize < min_size) return;  // should never happen
+  size_t new_delta = (bsize - min_size);
+  mi_assert_internal(new_delta < bsize);
+  mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize);
+  padding->delta = (uint32_t)new_delta;
+}
+#else
+static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(page);
+  MI_UNUSED(block);
+}
+
+static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(block);
+  return mi_page_usable_block_size(page);
+}
+
+static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
+  MI_UNUSED(page);
+  MI_UNUSED(block);
+  MI_UNUSED(min_size);
+}
+#endif
+
+// only maintain stats for smaller objects if requested
+#if (MI_STAT>0)
+static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
+  #if (MI_STAT < 2)  
+  MI_UNUSED(block);
+  #endif
+  mi_heap_t* const heap = mi_heap_get_default();
+  const size_t bsize = mi_page_usable_block_size(page);
+  #if (MI_STAT>1)
+  const size_t usize = mi_page_usable_size_of(page, block);
+  mi_heap_stat_decrease(heap, malloc, usize);
+  #endif  
+  if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) {
+    mi_heap_stat_decrease(heap, normal, bsize);
+    #if (MI_STAT > 1)
+    mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1);
+    #endif
+  }
+  else if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+    mi_heap_stat_decrease(heap, large, bsize);
+  }
+  else {
+    mi_heap_stat_decrease(heap, huge, bsize);
+  }  
+}
+#else
+static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(page); MI_UNUSED(block);
+}
+#endif
+
+#if MI_HUGE_PAGE_ABANDON
+#if (MI_STAT>0)
+// maintain stats for huge objects
+static void mi_stat_huge_free(const mi_page_t* page) {
+  mi_heap_t* const heap = mi_heap_get_default();
+  const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc`
+  if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+    mi_heap_stat_decrease(heap, large, bsize);
+  }
+  else {
+    mi_heap_stat_decrease(heap, huge, bsize);
+  }
+}
+#else
+static void mi_stat_huge_free(const mi_page_t* page) {
+  MI_UNUSED(page);
+}
+#endif
+#endif
+
+// ------------------------------------------------------
+// Free
+// ------------------------------------------------------
+
+// multi-threaded free (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON)
+static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block)
+{
+  // The padding check may access the non-thread-owned page for the key values.
+  // that is safe as these are constant and the page won't be freed (as the block is not freed yet).
+  mi_check_padding(page, block);
+  mi_padding_shrink(page, block, sizeof(mi_block_t));       // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection
+  
+  // huge page segments are always abandoned and can be freed immediately
+  mi_segment_t* segment = _mi_page_segment(page);
+  if (segment->kind == MI_SEGMENT_HUGE) {
+    #if MI_HUGE_PAGE_ABANDON
+    // huge page segments are always abandoned and can be freed immediately
+    mi_stat_huge_free(page);
+    _mi_segment_huge_page_free(segment, page, block);
+    return;
+    #else
+    // huge pages are special as they occupy the entire segment
+    // as these are large we reset the memory occupied by the page so it is available to other threads
+    // (as the owning thread needs to actually free the memory later).
+    _mi_segment_huge_page_reset(segment, page, block);
+    #endif
+  }
+  
+  #if (MI_DEBUG!=0) && !MI_TRACK_ENABLED                    // note: when tracking, cannot use mi_usable_size with multi-threading
+  if (segment->kind != MI_SEGMENT_HUGE) {                   // not for huge segments as we just reset the content
+    memset(block, MI_DEBUG_FREED, mi_usable_size(block));
+  }
+  #endif
+
+  // Try to put the block on either the page-local thread free list, or the heap delayed free list.
+  mi_thread_free_t tfreex;
+  bool use_delayed;
+  mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
+  do {
+    use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE);
+    if mi_unlikely(use_delayed) {
+      // unlikely: this only happens on the first concurrent free in a page that is in the full list
+      tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING);
+    }
+    else {
+      // usual: directly add to page thread_free list
+      mi_block_set_next(page, block, mi_tf_block(tfree));
+      tfreex = mi_tf_set_block(tfree,block);
+    }
+  } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
+
+  if mi_unlikely(use_delayed) {
+    // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`)
+    mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page);
+    mi_assert_internal(heap != NULL);
+    if (heap != NULL) {
+      // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
+      mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
+      do {
+        mi_block_set_nextx(heap,block,dfree, heap->keys);
+      } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
+    }
+
+    // and reset the MI_DELAYED_FREEING flag
+    tfree = mi_atomic_load_relaxed(&page->xthread_free);
+    do {
+      tfreex = tfree;
+      mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING);
+      tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE);
+    } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
+  }
+}
+
+// regular free
+static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block)
+{
+  // and push it on the free list
+  //const size_t bsize = mi_page_block_size(page);
+  if mi_likely(local) {
+    // owning thread can free a block directly
+    if mi_unlikely(mi_check_is_double_free(page, block)) return;
+    mi_check_padding(page, block);
+    #if (MI_DEBUG!=0) && !MI_TRACK_ENABLED
+    if (!mi_page_is_huge(page)) {   // huge page content may be already decommitted
+      memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
+    }
+    #endif
+    mi_block_set_next(page, block, page->local_free);
+    page->local_free = block;
+    page->used--;
+    if mi_unlikely(mi_page_all_free(page)) {
+      _mi_page_retire(page);
+    }
+    else if mi_unlikely(mi_page_is_in_full(page)) {
+      _mi_page_unfull(page);
+    }
+  }
+  else {
+    _mi_free_block_mt(page,block);
+  }
+}
+
+
+// Adjust a block that was allocated aligned, to the actual start of the block in the page.
+mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) {
+  mi_assert_internal(page!=NULL && p!=NULL);
+  const size_t diff   = (uint8_t*)p - _mi_page_start(segment, page, NULL);
+  const size_t adjust = (diff % mi_page_block_size(page));
+  return (mi_block_t*)((uintptr_t)p - adjust);
+}
+
+
+void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept {
+  mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p);
+  mi_stat_free(page, block);                 // stat_free may access the padding
+  mi_track_free(p);
+  _mi_free_block(page, is_local, block);
+}
+
+// Get the segment data belonging to a pointer
+// This is just a single `and` in assembly but does further checks in debug mode
+// (and secure mode) if this was a valid pointer.
+static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg)
+{
+  MI_UNUSED(msg);
+  mi_assert(p != NULL);
+
+#if (MI_DEBUG>0)
+  if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) {
+    _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p);
+    return NULL;
+  }
+#endif
+
+  mi_segment_t* const segment = _mi_ptr_segment(p);
+  mi_assert_internal(segment != NULL);
+
+#if (MI_DEBUG>0)
+  if mi_unlikely(!mi_is_in_heap_region(p)) {
+  #if (MI_INTPTR_SIZE == 8 && defined(__linux__))
+    if (((uintptr_t)p >> 40) != 0x7F) { // linux tends to align large blocks above 0x7F000000000 (issue #640)
+  #else
+    {
+  #endif
+      _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n"
+        "(this may still be a valid very large allocation (over 64MiB))\n", msg, p);
+      if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) {
+        _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
+      }
+    }
+  }
+#endif
+#if (MI_DEBUG>0 || MI_SECURE>=4)
+  if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) {
+    _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p);
+    return NULL;
+  }
+#endif
+
+  return segment;
+}
+
+// Free a block
+// fast path written carefully to prevent spilling on the stack
+void mi_free(void* p) mi_attr_noexcept
+{
+  if mi_unlikely(p == NULL) return;
+  mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free");
+  const bool          is_local= (_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id));
+  mi_page_t* const    page    = _mi_segment_page_of(segment, p);
+
+  if mi_likely(is_local) {                       // thread-local free?
+    if mi_likely(page->flags.full_aligned == 0)  // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned)
+    {
+      mi_block_t* const block = (mi_block_t*)p;
+      if mi_unlikely(mi_check_is_double_free(page, block)) return;
+      mi_check_padding(page, block);
+      mi_stat_free(page, block);
+      #if (MI_DEBUG!=0) && !MI_TRACK_ENABLED
+      memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
+      #endif
+      mi_track_free(p);
+      mi_block_set_next(page, block, page->local_free);
+      page->local_free = block;
+      if mi_unlikely(--page->used == 0) {   // using this expression generates better code than: page->used--; if (mi_page_all_free(page))
+        _mi_page_retire(page);
+      }
+    }
+    else {
+      // page is full or contains (inner) aligned blocks; use generic path
+      _mi_free_generic(segment, page, true, p);
+    }
+  }
+  else {
+    // not thread-local; use generic path
+    _mi_free_generic(segment, page, false, p);
+  }
+}
+
+// return true if successful
+bool _mi_free_delayed_block(mi_block_t* block) {
+  // get segment and page
+  const mi_segment_t* const segment = _mi_ptr_segment(block);
+  mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
+  mi_assert_internal(_mi_thread_id() == segment->thread_id);
+  mi_page_t* const page = _mi_segment_page_of(segment, block);
+
+  // Clear the no-delayed flag so delayed freeing is used again for this page.
+  // This must be done before collecting the free lists on this page -- otherwise
+  // some blocks may end up in the page `thread_free` list with no blocks in the
+  // heap `thread_delayed_free` list which may cause the page to be never freed!
+  // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`)
+  if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) {
+    return false;
+  }
+
+  // collect all other non-local frees to ensure up-to-date `used` count
+  _mi_page_free_collect(page, false);
+
+  // and free the block (possibly freeing the page as well since used is updated)
+  _mi_free_block(page, true, block);
+  return true;
+}
+
+// Bytes available in a block
+mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept {
+  const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p);
+  const size_t size = mi_page_usable_size_of(page, block);
+  const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block;
+  mi_assert_internal(adjust >= 0 && (size_t)adjust <= size);
+  return (size - adjust);
+}
+
+static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept {
+  if (p == NULL) return 0;
+  const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg);
+  const mi_page_t* const page = _mi_segment_page_of(segment, p);
+  if mi_likely(!mi_page_has_aligned(page)) {
+    const mi_block_t* block = (const mi_block_t*)p;
+    return mi_page_usable_size_of(page, block);
+  }
+  else {
+    // split out to separate routine for improved code generation
+    return mi_page_usable_aligned_size_of(segment, page, p);
+  }
+}
+
+mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept {
+  return _mi_usable_size(p, "mi_usable_size");
+}
+
+
+// ------------------------------------------------------
+// Allocation extensions
+// ------------------------------------------------------
+
+void mi_free_size(void* p, size_t size) mi_attr_noexcept {
+  MI_UNUSED_RELEASE(size);
+  mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size"));
+  mi_free(p);
+}
+
+void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept {
+  MI_UNUSED_RELEASE(alignment);
+  mi_assert(((uintptr_t)p % alignment) == 0);
+  mi_free_size(p,size);
+}
+
+void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept {
+  MI_UNUSED_RELEASE(alignment);
+  mi_assert(((uintptr_t)p % alignment) == 0);
+  mi_free(p);
+}
+
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(count,size,&total)) return NULL;
+  return mi_heap_zalloc(heap,total);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept {
+  return mi_heap_calloc(mi_get_default_heap(),count,size);
+}
+
+// Uninitialized `calloc`
+mi_decl_nodiscard extern mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(count, size, &total)) return NULL;
+  return mi_heap_malloc(heap, total);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept {
+  return mi_heap_mallocn(mi_get_default_heap(),count,size);
+}
+
+// Expand (or shrink) in place (or fail)
+void* mi_expand(void* p, size_t newsize) mi_attr_noexcept {
+  #if MI_PADDING
+  // we do not shrink/expand with padding enabled
+  MI_UNUSED(p); MI_UNUSED(newsize);
+  return NULL;
+  #else
+  if (p == NULL) return NULL;
+  const size_t size = _mi_usable_size(p,"mi_expand");
+  if (newsize > size) return NULL;
+  return p; // it fits
+  #endif
+}
+
+void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept {
+  // if p == NULL then behave as malloc.
+  // else if size == 0 then reallocate to a zero-sized block (and don't return NULL, just as mi_malloc(0)).
+  // (this means that returning NULL always indicates an error, and `p` will not have been freed in that case.)
+  const size_t size = _mi_usable_size(p,"mi_realloc"); // also works if p == NULL (with size 0)
+  if mi_unlikely(newsize <= size && newsize >= (size / 2) && newsize > 0) {  // note: newsize must be > 0 or otherwise we return NULL for realloc(NULL,0)
+    // todo: adjust potential padding to reflect the new size?
+    mi_track_free_size(p, size);
+    mi_track_malloc(p,newsize,true);
+    return p;  // reallocation still fits and not more than 50% waste
+  }
+  void* newp = mi_heap_malloc(heap,newsize);
+  if mi_likely(newp != NULL) {
+    if (zero && newsize > size) {
+      // also set last word in the previous allocation to zero to ensure any padding is zero-initialized
+      const size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
+      memset((uint8_t*)newp + start, 0, newsize - start);
+    }
+    if mi_likely(p != NULL) {
+      if mi_likely(_mi_is_aligned(p, sizeof(uintptr_t))) {  // a client may pass in an arbitrary pointer `p`..
+        const size_t copysize = (newsize > size ? size : newsize);
+        mi_track_mem_defined(p,copysize);  // _mi_useable_size may be too large for byte precise memory tracking..
+        _mi_memcpy_aligned(newp, p, copysize);
+      }
+      mi_free(p); // only free the original pointer if successful
+    }
+  }
+  return newp;
+}
+
+mi_decl_nodiscard void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+  return _mi_heap_realloc_zero(heap, p, newsize, false);
+}
+
+mi_decl_nodiscard void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(count, size, &total)) return NULL;
+  return mi_heap_realloc(heap, p, total);
+}
+
+
+// Reallocate but free `p` on errors
+mi_decl_nodiscard void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+  void* newp = mi_heap_realloc(heap, p, newsize);
+  if (newp==NULL && p!=NULL) mi_free(p);
+  return newp;
+}
+
+mi_decl_nodiscard void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+  return _mi_heap_realloc_zero(heap, p, newsize, true);
+}
+
+mi_decl_nodiscard void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(count, size, &total)) return NULL;
+  return mi_heap_rezalloc(heap, p, total);
+}
+
+
+mi_decl_nodiscard void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept {
+  return mi_heap_realloc(mi_get_default_heap(),p,newsize);
+}
+
+mi_decl_nodiscard void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept {
+  return mi_heap_reallocn(mi_get_default_heap(),p,count,size);
+}
+
+// Reallocate but free `p` on errors
+mi_decl_nodiscard void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept {
+  return mi_heap_reallocf(mi_get_default_heap(),p,newsize);
+}
+
+mi_decl_nodiscard void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept {
+  return mi_heap_rezalloc(mi_get_default_heap(), p, newsize);
+}
+
+mi_decl_nodiscard void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept {
+  return mi_heap_recalloc(mi_get_default_heap(), p, count, size);
+}
+
+
+
+// ------------------------------------------------------
+// strdup, strndup, and realpath
+// ------------------------------------------------------
+
+// `strdup` using mi_malloc
+mi_decl_nodiscard mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept {
+  if (s == NULL) return NULL;
+  size_t n = strlen(s);
+  char* t = (char*)mi_heap_malloc(heap,n+1);
+  if (t == NULL) return NULL;
+  _mi_memcpy(t, s, n);
+  t[n] = 0;
+  return t;
+}
+
+mi_decl_nodiscard mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept {
+  return mi_heap_strdup(mi_get_default_heap(), s);
+}
+
+// `strndup` using mi_malloc
+mi_decl_nodiscard mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept {
+  if (s == NULL) return NULL;
+  const char* end = (const char*)memchr(s, 0, n);  // find end of string in the first `n` characters (returns NULL if not found)
+  const size_t m = (end != NULL ? (size_t)(end - s) : n);  // `m` is the minimum of `n` or the end-of-string
+  mi_assert_internal(m <= n);
+  char* t = (char*)mi_heap_malloc(heap, m+1);
+  if (t == NULL) return NULL;
+  _mi_memcpy(t, s, m);
+  t[m] = 0;
+  return t;
+}
+
+mi_decl_nodiscard mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept {
+  return mi_heap_strndup(mi_get_default_heap(),s,n);
+}
+
+#ifndef __wasi__
+// `realpath` using mi_malloc
+#ifdef _WIN32
+#ifndef PATH_MAX
+#define PATH_MAX MAX_PATH
+#endif
+#include <windows.h>
+mi_decl_nodiscard mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
+  // todo: use GetFullPathNameW to allow longer file names
+  char buf[PATH_MAX];
+  DWORD res = GetFullPathNameA(fname, PATH_MAX, (resolved_name == NULL ? buf : resolved_name), NULL);
+  if (res == 0) {
+    errno = GetLastError(); return NULL;
+  }
+  else if (res > PATH_MAX) {
+    errno = EINVAL; return NULL;
+  }
+  else if (resolved_name != NULL) {
+    return resolved_name;
+  }
+  else {
+    return mi_heap_strndup(heap, buf, PATH_MAX);
+  }
+}
+#else
+/*
+#include <unistd.h>  // pathconf
+static size_t mi_path_max(void) {
+  static size_t path_max = 0;
+  if (path_max <= 0) {
+    long m = pathconf("/",_PC_PATH_MAX);
+    if (m <= 0) path_max = 4096;      // guess
+    else if (m < 256) path_max = 256; // at least 256
+    else path_max = m;
+  }
+  return path_max;
+}
+*/
+char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
+  if (resolved_name != NULL) {
+    return realpath(fname,resolved_name);
+  }
+  else {
+    char* rname = realpath(fname, NULL);
+    if (rname == NULL) return NULL;
+    char* result = mi_heap_strdup(heap, rname);
+    free(rname);  // use regular free! (which may be redirected to our free but that's ok)
+    return result;
+  }
+  /*
+    const size_t n  = mi_path_max();
+    char* buf = (char*)mi_malloc(n+1);
+    if (buf == NULL) {
+      errno = ENOMEM;
+      return NULL;
+    }
+    char* rname  = realpath(fname,buf);
+    char* result = mi_heap_strndup(heap,rname,n); // ok if `rname==NULL`
+    mi_free(buf);
+    return result;
+  }
+  */
+}
+#endif
+
+mi_decl_nodiscard mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept {
+  return mi_heap_realpath(mi_get_default_heap(),fname,resolved_name);
+}
+#endif
+
+/*-------------------------------------------------------
+C++ new and new_aligned
+The standard requires calling into `get_new_handler` and
+throwing the bad_alloc exception on failure. If we compile
+with a C++ compiler we can implement this precisely. If we
+use a C compiler we cannot throw a `bad_alloc` exception
+but we call `exit` instead (i.e. not returning).
+-------------------------------------------------------*/
+
+#if 0
+#ifdef __cplusplus
+#include <new>
+static bool mi_try_new_handler(bool nothrow) {
+  #if defined(_MSC_VER) || (__cplusplus >= 201103L)
+    std::new_handler h = std::get_new_handler();
+  #else
+    std::new_handler h = std::set_new_handler();
+    std::set_new_handler(h);
+  #endif
+  if (h==NULL) {
+    _mi_error_message(ENOMEM, "out of memory in 'new'");
+    if (!nothrow) {
+      throw std::bad_alloc();
+    }
+    return false;
+  }
+  else {
+    h();
+    return true;
+  }
+}
+#else
+typedef void (*std_new_handler_t)(void);
+
+#if (defined(__GNUC__) || (defined(__clang__) && !defined(_MSC_VER)))  // exclude clang-cl, see issue #631
+std_new_handler_t __attribute__((weak)) _ZSt15get_new_handlerv(void) {
+  return NULL;
+}
+static std_new_handler_t mi_get_new_handler(void) {
+  return _ZSt15get_new_handlerv();
+}
+#else
+// note: on windows we could dynamically link to `?get_new_handler@std@@YAP6AXXZXZ`.
+static std_new_handler_t mi_get_new_handler() {
+  return NULL;
+}
+#endif
+
+static bool mi_try_new_handler(bool nothrow) {
+  std_new_handler_t h = mi_get_new_handler();
+  if (h==NULL) {
+    _mi_error_message(ENOMEM, "out of memory in 'new'");
+    if (!nothrow) {
+      abort();  // cannot throw in plain C, use abort
+    }
+    return false;
+  }
+  else {
+    h();
+    return true;
+  }
+}
+#endif
+
+static mi_decl_noinline void* mi_heap_try_new(mi_heap_t* heap, size_t size, bool nothrow ) {
+  void* p = NULL;
+  while(p == NULL && mi_try_new_handler(nothrow)) {
+    p = mi_heap_malloc(heap,size);
+  }
+  return p;
+}
+
+static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow) {
+  return mi_heap_try_new(mi_get_default_heap(), size, nothrow);
+}
+
+
+mi_decl_nodiscard mi_decl_restrict extern inline void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) {
+  void* p = mi_heap_malloc(heap,size);
+  if mi_unlikely(p == NULL) return mi_heap_try_new(heap, size, false);
+  return p;
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_new(size_t size) {
+  return mi_heap_alloc_new(mi_get_default_heap(), size);
+}
+
+
+mi_decl_nodiscard mi_decl_restrict extern inline void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) {
+  size_t total;
+  if mi_unlikely(mi_count_size_overflow(count, size, &total)) {
+    mi_try_new_handler(false);  // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc
+    return NULL;
+  }
+  else {
+    return mi_heap_alloc_new(heap,total);
+  }
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_new_n(size_t count, size_t size) {
+  return mi_heap_alloc_new_n(mi_get_default_heap(), size, count);
+}
+
+
+mi_decl_nodiscard mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept {
+  void* p = mi_malloc(size);
+  if mi_unlikely(p == NULL) return mi_try_new(size, true);
+  return p;
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) {
+  void* p;
+  do {
+    p = mi_malloc_aligned(size, alignment);
+  }
+  while(p == NULL && mi_try_new_handler(false));
+  return p;
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept {
+  void* p;
+  do {
+    p = mi_malloc_aligned(size, alignment);
+  }
+  while(p == NULL && mi_try_new_handler(true));
+  return p;
+}
+
+mi_decl_nodiscard void* mi_new_realloc(void* p, size_t newsize) {
+  void* q;
+  do {
+    q = mi_realloc(p, newsize);
+  } while (q == NULL && mi_try_new_handler(false));
+  return q;
+}
+
+mi_decl_nodiscard void* mi_new_reallocn(void* p, size_t newcount, size_t size) {
+  size_t total;
+  if mi_unlikely(mi_count_size_overflow(newcount, size, &total)) {
+    mi_try_new_handler(false);  // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc
+    return NULL;
+  }
+  else {
+    return mi_new_realloc(p, total);
+  }
+}
+#endif
+
+// ------------------------------------------------------
+// ensure explicit external inline definitions are emitted!
+// ------------------------------------------------------
+
+#ifdef __cplusplus
+void* _mi_externs[] = {
+  (void*)&_mi_page_malloc,
+  (void*)&_mi_heap_malloc_zero,
+  (void*)&_mi_heap_malloc_zero_ex,
+  (void*)&mi_malloc,
+  (void*)&mi_malloc_small,
+  (void*)&mi_zalloc_small,
+  (void*)&mi_heap_malloc,
+  (void*)&mi_heap_zalloc,
+  (void*)&mi_heap_malloc_small,
+  (void*)&mi_heap_alloc_new,
+  (void*)&mi_heap_alloc_new_n
+};
+#endif
diff --git a/Objects/mimalloc/arena.c b/Objects/mimalloc/arena.c
new file mode 100644
index 00000000000000..80dd47869c9e24
--- /dev/null
+++ b/Objects/mimalloc/arena.c
@@ -0,0 +1,536 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+"Arenas" are fixed area's of OS memory from which we can allocate
+large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 4MiB).
+In contrast to the rest of mimalloc, the arenas are shared between
+threads and need to be accessed using atomic operations.
+
+Currently arenas are only used to for huge OS page (1GiB) reservations,
+or direct OS memory reservations -- otherwise it delegates to direct allocation from the OS.
+In the future, we can expose an API to manually add more kinds of arenas
+which is sometimes needed for embedded devices or shared memory for example.
+(We can also employ this with WASI or `sbrk` systems to reserve large arenas
+ on demand and be able to reuse them efficiently).
+
+The arena allocation needs to be thread safe and we use an atomic bitmap to allocate.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h>  // memset
+#include <errno.h> // ENOMEM
+
+#include "bitmap.h"  // atomic bitmap
+
+
+// os.c
+void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* stats);
+void  _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats);
+
+void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize);
+void  _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats);
+
+bool  _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+bool  _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
+
+
+/* -----------------------------------------------------------
+  Arena allocation
+----------------------------------------------------------- */
+
+// Block info: bit 0 contains the `in_use` bit, the upper bits the
+// size in count of arena blocks.
+typedef uintptr_t mi_block_info_t;
+#define MI_ARENA_BLOCK_SIZE   (MI_SEGMENT_SIZE)        // 64MiB  (must be at least MI_SEGMENT_ALIGN)
+#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2)  // 32MiB
+#define MI_MAX_ARENAS         (64)                     // not more than 126 (since we use 7 bits in the memid and an arena index + 1)
+
+// A memory arena descriptor
+typedef struct mi_arena_s {
+  mi_arena_id_t id;                       // arena id; 0 for non-specific
+  bool     exclusive;                     // only allow allocations if specifically for this arena
+  _Atomic(uint8_t*) start;                // the start of the memory area
+  size_t   block_count;                   // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`)
+  size_t   field_count;                   // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`)
+  int      numa_node;                     // associated NUMA node
+  bool     is_zero_init;                  // is the arena zero initialized?
+  bool     allow_decommit;                // is decommit allowed? if true, is_large should be false and blocks_committed != NULL
+  bool     is_large;                      // large- or huge OS pages (always committed)
+  _Atomic(size_t) search_idx;             // optimization to start the search for free blocks
+  mi_bitmap_field_t* blocks_dirty;        // are the blocks potentially non-zero?
+  mi_bitmap_field_t* blocks_committed;    // are the blocks committed? (can be NULL for memory that cannot be decommitted)
+  mi_bitmap_field_t  blocks_inuse[1];     // in-place bitmap of in-use blocks (of size `field_count`)
+} mi_arena_t;
+
+
+// The available arenas
+static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS];
+static mi_decl_cache_align _Atomic(size_t)      mi_arena_count; // = 0
+
+
+/* -----------------------------------------------------------
+  Arena id's
+  0 is used for non-arena's (like OS memory)
+  id = arena_index + 1
+----------------------------------------------------------- */
+
+static size_t mi_arena_id_index(mi_arena_id_t id) {
+  return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1);
+}
+
+static mi_arena_id_t mi_arena_id_create(size_t arena_index) {
+  mi_assert_internal(arena_index < MI_MAX_ARENAS);
+  mi_assert_internal(MI_MAX_ARENAS <= 126);
+  int id = (int)arena_index + 1;
+  mi_assert_internal(id >= 1 && id <= 127);
+  return id;
+}
+
+mi_arena_id_t _mi_arena_id_none(void) {
+  return 0;
+}
+
+static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) {
+  return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) ||
+          (arena_id == req_arena_id));
+}
+
+
+/* -----------------------------------------------------------
+  Arena allocations get a memory id where the lower 8 bits are
+  the arena id, and the upper bits the block index.
+----------------------------------------------------------- */
+
+// Use `0` as a special id for direct OS allocated memory.
+#define MI_MEMID_OS   0
+
+static size_t mi_arena_memid_create(mi_arena_id_t id, bool exclusive, mi_bitmap_index_t bitmap_index) {
+  mi_assert_internal(((bitmap_index << 8) >> 8) == bitmap_index); // no overflow?
+  mi_assert_internal(id >= 0 && id <= 0x7F);
+  return ((bitmap_index << 8) | ((uint8_t)id & 0x7F) | (exclusive ? 0x80 : 0));
+}
+
+static bool mi_arena_memid_indices(size_t arena_memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) {
+  *bitmap_index = (arena_memid >> 8);
+  mi_arena_id_t id = (int)(arena_memid & 0x7F);
+  *arena_index = mi_arena_id_index(id);
+  return ((arena_memid & 0x80) != 0);
+}
+
+bool _mi_arena_memid_is_suitable(size_t arena_memid, mi_arena_id_t request_arena_id) {
+  mi_arena_id_t id = (int)(arena_memid & 0x7F);
+  bool exclusive = ((arena_memid & 0x80) != 0);
+  return mi_arena_id_is_suitable(id, exclusive, request_arena_id);
+}
+
+static size_t mi_block_count_of_size(size_t size) {
+  return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE);
+}
+
+/* -----------------------------------------------------------
+  Thread safe allocation in an arena
+----------------------------------------------------------- */
+static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx)
+{
+  size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx);  // start from last search; ok to be relaxed as the exact start does not matter
+  if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx)) {
+    mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx));  // start search from found location next time around
+    return true;
+  };
+  return false;
+}
+
+
+/* -----------------------------------------------------------
+  Arena Allocation
+----------------------------------------------------------- */
+
+static mi_decl_noinline void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
+                                                   bool* commit, bool* large, bool* is_pinned, bool* is_zero, 
+                                                   mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld)
+{
+  MI_UNUSED(arena_index);
+  mi_assert_internal(mi_arena_id_index(arena->id) == arena_index);
+  if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL;
+
+  mi_bitmap_index_t bitmap_index;
+  if (!mi_arena_alloc(arena, needed_bcount, &bitmap_index)) return NULL;
+
+  // claimed it! set the dirty bits (todo: no need for an atomic op here?)
+  void* p    = arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE);
+  *memid     = mi_arena_memid_create(arena->id, arena->exclusive, bitmap_index);
+  *is_zero   = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
+  *large     = arena->is_large;
+  *is_pinned = (arena->is_large || !arena->allow_decommit);
+  if (arena->blocks_committed == NULL) {
+    // always committed
+    *commit = true;
+  }
+  else if (*commit) {
+    // arena not committed as a whole, but commit requested: ensure commit now
+    bool any_uncommitted;
+    _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
+    if (any_uncommitted) {
+      bool commit_zero;
+      _mi_os_commit(p, needed_bcount * MI_ARENA_BLOCK_SIZE, &commit_zero, tld->stats);
+      if (commit_zero) *is_zero = true;
+    }
+  }
+  else {
+    // no need to commit, but check if already fully committed
+    *commit = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
+  }
+  return p;
+}
+
+// allocate from an arena with fallback to the OS
+static mi_decl_noinline void* mi_arena_allocate(int numa_node, size_t size, size_t alignment, bool* commit, bool* large,
+                                                bool* is_pinned, bool* is_zero,
+                                                mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld )
+{
+  MI_UNUSED_RELEASE(alignment);
+  mi_assert_internal(alignment <= MI_SEGMENT_ALIGN);
+  const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+  const size_t bcount = mi_block_count_of_size(size);
+  if mi_likely(max_arena == 0) return NULL;
+  mi_assert_internal(size <= bcount * MI_ARENA_BLOCK_SIZE);
+
+  size_t arena_index = mi_arena_id_index(req_arena_id);
+  if (arena_index < MI_MAX_ARENAS) {
+    // try a specific arena if requested
+    mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[arena_index]);
+    if ((arena != NULL) &&
+        (arena->numa_node < 0 || arena->numa_node == numa_node) && // numa local?
+        (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
+    {
+      void* p = mi_arena_alloc_from(arena, arena_index, bcount, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
+      mi_assert_internal((uintptr_t)p % alignment == 0);
+      if (p != NULL) return p;
+    }
+  }
+  else {
+    // try numa affine allocation
+    for (size_t i = 0; i < max_arena; i++) {
+      mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
+      if (arena == NULL) break; // end reached
+      if ((arena->numa_node < 0 || arena->numa_node == numa_node) && // numa local?
+          (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
+      {
+        void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
+        mi_assert_internal((uintptr_t)p % alignment == 0);
+        if (p != NULL) return p;
+      }
+    }
+
+    // try from another numa node instead..
+    for (size_t i = 0; i < max_arena; i++) {
+      mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
+      if (arena == NULL) break; // end reached
+      if ((arena->numa_node >= 0 && arena->numa_node != numa_node) && // not numa local!
+          (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
+      {
+        void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
+        mi_assert_internal((uintptr_t)p % alignment == 0);
+        if (p != NULL) return p;
+      }
+    }
+  }
+  return NULL;
+}
+
+void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool* commit, bool* large, bool* is_pinned, bool* is_zero,
+                              mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld)
+{
+  mi_assert_internal(commit != NULL && is_pinned != NULL && is_zero != NULL && memid != NULL && tld != NULL);
+  mi_assert_internal(size > 0);
+  *memid   = MI_MEMID_OS;
+  *is_zero = false;
+  *is_pinned = false;
+
+  bool default_large = false;
+  if (large == NULL) large = &default_large;   // ensure `large != NULL`
+  const int numa_node = _mi_os_numa_node(tld); // current numa node
+
+  // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data)
+  if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) {
+    void* p = mi_arena_allocate(numa_node, size, alignment, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
+    if (p != NULL) return p;
+  }
+
+  // finally, fall back to the OS
+  if (mi_option_is_enabled(mi_option_limit_os_alloc) || req_arena_id != _mi_arena_id_none()) {
+    errno = ENOMEM;
+    return NULL;
+  }
+  *is_zero = true;
+  *memid   = MI_MEMID_OS;
+  void* p = _mi_os_alloc_aligned_offset(size, alignment, align_offset, *commit, large, tld->stats);
+  if (p != NULL) { *is_pinned = *large; }
+  return p;
+}
+
+void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld)
+{
+  return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
+}
+
+void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) {
+  if (size != NULL) *size = 0;
+  size_t arena_index = mi_arena_id_index(arena_id);
+  if (arena_index >= MI_MAX_ARENAS) return NULL;
+  mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[arena_index]);
+  if (arena == NULL) return NULL;
+  if (size != NULL) *size = arena->block_count * MI_ARENA_BLOCK_SIZE;
+  return arena->start;
+}
+
+/* -----------------------------------------------------------
+  Arena free
+----------------------------------------------------------- */
+
+void _mi_arena_free(void* p, size_t size, size_t alignment, size_t align_offset, size_t memid, bool all_committed, mi_stats_t* stats) {
+  mi_assert_internal(size > 0 && stats != NULL);
+  if (p==NULL) return;
+  if (size==0) return;
+
+  if (memid == MI_MEMID_OS) {
+    // was a direct OS allocation, pass through
+    _mi_os_free_aligned(p, size, alignment, align_offset, all_committed, stats);
+  }
+  else {
+    // allocated in an arena
+    mi_assert_internal(align_offset == 0);
+    size_t arena_idx;
+    size_t bitmap_idx;
+    mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx);
+    mi_assert_internal(arena_idx < MI_MAX_ARENAS);
+    mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t,&mi_arenas[arena_idx]);
+    mi_assert_internal(arena != NULL);
+    const size_t blocks = mi_block_count_of_size(size);
+    // checks
+    if (arena == NULL) {
+      _mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
+      return;
+    }
+    mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx));
+    if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) {
+      _mi_error_message(EINVAL, "trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
+      return;
+    }
+    // potentially decommit
+    if (!arena->allow_decommit || arena->blocks_committed == NULL) {
+      mi_assert_internal(all_committed); // note: may be not true as we may "pretend" to be not committed (in segment.c)
+    }
+    else {
+      mi_assert_internal(arena->blocks_committed != NULL);
+      _mi_os_decommit(p, blocks * MI_ARENA_BLOCK_SIZE, stats); // ok if this fails
+      _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
+    }
+    // and make it available to others again
+    bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx);
+    if (!all_inuse) {
+      _mi_error_message(EAGAIN, "trying to free an already freed block: %p, size %zu\n", p, size);
+      return;
+    };
+  }
+}
+
+/* -----------------------------------------------------------
+  Add an arena.
+----------------------------------------------------------- */
+
+static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id) {
+  mi_assert_internal(arena != NULL);
+  mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0);
+  mi_assert_internal(arena->block_count > 0);
+  if (arena_id != NULL) *arena_id = -1;
+
+  size_t i = mi_atomic_increment_acq_rel(&mi_arena_count);
+  if (i >= MI_MAX_ARENAS) {
+    mi_atomic_decrement_acq_rel(&mi_arena_count);
+    return false;
+  }
+  mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena);
+  arena->id = mi_arena_id_create(i);
+  if (arena_id != NULL) *arena_id = arena->id;
+  return true;
+}
+
+bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept
+{
+  if (arena_id != NULL) *arena_id = _mi_arena_id_none();
+  if (size < MI_ARENA_BLOCK_SIZE) return false;
+
+  if (is_large) {
+    mi_assert_internal(is_committed);
+    is_committed = true;
+  }
+
+  const size_t bcount = size / MI_ARENA_BLOCK_SIZE;
+  const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS);
+  const size_t bitmaps = (is_committed ? 2 : 3);
+  const size_t asize  = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t));
+  mi_arena_t* arena   = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
+  if (arena == NULL) return false;
+
+  arena->id = _mi_arena_id_none();
+  arena->exclusive = exclusive;
+  arena->block_count = bcount;
+  arena->field_count = fields;
+  arena->start = (uint8_t*)start;
+  arena->numa_node    = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
+  arena->is_large     = is_large;
+  arena->is_zero_init = is_zero;
+  arena->allow_decommit = !is_large && !is_committed; // only allow decommit for initially uncommitted memory
+  arena->search_idx   = 0;
+  arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap
+  arena->blocks_committed = (!arena->allow_decommit ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap
+  // the bitmaps are already zero initialized due to os_alloc
+  // initialize committed bitmap?
+  if (arena->blocks_committed != NULL && is_committed) {
+    memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning
+  }
+  // and claim leftover blocks if needed (so we never allocate there)
+  ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
+  mi_assert_internal(post >= 0);
+  if (post > 0) {
+    // don't use leftover bits at the end
+    mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post);
+    _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL);
+  }
+
+  return mi_arena_add(arena, arena_id);
+
+}
+
+// Reserve a range of regular OS memory
+int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept
+{
+  if (arena_id != NULL) *arena_id = _mi_arena_id_none();
+  size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block
+  bool large = allow_large;
+  void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, &large, &_mi_stats_main);
+  if (start==NULL) return ENOMEM;
+  if (!mi_manage_os_memory_ex(start, size, (large || commit), large, true, -1, exclusive, arena_id)) {
+    _mi_os_free_ex(start, size, commit, &_mi_stats_main);
+    _mi_verbose_message("failed to reserve %zu k memory\n", _mi_divide_up(size,1024));
+    return ENOMEM;
+  }
+  _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size,1024), large ? " (in large os pages)" : "");
+  return 0;
+}
+
+bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept {
+  return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false, NULL);
+}
+
+int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept {
+  return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL);
+}
+
+
+/* -----------------------------------------------------------
+  Debugging
+----------------------------------------------------------- */
+
+static size_t mi_debug_show_bitmap(const char* prefix, mi_bitmap_field_t* fields, size_t field_count ) {
+  size_t inuse_count = 0;
+  for (size_t i = 0; i < field_count; i++) {
+    char buf[MI_BITMAP_FIELD_BITS + 1];
+    uintptr_t field = mi_atomic_load_relaxed(&fields[i]);
+    for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++) {
+      bool inuse = ((((uintptr_t)1 << bit) & field) != 0);
+      if (inuse) inuse_count++;
+      buf[MI_BITMAP_FIELD_BITS - 1 - bit] = (inuse ? 'x' : '.');
+    }
+    buf[MI_BITMAP_FIELD_BITS] = 0;
+    _mi_verbose_message("%s%s\n", prefix, buf);
+  }
+  return inuse_count;
+}
+
+void mi_debug_show_arenas(void) mi_attr_noexcept {
+  size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count);
+  for (size_t i = 0; i < max_arenas; i++) {
+    mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
+    if (arena == NULL) break;
+    size_t inuse_count = 0;
+    _mi_verbose_message("arena %zu: %zu blocks with %zu fields\n", i, arena->block_count, arena->field_count);
+    inuse_count += mi_debug_show_bitmap("  ", arena->blocks_inuse, arena->field_count);
+    _mi_verbose_message("  blocks in use ('x'): %zu\n", inuse_count);
+  }
+}
+
+
+/* -----------------------------------------------------------
+  Reserve a huge page arena.
+----------------------------------------------------------- */
+// reserve at a specific numa node
+int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
+  if (arena_id != NULL) *arena_id = -1;
+  if (pages==0) return 0;
+  if (numa_node < -1) numa_node = -1;
+  if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count();
+  size_t hsize = 0;
+  size_t pages_reserved = 0;
+  void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize);
+  if (p==NULL || pages_reserved==0) {
+    _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages);
+    return ENOMEM;
+  }
+  _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages);
+
+  if (!mi_manage_os_memory_ex(p, hsize, true, true, true, numa_node, exclusive, arena_id)) {
+    _mi_os_free_huge_pages(p, hsize, &_mi_stats_main);
+    return ENOMEM;
+  }
+  return 0;
+}
+
+int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept {
+  return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL);
+}
+
+// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected)
+int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept {
+  if (pages == 0) return 0;
+
+  // pages per numa node
+  size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count());
+  if (numa_count <= 0) numa_count = 1;
+  const size_t pages_per = pages / numa_count;
+  const size_t pages_mod = pages % numa_count;
+  const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50);
+
+  // reserve evenly among numa nodes
+  for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
+    size_t node_pages = pages_per;  // can be 0
+    if (numa_node < pages_mod) node_pages++;
+    int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per);
+    if (err) return err;
+    if (pages < node_pages) {
+      pages = 0;
+    }
+    else {
+      pages -= node_pages;
+    }
+  }
+
+  return 0;
+}
+
+int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept {
+  MI_UNUSED(max_secs);
+  _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n");
+  if (pages_reserved != NULL) *pages_reserved = 0;
+  int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0));
+  if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
+  return err;
+}
diff --git a/Objects/mimalloc/bitmap.c b/Objects/mimalloc/bitmap.c
new file mode 100644
index 00000000000000..4ea9f4afa7322e
--- /dev/null
+++ b/Objects/mimalloc/bitmap.c
@@ -0,0 +1,414 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2021 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+Concurrent bitmap that can set/reset sequences of bits atomically,
+represeted as an array of fields where each field is a machine word (`size_t`)
+
+There are two api's; the standard one cannot have sequences that cross
+between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
+(this is used in region allocation)
+
+The `_across` postfixed functions do allow sequences that can cross over
+between the fields. (This is used in arena allocation)
+---------------------------------------------------------------------------- */
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "bitmap.h"
+
+/* -----------------------------------------------------------
+  Bitmap definition
+----------------------------------------------------------- */
+
+// The bit mask for a given number of blocks at a specified bit index.
+static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) {
+  mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
+  mi_assert_internal(count > 0);
+  if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL;
+  if (count == 0) return 0;
+  return ((((size_t)1 << count) - 1) << bitidx);
+}
+
+
+/* -----------------------------------------------------------
+  Claim a bit sequence atomically
+----------------------------------------------------------- */
+
+// Try to atomically claim a sequence of `count` bits in a single
+// field at `idx` in `bitmap`. Returns `true` on success.
+inline bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
+{
+  mi_assert_internal(bitmap_idx != NULL);
+  mi_assert_internal(count <= MI_BITMAP_FIELD_BITS);
+  mi_assert_internal(count > 0);
+  mi_bitmap_field_t* field = &bitmap[idx];
+  size_t map  = mi_atomic_load_relaxed(field);
+  if (map==MI_BITMAP_FIELD_FULL) return false; // short cut
+
+  // search for 0-bit sequence of length count
+  const size_t mask = mi_bitmap_mask_(count, 0);
+  const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count;
+
+#ifdef MI_HAVE_FAST_BITSCAN
+  size_t bitidx = mi_ctz(~map);    // quickly find the first zero bit if possible
+#else
+  size_t bitidx = 0;               // otherwise start at 0
+#endif
+  size_t m = (mask << bitidx);     // invariant: m == mask shifted by bitidx
+
+  // scan linearly for a free range of zero bits
+  while (bitidx <= bitidx_max) {
+    const size_t mapm = map & m;
+    if (mapm == 0) {  // are the mask bits free at bitidx?
+      mi_assert_internal((m >> bitidx) == mask); // no overflow?
+      const size_t newmap = map | m;
+      mi_assert_internal((newmap^map) >> bitidx == mask);
+      if (!mi_atomic_cas_weak_acq_rel(field, &map, newmap)) {  // TODO: use strong cas here?
+        // no success, another thread claimed concurrently.. keep going (with updated `map`)
+        continue;
+      }
+      else {
+        // success, we claimed the bits!
+        *bitmap_idx = mi_bitmap_index_create(idx, bitidx);
+        return true;
+      }
+    }
+    else {
+      // on to the next bit range
+#ifdef MI_HAVE_FAST_BITSCAN
+      const size_t shift = (count == 1 ? 1 : mi_bsr(mapm) - bitidx + 1);
+      mi_assert_internal(shift > 0 && shift <= count);
+#else
+      const size_t shift = 1;
+#endif
+      bitidx += shift;
+      m <<= shift;
+    }
+  }
+  // no bits found
+  return false;
+}
+
+// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
+// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
+// `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
+bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
+  size_t idx = start_field_idx;
+  for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
+    if (idx >= bitmap_fields) idx = 0; // wrap
+    if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Like _mi_bitmap_try_find_from_claim but with an extra predicate that must be fullfilled
+bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap_fields, 
+            const size_t start_field_idx, const size_t count, 
+            mi_bitmap_pred_fun_t pred_fun, void* pred_arg,            
+            mi_bitmap_index_t* bitmap_idx) {
+  size_t idx = start_field_idx;
+  for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
+    if (idx >= bitmap_fields) idx = 0; // wrap
+    if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
+      if (pred_fun == NULL || pred_fun(*bitmap_idx, pred_arg)) { 
+        return true;
+      }
+      // predicate returned false, unclaim and look further
+      _mi_bitmap_unclaim(bitmap, bitmap_fields, count, *bitmap_idx);
+    }
+  }
+  return false;
+}
+
+/*
+// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
+// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never span fields.
+bool _mi_bitmap_try_find_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t count, mi_bitmap_index_t* bitmap_idx) {
+  return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, 0, count, bitmap_idx);
+}
+*/
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  const size_t idx = mi_bitmap_index_field(bitmap_idx);
+  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+  const size_t mask = mi_bitmap_mask_(count, bitidx);
+  mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
+  // mi_assert_internal((bitmap[idx] & mask) == mask);
+  size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask);
+  return ((prev & mask) == mask);
+}
+
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) {
+  const size_t idx = mi_bitmap_index_field(bitmap_idx);
+  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+  const size_t mask = mi_bitmap_mask_(count, bitidx);
+  mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
+  //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0);
+  size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask);
+  if (any_zero != NULL) *any_zero = ((prev & mask) != mask);
+  return ((prev & mask) == 0);
+}
+
+// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one.
+static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) {
+  const size_t idx = mi_bitmap_index_field(bitmap_idx);
+  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+  const size_t mask = mi_bitmap_mask_(count, bitidx);
+  mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
+  size_t field = mi_atomic_load_relaxed(&bitmap[idx]);
+  if (any_ones != NULL) *any_ones = ((field & mask) != 0);
+  return ((field & mask) == mask);
+}
+
+bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL);
+}
+
+bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  bool any_ones;
+  mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
+  return any_ones;
+}
+
+
+//--------------------------------------------------------------------------
+// the `_across` functions work on bitmaps where sequences can cross over
+// between the fields. This is used in arena allocation
+//--------------------------------------------------------------------------
+
+// Try to atomically claim a sequence of `count` bits starting from the field
+// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success.
+static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx)
+{
+  mi_assert_internal(bitmap_idx != NULL);
+
+  // check initial trailing zeros
+  mi_bitmap_field_t* field = &bitmap[idx];
+  size_t map = mi_atomic_load_relaxed(field);
+  const size_t initial = mi_clz(map);  // count of initial zeros starting at idx
+  mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS);
+  if (initial == 0)     return false;
+  if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx);     // no need to cross fields
+  if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries
+
+  // scan ahead
+  size_t found = initial;
+  size_t mask = 0;     // mask bits for the final field
+  while(found < count) {
+    field++;
+    map = mi_atomic_load_relaxed(field);
+    const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found));
+    mask = mi_bitmap_mask_(mask_bits, 0);
+    if ((map & mask) != 0) return false;
+    found += mask_bits;
+  }
+  mi_assert_internal(field < &bitmap[bitmap_fields]);
+
+  // found range of zeros up to the final field; mask contains mask in the final field
+  // now claim it atomically
+  mi_bitmap_field_t* const final_field = field;
+  const size_t final_mask = mask;
+  mi_bitmap_field_t* const initial_field = &bitmap[idx];
+  const size_t initial_mask = mi_bitmap_mask_(initial, MI_BITMAP_FIELD_BITS - initial);
+
+  // initial field
+  size_t newmap;
+  field = initial_field;
+  map = mi_atomic_load_relaxed(field);
+  do {
+    newmap = map | initial_mask;
+    if ((map & initial_mask) != 0) { goto rollback; };
+  } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
+
+  // intermediate fields
+  while (++field < final_field) {
+    newmap = MI_BITMAP_FIELD_FULL;
+    map = 0;
+    if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; }
+  }
+
+  // final field
+  mi_assert_internal(field == final_field);
+  map = mi_atomic_load_relaxed(field);
+  do {
+    newmap = map | final_mask;
+    if ((map & final_mask) != 0) { goto rollback; }
+  } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
+
+  // claimed!
+  *bitmap_idx = mi_bitmap_index_create(idx, MI_BITMAP_FIELD_BITS - initial);
+  return true;
+
+rollback:
+  // roll back intermediate fields
+  while (--field > initial_field) {
+    newmap = 0;
+    map = MI_BITMAP_FIELD_FULL;
+    mi_assert_internal(mi_atomic_load_relaxed(field) == map);
+    mi_atomic_store_release(field, newmap);
+  }
+  if (field == initial_field) {
+    map = mi_atomic_load_relaxed(field);
+    do {
+      mi_assert_internal((map & initial_mask) == initial_mask);
+      newmap = map & ~initial_mask;
+    } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
+  }
+  // retry? (we make a recursive call instead of goto to be able to use const declarations)
+  if (retries < 4) {
+    return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx);
+  }
+  else {
+    return false;
+  }
+}
+
+
+// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
+// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
+bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
+  mi_assert_internal(count > 0);
+  if (count==1) return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx);
+  size_t idx = start_field_idx;
+  for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
+    if (idx >= bitmap_fields) idx = 0; // wrap
+    // try to claim inside the field
+    if (count <= MI_BITMAP_FIELD_BITS) {
+      if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
+        return true;
+      }
+    }
+    // try to claim across fields
+    if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Helper for masks across fields; returns the mid count, post_mask may be 0
+static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) {
+  MI_UNUSED_RELEASE(bitmap_fields);
+  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+  if mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS) {
+    *pre_mask = mi_bitmap_mask_(count, bitidx);
+    *mid_mask = 0;
+    *post_mask = 0;
+    mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields);
+    return 0;
+  }
+  else {
+    const size_t pre_bits = MI_BITMAP_FIELD_BITS - bitidx;
+    mi_assert_internal(pre_bits < count);
+    *pre_mask = mi_bitmap_mask_(pre_bits, bitidx);
+    count -= pre_bits;
+    const size_t mid_count = (count / MI_BITMAP_FIELD_BITS);
+    *mid_mask = MI_BITMAP_FIELD_FULL;
+    count %= MI_BITMAP_FIELD_BITS;
+    *post_mask = (count==0 ? 0 : mi_bitmap_mask_(count, 0));
+    mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + mid_count + (count==0 ? 0 : 1) < bitmap_fields);
+    return mid_count;
+  }
+}
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  size_t idx = mi_bitmap_index_field(bitmap_idx);
+  size_t pre_mask;
+  size_t mid_mask;
+  size_t post_mask;
+  size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
+  bool all_one = true;
+  mi_bitmap_field_t* field = &bitmap[idx];
+  size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask);
+  if ((prev & pre_mask) != pre_mask) all_one = false;
+  while(mid_count-- > 0) {
+    prev = mi_atomic_and_acq_rel(field++, ~mid_mask);
+    if ((prev & mid_mask) != mid_mask) all_one = false;
+  }
+  if (post_mask!=0) {
+    prev = mi_atomic_and_acq_rel(field, ~post_mask);
+    if ((prev & post_mask) != post_mask) all_one = false;
+  }
+  return all_one;
+}
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) {
+  size_t idx = mi_bitmap_index_field(bitmap_idx);
+  size_t pre_mask;
+  size_t mid_mask;
+  size_t post_mask;
+  size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
+  bool all_zero = true;
+  bool any_zero = false;
+  _Atomic(size_t)*field = &bitmap[idx];
+  size_t prev = mi_atomic_or_acq_rel(field++, pre_mask);
+  if ((prev & pre_mask) != 0) all_zero = false;
+  if ((prev & pre_mask) != pre_mask) any_zero = true;
+  while (mid_count-- > 0) {
+    prev = mi_atomic_or_acq_rel(field++, mid_mask);
+    if ((prev & mid_mask) != 0) all_zero = false;
+    if ((prev & mid_mask) != mid_mask) any_zero = true;
+  }
+  if (post_mask!=0) {
+    prev = mi_atomic_or_acq_rel(field, post_mask);
+    if ((prev & post_mask) != 0) all_zero = false;
+    if ((prev & post_mask) != post_mask) any_zero = true;
+  }
+  if (pany_zero != NULL) *pany_zero = any_zero;
+  return all_zero;
+}
+
+
+// Returns `true` if all `count` bits were 1.
+// `any_ones` is `true` if there was at least one bit set to one.
+static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) {
+  size_t idx = mi_bitmap_index_field(bitmap_idx);
+  size_t pre_mask;
+  size_t mid_mask;
+  size_t post_mask;
+  size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
+  bool all_ones = true;
+  bool any_ones = false;
+  mi_bitmap_field_t* field = &bitmap[idx];
+  size_t prev = mi_atomic_load_relaxed(field++);
+  if ((prev & pre_mask) != pre_mask) all_ones = false;
+  if ((prev & pre_mask) != 0) any_ones = true;
+  while (mid_count-- > 0) {
+    prev = mi_atomic_load_relaxed(field++);
+    if ((prev & mid_mask) != mid_mask) all_ones = false;
+    if ((prev & mid_mask) != 0) any_ones = true;
+  }
+  if (post_mask!=0) {
+    prev = mi_atomic_load_relaxed(field);
+    if ((prev & post_mask) != post_mask) all_ones = false;
+    if ((prev & post_mask) != 0) any_ones = true;
+  }
+  if (pany_ones != NULL) *pany_ones = any_ones;
+  return all_ones;
+}
+
+bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL);
+}
+
+bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  bool any_ones;
+  mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
+  return any_ones;
+}
diff --git a/Objects/mimalloc/bitmap.h b/Objects/mimalloc/bitmap.h
new file mode 100644
index 00000000000000..0c501ec1feac3b
--- /dev/null
+++ b/Objects/mimalloc/bitmap.h
@@ -0,0 +1,111 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2020 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+Concurrent bitmap that can set/reset sequences of bits atomically,
+represeted as an array of fields where each field is a machine word (`size_t`)
+
+There are two api's; the standard one cannot have sequences that cross
+between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
+(this is used in region allocation)
+
+The `_across` postfixed functions do allow sequences that can cross over
+between the fields. (This is used in arena allocation)
+---------------------------------------------------------------------------- */
+#pragma once
+#ifndef MI_BITMAP_H
+#define MI_BITMAP_H
+
+/* -----------------------------------------------------------
+  Bitmap definition
+----------------------------------------------------------- */
+
+#define MI_BITMAP_FIELD_BITS   (8*MI_SIZE_SIZE)
+#define MI_BITMAP_FIELD_FULL   (~((size_t)0))   // all bits set
+
+// An atomic bitmap of `size_t` fields
+typedef _Atomic(size_t)  mi_bitmap_field_t;
+typedef mi_bitmap_field_t*  mi_bitmap_t;
+
+// A bitmap index is the index of the bit in a bitmap.
+typedef size_t mi_bitmap_index_t;
+
+// Create a bit index.
+static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
+  mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
+  return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
+}
+
+// Create a bit index.
+static inline mi_bitmap_index_t mi_bitmap_index_create_from_bit(size_t full_bitidx) {  
+  return mi_bitmap_index_create(full_bitidx / MI_BITMAP_FIELD_BITS, full_bitidx % MI_BITMAP_FIELD_BITS);
+}
+
+// Get the field index from a bit index.
+static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
+  return (bitmap_idx / MI_BITMAP_FIELD_BITS);
+}
+
+// Get the bit index in a bitmap field
+static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) {
+  return (bitmap_idx % MI_BITMAP_FIELD_BITS);
+}
+
+// Get the full bit index
+static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
+  return bitmap_idx;
+}
+
+/* -----------------------------------------------------------
+  Claim a bit sequence atomically
+----------------------------------------------------------- */
+
+// Try to atomically claim a sequence of `count` bits in a single
+// field at `idx` in `bitmap`. Returns `true` on success.
+bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+
+// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
+// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
+bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+
+// Like _mi_bitmap_try_find_from_claim but with an extra predicate that must be fullfilled
+typedef bool (mi_cdecl *mi_bitmap_pred_fun_t)(mi_bitmap_index_t bitmap_idx, void* pred_arg);
+bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_pred_fun_t pred_fun, void* pred_arg, mi_bitmap_index_t* bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero);
+
+bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+
+//--------------------------------------------------------------------------
+// the `_across` functions work on bitmaps where sequences can cross over
+// between the fields. This is used in arena allocation
+//--------------------------------------------------------------------------
+
+// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
+// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
+bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero);
+
+bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+#endif
diff --git a/Objects/mimalloc/heap.c b/Objects/mimalloc/heap.c
new file mode 100644
index 00000000000000..ac2d042bfd20cf
--- /dev/null
+++ b/Objects/mimalloc/heap.c
@@ -0,0 +1,602 @@
+/*----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h>  // memset, memcpy
+
+#if defined(_MSC_VER) && (_MSC_VER < 1920)
+#pragma warning(disable:4204)  // non-constant aggregate initializer
+#endif
+
+/* -----------------------------------------------------------
+  Helpers
+----------------------------------------------------------- */
+
+// return `true` if ok, `false` to break
+typedef bool (heap_page_visitor_fun)(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2);
+
+// Visit all pages in a heap; returns `false` if break was called.
+static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void* arg1, void* arg2)
+{
+  if (heap==NULL || heap->page_count==0) return 0;
+
+  // visit all pages
+  #if MI_DEBUG>1
+  size_t total = heap->page_count;
+  #endif
+  size_t count = 0;
+  for (size_t i = 0; i <= MI_BIN_FULL; i++) {
+    mi_page_queue_t* pq = &heap->pages[i];
+    mi_page_t* page = pq->first;
+    while(page != NULL) {
+      mi_page_t* next = page->next; // save next in case the page gets removed from the queue
+      mi_assert_internal(mi_page_heap(page) == heap);
+      count++;
+      if (!fn(heap, pq, page, arg1, arg2)) return false;
+      page = next; // and continue
+    }
+  }
+  mi_assert_internal(count == total);
+  return true;
+}
+
+
+#if MI_DEBUG>=2
+static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
+  MI_UNUSED(arg1);
+  MI_UNUSED(arg2);
+  MI_UNUSED(pq);
+  mi_assert_internal(mi_page_heap(page) == heap);
+  mi_segment_t* segment = _mi_page_segment(page);
+  mi_assert_internal(segment->thread_id == heap->thread_id);
+  mi_assert_expensive(_mi_page_is_valid(page));
+  return true;
+}
+#endif
+#if MI_DEBUG>=3
+static bool mi_heap_is_valid(mi_heap_t* heap) {
+  mi_assert_internal(heap!=NULL);
+  mi_heap_visit_pages(heap, &mi_heap_page_is_valid, NULL, NULL);
+  return true;
+}
+#endif
+
+
+
+
+/* -----------------------------------------------------------
+  "Collect" pages by migrating `local_free` and `thread_free`
+  lists and freeing empty pages. This is done when a thread
+  stops (and in that case abandons pages if there are still
+  blocks alive)
+----------------------------------------------------------- */
+
+typedef enum mi_collect_e {
+  MI_NORMAL,
+  MI_FORCE,
+  MI_ABANDON
+} mi_collect_t;
+
+
+static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg_collect, void* arg2 ) {
+  MI_UNUSED(arg2);
+  MI_UNUSED(heap);
+  mi_assert_internal(mi_heap_page_is_valid(heap, pq, page, NULL, NULL));
+  mi_collect_t collect = *((mi_collect_t*)arg_collect);
+  _mi_page_free_collect(page, collect >= MI_FORCE);
+  if (mi_page_all_free(page)) {
+    // no more used blocks, free the page.
+    // note: this will free retired pages as well.
+    _mi_page_free(page, pq, collect >= MI_FORCE);
+  }
+  else if (collect == MI_ABANDON) {
+    // still used blocks but the thread is done; abandon the page
+    _mi_page_abandon(page, pq);
+  }
+  return true; // don't break
+}
+
+static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
+  MI_UNUSED(arg1);
+  MI_UNUSED(arg2);
+  MI_UNUSED(heap);
+  MI_UNUSED(pq);
+  _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
+  return true; // don't break
+}
+
+static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
+{
+  if (heap==NULL || !mi_heap_is_initialized(heap)) return;
+
+  const bool force = collect >= MI_FORCE;  
+  _mi_deferred_free(heap, force);
+
+  // note: never reclaim on collect but leave it to threads that need storage to reclaim 
+  const bool force_main = 
+    #ifdef NDEBUG
+      collect == MI_FORCE
+    #else
+      collect >= MI_FORCE
+    #endif
+      && _mi_is_main_thread() && mi_heap_is_backing(heap) && !heap->no_reclaim;
+
+  if (force_main) {
+    // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments.
+    // if all memory is freed by now, all segments should be freed.
+    _mi_abandoned_reclaim_all(heap, &heap->tld->segments);
+  }
+
+  // if abandoning, mark all pages to no longer add to delayed_free
+  if (collect == MI_ABANDON) {
+    mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL);
+  }
+
+  // free all current thread delayed blocks.
+  // (if abandoning, after this there are no more thread-delayed references into the pages.)
+  _mi_heap_delayed_free_all(heap);
+
+  // collect retired pages
+  _mi_heap_collect_retired(heap, force);
+
+  // collect all pages owned by this thread
+  mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
+  mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL );
+
+  // collect abandoned segments (in particular, decommit expired parts of segments in the abandoned segment list)
+  // note: forced decommit can be quite expensive if many threads are created/destroyed so we do not force on abandonment
+  _mi_abandoned_collect(heap, collect == MI_FORCE /* force? */, &heap->tld->segments);
+
+  // collect segment local caches
+  if (force) {
+    _mi_segment_thread_collect(&heap->tld->segments);
+  }
+
+  // decommit in global segment caches
+  // note: forced decommit can be quite expensive if many threads are created/destroyed so we do not force on abandonment
+  _mi_segment_cache_collect( collect == MI_FORCE, &heap->tld->os);  
+
+  // collect regions on program-exit (or shared library unload)
+  if (force && _mi_is_main_thread() && mi_heap_is_backing(heap)) {
+    //_mi_mem_collect(&heap->tld->os);
+  }
+}
+
+void _mi_heap_collect_abandon(mi_heap_t* heap) {
+  mi_heap_collect_ex(heap, MI_ABANDON);
+}
+
+void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept {
+  mi_heap_collect_ex(heap, (force ? MI_FORCE : MI_NORMAL));
+}
+
+void mi_collect(bool force) mi_attr_noexcept {
+  mi_heap_collect(mi_get_default_heap(), force);
+}
+
+
+/* -----------------------------------------------------------
+  Heap new
+----------------------------------------------------------- */
+
+mi_heap_t* mi_heap_get_default(void) {
+  mi_thread_init();
+  return mi_get_default_heap();
+}
+
+mi_heap_t* mi_heap_get_backing(void) {
+  mi_heap_t* heap = mi_heap_get_default();
+  mi_assert_internal(heap!=NULL);
+  mi_heap_t* bheap = heap->tld->heap_backing;
+  mi_assert_internal(bheap!=NULL);
+  mi_assert_internal(bheap->thread_id == _mi_thread_id());
+  return bheap;
+}
+
+mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena( mi_arena_id_t arena_id ) {
+  mi_heap_t* bheap = mi_heap_get_backing();
+  mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t);  // todo: OS allocate in secure mode?
+  if (heap==NULL) return NULL;
+  _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
+  heap->tld = bheap->tld;
+  heap->thread_id = _mi_thread_id();
+  heap->arena_id = arena_id;
+  _mi_random_split(&bheap->random, &heap->random);
+  heap->cookie  = _mi_heap_random_next(heap) | 1;
+  heap->keys[0] = _mi_heap_random_next(heap);
+  heap->keys[1] = _mi_heap_random_next(heap);
+  heap->no_reclaim = true;  // don't reclaim abandoned pages or otherwise destroy is unsafe
+  // push on the thread local heaps list
+  heap->next = heap->tld->heaps;
+  heap->tld->heaps = heap;
+  return heap;
+}
+
+mi_decl_nodiscard mi_heap_t* mi_heap_new(void) {
+  return mi_heap_new_in_arena(_mi_arena_id_none());
+}
+
+bool _mi_heap_memid_is_suitable(mi_heap_t* heap, size_t memid) {
+  return _mi_arena_memid_is_suitable(memid, heap->arena_id);
+}
+
+uintptr_t _mi_heap_random_next(mi_heap_t* heap) {
+  return _mi_random_next(&heap->random);
+}
+
+// zero out the page queues
+static void mi_heap_reset_pages(mi_heap_t* heap) {
+  mi_assert_internal(heap != NULL);
+  mi_assert_internal(mi_heap_is_initialized(heap));
+  // TODO: copy full empty heap instead?
+  memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
+#ifdef MI_MEDIUM_DIRECT
+  memset(&heap->pages_free_medium, 0, sizeof(heap->pages_free_medium));
+#endif
+  _mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
+  heap->thread_delayed_free = NULL;
+  heap->page_count = 0;
+}
+
+// called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources.
+static void mi_heap_free(mi_heap_t* heap) {
+  mi_assert(heap != NULL);
+  mi_assert_internal(mi_heap_is_initialized(heap));
+  if (heap==NULL || !mi_heap_is_initialized(heap)) return;
+  if (mi_heap_is_backing(heap)) return; // dont free the backing heap
+
+  // reset default
+  if (mi_heap_is_default(heap)) {
+    _mi_heap_set_default_direct(heap->tld->heap_backing);
+  }
+
+  // remove ourselves from the thread local heaps list
+  // linear search but we expect the number of heaps to be relatively small
+  mi_heap_t* prev = NULL;
+  mi_heap_t* curr = heap->tld->heaps;
+  while (curr != heap && curr != NULL) {
+    prev = curr;
+    curr = curr->next;
+  }
+  mi_assert_internal(curr == heap);
+  if (curr == heap) {
+    if (prev != NULL) { prev->next = heap->next; }
+                 else { heap->tld->heaps = heap->next; }
+  }
+  mi_assert_internal(heap->tld->heaps != NULL);
+
+  // and free the used memory
+  mi_free(heap);
+}
+
+
+/* -----------------------------------------------------------
+  Heap destroy
+----------------------------------------------------------- */
+
+static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
+  MI_UNUSED(arg1);
+  MI_UNUSED(arg2);
+  MI_UNUSED(heap);
+  MI_UNUSED(pq);
+
+  // ensure no more thread_delayed_free will be added
+  _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
+
+  // stats
+  const size_t bsize = mi_page_block_size(page);
+  if (bsize > MI_MEDIUM_OBJ_SIZE_MAX) {
+    if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+      mi_heap_stat_decrease(heap, large, bsize);
+    }
+    else {
+      mi_heap_stat_decrease(heap, huge, bsize);
+    }
+  }
+#if (MI_STAT)
+  _mi_page_free_collect(page, false);  // update used count
+  const size_t inuse = page->used;
+  if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+    mi_heap_stat_decrease(heap, normal, bsize * inuse);
+#if (MI_STAT>1)
+    mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse);
+#endif
+  }
+  mi_heap_stat_decrease(heap, malloc, bsize * inuse);  // todo: off for aligned blocks...
+#endif
+
+  /// pretend it is all free now
+  mi_assert_internal(mi_page_thread_free(page) == NULL);
+  page->used = 0;
+
+  // and free the page
+  // mi_page_free(page,false);
+  page->next = NULL;
+  page->prev = NULL;
+  _mi_segment_page_free(page,false /* no force? */, &heap->tld->segments);
+
+  return true; // keep going
+}
+
+void _mi_heap_destroy_pages(mi_heap_t* heap) {
+  mi_heap_visit_pages(heap, &_mi_heap_page_destroy, NULL, NULL);
+  mi_heap_reset_pages(heap);
+}
+
+void mi_heap_destroy(mi_heap_t* heap) {
+  mi_assert(heap != NULL);
+  mi_assert(mi_heap_is_initialized(heap));
+  mi_assert(heap->no_reclaim);
+  mi_assert_expensive(mi_heap_is_valid(heap));
+  if (heap==NULL || !mi_heap_is_initialized(heap)) return;
+  if (!heap->no_reclaim) {
+    // don't free in case it may contain reclaimed pages
+    mi_heap_delete(heap);
+  }
+  else {
+    // free all pages
+    _mi_heap_destroy_pages(heap);
+    mi_heap_free(heap);
+  }
+}
+
+void _mi_heap_destroy_all(void) {
+  mi_heap_t* bheap = mi_heap_get_backing();
+  mi_heap_t* curr = bheap->tld->heaps;
+  while (curr != NULL) {
+    mi_heap_t* next = curr->next;
+    if (curr->no_reclaim) {
+      mi_heap_destroy(curr);
+    }
+    else {
+      _mi_heap_destroy_pages(curr);
+    }
+    curr = next;
+  }
+}
+
+/* -----------------------------------------------------------
+  Safe Heap delete
+----------------------------------------------------------- */
+
+// Transfer the pages from one heap to the other
+static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
+  mi_assert_internal(heap!=NULL);
+  if (from==NULL || from->page_count == 0) return;
+
+  // reduce the size of the delayed frees
+  _mi_heap_delayed_free_partial(from);
+
+  // transfer all pages by appending the queues; this will set a new heap field
+  // so threads may do delayed frees in either heap for a while.
+  // note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state
+  // so after this only the new heap will get delayed frees
+  for (size_t i = 0; i <= MI_BIN_FULL; i++) {
+    mi_page_queue_t* pq = &heap->pages[i];
+    mi_page_queue_t* append = &from->pages[i];
+    size_t pcount = _mi_page_queue_append(heap, pq, append);
+    heap->page_count += pcount;
+    from->page_count -= pcount;
+  }
+  mi_assert_internal(from->page_count == 0);
+
+  // and do outstanding delayed frees in the `from` heap
+  // note: be careful here as the `heap` field in all those pages no longer point to `from`,
+  // turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a
+  // the regular `_mi_free_delayed_block` which is safe.
+  _mi_heap_delayed_free_all(from);
+  #if !defined(_MSC_VER) || (_MSC_VER > 1900) // somehow the following line gives an error in VS2015, issue #353
+  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_block_t,&from->thread_delayed_free) == NULL);
+  #endif
+
+  // and reset the `from` heap
+  mi_heap_reset_pages(from);
+}
+
+// Safe delete a heap without freeing any still allocated blocks in that heap.
+void mi_heap_delete(mi_heap_t* heap)
+{
+  mi_assert(heap != NULL);
+  mi_assert(mi_heap_is_initialized(heap));
+  mi_assert_expensive(mi_heap_is_valid(heap));
+  if (heap==NULL || !mi_heap_is_initialized(heap)) return;
+
+  if (!mi_heap_is_backing(heap)) {
+    // tranfer still used pages to the backing heap
+    mi_heap_absorb(heap->tld->heap_backing, heap);
+  }
+  else {
+    // the backing heap abandons its pages
+    _mi_heap_collect_abandon(heap);
+  }
+  mi_assert_internal(heap->page_count==0);
+  mi_heap_free(heap);
+}
+
+mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
+  mi_assert(heap != NULL);
+  mi_assert(mi_heap_is_initialized(heap));
+  if (heap==NULL || !mi_heap_is_initialized(heap)) return NULL;
+  mi_assert_expensive(mi_heap_is_valid(heap));
+  mi_heap_t* old = mi_get_default_heap();
+  _mi_heap_set_default_direct(heap);
+  return old;
+}
+
+
+
+
+/* -----------------------------------------------------------
+  Analysis
+----------------------------------------------------------- */
+
+// static since it is not thread safe to access heaps from other threads.
+static mi_heap_t* mi_heap_of_block(const void* p) {
+  if (p == NULL) return NULL;
+  mi_segment_t* segment = _mi_ptr_segment(p);
+  bool valid = (_mi_ptr_cookie(segment) == segment->cookie);
+  mi_assert_internal(valid);
+  if mi_unlikely(!valid) return NULL;
+  return mi_page_heap(_mi_segment_page_of(segment,p));
+}
+
+bool mi_heap_contains_block(mi_heap_t* heap, const void* p) {
+  mi_assert(heap != NULL);
+  if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
+  return (heap == mi_heap_of_block(p));
+}
+
+
+static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* p, void* vfound) {
+  MI_UNUSED(heap);
+  MI_UNUSED(pq);
+  bool* found = (bool*)vfound;
+  mi_segment_t* segment = _mi_page_segment(page);
+  void* start = _mi_page_start(segment, page, NULL);
+  void* end   = (uint8_t*)start + (page->capacity * mi_page_block_size(page));
+  *found = (p >= start && p < end);
+  return (!*found); // continue if not found
+}
+
+bool mi_heap_check_owned(mi_heap_t* heap, const void* p) {
+  mi_assert(heap != NULL);
+  if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
+  if (((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) return false;  // only aligned pointers
+  bool found = false;
+  mi_heap_visit_pages(heap, &mi_heap_page_check_owned, (void*)p, &found);
+  return found;
+}
+
+bool mi_check_owned(const void* p) {
+  return mi_heap_check_owned(mi_get_default_heap(), p);
+}
+
+/* -----------------------------------------------------------
+  Visit all heap blocks and areas
+  Todo: enable visiting abandoned pages, and
+        enable visiting all blocks of all heaps across threads
+----------------------------------------------------------- */
+
+// Separate struct to keep `mi_page_t` out of the public interface
+typedef struct mi_heap_area_ex_s {
+  mi_heap_area_t area;
+  mi_page_t*     page;
+} mi_heap_area_ex_t;
+
+static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_visit_fun* visitor, void* arg) {
+  mi_assert(xarea != NULL);
+  if (xarea==NULL) return true;
+  const mi_heap_area_t* area = &xarea->area;
+  mi_page_t* page = xarea->page;
+  mi_assert(page != NULL);
+  if (page == NULL) return true;
+
+  _mi_page_free_collect(page,true);
+  mi_assert_internal(page->local_free == NULL);
+  if (page->used == 0) return true;
+
+  const size_t bsize = mi_page_block_size(page);
+  const size_t ubsize = mi_page_usable_block_size(page); // without padding
+  size_t   psize;
+  uint8_t* pstart = _mi_page_start(_mi_page_segment(page), page, &psize);
+
+  if (page->capacity == 1) {
+    // optimize page with one block
+    mi_assert_internal(page->used == 1 && page->free == NULL);
+    return visitor(mi_page_heap(page), area, pstart, ubsize, arg);
+  }
+
+  // create a bitmap of free blocks.
+  #define MI_MAX_BLOCKS   (MI_SMALL_PAGE_SIZE / sizeof(void*))
+  uintptr_t free_map[MI_MAX_BLOCKS / sizeof(uintptr_t)];
+  memset(free_map, 0, sizeof(free_map));
+
+  size_t free_count = 0;
+  for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) {
+    free_count++;
+    mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize));
+    size_t offset = (uint8_t*)block - pstart;
+    mi_assert_internal(offset % bsize == 0);
+    size_t blockidx = offset / bsize;  // Todo: avoid division?
+    mi_assert_internal( blockidx < MI_MAX_BLOCKS);
+    size_t bitidx = (blockidx / sizeof(uintptr_t));
+    size_t bit = blockidx - (bitidx * sizeof(uintptr_t));
+    free_map[bitidx] |= ((uintptr_t)1 << bit);
+  }
+  mi_assert_internal(page->capacity == (free_count + page->used));
+
+  // walk through all blocks skipping the free ones
+  size_t used_count = 0;
+  for (size_t i = 0; i < page->capacity; i++) {
+    size_t bitidx = (i / sizeof(uintptr_t));
+    size_t bit = i - (bitidx * sizeof(uintptr_t));
+    uintptr_t m = free_map[bitidx];
+    if (bit == 0 && m == UINTPTR_MAX) {
+      i += (sizeof(uintptr_t) - 1); // skip a run of free blocks
+    }
+    else if ((m & ((uintptr_t)1 << bit)) == 0) {
+      used_count++;
+      uint8_t* block = pstart + (i * bsize);
+      if (!visitor(mi_page_heap(page), area, block, ubsize, arg)) return false;
+    }
+  }
+  mi_assert_internal(page->used == used_count);
+  return true;
+}
+
+typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ex_t* area, void* arg);
+
+
+static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) {
+  MI_UNUSED(heap);
+  MI_UNUSED(pq);
+  mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun;
+  mi_heap_area_ex_t xarea;
+  const size_t bsize = mi_page_block_size(page);
+  const size_t ubsize = mi_page_usable_block_size(page);
+  xarea.page = page;
+  xarea.area.reserved = page->reserved * bsize;
+  xarea.area.committed = page->capacity * bsize;
+  xarea.area.blocks = _mi_page_start(_mi_page_segment(page), page, NULL);
+  xarea.area.used = page->used;   // number of blocks in use (#553)
+  xarea.area.block_size = ubsize;
+  xarea.area.full_block_size = bsize;
+  return fun(heap, &xarea, arg);
+}
+
+// Visit all heap pages as areas
+static bool mi_heap_visit_areas(const mi_heap_t* heap, mi_heap_area_visit_fun* visitor, void* arg) {
+  if (visitor == NULL) return false;
+  return mi_heap_visit_pages((mi_heap_t*)heap, &mi_heap_visit_areas_page, (void*)(visitor), arg); // note: function pointer to void* :-{
+}
+
+// Just to pass arguments
+typedef struct mi_visit_blocks_args_s {
+  bool  visit_blocks;
+  mi_block_visit_fun* visitor;
+  void* arg;
+} mi_visit_blocks_args_t;
+
+static bool mi_heap_area_visitor(const mi_heap_t* heap, const mi_heap_area_ex_t* xarea, void* arg) {
+  mi_visit_blocks_args_t* args = (mi_visit_blocks_args_t*)arg;
+  if (!args->visitor(heap, &xarea->area, NULL, xarea->area.block_size, args->arg)) return false;
+  if (args->visit_blocks) {
+    return mi_heap_area_visit_blocks(xarea, args->visitor, args->arg);
+  }
+  else {
+    return true;
+  }
+}
+
+// Visit all blocks in a heap
+bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {
+  mi_visit_blocks_args_t args = { visit_blocks, visitor, arg };
+  return mi_heap_visit_areas(heap, &mi_heap_area_visitor, &args);
+}
diff --git a/Objects/mimalloc/init.c b/Objects/mimalloc/init.c
new file mode 100644
index 00000000000000..c416208cfdd625
--- /dev/null
+++ b/Objects/mimalloc/init.c
@@ -0,0 +1,716 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+
+#include <string.h>  // memcpy, memset
+#include <stdlib.h>  // atexit
+
+// Empty page used to initialize the small free pages array
+const mi_page_t _mi_page_empty = {
+  0, false, false, false, false,
+  0,       // capacity
+  0,       // reserved capacity
+  { 0 },   // flags
+  false,   // is_zero
+  0,       // retire_expire
+  NULL,    // free
+  0,       // used
+  0,       // xblock_size
+  NULL,    // local_free
+  #if MI_ENCODE_FREELIST
+  { 0, 0 },
+  #endif
+  MI_ATOMIC_VAR_INIT(0), // xthread_free
+  MI_ATOMIC_VAR_INIT(0), // xheap
+  NULL, NULL
+  #if MI_INTPTR_SIZE==8
+  , { 0 }  // padding
+  #endif
+};
+
+#define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
+
+#if (MI_PADDING>0) && (MI_INTPTR_SIZE >= 8)
+#define MI_SMALL_PAGES_EMPTY  { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() }
+#elif (MI_PADDING>0)
+#define MI_SMALL_PAGES_EMPTY  { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() }
+#else
+#define MI_SMALL_PAGES_EMPTY  { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY() }
+#endif
+
+
+// Empty page queues for every bin
+#define QNULL(sz)  { NULL, NULL, (sz)*sizeof(uintptr_t) }
+#define MI_PAGE_QUEUES_EMPTY \
+  { QNULL(1), \
+    QNULL(     1), QNULL(     2), QNULL(     3), QNULL(     4), QNULL(     5), QNULL(     6), QNULL(     7), QNULL(     8), /* 8 */ \
+    QNULL(    10), QNULL(    12), QNULL(    14), QNULL(    16), QNULL(    20), QNULL(    24), QNULL(    28), QNULL(    32), /* 16 */ \
+    QNULL(    40), QNULL(    48), QNULL(    56), QNULL(    64), QNULL(    80), QNULL(    96), QNULL(   112), QNULL(   128), /* 24 */ \
+    QNULL(   160), QNULL(   192), QNULL(   224), QNULL(   256), QNULL(   320), QNULL(   384), QNULL(   448), QNULL(   512), /* 32 */ \
+    QNULL(   640), QNULL(   768), QNULL(   896), QNULL(  1024), QNULL(  1280), QNULL(  1536), QNULL(  1792), QNULL(  2048), /* 40 */ \
+    QNULL(  2560), QNULL(  3072), QNULL(  3584), QNULL(  4096), QNULL(  5120), QNULL(  6144), QNULL(  7168), QNULL(  8192), /* 48 */ \
+    QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \
+    QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \
+    QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \
+    QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 1  /* 655360, Huge queue */), \
+    QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 2) /* Full queue */ }
+
+#define MI_STAT_COUNT_NULL()  {0,0,0,0}
+
+// Empty statistics
+#if MI_STAT>1
+#define MI_STAT_COUNT_END_NULL()  , { MI_STAT_COUNT_NULL(), MI_INIT32(MI_STAT_COUNT_NULL) }
+#else
+#define MI_STAT_COUNT_END_NULL()
+#endif
+
+#define MI_STATS_NULL  \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },     \
+  { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \
+  MI_STAT_COUNT_END_NULL()
+
+
+// Empty slice span queues for every bin
+#define SQNULL(sz)  { NULL, NULL, sz }
+#define MI_SEGMENT_SPAN_QUEUES_EMPTY \
+  { SQNULL(1), \
+    SQNULL(     1), SQNULL(     2), SQNULL(     3), SQNULL(     4), SQNULL(     5), SQNULL(     6), SQNULL(     7), SQNULL(    10), /*  8 */ \
+    SQNULL(    12), SQNULL(    14), SQNULL(    16), SQNULL(    20), SQNULL(    24), SQNULL(    28), SQNULL(    32), SQNULL(    40), /* 16 */ \
+    SQNULL(    48), SQNULL(    56), SQNULL(    64), SQNULL(    80), SQNULL(    96), SQNULL(   112), SQNULL(   128), SQNULL(   160), /* 24 */ \
+    SQNULL(   192), SQNULL(   224), SQNULL(   256), SQNULL(   320), SQNULL(   384), SQNULL(   448), SQNULL(   512), SQNULL(   640), /* 32 */ \
+    SQNULL(   768), SQNULL(   896), SQNULL(  1024) /* 35 */ }
+
+
+// --------------------------------------------------------
+// Statically allocate an empty heap as the initial
+// thread local value for the default heap,
+// and statically allocate the backing heap for the main
+// thread so it can function without doing any allocation
+// itself (as accessing a thread local for the first time
+// may lead to allocation itself on some platforms)
+// --------------------------------------------------------
+
+mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
+  NULL,
+  MI_SMALL_PAGES_EMPTY,
+  MI_PAGE_QUEUES_EMPTY,
+  MI_ATOMIC_VAR_INIT(NULL),
+  0,                // tid
+  0,                // cookie
+  0,                // arena id
+  { 0, 0 },         // keys
+  { {0}, {0}, 0, true }, // random
+  0,                // page count
+  MI_BIN_FULL, 0,   // page retired min/max
+  NULL,             // next
+  false
+};
+
+#define tld_empty_stats  ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats)))
+#define tld_empty_os     ((mi_os_tld_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,os)))
+
+mi_decl_cache_align static const mi_tld_t tld_empty = {
+  0,
+  false,
+  NULL, NULL,
+  { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, tld_empty_stats, tld_empty_os }, // segments
+  { 0, tld_empty_stats }, // os
+  { MI_STATS_NULL }       // stats
+};
+
+// the thread-local default heap for allocation
+mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
+
+extern mi_heap_t _mi_heap_main;
+
+static mi_tld_t tld_main = {
+  0, false,
+  &_mi_heap_main, & _mi_heap_main,
+  { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, &tld_main.stats, &tld_main.os }, // segments
+  { 0, &tld_main.stats },  // os
+  { MI_STATS_NULL }       // stats
+};
+
+mi_heap_t _mi_heap_main = {
+  &tld_main,
+  MI_SMALL_PAGES_EMPTY,
+  MI_PAGE_QUEUES_EMPTY,
+  MI_ATOMIC_VAR_INIT(NULL),
+  0,                // thread id
+  0,                // initial cookie
+  0,                // arena id
+  { 0, 0 },         // the key of the main heap can be fixed (unlike page keys that need to be secure!)
+  { {0x846ca68b}, {0}, 0, true },  // random
+  0,                // page count
+  MI_BIN_FULL, 0,   // page retired min/max
+  NULL,             // next heap
+  false             // can reclaim
+};
+
+bool _mi_process_is_initialized = false;  // set to `true` in `mi_process_init`.
+
+mi_stats_t _mi_stats_main = { MI_STATS_NULL };
+
+
+static void mi_heap_main_init(void) {
+  if (_mi_heap_main.cookie == 0) {
+    _mi_heap_main.thread_id = _mi_thread_id();
+    _mi_heap_main.cookie = 1;
+    #if defined(_WIN32) && !defined(MI_SHARED_LIB)
+      _mi_random_init_weak(&_mi_heap_main.random);    // prevent allocation failure during bcrypt dll initialization with static linking
+    #else
+      _mi_random_init(&_mi_heap_main.random);
+    #endif
+    _mi_heap_main.cookie  = _mi_heap_random_next(&_mi_heap_main);
+    _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main);
+    _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main);
+  }
+}
+
+mi_heap_t* _mi_heap_main_get(void) {
+  mi_heap_main_init();
+  return &_mi_heap_main;
+}
+
+
+/* -----------------------------------------------------------
+  Initialization and freeing of the thread local heaps
+----------------------------------------------------------- */
+
+// note: in x64 in release build `sizeof(mi_thread_data_t)` is under 4KiB (= OS page size).
+typedef struct mi_thread_data_s {
+  mi_heap_t  heap;  // must come first due to cast in `_mi_heap_done`
+  mi_tld_t   tld;
+} mi_thread_data_t;
+
+
+// Thread meta-data is allocated directly from the OS. For
+// some programs that do not use thread pools and allocate and
+// destroy many OS threads, this may causes too much overhead
+// per thread so we maintain a small cache of recently freed metadata.
+
+#define TD_CACHE_SIZE (8)
+static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE];
+
+static mi_thread_data_t* mi_thread_data_alloc(void) {
+  // try to find thread metadata in the cache
+  mi_thread_data_t* td;
+  for (int i = 0; i < TD_CACHE_SIZE; i++) {
+    td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
+    if (td != NULL) {
+      td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
+      if (td != NULL) {
+        return td;
+      }
+    }
+  }
+  // if that fails, allocate directly from the OS
+  td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main);
+  if (td == NULL) {
+    // if this fails, try once more. (issue #257)
+    td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main);
+    if (td == NULL) {
+      // really out of memory
+      _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t));
+    }
+  }
+  return td;
+}
+
+static void mi_thread_data_free( mi_thread_data_t* tdfree ) {
+  // try to add the thread metadata to the cache
+  for (int i = 0; i < TD_CACHE_SIZE; i++) {
+    mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
+    if (td == NULL) {
+      mi_thread_data_t* expected = NULL;
+      if (mi_atomic_cas_ptr_weak_acq_rel(mi_thread_data_t, &td_cache[i], &expected, tdfree)) {
+        return;
+      }
+    }
+  }
+  // if that fails, just free it directly
+  _mi_os_free(tdfree, sizeof(mi_thread_data_t), &_mi_stats_main);
+}
+
+static void mi_thread_data_collect(void) {
+  // free all thread metadata from the cache
+  for (int i = 0; i < TD_CACHE_SIZE; i++) {
+    mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
+    if (td != NULL) {
+      td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
+      if (td != NULL) {
+        _mi_os_free( td, sizeof(mi_thread_data_t), &_mi_stats_main );
+      }
+    }
+  }
+}
+
+// Initialize the thread local default heap, called from `mi_thread_init`
+static bool _mi_heap_init(void) {
+  if (mi_heap_is_initialized(mi_get_default_heap())) return true;
+  if (_mi_is_main_thread()) {
+    // mi_assert_internal(_mi_heap_main.thread_id != 0);  // can happen on freeBSD where alloc is called before any initialization
+    // the main heap is statically allocated
+    mi_heap_main_init();
+    _mi_heap_set_default_direct(&_mi_heap_main);
+    //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap());
+  }
+  else {
+    // use `_mi_os_alloc` to allocate directly from the OS
+    mi_thread_data_t* td = mi_thread_data_alloc();
+    if (td == NULL) return false;
+
+    // OS allocated so already zero initialized
+    mi_tld_t*  tld = &td->tld;
+    mi_heap_t* heap = &td->heap;
+    _mi_memcpy_aligned(tld, &tld_empty, sizeof(*tld));
+    _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(*heap));
+    heap->thread_id = _mi_thread_id();
+    _mi_random_init(&heap->random);
+    heap->cookie  = _mi_heap_random_next(heap) | 1;
+    heap->keys[0] = _mi_heap_random_next(heap);
+    heap->keys[1] = _mi_heap_random_next(heap);
+    heap->tld = tld;
+    tld->heap_backing = heap;
+    tld->heaps = heap;
+    tld->segments.stats = &tld->stats;
+    tld->segments.os = &tld->os;
+    tld->os.stats = &tld->stats;
+    _mi_heap_set_default_direct(heap);
+  }
+  return false;
+}
+
+// Free the thread local default heap (called from `mi_thread_done`)
+static bool _mi_heap_done(mi_heap_t* heap) {
+  if (!mi_heap_is_initialized(heap)) return true;
+
+  // reset default heap
+  _mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty);
+
+  // switch to backing heap
+  heap = heap->tld->heap_backing;
+  if (!mi_heap_is_initialized(heap)) return false;
+
+  // delete all non-backing heaps in this thread
+  mi_heap_t* curr = heap->tld->heaps;
+  while (curr != NULL) {
+    mi_heap_t* next = curr->next; // save `next` as `curr` will be freed
+    if (curr != heap) {
+      mi_assert_internal(!mi_heap_is_backing(curr));
+      mi_heap_delete(curr);
+    }
+    curr = next;
+  }
+  mi_assert_internal(heap->tld->heaps == heap && heap->next == NULL);
+  mi_assert_internal(mi_heap_is_backing(heap));
+
+  // collect if not the main thread
+  if (heap != &_mi_heap_main) {
+    _mi_heap_collect_abandon(heap);
+  }
+
+  // merge stats
+  _mi_stats_done(&heap->tld->stats);
+
+  // free if not the main thread
+  if (heap != &_mi_heap_main) {
+    // the following assertion does not always hold for huge segments as those are always treated
+    // as abondened: one may allocate it in one thread, but deallocate in another in which case
+    // the count can be too large or negative. todo: perhaps not count huge segments? see issue #363
+    // mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id());
+    mi_thread_data_free((mi_thread_data_t*)heap);
+  }
+  else {
+    mi_thread_data_collect(); // free cached thread metadata
+    #if 0
+    // never free the main thread even in debug mode; if a dll is linked statically with mimalloc,
+    // there may still be delete/free calls after the mi_fls_done is called. Issue #207
+    _mi_heap_destroy_pages(heap);
+    mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main);
+    #endif
+  }
+  return false;
+}
+
+
+
+// --------------------------------------------------------
+// Try to run `mi_thread_done()` automatically so any memory
+// owned by the thread but not yet released can be abandoned
+// and re-owned by another thread.
+//
+// 1. windows dynamic library:
+//     call from DllMain on DLL_THREAD_DETACH
+// 2. windows static library:
+//     use `FlsAlloc` to call a destructor when the thread is done
+// 3. unix, pthreads:
+//     use a pthread key to call a destructor when a pthread is done
+//
+// In the last two cases we also need to call `mi_process_init`
+// to set up the thread local keys.
+// --------------------------------------------------------
+
+static void _mi_thread_done(mi_heap_t* default_heap);
+
+#if defined(_WIN32) && defined(MI_SHARED_LIB)
+  // nothing to do as it is done in DllMain
+#elif defined(_WIN32) && !defined(MI_SHARED_LIB)
+  // use thread local storage keys to detect thread ending
+  #include <windows.h>
+  #include <fibersapi.h>
+  #if (_WIN32_WINNT < 0x600)  // before Windows Vista
+  WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback );
+  WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex );
+  WINBASEAPI BOOL  WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData );
+  WINBASEAPI BOOL  WINAPI FlsFree(_In_ DWORD dwFlsIndex);
+  #endif
+  static DWORD mi_fls_key = (DWORD)(-1);
+  static void NTAPI mi_fls_done(PVOID value) {
+    mi_heap_t* heap = (mi_heap_t*)value;
+    if (heap != NULL) {
+      _mi_thread_done(heap);
+      FlsSetValue(mi_fls_key, NULL);  // prevent recursion as _mi_thread_done may set it back to the main heap, issue #672
+    }
+  }
+#elif defined(MI_USE_PTHREADS)
+  // use pthread local storage keys to detect thread ending
+  // (and used with MI_TLS_PTHREADS for the default heap)
+  pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1);
+  static void mi_pthread_done(void* value) {
+    if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
+  }
+#elif defined(__wasi__)
+// no pthreads in the WebAssembly Standard Interface
+#else
+  #pragma message("define a way to call mi_thread_done when a thread is done")
+#endif
+
+// Set up handlers so `mi_thread_done` is called automatically
+static void mi_process_setup_auto_thread_done(void) {
+  static bool tls_initialized = false; // fine if it races
+  if (tls_initialized) return;
+  tls_initialized = true;
+  #if defined(_WIN32) && defined(MI_SHARED_LIB)
+    // nothing to do as it is done in DllMain
+  #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
+    mi_fls_key = FlsAlloc(&mi_fls_done);
+  #elif defined(MI_USE_PTHREADS)
+    mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1));
+    pthread_key_create(&_mi_heap_default_key, &mi_pthread_done);
+  #endif
+  _mi_heap_set_default_direct(&_mi_heap_main);
+}
+
+
+bool _mi_is_main_thread(void) {
+  return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id());
+}
+
+static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1);
+
+size_t  _mi_current_thread_count(void) {
+  return mi_atomic_load_relaxed(&thread_count);
+}
+
+// This is called from the `mi_malloc_generic`
+void mi_thread_init(void) mi_attr_noexcept
+{
+  // ensure our process has started already
+  mi_process_init();
+
+  // initialize the thread local default heap
+  // (this will call `_mi_heap_set_default_direct` and thus set the
+  //  fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
+  if (_mi_heap_init()) return;  // returns true if already initialized
+
+  _mi_stat_increase(&_mi_stats_main.threads, 1);
+  mi_atomic_increment_relaxed(&thread_count);
+  //_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
+}
+
+void mi_thread_done(void) mi_attr_noexcept {
+  _mi_thread_done(mi_get_default_heap());
+}
+
+static void _mi_thread_done(mi_heap_t* heap) {
+  mi_atomic_decrement_relaxed(&thread_count);
+  _mi_stat_decrease(&_mi_stats_main.threads, 1);
+
+  // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps...
+  if (heap->thread_id != _mi_thread_id()) return;
+
+  // abandon the thread local heap
+  if (_mi_heap_done(heap)) return;  // returns true if already ran
+}
+
+void _mi_heap_set_default_direct(mi_heap_t* heap)  {
+  mi_assert_internal(heap != NULL);
+  #if defined(MI_TLS_SLOT)
+  mi_tls_slot_set(MI_TLS_SLOT,heap);
+  #elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+  *mi_tls_pthread_heap_slot() = heap;
+  #elif defined(MI_TLS_PTHREAD)
+  // we use _mi_heap_default_key
+  #else
+  _mi_heap_default = heap;
+  #endif
+
+  // ensure the default heap is passed to `_mi_thread_done`
+  // setting to a non-NULL value also ensures `mi_thread_done` is called.
+  #if defined(_WIN32) && defined(MI_SHARED_LIB)
+    // nothing to do as it is done in DllMain
+  #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
+    mi_assert_internal(mi_fls_key != 0);
+    FlsSetValue(mi_fls_key, heap);
+  #elif defined(MI_USE_PTHREADS)
+  if (_mi_heap_default_key != (pthread_key_t)(-1)) {  // can happen during recursive invocation on freeBSD
+    pthread_setspecific(_mi_heap_default_key, heap);
+  }
+  #endif
+}
+
+
+// --------------------------------------------------------
+// Run functions on process init/done, and thread init/done
+// --------------------------------------------------------
+static void mi_cdecl mi_process_done(void);
+
+static bool os_preloading = true;    // true until this module is initialized
+static bool mi_redirected = false;   // true if malloc redirects to mi_malloc
+
+// Returns true if this module has not been initialized; Don't use C runtime routines until it returns false.
+bool _mi_preloading(void) {
+  return os_preloading;
+}
+
+mi_decl_nodiscard bool mi_is_redirected(void) mi_attr_noexcept {
+  return mi_redirected;
+}
+
+// Communicate with the redirection module on Windows
+#if defined(_WIN32) && defined(MI_SHARED_LIB) && !defined(MI_WIN_NOREDIRECT)
+#ifdef __cplusplus
+extern "C" {
+#endif
+mi_decl_export void _mi_redirect_entry(DWORD reason) {
+  // called on redirection; careful as this may be called before DllMain
+  if (reason == DLL_PROCESS_ATTACH) {
+    mi_redirected = true;
+  }
+  else if (reason == DLL_PROCESS_DETACH) {
+    mi_redirected = false;
+  }
+  else if (reason == DLL_THREAD_DETACH) {
+    mi_thread_done();
+  }
+}
+__declspec(dllimport) bool mi_cdecl mi_allocator_init(const char** message);
+__declspec(dllimport) void mi_cdecl mi_allocator_done(void);
+#ifdef __cplusplus
+}
+#endif
+#else
+static bool mi_allocator_init(const char** message) {
+  if (message != NULL) *message = NULL;
+  return true;
+}
+static void mi_allocator_done(void) {
+  // nothing to do
+}
+#endif
+
+// Called once by the process loader
+static void mi_process_load(void) {
+  mi_heap_main_init();
+  #if defined(MI_TLS_RECURSE_GUARD)
+  volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true;
+  MI_UNUSED(dummy);
+  #endif
+  os_preloading = false;
+  mi_assert_internal(_mi_is_main_thread());
+  #if !(defined(_WIN32) && defined(MI_SHARED_LIB))  // use Dll process detach (see below) instead of atexit (issue #521)
+  atexit(&mi_process_done);
+  #endif
+  _mi_options_init();
+  mi_process_setup_auto_thread_done();
+  mi_process_init();
+  if (mi_redirected) _mi_verbose_message("malloc is redirected.\n");
+
+  // show message from the redirector (if present)
+  const char* msg = NULL;
+  mi_allocator_init(&msg);
+  if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) {
+    _mi_fputs(NULL,NULL,NULL,msg);
+  }
+
+  // reseed random
+  _mi_random_reinit_if_weak(&_mi_heap_main.random);
+}
+
+#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
+#include <intrin.h>
+mi_decl_cache_align bool _mi_cpu_has_fsrm = false;
+
+static void mi_detect_cpu_features(void) {
+  // FSRM for fast rep movsb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017))
+  int32_t cpu_info[4];
+  __cpuid(cpu_info, 7);
+  _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see <https ://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
+}
+#else
+static void mi_detect_cpu_features(void) {
+  // nothing
+}
+#endif
+
+// Initialize the process; called by thread_init or the process loader
+void mi_process_init(void) mi_attr_noexcept {
+  // ensure we are called once
+  if (_mi_process_is_initialized) return;
+  _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
+  _mi_process_is_initialized = true;
+  mi_process_setup_auto_thread_done();
+
+  mi_detect_cpu_features();
+  _mi_os_init();
+  mi_heap_main_init();
+  #if (MI_DEBUG)
+  _mi_verbose_message("debug level : %d\n", MI_DEBUG);
+  #endif
+  _mi_verbose_message("secure level: %d\n", MI_SECURE);
+  _mi_verbose_message("mem tracking: %s\n", MI_TRACK_TOOL);
+  mi_thread_init();
+
+  #if defined(_WIN32) && !defined(MI_SHARED_LIB)
+  // When building as a static lib the FLS cleanup happens to early for the main thread.
+  // To avoid this, set the FLS value for the main thread to NULL so the fls cleanup
+  // will not call _mi_thread_done on the (still executing) main thread. See issue #508.
+  FlsSetValue(mi_fls_key, NULL);
+  #endif
+
+  mi_stats_reset();  // only call stat reset *after* thread init (or the heap tld == NULL)
+
+  if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
+    size_t pages = mi_option_get_clamp(mi_option_reserve_huge_os_pages, 0, 128*1024);
+    long reserve_at = mi_option_get(mi_option_reserve_huge_os_pages_at);
+    if (reserve_at != -1) {
+      mi_reserve_huge_os_pages_at(pages, reserve_at, pages*500);
+    } else {
+      mi_reserve_huge_os_pages_interleave(pages, 0, pages*500);
+    }
+  }
+  if (mi_option_is_enabled(mi_option_reserve_os_memory)) {
+    long ksize = mi_option_get(mi_option_reserve_os_memory);
+    if (ksize > 0) {
+      mi_reserve_os_memory((size_t)ksize*MI_KiB, true /* commit? */, true /* allow large pages? */);
+    }
+  }
+}
+
+// Called when the process is done (through `at_exit`)
+static void mi_cdecl mi_process_done(void) {
+  // only shutdown if we were initialized
+  if (!_mi_process_is_initialized) return;
+  // ensure we are called once
+  static bool process_done = false;
+  if (process_done) return;
+  process_done = true;
+
+  #if defined(_WIN32) && !defined(MI_SHARED_LIB)
+  FlsFree(mi_fls_key);  // call thread-done on all threads (except the main thread) to prevent dangling callback pointer if statically linked with a DLL; Issue #208
+  #endif
+
+  #ifndef MI_SKIP_COLLECT_ON_EXIT
+    #if (MI_DEBUG != 0) || !defined(MI_SHARED_LIB)
+    // free all memory if possible on process exit. This is not needed for a stand-alone process
+    // but should be done if mimalloc is statically linked into another shared library which
+    // is repeatedly loaded/unloaded, see issue #281.
+    mi_collect(true /* force */ );
+    #endif
+  #endif
+
+  // Forcefully release all retained memory; this can be dangerous in general if overriding regular malloc/free
+  // since after process_done there might still be other code running that calls `free` (like at_exit routines,
+  // or C-runtime termination code.
+  if (mi_option_is_enabled(mi_option_destroy_on_exit)) {
+    _mi_heap_destroy_all();                          // forcefully release all memory held by all heaps (of this thread only!)
+    _mi_segment_cache_free_all(&_mi_heap_main_get()->tld->os);  // release all cached segments
+  }
+
+  if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) {
+    mi_stats_print(NULL);
+  }
+  mi_allocator_done();
+  _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id);
+  os_preloading = true; // don't call the C runtime anymore
+}
+
+
+
+#if defined(_WIN32) && defined(MI_SHARED_LIB)
+  // Windows DLL: easy to hook into process_init and thread_done
+  __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
+    MI_UNUSED(reserved);
+    MI_UNUSED(inst);
+    if (reason==DLL_PROCESS_ATTACH) {
+      mi_process_load();
+    }
+    else if (reason==DLL_PROCESS_DETACH) {
+      mi_process_done();
+    }
+    else if (reason==DLL_THREAD_DETACH) {
+      if (!mi_is_redirected()) {
+        mi_thread_done();
+      }
+    }
+    return TRUE;
+  }
+
+#elif defined(_MSC_VER)
+  // MSVC: use data section magic for static libraries
+  // See <https://www.codeguru.com/cpp/misc/misc/applicationcontrol/article.php/c6945/Running-Code-Before-and-After-Main.htm>
+  static int _mi_process_init(void) {
+    mi_process_load();
+    return 0;
+  }
+  typedef int(*_mi_crt_callback_t)(void);
+  #if defined(_M_X64) || defined(_M_ARM64)
+    __pragma(comment(linker, "/include:" "_mi_msvc_initu"))
+    #pragma section(".CRT$XIU", long, read)
+  #else
+    __pragma(comment(linker, "/include:" "__mi_msvc_initu"))
+  #endif
+  #pragma data_seg(".CRT$XIU")
+  mi_decl_externc _mi_crt_callback_t _mi_msvc_initu[] = { &_mi_process_init };
+  #pragma data_seg()
+
+#elif defined(__cplusplus)
+  // C++: use static initialization to detect process start
+  static bool _mi_process_init(void) {
+    mi_process_load();
+    return (_mi_heap_main.thread_id != 0);
+  }
+  static bool mi_initialized = _mi_process_init();
+
+#elif defined(__GNUC__) || defined(__clang__)
+  // GCC,Clang: use the constructor attribute
+  static void __attribute__((constructor)) _mi_process_init(void) {
+    mi_process_load();
+  }
+
+#else
+#pragma message("define a way to call mi_process_load on your platform")
+#endif
diff --git a/Objects/mimalloc/options.c b/Objects/mimalloc/options.c
new file mode 100644
index 00000000000000..0a82ca65edd8d9
--- /dev/null
+++ b/Objects/mimalloc/options.c
@@ -0,0 +1,642 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <stdio.h>
+#include <stdlib.h> // strtol
+#include <string.h> // strncpy, strncat, strlen, strstr
+#include <ctype.h>  // toupper
+#include <stdarg.h>
+
+#ifdef _MSC_VER
+#pragma warning(disable:4996)   // strncpy, strncat
+#endif
+
+
+static long mi_max_error_count   = 16; // stop outputting errors after this (use < 0 for no limit)
+static long mi_max_warning_count = 16; // stop outputting warnings after this (use < 0 for no limit)
+
+static void mi_add_stderr_output(void);
+
+int mi_version(void) mi_attr_noexcept {
+  return MI_MALLOC_VERSION;
+}
+
+#ifdef _WIN32
+#include <conio.h>
+#endif
+
+// --------------------------------------------------------
+// Options
+// These can be accessed by multiple threads and may be
+// concurrently initialized, but an initializing data race
+// is ok since they resolve to the same value.
+// --------------------------------------------------------
+typedef enum mi_init_e {
+  UNINIT,       // not yet initialized
+  DEFAULTED,    // not found in the environment, use default value
+  INITIALIZED   // found in environment or set explicitly
+} mi_init_t;
+
+typedef struct mi_option_desc_s {
+  long        value;  // the value
+  mi_init_t   init;   // is it initialized yet? (from the environment)
+  mi_option_t option; // for debugging: the option index should match the option
+  const char* name;   // option name without `mimalloc_` prefix
+  const char* legacy_name; // potential legacy v1.x option name
+} mi_option_desc_t;
+
+#define MI_OPTION(opt)                  mi_option_##opt, #opt, NULL
+#define MI_OPTION_LEGACY(opt,legacy)    mi_option_##opt, #opt, #legacy
+
+static mi_option_desc_t options[_mi_option_last] =
+{
+  // stable options
+  #if MI_DEBUG || defined(MI_SHOW_ERRORS)
+  { 1, UNINIT, MI_OPTION(show_errors) },
+  #else
+  { 0, UNINIT, MI_OPTION(show_errors) },
+  #endif
+  { 0, UNINIT, MI_OPTION(show_stats) },
+  { 0, UNINIT, MI_OPTION(verbose) },
+
+  // Some of the following options are experimental and not all combinations are valid. Use with care.
+  { 1, UNINIT, MI_OPTION(eager_commit) },        // commit per segment directly (8MiB)  (but see also `eager_commit_delay`)
+  { 0, UNINIT, MI_OPTION(deprecated_eager_region_commit) },
+  { 0, UNINIT, MI_OPTION(deprecated_reset_decommits) },
+  { 0, UNINIT, MI_OPTION(large_os_pages) },      // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
+  { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) },  // per 1GiB huge pages
+  { -1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) }, // reserve huge pages at node N
+  { 0, UNINIT, MI_OPTION(reserve_os_memory)     },
+  { 0, UNINIT, MI_OPTION(deprecated_segment_cache) },  // cache N segments per thread
+  { 0, UNINIT, MI_OPTION(page_reset) },          // reset page memory on free
+  { 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_decommit, abandoned_page_reset) },// decommit free page memory when a thread terminates  
+  { 0, UNINIT, MI_OPTION(deprecated_segment_reset) },
+  #if defined(__NetBSD__)
+  { 0, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed
+  #elif defined(_WIN32)
+  { 4, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
+  #else
+  { 1, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
+  #endif
+  { 25,   UNINIT, MI_OPTION_LEGACY(decommit_delay, reset_delay) }, // page decommit delay in milli-seconds
+  { 0,    UNINIT, MI_OPTION(use_numa_nodes) },    // 0 = use available numa nodes, otherwise use at most N nodes. 
+  { 0,    UNINIT, MI_OPTION(limit_os_alloc) },    // 1 = do not use OS memory for allocation (but only reserved arenas)
+  { 100,  UNINIT, MI_OPTION(os_tag) },            // only apple specific for now but might serve more or less related purpose
+  { 16,   UNINIT, MI_OPTION(max_errors) },        // maximum errors that are output
+  { 16,   UNINIT, MI_OPTION(max_warnings) },      // maximum warnings that are output
+  { 8,    UNINIT, MI_OPTION(max_segment_reclaim)},// max. number of segment reclaims from the abandoned segments per try.  
+  { 1,    UNINIT, MI_OPTION(allow_decommit) },    // decommit slices when no longer used (after decommit_delay milli-seconds)
+  { 500,  UNINIT, MI_OPTION(segment_decommit_delay) }, // decommit delay in milli-seconds for freed segments
+  { 1,    UNINIT, MI_OPTION(decommit_extend_delay) },
+  { 0,    UNINIT, MI_OPTION(destroy_on_exit)}     // release all OS memory on process exit; careful with dangling pointer or after-exit frees!
+};
+
+static void mi_option_init(mi_option_desc_t* desc);
+
+void _mi_options_init(void) {
+  // called on process load; should not be called before the CRT is initialized!
+  // (e.g. do not call this from process_init as that may run before CRT initialization)
+  mi_add_stderr_output(); // now it safe to use stderr for output
+  for(int i = 0; i < _mi_option_last; i++ ) {
+    mi_option_t option = (mi_option_t)i;
+    long l = mi_option_get(option); MI_UNUSED(l); // initialize
+    // if (option != mi_option_verbose)
+    {
+      mi_option_desc_t* desc = &options[option];
+      _mi_verbose_message("option '%s': %ld\n", desc->name, desc->value);
+    }
+  }
+  mi_max_error_count = mi_option_get(mi_option_max_errors);
+  mi_max_warning_count = mi_option_get(mi_option_max_warnings);
+}
+
+mi_decl_nodiscard long mi_option_get(mi_option_t option) {
+  mi_assert(option >= 0 && option < _mi_option_last);
+  if (option < 0 || option >= _mi_option_last) return 0;
+  mi_option_desc_t* desc = &options[option];
+  mi_assert(desc->option == option);  // index should match the option
+  if mi_unlikely(desc->init == UNINIT) {
+    mi_option_init(desc);
+  }
+  return desc->value;
+}
+
+mi_decl_nodiscard long mi_option_get_clamp(mi_option_t option, long min, long max) {
+  long x = mi_option_get(option);
+  return (x < min ? min : (x > max ? max : x));
+}
+
+void mi_option_set(mi_option_t option, long value) {
+  mi_assert(option >= 0 && option < _mi_option_last);
+  if (option < 0 || option >= _mi_option_last) return;
+  mi_option_desc_t* desc = &options[option];
+  mi_assert(desc->option == option);  // index should match the option
+  desc->value = value;
+  desc->init = INITIALIZED;
+}
+
+void mi_option_set_default(mi_option_t option, long value) {
+  mi_assert(option >= 0 && option < _mi_option_last);
+  if (option < 0 || option >= _mi_option_last) return;
+  mi_option_desc_t* desc = &options[option];
+  if (desc->init != INITIALIZED) {
+    desc->value = value;
+  }
+}
+
+mi_decl_nodiscard bool mi_option_is_enabled(mi_option_t option) {
+  return (mi_option_get(option) != 0);
+}
+
+void mi_option_set_enabled(mi_option_t option, bool enable) {
+  mi_option_set(option, (enable ? 1 : 0));
+}
+
+void mi_option_set_enabled_default(mi_option_t option, bool enable) {
+  mi_option_set_default(option, (enable ? 1 : 0));
+}
+
+void mi_option_enable(mi_option_t option) {
+  mi_option_set_enabled(option,true);
+}
+
+void mi_option_disable(mi_option_t option) {
+  mi_option_set_enabled(option,false);
+}
+
+
+static void mi_cdecl mi_out_stderr(const char* msg, void* arg) {
+  MI_UNUSED(arg);
+  if (msg == NULL) return;
+  #ifdef _WIN32
+  // on windows with redirection, the C runtime cannot handle locale dependent output
+  // after the main thread closes so we use direct console output.
+  if (!_mi_preloading()) {
+    // _cputs(msg);  // _cputs cannot be used at is aborts if it fails to lock the console
+    static HANDLE hcon = INVALID_HANDLE_VALUE;
+    static bool hconIsConsole;
+    if (hcon == INVALID_HANDLE_VALUE) {
+      CONSOLE_SCREEN_BUFFER_INFO sbi;
+      hcon = GetStdHandle(STD_ERROR_HANDLE);
+      hconIsConsole = ((hcon != INVALID_HANDLE_VALUE) && GetConsoleScreenBufferInfo(hcon, &sbi));
+    }
+    const size_t len = strlen(msg);
+    if (len > 0 && len < UINT32_MAX) {
+      DWORD written = 0;
+      if (hconIsConsole) {
+        WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL);
+      }
+      else if (hcon != INVALID_HANDLE_VALUE) {
+        // use direct write if stderr was redirected
+        WriteFile(hcon, msg, (DWORD)len, &written, NULL);
+      }
+      else {
+        // finally fall back to fputs after all
+        fputs(msg, stderr);
+      }
+    }
+  }
+  #else
+  fputs(msg, stderr);
+  #endif
+}
+
+// Since an output function can be registered earliest in the `main`
+// function we also buffer output that happens earlier. When
+// an output function is registered it is called immediately with
+// the output up to that point.
+#ifndef MI_MAX_DELAY_OUTPUT
+#define MI_MAX_DELAY_OUTPUT ((size_t)(32*1024))
+#endif
+static char out_buf[MI_MAX_DELAY_OUTPUT+1];
+static _Atomic(size_t) out_len;
+
+static void mi_cdecl mi_out_buf(const char* msg, void* arg) {
+  MI_UNUSED(arg);
+  if (msg==NULL) return;
+  if (mi_atomic_load_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return;
+  size_t n = strlen(msg);
+  if (n==0) return;
+  // claim space
+  size_t start = mi_atomic_add_acq_rel(&out_len, n);
+  if (start >= MI_MAX_DELAY_OUTPUT) return;
+  // check bound
+  if (start+n >= MI_MAX_DELAY_OUTPUT) {
+    n = MI_MAX_DELAY_OUTPUT-start-1;
+  }
+  _mi_memcpy(&out_buf[start], msg, n);
+}
+
+static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) {
+  if (out==NULL) return;
+  // claim (if `no_more_buf == true`, no more output will be added after this point)
+  size_t count = mi_atomic_add_acq_rel(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1));
+  // and output the current contents
+  if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT;
+  out_buf[count] = 0;
+  out(out_buf,arg);
+  if (!no_more_buf) {
+    out_buf[count] = '\n'; // if continue with the buffer, insert a newline
+  }
+}
+
+
+// Once this module is loaded, switch to this routine
+// which outputs to stderr and the delayed output buffer.
+static void mi_cdecl mi_out_buf_stderr(const char* msg, void* arg) {
+  mi_out_stderr(msg,arg);
+  mi_out_buf(msg,arg);
+}
+
+
+
+// --------------------------------------------------------
+// Default output handler
+// --------------------------------------------------------
+
+// Should be atomic but gives errors on many platforms as generally we cannot cast a function pointer to a uintptr_t.
+// For now, don't register output from multiple threads.
+static mi_output_fun* volatile mi_out_default; // = NULL
+static _Atomic(void*) mi_out_arg; // = NULL
+
+static mi_output_fun* mi_out_get_default(void** parg) {
+  if (parg != NULL) { *parg = mi_atomic_load_ptr_acquire(void,&mi_out_arg); }
+  mi_output_fun* out = mi_out_default;
+  return (out == NULL ? &mi_out_buf : out);
+}
+
+void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept {
+  mi_out_default = (out == NULL ? &mi_out_stderr : out); // stop using the delayed output buffer
+  mi_atomic_store_ptr_release(void,&mi_out_arg, arg);
+  if (out!=NULL) mi_out_buf_flush(out,true,arg);         // output all the delayed output now
+}
+
+// add stderr to the delayed output after the module is loaded
+static void mi_add_stderr_output() {
+  mi_assert_internal(mi_out_default == NULL);
+  mi_out_buf_flush(&mi_out_stderr, false, NULL); // flush current contents to stderr
+  mi_out_default = &mi_out_buf_stderr;           // and add stderr to the delayed output
+}
+
+// --------------------------------------------------------
+// Messages, all end up calling `_mi_fputs`.
+// --------------------------------------------------------
+static _Atomic(size_t) error_count;   // = 0;  // when >= max_error_count stop emitting errors
+static _Atomic(size_t) warning_count; // = 0;  // when >= max_warning_count stop emitting warnings
+
+// When overriding malloc, we may recurse into mi_vfprintf if an allocation
+// inside the C runtime causes another message.
+// In some cases (like on macOS) the loader already allocates which
+// calls into mimalloc; if we then access thread locals (like `recurse`)
+// this may crash as the access may call _tlv_bootstrap that tries to
+// (recursively) invoke malloc again to allocate space for the thread local
+// variables on demand. This is why we use a _mi_preloading test on such
+// platforms. However, C code generator may move the initial thread local address
+// load before the `if` and we therefore split it out in a separate funcion.
+static mi_decl_thread bool recurse = false;
+
+static mi_decl_noinline bool mi_recurse_enter_prim(void) {
+  if (recurse) return false;
+  recurse = true;
+  return true;
+}
+
+static mi_decl_noinline void mi_recurse_exit_prim(void) {
+  recurse = false;
+}
+
+static bool mi_recurse_enter(void) {
+  #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
+  if (_mi_preloading()) return true;
+  #endif
+  return mi_recurse_enter_prim();
+}
+
+static void mi_recurse_exit(void) {
+  #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
+  if (_mi_preloading()) return;
+  #endif
+  mi_recurse_exit_prim();
+}
+
+void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message) {
+  if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) { // TODO: use mi_out_stderr for stderr?
+    if (!mi_recurse_enter()) return;
+    out = mi_out_get_default(&arg);
+    if (prefix != NULL) out(prefix, arg);
+    out(message, arg);
+    mi_recurse_exit();
+  }
+  else {
+    if (prefix != NULL) out(prefix, arg);
+    out(message, arg);
+  }
+}
+
+// Define our own limited `fprintf` that avoids memory allocation.
+// We do this using `snprintf` with a limited buffer.
+static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) {
+  char buf[512];
+  if (fmt==NULL) return;
+  if (!mi_recurse_enter()) return;
+  vsnprintf(buf,sizeof(buf)-1,fmt,args);
+  mi_recurse_exit();
+  _mi_fputs(out,arg,prefix,buf);
+}
+
+void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... ) {
+  va_list args;
+  va_start(args,fmt);
+  mi_vfprintf(out,arg,NULL,fmt,args);
+  va_end(args);
+}
+
+static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args) {
+  if (prefix != NULL && strlen(prefix) <= 32 && !_mi_is_main_thread()) {
+    char tprefix[64];
+    snprintf(tprefix, sizeof(tprefix), "%sthread 0x%llx: ", prefix, (unsigned long long)_mi_thread_id());
+    mi_vfprintf(out, arg, tprefix, fmt, args);
+  }
+  else {
+    mi_vfprintf(out, arg, prefix, fmt, args);
+  }
+}
+
+void _mi_trace_message(const char* fmt, ...) {
+  if (mi_option_get(mi_option_verbose) <= 1) return;  // only with verbose level 2 or higher
+  va_list args;
+  va_start(args, fmt);
+  mi_vfprintf_thread(NULL, NULL, "mimalloc: ", fmt, args);
+  va_end(args);
+}
+
+void _mi_verbose_message(const char* fmt, ...) {
+  if (!mi_option_is_enabled(mi_option_verbose)) return;
+  va_list args;
+  va_start(args,fmt);
+  mi_vfprintf(NULL, NULL, "mimalloc: ", fmt, args);
+  va_end(args);
+}
+
+static void mi_show_error_message(const char* fmt, va_list args) {
+  if (!mi_option_is_enabled(mi_option_verbose)) {
+    if (!mi_option_is_enabled(mi_option_show_errors)) return;
+    if (mi_max_error_count >= 0 && (long)mi_atomic_increment_acq_rel(&error_count) > mi_max_error_count) return;
+  }
+  mi_vfprintf_thread(NULL, NULL, "mimalloc: error: ", fmt, args);
+}
+
+void _mi_warning_message(const char* fmt, ...) {
+  if (!mi_option_is_enabled(mi_option_verbose)) {
+    if (!mi_option_is_enabled(mi_option_show_errors)) return;
+    if (mi_max_warning_count >= 0 && (long)mi_atomic_increment_acq_rel(&warning_count) > mi_max_warning_count) return;
+  }
+  va_list args;
+  va_start(args,fmt);
+  mi_vfprintf_thread(NULL, NULL, "mimalloc: warning: ", fmt, args);
+  va_end(args);
+}
+
+
+#if MI_DEBUG
+void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) {
+  _mi_fprintf(NULL, NULL, "mimalloc: assertion failed: at \"%s\":%u, %s\n  assertion: \"%s\"\n", fname, line, (func==NULL?"":func), assertion);
+  abort();
+}
+#endif
+
+// --------------------------------------------------------
+// Errors
+// --------------------------------------------------------
+
+static mi_error_fun* volatile  mi_error_handler; // = NULL
+static _Atomic(void*) mi_error_arg;     // = NULL
+
+static void mi_error_default(int err) {
+  MI_UNUSED(err);
+#if (MI_DEBUG>0)
+  if (err==EFAULT) {
+    #ifdef _MSC_VER
+    __debugbreak();
+    #endif
+    abort();
+  }
+#endif
+#if (MI_SECURE>0)
+  if (err==EFAULT) {  // abort on serious errors in secure mode (corrupted meta-data)
+    abort();
+  }
+#endif
+#if defined(MI_XMALLOC)
+  if (err==ENOMEM || err==EOVERFLOW) { // abort on memory allocation fails in xmalloc mode
+    abort();
+  }
+#endif
+}
+
+void mi_register_error(mi_error_fun* fun, void* arg) {
+  mi_error_handler = fun;  // can be NULL
+  mi_atomic_store_ptr_release(void,&mi_error_arg, arg);
+}
+
+void _mi_error_message(int err, const char* fmt, ...) {
+  // show detailed error message
+  va_list args;
+  va_start(args, fmt);
+  mi_show_error_message(fmt, args);
+  va_end(args);
+  // and call the error handler which may abort (or return normally)
+  if (mi_error_handler != NULL) {
+    mi_error_handler(err, mi_atomic_load_ptr_acquire(void,&mi_error_arg));
+  }
+  else {
+    mi_error_default(err);
+  }
+}
+
+// --------------------------------------------------------
+// Initialize options by checking the environment
+// --------------------------------------------------------
+
+static void mi_strlcpy(char* dest, const char* src, size_t dest_size) {
+  if (dest==NULL || src==NULL || dest_size == 0) return;
+  // copy until end of src, or when dest is (almost) full
+  while (*src != 0 && dest_size > 1) {
+    *dest++ = *src++;
+    dest_size--;
+  }
+  // always zero terminate
+  *dest = 0;
+}
+
+static void mi_strlcat(char* dest, const char* src, size_t dest_size) {
+  if (dest==NULL || src==NULL || dest_size == 0) return;
+  // find end of string in the dest buffer
+  while (*dest != 0 && dest_size > 1) {
+    dest++;
+    dest_size--;
+  }
+  // and catenate
+  mi_strlcpy(dest, src, dest_size);
+}
+
+#ifdef MI_NO_GETENV
+static bool mi_getenv(const char* name, char* result, size_t result_size) {
+  MI_UNUSED(name);
+  MI_UNUSED(result);
+  MI_UNUSED(result_size);
+  return false;
+}
+#else
+#if defined _WIN32
+// On Windows use GetEnvironmentVariable instead of getenv to work
+// reliably even when this is invoked before the C runtime is initialized.
+// i.e. when `_mi_preloading() == true`.
+// Note: on windows, environment names are not case sensitive.
+#include <windows.h>
+static bool mi_getenv(const char* name, char* result, size_t result_size) {
+  result[0] = 0;
+  size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size);
+  return (len > 0 && len < result_size);
+}
+#elif !defined(MI_USE_ENVIRON) || (MI_USE_ENVIRON!=0)
+// On Posix systemsr use `environ` to acces environment variables
+// even before the C runtime is initialized.
+#if defined(__APPLE__) && defined(__has_include) && __has_include(<crt_externs.h>)
+#include <crt_externs.h>
+static char** mi_get_environ(void) {
+  return (*_NSGetEnviron());
+}
+#else
+extern char** environ;
+static char** mi_get_environ(void) {
+  return environ;
+}
+#endif
+static int mi_strnicmp(const char* s, const char* t, size_t n) {
+  if (n == 0) return 0;
+  for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) {
+    if (toupper(*s) != toupper(*t)) break;
+  }
+  return (n == 0 ? 0 : *s - *t);
+}
+static bool mi_getenv(const char* name, char* result, size_t result_size) {
+  if (name==NULL) return false;
+  const size_t len = strlen(name);
+  if (len == 0) return false;
+  char** env = mi_get_environ();
+  if (env == NULL) return false;
+  // compare up to 256 entries
+  for (int i = 0; i < 256 && env[i] != NULL; i++) {
+    const char* s = env[i];
+    if (mi_strnicmp(name, s, len) == 0 && s[len] == '=') { // case insensitive
+      // found it
+      mi_strlcpy(result, s + len + 1, result_size);
+      return true;
+    }
+  }
+  return false;
+}
+#else
+// fallback: use standard C `getenv` but this cannot be used while initializing the C runtime
+static bool mi_getenv(const char* name, char* result, size_t result_size) {
+  // cannot call getenv() when still initializing the C runtime.
+  if (_mi_preloading()) return false;
+  const char* s = getenv(name);
+  if (s == NULL) {
+    // we check the upper case name too.
+    char buf[64+1];
+    size_t len = strlen(name);
+    if (len >= sizeof(buf)) len = sizeof(buf) - 1;
+    for (size_t i = 0; i < len; i++) {
+      buf[i] = toupper(name[i]);
+    }
+    buf[len] = 0;
+    s = getenv(buf);
+  }
+  if (s != NULL && strlen(s) < result_size) {
+    mi_strlcpy(result, s, result_size);
+    return true;
+  }
+  else {
+    return false;
+  }
+}
+#endif  // !MI_USE_ENVIRON
+#endif  // !MI_NO_GETENV
+
+static void mi_option_init(mi_option_desc_t* desc) {
+  // Read option value from the environment
+  char s[64+1];
+  char buf[64+1];
+  mi_strlcpy(buf, "mimalloc_", sizeof(buf));
+  mi_strlcat(buf, desc->name, sizeof(buf));
+  bool found = mi_getenv(buf,s,sizeof(s));
+  if (!found && desc->legacy_name != NULL) {
+    mi_strlcpy(buf, "mimalloc_", sizeof(buf));
+    mi_strlcat(buf, desc->legacy_name, sizeof(buf));
+    found = mi_getenv(buf,s,sizeof(s));
+    if (found) {
+      _mi_warning_message("environment option \"mimalloc_%s\" is deprecated -- use \"mimalloc_%s\" instead.\n", desc->legacy_name, desc->name );
+    }    
+  }
+
+  if (found) {
+    size_t len = strlen(s);
+    if (len >= sizeof(buf)) len = sizeof(buf) - 1;
+    for (size_t i = 0; i < len; i++) {
+      buf[i] = (char)toupper(s[i]);
+    }
+    buf[len] = 0;
+    if (buf[0]==0 || strstr("1;TRUE;YES;ON", buf) != NULL) {
+      desc->value = 1;
+      desc->init = INITIALIZED;
+    }
+    else if (strstr("0;FALSE;NO;OFF", buf) != NULL) {
+      desc->value = 0;
+      desc->init = INITIALIZED;
+    }
+    else {
+      char* end = buf;
+      long value = strtol(buf, &end, 10);
+      if (desc->option == mi_option_reserve_os_memory) {
+        // this option is interpreted in KiB to prevent overflow of `long`
+        if (*end == 'K') { end++; }
+        else if (*end == 'M') { value *= MI_KiB; end++; }
+        else if (*end == 'G') { value *= MI_MiB; end++; }
+        else { value = (value + MI_KiB - 1) / MI_KiB; }
+        if (end[0] == 'I' && end[1] == 'B') { end += 2; }
+        else if (*end == 'B') { end++; }
+      }
+      if (*end == 0) {
+        desc->value = value;
+        desc->init = INITIALIZED;
+      }
+      else {
+        // set `init` first to avoid recursion through _mi_warning_message on mimalloc_verbose.
+        desc->init = DEFAULTED;
+        if (desc->option == mi_option_verbose && desc->value == 0) {
+          // if the 'mimalloc_verbose' env var has a bogus value we'd never know
+          // (since the value defaults to 'off') so in that case briefly enable verbose
+          desc->value = 1;
+          _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name );
+          desc->value = 0;
+        }
+        else {
+          _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name );
+        }
+      }
+    }
+    mi_assert_internal(desc->init != UNINIT);
+  }
+  else if (!_mi_preloading()) {
+    desc->init = DEFAULTED;
+  }
+}
diff --git a/Objects/mimalloc/os.c b/Objects/mimalloc/os.c
new file mode 100644
index 00000000000000..0f98474170e9e8
--- /dev/null
+++ b/Objects/mimalloc/os.c
@@ -0,0 +1,1479 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#ifndef _DEFAULT_SOURCE
+#define _DEFAULT_SOURCE   // ensure mmap flags are defined
+#endif
+
+#if defined(__sun)
+// illumos provides new mman.h api when any of these are defined
+// otherwise the old api based on caddr_t which predates the void pointers one.
+// stock solaris provides only the former, chose to atomically to discard those
+// flags only here rather than project wide tough.
+#undef _XOPEN_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h>  // strerror
+
+#ifdef _MSC_VER
+#pragma warning(disable:4996)  // strerror
+#endif
+
+#if defined(__wasi__)
+#define MI_USE_SBRK
+#endif
+
+#if defined(_WIN32)
+#include <windows.h>
+#elif defined(__wasi__)
+#include <unistd.h>    // sbrk
+#else
+#include <sys/mman.h>  // mmap
+#include <unistd.h>    // sysconf
+#if defined(__linux__)
+#include <features.h>
+#include <fcntl.h>
+#if defined(__GLIBC__)
+#include <linux/mman.h> // linux mmap flags
+#else
+#include <sys/mman.h>
+#endif
+#endif
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR
+#include <mach/vm_statistics.h>
+#endif
+#endif
+#if defined(__FreeBSD__) || defined(__DragonFly__)
+#include <sys/param.h>
+#if __FreeBSD_version >= 1200000
+#include <sys/cpuset.h>
+#include <sys/domainset.h>
+#endif
+#include <sys/sysctl.h>
+#endif
+#endif
+
+/* -----------------------------------------------------------
+  Initialization.
+  On windows initializes support for aligned allocation and
+  large OS pages (if MIMALLOC_LARGE_OS_PAGES is true).
+----------------------------------------------------------- */
+bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
+bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats);
+
+static void* mi_align_up_ptr(void* p, size_t alignment) {
+  return (void*)_mi_align_up((uintptr_t)p, alignment);
+}
+
+static void* mi_align_down_ptr(void* p, size_t alignment) {
+  return (void*)_mi_align_down((uintptr_t)p, alignment);
+}
+
+
+// page size (initialized properly in `os_init`)
+static size_t os_page_size = 4096;
+
+// minimal allocation granularity
+static size_t os_alloc_granularity = 4096;
+
+// if non-zero, use large page allocation
+static size_t large_os_page_size = 0;
+
+// is memory overcommit allowed?
+// set dynamically in _mi_os_init (and if true we use MAP_NORESERVE)
+static bool os_overcommit = true;
+
+bool _mi_os_has_overcommit(void) {
+  return os_overcommit;
+}
+
+// OS (small) page size
+size_t _mi_os_page_size(void) {
+  return os_page_size;
+}
+
+// if large OS pages are supported (2 or 4MiB), then return the size, otherwise return the small page size (4KiB)
+size_t _mi_os_large_page_size(void) {
+  return (large_os_page_size != 0 ? large_os_page_size : _mi_os_page_size());
+}
+
+#if !defined(MI_USE_SBRK) && !defined(__wasi__)
+static bool use_large_os_page(size_t size, size_t alignment) {
+  // if we have access, check the size and alignment requirements
+  if (large_os_page_size == 0 || !mi_option_is_enabled(mi_option_large_os_pages)) return false;
+  return ((size % large_os_page_size) == 0 && (alignment % large_os_page_size) == 0);
+}
+#endif
+
+// round to a good OS allocation size (bounded by max 12.5% waste)
+size_t _mi_os_good_alloc_size(size_t size) {
+  size_t align_size;
+  if (size < 512*MI_KiB) align_size = _mi_os_page_size();
+  else if (size < 2*MI_MiB) align_size = 64*MI_KiB;
+  else if (size < 8*MI_MiB) align_size = 256*MI_KiB;
+  else if (size < 32*MI_MiB) align_size = 1*MI_MiB;
+  else align_size = 4*MI_MiB;
+  if mi_unlikely(size >= (SIZE_MAX - align_size)) return size; // possible overflow?
+  return _mi_align_up(size, align_size);
+}
+
+#if defined(_WIN32)
+// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016.
+// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility)
+// NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB)
+// We define a minimal MEM_EXTENDED_PARAMETER ourselves in order to be able to compile with older SDK's.
+typedef enum MI_MEM_EXTENDED_PARAMETER_TYPE_E {
+  MiMemExtendedParameterInvalidType = 0,
+  MiMemExtendedParameterAddressRequirements,
+  MiMemExtendedParameterNumaNode,
+  MiMemExtendedParameterPartitionHandle,
+  MiMemExtendedParameterUserPhysicalHandle,
+  MiMemExtendedParameterAttributeFlags,
+  MiMemExtendedParameterMax
+} MI_MEM_EXTENDED_PARAMETER_TYPE;
+
+typedef struct DECLSPEC_ALIGN(8) MI_MEM_EXTENDED_PARAMETER_S {
+  struct { DWORD64 Type : 8; DWORD64 Reserved : 56; } Type;
+  union  { DWORD64 ULong64; PVOID Pointer; SIZE_T Size; HANDLE Handle; DWORD ULong; } Arg;
+} MI_MEM_EXTENDED_PARAMETER;
+
+typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S {
+  PVOID  LowestStartingAddress;
+  PVOID  HighestEndingAddress;
+  SIZE_T Alignment;
+} MI_MEM_ADDRESS_REQUIREMENTS;
+
+#define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE   0x00000010
+
+#include <winternl.h>
+typedef PVOID    (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
+typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
+static PVirtualAlloc2 pVirtualAlloc2 = NULL;
+static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;
+
+// Similarly, GetNumaProcesorNodeEx is only supported since Windows 7
+typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER;
+
+typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber);
+typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber);
+typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask);
+typedef BOOL (__stdcall *PGetNumaProcessorNode)(UCHAR Processor, PUCHAR NodeNumber);
+static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL;
+static PGetNumaProcessorNodeEx      pGetNumaProcessorNodeEx = NULL;
+static PGetNumaNodeProcessorMaskEx  pGetNumaNodeProcessorMaskEx = NULL;
+static PGetNumaProcessorNode        pGetNumaProcessorNode = NULL;
+
+static bool mi_win_enable_large_os_pages(void)
+{
+  if (large_os_page_size > 0) return true;
+
+  // Try to see if large OS pages are supported
+  // To use large pages on Windows, we first need access permission
+  // Set "Lock pages in memory" permission in the group policy editor
+  // <https://devblogs.microsoft.com/oldnewthing/20110128-00/?p=11643>
+  unsigned long err = 0;
+  HANDLE token = NULL;
+  BOOL ok = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token);
+  if (ok) {
+    TOKEN_PRIVILEGES tp;
+    ok = LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid);
+    if (ok) {
+      tp.PrivilegeCount = 1;
+      tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+      ok = AdjustTokenPrivileges(token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);
+      if (ok) {
+        err = GetLastError();
+        ok = (err == ERROR_SUCCESS);
+        if (ok) {
+          large_os_page_size = GetLargePageMinimum();
+        }
+      }
+    }
+    CloseHandle(token);
+  }
+  if (!ok) {
+    if (err == 0) err = GetLastError();
+    _mi_warning_message("cannot enable large OS page support, error %lu\n", err);
+  }
+  return (ok!=0);
+}
+
+void _mi_os_init(void)
+{
+  os_overcommit = false;
+  // get the page size
+  SYSTEM_INFO si;
+  GetSystemInfo(&si);
+  if (si.dwPageSize > 0) os_page_size = si.dwPageSize;
+  if (si.dwAllocationGranularity > 0) os_alloc_granularity = si.dwAllocationGranularity;
+  // get the VirtualAlloc2 function
+  HINSTANCE  hDll;
+  hDll = LoadLibrary(TEXT("kernelbase.dll"));
+  if (hDll != NULL) {
+    // use VirtualAlloc2FromApp if possible as it is available to Windows store apps
+    pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp");
+    if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2");
+    FreeLibrary(hDll);
+  }
+  // NtAllocateVirtualMemoryEx is used for huge page allocation
+  hDll = LoadLibrary(TEXT("ntdll.dll"));
+  if (hDll != NULL) {
+    pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx");
+    FreeLibrary(hDll);
+  }
+  // Try to use Win7+ numa API
+  hDll = LoadLibrary(TEXT("kernel32.dll"));
+  if (hDll != NULL) {
+    pGetCurrentProcessorNumberEx = (PGetCurrentProcessorNumberEx)(void (*)(void))GetProcAddress(hDll, "GetCurrentProcessorNumberEx");
+    pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx");
+    pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx");
+    pGetNumaProcessorNode = (PGetNumaProcessorNode)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNode");
+    FreeLibrary(hDll);
+  }
+  if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
+    mi_win_enable_large_os_pages();
+  }
+}
+#elif defined(__wasi__)
+void _mi_os_init(void) {
+  os_overcommit = false;
+  os_page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB
+  os_alloc_granularity = 16;
+}
+
+#else  // generic unix
+
+static void os_detect_overcommit(void) {
+#if defined(__linux__)
+  int fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
+	if (fd < 0) return;
+  char buf[32];
+  ssize_t nread = read(fd, &buf, sizeof(buf));
+	close(fd);
+  // <https://www.kernel.org/doc/Documentation/vm/overcommit-accounting>
+  // 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE)
+  if (nread >= 1) {
+    os_overcommit = (buf[0] == '0' || buf[0] == '1');
+  }
+#elif defined(__FreeBSD__)
+  int val = 0;
+  size_t olen = sizeof(val);
+  if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) {
+    os_overcommit = (val != 0);
+  }
+#else
+  // default: overcommit is true
+#endif
+}
+
+void _mi_os_init(void) {
+  // get the page size
+  long result = sysconf(_SC_PAGESIZE);
+  if (result > 0) {
+    os_page_size = (size_t)result;
+    os_alloc_granularity = os_page_size;
+  }
+  large_os_page_size = 2*MI_MiB; // TODO: can we query the OS for this?
+  os_detect_overcommit();
+}
+#endif
+
+
+#if defined(MADV_NORMAL)
+static int mi_madvise(void* addr, size_t length, int advice) {
+  #if defined(__sun)
+  return madvise((caddr_t)addr, length, advice);  // Solaris needs cast (issue #520)
+  #else
+  return madvise(addr, length, advice);
+  #endif
+}
+#endif
+
+
+/* -----------------------------------------------------------
+  aligned hinting
+-------------------------------------------------------------- */
+
+// On 64-bit systems, we can do efficient aligned allocation by using
+// the 2TiB to 30TiB area to allocate those.
+#if (MI_INTPTR_SIZE >= 8)
+static mi_decl_cache_align _Atomic(uintptr_t)aligned_base;
+
+// Return a MI_SEGMENT_SIZE aligned address that is probably available.
+// If this returns NULL, the OS will determine the address but on some OS's that may not be
+// properly aligned which can be more costly as it needs to be adjusted afterwards.
+// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization;
+// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses
+//  in the middle of the 2TiB - 6TiB address range (see issue #372))
+
+#define MI_HINT_BASE ((uintptr_t)2 << 40)  // 2TiB start
+#define MI_HINT_AREA ((uintptr_t)4 << 40)  // upto 6TiB   (since before win8 there is "only" 8TiB available to processes)
+#define MI_HINT_MAX  ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages)
+
+static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size)
+{
+  if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL;
+  size = _mi_align_up(size, MI_SEGMENT_SIZE);
+  if (size > 1*MI_GiB) return NULL;  // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096.
+  #if (MI_SECURE>0)
+  size += MI_SEGMENT_SIZE;        // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas.
+  #endif
+
+  uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size);
+  if (hint == 0 || hint > MI_HINT_MAX) {   // wrap or initialize
+    uintptr_t init = MI_HINT_BASE;
+    #if (MI_SECURE>0 || MI_DEBUG==0)       // security: randomize start of aligned allocations unless in debug mode
+    uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
+    init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA);  // (randomly 20 bits)*4MiB == 0 to 4TiB
+    #endif
+    uintptr_t expected = hint + size;
+    mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init);
+    hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all
+  }
+  if (hint%try_alignment != 0) return NULL;
+  return (void*)hint;
+}
+#else
+static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
+  MI_UNUSED(try_alignment); MI_UNUSED(size);
+  return NULL;
+}
+#endif
+
+/* -----------------------------------------------------------
+  Free memory
+-------------------------------------------------------------- */
+
+static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats_t* stats)
+{
+  if (addr == NULL || size == 0) return true; // || _mi_os_is_huge_reserved(addr)
+  bool err = false;
+#if defined(_WIN32)
+  DWORD errcode = 0;
+  err = (VirtualFree(addr, 0, MEM_RELEASE) == 0);
+  if (err) { errcode = GetLastError(); }
+  if (errcode == ERROR_INVALID_ADDRESS) {
+    // In mi_os_mem_alloc_aligned the fallback path may have returned a pointer inside
+    // the memory region returned by VirtualAlloc; in that case we need to free using
+    // the start of the region.
+    MEMORY_BASIC_INFORMATION info = { 0 };
+    VirtualQuery(addr, &info, sizeof(info));
+    if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) {
+      errcode = 0;
+      err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0);
+      if (err) { errcode = GetLastError(); }
+    }
+  }
+  if (errcode != 0) {
+    _mi_warning_message("unable to release OS memory: error code 0x%x, addr: %p, size: %zu\n", errcode, addr, size);
+  }
+#elif defined(MI_USE_SBRK) || defined(__wasi__)
+  err = false; // sbrk heap cannot be shrunk
+#else
+  err = (munmap(addr, size) == -1);
+  if (err) {
+    _mi_warning_message("unable to release OS memory: %s, addr: %p, size: %zu\n", strerror(errno), addr, size);
+  }
+#endif
+  if (was_committed) { _mi_stat_decrease(&stats->committed, size); }
+  _mi_stat_decrease(&stats->reserved, size);
+  return !err;
+}
+
+
+/* -----------------------------------------------------------
+  Raw allocation on Windows (VirtualAlloc)
+-------------------------------------------------------------- */
+
+#ifdef _WIN32
+ 
+#define MEM_COMMIT_RESERVE  (MEM_COMMIT|MEM_RESERVE)
+
+static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) {
+#if (MI_INTPTR_SIZE >= 8)
+  // on 64-bit systems, try to use the virtual address area after 2TiB for 4MiB aligned allocations
+  if (addr == NULL) {
+    void* hint = mi_os_get_aligned_hint(try_alignment,size);
+    if (hint != NULL) {
+      void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE);
+      if (p != NULL) return p;
+      _mi_verbose_message("warning: unable to allocate hinted aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), hint, try_alignment, flags);
+      // fall through on error
+    }
+  }
+#endif
+  // on modern Windows try use VirtualAlloc2 for aligned allocation
+  if (try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
+    MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 };
+    reqs.Alignment = try_alignment;
+    MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} };
+    param.Type.Type = MiMemExtendedParameterAddressRequirements;
+    param.Arg.Pointer = &reqs;
+    void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, &param, 1);
+    if (p != NULL) return p;
+    _mi_warning_message("unable to allocate aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags);
+    // fall through on error
+  }
+  // last resort
+  return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
+}
+
+static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) {
+  mi_assert_internal(!(large_only && !allow_large));
+  static _Atomic(size_t) large_page_try_ok; // = 0;
+  void* p = NULL;
+  // Try to allocate large OS pages (2MiB) if allowed or required.
+  if ((large_only || use_large_os_page(size, try_alignment))
+      && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) {
+    size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
+    if (!large_only && try_ok > 0) {
+      // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive.
+      // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times.
+      mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
+    }
+    else {
+      // large OS pages must always reserve and commit.
+      *is_large = true;
+      p = mi_win_virtual_allocx(addr, size, try_alignment, flags | MEM_LARGE_PAGES);
+      if (large_only) return p;
+      // fall back to non-large page allocation on error (`p == NULL`).
+      if (p == NULL) {
+        mi_atomic_store_release(&large_page_try_ok,10UL);  // on error, don't try again for the next N allocations
+      }
+    }
+  }
+  // Fall back to regular page allocation
+  if (p == NULL) {
+    *is_large = ((flags&MEM_LARGE_PAGES) != 0);
+    p = mi_win_virtual_allocx(addr, size, try_alignment, flags);
+  }
+  if (p == NULL) {
+    _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x, large only: %d, allow large: %d)\n", size, GetLastError(), addr, try_alignment, flags, large_only, allow_large);
+  }
+  return p;
+}
+
+/* -----------------------------------------------------------
+  Raw allocation using `sbrk` or `wasm_memory_grow`
+-------------------------------------------------------------- */
+
+#elif defined(MI_USE_SBRK) || defined(__wasi__)
+#if defined(MI_USE_SBRK)
+  static void* mi_memory_grow( size_t size ) {
+    void* p = sbrk(size);
+    if (p == (void*)(-1)) return NULL;
+    #if !defined(__wasi__) // on wasi this is always zero initialized already (?)
+    memset(p,0,size);
+    #endif
+    return p;
+  }
+#elif defined(__wasi__)
+  static void* mi_memory_grow( size_t size ) {
+    size_t base = (size > 0 ? __builtin_wasm_memory_grow(0,_mi_divide_up(size, _mi_os_page_size()))
+                            : __builtin_wasm_memory_size(0));
+    if (base == SIZE_MAX) return NULL;
+    return (void*)(base * _mi_os_page_size());
+  }
+#endif
+
+#if defined(MI_USE_PTHREADS)
+static pthread_mutex_t mi_heap_grow_mutex = PTHREAD_MUTEX_INITIALIZER;
+#endif
+
+static void* mi_heap_grow(size_t size, size_t try_alignment) {
+  void* p = NULL;
+  if (try_alignment <= 1) {
+    // `sbrk` is not thread safe in general so try to protect it (we could skip this on WASM but leave it in for now)
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_lock(&mi_heap_grow_mutex);
+    #endif
+    p = mi_memory_grow(size);
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_unlock(&mi_heap_grow_mutex);
+    #endif
+  }
+  else {
+    void* base = NULL;
+    size_t alloc_size = 0;
+    // to allocate aligned use a lock to try to avoid thread interaction
+    // between getting the current size and actual allocation
+    // (also, `sbrk` is not thread safe in general)
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_lock(&mi_heap_grow_mutex);
+    #endif
+    {
+      void* current = mi_memory_grow(0);  // get current size
+      if (current != NULL) {
+        void* aligned_current = mi_align_up_ptr(current, try_alignment);  // and align from there to minimize wasted space
+        alloc_size = _mi_align_up( ((uint8_t*)aligned_current - (uint8_t*)current) + size, _mi_os_page_size());
+        base = mi_memory_grow(alloc_size);
+      }
+    }
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_unlock(&mi_heap_grow_mutex);
+    #endif
+    if (base != NULL) {
+      p = mi_align_up_ptr(base, try_alignment);
+      if ((uint8_t*)p + size > (uint8_t*)base + alloc_size) {
+        // another thread used wasm_memory_grow/sbrk in-between and we do not have enough
+        // space after alignment. Give up (and waste the space as we cannot shrink :-( )
+        // (in `mi_os_mem_alloc_aligned` this will fall back to overallocation to align)
+        p = NULL;
+      }
+    }
+  }
+  if (p == NULL) {
+    _mi_warning_message("unable to allocate sbrk/wasm_memory_grow OS memory (%zu bytes, %zu alignment)\n", size, try_alignment);
+    errno = ENOMEM;
+    return NULL;
+  }
+  mi_assert_internal( try_alignment == 0 || (uintptr_t)p % try_alignment == 0 );
+  return p;
+}
+
+/* -----------------------------------------------------------
+  Raw allocation on Unix's (mmap)
+-------------------------------------------------------------- */
+#else
+#define MI_OS_USE_MMAP
+static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) {
+  MI_UNUSED(try_alignment);
+  #if defined(MAP_ALIGNED)  // BSD
+  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
+    size_t n = mi_bsr(try_alignment);
+    if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) {  // alignment is a power of 2 and 4096 <= alignment <= 1GiB
+      flags |= MAP_ALIGNED(n);
+      void* p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0);
+      if (p!=MAP_FAILED) return p;
+      // fall back to regular mmap
+    }
+  }
+  #elif defined(MAP_ALIGN)  // Solaris
+  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
+    void* p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0);  // addr parameter is the required alignment
+    if (p!=MAP_FAILED) return p;
+    // fall back to regular mmap
+  }
+  #endif
+  #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED)
+  // on 64-bit systems, use the virtual address area after 2TiB for 4MiB aligned allocations
+  if (addr == NULL) {
+    void* hint = mi_os_get_aligned_hint(try_alignment, size);
+    if (hint != NULL) {
+      void* p = mmap(hint, size, protect_flags, flags, fd, 0);
+      if (p!=MAP_FAILED) return p;
+      // fall back to regular mmap
+    }
+  }
+  #endif
+  // regular mmap
+  void* p = mmap(addr, size, protect_flags, flags, fd, 0);
+  if (p!=MAP_FAILED) return p;
+  // failed to allocate
+  return NULL;
+}
+
+static int mi_unix_mmap_fd(void) {
+#if defined(VM_MAKE_TAG)
+  // macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99)
+  int os_tag = (int)mi_option_get(mi_option_os_tag);
+  if (os_tag < 100 || os_tag > 255) os_tag = 100;
+  return VM_MAKE_TAG(os_tag);
+#else
+  return -1;
+#endif
+}
+
+static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) {
+  void* p = NULL;
+  #if !defined(MAP_ANONYMOUS)
+  #define MAP_ANONYMOUS  MAP_ANON
+  #endif
+  #if !defined(MAP_NORESERVE)
+  #define MAP_NORESERVE  0
+  #endif
+  const int fd = mi_unix_mmap_fd();
+  int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+  if (_mi_os_has_overcommit()) {
+    flags |= MAP_NORESERVE;
+  }
+  #if defined(PROT_MAX)
+  protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD
+  #endif    
+  // huge page allocation
+  if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) {
+    static _Atomic(size_t) large_page_try_ok; // = 0;
+    size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
+    if (!large_only && try_ok > 0) {
+      // If the OS is not configured for large OS pages, or the user does not have
+      // enough permission, the `mmap` will always fail (but it might also fail for other reasons).
+      // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times
+      // to avoid too many failing calls to mmap.
+      mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
+    }
+    else {
+      int lflags = flags & ~MAP_NORESERVE;  // using NORESERVE on huge pages seems to fail on Linux
+      int lfd = fd;
+      #ifdef MAP_ALIGNED_SUPER
+      lflags |= MAP_ALIGNED_SUPER;
+      #endif
+      #ifdef MAP_HUGETLB
+      lflags |= MAP_HUGETLB;
+      #endif
+      #ifdef MAP_HUGE_1GB
+      static bool mi_huge_pages_available = true;
+      if ((size % MI_GiB) == 0 && mi_huge_pages_available) {
+        lflags |= MAP_HUGE_1GB;
+      }
+      else
+      #endif
+      {
+        #ifdef MAP_HUGE_2MB
+        lflags |= MAP_HUGE_2MB;
+        #endif
+      }
+      #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB
+      lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
+      #endif
+      if (large_only || lflags != flags) {
+        // try large OS page allocation
+        *is_large = true;
+        p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd);
+        #ifdef MAP_HUGE_1GB
+        if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) {
+          mi_huge_pages_available = false; // don't try huge 1GiB pages again
+          _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %i)\n", errno);
+          lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB);
+          p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd);
+        }
+        #endif
+        if (large_only) return p;
+        if (p == NULL) {
+          mi_atomic_store_release(&large_page_try_ok, (size_t)8);  // on error, don't try again for the next N allocations
+        }
+      }
+    }
+  }
+  // regular allocation
+  if (p == NULL) {
+    *is_large = false;
+    p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd);
+    if (p != NULL) {
+      #if defined(MADV_HUGEPAGE)
+      // Many Linux systems don't allow MAP_HUGETLB but they support instead
+      // transparent huge pages (THP). Generally, it is not required to call `madvise` with MADV_HUGE
+      // though since properly aligned allocations will already use large pages if available
+      // in that case -- in particular for our large regions (in `memory.c`).
+      // However, some systems only allow THP if called with explicit `madvise`, so
+      // when large OS pages are enabled for mimalloc, we call `madvise` anyways.
+      if (allow_large && use_large_os_page(size, try_alignment)) {
+        if (mi_madvise(p, size, MADV_HUGEPAGE) == 0) {
+          *is_large = true; // possibly
+        };
+      }
+      #elif defined(__sun)
+      if (allow_large && use_large_os_page(size, try_alignment)) {
+        struct memcntl_mha cmd = {0};
+        cmd.mha_pagesize = large_os_page_size;
+        cmd.mha_cmd = MHA_MAPSIZE_VA;
+        if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) {
+          *is_large = true;
+        }
+      }
+      #endif
+    }
+  }
+  if (p == NULL) {
+    _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: %i, address: %p, large only: %d, allow large: %d)\n", size, errno, addr, large_only, allow_large);
+  }
+  return p;
+}
+#endif
+
+
+/* -----------------------------------------------------------
+   Primitive allocation from the OS.
+-------------------------------------------------------------- */
+
+// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
+static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) {
+  mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
+  if (size == 0) return NULL;
+  if (!commit) allow_large = false;
+  if (try_alignment == 0) try_alignment = 1; // avoid 0 to ensure there will be no divide by zero when aligning
+
+  void* p = NULL;
+  /*
+  if (commit && allow_large) {
+    p = _mi_os_try_alloc_from_huge_reserved(size, try_alignment);
+    if (p != NULL) {
+      *is_large = true;
+      return p;
+    }
+  }
+  */
+
+  #if defined(_WIN32)
+    int flags = MEM_RESERVE;
+    if (commit) { flags |= MEM_COMMIT; }
+    p = mi_win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large);
+  #elif defined(MI_USE_SBRK) || defined(__wasi__)
+    MI_UNUSED(allow_large);
+    *is_large = false;
+    p = mi_heap_grow(size, try_alignment);
+  #else
+    int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
+    p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large);
+  #endif
+  mi_stat_counter_increase(stats->mmap_calls, 1);
+  if (p != NULL) {
+    _mi_stat_increase(&stats->reserved, size);
+    if (commit) { _mi_stat_increase(&stats->committed, size); }
+  }
+  return p;
+}
+
+
+// Primitive aligned allocation from the OS.
+// This function guarantees the allocated memory is aligned.
+static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) {
+  mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0));
+  mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
+  mi_assert_internal(is_large != NULL);
+  if (!commit) allow_large = false;
+  if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL;
+  size = _mi_align_up(size, _mi_os_page_size());
+
+  // try first with a hint (this will be aligned directly on Win 10+ or BSD)
+  void* p = mi_os_mem_alloc(size, alignment, commit, allow_large, is_large, stats);
+  if (p == NULL) return NULL;
+
+  // if not aligned, free it, overallocate, and unmap around it
+  if (((uintptr_t)p % alignment != 0)) {
+    mi_os_mem_free(p, size, commit, stats);
+    _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (%zu bytes, address: %p, alignment: %zu, commit: %d)\n", size, p, alignment, commit);
+    if (size >= (SIZE_MAX - alignment)) return NULL; // overflow
+    const size_t over_size = size + alignment;
+
+#if _WIN32
+    // over-allocate uncommitted (virtual) memory
+    p = mi_os_mem_alloc(over_size, 0 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, stats);
+    if (p == NULL) return NULL;
+
+    // set p to the aligned part in the full region
+    // note: this is dangerous on Windows as VirtualFree needs the actual region pointer
+    // but in mi_os_mem_free we handle this (hopefully exceptional) situation.
+    p = mi_align_up_ptr(p, alignment);
+
+    // explicitly commit only the aligned part
+    if (commit) {
+      _mi_os_commit(p, size, NULL, stats);
+    }
+#else
+    // overallocate...
+    p = mi_os_mem_alloc(over_size, 1, commit, false, is_large, stats);
+    if (p == NULL) return NULL;
+    // and selectively unmap parts around the over-allocated area. (noop on sbrk)
+    void* aligned_p = mi_align_up_ptr(p, alignment);
+    size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
+    size_t mid_size = _mi_align_up(size, _mi_os_page_size());
+    size_t post_size = over_size - pre_size - mid_size;
+    mi_assert_internal(pre_size < over_size && post_size < over_size && mid_size >= size);
+    if (pre_size > 0)  mi_os_mem_free(p, pre_size, commit, stats);
+    if (post_size > 0) mi_os_mem_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats);
+    // we can return the aligned pointer on `mmap` (and sbrk) systems
+    p = aligned_p;
+#endif
+  }
+
+  mi_assert_internal(p == NULL || (p != NULL && ((uintptr_t)p % alignment) == 0));
+  return p;
+}
+
+
+/* -----------------------------------------------------------
+  OS API: alloc, free, alloc_aligned
+----------------------------------------------------------- */
+
+void* _mi_os_alloc(size_t size, mi_stats_t* tld_stats) {
+  MI_UNUSED(tld_stats);
+  mi_stats_t* stats = &_mi_stats_main;
+  if (size == 0) return NULL;
+  size = _mi_os_good_alloc_size(size);
+  bool is_large = false;
+  return mi_os_mem_alloc(size, 0, true, false, &is_large, stats);
+}
+
+void  _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* tld_stats) {
+  MI_UNUSED(tld_stats);
+  mi_stats_t* stats = &_mi_stats_main;
+  if (size == 0 || p == NULL) return;
+  size = _mi_os_good_alloc_size(size);
+  mi_os_mem_free(p, size, was_committed, stats);
+}
+
+void  _mi_os_free(void* p, size_t size, mi_stats_t* stats) {
+  _mi_os_free_ex(p, size, true, stats);
+}
+
+void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* tld_stats)
+{
+  MI_UNUSED(&mi_os_get_aligned_hint); // suppress unused warnings
+  MI_UNUSED(tld_stats);
+  if (size == 0) return NULL;
+  size = _mi_os_good_alloc_size(size);
+  alignment = _mi_align_up(alignment, _mi_os_page_size());
+  bool allow_large = false;
+  if (large != NULL) {
+    allow_large = *large;
+    *large = false;
+  }
+  return mi_os_mem_alloc_aligned(size, alignment, commit, allow_large, (large!=NULL?large:&allow_large), &_mi_stats_main /*tld->stats*/ );
+}
+
+/* -----------------------------------------------------------
+  OS aligned allocation with an offset. This is used
+  for large alignments > MI_ALIGNMENT_MAX. We use a large mimalloc
+  page where the object can be aligned at an offset from the start of the segment.
+  As we may need to overallocate, we need to free such pointers using `mi_free_aligned`
+  to use the actual start of the memory region.
+----------------------------------------------------------- */
+
+void* _mi_os_alloc_aligned_offset(size_t size, size_t alignment, size_t offset, bool commit, bool* large, mi_stats_t* tld_stats) {
+  mi_assert(offset <= MI_SEGMENT_SIZE);
+  mi_assert(offset <= size);
+  mi_assert((alignment % _mi_os_page_size()) == 0);
+  if (offset > MI_SEGMENT_SIZE) return NULL;
+  if (offset == 0) {
+    // regular aligned allocation
+    return _mi_os_alloc_aligned(size, alignment, commit, large, tld_stats);
+  }
+  else {
+    // overallocate to align at an offset
+    const size_t extra = _mi_align_up(offset, alignment) - offset;
+    const size_t oversize = size + extra;
+    void* start = _mi_os_alloc_aligned(oversize, alignment, commit, large, tld_stats);
+    if (start == NULL) return NULL;
+    void* p = (uint8_t*)start + extra;
+    mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment));
+    // decommit the overallocation at the start
+    if (commit && extra > _mi_os_page_size()) {
+      _mi_os_decommit(start, extra, tld_stats);
+    }
+    return p;
+  }
+}
+
+void _mi_os_free_aligned(void* p, size_t size, size_t alignment, size_t align_offset, bool was_committed, mi_stats_t* tld_stats) {
+  mi_assert(align_offset <= MI_SEGMENT_SIZE);
+  const size_t extra = _mi_align_up(align_offset, alignment) - align_offset;
+  void* start = (uint8_t*)p - extra;
+  _mi_os_free_ex(start, size + extra, was_committed, tld_stats);
+}
+
+/* -----------------------------------------------------------
+  OS memory API: reset, commit, decommit, protect, unprotect.
+----------------------------------------------------------- */
+
+
+// OS page align within a given area, either conservative (pages inside the area only),
+// or not (straddling pages outside the area is possible)
+static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, size_t* newsize) {
+  mi_assert(addr != NULL && size > 0);
+  if (newsize != NULL) *newsize = 0;
+  if (size == 0 || addr == NULL) return NULL;
+
+  // page align conservatively within the range
+  void* start = (conservative ? mi_align_up_ptr(addr, _mi_os_page_size())
+    : mi_align_down_ptr(addr, _mi_os_page_size()));
+  void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size())
+    : mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size()));
+  ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start;
+  if (diff <= 0) return NULL;
+
+  mi_assert_internal((conservative && (size_t)diff <= size) || (!conservative && (size_t)diff >= size));
+  if (newsize != NULL) *newsize = (size_t)diff;
+  return start;
+}
+
+static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* newsize) {
+  return mi_os_page_align_areax(true, addr, size, newsize);
+}
+
+static void mi_mprotect_hint(int err) {
+#if defined(MI_OS_USE_MMAP) && (MI_SECURE>=2) // guard page around every mimalloc page
+  if (err == ENOMEM) {
+    _mi_warning_message("the previous warning may have been caused by a low memory map limit.\n"
+                        "  On Linux this is controlled by the vm.max_map_count. For example:\n"
+                        "  > sudo sysctl -w vm.max_map_count=262144\n");
+  }
+#else
+  MI_UNUSED(err);
+#endif
+}
+
+// Commit/Decommit memory.
+// Usually commit is aligned liberal, while decommit is aligned conservative.
+// (but not for the reset version where we want commit to be conservative as well)
+static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservative, bool* is_zero, mi_stats_t* stats) {
+  // page align in the range, commit liberally, decommit conservative
+  if (is_zero != NULL) { *is_zero = false; }
+  size_t csize;
+  void* start = mi_os_page_align_areax(conservative, addr, size, &csize);
+  if (csize == 0) return true;  // || _mi_os_is_huge_reserved(addr))
+  int err = 0;
+  if (commit) {
+    _mi_stat_increase(&stats->committed, size);  // use size for precise commit vs. decommit
+    _mi_stat_counter_increase(&stats->commit_calls, 1);
+  }
+  else {
+    _mi_stat_decrease(&stats->committed, size);
+  }
+
+  #if defined(_WIN32)
+  if (commit) {
+    // *is_zero = true;  // note: if the memory was already committed, the call succeeds but the memory is not zero'd
+    void* p = VirtualAlloc(start, csize, MEM_COMMIT, PAGE_READWRITE);
+    err = (p == start ? 0 : GetLastError());
+  }
+  else {
+    BOOL ok = VirtualFree(start, csize, MEM_DECOMMIT);
+    err = (ok ? 0 : GetLastError());
+  }
+  #elif defined(__wasi__)
+  // WebAssembly guests can't control memory protection
+  #elif 0 && defined(MAP_FIXED) && !defined(__APPLE__)
+  // Linux: disabled for now as mmap fixed seems much more expensive than MADV_DONTNEED (and splits VMA's?)
+  if (commit) {
+    // commit: just change the protection
+    err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
+    if (err != 0) { err = errno; }
+  }
+  else {
+    // decommit: use mmap with MAP_FIXED to discard the existing memory (and reduce rss)
+    const int fd = mi_unix_mmap_fd();
+    void* p = mmap(start, csize, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0);
+    if (p != start) { err = errno; }
+  }
+  #else
+  // Linux, macOSX and others.
+  if (commit) {
+    // commit: ensure we can access the area
+    err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
+    if (err != 0) { err = errno; }
+  }
+  else {
+    #if defined(MADV_DONTNEED) && MI_DEBUG == 0 && MI_SECURE == 0
+    // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
+    // (on the other hand, MADV_FREE would be good enough.. it is just not reflected in the stats :-( )
+    err = madvise(start, csize, MADV_DONTNEED);
+    #else
+    // decommit: just disable access (also used in debug and secure mode to trap on illegal access)
+    err = mprotect(start, csize, PROT_NONE);
+    if (err != 0) { err = errno; }
+    #endif
+    //#if defined(MADV_FREE_REUSE)
+    //  while ((err = mi_madvise(start, csize, MADV_FREE_REUSE)) != 0 && errno == EAGAIN) { errno = 0; }
+    //#endif
+  }
+  #endif
+  if (err != 0) {
+    _mi_warning_message("%s error: start: %p, csize: 0x%zx, err: %i\n", commit ? "commit" : "decommit", start, csize, err);
+    mi_mprotect_hint(err);
+  }
+  mi_assert_internal(err == 0);
+  return (err == 0);
+}
+
+bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
+  MI_UNUSED(tld_stats);
+  mi_stats_t* stats = &_mi_stats_main;
+  return mi_os_commitx(addr, size, true, false /* liberal */, is_zero, stats);
+}
+
+bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) {
+  MI_UNUSED(tld_stats);
+  mi_stats_t* stats = &_mi_stats_main;
+  bool is_zero;
+  return mi_os_commitx(addr, size, false, true /* conservative */, &is_zero, stats);
+}
+
+/*
+static bool mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) {  
+  return mi_os_commitx(addr, size, true, true // conservative
+                      , is_zero, stats);
+}
+*/
+
+// Signal to the OS that the address range is no longer in use
+// but may be used later again. This will release physical memory
+// pages and reduce swapping while keeping the memory committed.
+// We page align to a conservative area inside the range to reset.
+static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) {
+  // page align conservatively within the range
+  size_t csize;
+  void* start = mi_os_page_align_area_conservative(addr, size, &csize);
+  if (csize == 0) return true;  // || _mi_os_is_huge_reserved(addr)
+  if (reset) _mi_stat_increase(&stats->reset, csize);
+        else _mi_stat_decrease(&stats->reset, csize);
+  if (!reset) return true; // nothing to do on unreset!
+
+  #if (MI_DEBUG>1) && !MI_TRACK_ENABLED
+  if (MI_SECURE==0) {
+    memset(start, 0, csize); // pretend it is eagerly reset
+  }
+  #endif
+
+#if defined(_WIN32)
+  // Testing shows that for us (on `malloc-large`) MEM_RESET is 2x faster than DiscardVirtualMemory
+  void* p = VirtualAlloc(start, csize, MEM_RESET, PAGE_READWRITE);
+  mi_assert_internal(p == start);
+  #if 1
+  if (p == start && start != NULL) {
+    VirtualUnlock(start,csize); // VirtualUnlock after MEM_RESET removes the memory from the working set
+  }
+  #endif
+  if (p != start) return false;
+#else
+#if defined(MADV_FREE)
+  static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE);
+  int oadvice = (int)mi_atomic_load_relaxed(&advice);
+  int err;
+  while ((err = mi_madvise(start, csize, oadvice)) != 0 && errno == EAGAIN) { errno = 0;  };
+  if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) {
+    // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on
+    mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED);
+    err = mi_madvise(start, csize, MADV_DONTNEED);
+  }
+#elif defined(__wasi__)
+  int err = 0;
+#else
+  int err = mi_madvise(start, csize, MADV_DONTNEED);
+#endif
+  if (err != 0) {
+    _mi_warning_message("madvise reset error: start: %p, csize: 0x%zx, errno: %i\n", start, csize, errno);
+  }
+  //mi_assert(err == 0);
+  if (err != 0) return false;
+#endif
+  return true;
+}
+
+// Signal to the OS that the address range is no longer in use
+// but may be used later again. This will release physical memory
+// pages and reduce swapping while keeping the memory committed.
+// We page align to a conservative area inside the range to reset.
+bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats) {
+  MI_UNUSED(tld_stats);
+  mi_stats_t* stats = &_mi_stats_main;
+  return mi_os_resetx(addr, size, true, stats);
+}
+
+/*
+bool _mi_os_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
+  MI_UNUSED(tld_stats);
+  mi_stats_t* stats = &_mi_stats_main;
+  *is_zero = false;
+  return mi_os_resetx(addr, size, false, stats);
+}
+*/
+
+// Protect a region in memory to be not accessible.
+static  bool mi_os_protectx(void* addr, size_t size, bool protect) {
+  // page align conservatively within the range
+  size_t csize = 0;
+  void* start = mi_os_page_align_area_conservative(addr, size, &csize);
+  if (csize == 0) return false;
+  /*
+  if (_mi_os_is_huge_reserved(addr)) {
+	  _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n");
+  }
+  */
+  int err = 0;
+#ifdef _WIN32
+  DWORD oldprotect = 0;
+  BOOL ok = VirtualProtect(start, csize, protect ? PAGE_NOACCESS : PAGE_READWRITE, &oldprotect);
+  err = (ok ? 0 : GetLastError());
+#elif defined(__wasi__)
+  err = 0;
+#else
+  err = mprotect(start, csize, protect ? PROT_NONE : (PROT_READ | PROT_WRITE));
+  if (err != 0) { err = errno; }
+#endif
+  if (err != 0) {
+    _mi_warning_message("mprotect error: start: %p, csize: 0x%zx, err: %i\n", start, csize, err);
+    mi_mprotect_hint(err);
+  }
+  return (err == 0);
+}
+
+bool _mi_os_protect(void* addr, size_t size) {
+  return mi_os_protectx(addr, size, true);
+}
+
+bool _mi_os_unprotect(void* addr, size_t size) {
+  return mi_os_protectx(addr, size, false);
+}
+
+
+
+bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) {
+  // page align conservatively within the range
+  mi_assert_internal(oldsize > newsize && p != NULL);
+  if (oldsize < newsize || p == NULL) return false;
+  if (oldsize == newsize) return true;
+
+  // oldsize and newsize should be page aligned or we cannot shrink precisely
+  void* addr = (uint8_t*)p + newsize;
+  size_t size = 0;
+  void* start = mi_os_page_align_area_conservative(addr, oldsize - newsize, &size);
+  if (size == 0 || start != addr) return false;
+
+#ifdef _WIN32
+  // we cannot shrink on windows, but we can decommit
+  return _mi_os_decommit(start, size, stats);
+#else
+  return mi_os_mem_free(start, size, true, stats);
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------
+Support for allocating huge OS pages (1Gib) that are reserved up-front
+and possibly associated with a specific NUMA node. (use `numa_node>=0`)
+-----------------------------------------------------------------------------*/
+#define MI_HUGE_OS_PAGE_SIZE  (MI_GiB)
+
+#if defined(_WIN32) && (MI_INTPTR_SIZE >= 8)
+static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
+{
+  mi_assert_internal(size%MI_GiB == 0);
+  mi_assert_internal(addr != NULL);
+  const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE;
+
+  mi_win_enable_large_os_pages();
+
+  MI_MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} };
+  // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
+  static bool mi_huge_pages_available = true;
+  if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) {
+    params[0].Type.Type = MiMemExtendedParameterAttributeFlags;
+    params[0].Arg.ULong64 = MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE;
+    ULONG param_count = 1;
+    if (numa_node >= 0) {
+      param_count++;
+      params[1].Type.Type = MiMemExtendedParameterNumaNode;
+      params[1].Arg.ULong = (unsigned)numa_node;
+    }
+    SIZE_T psize = size;
+    void* base = addr;
+    NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count);
+    if (err == 0 && base != NULL) {
+      return base;
+    }
+    else {
+      // fall back to regular large pages
+      mi_huge_pages_available = false; // don't try further huge pages
+      _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err);
+    }
+  }
+  // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation
+  if (pVirtualAlloc2 != NULL && numa_node >= 0) {
+    params[0].Type.Type = MiMemExtendedParameterNumaNode;
+    params[0].Arg.ULong = (unsigned)numa_node;
+    return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1);
+  }
+
+  // otherwise use regular virtual alloc on older windows
+  return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
+}
+
+#elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__)
+#include <sys/syscall.h>
+#ifndef MPOL_PREFERRED
+#define MPOL_PREFERRED 1
+#endif
+#if defined(SYS_mbind)
+static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
+  return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags);
+}
+#else
+static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
+  MI_UNUSED(start); MI_UNUSED(len); MI_UNUSED(mode); MI_UNUSED(nmask); MI_UNUSED(maxnode); MI_UNUSED(flags);
+  return 0;
+}
+#endif
+static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
+  mi_assert_internal(size%MI_GiB == 0);
+  bool is_large = true;
+  void* p = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
+  if (p == NULL) return NULL;
+  if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
+    unsigned long numa_mask = (1UL << numa_node);
+    // TODO: does `mbind` work correctly for huge OS pages? should we
+    // use `set_mempolicy` before calling mmap instead?
+    // see: <https://lkml.org/lkml/2017/2/9/875>
+    long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
+    if (err != 0) {
+      _mi_warning_message("failed to bind huge (1GiB) pages to numa node %d: %s\n", numa_node, strerror(errno));
+    }
+  }
+  return p;
+}
+#else
+static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
+  MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(numa_node);
+  return NULL;
+}
+#endif
+
+#if (MI_INTPTR_SIZE >= 8)
+// To ensure proper alignment, use our own area for huge OS pages
+static mi_decl_cache_align _Atomic(uintptr_t)  mi_huge_start; // = 0
+
+// Claim an aligned address range for huge pages
+static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
+  if (total_size != NULL) *total_size = 0;
+  const size_t size = pages * MI_HUGE_OS_PAGE_SIZE;
+
+  uintptr_t start = 0;
+  uintptr_t end = 0;
+  uintptr_t huge_start = mi_atomic_load_relaxed(&mi_huge_start);
+  do {
+    start = huge_start;
+    if (start == 0) {
+      // Initialize the start address after the 32TiB area
+      start = ((uintptr_t)32 << 40);  // 32TiB virtual start address
+#if (MI_SECURE>0 || MI_DEBUG==0)      // security: randomize start of huge pages unless in debug mode
+      uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
+      start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF));  // (randomly 12bits)*1GiB == between 0 to 4TiB
+#endif
+    }
+    end = start + size;
+    mi_assert_internal(end % MI_SEGMENT_SIZE == 0);
+  } while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end));
+
+  if (total_size != NULL) *total_size = size;
+  return (uint8_t*)start;
+}
+#else
+static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
+  MI_UNUSED(pages);
+  if (total_size != NULL) *total_size = 0;
+  return NULL;
+}
+#endif
+
+// Allocate MI_SEGMENT_SIZE aligned huge pages
+void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize) {
+  if (psize != NULL) *psize = 0;
+  if (pages_reserved != NULL) *pages_reserved = 0;
+  size_t size = 0;
+  uint8_t* start = mi_os_claim_huge_pages(pages, &size);
+  if (start == NULL) return NULL; // or 32-bit systems
+
+  // Allocate one page at the time but try to place them contiguously
+  // We allocate one page at the time to be able to abort if it takes too long
+  // or to at least allocate as many as available on the system.
+  mi_msecs_t start_t = _mi_clock_start();
+  size_t page;
+  for (page = 0; page < pages; page++) {
+    // allocate a page
+    void* addr = start + (page * MI_HUGE_OS_PAGE_SIZE);
+    void* p = mi_os_alloc_huge_os_pagesx(addr, MI_HUGE_OS_PAGE_SIZE, numa_node);
+
+    // Did we succeed at a contiguous address?
+    if (p != addr) {
+      // no success, issue a warning and break
+      if (p != NULL) {
+        _mi_warning_message("could not allocate contiguous huge page %zu at %p\n", page, addr);
+        _mi_os_free(p, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main);
+      }
+      break;
+    }
+
+    // success, record it
+    _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE);
+    _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE);
+
+    // check for timeout
+    if (max_msecs > 0) {
+      mi_msecs_t elapsed = _mi_clock_end(start_t);
+      if (page >= 1) {
+        mi_msecs_t estimate = ((elapsed / (page+1)) * pages);
+        if (estimate > 2*max_msecs) { // seems like we are going to timeout, break
+          elapsed = max_msecs + 1;
+        }
+      }
+      if (elapsed > max_msecs) {
+        _mi_warning_message("huge page allocation timed out\n");
+        break;
+      }
+    }
+  }
+  mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size);
+  if (pages_reserved != NULL) { *pages_reserved = page; }
+  if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; }
+  return (page == 0 ? NULL : start);
+}
+
+// free every huge page in a range individually (as we allocated per page)
+// note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems.
+void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) {
+  if (p==NULL || size==0) return;
+  uint8_t* base = (uint8_t*)p;
+  while (size >= MI_HUGE_OS_PAGE_SIZE) {
+    _mi_os_free(base, MI_HUGE_OS_PAGE_SIZE, stats);
+    size -= MI_HUGE_OS_PAGE_SIZE;
+    base += MI_HUGE_OS_PAGE_SIZE;
+  }
+}
+
+/* ----------------------------------------------------------------------------
+Support NUMA aware allocation
+-----------------------------------------------------------------------------*/
+#ifdef _WIN32
+static size_t mi_os_numa_nodex(void) {
+  USHORT numa_node = 0;
+  if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) {
+    // Extended API is supported
+    MI_PROCESSOR_NUMBER pnum;
+    (*pGetCurrentProcessorNumberEx)(&pnum);
+    USHORT nnode = 0;
+    BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode);
+    if (ok) { numa_node = nnode; }
+  }
+  else if (pGetNumaProcessorNode != NULL) {
+    // Vista or earlier, use older API that is limited to 64 processors. Issue #277
+    DWORD pnum = GetCurrentProcessorNumber();
+    UCHAR nnode = 0;
+    BOOL ok = pGetNumaProcessorNode((UCHAR)pnum, &nnode);
+    if (ok) { numa_node = nnode; }
+  }
+  return numa_node;
+}
+
+static size_t mi_os_numa_node_countx(void) {
+  ULONG numa_max = 0;
+  GetNumaHighestNodeNumber(&numa_max);
+  // find the highest node number that has actual processors assigned to it. Issue #282
+  while(numa_max > 0) {
+    if (pGetNumaNodeProcessorMaskEx != NULL) {
+      // Extended API is supported
+      GROUP_AFFINITY affinity;
+      if ((*pGetNumaNodeProcessorMaskEx)((USHORT)numa_max, &affinity)) {
+        if (affinity.Mask != 0) break;  // found the maximum non-empty node
+      }
+    }
+    else {
+      // Vista or earlier, use older API that is limited to 64 processors.
+      ULONGLONG mask;
+      if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) {
+        if (mask != 0) break; // found the maximum non-empty node
+      };
+    }
+    // max node was invalid or had no processor assigned, try again
+    numa_max--;
+  }
+  return ((size_t)numa_max + 1);
+}
+#elif defined(__linux__)
+#include <sys/syscall.h>  // getcpu
+#include <stdio.h>        // access
+
+static size_t mi_os_numa_nodex(void) {
+#ifdef SYS_getcpu
+  unsigned long node = 0;
+  unsigned long ncpu = 0;
+  long err = syscall(SYS_getcpu, &ncpu, &node, NULL);
+  if (err != 0) return 0;
+  return node;
+#else
+  return 0;
+#endif
+}
+static size_t mi_os_numa_node_countx(void) {
+  char buf[128];
+  unsigned node = 0;
+  for(node = 0; node < 256; node++) {
+    // enumerate node entries -- todo: it there a more efficient way to do this? (but ensure there is no allocation)
+    snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1);
+    if (access(buf,R_OK) != 0) break;
+  }
+  return (node+1);
+}
+#elif defined(__FreeBSD__) && __FreeBSD_version >= 1200000
+static size_t mi_os_numa_nodex(void) {
+  domainset_t dom;
+  size_t node;
+  int policy;
+  if (cpuset_getdomain(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(dom), &dom, &policy) == -1) return 0ul;
+  for (node = 0; node < MAXMEMDOM; node++) {
+    if (DOMAINSET_ISSET(node, &dom)) return node;
+  }
+  return 0ul;
+}
+static size_t mi_os_numa_node_countx(void) {
+  size_t ndomains = 0;
+  size_t len = sizeof(ndomains);
+  if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) return 0ul;
+  return ndomains;
+}
+#elif defined(__DragonFly__)
+static size_t mi_os_numa_nodex(void) {
+  // TODO: DragonFly does not seem to provide any userland means to get this information.
+  return 0ul;
+}
+static size_t mi_os_numa_node_countx(void) {
+  size_t ncpus = 0, nvirtcoresperphys = 0;
+  size_t len = sizeof(size_t);
+  if (sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0) == -1) return 0ul;
+  if (sysctlbyname("hw.cpu_topology_ht_ids", &nvirtcoresperphys, &len, NULL, 0) == -1) return 0ul;
+  return nvirtcoresperphys * ncpus;
+}
+#else
+static size_t mi_os_numa_nodex(void) {
+  return 0;
+}
+static size_t mi_os_numa_node_countx(void) {
+  return 1;
+}
+#endif
+
+_Atomic(size_t)  _mi_numa_node_count; // = 0   // cache the node count
+
+size_t _mi_os_numa_node_count_get(void) {
+  size_t count = mi_atomic_load_acquire(&_mi_numa_node_count);
+  if (count <= 0) {
+    long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly?
+    if (ncount > 0) {
+      count = (size_t)ncount;
+    }
+    else {
+      count = mi_os_numa_node_countx(); // or detect dynamically
+      if (count == 0) count = 1;
+    }
+    mi_atomic_store_release(&_mi_numa_node_count, count); // save it
+    _mi_verbose_message("using %zd numa regions\n", count);
+  }
+  return count;
+}
+
+int _mi_os_numa_node_get(mi_os_tld_t* tld) {
+  MI_UNUSED(tld);
+  size_t numa_count = _mi_os_numa_node_count();
+  if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0
+  // never more than the node count and >= 0
+  size_t numa_node = mi_os_numa_nodex();
+  if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
+  return (int)numa_node;
+}
diff --git a/Objects/mimalloc/page-queue.c b/Objects/mimalloc/page-queue.c
new file mode 100644
index 00000000000000..cb54b3740196e9
--- /dev/null
+++ b/Objects/mimalloc/page-queue.c
@@ -0,0 +1,332 @@
+/*----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* -----------------------------------------------------------
+  Definition of page queues for each block size
+----------------------------------------------------------- */
+
+#ifndef MI_IN_PAGE_C
+#error "this file should be included from 'page.c'"
+#endif
+
+/* -----------------------------------------------------------
+  Minimal alignment in machine words (i.e. `sizeof(void*)`)
+----------------------------------------------------------- */
+
+#if (MI_MAX_ALIGN_SIZE > 4*MI_INTPTR_SIZE)
+  #error "define alignment for more than 4x word size for this platform"
+#elif (MI_MAX_ALIGN_SIZE > 2*MI_INTPTR_SIZE)
+  #define MI_ALIGN4W   // 4 machine words minimal alignment
+#elif (MI_MAX_ALIGN_SIZE > MI_INTPTR_SIZE)
+  #define MI_ALIGN2W   // 2 machine words minimal alignment
+#else
+  // ok, default alignment is 1 word
+#endif
+
+
+/* -----------------------------------------------------------
+  Queue query
+----------------------------------------------------------- */
+
+
+static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) {
+  return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+sizeof(uintptr_t)));
+}
+
+static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) {
+  return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+(2*sizeof(uintptr_t))));
+}
+
+static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) {
+  return (pq->block_size > MI_MEDIUM_OBJ_SIZE_MAX);
+}
+
+/* -----------------------------------------------------------
+  Bins
+----------------------------------------------------------- */
+
+// Return the bin for a given field size.
+// Returns MI_BIN_HUGE if the size is too large.
+// We use `wsize` for the size in "machine word sizes",
+// i.e. byte size == `wsize*sizeof(void*)`.
+static inline uint8_t mi_bin(size_t size) {
+  size_t wsize = _mi_wsize_from_size(size);
+  uint8_t bin;
+  if (wsize <= 1) {
+    bin = 1;
+  }
+  #if defined(MI_ALIGN4W)
+  else if (wsize <= 4) {
+    bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
+  }
+  #elif defined(MI_ALIGN2W)
+  else if (wsize <= 8) {
+    bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
+  }
+  #else
+  else if (wsize <= 8) {
+    bin = (uint8_t)wsize;
+  }
+  #endif
+  else if (wsize > MI_MEDIUM_OBJ_WSIZE_MAX) {
+    bin = MI_BIN_HUGE;
+  }
+  else {
+    #if defined(MI_ALIGN4W)
+    if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes
+    #endif
+    wsize--;
+    // find the highest bit
+    uint8_t b = (uint8_t)mi_bsr(wsize);  // note: wsize != 0
+    // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
+    // - adjust with 3 because we use do not round the first 8 sizes
+    //   which each get an exact bin
+    bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3;
+    mi_assert_internal(bin < MI_BIN_HUGE);
+  }
+  mi_assert_internal(bin > 0 && bin <= MI_BIN_HUGE);
+  return bin;
+}
+
+
+
+/* -----------------------------------------------------------
+  Queue of pages with free blocks
+----------------------------------------------------------- */
+
+uint8_t _mi_bin(size_t size) {
+  return mi_bin(size);
+}
+
+size_t _mi_bin_size(uint8_t bin) {
+  return _mi_heap_empty.pages[bin].block_size;
+}
+
+// Good size for allocation
+size_t mi_good_size(size_t size) mi_attr_noexcept {
+  if (size <= MI_MEDIUM_OBJ_SIZE_MAX) {
+    return _mi_bin_size(mi_bin(size));
+  }
+  else {
+    return _mi_align_up(size,_mi_os_page_size());
+  }
+}
+
+#if (MI_DEBUG>1)
+static bool mi_page_queue_contains(mi_page_queue_t* queue, const mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  mi_page_t* list = queue->first;
+  while (list != NULL) {
+    mi_assert_internal(list->next == NULL || list->next->prev == list);
+    mi_assert_internal(list->prev == NULL || list->prev->next == list);
+    if (list == page) break;
+    list = list->next;
+  }
+  return (list == page);
+}
+
+#endif
+
+#if (MI_DEBUG>1)
+static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* pq) {
+  return (pq >= &heap->pages[0] && pq <= &heap->pages[MI_BIN_FULL]);
+}
+#endif
+
+static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) {
+  uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size));
+  mi_heap_t* heap = mi_page_heap(page);
+  mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL);
+  mi_page_queue_t* pq = &heap->pages[bin];
+  mi_assert_internal(bin >= MI_BIN_HUGE || page->xblock_size == pq->block_size);
+  mi_assert_expensive(mi_page_queue_contains(pq, page));
+  return pq;
+}
+
+static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) {
+  uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size));
+  mi_assert_internal(bin <= MI_BIN_FULL);
+  mi_page_queue_t* pq = &heap->pages[bin];
+  mi_assert_internal(mi_page_is_in_full(page) || page->xblock_size == pq->block_size);
+  return pq;
+}
+
+// The current small page array is for efficiency and for each
+// small size (up to 256) it points directly to the page for that
+// size without having to compute the bin. This means when the
+// current free page queue is updated for a small bin, we need to update a
+// range of entries in `_mi_page_small_free`.
+static inline void mi_heap_queue_first_update(mi_heap_t* heap, const mi_page_queue_t* pq) {
+  mi_assert_internal(mi_heap_contains_queue(heap,pq));
+  size_t size = pq->block_size;
+  if (size > MI_SMALL_SIZE_MAX) return;
+
+  mi_page_t* page = pq->first;
+  if (pq->first == NULL) page = (mi_page_t*)&_mi_page_empty;
+
+  // find index in the right direct page array
+  size_t start;
+  size_t idx = _mi_wsize_from_size(size);
+  mi_page_t** pages_free = heap->pages_free_direct;
+
+  if (pages_free[idx] == page) return;  // already set
+
+  // find start slot
+  if (idx<=1) {
+    start = 0;
+  }
+  else {
+    // find previous size; due to minimal alignment upto 3 previous bins may need to be skipped
+    uint8_t bin = mi_bin(size);
+    const mi_page_queue_t* prev = pq - 1;
+    while( bin == mi_bin(prev->block_size) && prev > &heap->pages[0]) {
+      prev--;
+    }
+    start = 1 + _mi_wsize_from_size(prev->block_size);
+    if (start > idx) start = idx;
+  }
+
+  // set size range to the right page
+  mi_assert(start <= idx);
+  for (size_t sz = start; sz <= idx; sz++) {
+    pages_free[sz] = page;
+  }
+}
+
+/*
+static bool mi_page_queue_is_empty(mi_page_queue_t* queue) {
+  return (queue->first == NULL);
+}
+*/
+
+static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  mi_assert_expensive(mi_page_queue_contains(queue, page));
+  mi_assert_internal(page->xblock_size == queue->block_size || (page->xblock_size > MI_MEDIUM_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue))  || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
+  mi_heap_t* heap = mi_page_heap(page);
+
+  if (page->prev != NULL) page->prev->next = page->next;
+  if (page->next != NULL) page->next->prev = page->prev;
+  if (page == queue->last)  queue->last = page->prev;
+  if (page == queue->first) {
+    queue->first = page->next;
+    // update first
+    mi_assert_internal(mi_heap_contains_queue(heap, queue));
+    mi_heap_queue_first_update(heap,queue);
+  }
+  heap->page_count--;
+  page->next = NULL;
+  page->prev = NULL;
+  // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL);
+  mi_page_set_in_full(page,false);
+}
+
+
+static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) {
+  mi_assert_internal(mi_page_heap(page) == heap);
+  mi_assert_internal(!mi_page_queue_contains(queue, page));
+  #if MI_HUGE_PAGE_ABANDON
+  mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
+  #endif
+  mi_assert_internal(page->xblock_size == queue->block_size ||
+                      (page->xblock_size > MI_MEDIUM_OBJ_SIZE_MAX) ||
+                        (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
+
+  mi_page_set_in_full(page, mi_page_queue_is_full(queue));
+  // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap);
+  page->next = queue->first;
+  page->prev = NULL;
+  if (queue->first != NULL) {
+    mi_assert_internal(queue->first->prev == NULL);
+    queue->first->prev = page;
+    queue->first = page;
+  }
+  else {
+    queue->first = queue->last = page;
+  }
+
+  // update direct
+  mi_heap_queue_first_update(heap, queue);
+  heap->page_count++;
+}
+
+
+static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  mi_assert_expensive(mi_page_queue_contains(from, page));
+  mi_assert_expensive(!mi_page_queue_contains(to, page));
+
+  mi_assert_internal((page->xblock_size == to->block_size && page->xblock_size == from->block_size) ||
+                     (page->xblock_size == to->block_size && mi_page_queue_is_full(from)) ||
+                     (page->xblock_size == from->block_size && mi_page_queue_is_full(to)) ||
+                     (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) ||
+                     (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to)));
+
+  mi_heap_t* heap = mi_page_heap(page);
+  if (page->prev != NULL) page->prev->next = page->next;
+  if (page->next != NULL) page->next->prev = page->prev;
+  if (page == from->last)  from->last = page->prev;
+  if (page == from->first) {
+    from->first = page->next;
+    // update first
+    mi_assert_internal(mi_heap_contains_queue(heap, from));
+    mi_heap_queue_first_update(heap, from);
+  }
+
+  page->prev = to->last;
+  page->next = NULL;
+  if (to->last != NULL) {
+    mi_assert_internal(heap == mi_page_heap(to->last));
+    to->last->next = page;
+    to->last = page;
+  }
+  else {
+    to->first = page;
+    to->last = page;
+    mi_heap_queue_first_update(heap, to);
+  }
+
+  mi_page_set_in_full(page, mi_page_queue_is_full(to));
+}
+
+// Only called from `mi_heap_absorb`.
+size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append) {
+  mi_assert_internal(mi_heap_contains_queue(heap,pq));
+  mi_assert_internal(pq->block_size == append->block_size);
+
+  if (append->first==NULL) return 0;
+
+  // set append pages to new heap and count
+  size_t count = 0;
+  for (mi_page_t* page = append->first; page != NULL; page = page->next) {
+    // inline `mi_page_set_heap` to avoid wrong assertion during absorption;
+    // in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive.
+    mi_atomic_store_release(&page->xheap, (uintptr_t)heap);
+    // set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a
+    // side effect that it spins until any DELAYED_FREEING is finished. This ensures
+    // that after appending only the new heap will be used for delayed free operations.
+    _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false);
+    count++;
+  }
+
+  if (pq->last==NULL) {
+    // take over afresh
+    mi_assert_internal(pq->first==NULL);
+    pq->first = append->first;
+    pq->last = append->last;
+    mi_heap_queue_first_update(heap, pq);
+  }
+  else {
+    // append to end
+    mi_assert_internal(pq->last!=NULL);
+    mi_assert_internal(append->first!=NULL);
+    pq->last->next = append->first;
+    append->first->prev = pq->last;
+    pq->last = append->last;
+  }
+  return count;
+}
diff --git a/Objects/mimalloc/page.c b/Objects/mimalloc/page.c
new file mode 100644
index 00000000000000..4250ff358b4a0b
--- /dev/null
+++ b/Objects/mimalloc/page.c
@@ -0,0 +1,926 @@
+/*----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* -----------------------------------------------------------
+  The core of the allocator. Every segment contains
+  pages of a certain block size. The main function
+  exported is `mi_malloc_generic`.
+----------------------------------------------------------- */
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+/* -----------------------------------------------------------
+  Definition of page queues for each block size
+----------------------------------------------------------- */
+
+#define MI_IN_PAGE_C
+#include "page-queue.c"
+#undef MI_IN_PAGE_C
+
+
+/* -----------------------------------------------------------
+  Page helpers
+----------------------------------------------------------- */
+
+// Index a block in a page
+static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_start, size_t block_size, size_t i) {
+  MI_UNUSED(page);
+  mi_assert_internal(page != NULL);
+  mi_assert_internal(i <= page->reserved);
+  return (mi_block_t*)((uint8_t*)page_start + (i * block_size));
+}
+
+static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld);
+static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld);
+
+#if (MI_DEBUG>=3)
+static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) {
+  size_t count = 0;
+  while (head != NULL) {
+    mi_assert_internal(page == _mi_ptr_page(head));
+    count++;
+    head = mi_block_next(page, head);
+  }
+  return count;
+}
+
+/*
+// Start of the page available memory
+static inline uint8_t* mi_page_area(const mi_page_t* page) {
+  return _mi_page_start(_mi_page_segment(page), page, NULL);
+}
+*/
+
+static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) {
+  size_t psize;
+  uint8_t* page_area = _mi_page_start(_mi_page_segment(page), page, &psize);
+  mi_block_t* start = (mi_block_t*)page_area;
+  mi_block_t* end   = (mi_block_t*)(page_area + psize);
+  while(p != NULL) {
+    if (p < start || p >= end) return false;
+    p = mi_block_next(page, p);
+  }
+  return true;
+}
+
+static bool mi_page_is_valid_init(mi_page_t* page) {
+  mi_assert_internal(page->xblock_size > 0);
+  mi_assert_internal(page->used <= page->capacity);
+  mi_assert_internal(page->capacity <= page->reserved);
+
+  mi_segment_t* segment = _mi_page_segment(page);
+  uint8_t* start = _mi_page_start(segment,page,NULL);
+  mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL));
+  //const size_t bsize = mi_page_block_size(page);
+  //mi_assert_internal(start + page->capacity*page->block_size == page->top);
+
+  mi_assert_internal(mi_page_list_is_valid(page,page->free));
+  mi_assert_internal(mi_page_list_is_valid(page,page->local_free));
+
+  #if MI_DEBUG>3 // generally too expensive to check this
+  if (page->is_zero) {
+    const size_t ubsize = mi_page_usable_block_size(page);
+    for(mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) {
+      mi_assert_expensive(mi_mem_is_zero(block + 1, ubsize - sizeof(mi_block_t)));
+    }
+  }
+  #endif
+
+  mi_block_t* tfree = mi_page_thread_free(page);
+  mi_assert_internal(mi_page_list_is_valid(page, tfree));
+  //size_t tfree_count = mi_page_list_count(page, tfree);
+  //mi_assert_internal(tfree_count <= page->thread_freed + 1);
+
+  size_t free_count = mi_page_list_count(page, page->free) + mi_page_list_count(page, page->local_free);
+  mi_assert_internal(page->used + free_count == page->capacity);
+
+  return true;
+}
+
+bool _mi_page_is_valid(mi_page_t* page) {
+  mi_assert_internal(mi_page_is_valid_init(page));
+  #if MI_SECURE
+  mi_assert_internal(page->keys[0] != 0);
+  #endif
+  if (mi_page_heap(page)!=NULL) {
+    mi_segment_t* segment = _mi_page_segment(page);
+
+    mi_assert_internal(!_mi_process_is_initialized || segment->thread_id==0 || segment->thread_id == mi_page_heap(page)->thread_id);
+    #if MI_HUGE_PAGE_ABANDON
+    if (segment->kind != MI_SEGMENT_HUGE) 
+    #endif
+    {    
+      mi_page_queue_t* pq = mi_page_queue_of(page);
+      mi_assert_internal(mi_page_queue_contains(pq, page));
+      mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_in_full(page));
+      mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq));
+    }
+  }
+  return true;
+}
+#endif
+
+void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
+  while (!_mi_page_try_use_delayed_free(page, delay, override_never)) {
+    mi_atomic_yield();
+  }
+}
+
+bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
+  mi_thread_free_t tfreex;
+  mi_delayed_t     old_delay;
+  mi_thread_free_t tfree;
+  size_t yield_count = 0;
+  do {
+    tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS;
+    tfreex = mi_tf_set_delayed(tfree, delay);
+    old_delay = mi_tf_delayed(tfree);
+    if mi_unlikely(old_delay == MI_DELAYED_FREEING) {
+      if (yield_count >= 4) return false;  // give up after 4 tries
+      yield_count++;
+      mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done.
+      // tfree = mi_tf_set_delayed(tfree, MI_NO_DELAYED_FREE); // will cause CAS to busy fail
+    }
+    else if (delay == old_delay) {
+      break; // avoid atomic operation if already equal
+    }
+    else if (!override_never && old_delay == MI_NEVER_DELAYED_FREE) {
+      break; // leave never-delayed flag set
+    }
+  } while ((old_delay == MI_DELAYED_FREEING) ||
+           !mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
+
+  return true; // success
+}
+
+/* -----------------------------------------------------------
+  Page collect the `local_free` and `thread_free` lists
+----------------------------------------------------------- */
+
+// Collect the local `thread_free` list using an atomic exchange.
+// Note: The exchange must be done atomically as this is used right after
+// moving to the full list in `mi_page_collect_ex` and we need to
+// ensure that there was no race where the page became unfull just before the move.
+static void _mi_page_thread_free_collect(mi_page_t* page)
+{
+  mi_block_t* head;
+  mi_thread_free_t tfreex;
+  mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
+  do {
+    head = mi_tf_block(tfree);
+    tfreex = mi_tf_set_block(tfree,NULL);
+  } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex));
+
+  // return if the list is empty
+  if (head == NULL) return;
+
+  // find the tail -- also to get a proper count (without data races)
+  uint32_t max_count = page->capacity; // cannot collect more than capacity
+  uint32_t count = 1;
+  mi_block_t* tail = head;
+  mi_block_t* next;
+  while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) {
+    count++;
+    tail = next;
+  }
+  // if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free)
+  if (count > max_count) {
+    _mi_error_message(EFAULT, "corrupted thread-free list\n");
+    return; // the thread-free items cannot be freed
+  }
+
+  // and append the current local free list
+  mi_block_set_next(page,tail, page->local_free);
+  page->local_free = head;
+
+  // update counts now
+  page->used -= count;
+}
+
+void _mi_page_free_collect(mi_page_t* page, bool force) {
+  mi_assert_internal(page!=NULL);
+
+  // collect the thread free list
+  if (force || mi_page_thread_free(page) != NULL) {  // quick test to avoid an atomic operation
+    _mi_page_thread_free_collect(page);
+  }
+
+  // and the local free list
+  if (page->local_free != NULL) {
+    if mi_likely(page->free == NULL) {
+      // usual case
+      page->free = page->local_free;
+      page->local_free = NULL;
+      page->is_zero = false;
+    }
+    else if (force) {
+      // append -- only on shutdown (force) as this is a linear operation
+      mi_block_t* tail = page->local_free;
+      mi_block_t* next;
+      while ((next = mi_block_next(page, tail)) != NULL) {
+        tail = next;
+      }
+      mi_block_set_next(page, tail, page->free);
+      page->free = page->local_free;
+      page->local_free = NULL;
+      page->is_zero = false;
+    }
+  }
+
+  mi_assert_internal(!force || page->local_free == NULL);
+}
+
+
+
+/* -----------------------------------------------------------
+  Page fresh and retire
+----------------------------------------------------------- */
+
+// called from segments when reclaiming abandoned pages
+void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
+  mi_assert_expensive(mi_page_is_valid_init(page));
+
+  mi_assert_internal(mi_page_heap(page) == heap);
+  mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE);
+  #if MI_HUGE_PAGE_ABANDON
+  mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
+  #endif
+  mi_assert_internal(!page->is_reset);
+  // TODO: push on full queue immediately if it is full?
+  mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page));
+  mi_page_queue_push(heap, pq, page);
+  mi_assert_expensive(_mi_page_is_valid(page));
+}
+
+// allocate a fresh page from a segment
+static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) {
+  #if !MI_HUGE_PAGE_ABANDON
+  mi_assert_internal(pq != NULL);
+  mi_assert_internal(mi_heap_contains_queue(heap, pq));
+  mi_assert_internal(page_alignment > 0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || block_size == pq->block_size);
+  #endif
+  mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments, &heap->tld->os);
+  if (page == NULL) {
+    // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue)
+    return NULL;
+  }
+  mi_assert_internal(page_alignment >0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || _mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
+  mi_assert_internal(pq!=NULL || page->xblock_size != 0);
+  mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size);
+  // a fresh page was found, initialize it
+  const size_t full_block_size = ((pq == NULL || mi_page_queue_is_huge(pq)) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc
+  mi_assert_internal(full_block_size >= block_size);
+  mi_page_init(heap, page, full_block_size, heap->tld);
+  mi_heap_stat_increase(heap, pages, 1);
+  if (pq != NULL) { mi_page_queue_push(heap, pq, page); }
+  mi_assert_expensive(_mi_page_is_valid(page));
+  return page;
+}
+
+// Get a fresh page to use
+static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) {
+  mi_assert_internal(mi_heap_contains_queue(heap, pq));
+  mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0);
+  if (page==NULL) return NULL;
+  mi_assert_internal(pq->block_size==mi_page_block_size(page));
+  mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page)));
+  return page;
+}
+
+/* -----------------------------------------------------------
+   Do any delayed frees
+   (put there by other threads if they deallocated in a full page)
+----------------------------------------------------------- */
+void _mi_heap_delayed_free_all(mi_heap_t* heap) {
+  while (!_mi_heap_delayed_free_partial(heap)) {
+    mi_atomic_yield();
+  }
+}
+
+// returns true if all delayed frees were processed
+bool _mi_heap_delayed_free_partial(mi_heap_t* heap) {
+  // take over the list (note: no atomic exchange since it is often NULL)
+  mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
+  while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ };
+  bool all_freed = true;
+
+  // and free them all
+  while(block != NULL) {
+    mi_block_t* next = mi_block_nextx(heap,block, heap->keys);
+    // use internal free instead of regular one to keep stats etc correct
+    if (!_mi_free_delayed_block(block)) {
+      // we might already start delayed freeing while another thread has not yet
+      // reset the delayed_freeing flag; in that case delay it further by reinserting the current block
+      // into the delayed free list
+      all_freed = false;
+      mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
+      do {
+        mi_block_set_nextx(heap, block, dfree, heap->keys);
+      } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
+    }
+    block = next;
+  }
+  return all_freed;
+}
+
+/* -----------------------------------------------------------
+  Unfull, abandon, free and retire
+----------------------------------------------------------- */
+
+// Move a page from the full list back to a regular list
+void _mi_page_unfull(mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  mi_assert_expensive(_mi_page_is_valid(page));
+  mi_assert_internal(mi_page_is_in_full(page));
+  if (!mi_page_is_in_full(page)) return;
+
+  mi_heap_t* heap = mi_page_heap(page);
+  mi_page_queue_t* pqfull = &heap->pages[MI_BIN_FULL];
+  mi_page_set_in_full(page, false); // to get the right queue
+  mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
+  mi_page_set_in_full(page, true);
+  mi_page_queue_enqueue_from(pq, pqfull, page);
+}
+
+static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) {
+  mi_assert_internal(pq == mi_page_queue_of(page));
+  mi_assert_internal(!mi_page_immediate_available(page));
+  mi_assert_internal(!mi_page_is_in_full(page));
+
+  if (mi_page_is_in_full(page)) return;
+  mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page);
+  _mi_page_free_collect(page,false);  // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
+}
+
+
+// Abandon a page with used blocks at the end of a thread.
+// Note: only call if it is ensured that no references exist from
+// the `page->heap->thread_delayed_free` into this page.
+// Currently only called through `mi_heap_collect_ex` which ensures this.
+void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
+  mi_assert_internal(page != NULL);
+  mi_assert_expensive(_mi_page_is_valid(page));
+  mi_assert_internal(pq == mi_page_queue_of(page));
+  mi_assert_internal(mi_page_heap(page) != NULL);
+
+  mi_heap_t* pheap = mi_page_heap(page);
+
+  // remove from our page list
+  mi_segments_tld_t* segments_tld = &pheap->tld->segments;
+  mi_page_queue_remove(pq, page);
+
+  // page is no longer associated with our heap
+  mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
+  mi_page_set_heap(page, NULL);
+
+#if MI_DEBUG>1
+  // check there are no references left..
+  for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) {
+    mi_assert_internal(_mi_ptr_page(block) != page);
+  }
+#endif
+
+  // and abandon it
+  mi_assert_internal(mi_page_heap(page) == NULL);
+  _mi_segment_page_abandon(page,segments_tld);
+}
+
+
+// Free a page with no more free blocks
+void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
+  mi_assert_internal(page != NULL);
+  mi_assert_expensive(_mi_page_is_valid(page));
+  mi_assert_internal(pq == mi_page_queue_of(page));
+  mi_assert_internal(mi_page_all_free(page));
+  mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING);
+
+  // no more aligned blocks in here
+  mi_page_set_has_aligned(page, false);
+
+  mi_heap_t* heap = mi_page_heap(page);
+
+  // remove from the page list
+  // (no need to do _mi_heap_delayed_free first as all blocks are already free)
+  mi_segments_tld_t* segments_tld = &heap->tld->segments;
+  mi_page_queue_remove(pq, page);
+
+  // and free it
+  mi_page_set_heap(page,NULL);
+  _mi_segment_page_free(page, force, segments_tld);
+}
+
+// Retire parameters
+#define MI_MAX_RETIRE_SIZE    (MI_MEDIUM_OBJ_SIZE_MAX)
+#define MI_RETIRE_CYCLES      (8)
+
+// Retire a page with no more used blocks
+// Important to not retire too quickly though as new
+// allocations might coming.
+// Note: called from `mi_free` and benchmarks often
+// trigger this due to freeing everything and then
+// allocating again so careful when changing this.
+void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
+  mi_assert_internal(page != NULL);
+  mi_assert_expensive(_mi_page_is_valid(page));
+  mi_assert_internal(mi_page_all_free(page));
+  
+  mi_page_set_has_aligned(page, false);
+
+  // don't retire too often..
+  // (or we end up retiring and re-allocating most of the time)
+  // NOTE: refine this more: we should not retire if this
+  // is the only page left with free blocks. It is not clear
+  // how to check this efficiently though...
+  // for now, we don't retire if it is the only page left of this size class.
+  mi_page_queue_t* pq = mi_page_queue_of(page);
+  if mi_likely(page->xblock_size <= MI_MAX_RETIRE_SIZE && !mi_page_queue_is_special(pq)) {  // not too large && not full or huge queue?
+    if (pq->last==page && pq->first==page) { // the only page in the queue?
+      mi_stat_counter_increase(_mi_stats_main.page_no_retire,1);
+      page->retire_expire = 1 + (page->xblock_size <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);      
+      mi_heap_t* heap = mi_page_heap(page);
+      mi_assert_internal(pq >= heap->pages);
+      const size_t index = pq - heap->pages;
+      mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE);
+      if (index < heap->page_retired_min) heap->page_retired_min = index;
+      if (index > heap->page_retired_max) heap->page_retired_max = index;
+      mi_assert_internal(mi_page_all_free(page));
+      return; // dont't free after all
+    }
+  }
+  _mi_page_free(page, pq, false);
+}
+
+// free retired pages: we don't need to look at the entire queues
+// since we only retire pages that are at the head position in a queue.
+void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
+  size_t min = MI_BIN_FULL;
+  size_t max = 0;
+  for(size_t bin = heap->page_retired_min; bin <= heap->page_retired_max; bin++) {
+    mi_page_queue_t* pq   = &heap->pages[bin];
+    mi_page_t*       page = pq->first;
+    if (page != NULL && page->retire_expire != 0) {
+      if (mi_page_all_free(page)) {
+        page->retire_expire--;
+        if (force || page->retire_expire == 0) {
+          _mi_page_free(pq->first, pq, force);
+        }
+        else {
+          // keep retired, update min/max
+          if (bin < min) min = bin;
+          if (bin > max) max = bin;
+        }
+      }
+      else {
+        page->retire_expire = 0;
+      }
+    }
+  }
+  heap->page_retired_min = min;
+  heap->page_retired_max = max;
+}
+
+
+/* -----------------------------------------------------------
+  Initialize the initial free list in a page.
+  In secure mode we initialize a randomized list by
+  alternating between slices.
+----------------------------------------------------------- */
+
+#define MI_MAX_SLICE_SHIFT  (6)   // at most 64 slices
+#define MI_MAX_SLICES       (1UL << MI_MAX_SLICE_SHIFT)
+#define MI_MIN_SLICES       (2)
+
+static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) {
+  MI_UNUSED(stats);
+  #if (MI_SECURE<=2)
+  mi_assert_internal(page->free == NULL);
+  mi_assert_internal(page->local_free == NULL);
+  #endif
+  mi_assert_internal(page->capacity + extend <= page->reserved);
+  mi_assert_internal(bsize == mi_page_block_size(page));
+  void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL);
+
+  // initialize a randomized free list
+  // set up `slice_count` slices to alternate between
+  size_t shift = MI_MAX_SLICE_SHIFT;
+  while ((extend >> shift) == 0) {
+    shift--;
+  }
+  const size_t slice_count = (size_t)1U << shift;
+  const size_t slice_extend = extend / slice_count;
+  mi_assert_internal(slice_extend >= 1);
+  mi_block_t* blocks[MI_MAX_SLICES];   // current start of the slice
+  size_t      counts[MI_MAX_SLICES];   // available objects in the slice
+  for (size_t i = 0; i < slice_count; i++) {
+    blocks[i] = mi_page_block_at(page, page_area, bsize, page->capacity + i*slice_extend);
+    counts[i] = slice_extend;
+  }
+  counts[slice_count-1] += (extend % slice_count);  // final slice holds the modulus too (todo: distribute evenly?)
+
+  // and initialize the free list by randomly threading through them
+  // set up first element
+  const uintptr_t r = _mi_heap_random_next(heap);
+  size_t current = r % slice_count;
+  counts[current]--;
+  mi_block_t* const free_start = blocks[current];
+  // and iterate through the rest; use `random_shuffle` for performance
+  uintptr_t rnd = _mi_random_shuffle(r|1); // ensure not 0
+  for (size_t i = 1; i < extend; i++) {
+    // call random_shuffle only every INTPTR_SIZE rounds
+    const size_t round = i%MI_INTPTR_SIZE;
+    if (round == 0) rnd = _mi_random_shuffle(rnd);
+    // select a random next slice index
+    size_t next = ((rnd >> 8*round) & (slice_count-1));
+    while (counts[next]==0) {                            // ensure it still has space
+      next++;
+      if (next==slice_count) next = 0;
+    }
+    // and link the current block to it
+    counts[next]--;
+    mi_block_t* const block = blocks[current];
+    blocks[current] = (mi_block_t*)((uint8_t*)block + bsize);  // bump to the following block
+    mi_block_set_next(page, block, blocks[next]);   // and set next; note: we may have `current == next`
+    current = next;
+  }
+  // prepend to the free list (usually NULL)
+  mi_block_set_next(page, blocks[current], page->free);  // end of the list
+  page->free = free_start;
+}
+
+static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats)
+{
+  MI_UNUSED(stats);
+  #if (MI_SECURE <= 2)
+  mi_assert_internal(page->free == NULL);
+  mi_assert_internal(page->local_free == NULL);
+  #endif
+  mi_assert_internal(page->capacity + extend <= page->reserved);
+  mi_assert_internal(bsize == mi_page_block_size(page));
+  void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL );
+
+  mi_block_t* const start = mi_page_block_at(page, page_area, bsize, page->capacity);
+
+  // initialize a sequential free list
+  mi_block_t* const last = mi_page_block_at(page, page_area, bsize, page->capacity + extend - 1);
+  mi_block_t* block = start;
+  while(block <= last) {
+    mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize);
+    mi_block_set_next(page,block,next);
+    block = next;
+  }
+  // prepend to free list (usually `NULL`)
+  mi_block_set_next(page, last, page->free);
+  page->free = start;
+}
+
+/* -----------------------------------------------------------
+  Page initialize and extend the capacity
+----------------------------------------------------------- */
+
+#define MI_MAX_EXTEND_SIZE    (4*1024)      // heuristic, one OS page seems to work well.
+#if (MI_SECURE>0)
+#define MI_MIN_EXTEND         (8*MI_SECURE) // extend at least by this many
+#else
+#define MI_MIN_EXTEND         (4)
+#endif
+
+// Extend the capacity (up to reserved) by initializing a free list
+// We do at most `MI_MAX_EXTEND` to avoid touching too much memory
+// Note: we also experimented with "bump" allocation on the first
+// allocations but this did not speed up any benchmark (due to an
+// extra test in malloc? or cache effects?)
+static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) {
+  MI_UNUSED(tld); 
+  mi_assert_expensive(mi_page_is_valid_init(page));
+  #if (MI_SECURE<=2)
+  mi_assert(page->free == NULL);
+  mi_assert(page->local_free == NULL);
+  if (page->free != NULL) return;
+  #endif
+  if (page->capacity >= page->reserved) return;
+
+  size_t page_size;
+  _mi_page_start(_mi_page_segment(page), page, &page_size);
+  mi_stat_counter_increase(tld->stats.pages_extended, 1);
+
+  // calculate the extend count
+  const size_t bsize = (page->xblock_size < MI_HUGE_BLOCK_SIZE ? page->xblock_size : page_size);
+  size_t extend = page->reserved - page->capacity;
+  mi_assert_internal(extend > 0);
+
+  size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/(uint32_t)bsize);
+  if (max_extend < MI_MIN_EXTEND) { max_extend = MI_MIN_EXTEND; }
+  mi_assert_internal(max_extend > 0);
+
+  if (extend > max_extend) {
+    // ensure we don't touch memory beyond the page to reduce page commit.
+    // the `lean` benchmark tests this. Going from 1 to 8 increases rss by 50%.
+    extend = max_extend;
+  }
+
+  mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved);
+  mi_assert_internal(extend < (1UL<<16));
+
+  // and append the extend the free list
+  if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) {
+    mi_page_free_list_extend(page, bsize, extend, &tld->stats );
+  }
+  else {
+    mi_page_free_list_extend_secure(heap, page, bsize, extend, &tld->stats);
+  }
+  // enable the new free list
+  page->capacity += (uint16_t)extend;
+  mi_stat_increase(tld->stats.page_committed, extend * bsize);
+
+  // extension into zero initialized memory preserves the zero'd free list
+  if (!page->is_zero_init) {
+    page->is_zero = false;
+  }
+  mi_assert_expensive(mi_page_is_valid_init(page));
+}
+
+// Initialize a fresh page
+static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) {
+  mi_assert(page != NULL);
+  mi_segment_t* segment = _mi_page_segment(page);
+  mi_assert(segment != NULL);
+  mi_assert_internal(block_size > 0);
+  // set fields
+  mi_page_set_heap(page, heap);
+  page->xblock_size = (block_size < MI_HUGE_BLOCK_SIZE ? (uint32_t)block_size : MI_HUGE_BLOCK_SIZE); // initialize before _mi_segment_page_start
+  size_t page_size;
+  const void* page_start = _mi_segment_page_start(segment, page, &page_size);
+  MI_UNUSED(page_start);
+  mi_track_mem_noaccess(page_start,page_size);
+  mi_assert_internal(mi_page_block_size(page) <= page_size);
+  mi_assert_internal(page_size <= page->slice_count*MI_SEGMENT_SLICE_SIZE);
+  mi_assert_internal(page_size / block_size < (1L<<16));
+  page->reserved = (uint16_t)(page_size / block_size);
+  mi_assert_internal(page->reserved > 0);
+  #ifdef MI_ENCODE_FREELIST
+  page->keys[0] = _mi_heap_random_next(heap);
+  page->keys[1] = _mi_heap_random_next(heap);
+  #endif
+  #if MI_DEBUG > 0
+  page->is_zero = false; // ensure in debug mode we initialize with MI_DEBUG_UNINIT, see issue #501
+  #else
+  page->is_zero = page->is_zero_init;
+  #endif
+
+  mi_assert_internal(page->is_committed);
+  mi_assert_internal(!page->is_reset);
+  mi_assert_internal(page->capacity == 0);
+  mi_assert_internal(page->free == NULL);
+  mi_assert_internal(page->used == 0);
+  mi_assert_internal(page->xthread_free == 0);
+  mi_assert_internal(page->next == NULL);
+  mi_assert_internal(page->prev == NULL);
+  mi_assert_internal(page->retire_expire == 0);
+  mi_assert_internal(!mi_page_has_aligned(page));
+  #if (MI_ENCODE_FREELIST)
+  mi_assert_internal(page->keys[0] != 0);
+  mi_assert_internal(page->keys[1] != 0);
+  #endif
+  mi_assert_expensive(mi_page_is_valid_init(page));
+
+  // initialize an initial free list
+  mi_page_extend_free(heap,page,tld);
+  mi_assert(mi_page_immediate_available(page));
+}
+
+
+/* -----------------------------------------------------------
+  Find pages with free blocks
+-------------------------------------------------------------*/
+
+// Find a page with free blocks of `page->block_size`.
+static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try)
+{
+  // search through the pages in "next fit" order
+  size_t count = 0;
+  mi_page_t* page = pq->first;
+  while (page != NULL)
+  {
+    mi_page_t* next = page->next; // remember next
+    count++;
+
+    // 0. collect freed blocks by us and other threads
+    _mi_page_free_collect(page, false);
+
+    // 1. if the page contains free blocks, we are done
+    if (mi_page_immediate_available(page)) {
+      break;  // pick this one
+    }
+
+    // 2. Try to extend
+    if (page->capacity < page->reserved) {
+      mi_page_extend_free(heap, page, heap->tld);
+      mi_assert_internal(mi_page_immediate_available(page));
+      break;
+    }
+
+    // 3. If the page is completely full, move it to the `mi_pages_full`
+    // queue so we don't visit long-lived pages too often.
+    mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page));
+    mi_page_to_full(page, pq);
+
+    page = next;
+  } // for each page
+
+  mi_heap_stat_counter_increase(heap, searches, count);
+
+  if (page == NULL) {
+    _mi_heap_collect_retired(heap, false); // perhaps make a page available?
+    page = mi_page_fresh(heap, pq);
+    if (page == NULL && first_try) {
+      // out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again
+      page = mi_page_queue_find_free_ex(heap, pq, false);
+    }
+  }
+  else {
+    mi_assert(pq->first == page);
+    page->retire_expire = 0;
+  }
+  mi_assert_internal(page == NULL || mi_page_immediate_available(page));
+  return page;
+}
+
+
+
+// Find a page with free blocks of `size`.
+static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) {
+  mi_page_queue_t* pq = mi_page_queue(heap,size);
+  mi_page_t* page = pq->first;
+  if (page != NULL) {
+   #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness
+    if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) {
+      mi_page_extend_free(heap, page, heap->tld);
+      mi_assert_internal(mi_page_immediate_available(page));
+    }
+    else
+   #endif
+    {
+      _mi_page_free_collect(page,false);
+    }
+
+    if (mi_page_immediate_available(page)) {
+      page->retire_expire = 0;
+      return page; // fast path
+    }
+  }
+  return mi_page_queue_find_free_ex(heap, pq, true);
+}
+
+
+/* -----------------------------------------------------------
+  Users can register a deferred free function called
+  when the `free` list is empty. Since the `local_free`
+  is separate this is deterministically called after
+  a certain number of allocations.
+----------------------------------------------------------- */
+
+static mi_deferred_free_fun* volatile deferred_free = NULL;
+static _Atomic(void*) deferred_arg; // = NULL
+
+void _mi_deferred_free(mi_heap_t* heap, bool force) {
+  heap->tld->heartbeat++;
+  if (deferred_free != NULL && !heap->tld->recurse) {
+    heap->tld->recurse = true;
+    deferred_free(force, heap->tld->heartbeat, mi_atomic_load_ptr_relaxed(void,&deferred_arg));
+    heap->tld->recurse = false;
+  }
+}
+
+void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noexcept {
+  deferred_free = fn;
+  mi_atomic_store_ptr_release(void,&deferred_arg, arg);
+}
+
+
+/* -----------------------------------------------------------
+  General allocation
+----------------------------------------------------------- */
+
+// Large and huge page allocation.
+// Huge pages are allocated directly without being in a queue.
+// Because huge pages contain just one block, and the segment contains
+// just that page, we always treat them as abandoned and any thread
+// that frees the block can free the whole page and segment directly.
+// Huge pages are also use if the requested alignment is very large (> MI_ALIGNMENT_MAX).
+static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) {
+  size_t block_size = _mi_os_good_alloc_size(size);
+  mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0);
+  bool is_huge = (block_size > MI_LARGE_OBJ_SIZE_MAX || page_alignment > 0);
+  #if MI_HUGE_PAGE_ABANDON
+  mi_page_queue_t* pq = (is_huge ? NULL : mi_page_queue(heap, block_size));
+  #else
+  mi_page_queue_t* pq = mi_page_queue(heap, is_huge ? MI_HUGE_BLOCK_SIZE : block_size); // not block_size as that can be low if the page_alignment > 0
+  mi_assert_internal(!is_huge || mi_page_queue_is_huge(pq));
+  #endif
+  mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment);
+  if (page != NULL) {
+    mi_assert_internal(mi_page_immediate_available(page));
+    
+    if (is_huge) {
+      mi_assert_internal(_mi_page_segment(page)->kind == MI_SEGMENT_HUGE);
+      mi_assert_internal(_mi_page_segment(page)->used==1);
+      #if MI_HUGE_PAGE_ABANDON
+      mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue
+      mi_page_set_heap(page, NULL);
+      #endif      
+    }
+    else {
+      mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
+    }
+    
+    const size_t bsize = mi_page_usable_block_size(page);  // note: not `mi_page_block_size` to account for padding
+    if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+      mi_heap_stat_increase(heap, large, bsize);
+      mi_heap_stat_counter_increase(heap, large_count, 1);
+    }
+    else {
+      mi_heap_stat_increase(heap, huge, bsize);
+      mi_heap_stat_counter_increase(heap, huge_count, 1);
+    }
+  }
+  return page;
+}
+
+
+// Allocate a page
+// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
+static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept {
+  // huge allocation?
+  const size_t req_size = size - MI_PADDING_SIZE;  // correct for padding_size in case of an overflow on `size`  
+  if mi_unlikely(req_size > (MI_MEDIUM_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) {
+    if mi_unlikely(req_size > PTRDIFF_MAX) {  // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
+      _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size);
+      return NULL;
+    }
+    else {
+      return mi_large_huge_page_alloc(heap,size,huge_alignment);
+    }
+  }
+  else {
+    // otherwise find a page with free blocks in our size segregated queues
+    mi_assert_internal(size >= MI_PADDING_SIZE);
+    return mi_find_free_page(heap, size);
+  }
+}
+
+// Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed.
+// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
+// The `huge_alignment` is normally 0 but is set to a multiple of MI_SEGMENT_SIZE for
+// very large requested alignments in which case we use a huge segment.
+void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept
+{
+  mi_assert_internal(heap != NULL);
+
+  // initialize if necessary
+  if mi_unlikely(!mi_heap_is_initialized(heap)) {
+    mi_thread_init(); // calls `_mi_heap_init` in turn
+    heap = mi_get_default_heap();
+    if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; }
+  }
+  mi_assert_internal(mi_heap_is_initialized(heap));
+
+  // call potential deferred free routines
+  _mi_deferred_free(heap, false);
+
+  // free delayed frees from other threads (but skip contended ones)
+  _mi_heap_delayed_free_partial(heap);
+
+  // find (or allocate) a page of the right size
+  mi_page_t* page = mi_find_page(heap, size, huge_alignment);
+  if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more
+    mi_heap_collect(heap, true /* force */);
+    page = mi_find_page(heap, size, huge_alignment);
+  }
+
+  if mi_unlikely(page == NULL) { // out of memory
+    const size_t req_size = size - MI_PADDING_SIZE;  // correct for padding_size in case of an overflow on `size`
+    _mi_error_message(ENOMEM, "unable to allocate memory (%zu bytes)\n", req_size);
+    return NULL;
+  }
+
+  mi_assert_internal(mi_page_immediate_available(page));
+  mi_assert_internal(mi_page_block_size(page) >= size);
+
+  // and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc)
+  if mi_unlikely(zero && page->xblock_size == 0) {
+    // note: we cannot call _mi_page_malloc with zeroing for huge blocks; we zero it afterwards in that case.
+    void* p = _mi_page_malloc(heap, page, size, false);
+    mi_assert_internal(p != NULL);
+    _mi_memzero_aligned(p, mi_page_usable_block_size(page));
+    return p;
+  }
+  else {
+    return _mi_page_malloc(heap, page, size, zero);
+  }
+}
diff --git a/Objects/mimalloc/random.c b/Objects/mimalloc/random.c
new file mode 100644
index 00000000000000..06d4ba4ad67a98
--- /dev/null
+++ b/Objects/mimalloc/random.c
@@ -0,0 +1,404 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#ifndef _DEFAULT_SOURCE
+#define _DEFAULT_SOURCE   // for syscall() on Linux
+#endif
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+
+#include <string.h> // memset
+
+/* ----------------------------------------------------------------------------
+We use our own PRNG to keep predictable performance of random number generation
+and to avoid implementations that use a lock. We only use the OS provided
+random source to initialize the initial seeds. Since we do not need ultimate
+performance but we do rely on the security (for secret cookies in secure mode)
+we use a cryptographically secure generator (chacha20).
+-----------------------------------------------------------------------------*/
+
+#define MI_CHACHA_ROUNDS (20)   // perhaps use 12 for better performance?
+
+
+/* ----------------------------------------------------------------------------
+Chacha20 implementation as the original algorithm with a 64-bit nonce
+and counter: https://en.wikipedia.org/wiki/Salsa20
+The input matrix has sixteen 32-bit values:
+Position  0 to  3: constant key
+Position  4 to 11: the key
+Position 12 to 13: the counter.
+Position 14 to 15: the nonce.
+
+The implementation uses regular C code which compiles very well on modern compilers.
+(gcc x64 has no register spills, and clang 6+ uses SSE instructions)
+-----------------------------------------------------------------------------*/
+
+static inline uint32_t rotl(uint32_t x, uint32_t shift) {
+  return (x << shift) | (x >> (32 - shift));
+}
+
+static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) {
+  x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16);
+  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12);
+  x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8);
+  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7);
+}
+
+static void chacha_block(mi_random_ctx_t* ctx)
+{
+  // scramble into `x`
+  uint32_t x[16];
+  for (size_t i = 0; i < 16; i++) {
+    x[i] = ctx->input[i];
+  }
+  for (size_t i = 0; i < MI_CHACHA_ROUNDS; i += 2) {
+    qround(x, 0, 4,  8, 12);
+    qround(x, 1, 5,  9, 13);
+    qround(x, 2, 6, 10, 14);
+    qround(x, 3, 7, 11, 15);
+    qround(x, 0, 5, 10, 15);
+    qround(x, 1, 6, 11, 12);
+    qround(x, 2, 7,  8, 13);
+    qround(x, 3, 4,  9, 14);
+  }
+
+  // add scrambled data to the initial state
+  for (size_t i = 0; i < 16; i++) {
+    ctx->output[i] = x[i] + ctx->input[i];
+  }
+  ctx->output_available = 16;
+
+  // increment the counter for the next round
+  ctx->input[12] += 1;
+  if (ctx->input[12] == 0) {
+    ctx->input[13] += 1;
+    if (ctx->input[13] == 0) {  // and keep increasing into the nonce
+      ctx->input[14] += 1;
+    }
+  }
+}
+
+static uint32_t chacha_next32(mi_random_ctx_t* ctx) {
+  if (ctx->output_available <= 0) {
+    chacha_block(ctx);
+    ctx->output_available = 16; // (assign again to suppress static analysis warning)
+  }
+  const uint32_t x = ctx->output[16 - ctx->output_available];
+  ctx->output[16 - ctx->output_available] = 0; // reset once the data is handed out
+  ctx->output_available--;
+  return x;
+}
+
+static inline uint32_t read32(const uint8_t* p, size_t idx32) {
+  const size_t i = 4*idx32;
+  return ((uint32_t)p[i+0] | (uint32_t)p[i+1] << 8 | (uint32_t)p[i+2] << 16 | (uint32_t)p[i+3] << 24);
+}
+
+static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t nonce)
+{
+  // since we only use chacha for randomness (and not encryption) we
+  // do not _need_ to read 32-bit values as little endian but we do anyways
+  // just for being compatible :-)
+  memset(ctx, 0, sizeof(*ctx));
+  for (size_t i = 0; i < 4; i++) {
+    const uint8_t* sigma = (uint8_t*)"expand 32-byte k";
+    ctx->input[i] = read32(sigma,i);
+  }
+  for (size_t i = 0; i < 8; i++) {
+    ctx->input[i + 4] = read32(key,i);
+  }
+  ctx->input[12] = 0;
+  ctx->input[13] = 0;
+  ctx->input[14] = (uint32_t)nonce;
+  ctx->input[15] = (uint32_t)(nonce >> 32);
+}
+
+static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
+  memset(ctx_new, 0, sizeof(*ctx_new));
+  _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input));
+  ctx_new->input[12] = 0;
+  ctx_new->input[13] = 0;
+  ctx_new->input[14] = (uint32_t)nonce;
+  ctx_new->input[15] = (uint32_t)(nonce >> 32);
+  mi_assert_internal(ctx->input[14] != ctx_new->input[14] || ctx->input[15] != ctx_new->input[15]); // do not reuse nonces!
+  chacha_block(ctx_new);
+}
+
+
+/* ----------------------------------------------------------------------------
+Random interface
+-----------------------------------------------------------------------------*/
+
+#if MI_DEBUG>1
+static bool mi_random_is_initialized(mi_random_ctx_t* ctx) {
+  return (ctx != NULL && ctx->input[0] != 0);
+}
+#endif
+
+void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* ctx_new) {
+  mi_assert_internal(mi_random_is_initialized(ctx));
+  mi_assert_internal(ctx != ctx_new);
+  chacha_split(ctx, (uintptr_t)ctx_new /*nonce*/, ctx_new);
+}
+
+uintptr_t _mi_random_next(mi_random_ctx_t* ctx) {
+  mi_assert_internal(mi_random_is_initialized(ctx));
+  #if MI_INTPTR_SIZE <= 4
+    return chacha_next32(ctx);
+  #elif MI_INTPTR_SIZE == 8
+    return (((uintptr_t)chacha_next32(ctx) << 32) | chacha_next32(ctx));
+  #else
+  # error "define mi_random_next for this platform"
+  #endif
+}
+
+
+/* ----------------------------------------------------------------------------
+To initialize a fresh random context we rely on the OS:
+- Windows     : BCryptGenRandom (or RtlGenRandom)
+- macOS       : CCRandomGenerateBytes, arc4random_buf
+- bsd,wasi    : arc4random_buf
+- Linux       : getrandom,/dev/urandom
+If we cannot get good randomness, we fall back to weak randomness based on a timer and ASLR.
+-----------------------------------------------------------------------------*/
+
+#if defined(_WIN32)
+
+#if defined(MI_USE_RTLGENRANDOM) // || defined(__cplusplus)
+// We prefer to use BCryptGenRandom instead of (the unofficial) RtlGenRandom but when using
+// dynamic overriding, we observed it can raise an exception when compiled with C++, and
+// sometimes deadlocks when also running under the VS debugger.
+// In contrast, issue #623 implies that on Windows Server 2019 we need to use BCryptGenRandom.
+// To be continued..
+#pragma comment (lib,"advapi32.lib")
+#define RtlGenRandom  SystemFunction036
+#ifdef __cplusplus
+extern "C" {
+#endif
+BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength);
+#ifdef __cplusplus
+}
+#endif
+static bool os_random_buf(void* buf, size_t buf_len) {
+  return (RtlGenRandom(buf, (ULONG)buf_len) != 0);
+}
+#else
+
+#ifndef BCRYPT_USE_SYSTEM_PREFERRED_RNG
+#define BCRYPT_USE_SYSTEM_PREFERRED_RNG 0x00000002
+#endif
+
+typedef LONG (NTAPI *PBCryptGenRandom)(HANDLE, PUCHAR, ULONG, ULONG);
+static  PBCryptGenRandom pBCryptGenRandom = NULL;
+
+static bool os_random_buf(void* buf, size_t buf_len) {
+  if (pBCryptGenRandom == NULL) {
+    HINSTANCE hDll = LoadLibrary(TEXT("bcrypt.dll"));
+    if (hDll != NULL) {
+      pBCryptGenRandom = (PBCryptGenRandom)(void (*)(void))GetProcAddress(hDll, "BCryptGenRandom");
+    }
+  }
+  if (pBCryptGenRandom == NULL) {
+    return false;
+  }
+  else {
+    return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0);
+  }
+}
+#endif
+
+#elif defined(__APPLE__)
+#include <AvailabilityMacros.h>
+#if defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10
+#include <CommonCrypto/CommonCryptoError.h>
+#include <CommonCrypto/CommonRandom.h>
+#endif
+static bool os_random_buf(void* buf, size_t buf_len) {
+  #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15
+    // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf
+    // may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html>
+    return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);
+  #else
+    // fall back on older macOS
+    arc4random_buf(buf, buf_len);
+    return true;
+  #endif
+}
+
+#elif defined(__ANDROID__) || defined(__DragonFly__) || \
+      defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+      defined(__sun) // todo: what to use with __wasi__?
+#include <stdlib.h>
+static bool os_random_buf(void* buf, size_t buf_len) {
+  arc4random_buf(buf, buf_len);
+  return true;
+}
+#elif defined(__linux__) || defined(__HAIKU__)
+#if defined(__linux__)
+#include <sys/syscall.h>
+#endif
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+static bool os_random_buf(void* buf, size_t buf_len) {
+  // Modern Linux provides `getrandom` but different distributions either use `sys/random.h` or `linux/random.h`
+  // and for the latter the actual `getrandom` call is not always defined.
+  // (see <https://stackoverflow.com/questions/45237324/why-doesnt-getrandom-compile>)
+  // We therefore use a syscall directly and fall back dynamically to /dev/urandom when needed.
+#ifdef SYS_getrandom
+  #ifndef GRND_NONBLOCK
+  #define GRND_NONBLOCK (1)
+  #endif
+  static _Atomic(uintptr_t) no_getrandom; // = 0
+  if (mi_atomic_load_acquire(&no_getrandom)==0) {
+    ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK);
+    if (ret >= 0) return (buf_len == (size_t)ret);
+    if (errno != ENOSYS) return false;
+    mi_atomic_store_release(&no_getrandom, 1UL); // don't call again, and fall back to /dev/urandom
+  }
+#endif
+  int flags = O_RDONLY;
+  #if defined(O_CLOEXEC)
+  flags |= O_CLOEXEC;
+  #endif
+  int fd = open("/dev/urandom", flags, 0);
+  if (fd < 0) return false;
+  size_t count = 0;
+  while(count < buf_len) {
+    ssize_t ret = read(fd, (char*)buf + count, buf_len - count);
+    if (ret<=0) {
+      if (errno!=EAGAIN && errno!=EINTR) break;
+    }
+    else {
+      count += ret;
+    }
+  }
+  close(fd);
+  return (count==buf_len);
+}
+#else
+static bool os_random_buf(void* buf, size_t buf_len) {
+  return false;
+}
+#endif
+
+#if defined(_WIN32)
+#include <windows.h>
+#elif defined(__APPLE__)
+#include <mach/mach_time.h>
+#else
+#include <time.h>
+#endif
+
+uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
+  uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random
+
+  #if defined(_WIN32)
+    LARGE_INTEGER pcount;
+    QueryPerformanceCounter(&pcount);
+    x ^= (uintptr_t)(pcount.QuadPart);
+  #elif defined(__APPLE__)
+    x ^= (uintptr_t)mach_absolute_time();
+  #else
+    struct timespec time;
+    clock_gettime(CLOCK_MONOTONIC, &time);
+    x ^= (uintptr_t)time.tv_sec;
+    x ^= (uintptr_t)time.tv_nsec;
+  #endif
+  // and do a few randomization steps
+  uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
+  for (uintptr_t i = 0; i < max; i++) {
+    x = _mi_random_shuffle(x);
+  }
+  mi_assert_internal(x != 0);
+  return x;
+}
+
+static void mi_random_init_ex(mi_random_ctx_t* ctx, bool use_weak) {
+  uint8_t key[32];
+  if (use_weak || !os_random_buf(key, sizeof(key))) {
+    // if we fail to get random data from the OS, we fall back to a
+    // weak random source based on the current time
+    #if !defined(__wasi__)
+    if (!use_weak) { _mi_warning_message("unable to use secure randomness\n"); }
+    #endif
+    uintptr_t x = _mi_os_random_weak(0);
+    for (size_t i = 0; i < 8; i++) {  // key is eight 32-bit words.
+      x = _mi_random_shuffle(x);
+      ((uint32_t*)key)[i] = (uint32_t)x;
+    }
+    ctx->weak = true;
+  }
+  else {
+    ctx->weak = false;
+  }
+  chacha_init(ctx, key, (uintptr_t)ctx /*nonce*/ );
+}
+
+void _mi_random_init(mi_random_ctx_t* ctx) {
+  mi_random_init_ex(ctx, false);
+}
+
+void _mi_random_init_weak(mi_random_ctx_t * ctx) {
+  mi_random_init_ex(ctx, true);
+}
+
+void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx) {
+  if (ctx->weak) {
+    _mi_random_init(ctx);
+  }
+}
+
+/* --------------------------------------------------------
+test vectors from <https://tools.ietf.org/html/rfc8439>
+----------------------------------------------------------- */
+/*
+static bool array_equals(uint32_t* x, uint32_t* y, size_t n) {
+  for (size_t i = 0; i < n; i++) {
+    if (x[i] != y[i]) return false;
+  }
+  return true;
+}
+static void chacha_test(void)
+{
+  uint32_t x[4] = { 0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567 };
+  uint32_t x_out[4] = { 0xea2a92f4, 0xcb1cf8ce, 0x4581472e, 0x5881c4bb };
+  qround(x, 0, 1, 2, 3);
+  mi_assert_internal(array_equals(x, x_out, 4));
+
+  uint32_t y[16] = {
+       0x879531e0,  0xc5ecf37d,  0x516461b1,  0xc9a62f8a,
+       0x44c20ef3,  0x3390af7f,  0xd9fc690b,  0x2a5f714c,
+       0x53372767,  0xb00a5631,  0x974c541a,  0x359e9963,
+       0x5c971061,  0x3d631689,  0x2098d9d6,  0x91dbd320 };
+  uint32_t y_out[16] = {
+       0x879531e0,  0xc5ecf37d,  0xbdb886dc,  0xc9a62f8a,
+       0x44c20ef3,  0x3390af7f,  0xd9fc690b,  0xcfacafd2,
+       0xe46bea80,  0xb00a5631,  0x974c541a,  0x359e9963,
+       0x5c971061,  0xccc07c79,  0x2098d9d6,  0x91dbd320 };
+  qround(y, 2, 7, 8, 13);
+  mi_assert_internal(array_equals(y, y_out, 16));
+
+  mi_random_ctx_t r = {
+    { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
+      0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c,
+      0x13121110, 0x17161514, 0x1b1a1918, 0x1f1e1d1c,
+      0x00000001, 0x09000000, 0x4a000000, 0x00000000 },
+    {0},
+    0
+  };
+  uint32_t r_out[16] = {
+       0xe4e7f110, 0x15593bd1, 0x1fdd0f50, 0xc47120a3,
+       0xc7f4d1c7, 0x0368c033, 0x9aaa2204, 0x4e6cd4c3,
+       0x466482d2, 0x09aa9f07, 0x05d7c214, 0xa2028bd9,
+       0xd19c12b5, 0xb94e16de, 0xe883d0cb, 0x4e3c50a2 };
+  chacha_block(&r);
+  mi_assert_internal(array_equals(r.output, r_out, 16));
+}
+*/
diff --git a/Objects/mimalloc/region.c b/Objects/mimalloc/region.c
new file mode 100644
index 00000000000000..3571abb602bb98
--- /dev/null
+++ b/Objects/mimalloc/region.c
@@ -0,0 +1,516 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+This implements a layer between the raw OS memory (VirtualAlloc/mmap/sbrk/..)
+and the segment and huge object allocation by mimalloc. There may be multiple
+implementations of this (one could be the identity going directly to the OS,
+another could be a simple cache etc), but the current one uses large "regions".
+In contrast to the rest of mimalloc, the "regions" are shared between threads and
+need to be accessed using atomic operations.
+We need this memory layer between the raw OS calls because of:
+1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order
+   to reuse memory effectively.
+2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of
+   an OS allocation/free is still (much) too expensive relative to the accesses
+   in that object :-( (`malloc-large` tests this). This means we need a cheaper
+   way to reuse memory.
+3. This layer allows for NUMA aware allocation.
+
+Possible issues:
+- (2) can potentially be addressed too with a small cache per thread which is much
+  simpler. Generally though that requires shrinking of huge pages, and may overuse
+  memory per thread. (and is not compatible with `sbrk`).
+- Since the current regions are per-process, we need atomic operations to
+  claim blocks which may be contended
+- In the worst case, we need to search the whole region map (16KiB for 256GiB)
+  linearly. At what point will direct OS calls be faster? Is there a way to
+  do this better without adding too much complexity?
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h>  // memset
+
+#include "bitmap.h"
+
+// Internal raw OS interface
+size_t  _mi_os_large_page_size(void);
+bool    _mi_os_protect(void* addr, size_t size);
+bool    _mi_os_unprotect(void* addr, size_t size);
+bool    _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+bool    _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
+bool    _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
+bool    _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+bool    _mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats);
+
+// arena.c
+mi_arena_id_t _mi_arena_id_none(void);
+void    _mi_arena_free(void* p, size_t size, size_t alignment, size_t align_offset, size_t memid, bool all_committed, mi_stats_t* stats);
+void*   _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld);
+void*   _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld);
+
+
+
+// Constants
+#if (MI_INTPTR_SIZE==8)
+#define MI_HEAP_REGION_MAX_SIZE    (256 * MI_GiB)  // 64KiB for the region map
+#elif (MI_INTPTR_SIZE==4)
+#define MI_HEAP_REGION_MAX_SIZE    (3 * MI_GiB)    // ~ KiB for the region map
+#else
+#error "define the maximum heap space allowed for regions on this platform"
+#endif
+
+#define MI_REGION_MAX_BLOCKS      MI_BITMAP_FIELD_BITS
+#define MI_REGION_SIZE            (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS)    // 256MiB  (64MiB on 32 bits)
+#define MI_REGION_MAX             (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE)  // 1024  (48 on 32 bits)
+#define MI_REGION_MAX_OBJ_BLOCKS  (MI_REGION_MAX_BLOCKS/4)                    // 64MiB
+#define MI_REGION_MAX_OBJ_SIZE    (MI_REGION_MAX_OBJ_BLOCKS*MI_SEGMENT_SIZE)
+
+// Region info
+typedef union mi_region_info_u {
+  size_t value;
+  struct {
+    bool  valid;        // initialized?
+    bool  is_large:1;   // allocated in fixed large/huge OS pages
+    bool  is_pinned:1;  // pinned memory cannot be decommitted
+    short numa_node;    // the associated NUMA node (where -1 means no associated node)
+  } x;
+} mi_region_info_t;
+
+
+// A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with
+// a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block.
+typedef struct mem_region_s {
+  _Atomic(size_t)           info;        // mi_region_info_t.value
+  _Atomic(void*)            start;       // start of the memory area
+  mi_bitmap_field_t         in_use;      // bit per in-use block
+  mi_bitmap_field_t         dirty;       // track if non-zero per block
+  mi_bitmap_field_t         commit;      // track if committed per block
+  mi_bitmap_field_t         reset;       // track if reset per block
+  _Atomic(size_t)           arena_memid; // if allocated from a (huge page) arena
+  _Atomic(size_t)           padding;     // round to 8 fields (needs to be atomic for msvc, see issue #508)
+} mem_region_t;
+
+// The region map
+static mem_region_t regions[MI_REGION_MAX];
+
+// Allocated regions
+static _Atomic(size_t) regions_count; // = 0;
+
+
+/* ----------------------------------------------------------------------------
+Utility functions
+-----------------------------------------------------------------------------*/
+
+// Blocks (of 4MiB) needed for the given size.
+static size_t mi_region_block_count(size_t size) {
+  return _mi_divide_up(size, MI_SEGMENT_SIZE);
+}
+
+/*
+// Return a rounded commit/reset size such that we don't fragment large OS pages into small ones.
+static size_t mi_good_commit_size(size_t size) {
+  if (size > (SIZE_MAX - _mi_os_large_page_size())) return size;
+  return _mi_align_up(size, _mi_os_large_page_size());
+}
+*/
+
+// Return if a pointer points into a region reserved by us.
+mi_decl_nodiscard bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
+  if (p==NULL) return false;
+  size_t count = mi_atomic_load_relaxed(&regions_count);
+  for (size_t i = 0; i < count; i++) {
+    uint8_t* start = (uint8_t*)mi_atomic_load_ptr_relaxed(uint8_t, &regions[i].start);
+    if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true;
+  }
+  return false;
+}
+
+
+static void* mi_region_blocks_start(const mem_region_t* region, mi_bitmap_index_t bit_idx) {
+  uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t, &((mem_region_t*)region)->start);
+  mi_assert_internal(start != NULL);
+  return (start + (bit_idx * MI_SEGMENT_SIZE));
+}
+
+static size_t mi_memid_create(mem_region_t* region, mi_bitmap_index_t bit_idx) {
+  mi_assert_internal(bit_idx < MI_BITMAP_FIELD_BITS);
+  size_t idx = region - regions;
+  mi_assert_internal(&regions[idx] == region);
+  return (idx*MI_BITMAP_FIELD_BITS + bit_idx)<<1;
+}
+
+static size_t mi_memid_create_from_arena(size_t arena_memid) {
+  return (arena_memid << 1) | 1;
+}
+
+
+static bool mi_memid_is_arena(size_t id, mem_region_t** region, mi_bitmap_index_t* bit_idx, size_t* arena_memid) {
+  if ((id&1)==1) {
+    if (arena_memid != NULL) *arena_memid = (id>>1);
+    return true;
+  }
+  else {
+    size_t idx = (id >> 1) / MI_BITMAP_FIELD_BITS;
+    *bit_idx   = (mi_bitmap_index_t)(id>>1) % MI_BITMAP_FIELD_BITS;
+    *region    = &regions[idx];
+    return false;
+  }
+}
+
+
+/* ----------------------------------------------------------------------------
+  Allocate a region is allocated from the OS (or an arena)
+-----------------------------------------------------------------------------*/
+
+static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
+{
+  // not out of regions yet?
+  if (mi_atomic_load_relaxed(&regions_count) >= MI_REGION_MAX - 1) return false;
+
+  // try to allocate a fresh region from the OS
+  bool region_commit = (commit && mi_option_is_enabled(mi_option_eager_region_commit));
+  bool region_large = (commit && allow_large);
+  bool is_zero = false;
+  bool is_pinned = false;
+  size_t arena_memid = 0;
+  void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, 0, &region_commit, &region_large, &is_pinned, &is_zero, _mi_arena_id_none(),  & arena_memid, tld);
+  if (start == NULL) return false;
+  mi_assert_internal(!(region_large && !allow_large));
+  mi_assert_internal(!region_large || region_commit);
+
+  // claim a fresh slot
+  const size_t idx = mi_atomic_increment_acq_rel(&regions_count);
+  if (idx >= MI_REGION_MAX) {
+    mi_atomic_decrement_acq_rel(&regions_count);
+    _mi_arena_free(start, MI_REGION_SIZE, MI_SEGMENT_ALIGN, 0, arena_memid, region_commit, tld->stats);
+    _mi_warning_message("maximum regions used: %zu GiB (perhaps recompile with a larger setting for MI_HEAP_REGION_MAX_SIZE)", _mi_divide_up(MI_HEAP_REGION_MAX_SIZE, MI_GiB));
+    return false;
+  }
+
+  // allocated, initialize and claim the initial blocks
+  mem_region_t* r = &regions[idx];
+  r->arena_memid  = arena_memid;
+  mi_atomic_store_release(&r->in_use, (size_t)0);
+  mi_atomic_store_release(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL));
+  mi_atomic_store_release(&r->commit, (region_commit ? MI_BITMAP_FIELD_FULL : 0));
+  mi_atomic_store_release(&r->reset, (size_t)0);
+  *bit_idx = 0;
+  _mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL);
+  mi_atomic_store_ptr_release(void,&r->start, start);
+
+  // and share it
+  mi_region_info_t info;
+  info.value = 0;                        // initialize the full union to zero
+  info.x.valid = true;
+  info.x.is_large = region_large;
+  info.x.is_pinned = is_pinned;
+  info.x.numa_node = (short)_mi_os_numa_node(tld);
+  mi_atomic_store_release(&r->info, info.value); // now make it available to others
+  *region = r;
+  return true;
+}
+
+/* ----------------------------------------------------------------------------
+  Try to claim blocks in suitable regions
+-----------------------------------------------------------------------------*/
+
+static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool allow_large ) {
+  // initialized at all?
+  mi_region_info_t info;
+  info.value = mi_atomic_load_relaxed(&((mem_region_t*)region)->info);
+  if (info.value==0) return false;
+
+  // numa correct
+  if (numa_node >= 0) {  // use negative numa node to always succeed
+    int rnode = info.x.numa_node;
+    if (rnode >= 0 && rnode != numa_node) return false;
+  }
+
+  // check allow-large
+  if (!allow_large && info.x.is_large) return false;
+
+  return true;
+}
+
+
+static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
+{
+  // try all regions for a free slot
+  const size_t count = mi_atomic_load_relaxed(&regions_count); // monotonic, so ok to be relaxed
+  size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? Starting at 0 seems to increase latency though
+  for (size_t visited = 0; visited < count; visited++, idx++) {
+    if (idx >= count) idx = 0;  // wrap around
+    mem_region_t* r = &regions[idx];
+    // if this region suits our demand (numa node matches, large OS page matches)
+    if (mi_region_is_suitable(r, numa_node, allow_large)) {
+      // then try to atomically claim a segment(s) in this region
+      if (_mi_bitmap_try_find_claim_field(&r->in_use, 0, blocks, bit_idx)) {
+        tld->region_idx = idx;    // remember the last found position
+        *region = r;
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+
+static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
+{
+  mi_assert_internal(blocks <= MI_BITMAP_FIELD_BITS);
+  mem_region_t* region;
+  mi_bitmap_index_t bit_idx;
+  const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld));
+  // try to claim in existing regions
+  if (!mi_region_try_claim(numa_node, blocks, *large, &region, &bit_idx, tld)) {
+    // otherwise try to allocate a fresh region and claim in there
+    if (!mi_region_try_alloc_os(blocks, *commit, *large, &region, &bit_idx, tld)) {
+      // out of regions or memory
+      return NULL;
+    }
+  }
+
+  // ------------------------------------------------
+  // found a region and claimed `blocks` at `bit_idx`, initialize them now
+  mi_assert_internal(region != NULL);
+  mi_assert_internal(_mi_bitmap_is_claimed(&region->in_use, 1, blocks, bit_idx));
+
+  mi_region_info_t info;
+  info.value = mi_atomic_load_acquire(&region->info);
+  uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,&region->start);
+  mi_assert_internal(!(info.x.is_large && !*large));
+  mi_assert_internal(start != NULL);
+
+  *is_zero   = _mi_bitmap_claim(&region->dirty, 1, blocks, bit_idx, NULL);
+  *large     = info.x.is_large;
+  *is_pinned = info.x.is_pinned;
+  *memid     = mi_memid_create(region, bit_idx);
+  void* p = start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE);
+
+  // commit
+  if (*commit) {
+    // ensure commit
+    bool any_uncommitted;
+    _mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, &any_uncommitted);
+    if (any_uncommitted) {
+      mi_assert_internal(!info.x.is_large && !info.x.is_pinned);
+      bool commit_zero = false;
+      if (!_mi_mem_commit(p, blocks * MI_SEGMENT_SIZE, &commit_zero, tld)) {
+        // failed to commit! unclaim and return
+        mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx);
+        return NULL;
+      }
+      if (commit_zero) *is_zero = true;
+    }
+  }
+  else {
+    // no need to commit, but check if already fully committed
+    *commit = _mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx);
+  }
+  mi_assert_internal(!*commit || _mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx));
+
+  // unreset reset blocks
+  if (_mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx)) {
+    // some blocks are still reset
+    mi_assert_internal(!info.x.is_large && !info.x.is_pinned);
+    mi_assert_internal(!mi_option_is_enabled(mi_option_eager_commit) || *commit || mi_option_get(mi_option_eager_commit_delay) > 0);
+    mi_bitmap_unclaim(&region->reset, 1, blocks, bit_idx);
+    if (*commit || !mi_option_is_enabled(mi_option_reset_decommits)) { // only if needed
+      bool reset_zero = false;
+      _mi_mem_unreset(p, blocks * MI_SEGMENT_SIZE, &reset_zero, tld);
+      if (reset_zero) *is_zero = true;
+    }
+  }
+  mi_assert_internal(!_mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx));
+
+  #if (MI_DEBUG>=2) && !MI_TRACK_ENABLED
+  if (*commit) { ((uint8_t*)p)[0] = 0; }
+  #endif
+
+  // and return the allocation
+  mi_assert_internal(p != NULL);
+  return p;
+}
+
+
+/* ----------------------------------------------------------------------------
+ Allocation
+-----------------------------------------------------------------------------*/
+
+// Allocate `size` memory aligned at `alignment`. Return non NULL on success, with a given memory `id`.
+// (`id` is abstract, but `id = idx*MI_REGION_MAP_BITS + bitidx`)
+void* _mi_mem_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
+{
+  mi_assert_internal(memid != NULL && tld != NULL);
+  mi_assert_internal(size > 0);
+  *memid = 0;
+  *is_zero = false;
+  *is_pinned = false;
+  bool default_large = false;
+  if (large==NULL) large = &default_large;  // ensure `large != NULL`
+  if (size == 0) return NULL;
+  size = _mi_align_up(size, _mi_os_page_size());
+
+  // allocate from regions if possible
+  void* p = NULL;
+  size_t arena_memid;
+  const size_t blocks = mi_region_block_count(size);
+  if (blocks <= MI_REGION_MAX_OBJ_BLOCKS && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) {
+    p = mi_region_try_alloc(blocks, commit, large, is_pinned, is_zero, memid, tld);
+    if (p == NULL) {
+      _mi_warning_message("unable to allocate from region: size %zu\n", size);
+    }
+  }
+  if (p == NULL) {
+    // and otherwise fall back to the OS
+    p = _mi_arena_alloc_aligned(size, alignment, align_offset, commit, large, is_pinned, is_zero, _mi_arena_id_none(),  & arena_memid, tld);
+    *memid = mi_memid_create_from_arena(arena_memid);
+  }
+
+  if (p != NULL) {
+    mi_assert_internal(((uintptr_t)p + align_offset) % alignment == 0);
+    #if (MI_DEBUG>=2) && !MI_TRACK_ENABLED
+    if (*commit) { ((uint8_t*)p)[0] = 0; } // ensure the memory is committed
+    #endif
+  }
+  return p;
+}
+
+
+
+/* ----------------------------------------------------------------------------
+Free
+-----------------------------------------------------------------------------*/
+
+// Free previously allocated memory with a given id.
+void _mi_mem_free(void* p, size_t size, size_t alignment, size_t align_offset, size_t id, bool full_commit, bool any_reset, mi_os_tld_t* tld) {
+  mi_assert_internal(size > 0 && tld != NULL);
+  if (p==NULL) return;
+  if (size==0) return;
+  size = _mi_align_up(size, _mi_os_page_size());
+
+  size_t arena_memid = 0;
+  mi_bitmap_index_t bit_idx;
+  mem_region_t* region;
+  if (mi_memid_is_arena(id,&region,&bit_idx,&arena_memid)) {
+   // was a direct arena allocation, pass through
+    _mi_arena_free(p, size, alignment, align_offset, arena_memid, full_commit, tld->stats);
+  }
+  else {
+    // allocated in a region
+    mi_assert_internal(align_offset == 0);
+    mi_assert_internal(size <= MI_REGION_MAX_OBJ_SIZE); if (size > MI_REGION_MAX_OBJ_SIZE) return;
+    const size_t blocks = mi_region_block_count(size);
+    mi_assert_internal(blocks + bit_idx <= MI_BITMAP_FIELD_BITS);
+    mi_region_info_t info;
+    info.value = mi_atomic_load_acquire(&region->info);
+    mi_assert_internal(info.value != 0);
+    void* blocks_start = mi_region_blocks_start(region, bit_idx);
+    mi_assert_internal(blocks_start == p); // not a pointer in our area?
+    mi_assert_internal(bit_idx + blocks <= MI_BITMAP_FIELD_BITS);
+    if (blocks_start != p || bit_idx + blocks > MI_BITMAP_FIELD_BITS) return; // or `abort`?
+
+    // committed?
+    if (full_commit && (size % MI_SEGMENT_SIZE) == 0) {
+      _mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, NULL);
+    }
+
+    if (any_reset) {
+      // set the is_reset bits if any pages were reset
+      _mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, NULL);
+    }
+
+    // reset the blocks to reduce the working set.
+    if (!info.x.is_large && !info.x.is_pinned && mi_option_is_enabled(mi_option_segment_reset)
+       && (mi_option_is_enabled(mi_option_eager_commit) ||
+           mi_option_is_enabled(mi_option_reset_decommits))) // cannot reset halfway committed segments, use only `option_page_reset` instead
+    {
+      bool any_unreset;
+      _mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, &any_unreset);
+      if (any_unreset) {
+        _mi_abandoned_await_readers(); // ensure no more pending write (in case reset = decommit)
+        _mi_mem_reset(p, blocks * MI_SEGMENT_SIZE, tld);
+      }
+    }
+
+    // and unclaim
+    bool all_unclaimed = mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx);
+    mi_assert_internal(all_unclaimed); MI_UNUSED(all_unclaimed);
+  }
+}
+
+
+/* ----------------------------------------------------------------------------
+  collection
+-----------------------------------------------------------------------------*/
+void _mi_mem_collect(mi_os_tld_t* tld) {
+  // free every region that has no segments in use.
+  size_t rcount = mi_atomic_load_relaxed(&regions_count);
+  for (size_t i = 0; i < rcount; i++) {
+    mem_region_t* region = &regions[i];
+    if (mi_atomic_load_relaxed(&region->info) != 0) {
+      // if no segments used, try to claim the whole region
+      size_t m = mi_atomic_load_relaxed(&region->in_use);
+      while (m == 0 && !mi_atomic_cas_weak_release(&region->in_use, &m, MI_BITMAP_FIELD_FULL)) { /* nothing */ };
+      if (m == 0) {
+        // on success, free the whole region
+        uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,&regions[i].start);
+        size_t arena_memid = mi_atomic_load_relaxed(&regions[i].arena_memid);
+        size_t commit = mi_atomic_load_relaxed(&regions[i].commit);
+        memset((void*)&regions[i], 0, sizeof(mem_region_t));  // cast to void* to avoid atomic warning
+        // and release the whole region
+        mi_atomic_store_release(&region->info, (size_t)0);
+        if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) {
+          _mi_abandoned_await_readers(); // ensure no pending reads
+          _mi_arena_free(start, MI_REGION_SIZE, MI_SEGMENT_ALIGN, 0, arena_memid, (~commit == 0), tld->stats);
+        }
+      }
+    }
+  }
+}
+
+
+/* ----------------------------------------------------------------------------
+  Other
+-----------------------------------------------------------------------------*/
+
+bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld) {
+  if (mi_option_is_enabled(mi_option_reset_decommits)) {
+    return _mi_os_decommit(p, size, tld->stats);
+  }
+  else {
+    return _mi_os_reset(p, size, tld->stats);
+  }
+}
+
+bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) {
+  if (mi_option_is_enabled(mi_option_reset_decommits)) {
+    return _mi_os_commit(p, size, is_zero, tld->stats);
+  }
+  else {
+    return _mi_os_unreset(p, size, is_zero, tld->stats);
+  }
+}
+
+bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) {
+  return _mi_os_commit(p, size, is_zero, tld->stats);
+}
+
+bool _mi_mem_decommit(void* p, size_t size, mi_os_tld_t* tld) {
+  return _mi_os_decommit(p, size, tld->stats);
+}
+
+bool _mi_mem_protect(void* p, size_t size) {
+  return _mi_os_protect(p, size);
+}
+
+bool _mi_mem_unprotect(void* p, size_t size) {
+  return _mi_os_unprotect(p, size);
+}
diff --git a/Objects/mimalloc/segment-cache.c b/Objects/mimalloc/segment-cache.c
new file mode 100644
index 00000000000000..d93fd644140ff2
--- /dev/null
+++ b/Objects/mimalloc/segment-cache.c
@@ -0,0 +1,409 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+  Implements a cache of segments to avoid expensive OS calls and to reuse
+  the commit_mask to optimize the commit/decommit calls.
+  The full memory map of all segments is also implemented here.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include "bitmap.h"  // atomic bitmap
+
+//#define MI_CACHE_DISABLE 1    // define to completely disable the segment cache
+
+#define MI_CACHE_FIELDS     (16)
+#define MI_CACHE_MAX        (MI_BITMAP_FIELD_BITS*MI_CACHE_FIELDS)       // 1024 on 64-bit
+
+#define BITS_SET()          MI_ATOMIC_VAR_INIT(UINTPTR_MAX)
+#define MI_CACHE_BITS_SET   MI_INIT16(BITS_SET)                          // note: update if MI_CACHE_FIELDS changes
+
+typedef struct mi_cache_slot_s {
+  void*               p;
+  size_t              memid;
+  bool                is_pinned;
+  mi_commit_mask_t    commit_mask;
+  mi_commit_mask_t    decommit_mask;
+  _Atomic(mi_msecs_t) expire;
+} mi_cache_slot_t;
+
+static mi_decl_cache_align mi_cache_slot_t cache[MI_CACHE_MAX];    // = 0
+
+static mi_decl_cache_align mi_bitmap_field_t cache_available[MI_CACHE_FIELDS] = { MI_CACHE_BITS_SET };        // zero bit = available!
+static mi_decl_cache_align mi_bitmap_field_t cache_available_large[MI_CACHE_FIELDS] = { MI_CACHE_BITS_SET };
+static mi_decl_cache_align mi_bitmap_field_t cache_inuse[MI_CACHE_FIELDS];   // zero bit = free
+
+static bool mi_cdecl mi_segment_cache_is_suitable(mi_bitmap_index_t bitidx, void* arg) {
+  mi_arena_id_t req_arena_id = *((mi_arena_id_t*)arg);
+  mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
+  return _mi_arena_memid_is_suitable(slot->memid, req_arena_id);
+}
+
+mi_decl_noinline static void* mi_segment_cache_pop_ex(
+                              bool all_suitable,
+                              size_t size, mi_commit_mask_t* commit_mask, 
+                              mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, 
+                              mi_arena_id_t _req_arena_id, size_t* memid, mi_os_tld_t* tld)
+{
+#ifdef MI_CACHE_DISABLE
+  return NULL;
+#else
+
+  // only segment blocks
+  if (size != MI_SEGMENT_SIZE) return NULL;
+
+  // numa node determines start field
+  const int numa_node = _mi_os_numa_node(tld);
+  size_t start_field = 0;
+  if (numa_node > 0) {
+    start_field = (MI_CACHE_FIELDS / _mi_os_numa_node_count())*numa_node;
+    if (start_field >= MI_CACHE_FIELDS) start_field = 0;
+  }
+
+  // find an available slot
+  mi_bitmap_index_t bitidx = 0;
+  bool claimed = false;
+  mi_arena_id_t req_arena_id = _req_arena_id;
+  mi_bitmap_pred_fun_t pred_fun = (all_suitable ? NULL : &mi_segment_cache_is_suitable);  // cannot pass NULL as the arena may be exclusive itself; todo: do not put exclusive arenas in the cache?
+
+  if (*large) {  // large allowed?
+    claimed = _mi_bitmap_try_find_from_claim_pred(cache_available_large, MI_CACHE_FIELDS, start_field, 1, pred_fun, &req_arena_id, &bitidx);
+    if (claimed) *large = true;
+  }
+  if (!claimed) {
+    claimed = _mi_bitmap_try_find_from_claim_pred (cache_available, MI_CACHE_FIELDS, start_field, 1, pred_fun, &req_arena_id, &bitidx);
+    if (claimed) *large = false;
+  }
+
+  if (!claimed) return NULL;
+
+  // found a slot
+  mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
+  void* p = slot->p;
+  *memid = slot->memid;
+  *is_pinned = slot->is_pinned;
+  *is_zero = false;
+  *commit_mask = slot->commit_mask;     
+  *decommit_mask = slot->decommit_mask;
+  slot->p = NULL;
+  mi_atomic_storei64_release(&slot->expire,(mi_msecs_t)0);
+  
+  // mark the slot as free again
+  mi_assert_internal(_mi_bitmap_is_claimed(cache_inuse, MI_CACHE_FIELDS, 1, bitidx));
+  _mi_bitmap_unclaim(cache_inuse, MI_CACHE_FIELDS, 1, bitidx);
+  return p;
+#endif
+}
+
+
+mi_decl_noinline void* _mi_segment_cache_pop(size_t size, mi_commit_mask_t* commit_mask, mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t _req_arena_id, size_t* memid, mi_os_tld_t* tld)
+{
+  return mi_segment_cache_pop_ex(false, size, commit_mask, decommit_mask, large, is_pinned, is_zero, _req_arena_id, memid, tld);
+}
+
+static mi_decl_noinline void mi_commit_mask_decommit(mi_commit_mask_t* cmask, void* p, size_t total, mi_stats_t* stats)
+{
+  if (mi_commit_mask_is_empty(cmask)) {
+    // nothing
+  }
+  else if (mi_commit_mask_is_full(cmask)) {
+    _mi_os_decommit(p, total, stats);
+  }
+  else {
+    // todo: one call to decommit the whole at once?
+    mi_assert_internal((total%MI_COMMIT_MASK_BITS)==0);
+    size_t part = total/MI_COMMIT_MASK_BITS;
+    size_t idx;
+    size_t count;    
+    mi_commit_mask_foreach(cmask, idx, count) {
+      void*  start = (uint8_t*)p + (idx*part);
+      size_t size = count*part;
+      _mi_os_decommit(start, size, stats);
+    }
+    mi_commit_mask_foreach_end()
+  }
+  mi_commit_mask_create_empty(cmask);
+}
+
+#define MI_MAX_PURGE_PER_PUSH  (4)
+
+static mi_decl_noinline void mi_segment_cache_purge(bool visit_all, bool force, mi_os_tld_t* tld)
+{
+  MI_UNUSED(tld);
+  if (!mi_option_is_enabled(mi_option_allow_decommit)) return;
+  mi_msecs_t now = _mi_clock_now();
+  size_t purged = 0;
+  const size_t max_visits = (visit_all ? MI_CACHE_MAX /* visit all */ : MI_CACHE_FIELDS /* probe at most N (=16) slots */);
+  size_t idx              = (visit_all ? 0 : _mi_random_shuffle((uintptr_t)now) % MI_CACHE_MAX /* random start */ );
+  for (size_t visited = 0; visited < max_visits; visited++,idx++) {  // visit N slots
+    if (idx >= MI_CACHE_MAX) idx = 0; // wrap
+    mi_cache_slot_t* slot = &cache[idx];
+    mi_msecs_t expire = mi_atomic_loadi64_relaxed(&slot->expire);
+    if (expire != 0 && (force || now >= expire)) {  // racy read
+      // seems expired, first claim it from available
+      purged++;
+      mi_bitmap_index_t bitidx = mi_bitmap_index_create_from_bit(idx);
+      if (_mi_bitmap_claim(cache_available, MI_CACHE_FIELDS, 1, bitidx, NULL)) {
+        // was available, we claimed it
+        expire = mi_atomic_loadi64_acquire(&slot->expire);
+        if (expire != 0 && (force || now >= expire)) {  // safe read
+          // still expired, decommit it
+          mi_atomic_storei64_relaxed(&slot->expire,(mi_msecs_t)0);
+          mi_assert_internal(!mi_commit_mask_is_empty(&slot->commit_mask) && _mi_bitmap_is_claimed(cache_available_large, MI_CACHE_FIELDS, 1, bitidx));
+          _mi_abandoned_await_readers();  // wait until safe to decommit
+          // decommit committed parts
+          // TODO: instead of decommit, we could also free to the OS?
+          mi_commit_mask_decommit(&slot->commit_mask, slot->p, MI_SEGMENT_SIZE, tld->stats);
+          mi_commit_mask_create_empty(&slot->decommit_mask);
+        }
+        _mi_bitmap_unclaim(cache_available, MI_CACHE_FIELDS, 1, bitidx); // make it available again for a pop
+      }
+      if (!visit_all && purged > MI_MAX_PURGE_PER_PUSH) break;  // bound to no more than N purge tries per push
+    }
+  }
+}
+
+void _mi_segment_cache_collect(bool force, mi_os_tld_t* tld) {
+  if (force) {
+    // called on `mi_collect(true)` but not on thread termination    
+    _mi_segment_cache_free_all(tld);
+  }
+  else {
+    mi_segment_cache_purge(true /* visit all */, false /* don't force unexpired */, tld);
+  }
+}
+
+void _mi_segment_cache_free_all(mi_os_tld_t* tld) {
+  mi_commit_mask_t commit_mask;
+  mi_commit_mask_t decommit_mask;
+  bool is_pinned;
+  bool is_zero;
+  size_t memid;
+  const size_t size = MI_SEGMENT_SIZE;
+  // iterate twice: first large pages, then regular memory 
+  for (int i = 0; i < 2; i++) {
+    void* p;
+    do {
+      // keep popping and freeing the memory
+      bool large = (i == 0);  
+      p = mi_segment_cache_pop_ex(true /* all */, size, &commit_mask, &decommit_mask,
+                                  &large, &is_pinned, &is_zero, _mi_arena_id_none(), &memid, tld);
+      if (p != NULL) {
+        size_t csize = _mi_commit_mask_committed_size(&commit_mask, size);
+        if (csize > 0 && !is_pinned) _mi_stat_decrease(&_mi_stats_main.committed, csize);
+        _mi_arena_free(p, size, MI_SEGMENT_ALIGN, 0, memid, is_pinned /* pretend not committed to not double count decommits */, tld->stats);
+      }
+    } while (p != NULL);
+  }
+}
+
+mi_decl_noinline bool _mi_segment_cache_push(void* start, size_t size, size_t memid, const mi_commit_mask_t* commit_mask, const mi_commit_mask_t* decommit_mask, bool is_large, bool is_pinned, mi_os_tld_t* tld)
+{
+#ifdef MI_CACHE_DISABLE
+  return false;
+#else
+
+  // only for normal segment blocks
+  if (size != MI_SEGMENT_SIZE || ((uintptr_t)start % MI_SEGMENT_ALIGN) != 0) return false;
+
+  // numa node determines start field
+  int numa_node = _mi_os_numa_node(NULL);
+  size_t start_field = 0;
+  if (numa_node > 0) {
+    start_field = (MI_CACHE_FIELDS / _mi_os_numa_node_count())*numa_node;
+    if (start_field >= MI_CACHE_FIELDS) start_field = 0;
+  }
+
+  // purge expired entries
+  mi_segment_cache_purge(false /* limit purges to a constant N */, false /* don't force unexpired */, tld);
+
+  // find an available slot
+  mi_bitmap_index_t bitidx;
+  bool claimed = _mi_bitmap_try_find_from_claim(cache_inuse, MI_CACHE_FIELDS, start_field, 1, &bitidx);
+  if (!claimed) return false;
+
+  mi_assert_internal(_mi_bitmap_is_claimed(cache_available, MI_CACHE_FIELDS, 1, bitidx));
+  mi_assert_internal(_mi_bitmap_is_claimed(cache_available_large, MI_CACHE_FIELDS, 1, bitidx));
+#if MI_DEBUG>1
+  if (is_pinned || is_large) {
+    mi_assert_internal(mi_commit_mask_is_full(commit_mask));
+  }
+#endif
+
+  // set the slot
+  mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
+  slot->p = start;
+  slot->memid = memid;
+  slot->is_pinned = is_pinned;
+  mi_atomic_storei64_relaxed(&slot->expire,(mi_msecs_t)0);
+  slot->commit_mask = *commit_mask;
+  slot->decommit_mask = *decommit_mask;
+  if (!mi_commit_mask_is_empty(commit_mask) && !is_large && !is_pinned && mi_option_is_enabled(mi_option_allow_decommit)) {
+    long delay = mi_option_get(mi_option_segment_decommit_delay);
+    if (delay == 0) {
+      _mi_abandoned_await_readers(); // wait until safe to decommit
+      mi_commit_mask_decommit(&slot->commit_mask, start, MI_SEGMENT_SIZE, tld->stats);
+      mi_commit_mask_create_empty(&slot->decommit_mask);
+    }
+    else {
+      mi_atomic_storei64_release(&slot->expire, _mi_clock_now() + delay);
+    }
+  }
+
+  // make it available
+  _mi_bitmap_unclaim((is_large ? cache_available_large : cache_available), MI_CACHE_FIELDS, 1, bitidx);
+  return true;
+#endif
+}
+
+
+/* -----------------------------------------------------------
+  The following functions are to reliably find the segment or
+  block that encompasses any pointer p (or NULL if it is not
+  in any of our segments).
+  We maintain a bitmap of all memory with 1 bit per MI_SEGMENT_SIZE (64MiB)
+  set to 1 if it contains the segment meta data.
+----------------------------------------------------------- */
+
+
+#if (MI_INTPTR_SIZE==8)
+#define MI_MAX_ADDRESS    ((size_t)20 << 40)  // 20TB
+#else
+#define MI_MAX_ADDRESS    ((size_t)2 << 30)   // 2Gb
+#endif
+
+#define MI_SEGMENT_MAP_BITS  (MI_MAX_ADDRESS / MI_SEGMENT_SIZE)
+#define MI_SEGMENT_MAP_SIZE  (MI_SEGMENT_MAP_BITS / 8)
+#define MI_SEGMENT_MAP_WSIZE (MI_SEGMENT_MAP_SIZE / MI_INTPTR_SIZE)
+
+static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1];  // 2KiB per TB with 64MiB segments
+
+static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) {
+  mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE?
+  if ((uintptr_t)segment >= MI_MAX_ADDRESS) {
+    *bitidx = 0;
+    return MI_SEGMENT_MAP_WSIZE;
+  }
+  else {
+    const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_SIZE;
+    *bitidx = segindex % MI_INTPTR_BITS;
+    const size_t mapindex = segindex / MI_INTPTR_BITS;
+    mi_assert_internal(mapindex < MI_SEGMENT_MAP_WSIZE);
+    return mapindex;
+  }
+}
+
+void _mi_segment_map_allocated_at(const mi_segment_t* segment) {
+  size_t bitidx;
+  size_t index = mi_segment_map_index_of(segment, &bitidx);
+  mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE);
+  if (index==MI_SEGMENT_MAP_WSIZE) return;
+  uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
+  uintptr_t newmask;
+  do {
+    newmask = (mask | ((uintptr_t)1 << bitidx));
+  } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask));
+}
+
+void _mi_segment_map_freed_at(const mi_segment_t* segment) {
+  size_t bitidx;
+  size_t index = mi_segment_map_index_of(segment, &bitidx);
+  mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE);
+  if (index == MI_SEGMENT_MAP_WSIZE) return;
+  uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
+  uintptr_t newmask;
+  do {
+    newmask = (mask & ~((uintptr_t)1 << bitidx));
+  } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask));
+}
+
+// Determine the segment belonging to a pointer or NULL if it is not in a valid segment.
+static mi_segment_t* _mi_segment_of(const void* p) {
+  if (p == NULL) return NULL;
+  mi_segment_t* segment = _mi_ptr_segment(p);
+  mi_assert_internal(segment != NULL);
+  size_t bitidx;
+  size_t index = mi_segment_map_index_of(segment, &bitidx);
+  // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge
+  const uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
+  if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) {
+    return segment; // yes, allocated by us
+  }
+  if (index==MI_SEGMENT_MAP_WSIZE) return NULL;
+
+  // TODO: maintain max/min allocated range for efficiency for more efficient rejection of invalid pointers?
+
+  // search downwards for the first segment in case it is an interior pointer
+  // could be slow but searches in MI_INTPTR_SIZE * MI_SEGMENT_SIZE (512MiB) steps trough
+  // valid huge objects
+  // note: we could maintain a lowest index to speed up the path for invalid pointers?
+  size_t lobitidx;
+  size_t loindex;
+  uintptr_t lobits = mask & (((uintptr_t)1 << bitidx) - 1);
+  if (lobits != 0) {
+    loindex = index;
+    lobitidx = mi_bsr(lobits);    // lobits != 0
+  }
+  else if (index == 0) {
+    return NULL;
+  }
+  else {
+    mi_assert_internal(index > 0);
+    uintptr_t lomask = mask;
+    loindex = index;
+    do {
+      loindex--;  
+      lomask = mi_atomic_load_relaxed(&mi_segment_map[loindex]);      
+    } while (lomask != 0 && loindex > 0);
+    if (lomask == 0) return NULL;
+    lobitidx = mi_bsr(lomask);    // lomask != 0
+  }
+  mi_assert_internal(loindex < MI_SEGMENT_MAP_WSIZE);
+  // take difference as the addresses could be larger than the MAX_ADDRESS space.
+  size_t diff = (((index - loindex) * (8*MI_INTPTR_SIZE)) + bitidx - lobitidx) * MI_SEGMENT_SIZE;
+  segment = (mi_segment_t*)((uint8_t*)segment - diff);
+
+  if (segment == NULL) return NULL;
+  mi_assert_internal((void*)segment < p);
+  bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie);
+  mi_assert_internal(cookie_ok);
+  if mi_unlikely(!cookie_ok) return NULL;
+  if (((uint8_t*)segment + mi_segment_size(segment)) <= (uint8_t*)p) return NULL; // outside the range
+  mi_assert_internal(p >= (void*)segment && (uint8_t*)p < (uint8_t*)segment + mi_segment_size(segment));
+  return segment;
+}
+
+// Is this a valid pointer in our heap?
+static bool  mi_is_valid_pointer(const void* p) {
+  return (_mi_segment_of(p) != NULL);
+}
+
+mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
+  return mi_is_valid_pointer(p);
+}
+
+/*
+// Return the full segment range belonging to a pointer
+static void* mi_segment_range_of(const void* p, size_t* size) {
+  mi_segment_t* segment = _mi_segment_of(p);
+  if (segment == NULL) {
+    if (size != NULL) *size = 0;
+    return NULL;
+  }
+  else {
+    if (size != NULL) *size = segment->segment_size;
+    return segment;
+  }
+  mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld));
+  mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 0 : _mi_os_page_size())) >= block_size);
+  mi_reset_delayed(tld);
+  mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld));
+  return page;
+}
+*/
diff --git a/Objects/mimalloc/segment.c b/Objects/mimalloc/segment.c
new file mode 100644
index 00000000000000..dc98e3e7b945c4
--- /dev/null
+++ b/Objects/mimalloc/segment.c
@@ -0,0 +1,1623 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h>  // memset
+#include <stdio.h>
+
+#define MI_PAGE_HUGE_ALIGN  (256*1024)
+
+static void mi_segment_delayed_decommit(mi_segment_t* segment, bool force, mi_stats_t* stats);
+
+
+// -------------------------------------------------------------------
+// commit mask 
+// -------------------------------------------------------------------
+
+static bool mi_commit_mask_all_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    if ((commit->mask[i] & cm->mask[i]) != cm->mask[i]) return false;
+  }
+  return true;
+}
+
+static bool mi_commit_mask_any_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    if ((commit->mask[i] & cm->mask[i]) != 0) return true;
+  }
+  return false;
+}
+
+static void mi_commit_mask_create_intersect(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm, mi_commit_mask_t* res) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    res->mask[i] = (commit->mask[i] & cm->mask[i]);
+  }
+}
+
+static void mi_commit_mask_clear(mi_commit_mask_t* res, const mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    res->mask[i] &= ~(cm->mask[i]);
+  }
+}
+
+static void mi_commit_mask_set(mi_commit_mask_t* res, const mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    res->mask[i] |= cm->mask[i];
+  }
+}
+
+static void mi_commit_mask_create(size_t bitidx, size_t bitcount, mi_commit_mask_t* cm) {
+  mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS);
+  mi_assert_internal((bitidx + bitcount) <= MI_COMMIT_MASK_BITS);
+  if (bitcount == MI_COMMIT_MASK_BITS) {
+    mi_assert_internal(bitidx==0);
+    mi_commit_mask_create_full(cm);
+  }
+  else if (bitcount == 0) {
+    mi_commit_mask_create_empty(cm);
+  }
+  else {
+    mi_commit_mask_create_empty(cm);
+    size_t i = bitidx / MI_COMMIT_MASK_FIELD_BITS;
+    size_t ofs = bitidx % MI_COMMIT_MASK_FIELD_BITS;
+    while (bitcount > 0) {
+      mi_assert_internal(i < MI_COMMIT_MASK_FIELD_COUNT);
+      size_t avail = MI_COMMIT_MASK_FIELD_BITS - ofs;
+      size_t count = (bitcount > avail ? avail : bitcount);
+      size_t mask = (count >= MI_COMMIT_MASK_FIELD_BITS ? ~((size_t)0) : (((size_t)1 << count) - 1) << ofs);
+      cm->mask[i] = mask;
+      bitcount -= count;
+      ofs = 0;
+      i++;
+    }
+  }
+}
+
+size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total) {
+  mi_assert_internal((total%MI_COMMIT_MASK_BITS)==0);
+  size_t count = 0;
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    size_t mask = cm->mask[i];
+    if (~mask == 0) {
+      count += MI_COMMIT_MASK_FIELD_BITS;
+    }
+    else {
+      for (; mask != 0; mask >>= 1) {  // todo: use popcount
+        if ((mask&1)!=0) count++;
+      }
+    }
+  }
+  // we use total since for huge segments each commit bit may represent a larger size
+  return ((total / MI_COMMIT_MASK_BITS) * count);
+}
+
+
+size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx) {
+  size_t i = (*idx) / MI_COMMIT_MASK_FIELD_BITS;
+  size_t ofs = (*idx) % MI_COMMIT_MASK_FIELD_BITS;
+  size_t mask = 0;
+  // find first ones
+  while (i < MI_COMMIT_MASK_FIELD_COUNT) {
+    mask = cm->mask[i];
+    mask >>= ofs;
+    if (mask != 0) {
+      while ((mask&1) == 0) {
+        mask >>= 1;
+        ofs++;
+      }
+      break;
+    }
+    i++;
+    ofs = 0;
+  }
+  if (i >= MI_COMMIT_MASK_FIELD_COUNT) {
+    // not found
+    *idx = MI_COMMIT_MASK_BITS;
+    return 0;
+  }
+  else {
+    // found, count ones
+    size_t count = 0;
+    *idx = (i*MI_COMMIT_MASK_FIELD_BITS) + ofs;
+    do {
+      mi_assert_internal(ofs < MI_COMMIT_MASK_FIELD_BITS && (mask&1) == 1);
+      do {
+        count++;
+        mask >>= 1;
+      } while ((mask&1) == 1);
+      if ((((*idx + count) % MI_COMMIT_MASK_FIELD_BITS) == 0)) {
+        i++;
+        if (i >= MI_COMMIT_MASK_FIELD_COUNT) break;
+        mask = cm->mask[i];
+        ofs = 0;
+      }
+    } while ((mask&1) == 1);
+    mi_assert_internal(count > 0);
+    return count;
+  }
+}
+
+
+/* --------------------------------------------------------------------------------
+  Segment allocation
+
+  If a  thread ends, it "abandons" pages with used blocks
+  and there is an abandoned segment list whose segments can
+  be reclaimed by still running threads, much like work-stealing.
+-------------------------------------------------------------------------------- */
+
+
+/* -----------------------------------------------------------
+   Slices
+----------------------------------------------------------- */
+
+
+static const mi_slice_t* mi_segment_slices_end(const mi_segment_t* segment) {
+  return &segment->slices[segment->slice_entries];
+}
+
+static uint8_t* mi_slice_start(const mi_slice_t* slice) {
+  mi_segment_t* segment = _mi_ptr_segment(slice);
+  mi_assert_internal(slice >= segment->slices && slice < mi_segment_slices_end(segment));
+  return ((uint8_t*)segment + ((slice - segment->slices)*MI_SEGMENT_SLICE_SIZE));
+}
+
+
+/* -----------------------------------------------------------
+   Bins
+----------------------------------------------------------- */
+// Use bit scan forward to quickly find the first zero bit if it is available
+
+static inline size_t mi_slice_bin8(size_t slice_count) {
+  if (slice_count<=1) return slice_count;
+  mi_assert_internal(slice_count <= MI_SLICES_PER_SEGMENT);
+  slice_count--;
+  size_t s = mi_bsr(slice_count);  // slice_count > 1
+  if (s <= 2) return slice_count + 1;
+  size_t bin = ((s << 2) | ((slice_count >> (s - 2))&0x03)) - 4;
+  return bin;
+}
+
+static inline size_t mi_slice_bin(size_t slice_count) {
+  mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_SEGMENT_SIZE);
+  mi_assert_internal(mi_slice_bin8(MI_SLICES_PER_SEGMENT) <= MI_SEGMENT_BIN_MAX);
+  size_t bin = mi_slice_bin8(slice_count);
+  mi_assert_internal(bin <= MI_SEGMENT_BIN_MAX);
+  return bin;
+}
+
+static inline size_t mi_slice_index(const mi_slice_t* slice) {
+  mi_segment_t* segment = _mi_ptr_segment(slice);
+  ptrdiff_t index = slice - segment->slices;
+  mi_assert_internal(index >= 0 && index < (ptrdiff_t)segment->slice_entries);
+  return index;
+}
+
+
+/* -----------------------------------------------------------
+   Slice span queues
+----------------------------------------------------------- */
+
+static void mi_span_queue_push(mi_span_queue_t* sq, mi_slice_t* slice) {
+  // todo: or push to the end?
+  mi_assert_internal(slice->prev == NULL && slice->next==NULL);
+  slice->prev = NULL; // paranoia
+  slice->next = sq->first;
+  sq->first = slice;
+  if (slice->next != NULL) slice->next->prev = slice;
+                     else sq->last = slice;
+  slice->xblock_size = 0; // free
+}
+
+static mi_span_queue_t* mi_span_queue_for(size_t slice_count, mi_segments_tld_t* tld) {
+  size_t bin = mi_slice_bin(slice_count);
+  mi_span_queue_t* sq = &tld->spans[bin];
+  mi_assert_internal(sq->slice_count >= slice_count);
+  return sq;
+}
+
+static void mi_span_queue_delete(mi_span_queue_t* sq, mi_slice_t* slice) {
+  mi_assert_internal(slice->xblock_size==0 && slice->slice_count>0 && slice->slice_offset==0);
+  // should work too if the queue does not contain slice (which can happen during reclaim)
+  if (slice->prev != NULL) slice->prev->next = slice->next;
+  if (slice == sq->first) sq->first = slice->next;
+  if (slice->next != NULL) slice->next->prev = slice->prev;
+  if (slice == sq->last) sq->last = slice->prev;
+  slice->prev = NULL;
+  slice->next = NULL;
+  slice->xblock_size = 1; // no more free
+}
+
+
+/* -----------------------------------------------------------
+ Invariant checking
+----------------------------------------------------------- */
+
+static bool mi_slice_is_used(const mi_slice_t* slice) {
+  return (slice->xblock_size > 0);
+}
+
+
+#if (MI_DEBUG>=3)
+static bool mi_span_queue_contains(mi_span_queue_t* sq, mi_slice_t* slice) {
+  for (mi_slice_t* s = sq->first; s != NULL; s = s->next) {
+    if (s==slice) return true;
+  }
+  return false;
+}
+
+static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) {
+  mi_assert_internal(segment != NULL);
+  mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
+  mi_assert_internal(segment->abandoned <= segment->used);
+  mi_assert_internal(segment->thread_id == 0 || segment->thread_id == _mi_thread_id());
+  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask)); // can only decommit committed blocks
+  //mi_assert_internal(segment->segment_info_size % MI_SEGMENT_SLICE_SIZE == 0);
+  mi_slice_t* slice = &segment->slices[0];
+  const mi_slice_t* end = mi_segment_slices_end(segment);
+  size_t used_count = 0;
+  mi_span_queue_t* sq;
+  while(slice < end) {
+    mi_assert_internal(slice->slice_count > 0);
+    mi_assert_internal(slice->slice_offset == 0);
+    size_t index = mi_slice_index(slice);
+    size_t maxindex = (index + slice->slice_count >= segment->slice_entries ? segment->slice_entries : index + slice->slice_count) - 1;
+    if (mi_slice_is_used(slice)) { // a page in use, we need at least MAX_SLICE_OFFSET valid back offsets
+      used_count++;
+      for (size_t i = 0; i <= MI_MAX_SLICE_OFFSET && index + i <= maxindex; i++) {
+        mi_assert_internal(segment->slices[index + i].slice_offset == i*sizeof(mi_slice_t));
+        mi_assert_internal(i==0 || segment->slices[index + i].slice_count == 0);
+        mi_assert_internal(i==0 || segment->slices[index + i].xblock_size == 1);
+      }
+      // and the last entry as well (for coalescing)
+      const mi_slice_t* last = slice + slice->slice_count - 1;
+      if (last > slice && last < mi_segment_slices_end(segment)) {
+        mi_assert_internal(last->slice_offset == (slice->slice_count-1)*sizeof(mi_slice_t));
+        mi_assert_internal(last->slice_count == 0);
+        mi_assert_internal(last->xblock_size == 1);
+      }
+    }
+    else {  // free range of slices; only last slice needs a valid back offset
+      mi_slice_t* last = &segment->slices[maxindex];
+      if (segment->kind != MI_SEGMENT_HUGE || slice->slice_count <= (segment->slice_entries - segment->segment_info_slices)) {
+        mi_assert_internal((uint8_t*)slice == (uint8_t*)last - last->slice_offset);
+      }
+      mi_assert_internal(slice == last || last->slice_count == 0 );
+      mi_assert_internal(last->xblock_size == 0 || (segment->kind==MI_SEGMENT_HUGE && last->xblock_size==1));
+      if (segment->kind != MI_SEGMENT_HUGE && segment->thread_id != 0) { // segment is not huge or abandoned
+        sq = mi_span_queue_for(slice->slice_count,tld);
+        mi_assert_internal(mi_span_queue_contains(sq,slice));
+      }
+    }
+    slice = &segment->slices[maxindex+1];
+  }
+  mi_assert_internal(slice == end);
+  mi_assert_internal(used_count == segment->used + 1);
+  return true;
+}
+#endif
+
+/* -----------------------------------------------------------
+ Segment size calculations
+----------------------------------------------------------- */
+
+static size_t mi_segment_info_size(mi_segment_t* segment) {
+  return segment->segment_info_slices * MI_SEGMENT_SLICE_SIZE;
+}
+
+static uint8_t* _mi_segment_page_start_from_slice(const mi_segment_t* segment, const mi_slice_t* slice, size_t xblock_size, size_t* page_size)
+{
+  ptrdiff_t idx = slice - segment->slices;
+  size_t psize = (size_t)slice->slice_count * MI_SEGMENT_SLICE_SIZE;
+  // make the start not OS page aligned for smaller blocks to avoid page/cache effects
+  size_t start_offset = (xblock_size >= MI_INTPTR_SIZE && xblock_size <= 1024 ? 3*MI_MAX_ALIGN_GUARANTEE : 0); 
+  if (page_size != NULL) { *page_size = psize - start_offset; }
+  return (uint8_t*)segment + ((idx*MI_SEGMENT_SLICE_SIZE) + start_offset);
+}
+
+// Start of the page available memory; can be used on uninitialized pages
+uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size)
+{
+  const mi_slice_t* slice = mi_page_to_slice((mi_page_t*)page);
+  uint8_t* p = _mi_segment_page_start_from_slice(segment, slice, page->xblock_size, page_size);  
+  mi_assert_internal(page->xblock_size > 0 || _mi_ptr_page(p) == page);
+  mi_assert_internal(_mi_ptr_segment(p) == segment);
+  return p;
+}
+
+
+static size_t mi_segment_calculate_slices(size_t required, size_t* pre_size, size_t* info_slices) {
+  size_t page_size = _mi_os_page_size();
+  size_t isize     = _mi_align_up(sizeof(mi_segment_t), page_size);
+  size_t guardsize = 0;
+  
+  if (MI_SECURE>0) {
+    // in secure mode, we set up a protected page in between the segment info
+    // and the page data (and one at the end of the segment)
+    guardsize = page_size;
+    if (required > 0) {
+      required = _mi_align_up(required, MI_SEGMENT_SLICE_SIZE) + page_size;
+    }
+  }
+
+  if (pre_size != NULL) *pre_size = isize;
+  isize = _mi_align_up(isize + guardsize, MI_SEGMENT_SLICE_SIZE);
+  if (info_slices != NULL) *info_slices = isize / MI_SEGMENT_SLICE_SIZE;
+  size_t segment_size = (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + guardsize, MI_SEGMENT_SLICE_SIZE) );  
+  mi_assert_internal(segment_size % MI_SEGMENT_SLICE_SIZE == 0);
+  return (segment_size / MI_SEGMENT_SLICE_SIZE);
+}
+
+
+/* ----------------------------------------------------------------------------
+Segment caches
+We keep a small segment cache per thread to increase local
+reuse and avoid setting/clearing guard pages in secure mode.
+------------------------------------------------------------------------------- */
+
+static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) {
+  if (segment_size>=0) _mi_stat_increase(&tld->stats->segments,1);
+                  else _mi_stat_decrease(&tld->stats->segments,1);
+  tld->count += (segment_size >= 0 ? 1 : -1);
+  if (tld->count > tld->peak_count) tld->peak_count = tld->count;
+  tld->current_size += segment_size;
+  if (tld->current_size > tld->peak_size) tld->peak_size = tld->current_size;
+}
+
+static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) {
+  segment->thread_id = 0;
+  _mi_segment_map_freed_at(segment);
+  mi_segments_track_size(-((long)mi_segment_size(segment)),tld);
+  if (MI_SECURE>0) {
+    // _mi_os_unprotect(segment, mi_segment_size(segment)); // ensure no more guard pages are set
+    // unprotect the guard pages; we cannot just unprotect the whole segment size as part may be decommitted
+    size_t os_pagesize = _mi_os_page_size();
+    _mi_os_unprotect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize);
+    uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize;
+    _mi_os_unprotect(end, os_pagesize);
+  }
+
+  // purge delayed decommits now? (no, leave it to the cache)
+  // mi_segment_delayed_decommit(segment,true,tld->stats);
+  
+  // _mi_os_free(segment, mi_segment_size(segment), /*segment->memid,*/ tld->stats);
+  const size_t size = mi_segment_size(segment);
+  if (size != MI_SEGMENT_SIZE || segment->mem_align_offset != 0 || segment->kind == MI_SEGMENT_HUGE || // only push regular segments on the cache
+       !_mi_segment_cache_push(segment, size, segment->memid, &segment->commit_mask, &segment->decommit_mask, segment->mem_is_large, segment->mem_is_pinned, tld->os)) 
+  {
+    const size_t csize = _mi_commit_mask_committed_size(&segment->commit_mask, size);
+    if (csize > 0 && !segment->mem_is_pinned) _mi_stat_decrease(&_mi_stats_main.committed, csize);
+    _mi_abandoned_await_readers();  // wait until safe to free
+    _mi_arena_free(segment, mi_segment_size(segment), segment->mem_alignment, segment->mem_align_offset, segment->memid, segment->mem_is_pinned /* pretend not committed to not double count decommits */, tld->stats);
+  }
+}
+
+// called by threads that are terminating 
+void _mi_segment_thread_collect(mi_segments_tld_t* tld) {
+  MI_UNUSED(tld);
+  // nothing to do
+}
+
+
+/* -----------------------------------------------------------
+   Commit/Decommit ranges
+----------------------------------------------------------- */
+
+static void mi_segment_commit_mask(mi_segment_t* segment, bool conservative, uint8_t* p, size_t size, uint8_t** start_p, size_t* full_size, mi_commit_mask_t* cm) {
+  mi_assert_internal(_mi_ptr_segment(p + 1) == segment);
+  mi_assert_internal(segment->kind != MI_SEGMENT_HUGE);
+  mi_commit_mask_create_empty(cm);
+  if (size == 0 || size > MI_SEGMENT_SIZE || segment->kind == MI_SEGMENT_HUGE) return;
+  const size_t segstart = mi_segment_info_size(segment);
+  const size_t segsize = mi_segment_size(segment);
+  if (p >= (uint8_t*)segment + segsize) return;
+
+  size_t pstart = (p - (uint8_t*)segment);
+  mi_assert_internal(pstart + size <= segsize);
+
+  size_t start;
+  size_t end;
+  if (conservative) {
+    // decommit conservative
+    start = _mi_align_up(pstart, MI_COMMIT_SIZE);
+    end   = _mi_align_down(pstart + size, MI_COMMIT_SIZE);
+    mi_assert_internal(start >= segstart);
+    mi_assert_internal(end <= segsize);
+  }
+  else {
+    // commit liberal
+    start = _mi_align_down(pstart, MI_MINIMAL_COMMIT_SIZE);
+    end   = _mi_align_up(pstart + size, MI_MINIMAL_COMMIT_SIZE);
+  }
+  if (pstart >= segstart && start < segstart) {  // note: the mask is also calculated for an initial commit of the info area
+    start = segstart;
+  }
+  if (end > segsize) {
+    end = segsize;
+  }
+
+  mi_assert_internal(start <= pstart && (pstart + size) <= end);
+  mi_assert_internal(start % MI_COMMIT_SIZE==0 && end % MI_COMMIT_SIZE == 0);
+  *start_p   = (uint8_t*)segment + start;
+  *full_size = (end > start ? end - start : 0);
+  if (*full_size == 0) return;
+
+  size_t bitidx = start / MI_COMMIT_SIZE;
+  mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS);
+  
+  size_t bitcount = *full_size / MI_COMMIT_SIZE; // can be 0
+  if (bitidx + bitcount > MI_COMMIT_MASK_BITS) {
+    _mi_warning_message("commit mask overflow: idx=%zu count=%zu start=%zx end=%zx p=0x%p size=%zu fullsize=%zu\n", bitidx, bitcount, start, end, p, size, *full_size);
+  }
+  mi_assert_internal((bitidx + bitcount) <= MI_COMMIT_MASK_BITS);
+  mi_commit_mask_create(bitidx, bitcount, cm);
+}
+
+
+static bool mi_segment_commitx(mi_segment_t* segment, bool commit, uint8_t* p, size_t size, mi_stats_t* stats) {    
+  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
+
+  // commit liberal, but decommit conservative
+  uint8_t* start = NULL;
+  size_t   full_size = 0;
+  mi_commit_mask_t mask;
+  mi_segment_commit_mask(segment, !commit/*conservative*/, p, size, &start, &full_size, &mask);
+  if (mi_commit_mask_is_empty(&mask) || full_size==0) return true;
+
+  if (commit && !mi_commit_mask_all_set(&segment->commit_mask, &mask)) {
+    bool is_zero = false;
+    mi_commit_mask_t cmask;
+    mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);
+    _mi_stat_decrease(&_mi_stats_main.committed, _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for overlap
+    if (!_mi_os_commit(start,full_size,&is_zero,stats)) return false;    
+    mi_commit_mask_set(&segment->commit_mask, &mask);     
+  }
+  else if (!commit && mi_commit_mask_any_set(&segment->commit_mask, &mask)) {
+    mi_assert_internal((void*)start != (void*)segment);
+    //mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &mask));
+
+    mi_commit_mask_t cmask;
+    mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);
+    _mi_stat_increase(&_mi_stats_main.committed, full_size - _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for overlap
+    if (segment->allow_decommit) { 
+      _mi_os_decommit(start, full_size, stats); // ok if this fails
+    } 
+    mi_commit_mask_clear(&segment->commit_mask, &mask);
+  }
+  // increase expiration of reusing part of the delayed decommit
+  if (commit && mi_commit_mask_any_set(&segment->decommit_mask, &mask)) {
+    segment->decommit_expire = _mi_clock_now() + mi_option_get(mi_option_decommit_delay);
+  }
+  // always undo delayed decommits
+  mi_commit_mask_clear(&segment->decommit_mask, &mask);
+  return true;
+}
+
+static bool mi_segment_ensure_committed(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {
+  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
+  // note: assumes commit_mask is always full for huge segments as otherwise the commit mask bits can overflow
+  if (mi_commit_mask_is_full(&segment->commit_mask) && mi_commit_mask_is_empty(&segment->decommit_mask)) return true; // fully committed
+  return mi_segment_commitx(segment,true,p,size,stats);
+}
+
+static void mi_segment_perhaps_decommit(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {
+  if (!segment->allow_decommit) return;
+  if (mi_option_get(mi_option_decommit_delay) == 0) {
+    mi_segment_commitx(segment, false, p, size, stats);
+  }
+  else {
+    // register for future decommit in the decommit mask
+    uint8_t* start = NULL;
+    size_t   full_size = 0;
+    mi_commit_mask_t mask; 
+    mi_segment_commit_mask(segment, true /*conservative*/, p, size, &start, &full_size, &mask);
+    if (mi_commit_mask_is_empty(&mask) || full_size==0) return;
+    
+    // update delayed commit
+    mi_assert_internal(segment->decommit_expire > 0 || mi_commit_mask_is_empty(&segment->decommit_mask));      
+    mi_commit_mask_t cmask;
+    mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);  // only decommit what is committed; span_free may try to decommit more
+    mi_commit_mask_set(&segment->decommit_mask, &cmask);
+    mi_msecs_t now = _mi_clock_now();    
+    if (segment->decommit_expire == 0) {
+      // no previous decommits, initialize now
+      segment->decommit_expire = now + mi_option_get(mi_option_decommit_delay);
+    }
+    else if (segment->decommit_expire <= now) {
+      // previous decommit mask already expired
+      if (segment->decommit_expire + mi_option_get(mi_option_decommit_extend_delay) <= now) {
+        mi_segment_delayed_decommit(segment, true, stats);
+      }
+      else {
+        segment->decommit_expire = now + mi_option_get(mi_option_decommit_extend_delay); // (mi_option_get(mi_option_decommit_delay) / 8); // wait a tiny bit longer in case there is a series of free's
+      }
+    }
+    else {
+      // previous decommit mask is not yet expired, increase the expiration by a bit.
+      segment->decommit_expire += mi_option_get(mi_option_decommit_extend_delay);
+    }
+  }  
+}
+
+static void mi_segment_delayed_decommit(mi_segment_t* segment, bool force, mi_stats_t* stats) {
+  if (!segment->allow_decommit || mi_commit_mask_is_empty(&segment->decommit_mask)) return;
+  mi_msecs_t now = _mi_clock_now();
+  if (!force && now < segment->decommit_expire) return;
+
+  mi_commit_mask_t mask = segment->decommit_mask;
+  segment->decommit_expire = 0;
+  mi_commit_mask_create_empty(&segment->decommit_mask);
+
+  size_t idx;
+  size_t count;
+  mi_commit_mask_foreach(&mask, idx, count) {
+    // if found, decommit that sequence
+    if (count > 0) {
+      uint8_t* p = (uint8_t*)segment + (idx*MI_COMMIT_SIZE);
+      size_t size = count * MI_COMMIT_SIZE;
+      mi_segment_commitx(segment, false, p, size, stats);
+    }
+  }
+  mi_commit_mask_foreach_end()
+  mi_assert_internal(mi_commit_mask_is_empty(&segment->decommit_mask));
+}
+
+
+/* -----------------------------------------------------------
+   Span free
+----------------------------------------------------------- */
+
+static bool mi_segment_is_abandoned(mi_segment_t* segment) {
+  return (segment->thread_id == 0);
+}
+
+// note: can be called on abandoned segments
+static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size_t slice_count, bool allow_decommit, mi_segments_tld_t* tld) {
+  mi_assert_internal(slice_index < segment->slice_entries);
+  mi_span_queue_t* sq = (segment->kind == MI_SEGMENT_HUGE || mi_segment_is_abandoned(segment) 
+                          ? NULL : mi_span_queue_for(slice_count,tld));
+  if (slice_count==0) slice_count = 1;
+  mi_assert_internal(slice_index + slice_count - 1 < segment->slice_entries);
+
+  // set first and last slice (the intermediates can be undetermined)
+  mi_slice_t* slice = &segment->slices[slice_index];
+  slice->slice_count = (uint32_t)slice_count;
+  mi_assert_internal(slice->slice_count == slice_count); // no overflow?
+  slice->slice_offset = 0;
+  if (slice_count > 1) {
+    mi_slice_t* last = &segment->slices[slice_index + slice_count - 1];
+    last->slice_count = 0;
+    last->slice_offset = (uint32_t)(sizeof(mi_page_t)*(slice_count - 1));
+    last->xblock_size = 0;
+  }
+
+  // perhaps decommit
+  if (allow_decommit) {
+    mi_segment_perhaps_decommit(segment, mi_slice_start(slice), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats);
+  }
+  
+  // and push it on the free page queue (if it was not a huge page)
+  if (sq != NULL) mi_span_queue_push( sq, slice );
+             else slice->xblock_size = 0; // mark huge page as free anyways
+}
+
+/*
+// called from reclaim to add existing free spans
+static void mi_segment_span_add_free(mi_slice_t* slice, mi_segments_tld_t* tld) {
+  mi_segment_t* segment = _mi_ptr_segment(slice);
+  mi_assert_internal(slice->xblock_size==0 && slice->slice_count>0 && slice->slice_offset==0);
+  size_t slice_index = mi_slice_index(slice);
+  mi_segment_span_free(segment,slice_index,slice->slice_count,tld);
+}
+*/
+
+static void mi_segment_span_remove_from_queue(mi_slice_t* slice, mi_segments_tld_t* tld) {
+  mi_assert_internal(slice->slice_count > 0 && slice->slice_offset==0 && slice->xblock_size==0);
+  mi_assert_internal(_mi_ptr_segment(slice)->kind != MI_SEGMENT_HUGE);
+  mi_span_queue_t* sq = mi_span_queue_for(slice->slice_count, tld);
+  mi_span_queue_delete(sq, slice);
+}
+
+// note: can be called on abandoned segments
+static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_tld_t* tld) {
+  mi_assert_internal(slice != NULL && slice->slice_count > 0 && slice->slice_offset == 0);
+  mi_segment_t* segment = _mi_ptr_segment(slice);
+  bool is_abandoned = mi_segment_is_abandoned(segment);
+
+  // for huge pages, just mark as free but don't add to the queues
+  if (segment->kind == MI_SEGMENT_HUGE) {
+    mi_assert_internal(segment->used == 1);  // decreased right after this call in `mi_segment_page_clear`
+    slice->xblock_size = 0;  // mark as free anyways
+    // we should mark the last slice `xblock_size=0` now to maintain invariants but we skip it to 
+    // avoid a possible cache miss (and the segment is about to be freed)
+    return slice;
+  }
+
+  // otherwise coalesce the span and add to the free span queues
+  size_t slice_count = slice->slice_count;
+  mi_slice_t* next = slice + slice->slice_count;
+  mi_assert_internal(next <= mi_segment_slices_end(segment));
+  if (next < mi_segment_slices_end(segment) && next->xblock_size==0) {
+    // free next block -- remove it from free and merge
+    mi_assert_internal(next->slice_count > 0 && next->slice_offset==0);
+    slice_count += next->slice_count; // extend
+    if (!is_abandoned) { mi_segment_span_remove_from_queue(next, tld); }
+  }
+  if (slice > segment->slices) {
+    mi_slice_t* prev = mi_slice_first(slice - 1);
+    mi_assert_internal(prev >= segment->slices);
+    if (prev->xblock_size==0) {
+      // free previous slice -- remove it from free and merge
+      mi_assert_internal(prev->slice_count > 0 && prev->slice_offset==0);
+      slice_count += prev->slice_count;
+      if (!is_abandoned) { mi_segment_span_remove_from_queue(prev, tld); }
+      slice = prev;
+    }
+  }
+
+  // and add the new free page
+  mi_segment_span_free(segment, mi_slice_index(slice), slice_count, true, tld);
+  return slice;
+}
+
+
+
+/* -----------------------------------------------------------
+   Page allocation
+----------------------------------------------------------- */
+
+// Note: may still return NULL if committing the memory failed
+static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_index, size_t slice_count, mi_segments_tld_t* tld) {
+  mi_assert_internal(slice_index < segment->slice_entries);
+  mi_slice_t* const slice = &segment->slices[slice_index];
+  mi_assert_internal(slice->xblock_size==0 || slice->xblock_size==1);
+
+  // commit before changing the slice data
+  if (!mi_segment_ensure_committed(segment, _mi_segment_page_start_from_slice(segment, slice, 0, NULL), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats)) {
+    return NULL;  // commit failed!
+  }
+
+  // convert the slices to a page
+  slice->slice_offset = 0;
+  slice->slice_count = (uint32_t)slice_count;
+  mi_assert_internal(slice->slice_count == slice_count);
+  const size_t bsize = slice_count * MI_SEGMENT_SLICE_SIZE;
+  slice->xblock_size = (uint32_t)(bsize >= MI_HUGE_BLOCK_SIZE ? MI_HUGE_BLOCK_SIZE : bsize);
+  mi_page_t*  page = mi_slice_to_page(slice);
+  mi_assert_internal(mi_page_block_size(page) == bsize);
+
+  // set slice back pointers for the first MI_MAX_SLICE_OFFSET entries
+  size_t extra = slice_count-1;
+  if (extra > MI_MAX_SLICE_OFFSET) extra = MI_MAX_SLICE_OFFSET;
+  if (slice_index + extra >= segment->slice_entries) extra = segment->slice_entries - slice_index - 1;  // huge objects may have more slices than avaiable entries in the segment->slices
+  
+  mi_slice_t* slice_next = slice + 1;
+  for (size_t i = 1; i <= extra; i++, slice_next++) {
+    slice_next->slice_offset = (uint32_t)(sizeof(mi_slice_t)*i);
+    slice_next->slice_count = 0;
+    slice_next->xblock_size = 1;
+  }
+
+  // and also for the last one (if not set already) (the last one is needed for coalescing and for large alignments)
+  // note: the cast is needed for ubsan since the index can be larger than MI_SLICES_PER_SEGMENT for huge allocations (see #543)
+  mi_slice_t* last = slice + slice_count - 1;
+  mi_slice_t* end = (mi_slice_t*)mi_segment_slices_end(segment);
+  if (last > end) last = end;
+  if (last > slice) {
+    last->slice_offset = (uint32_t)(sizeof(mi_slice_t) * (last - slice));
+    last->slice_count = 0;
+    last->xblock_size = 1;
+  }
+  
+  // and initialize the page
+  page->is_reset = false;
+  page->is_committed = true;
+  segment->used++;
+  return page;
+}
+
+static void mi_segment_slice_split(mi_segment_t* segment, mi_slice_t* slice, size_t slice_count, mi_segments_tld_t* tld) {
+  mi_assert_internal(_mi_ptr_segment(slice) == segment);
+  mi_assert_internal(slice->slice_count >= slice_count);
+  mi_assert_internal(slice->xblock_size > 0); // no more in free queue
+  if (slice->slice_count <= slice_count) return;
+  mi_assert_internal(segment->kind != MI_SEGMENT_HUGE);
+  size_t next_index = mi_slice_index(slice) + slice_count;
+  size_t next_count = slice->slice_count - slice_count;
+  mi_segment_span_free(segment, next_index, next_count, false /* don't decommit left-over part */, tld);
+  slice->slice_count = (uint32_t)slice_count;
+}
+
+static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld) {
+  mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_LARGE_OBJ_SIZE_MAX);
+  // search from best fit up
+  mi_span_queue_t* sq = mi_span_queue_for(slice_count, tld);
+  if (slice_count == 0) slice_count = 1;
+  while (sq <= &tld->spans[MI_SEGMENT_BIN_MAX]) {
+    for (mi_slice_t* slice = sq->first; slice != NULL; slice = slice->next) {
+      if (slice->slice_count >= slice_count) {
+        // found one
+        mi_segment_t* segment = _mi_ptr_segment(slice);
+        if (_mi_arena_memid_is_suitable(segment->memid, req_arena_id)) {
+          // found a suitable page span
+          mi_span_queue_delete(sq, slice);
+
+          if (slice->slice_count > slice_count) {
+            mi_segment_slice_split(segment, slice, slice_count, tld);
+          }
+          mi_assert_internal(slice != NULL && slice->slice_count == slice_count && slice->xblock_size > 0);
+          mi_page_t* page = mi_segment_span_allocate(segment, mi_slice_index(slice), slice->slice_count, tld);
+          if (page == NULL) {
+            // commit failed; return NULL but first restore the slice
+            mi_segment_span_free_coalesce(slice, tld);
+            return NULL;
+          }
+          return page;
+        }
+      }
+    }
+    sq++;
+  }
+  // could not find a page..
+  return NULL;
+}
+
+
+/* -----------------------------------------------------------
+   Segment allocation
+----------------------------------------------------------- */
+
+static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment, bool eager_delay, mi_arena_id_t req_arena_id,
+                                          size_t* psegment_slices, size_t* ppre_size, size_t* pinfo_slices, 
+                                          mi_commit_mask_t* pcommit_mask, mi_commit_mask_t* pdecommit_mask,
+                                          bool* is_zero, bool* pcommit, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
+
+{
+  // Allocate the segment from the OS
+  bool mem_large = (!eager_delay && (MI_SECURE==0)); // only allow large OS pages once we are no longer lazy    
+  bool is_pinned = false;
+  size_t memid = 0;
+  size_t align_offset = 0;
+  size_t alignment = MI_SEGMENT_ALIGN;
+  
+  if (page_alignment > 0) {
+    // mi_assert_internal(huge_page != NULL);
+    mi_assert_internal(page_alignment >= MI_SEGMENT_ALIGN);
+    alignment = page_alignment;
+    const size_t info_size = (*pinfo_slices) * MI_SEGMENT_SLICE_SIZE;
+    align_offset = _mi_align_up( info_size, MI_SEGMENT_ALIGN );
+    const size_t extra = align_offset - info_size;
+    // recalculate due to potential guard pages
+    *psegment_slices = mi_segment_calculate_slices(required + extra, ppre_size, pinfo_slices);
+    //segment_size += _mi_align_up(align_offset - info_size, MI_SEGMENT_SLICE_SIZE);
+    //segment_slices = segment_size / MI_SEGMENT_SLICE_SIZE;
+  }
+  const size_t segment_size = (*psegment_slices) * MI_SEGMENT_SLICE_SIZE;
+  mi_segment_t* segment = NULL;
+
+  // get from cache?
+  if (page_alignment == 0) {
+    segment = (mi_segment_t*)_mi_segment_cache_pop(segment_size, pcommit_mask, pdecommit_mask, &mem_large, &is_pinned, is_zero, req_arena_id, &memid, os_tld);
+  }
+  
+  // get from OS
+  if (segment==NULL) {
+    segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, alignment, align_offset, pcommit, &mem_large, &is_pinned, is_zero, req_arena_id, &memid, os_tld);
+    if (segment == NULL) return NULL;  // failed to allocate
+    if (*pcommit) {
+      mi_commit_mask_create_full(pcommit_mask);
+    }
+    else {
+      mi_commit_mask_create_empty(pcommit_mask);
+    }
+  }    
+  mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0);
+
+  const size_t commit_needed = _mi_divide_up((*pinfo_slices)*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE);
+  mi_assert_internal(commit_needed>0);
+  mi_commit_mask_t commit_needed_mask;
+  mi_commit_mask_create(0, commit_needed, &commit_needed_mask);
+  if (!mi_commit_mask_all_set(pcommit_mask, &commit_needed_mask)) {
+    // at least commit the info slices
+    mi_assert_internal(commit_needed*MI_COMMIT_SIZE >= (*pinfo_slices)*MI_SEGMENT_SLICE_SIZE);
+    bool ok = _mi_os_commit(segment, commit_needed*MI_COMMIT_SIZE, is_zero, tld->stats);
+    if (!ok) return NULL; // failed to commit 
+    mi_commit_mask_set(pcommit_mask, &commit_needed_mask); 
+  }
+  mi_track_mem_undefined(segment,commit_needed);
+  segment->memid = memid;
+  segment->mem_is_pinned = is_pinned;
+  segment->mem_is_large = mem_large;
+  segment->mem_is_committed = mi_commit_mask_is_full(pcommit_mask);
+  segment->mem_alignment = alignment;
+  segment->mem_align_offset = align_offset;
+  mi_segments_track_size((long)(segment_size), tld);
+  _mi_segment_map_allocated_at(segment);
+  return segment;
+}
+
+
+// Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` .
+static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld, mi_page_t** huge_page)
+{
+  mi_assert_internal((required==0 && huge_page==NULL) || (required>0 && huge_page != NULL));
+  
+  // calculate needed sizes first
+  size_t info_slices;
+  size_t pre_size;
+  size_t segment_slices = mi_segment_calculate_slices(required, &pre_size, &info_slices);
+  
+  // Commit eagerly only if not the first N lazy segments (to reduce impact of many threads that allocate just a little)
+  const bool eager_delay = (// !_mi_os_has_overcommit() &&             // never delay on overcommit systems
+                            _mi_current_thread_count() > 1 &&       // do not delay for the first N threads
+                            tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay));
+  const bool eager = !eager_delay && mi_option_is_enabled(mi_option_eager_commit);
+  bool commit = eager || (required > 0);   
+  bool is_zero = false;  
+
+  mi_commit_mask_t commit_mask;
+  mi_commit_mask_t decommit_mask;
+  mi_commit_mask_create_empty(&commit_mask);
+  mi_commit_mask_create_empty(&decommit_mask);
+
+  // Allocate the segment from the OS  
+  mi_segment_t* segment = mi_segment_os_alloc(required, page_alignment, eager_delay, req_arena_id, 
+                                              &segment_slices, &pre_size, &info_slices, &commit_mask, &decommit_mask, 
+                                              &is_zero, &commit, tld, os_tld);
+  if (segment == NULL) return NULL;
+  
+  // zero the segment info? -- not always needed as it may be zero initialized from the OS 
+  mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);  // tsan
+  if (!is_zero) {
+    ptrdiff_t ofs = offsetof(mi_segment_t, next);
+    size_t    prefix = offsetof(mi_segment_t, slices) - ofs;
+    memset((uint8_t*)segment+ofs, 0, prefix + sizeof(mi_slice_t)*(segment_slices+1));  // one more
+  }
+  
+  segment->commit_mask = commit_mask; // on lazy commit, the initial part is always committed
+  segment->allow_decommit = (mi_option_is_enabled(mi_option_allow_decommit) && !segment->mem_is_pinned && !segment->mem_is_large);    
+  if (segment->allow_decommit) {
+    segment->decommit_expire = 0; // don't decommit just committed memory // _mi_clock_now() + mi_option_get(mi_option_decommit_delay);
+    segment->decommit_mask = decommit_mask;
+    mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
+    #if MI_DEBUG>2
+    const size_t commit_needed = _mi_divide_up(info_slices*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE);
+    mi_commit_mask_t commit_needed_mask;
+    mi_commit_mask_create(0, commit_needed, &commit_needed_mask);
+    mi_assert_internal(!mi_commit_mask_any_set(&segment->decommit_mask, &commit_needed_mask));
+    #endif
+  }    
+  
+  // initialize segment info
+  const size_t slice_entries = (segment_slices > MI_SLICES_PER_SEGMENT ? MI_SLICES_PER_SEGMENT : segment_slices);
+  segment->segment_slices = segment_slices;
+  segment->segment_info_slices = info_slices;
+  segment->thread_id = _mi_thread_id();
+  segment->cookie = _mi_ptr_cookie(segment);
+  segment->slice_entries = slice_entries;
+  segment->kind = (required == 0 ? MI_SEGMENT_NORMAL : MI_SEGMENT_HUGE);
+
+  // memset(segment->slices, 0, sizeof(mi_slice_t)*(info_slices+1));
+  _mi_stat_increase(&tld->stats->page_committed, mi_segment_info_size(segment));
+
+  // set up guard pages
+  size_t guard_slices = 0;
+  if (MI_SECURE>0) {
+    // in secure mode, we set up a protected page in between the segment info
+    // and the page data, and at the end of the segment.
+    size_t os_pagesize = _mi_os_page_size();    
+    mi_assert_internal(mi_segment_info_size(segment) - os_pagesize >= pre_size);
+    _mi_os_protect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize);
+    uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize;
+    mi_segment_ensure_committed(segment, end, os_pagesize, tld->stats);
+    _mi_os_protect(end, os_pagesize);
+    if (slice_entries == segment_slices) segment->slice_entries--; // don't use the last slice :-(
+    guard_slices = 1;
+  }
+
+  // reserve first slices for segment info
+  mi_page_t* page0 = mi_segment_span_allocate(segment, 0, info_slices, tld);
+  mi_assert_internal(page0!=NULL); if (page0==NULL) return NULL; // cannot fail as we always commit in advance  
+  mi_assert_internal(segment->used == 1);
+  segment->used = 0; // don't count our internal slices towards usage
+  
+  // initialize initial free pages
+  if (segment->kind == MI_SEGMENT_NORMAL) { // not a huge page
+    mi_assert_internal(huge_page==NULL);
+    mi_segment_span_free(segment, info_slices, segment->slice_entries - info_slices, false /* don't decommit */, tld);
+  }
+  else {
+    mi_assert_internal(huge_page!=NULL);
+    mi_assert_internal(mi_commit_mask_is_empty(&segment->decommit_mask));
+    mi_assert_internal(mi_commit_mask_is_full(&segment->commit_mask));
+    *huge_page = mi_segment_span_allocate(segment, info_slices, segment_slices - info_slices - guard_slices, tld);
+    mi_assert_internal(*huge_page != NULL); // cannot fail as we commit in advance 
+  }
+
+  mi_assert_expensive(mi_segment_is_valid(segment,tld));
+  return segment;
+}
+
+
+static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) {
+  MI_UNUSED(force);
+  mi_assert_internal(segment != NULL);
+  mi_assert_internal(segment->next == NULL);
+  mi_assert_internal(segment->used == 0);
+
+  // Remove the free pages
+  mi_slice_t* slice = &segment->slices[0];
+  const mi_slice_t* end = mi_segment_slices_end(segment);
+  size_t page_count = 0;
+  while (slice < end) {
+    mi_assert_internal(slice->slice_count > 0);
+    mi_assert_internal(slice->slice_offset == 0);
+    mi_assert_internal(mi_slice_index(slice)==0 || slice->xblock_size == 0); // no more used pages ..
+    if (slice->xblock_size == 0 && segment->kind != MI_SEGMENT_HUGE) {
+      mi_segment_span_remove_from_queue(slice, tld);
+    }
+    page_count++;
+    slice = slice + slice->slice_count;
+  }
+  mi_assert_internal(page_count == 2); // first page is allocated by the segment itself
+
+  // stats
+  _mi_stat_decrease(&tld->stats->page_committed, mi_segment_info_size(segment));
+
+  // return it to the OS
+  mi_segment_os_free(segment, tld);
+}
+
+
+/* -----------------------------------------------------------
+   Page Free
+----------------------------------------------------------- */
+
+static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld);
+
+// note: can be called on abandoned pages
+static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld) {
+  mi_assert_internal(page->xblock_size > 0);
+  mi_assert_internal(mi_page_all_free(page));
+  mi_segment_t* segment = _mi_ptr_segment(page);
+  mi_assert_internal(segment->used > 0);
+  
+  size_t inuse = page->capacity * mi_page_block_size(page);
+  _mi_stat_decrease(&tld->stats->page_committed, inuse);
+  _mi_stat_decrease(&tld->stats->pages, 1);
+
+  // reset the page memory to reduce memory pressure?
+  if (!segment->mem_is_pinned && !page->is_reset && mi_option_is_enabled(mi_option_page_reset)) {
+    size_t psize;
+    uint8_t* start = _mi_page_start(segment, page, &psize);
+    page->is_reset = true;
+    _mi_os_reset(start, psize, tld->stats);
+  }
+
+  // zero the page data, but not the segment fields
+  page->is_zero_init = false;
+  ptrdiff_t ofs = offsetof(mi_page_t, capacity);
+  memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs);
+  page->xblock_size = 1;
+
+  // and free it
+  mi_slice_t* slice = mi_segment_span_free_coalesce(mi_page_to_slice(page), tld);  
+  segment->used--;
+  // cannot assert segment valid as it is called during reclaim
+  // mi_assert_expensive(mi_segment_is_valid(segment, tld));
+  return slice;
+}
+
+void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld)
+{
+  mi_assert(page != NULL);
+
+  mi_segment_t* segment = _mi_page_segment(page);
+  mi_assert_expensive(mi_segment_is_valid(segment,tld));
+
+  // mark it as free now
+  mi_segment_page_clear(page, tld);
+  mi_assert_expensive(mi_segment_is_valid(segment, tld));
+
+  if (segment->used == 0) {
+    // no more used pages; remove from the free list and free the segment
+    mi_segment_free(segment, force, tld);
+  }
+  else if (segment->used == segment->abandoned) {
+    // only abandoned pages; remove from free list and abandon
+    mi_segment_abandon(segment,tld);
+  }
+}
+
+
+/* -----------------------------------------------------------
+Abandonment
+
+When threads terminate, they can leave segments with
+live blocks (reachable through other threads). Such segments
+are "abandoned" and will be reclaimed by other threads to
+reuse their pages and/or free them eventually
+
+We maintain a global list of abandoned segments that are
+reclaimed on demand. Since this is shared among threads
+the implementation needs to avoid the A-B-A problem on
+popping abandoned segments: <https://en.wikipedia.org/wiki/ABA_problem>
+We use tagged pointers to avoid accidentially identifying
+reused segments, much like stamped references in Java.
+Secondly, we maintain a reader counter to avoid resetting
+or decommitting segments that have a pending read operation.
+
+Note: the current implementation is one possible design;
+another way might be to keep track of abandoned segments
+in the arenas/segment_cache's. This would have the advantage of keeping
+all concurrent code in one place and not needing to deal
+with ABA issues. The drawback is that it is unclear how to
+scan abandoned segments efficiently in that case as they
+would be spread among all other segments in the arenas.
+----------------------------------------------------------- */
+
+// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers
+// to put in a tag that increments on update to avoid the A-B-A problem.
+#define MI_TAGGED_MASK   MI_SEGMENT_MASK
+typedef uintptr_t        mi_tagged_segment_t;
+
+static mi_segment_t* mi_tagged_segment_ptr(mi_tagged_segment_t ts) {
+  return (mi_segment_t*)(ts & ~MI_TAGGED_MASK);
+}
+
+static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_segment_t ts) {
+  mi_assert_internal(((uintptr_t)segment & MI_TAGGED_MASK) == 0);
+  uintptr_t tag = ((ts & MI_TAGGED_MASK) + 1) & MI_TAGGED_MASK;
+  return ((uintptr_t)segment | tag);
+}
+
+// This is a list of visited abandoned pages that were full at the time.
+// this list migrates to `abandoned` when that becomes NULL. The use of
+// this list reduces contention and the rate at which segments are visited.
+static mi_decl_cache_align _Atomic(mi_segment_t*)       abandoned_visited; // = NULL
+
+// The abandoned page list (tagged as it supports pop)
+static mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned;         // = NULL
+
+// Maintain these for debug purposes (these counts may be a bit off)
+static mi_decl_cache_align _Atomic(size_t)           abandoned_count;
+static mi_decl_cache_align _Atomic(size_t)           abandoned_visited_count;
+
+// We also maintain a count of current readers of the abandoned list
+// in order to prevent resetting/decommitting segment memory if it might
+// still be read.
+static mi_decl_cache_align _Atomic(size_t)           abandoned_readers; // = 0
+
+// Push on the visited list
+static void mi_abandoned_visited_push(mi_segment_t* segment) {
+  mi_assert_internal(segment->thread_id == 0);
+  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t,&segment->abandoned_next) == NULL);
+  mi_assert_internal(segment->next == NULL);
+  mi_assert_internal(segment->used > 0);
+  mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited);
+  do {
+    mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, anext);
+  } while (!mi_atomic_cas_ptr_weak_release(mi_segment_t, &abandoned_visited, &anext, segment));
+  mi_atomic_increment_relaxed(&abandoned_visited_count);
+}
+
+// Move the visited list to the abandoned list.
+static bool mi_abandoned_visited_revisit(void)
+{
+  // quick check if the visited list is empty
+  if (mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited) == NULL) return false;
+
+  // grab the whole visited list
+  mi_segment_t* first = mi_atomic_exchange_ptr_acq_rel(mi_segment_t, &abandoned_visited, NULL);
+  if (first == NULL) return false;
+
+  // first try to swap directly if the abandoned list happens to be NULL
+  mi_tagged_segment_t afirst;
+  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
+  if (mi_tagged_segment_ptr(ts)==NULL) {
+    size_t count = mi_atomic_load_relaxed(&abandoned_visited_count);
+    afirst = mi_tagged_segment(first, ts);
+    if (mi_atomic_cas_strong_acq_rel(&abandoned, &ts, afirst)) {
+      mi_atomic_add_relaxed(&abandoned_count, count);
+      mi_atomic_sub_relaxed(&abandoned_visited_count, count);
+      return true;
+    }
+  }
+
+  // find the last element of the visited list: O(n)
+  mi_segment_t* last = first;
+  mi_segment_t* next;
+  while ((next = mi_atomic_load_ptr_relaxed(mi_segment_t, &last->abandoned_next)) != NULL) {
+    last = next;
+  }
+
+  // and atomically prepend to the abandoned list
+  // (no need to increase the readers as we don't access the abandoned segments)
+  mi_tagged_segment_t anext = mi_atomic_load_relaxed(&abandoned);
+  size_t count;
+  do {
+    count = mi_atomic_load_relaxed(&abandoned_visited_count);
+    mi_atomic_store_ptr_release(mi_segment_t, &last->abandoned_next, mi_tagged_segment_ptr(anext));
+    afirst = mi_tagged_segment(first, anext);
+  } while (!mi_atomic_cas_weak_release(&abandoned, &anext, afirst));
+  mi_atomic_add_relaxed(&abandoned_count, count);
+  mi_atomic_sub_relaxed(&abandoned_visited_count, count);
+  return true;
+}
+
+// Push on the abandoned list.
+static void mi_abandoned_push(mi_segment_t* segment) {
+  mi_assert_internal(segment->thread_id == 0);
+  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
+  mi_assert_internal(segment->next == NULL);
+  mi_assert_internal(segment->used > 0);
+  mi_tagged_segment_t next;
+  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
+  do {
+    mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, mi_tagged_segment_ptr(ts));
+    next = mi_tagged_segment(segment, ts);
+  } while (!mi_atomic_cas_weak_release(&abandoned, &ts, next));
+  mi_atomic_increment_relaxed(&abandoned_count);
+}
+
+// Wait until there are no more pending reads on segments that used to be in the abandoned list
+// called for example from `arena.c` before decommitting
+void _mi_abandoned_await_readers(void) {
+  size_t n;
+  do {
+    n = mi_atomic_load_acquire(&abandoned_readers);
+    if (n != 0) mi_atomic_yield();
+  } while (n != 0);
+}
+
+// Pop from the abandoned list
+static mi_segment_t* mi_abandoned_pop(void) {
+  mi_segment_t* segment;
+  // Check efficiently if it is empty (or if the visited list needs to be moved)
+  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
+  segment = mi_tagged_segment_ptr(ts);
+  if mi_likely(segment == NULL) {
+    if mi_likely(!mi_abandoned_visited_revisit()) { // try to swap in the visited list on NULL
+      return NULL;
+    }
+  }
+
+  // Do a pop. We use a reader count to prevent
+  // a segment to be decommitted while a read is still pending,
+  // and a tagged pointer to prevent A-B-A link corruption.
+  // (this is called from `region.c:_mi_mem_free` for example)
+  mi_atomic_increment_relaxed(&abandoned_readers);  // ensure no segment gets decommitted
+  mi_tagged_segment_t next = 0;
+  ts = mi_atomic_load_acquire(&abandoned);
+  do {
+    segment = mi_tagged_segment_ptr(ts);
+    if (segment != NULL) {
+      mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next);
+      next = mi_tagged_segment(anext, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted
+    }
+  } while (segment != NULL && !mi_atomic_cas_weak_acq_rel(&abandoned, &ts, next));
+  mi_atomic_decrement_relaxed(&abandoned_readers);  // release reader lock
+  if (segment != NULL) {
+    mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);
+    mi_atomic_decrement_relaxed(&abandoned_count);
+  }
+  return segment;
+}
+
+/* -----------------------------------------------------------
+   Abandon segment/page
+----------------------------------------------------------- */
+
+static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
+  mi_assert_internal(segment->used == segment->abandoned);
+  mi_assert_internal(segment->used > 0);
+  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
+  mi_assert_internal(segment->abandoned_visits == 0);
+  mi_assert_expensive(mi_segment_is_valid(segment,tld));
+  
+  // remove the free pages from the free page queues
+  mi_slice_t* slice = &segment->slices[0];
+  const mi_slice_t* end = mi_segment_slices_end(segment);
+  while (slice < end) {
+    mi_assert_internal(slice->slice_count > 0);
+    mi_assert_internal(slice->slice_offset == 0);
+    if (slice->xblock_size == 0) { // a free page
+      mi_segment_span_remove_from_queue(slice,tld);
+      slice->xblock_size = 0; // but keep it free
+    }
+    slice = slice + slice->slice_count;
+  }
+
+  // perform delayed decommits
+  mi_segment_delayed_decommit(segment, mi_option_is_enabled(mi_option_abandoned_page_decommit) /* force? */, tld->stats);    
+  
+  // all pages in the segment are abandoned; add it to the abandoned list
+  _mi_stat_increase(&tld->stats->segments_abandoned, 1);
+  mi_segments_track_size(-((long)mi_segment_size(segment)), tld);
+  segment->thread_id = 0;
+  mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);
+  segment->abandoned_visits = 1;   // from 0 to 1 to signify it is abandoned
+  mi_abandoned_push(segment);
+}
+
+void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) {
+  mi_assert(page != NULL);
+  mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
+  mi_assert_internal(mi_page_heap(page) == NULL);
+  mi_segment_t* segment = _mi_page_segment(page);
+
+  mi_assert_expensive(mi_segment_is_valid(segment,tld));
+  segment->abandoned++;  
+
+  _mi_stat_increase(&tld->stats->pages_abandoned, 1);
+  mi_assert_internal(segment->abandoned <= segment->used);
+  if (segment->used == segment->abandoned) {
+    // all pages are abandoned, abandon the entire segment
+    mi_segment_abandon(segment, tld);
+  }
+}
+
+/* -----------------------------------------------------------
+  Reclaim abandoned pages
+----------------------------------------------------------- */
+
+static mi_slice_t* mi_slices_start_iterate(mi_segment_t* segment, const mi_slice_t** end) {
+  mi_slice_t* slice = &segment->slices[0];
+  *end = mi_segment_slices_end(segment);
+  mi_assert_internal(slice->slice_count>0 && slice->xblock_size>0); // segment allocated page
+  slice = slice + slice->slice_count; // skip the first segment allocated page
+  return slice;
+}
+
+// Possibly free pages and check if free space is available
+static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, size_t block_size, mi_segments_tld_t* tld) 
+{
+  mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
+  mi_assert_internal(mi_segment_is_abandoned(segment));
+  bool has_page = false;
+  
+  // for all slices
+  const mi_slice_t* end;
+  mi_slice_t* slice = mi_slices_start_iterate(segment, &end);
+  while (slice < end) {
+    mi_assert_internal(slice->slice_count > 0);
+    mi_assert_internal(slice->slice_offset == 0);
+    if (mi_slice_is_used(slice)) { // used page
+      // ensure used count is up to date and collect potential concurrent frees
+      mi_page_t* const page = mi_slice_to_page(slice);
+      _mi_page_free_collect(page, false);
+      if (mi_page_all_free(page)) {
+        // if this page is all free now, free it without adding to any queues (yet) 
+        mi_assert_internal(page->next == NULL && page->prev==NULL);
+        _mi_stat_decrease(&tld->stats->pages_abandoned, 1);
+        segment->abandoned--;
+        slice = mi_segment_page_clear(page, tld); // re-assign slice due to coalesce!
+        mi_assert_internal(!mi_slice_is_used(slice));
+        if (slice->slice_count >= slices_needed) {
+          has_page = true;
+        }
+      }
+      else {
+        if (page->xblock_size == block_size && mi_page_has_any_available(page)) {
+          // a page has available free blocks of the right size
+          has_page = true;
+        }
+      }      
+    }
+    else {
+      // empty span
+      if (slice->slice_count >= slices_needed) {
+        has_page = true;
+      }
+    }
+    slice = slice + slice->slice_count;
+  }
+  return has_page;
+}
+
+// Reclaim an abandoned segment; returns NULL if the segment was freed
+// set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full.
+static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) {
+  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
+  mi_assert_expensive(mi_segment_is_valid(segment, tld));
+  if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; }
+
+  segment->thread_id = _mi_thread_id();
+  segment->abandoned_visits = 0;
+  mi_segments_track_size((long)mi_segment_size(segment), tld);
+  mi_assert_internal(segment->next == NULL);
+  _mi_stat_decrease(&tld->stats->segments_abandoned, 1);
+  
+  // for all slices
+  const mi_slice_t* end;
+  mi_slice_t* slice = mi_slices_start_iterate(segment, &end);
+  while (slice < end) {
+    mi_assert_internal(slice->slice_count > 0);
+    mi_assert_internal(slice->slice_offset == 0);
+    if (mi_slice_is_used(slice)) {
+      // in use: reclaim the page in our heap
+      mi_page_t* page = mi_slice_to_page(slice);
+      mi_assert_internal(!page->is_reset);
+      mi_assert_internal(page->is_committed);
+      mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
+      mi_assert_internal(mi_page_heap(page) == NULL);
+      mi_assert_internal(page->next == NULL && page->prev==NULL);
+      _mi_stat_decrease(&tld->stats->pages_abandoned, 1);
+      segment->abandoned--;
+      // set the heap again and allow delayed free again
+      mi_page_set_heap(page, heap);
+      _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set)
+      _mi_page_free_collect(page, false); // ensure used count is up to date
+      if (mi_page_all_free(page)) {
+        // if everything free by now, free the page
+        slice = mi_segment_page_clear(page, tld);   // set slice again due to coalesceing
+      }
+      else {
+        // otherwise reclaim it into the heap
+        _mi_page_reclaim(heap, page);
+        if (requested_block_size == page->xblock_size && mi_page_has_any_available(page)) {
+          if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; }
+        }
+      }
+    }
+    else {
+      // the span is free, add it to our page queues
+      slice = mi_segment_span_free_coalesce(slice, tld); // set slice again due to coalesceing
+    }
+    mi_assert_internal(slice->slice_count>0 && slice->slice_offset==0);
+    slice = slice + slice->slice_count;
+  }
+
+  mi_assert(segment->abandoned == 0);
+  if (segment->used == 0) {  // due to page_clear
+    mi_assert_internal(right_page_reclaimed == NULL || !(*right_page_reclaimed));
+    mi_segment_free(segment, false, tld);
+    return NULL;
+  }
+  else {
+    return segment;
+  }
+}
+
+
+void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) {
+  mi_segment_t* segment;
+  while ((segment = mi_abandoned_pop()) != NULL) {
+    mi_segment_reclaim(segment, heap, 0, NULL, tld);
+  }
+}
+
+static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slices, size_t block_size, bool* reclaimed, mi_segments_tld_t* tld)
+{
+  *reclaimed = false;
+  mi_segment_t* segment;
+  long max_tries = mi_option_get_clamp(mi_option_max_segment_reclaim, 8, 1024);     // limit the work to bound allocation times
+  while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) {
+    segment->abandoned_visits++;
+    // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments
+    // and push them into the visited list and use many tries. Perhaps we can skip non-suitable ones in a better way?
+    bool is_suitable = _mi_heap_memid_is_suitable(heap, segment->memid);
+    bool has_page = mi_segment_check_free(segment,needed_slices,block_size,tld); // try to free up pages (due to concurrent frees)
+    if (segment->used == 0) {
+      // free the segment (by forced reclaim) to make it available to other threads.
+      // note1: we prefer to free a segment as that might lead to reclaiming another
+      // segment that is still partially used.
+      // note2: we could in principle optimize this by skipping reclaim and directly
+      // freeing but that would violate some invariants temporarily)
+      mi_segment_reclaim(segment, heap, 0, NULL, tld);
+    }
+    else if (has_page && is_suitable) {
+      // found a large enough free span, or a page of the right block_size with free space 
+      // we return the result of reclaim (which is usually `segment`) as it might free
+      // the segment due to concurrent frees (in which case `NULL` is returned).
+      return mi_segment_reclaim(segment, heap, block_size, reclaimed, tld);
+    }
+    else if (segment->abandoned_visits > 3 && is_suitable) {  
+      // always reclaim on 3rd visit to limit the abandoned queue length.
+      mi_segment_reclaim(segment, heap, 0, NULL, tld);
+    }
+    else {
+      // otherwise, push on the visited list so it gets not looked at too quickly again
+      mi_segment_delayed_decommit(segment, true /* force? */, tld->stats); // forced decommit if needed as we may not visit soon again
+      mi_abandoned_visited_push(segment);
+    }
+  }
+  return NULL;
+}
+
+
+void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld)
+{
+  mi_segment_t* segment;
+  int max_tries = (force ? 16*1024 : 1024); // limit latency
+  if (force) {
+    mi_abandoned_visited_revisit(); 
+  }
+  while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) {
+    mi_segment_check_free(segment,0,0,tld); // try to free up pages (due to concurrent frees)
+    if (segment->used == 0) {
+      // free the segment (by forced reclaim) to make it available to other threads.
+      // note: we could in principle optimize this by skipping reclaim and directly
+      // freeing but that would violate some invariants temporarily)
+      mi_segment_reclaim(segment, heap, 0, NULL, tld);
+    }
+    else {
+      // otherwise, decommit if needed and push on the visited list 
+      // note: forced decommit can be expensive if many threads are destroyed/created as in mstress.
+      mi_segment_delayed_decommit(segment, force, tld->stats);
+      mi_abandoned_visited_push(segment);
+    }
+  }
+}
+
+/* -----------------------------------------------------------
+   Reclaim or allocate
+----------------------------------------------------------- */
+
+static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_slices, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
+{
+  mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
+  mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX);
+  
+  // 1. try to reclaim an abandoned segment
+  bool reclaimed;
+  mi_segment_t* segment = mi_segment_try_reclaim(heap, needed_slices, block_size, &reclaimed, tld);
+  if (reclaimed) {
+    // reclaimed the right page right into the heap
+    mi_assert_internal(segment != NULL);
+    return NULL; // pretend out-of-memory as the page will be in the page queue of the heap with available blocks
+  }
+  else if (segment != NULL) {
+    // reclaimed a segment with a large enough empty span in it
+    return segment;
+  }
+  // 2. otherwise allocate a fresh segment
+  return mi_segment_alloc(0, 0, heap->arena_id, tld, os_tld, NULL);  
+}
+
+
+/* -----------------------------------------------------------
+   Page allocation
+----------------------------------------------------------- */
+
+static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_kind, size_t required, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
+{
+  mi_assert_internal(required <= MI_LARGE_OBJ_SIZE_MAX && page_kind <= MI_PAGE_LARGE);
+
+  // find a free page
+  size_t page_size = _mi_align_up(required, (required > MI_MEDIUM_PAGE_SIZE ? MI_MEDIUM_PAGE_SIZE : MI_SEGMENT_SLICE_SIZE));
+  size_t slices_needed = page_size / MI_SEGMENT_SLICE_SIZE;
+  mi_assert_internal(slices_needed * MI_SEGMENT_SLICE_SIZE == page_size);
+  mi_page_t* page = mi_segments_page_find_and_allocate(slices_needed, heap->arena_id, tld); //(required <= MI_SMALL_SIZE_MAX ? 0 : slices_needed), tld);
+  if (page==NULL) {
+    // no free page, allocate a new segment and try again
+    if (mi_segment_reclaim_or_alloc(heap, slices_needed, block_size, tld, os_tld) == NULL) {
+      // OOM or reclaimed a good page in the heap
+      return NULL;  
+    }
+    else {
+      // otherwise try again
+      return mi_segments_page_alloc(heap, page_kind, required, block_size, tld, os_tld);
+    }
+  }
+  mi_assert_internal(page != NULL && page->slice_count*MI_SEGMENT_SLICE_SIZE == page_size);
+  mi_assert_internal(_mi_ptr_segment(page)->thread_id == _mi_thread_id());
+  mi_segment_delayed_decommit(_mi_ptr_segment(page), false, tld->stats);
+  return page;
+}
+
+
+
+/* -----------------------------------------------------------
+   Huge page allocation
+----------------------------------------------------------- */
+
+static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
+{
+  mi_page_t* page = NULL;
+  mi_segment_t* segment = mi_segment_alloc(size,page_alignment,req_arena_id,tld,os_tld,&page);
+  if (segment == NULL || page==NULL) return NULL;
+  mi_assert_internal(segment->used==1);
+  mi_assert_internal(mi_page_block_size(page) >= size);  
+  #if MI_HUGE_PAGE_ABANDON
+  segment->thread_id = 0; // huge segments are immediately abandoned
+  #endif  
+
+  // for huge pages we initialize the xblock_size as we may
+  // overallocate to accommodate large alignments.
+  size_t psize;
+  uint8_t* start = _mi_segment_page_start(segment, page, &psize);
+  page->xblock_size = (psize > MI_HUGE_BLOCK_SIZE ? MI_HUGE_BLOCK_SIZE : (uint32_t)psize);
+  
+  // decommit the part of the prefix of a page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE)
+  if (page_alignment > 0 && segment->allow_decommit) {
+    uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment);
+    mi_assert_internal(_mi_is_aligned(aligned_p, page_alignment));
+    mi_assert_internal(psize - (aligned_p - start) >= size);      
+    uint8_t* decommit_start = start + sizeof(mi_block_t);              // for the free list
+    ptrdiff_t decommit_size = aligned_p - decommit_start;
+    _mi_os_decommit(decommit_start, decommit_size, &_mi_stats_main);   // note: cannot use segment_decommit on huge segments    
+  }
+  
+  return page;
+}
+
+#if MI_HUGE_PAGE_ABANDON
+// free huge block from another thread
+void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) {
+  // huge page segments are always abandoned and can be freed immediately by any thread
+  mi_assert_internal(segment->kind==MI_SEGMENT_HUGE);
+  mi_assert_internal(segment == _mi_page_segment(page));
+  mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id)==0);
+
+  // claim it and free
+  mi_heap_t* heap = mi_heap_get_default(); // issue #221; don't use the internal get_default_heap as we need to ensure the thread is initialized.
+  // paranoia: if this it the last reference, the cas should always succeed
+  size_t expected_tid = 0;
+  if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected_tid, heap->thread_id)) {
+    mi_block_set_next(page, block, page->free);
+    page->free = block;
+    page->used--;
+    page->is_zero = false;
+    mi_assert(page->used == 0);
+    mi_tld_t* tld = heap->tld;
+    _mi_segment_page_free(page, true, &tld->segments);
+  }
+#if (MI_DEBUG!=0)
+  else {
+    mi_assert_internal(false);
+  }
+#endif
+}
+
+#else
+// reset memory of a huge block from another thread
+void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) {
+  MI_UNUSED(page);
+  mi_assert_internal(segment->kind == MI_SEGMENT_HUGE);
+  mi_assert_internal(segment == _mi_page_segment(page));
+  mi_assert_internal(page->used == 1); // this is called just before the free
+  mi_assert_internal(page->free == NULL);
+  if (segment->allow_decommit) {
+    const size_t csize = mi_usable_size(block) - sizeof(mi_block_t);
+    uint8_t* p = (uint8_t*)block + sizeof(mi_block_t);
+    _mi_os_decommit(p, csize, &_mi_stats_main);  // note: cannot use segment_decommit on huge segments
+  }
+}
+#endif
+
+/* -----------------------------------------------------------
+   Page allocation and free
+----------------------------------------------------------- */
+mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
+  mi_page_t* page;
+  if mi_unlikely(page_alignment > MI_ALIGNMENT_MAX) {
+    mi_assert_internal(_mi_is_power_of_two(page_alignment));
+    mi_assert_internal(page_alignment >= MI_SEGMENT_SIZE);
+    if (page_alignment < MI_SEGMENT_SIZE) { page_alignment = MI_SEGMENT_SIZE; }
+    page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld,os_tld);
+  }
+  else if (block_size <= MI_SMALL_OBJ_SIZE_MAX) {
+    page = mi_segments_page_alloc(heap,MI_PAGE_SMALL,block_size,block_size,tld,os_tld);
+  }
+  else if (block_size <= MI_MEDIUM_OBJ_SIZE_MAX) {
+    page = mi_segments_page_alloc(heap,MI_PAGE_MEDIUM,MI_MEDIUM_PAGE_SIZE,block_size,tld, os_tld);
+  }
+  else if (block_size <= MI_LARGE_OBJ_SIZE_MAX) {
+    page = mi_segments_page_alloc(heap,MI_PAGE_LARGE,block_size,block_size,tld, os_tld);
+  }
+  else {
+    page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld,os_tld);    
+  }
+  mi_assert_internal(page == NULL || _mi_heap_memid_is_suitable(heap, _mi_page_segment(page)->memid));
+  mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld));
+  return page;
+}
+
+
diff --git a/Objects/mimalloc/stats.c b/Objects/mimalloc/stats.c
new file mode 100644
index 00000000000000..2a8b9404fbe264
--- /dev/null
+++ b/Objects/mimalloc/stats.c
@@ -0,0 +1,618 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <stdio.h>  // fputs, stderr
+#include <string.h> // memset
+
+#if defined(_MSC_VER) && (_MSC_VER < 1920)
+#pragma warning(disable:4204)  // non-constant aggregate initializer
+#endif
+
+/* -----------------------------------------------------------
+  Statistics operations
+----------------------------------------------------------- */
+
+static bool mi_is_in_main(void* stat) {
+  return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main
+         && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t)));
+}
+
+static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
+  if (amount == 0) return;
+  if (mi_is_in_main(stat))
+  {
+    // add atomically (for abandoned pages)
+    int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
+    mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
+    if (amount > 0) {
+      mi_atomic_addi64_relaxed(&stat->allocated,amount);
+    }
+    else {
+      mi_atomic_addi64_relaxed(&stat->freed, -amount);
+    }
+  }
+  else {
+    // add thread local
+    stat->current += amount;
+    if (stat->current > stat->peak) stat->peak = stat->current;
+    if (amount > 0) {
+      stat->allocated += amount;
+    }
+    else {
+      stat->freed += -amount;
+    }
+  }
+}
+
+void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
+  if (mi_is_in_main(stat)) {
+    mi_atomic_addi64_relaxed( &stat->count, 1 );
+    mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount );
+  }
+  else {
+    stat->count++;
+    stat->total += amount;
+  }
+}
+
+void _mi_stat_increase(mi_stat_count_t* stat, size_t amount) {
+  mi_stat_update(stat, (int64_t)amount);
+}
+
+void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
+  mi_stat_update(stat, -((int64_t)amount));
+}
+
+// must be thread safe as it is called from stats_merge
+static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) {
+  if (stat==src) return;
+  if (src->allocated==0 && src->freed==0) return;
+  mi_atomic_addi64_relaxed( &stat->allocated, src->allocated * unit);
+  mi_atomic_addi64_relaxed( &stat->current, src->current * unit);
+  mi_atomic_addi64_relaxed( &stat->freed, src->freed * unit);
+  // peak scores do not work across threads..
+  mi_atomic_addi64_relaxed( &stat->peak, src->peak * unit);
+}
+
+static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src, int64_t unit) {
+  if (stat==src) return;
+  mi_atomic_addi64_relaxed( &stat->total, src->total * unit);
+  mi_atomic_addi64_relaxed( &stat->count, src->count * unit);
+}
+
+// must be thread safe as it is called from stats_merge
+static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
+  if (stats==src) return;
+  mi_stat_add(&stats->segments, &src->segments,1);
+  mi_stat_add(&stats->pages, &src->pages,1);
+  mi_stat_add(&stats->reserved, &src->reserved, 1);
+  mi_stat_add(&stats->committed, &src->committed, 1);
+  mi_stat_add(&stats->reset, &src->reset, 1);
+  mi_stat_add(&stats->page_committed, &src->page_committed, 1);
+
+  mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1);
+  mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 1);
+  mi_stat_add(&stats->threads, &src->threads, 1);
+
+  mi_stat_add(&stats->malloc, &src->malloc, 1);
+  mi_stat_add(&stats->segments_cache, &src->segments_cache, 1);
+  mi_stat_add(&stats->normal, &src->normal, 1);
+  mi_stat_add(&stats->huge, &src->huge, 1);
+  mi_stat_add(&stats->large, &src->large, 1);
+
+  mi_stat_counter_add(&stats->pages_extended, &src->pages_extended, 1);
+  mi_stat_counter_add(&stats->mmap_calls, &src->mmap_calls, 1);
+  mi_stat_counter_add(&stats->commit_calls, &src->commit_calls, 1);
+
+  mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1);
+  mi_stat_counter_add(&stats->searches, &src->searches, 1);
+  mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1);
+  mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1);
+  mi_stat_counter_add(&stats->large_count, &src->large_count, 1);
+#if MI_STAT>1
+  for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
+    if (src->normal_bins[i].allocated > 0 || src->normal_bins[i].freed > 0) {
+      mi_stat_add(&stats->normal_bins[i], &src->normal_bins[i], 1);
+    }
+  }
+#endif
+}
+
+/* -----------------------------------------------------------
+  Display statistics
+----------------------------------------------------------- */
+
+// unit > 0 : size in binary bytes
+// unit == 0: count as decimal
+// unit < 0 : count in binary
+static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) {
+  char buf[32]; buf[0] = 0;
+  int  len = 32;
+  const char* suffix = (unit <= 0 ? " " : "B");
+  const int64_t base = (unit == 0 ? 1000 : 1024);
+  if (unit>0) n *= unit;
+
+  const int64_t pos = (n < 0 ? -n : n);
+  if (pos < base) {
+    if (n!=1 || suffix[0] != 'B') {  // skip printing 1 B for the unit column
+      snprintf(buf, len, "%d %-3s", (int)n, (n==0 ? "" : suffix));
+    }
+  }
+  else {
+    int64_t divider = base;
+    const char* magnitude = "K";
+    if (pos >= divider*base) { divider *= base; magnitude = "M"; }
+    if (pos >= divider*base) { divider *= base; magnitude = "G"; }
+    const int64_t tens = (n / (divider/10));
+    const long whole = (long)(tens/10);
+    const long frac1 = (long)(tens%10);
+    char unitdesc[8];
+    snprintf(unitdesc, 8, "%s%s%s", magnitude, (base==1024 ? "i" : ""), suffix);
+    snprintf(buf, len, "%ld.%ld %-3s", whole, (frac1 < 0 ? -frac1 : frac1), unitdesc);
+  }
+  _mi_fprintf(out, arg, (fmt==NULL ? "%11s" : fmt), buf);
+}
+
+
+static void mi_print_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg) {
+  mi_printf_amount(n,unit,out,arg,NULL);
+}
+
+static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out, void* arg) {
+  if (unit==1) _mi_fprintf(out, arg, "%11s"," ");
+          else mi_print_amount(n,0,out,arg);
+}
+
+static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg, const char* notok ) {
+  _mi_fprintf(out, arg,"%10s:", msg);
+  if (unit > 0) {
+    mi_print_amount(stat->peak, unit, out, arg);
+    mi_print_amount(stat->allocated, unit, out, arg);
+    mi_print_amount(stat->freed, unit, out, arg);
+    mi_print_amount(stat->current, unit, out, arg);
+    mi_print_amount(unit, 1, out, arg);
+    mi_print_count(stat->allocated, unit, out, arg);
+    if (stat->allocated > stat->freed) {
+      _mi_fprintf(out, arg, "  ");
+      _mi_fprintf(out, arg, (notok == NULL ? "not all freed!" : notok));
+      _mi_fprintf(out, arg, "\n");
+    }
+    else {
+      _mi_fprintf(out, arg, "  ok\n");
+    }
+  }
+  else if (unit<0) {
+    mi_print_amount(stat->peak, -1, out, arg);
+    mi_print_amount(stat->allocated, -1, out, arg);
+    mi_print_amount(stat->freed, -1, out, arg);
+    mi_print_amount(stat->current, -1, out, arg);
+    if (unit==-1) {
+      _mi_fprintf(out, arg, "%22s", "");
+    }
+    else {
+      mi_print_amount(-unit, 1, out, arg);
+      mi_print_count((stat->allocated / -unit), 0, out, arg);
+    }
+    if (stat->allocated > stat->freed)
+      _mi_fprintf(out, arg, "  not all freed!\n");
+    else
+      _mi_fprintf(out, arg, "  ok\n");
+  }
+  else {
+    mi_print_amount(stat->peak, 1, out, arg);
+    mi_print_amount(stat->allocated, 1, out, arg);
+    _mi_fprintf(out, arg, "%11s", " ");  // no freed
+    mi_print_amount(stat->current, 1, out, arg);
+    _mi_fprintf(out, arg, "\n");
+  }
+}
+
+static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) {
+  mi_stat_print_ex(stat, msg, unit, out, arg, NULL);
+}
+
+static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg ) {
+  _mi_fprintf(out, arg, "%10s:", msg);
+  mi_print_amount(stat->total, -1, out, arg);
+  _mi_fprintf(out, arg, "\n");
+}
+
+static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg) {
+  const int64_t avg_tens = (stat->count == 0 ? 0 : (stat->total*10 / stat->count));
+  const long avg_whole = (long)(avg_tens/10);
+  const long avg_frac1 = (long)(avg_tens%10);
+  _mi_fprintf(out, arg, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1);
+}
+
+
+static void mi_print_header(mi_output_fun* out, void* arg ) {
+  _mi_fprintf(out, arg, "%10s: %10s %10s %10s %10s %10s %10s\n", "heap stats", "peak   ", "total   ", "freed   ", "current   ", "unit   ", "count   ");
+}
+
+#if MI_STAT>1
+static void mi_stats_print_bins(const mi_stat_count_t* bins, size_t max, const char* fmt, mi_output_fun* out, void* arg) {
+  bool found = false;
+  char buf[64];
+  for (size_t i = 0; i <= max; i++) {
+    if (bins[i].allocated > 0) {
+      found = true;
+      int64_t unit = _mi_bin_size((uint8_t)i);
+      snprintf(buf, 64, "%s %3lu", fmt, (long)i);
+      mi_stat_print(&bins[i], buf, unit, out, arg);
+    }
+  }
+  if (found) {
+    _mi_fprintf(out, arg, "\n");
+    mi_print_header(out, arg);
+  }
+}
+#endif
+
+
+
+//------------------------------------------------------------
+// Use an output wrapper for line-buffered output
+// (which is nice when using loggers etc.)
+//------------------------------------------------------------
+typedef struct buffered_s {
+  mi_output_fun* out;   // original output function
+  void*          arg;   // and state
+  char*          buf;   // local buffer of at least size `count+1`
+  size_t         used;  // currently used chars `used <= count`
+  size_t         count; // total chars available for output
+} buffered_t;
+
+static void mi_buffered_flush(buffered_t* buf) {
+  buf->buf[buf->used] = 0;
+  _mi_fputs(buf->out, buf->arg, NULL, buf->buf);
+  buf->used = 0;
+}
+
+static void mi_cdecl mi_buffered_out(const char* msg, void* arg) {
+  buffered_t* buf = (buffered_t*)arg;
+  if (msg==NULL || buf==NULL) return;
+  for (const char* src = msg; *src != 0; src++) {
+    char c = *src;
+    if (buf->used >= buf->count) mi_buffered_flush(buf);
+    mi_assert_internal(buf->used < buf->count);
+    buf->buf[buf->used++] = c;
+    if (c == '\n') mi_buffered_flush(buf);
+  }
+}
+
+//------------------------------------------------------------
+// Print statistics
+//------------------------------------------------------------
+
+static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults);
+
+static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept {
+  // wrap the output function to be line buffered
+  char buf[256];
+  buffered_t buffer = { out0, arg0, NULL, 0, 255 };
+  buffer.buf = buf;
+  mi_output_fun* out = &mi_buffered_out;
+  void* arg = &buffer;
+
+  // and print using that
+  mi_print_header(out,arg);
+  #if MI_STAT>1
+  mi_stats_print_bins(stats->normal_bins, MI_BIN_HUGE, "normal",out,arg);
+  #endif
+  #if MI_STAT
+  mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg);
+  mi_stat_print(&stats->large, "large", (stats->large_count.count == 0 ? 1 : -(stats->large.allocated / stats->large_count.count)), out, arg);
+  mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg);
+  mi_stat_count_t total = { 0,0,0,0 };
+  mi_stat_add(&total, &stats->normal, 1);
+  mi_stat_add(&total, &stats->large, 1);
+  mi_stat_add(&total, &stats->huge, 1);
+  mi_stat_print(&total, "total", 1, out, arg);
+  #endif
+  #if MI_STAT>1
+  mi_stat_print(&stats->malloc, "malloc req", 1, out, arg);
+  _mi_fprintf(out, arg, "\n");
+  #endif
+  mi_stat_print_ex(&stats->reserved, "reserved", 1, out, arg, "");
+  mi_stat_print_ex(&stats->committed, "committed", 1, out, arg, "");
+  mi_stat_print(&stats->reset, "reset", 1, out, arg);
+  mi_stat_print(&stats->page_committed, "touched", 1, out, arg);
+  mi_stat_print(&stats->segments, "segments", -1, out, arg);
+  mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg);
+  mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg);
+  mi_stat_print(&stats->pages, "pages", -1, out, arg);
+  mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg);
+  mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg);
+  mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg);
+  mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg);
+  mi_stat_counter_print(&stats->commit_calls, "commits", out, arg);
+  mi_stat_print(&stats->threads, "threads", -1, out, arg);
+  mi_stat_counter_print_avg(&stats->searches, "searches", out, arg);
+  _mi_fprintf(out, arg, "%10s: %7zu\n", "numa nodes", _mi_os_numa_node_count());
+
+  mi_msecs_t elapsed;
+  mi_msecs_t user_time;
+  mi_msecs_t sys_time;
+  size_t current_rss;
+  size_t peak_rss;
+  size_t current_commit;
+  size_t peak_commit;
+  size_t page_faults;
+  mi_stat_process_info(&elapsed, &user_time, &sys_time, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
+  _mi_fprintf(out, arg, "%10s: %7ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000);
+  _mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, rss: ", "process",
+              user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults );
+  mi_printf_amount((int64_t)peak_rss, 1, out, arg, "%s");
+  if (peak_commit > 0) {
+    _mi_fprintf(out, arg, ", commit: ");
+    mi_printf_amount((int64_t)peak_commit, 1, out, arg, "%s");
+  }
+  _mi_fprintf(out, arg, "\n");
+}
+
+static mi_msecs_t mi_process_start; // = 0
+
+static mi_stats_t* mi_stats_get_default(void) {
+  mi_heap_t* heap = mi_heap_get_default();
+  return &heap->tld->stats;
+}
+
+static void mi_stats_merge_from(mi_stats_t* stats) {
+  if (stats != &_mi_stats_main) {
+    mi_stats_add(&_mi_stats_main, stats);
+    memset(stats, 0, sizeof(mi_stats_t));
+  }
+}
+
+void mi_stats_reset(void) mi_attr_noexcept {
+  mi_stats_t* stats = mi_stats_get_default();
+  if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); }
+  memset(&_mi_stats_main, 0, sizeof(mi_stats_t));
+  if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); };
+}
+
+void mi_stats_merge(void) mi_attr_noexcept {
+  mi_stats_merge_from( mi_stats_get_default() );
+}
+
+void _mi_stats_done(mi_stats_t* stats) {  // called from `mi_thread_done`
+  mi_stats_merge_from(stats);
+}
+
+void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
+  mi_stats_merge_from(mi_stats_get_default());
+  _mi_stats_print(&_mi_stats_main, out, arg);
+}
+
+void mi_stats_print(void* out) mi_attr_noexcept {
+  // for compatibility there is an `out` parameter (which can be `stdout` or `stderr`)
+  mi_stats_print_out((mi_output_fun*)out, NULL);
+}
+
+void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
+  _mi_stats_print(mi_stats_get_default(), out, arg);
+}
+
+
+// ----------------------------------------------------------------
+// Basic timer for convenience; use milli-seconds to avoid doubles
+// ----------------------------------------------------------------
+#ifdef _WIN32
+#include <windows.h>
+static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) {
+  static LARGE_INTEGER mfreq; // = 0
+  if (mfreq.QuadPart == 0LL) {
+    LARGE_INTEGER f;
+    QueryPerformanceFrequency(&f);
+    mfreq.QuadPart = f.QuadPart/1000LL;
+    if (mfreq.QuadPart == 0) mfreq.QuadPart = 1;
+  }
+  return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart);
+}
+
+mi_msecs_t _mi_clock_now(void) {
+  LARGE_INTEGER t;
+  QueryPerformanceCounter(&t);
+  return mi_to_msecs(t);
+}
+#else
+#include <time.h>
+#if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC)
+mi_msecs_t _mi_clock_now(void) {
+  struct timespec t;
+  #ifdef CLOCK_MONOTONIC
+  clock_gettime(CLOCK_MONOTONIC, &t);
+  #else
+  clock_gettime(CLOCK_REALTIME, &t);
+  #endif
+  return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000);
+}
+#else
+// low resolution timer
+mi_msecs_t _mi_clock_now(void) {
+  return ((mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000));
+}
+#endif
+#endif
+
+
+static mi_msecs_t mi_clock_diff;
+
+mi_msecs_t _mi_clock_start(void) {
+  if (mi_clock_diff == 0.0) {
+    mi_msecs_t t0 = _mi_clock_now();
+    mi_clock_diff = _mi_clock_now() - t0;
+  }
+  return _mi_clock_now();
+}
+
+mi_msecs_t _mi_clock_end(mi_msecs_t start) {
+  mi_msecs_t end = _mi_clock_now();
+  return (end - start - mi_clock_diff);
+}
+
+
+// --------------------------------------------------------
+// Basic process statistics
+// --------------------------------------------------------
+
+#if defined(_WIN32)
+#include <windows.h>
+
+static mi_msecs_t filetime_msecs(const FILETIME* ftime) {
+  ULARGE_INTEGER i;
+  i.LowPart = ftime->dwLowDateTime;
+  i.HighPart = ftime->dwHighDateTime;
+  mi_msecs_t msecs = (i.QuadPart / 10000); // FILETIME is in 100 nano seconds
+  return msecs;
+}
+
+typedef struct _PROCESS_MEMORY_COUNTERS {
+  DWORD cb;
+  DWORD PageFaultCount;
+  SIZE_T PeakWorkingSetSize;
+  SIZE_T WorkingSetSize;
+  SIZE_T QuotaPeakPagedPoolUsage;
+  SIZE_T QuotaPagedPoolUsage;
+  SIZE_T QuotaPeakNonPagedPoolUsage;
+  SIZE_T QuotaNonPagedPoolUsage;
+  SIZE_T PagefileUsage;
+  SIZE_T PeakPagefileUsage;
+} PROCESS_MEMORY_COUNTERS;
+typedef PROCESS_MEMORY_COUNTERS* PPROCESS_MEMORY_COUNTERS;
+typedef BOOL (WINAPI *PGetProcessMemoryInfo)(HANDLE, PPROCESS_MEMORY_COUNTERS, DWORD);
+static PGetProcessMemoryInfo pGetProcessMemoryInfo = NULL;
+
+static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
+{
+  *elapsed = _mi_clock_end(mi_process_start);
+  FILETIME ct;
+  FILETIME ut;
+  FILETIME st;
+  FILETIME et;
+  GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut);
+  *utime = filetime_msecs(&ut);
+  *stime = filetime_msecs(&st);
+  
+  // load psapi on demand
+  if (pGetProcessMemoryInfo == NULL) {
+    HINSTANCE hDll = LoadLibrary(TEXT("psapi.dll"));
+    if (hDll != NULL) {
+      pGetProcessMemoryInfo = (PGetProcessMemoryInfo)(void (*)(void))GetProcAddress(hDll, "GetProcessMemoryInfo");
+    }
+  }
+
+  // get process info
+  PROCESS_MEMORY_COUNTERS info;
+  memset(&info, 0, sizeof(info));
+  if (pGetProcessMemoryInfo != NULL) {
+    pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
+  } 
+  *current_rss    = (size_t)info.WorkingSetSize;
+  *peak_rss       = (size_t)info.PeakWorkingSetSize;
+  *current_commit = (size_t)info.PagefileUsage;
+  *peak_commit    = (size_t)info.PeakPagefileUsage;
+  *page_faults    = (size_t)info.PageFaultCount;
+}
+
+#elif !defined(__wasi__) && (defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__))
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/resource.h>
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#endif
+
+#if defined(__HAIKU__)
+#include <kernel/OS.h>
+#endif
+
+static mi_msecs_t timeval_secs(const struct timeval* tv) {
+  return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L);
+}
+
+static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
+{
+  *elapsed = _mi_clock_end(mi_process_start);
+  struct rusage rusage;
+  getrusage(RUSAGE_SELF, &rusage);
+  *utime = timeval_secs(&rusage.ru_utime);
+  *stime = timeval_secs(&rusage.ru_stime);
+#if !defined(__HAIKU__)
+  *page_faults = rusage.ru_majflt;
+#endif
+  // estimate commit using our stats
+  *peak_commit    = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
+  *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
+  *current_rss    = *current_commit;  // estimate
+#if defined(__HAIKU__)
+  // Haiku does not have (yet?) a way to
+  // get these stats per process
+  thread_info tid;
+  area_info mem;
+  ssize_t c;
+  get_thread_info(find_thread(0), &tid);
+  while (get_next_area_info(tid.team, &c, &mem) == B_OK) {
+    *peak_rss += mem.ram_size;
+  }
+  *page_faults = 0;
+#elif defined(__APPLE__)
+  *peak_rss = rusage.ru_maxrss;         // BSD reports in bytes
+  struct mach_task_basic_info info;
+  mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
+  if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) {
+    *current_rss = (size_t)info.resident_size;
+  }
+#else
+  *peak_rss = rusage.ru_maxrss * 1024;  // Linux reports in KiB
+#endif
+}
+
+#else
+#ifndef __wasi__
+// WebAssembly instances are not processes
+#pragma message("define a way to get process info")
+#endif
+
+static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
+{
+  *elapsed = _mi_clock_end(mi_process_start);
+  *peak_commit    = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
+  *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
+  *peak_rss    = *peak_commit;
+  *current_rss = *current_commit;
+  *page_faults = 0;
+  *utime = 0;
+  *stime = 0;
+}
+#endif
+
+
+mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept
+{
+  mi_msecs_t elapsed = 0;
+  mi_msecs_t utime = 0;
+  mi_msecs_t stime = 0;
+  size_t current_rss0 = 0;
+  size_t peak_rss0 = 0;
+  size_t current_commit0 = 0;
+  size_t peak_commit0 = 0;
+  size_t page_faults0 = 0;
+  mi_stat_process_info(&elapsed,&utime, &stime, &current_rss0, &peak_rss0, &current_commit0, &peak_commit0, &page_faults0);
+  if (elapsed_msecs!=NULL)  *elapsed_msecs = (elapsed < 0 ? 0 : (elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)elapsed : PTRDIFF_MAX));
+  if (user_msecs!=NULL)     *user_msecs     = (utime < 0 ? 0 : (utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)utime : PTRDIFF_MAX));
+  if (system_msecs!=NULL)   *system_msecs   = (stime < 0 ? 0 : (stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)stime : PTRDIFF_MAX));
+  if (current_rss!=NULL)    *current_rss    = current_rss0;
+  if (peak_rss!=NULL)       *peak_rss       = peak_rss0;
+  if (current_commit!=NULL) *current_commit = current_commit0;
+  if (peak_commit!=NULL)    *peak_commit    = peak_commit0;
+  if (page_faults!=NULL)    *page_faults    = page_faults0;
+}
diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj
index aaa63fe9456fb7..58bbbd8a508e18 100644
--- a/PCbuild/_freeze_module.vcxproj
+++ b/PCbuild/_freeze_module.vcxproj
@@ -147,6 +147,19 @@
     <ClCompile Include="..\Objects\longobject.c" />
     <ClCompile Include="..\Objects\memoryobject.c" />
     <ClCompile Include="..\Objects\methodobject.c" />
+    <ClCompile Include="..\Objects\mimalloc\alloc-aligned.c" />
+    <ClCompile Include="..\Objects\mimalloc\alloc.c" />
+    <ClCompile Include="..\Objects\mimalloc\arena.c" />
+    <ClCompile Include="..\Objects\mimalloc\bitmap.c" />
+    <ClCompile Include="..\Objects\mimalloc\heap.c" />
+    <ClCompile Include="..\Objects\mimalloc\init.c" />
+    <ClCompile Include="..\Objects\mimalloc\options.c" />
+    <ClCompile Include="..\Objects\mimalloc\os.c" />
+    <ClCompile Include="..\Objects\mimalloc\page.c" />
+    <ClCompile Include="..\Objects\mimalloc\random.c" />
+    <ClCompile Include="..\Objects\mimalloc\segment.c" />
+    <ClCompile Include="..\Objects\mimalloc\segment-cache.c" />
+    <ClCompile Include="..\Objects\mimalloc\stats.c" />
     <ClCompile Include="..\Objects\moduleobject.c" />
     <ClCompile Include="..\Objects\namespaceobject.c" />
     <ClCompile Include="..\Objects\object.c" />
diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters
index 279736b0fbbb13..acdf68dc32a0f9 100644
--- a/PCbuild/_freeze_module.vcxproj.filters
+++ b/PCbuild/_freeze_module.vcxproj.filters
@@ -250,6 +250,45 @@
     <ClCompile Include="..\Objects\methodobject.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\alloc-aligned.c">
+      <Filter>Source Files</Filter>
+    </CLCompile>
+    <ClCompile Include="..\Objects\mimalloc\alloc.c">
+      <Filter>Source Files</Filter>
+    </CLCompile>
+    <ClCompile Include="..\Objects\mimalloc\arena.c">
+      <Filter>Source Files</Filter>
+    </CLCompile>
+    <ClCompile Include="..\Objects\mimalloc\bitmap.c">
+      <Filter>Source Files</Filter>
+    </CLCompile>
+    <ClCompile Include="..\Objects\mimalloc\heap.c">
+      <Filter>Source Files</Filter>
+    </CLCompile>
+    <ClCompile Include="..\Objects\mimalloc\init.c">
+      <Filter>Source Files</Filter>
+    </CLCompile>
+    <ClCompile Include="..\Objects\mimalloc\options.c">
+      <Filter>Source Files</Filter>
+    </CLCompile>
+    <ClCompile Include="..\Objects\mimalloc\os.c">
+      <Filter>Source Files</Filter>
+    </CLCompile>
+    <ClCompile Include="..\Objects\mimalloc\page.c">
+      <Filter>Source Files</Filter>
+    </CLCompile>
+    <ClCompile Include="..\Objects\mimalloc\random.c">
+      <Filter>Source Files</Filter>
+    </CLCompile>
+    <ClCompile Include="..\Objects\mimalloc\segment.c">
+      <Filter>Source Files</Filter>
+    </CLCompile>
+    <ClCompile Include="..\Objects\mimalloc\segment-cache.c">
+      <Filter>Source Files</Filter>
+    </CLCompile>
+    <ClCompile Include="..\Objects\mimalloc\stats.c">
+      <Filter>Source Files</Filter>
+    </CLCompile>
     <ClCompile Include="..\Python\modsupport.c">
       <Filter>Source Files</Filter>
     </ClCompile>
diff --git a/PCbuild/pyproject.props b/PCbuild/pyproject.props
index b8d2d3d265543f..437ffa80f1665c 100644
--- a/PCbuild/pyproject.props
+++ b/PCbuild/pyproject.props
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <ItemDefinitionGroup>
     <ClCompile>
-      <AdditionalIncludeDirectories>$(PySourcePath)Include;$(PySourcePath)Include\internal;$(PySourcePath)PC;$(IntDir);%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>$(PySourcePath)Include;$(PySourcePath)Include\internal;$(PySourcePath)Include\mimalloc;$(PySourcePath)PC;$(IntDir);%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;$(_Py3NamePreprocessorDefinition);$(_PlatformPreprocessorDefinition)$(_DebugPreprocessorDefinition)$(_PydPreprocessorDefinition)%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <PreprocessorDefinitions Condition="'$(DisableGil)' == 'true'">Py_NOGIL=1;%(PreprocessorDefinitions)</PreprocessorDefinitions>
 
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index f240779d2a5d4e..a5adc31112e01b 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -302,6 +302,11 @@
     <ClInclude Include="..\Include\marshal.h" />
     <ClInclude Include="..\Include\memoryobject.h" />
     <ClInclude Include="..\Include\methodobject.h" />
+    <ClInclude Include="..\Include\mimalloc\mimalloc-atomic.h" />
+    <ClInclude Include="..\Include\mimalloc\mimalloc-internal.h" />
+    <ClInclude Include="..\Include\mimalloc\mimalloc-track.h" />
+    <ClInclude Include="..\Include\mimalloc\mimalloc-types.h" />
+    <ClInclude Include="..\Include\mimalloc\mimalloc.h" />
     <ClInclude Include="..\Include\modsupport.h" />
     <ClInclude Include="..\Include\moduleobject.h" />
     <ClInclude Include="..\Include\object.h" />
@@ -493,6 +498,19 @@
     <ClCompile Include="..\Objects\longobject.c" />
     <ClCompile Include="..\Objects\memoryobject.c" />
     <ClCompile Include="..\Objects\methodobject.c" />
+    <ClCompile Include="..\Objects\mimalloc\alloc-aligned.c" />
+    <ClCompile Include="..\Objects\mimalloc\alloc.c" />
+    <ClCompile Include="..\Objects\mimalloc\arena.c" />
+    <ClCompile Include="..\Objects\mimalloc\bitmap.c" />
+    <ClCompile Include="..\Objects\mimalloc\heap.c" />
+    <ClCompile Include="..\Objects\mimalloc\init.c" />
+    <ClCompile Include="..\Objects\mimalloc\options.c" />
+    <ClCompile Include="..\Objects\mimalloc\os.c" />
+    <ClCompile Include="..\Objects\mimalloc\page.c" />
+    <ClCompile Include="..\Objects\mimalloc\random.c" />
+    <ClCompile Include="..\Objects\mimalloc\segment.c" />
+    <ClCompile Include="..\Objects\mimalloc\segment-cache.c" />
+    <ClCompile Include="..\Objects\mimalloc\stats.c" />
     <ClCompile Include="..\Objects\moduleobject.c" />
     <ClCompile Include="..\Objects\namespaceobject.c" />
     <ClCompile Include="..\Objects\object.c" />
diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index 6e13eb81018f9b..52a906e34bddbf 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters
@@ -4,6 +4,9 @@
     <Filter Include="Include">
       <UniqueIdentifier>{086b0afb-270c-4603-a02a-63d46f0b2b92}</UniqueIdentifier>
     </Filter>
+    <Filter Include="Include\mimalloc">
+      <UniqueIdentifier>{1dc8d8bd-d493-478e-9b7a-7eb108bb8bd5}</UniqueIdentifier>
+    </Filter>
     <Filter Include="Modules">
       <UniqueIdentifier>{8e81609f-13ca-4eae-9fdb-f8af20c710c7}</UniqueIdentifier>
     </Filter>
@@ -19,6 +22,9 @@
     <Filter Include="Objects">
       <UniqueIdentifier>{ab29a558-143d-4fe7-a039-b431fb429856}</UniqueIdentifier>
     </Filter>
+    <Filter Include="Objects\mimalloc">
+      <UniqueIdentifier>{ad449c55-23cb-44f8-ada4-04b23c9ec388}</UniqueIdentifier>
+    </Filter>
     <Filter Include="Parser">
       <UniqueIdentifier>{97349fee-0abf-48b0-a8f5-771bf39b8aee}</UniqueIdentifier>
     </Filter>
@@ -789,6 +795,21 @@
     <ClInclude Include="..\Include\internal\pycore_uops.h">
       <Filter>Include\internal</Filter>
     </ClInclude>
+    <ClInclude Include="..\Include\mimalloc\mimalloc.h">
+      <Filter>Include\mimalloc</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Include\mimalloc\mimalloc-atomic.h">
+      <Filter>Include\mimalloc</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Include\mimalloc\mimalloc-internal.h">
+      <Filter>Include\mimalloc</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Include\mimalloc\mimalloc-track.h">
+      <Filter>Include\mimalloc</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Include\mimalloc\mimalloc-types.h">
+      <Filter>Include\mimalloc</Filter>
+    </ClInclude>
     <ClInclude Include="$(zlibDir)\crc32.h">
       <Filter>Modules\zlib</Filter>
     </ClInclude>
@@ -1109,6 +1130,45 @@
     <ClCompile Include="..\Objects\methodobject.c">
       <Filter>Objects</Filter>
     </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\alloc.c">
+      <Filter>Objects\mimalloc</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\alloc-aligned.c">
+      <Filter>Objects\mimalloc</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\arena.c">
+      <Filter>Objects\mimalloc</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\bitmap.c">
+      <Filter>Objects\mimalloc</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\heap.c">
+      <Filter>Objects\mimalloc</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\init.c">
+      <Filter>Objects\mimalloc</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\options.c">
+      <Filter>Objects\mimalloc</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\os.c">
+      <Filter>Objects\mimalloc</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\page.c">
+      <Filter>Objects\mimalloc</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\random.c">
+      <Filter>Objects\mimalloc</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\segment.c">
+      <Filter>Objects\mimalloc</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\segment-cache.c">
+      <Filter>Objects\mimalloc</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\stats.c">
+      <Filter>Objects\mimalloc</Filter>
+    </ClCompile>
     <ClCompile Include="..\Objects\moduleobject.c">
       <Filter>Objects</Filter>
     </ClCompile>
diff --git a/Tools/build/smelly.py b/Tools/build/smelly.py
index ab345307ff9b64..654d7e26571885 100755
--- a/Tools/build/smelly.py
+++ b/Tools/build/smelly.py
@@ -7,7 +7,7 @@
 import sysconfig
 
 
-ALLOWED_PREFIXES = ('Py', '_Py')
+ALLOWED_PREFIXES = ('Py', '_Py', 'mi_', '_mi_')
 if sys.platform == 'darwin':
     ALLOWED_PREFIXES += ('__Py',)
 
diff --git a/configure b/configure
index c87f518382f2ec..17fd7744a8d9cb 100755
--- a/configure
+++ b/configure
@@ -860,6 +860,9 @@ DTRACE_OBJS
 DTRACE_HEADERS
 DFLAGS
 DTRACE
+MIMALLOC_OBJS
+WITH_MIMALLOC
+MIMALLOC_HEADERS
 GDBM_LIBS
 GDBM_CFLAGS
 X11_LIBS
@@ -1088,6 +1091,7 @@ enable_loadable_sqlite_extensions
 with_dbmliborder
 enable_ipv6
 with_doc_strings
+with_mimalloc
 with_pymalloc
 with_freelists
 with_c_locale_coercion
@@ -1874,6 +1878,8 @@ Optional Packages:
                           value is a colon separated string with the backend
                           names `ndbm', `gdbm' and `bdb'.
   --with-doc-strings      enable documentation strings (default is yes)
+  --with-mimalloc         build with mimalloc memory allocator (default is yes
+                          if C11 stdatomic.h is available.)
   --with-pymalloc         enable specialized mallocs (default is yes)
   --with-freelists        enable object freelists (default is yes)
   --with-c-locale-coercion
@@ -16767,6 +16773,130 @@ fi
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $with_doc_strings" >&5
 printf "%s\n" "$with_doc_strings" >&6; }
 
+# Check for stdatomic.h, required for mimalloc.
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for stdatomic.h" >&5
+printf %s "checking for stdatomic.h... " >&6; }
+if test ${ac_cv_header_stdatomic_h+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+
+    #include <stdatomic.h>
+    atomic_int int_var;
+    atomic_uintptr_t uintptr_var;
+    int main() {
+      atomic_store_explicit(&int_var, 5, memory_order_relaxed);
+      atomic_store_explicit(&uintptr_var, 0, memory_order_relaxed);
+      int loaded_value = atomic_load_explicit(&int_var, memory_order_seq_cst);
+      return 0;
+    }
+
+
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  ac_cv_header_stdatomic_h=yes
+else $as_nop
+  ac_cv_header_stdatomic_h=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdatomic_h" >&5
+printf "%s\n" "$ac_cv_header_stdatomic_h" >&6; }
+
+if test "x$ac_cv_header_stdatomic_h" = xyes
+then :
+
+
+printf "%s\n" "#define HAVE_STD_ATOMIC 1" >>confdefs.h
+
+
+fi
+
+# Check for GCC >= 4.7 and clang __atomic builtin functions
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for builtin __atomic_load_n and __atomic_store_n functions" >&5
+printf %s "checking for builtin __atomic_load_n and __atomic_store_n functions... " >&6; }
+if test ${ac_cv_builtin_atomic+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+
+    int val;
+    int main() {
+      __atomic_store_n(&val, 1, __ATOMIC_SEQ_CST);
+      (void)__atomic_load_n(&val, __ATOMIC_SEQ_CST);
+      return 0;
+    }
+
+
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  ac_cv_builtin_atomic=yes
+else $as_nop
+  ac_cv_builtin_atomic=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_builtin_atomic" >&5
+printf "%s\n" "$ac_cv_builtin_atomic" >&6; }
+
+if test "x$ac_cv_builtin_atomic" = xyes
+then :
+
+
+printf "%s\n" "#define HAVE_BUILTIN_ATOMIC 1" >>confdefs.h
+
+
+fi
+
+# --with-mimalloc
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for --with-mimalloc" >&5
+printf %s "checking for --with-mimalloc... " >&6; }
+
+# Check whether --with-mimalloc was given.
+if test ${with_mimalloc+y}
+then :
+  withval=$with_mimalloc;
+else $as_nop
+  with_mimalloc="$ac_cv_header_stdatomic_h"
+
+fi
+
+
+if test "$with_mimalloc" != no; then
+  if test "$ac_cv_header_stdatomic_h" != yes; then
+    # mimalloc-atomic.h wants C11 stdatomic.h on POSIX
+    as_fn_error $? "mimalloc requires stdatomic.h, use --without-mimalloc to disable mimalloc." "$LINENO" 5
+  fi
+  with_mimalloc=yes
+
+printf "%s\n" "#define WITH_MIMALLOC 1" >>confdefs.h
+
+  MIMALLOC_HEADERS='$(MIMALLOC_HEADERS)'
+
+  MIMALLOC_HEADERS='$(MIMALLOC_OBJS)'
+
+fi
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $with_mimalloc" >&5
+printf "%s\n" "$with_mimalloc" >&6; }
+
+
+
+
 # Check for Python-specific malloc support
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for --with-pymalloc" >&5
 printf %s "checking for --with-pymalloc... " >&6; }
@@ -26678,6 +26808,7 @@ SRCDIRS="\
   Modules/cjkcodecs \
   Modules/expat \
   Objects \
+  Objects/mimalloc \
   Parser \
   Parser/tokenizer \
   Parser/lexer \
@@ -26836,95 +26967,6 @@ printf "%s\n" "#define HAVE_IPA_PURE_CONST_BUG 1" >>confdefs.h
     esac
 fi
 
-# Check for stdatomic.h
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for stdatomic.h" >&5
-printf %s "checking for stdatomic.h... " >&6; }
-if test ${ac_cv_header_stdatomic_h+y}
-then :
-  printf %s "(cached) " >&6
-else $as_nop
-
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-
-    #include <stdatomic.h>
-    atomic_int int_var;
-    atomic_uintptr_t uintptr_var;
-    int main(void) {
-      atomic_store_explicit(&int_var, 5, memory_order_relaxed);
-      atomic_store_explicit(&uintptr_var, 0, memory_order_relaxed);
-      int loaded_value = atomic_load_explicit(&int_var, memory_order_seq_cst);
-      return 0;
-    }
-
-
-_ACEOF
-if ac_fn_c_try_link "$LINENO"
-then :
-  ac_cv_header_stdatomic_h=yes
-else $as_nop
-  ac_cv_header_stdatomic_h=no
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.beam \
-    conftest$ac_exeext conftest.$ac_ext
-
-fi
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdatomic_h" >&5
-printf "%s\n" "$ac_cv_header_stdatomic_h" >&6; }
-
-if test "x$ac_cv_header_stdatomic_h" = xyes
-then :
-
-
-printf "%s\n" "#define HAVE_STD_ATOMIC 1" >>confdefs.h
-
-
-fi
-
-# Check for GCC >= 4.7 and clang __atomic builtin functions
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for builtin __atomic_load_n and __atomic_store_n functions" >&5
-printf %s "checking for builtin __atomic_load_n and __atomic_store_n functions... " >&6; }
-if test ${ac_cv_builtin_atomic+y}
-then :
-  printf %s "(cached) " >&6
-else $as_nop
-
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-
-    int val;
-    int main(void) {
-      __atomic_store_n(&val, 1, __ATOMIC_SEQ_CST);
-      (void)__atomic_load_n(&val, __ATOMIC_SEQ_CST);
-      return 0;
-    }
-
-
-_ACEOF
-if ac_fn_c_try_link "$LINENO"
-then :
-  ac_cv_builtin_atomic=yes
-else $as_nop
-  ac_cv_builtin_atomic=no
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.beam \
-    conftest$ac_exeext conftest.$ac_ext
-
-fi
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_builtin_atomic" >&5
-printf "%s\n" "$ac_cv_builtin_atomic" >&6; }
-
-if test "x$ac_cv_builtin_atomic" = xyes
-then :
-
-
-printf "%s\n" "#define HAVE_BUILTIN_ATOMIC 1" >>confdefs.h
-
-
-fi
-
 # ensurepip option
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for ensurepip" >&5
 printf %s "checking for ensurepip... " >&6; }
@@ -32187,3 +32229,8 @@ CPython core team, see https://peps.python.org/pep-0011/ for more information.
 " >&2;}
 fi
 
+if test "$ac_cv_header_stdatomic_h" != "yes"; then
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: Your compiler or platform does have a working C11 stdatomic.h. A future version of Python may require stdatomic.h." >&5
+printf "%s\n" "$as_me: Your compiler or platform does have a working C11 stdatomic.h. A future version of Python may require stdatomic.h." >&6;}
+fi
+
diff --git a/configure.ac b/configure.ac
index cd69f0ede54496..305d31b373b4ac 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4497,6 +4497,73 @@ then
 fi
 AC_MSG_RESULT([$with_doc_strings])
 
+# Check for stdatomic.h, required for mimalloc.
+AC_CACHE_CHECK([for stdatomic.h], [ac_cv_header_stdatomic_h], [
+AC_LINK_IFELSE(
+[
+  AC_LANG_SOURCE([[
+    #include <stdatomic.h>
+    atomic_int int_var;
+    atomic_uintptr_t uintptr_var;
+    int main() {
+      atomic_store_explicit(&int_var, 5, memory_order_relaxed);
+      atomic_store_explicit(&uintptr_var, 0, memory_order_relaxed);
+      int loaded_value = atomic_load_explicit(&int_var, memory_order_seq_cst);
+      return 0;
+    }
+  ]])
+],[ac_cv_header_stdatomic_h=yes],[ac_cv_header_stdatomic_h=no])
+])
+
+AS_VAR_IF([ac_cv_header_stdatomic_h], [yes], [
+    AC_DEFINE(HAVE_STD_ATOMIC, 1,
+              [Has stdatomic.h with atomic_int and atomic_uintptr_t])
+])
+
+# Check for GCC >= 4.7 and clang __atomic builtin functions
+AC_CACHE_CHECK([for builtin __atomic_load_n and __atomic_store_n functions], [ac_cv_builtin_atomic], [
+AC_LINK_IFELSE(
+[
+  AC_LANG_SOURCE([[
+    int val;
+    int main() {
+      __atomic_store_n(&val, 1, __ATOMIC_SEQ_CST);
+      (void)__atomic_load_n(&val, __ATOMIC_SEQ_CST);
+      return 0;
+    }
+  ]])
+],[ac_cv_builtin_atomic=yes],[ac_cv_builtin_atomic=no])
+])
+
+AS_VAR_IF([ac_cv_builtin_atomic], [yes], [
+    AC_DEFINE(HAVE_BUILTIN_ATOMIC, 1, [Has builtin __atomic_load_n() and __atomic_store_n() functions])
+])
+
+# --with-mimalloc
+AC_MSG_CHECKING([for --with-mimalloc])
+AC_ARG_WITH([mimalloc],
+  [AS_HELP_STRING([--with-mimalloc],
+                 [build with mimalloc memory allocator (default is yes if C11 stdatomic.h is available.)])],
+  [],
+  [with_mimalloc="$ac_cv_header_stdatomic_h"]
+)
+
+if test "$with_mimalloc" != no; then
+  if test "$ac_cv_header_stdatomic_h" != yes; then
+    # mimalloc-atomic.h wants C11 stdatomic.h on POSIX
+    AC_MSG_ERROR([mimalloc requires stdatomic.h, use --without-mimalloc to disable mimalloc.])
+  fi
+  with_mimalloc=yes
+  AC_DEFINE([WITH_MIMALLOC], [1], [Define if you want to compile in mimalloc memory allocator.])
+  AC_SUBST([MIMALLOC_HEADERS], ['$(MIMALLOC_HEADERS)'])
+  AC_SUBST([MIMALLOC_OBJS], ['$(MIMALLOC_OBJS)'])
+fi
+
+AC_MSG_RESULT([$with_mimalloc])
+AC_SUBST([WITH_MIMALLOC])
+AC_SUBST([MIMALLOC_HEADERS])
+AC_SUBST([MIMALLOC_OBJS])
+
 # Check for Python-specific malloc support
 AC_MSG_CHECKING([for --with-pymalloc])
 AC_ARG_WITH(
@@ -6525,6 +6592,7 @@ SRCDIRS="\
   Modules/cjkcodecs \
   Modules/expat \
   Objects \
+  Objects/mimalloc \
   Parser \
   Parser/tokenizer \
   Parser/lexer \
@@ -6622,49 +6690,6 @@ if test "$ac_cv_gcc_asm_for_x87" = yes; then
     esac
 fi
 
-# Check for stdatomic.h
-AC_CACHE_CHECK([for stdatomic.h], [ac_cv_header_stdatomic_h], [
-AC_LINK_IFELSE(
-[
-  AC_LANG_SOURCE([[
-    #include <stdatomic.h>
-    atomic_int int_var;
-    atomic_uintptr_t uintptr_var;
-    int main(void) {
-      atomic_store_explicit(&int_var, 5, memory_order_relaxed);
-      atomic_store_explicit(&uintptr_var, 0, memory_order_relaxed);
-      int loaded_value = atomic_load_explicit(&int_var, memory_order_seq_cst);
-      return 0;
-    }
-  ]])
-],[ac_cv_header_stdatomic_h=yes],[ac_cv_header_stdatomic_h=no])
-])
-
-AS_VAR_IF([ac_cv_header_stdatomic_h], [yes], [
-    AC_DEFINE([HAVE_STD_ATOMIC], [1],
-              [Has stdatomic.h with atomic_int and atomic_uintptr_t])
-])
-
-# Check for GCC >= 4.7 and clang __atomic builtin functions
-AC_CACHE_CHECK([for builtin __atomic_load_n and __atomic_store_n functions], [ac_cv_builtin_atomic], [
-AC_LINK_IFELSE(
-[
-  AC_LANG_SOURCE([[
-    int val;
-    int main(void) {
-      __atomic_store_n(&val, 1, __ATOMIC_SEQ_CST);
-      (void)__atomic_load_n(&val, __ATOMIC_SEQ_CST);
-      return 0;
-    }
-  ]])
-],[ac_cv_builtin_atomic=yes],[ac_cv_builtin_atomic=no])
-])
-
-AS_VAR_IF([ac_cv_builtin_atomic], [yes], [
-    AC_DEFINE([HAVE_BUILTIN_ATOMIC], [1],
-      [Has builtin __atomic_load_n() and __atomic_store_n() functions])
-])
-
 # ensurepip option
 AC_MSG_CHECKING([for ensurepip])
 AC_ARG_WITH([ensurepip],
@@ -7392,3 +7417,10 @@ AS_VAR_IF([PY_SUPPORT_TIER], [0], [AC_MSG_WARN([
 Platform "$host" with compiler "$ac_cv_cc_name" is not supported by the
 CPython core team, see https://peps.python.org/pep-0011/ for more information.
 ])])
+
+if test "$ac_cv_header_stdatomic_h" != "yes"; then
+  AC_MSG_NOTICE(m4_normalize([
+    Your compiler or platform does have a working C11 stdatomic.h. A future
+    version of Python may require stdatomic.h.
+  ]))
+fi
diff --git a/pyconfig.h.in b/pyconfig.h.in
index 4b8fc4a677c776..2bbc1bdd52dc0e 100644
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@@ -1827,6 +1827,9 @@
 /* Define to 1 if libintl is needed for locale functions. */
 #undef WITH_LIBINTL
 
+/* Define if you want to compile in mimalloc memory allocator. */
+#undef WITH_MIMALLOC
+
 /* Define if you want to produce an OpenStep/Rhapsody framework (shared
    library plus accessory files). */
 #undef WITH_NEXT_FRAMEWORK

From 02888411e745f0022593995b14744cd4f319b070 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Thu, 21 Sep 2023 11:24:15 -0700
Subject: [PATCH 02/24] Upgrade to mimalloc 2.12

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
 Include/mimalloc/mimalloc-track.h             |   62 -
 Include/mimalloc/mimalloc.h                   |   82 +-
 .../{mimalloc-atomic.h => mimalloc/atomic.h}  |   63 +-
 .../internal.h}                               |  315 ++--
 Include/mimalloc/mimalloc/prim.h              |  323 ++++
 Include/mimalloc/mimalloc/track.h             |  147 ++
 .../{mimalloc-types.h => mimalloc/types.h}    |  109 +-
 Makefile.pre.in                               |   16 +-
 Objects/mimalloc/alloc-aligned.c              |   92 +-
 Objects/mimalloc/alloc-override.c             |  297 ++++
 Objects/mimalloc/alloc-posix.c                |    7 +-
 Objects/mimalloc/alloc.c                      |  223 +--
 Objects/mimalloc/arena.c                      |  734 ++++++---
 Objects/mimalloc/bitmap.c                     |  102 +-
 Objects/mimalloc/bitmap.h                     |    6 +-
 Objects/mimalloc/heap.c                       |   68 +-
 Objects/mimalloc/init.c                       |  189 ++-
 Objects/mimalloc/options.c                    |  269 ++--
 Objects/mimalloc/os.c                         | 1332 ++++-------------
 Objects/mimalloc/page.c                       |   61 +-
 .../mimalloc/prim/osx/alloc-override-zone.c   |  458 ++++++
 Objects/mimalloc/prim/osx/prim.c              |    9 +
 Objects/mimalloc/prim/prim.c                  |   24 +
 Objects/mimalloc/prim/readme.md               |    9 +
 Objects/mimalloc/prim/unix/prim.c             |  859 +++++++++++
 Objects/mimalloc/prim/wasi/prim.c             |  275 ++++
 .../mimalloc/prim/windows/etw-mimalloc.wprp   |   61 +
 Objects/mimalloc/prim/windows/etw.h           |  905 +++++++++++
 Objects/mimalloc/prim/windows/etw.man         |  Bin 0 -> 3926 bytes
 Objects/mimalloc/prim/windows/prim.c          |  622 ++++++++
 Objects/mimalloc/prim/windows/readme.md       |   17 +
 Objects/mimalloc/random.c                     |  162 +-
 Objects/mimalloc/region.c                     |  516 -------
 Objects/mimalloc/segment-cache.c              |  409 -----
 Objects/mimalloc/segment-map.c                |  153 ++
 Objects/mimalloc/segment.c                    |  332 ++--
 Objects/mimalloc/stats.c                      |  259 +---
 PCbuild/_freeze_module.vcxproj                |    3 +-
 PCbuild/_freeze_module.vcxproj.filters        |    5 +-
 PCbuild/pythoncore.vcxproj                    |   12 +-
 PCbuild/pythoncore.vcxproj.filters            |   16 +-
 configure                                     |    5 +-
 configure.ac                                  |    1 +
 43 files changed, 6043 insertions(+), 3566 deletions(-)
 delete mode 100644 Include/mimalloc/mimalloc-track.h
 rename Include/mimalloc/{mimalloc-atomic.h => mimalloc/atomic.h} (87%)
 rename Include/mimalloc/{mimalloc-internal.h => mimalloc/internal.h} (74%)
 create mode 100644 Include/mimalloc/mimalloc/prim.h
 create mode 100644 Include/mimalloc/mimalloc/track.h
 rename Include/mimalloc/{mimalloc-types.h => mimalloc/types.h} (85%)
 create mode 100644 Objects/mimalloc/alloc-override.c
 create mode 100644 Objects/mimalloc/prim/osx/alloc-override-zone.c
 create mode 100644 Objects/mimalloc/prim/osx/prim.c
 create mode 100644 Objects/mimalloc/prim/prim.c
 create mode 100644 Objects/mimalloc/prim/readme.md
 create mode 100644 Objects/mimalloc/prim/unix/prim.c
 create mode 100644 Objects/mimalloc/prim/wasi/prim.c
 create mode 100644 Objects/mimalloc/prim/windows/etw-mimalloc.wprp
 create mode 100644 Objects/mimalloc/prim/windows/etw.h
 create mode 100644 Objects/mimalloc/prim/windows/etw.man
 create mode 100644 Objects/mimalloc/prim/windows/prim.c
 create mode 100644 Objects/mimalloc/prim/windows/readme.md
 delete mode 100644 Objects/mimalloc/region.c
 delete mode 100644 Objects/mimalloc/segment-cache.c
 create mode 100644 Objects/mimalloc/segment-map.c

diff --git a/Include/mimalloc/mimalloc-track.h b/Include/mimalloc/mimalloc-track.h
deleted file mode 100644
index f60d7acd0b8fcd..00000000000000
--- a/Include/mimalloc/mimalloc-track.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* ----------------------------------------------------------------------------
-Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
-This is free software; you can redistribute it and/or modify it under the
-terms of the MIT license. A copy of the license can be found in the file
-"LICENSE" at the root of this distribution.
------------------------------------------------------------------------------*/
-#pragma once
-#ifndef MIMALLOC_TRACK_H
-#define MIMALLOC_TRACK_H
-
-// ------------------------------------------------------
-// Track memory ranges with macros for tools like Valgrind
-// address sanitizer, or other memory checkers.
-// ------------------------------------------------------
-
-#if MI_VALGRIND
-
-#define MI_TRACK_ENABLED 1
-#define MI_TRACK_TOOL    "valgrind"
-
-#include <valgrind/valgrind.h>
-#include <valgrind/memcheck.h>
-
-#define mi_track_malloc(p,size,zero)        VALGRIND_MALLOCLIKE_BLOCK(p,size,MI_PADDING_SIZE /*red zone*/,zero)
-#define mi_track_resize(p,oldsize,newsize)  VALGRIND_RESIZEINPLACE_BLOCK(p,oldsize,newsize,MI_PADDING_SIZE /*red zone*/)
-#define mi_track_free(p)                    VALGRIND_FREELIKE_BLOCK(p,MI_PADDING_SIZE /*red zone*/)
-#define mi_track_free_size(p,_size)         mi_track_free(p)
-#define mi_track_mem_defined(p,size)        VALGRIND_MAKE_MEM_DEFINED(p,size)
-#define mi_track_mem_undefined(p,size)      VALGRIND_MAKE_MEM_UNDEFINED(p,size)
-#define mi_track_mem_noaccess(p,size)       VALGRIND_MAKE_MEM_NOACCESS(p,size)
-
-#elif MI_ASAN
-
-#define MI_TRACK_ENABLED 1
-#define MI_TRACK_TOOL    "asan"
-
-#include <sanitizer/asan_interface.h>
-
-#define mi_track_malloc(p,size,zero)        ASAN_UNPOISON_MEMORY_REGION(p,size)
-#define mi_track_resize(p,oldsize,newsize)  ASAN_POISON_MEMORY_REGION(p,oldsize); ASAN_UNPOISON_MEMORY_REGION(p,newsize)
-#define mi_track_free(p)                    ASAN_POISON_MEMORY_REGION(p,mi_usable_size(p))
-#define mi_track_free_size(p,size)          ASAN_POISON_MEMORY_REGION(p,size)
-#define mi_track_mem_defined(p,size)        ASAN_UNPOISON_MEMORY_REGION(p,size)
-#define mi_track_mem_undefined(p,size)      ASAN_UNPOISON_MEMORY_REGION(p,size)
-#define mi_track_mem_noaccess(p,size)       ASAN_POISON_MEMORY_REGION(p,size)
-
-#else
-
-#define MI_TRACK_ENABLED 0
-#define MI_TRACK_TOOL    "none"
-
-#define mi_track_malloc(p,size,zero)
-#define mi_track_resize(p,oldsize,newsize)
-#define mi_track_free(p)
-#define mi_track_free_size(p,_size)
-#define mi_track_mem_defined(p,size)
-#define mi_track_mem_undefined(p,size)
-#define mi_track_mem_noaccess(p,size)
-
-#endif
-
-#endif
diff --git a/Include/mimalloc/mimalloc.h b/Include/mimalloc/mimalloc.h
index 9b72fbfda7411a..f77c2ea17008ec 100644
--- a/Include/mimalloc/mimalloc.h
+++ b/Include/mimalloc/mimalloc.h
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #ifndef MIMALLOC_H
 #define MIMALLOC_H
 
-#define MI_MALLOC_VERSION 209   // major + 2 digits minor
+#define MI_MALLOC_VERSION 212   // major + 2 digits minor
 
 // ------------------------------------------------------
 // Compiler specific attributes
@@ -284,7 +284,7 @@ mi_decl_export int   mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node,
 mi_decl_export int   mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
 mi_decl_export bool  mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
 
-#if MI_MALLOC_VERSION >= 200
+#if MI_MALLOC_VERSION >= 182
 // Create a heap that only allocates in the specified arena
 mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id);
 #endif
@@ -318,35 +318,40 @@ mi_decl_export int  mi_reserve_huge_os_pages(size_t pages, double max_secs, size
 
 typedef enum mi_option_e {
   // stable options
-  mi_option_show_errors,
-  mi_option_show_stats,
-  mi_option_verbose,
-  // some of the following options are experimental
-  // (deprecated options are kept for binary backward compatibility with v1.x versions)
-  mi_option_eager_commit,
-  mi_option_deprecated_eager_region_commit,
-  mi_option_deprecated_reset_decommits,
-  mi_option_large_os_pages,           // use large (2MiB) OS pages, implies eager commit
-  mi_option_reserve_huge_os_pages,    // reserve N huge OS pages (1GiB) at startup
+  mi_option_show_errors,              // print error messages
+  mi_option_show_stats,               // print statistics on termination
+  mi_option_verbose,                  // print verbose messages
+  // the following options are experimental (see src/options.h)
+  mi_option_eager_commit,             // eager commit segments? (after `eager_commit_delay` segments) (=1)
+  mi_option_arena_eager_commit,       // eager commit arenas? Use 2 to enable just on overcommit systems (=2)
+  mi_option_purge_decommits,          // should a memory purge decommit (or only reset) (=1)
+  mi_option_allow_large_os_pages,     // allow large (2MiB) OS pages, implies eager commit
+  mi_option_reserve_huge_os_pages,    // reserve N huge OS pages (1GiB/page) at startup
   mi_option_reserve_huge_os_pages_at, // reserve huge OS pages at a specific NUMA node
-  mi_option_reserve_os_memory,        // reserve specified amount of OS memory at startup
+  mi_option_reserve_os_memory,        // reserve specified amount of OS memory in an arena at startup
   mi_option_deprecated_segment_cache,
-  mi_option_page_reset,
-  mi_option_abandoned_page_decommit,
-  mi_option_deprecated_segment_reset,
-  mi_option_eager_commit_delay,
-  mi_option_decommit_delay,
-  mi_option_use_numa_nodes,           // 0 = use available numa nodes, otherwise use at most N nodes.
-  mi_option_limit_os_alloc,           // 1 = do not use OS memory for allocation (but only reserved arenas)
-  mi_option_os_tag,
-  mi_option_max_errors,
-  mi_option_max_warnings,
-  mi_option_max_segment_reclaim,
-  mi_option_allow_decommit,
-  mi_option_segment_decommit_delay,  
-  mi_option_decommit_extend_delay,
-  mi_option_destroy_on_exit,          
-  _mi_option_last
+  mi_option_deprecated_page_reset,
+  mi_option_abandoned_page_purge,     // immediately purge delayed purges on thread termination
+  mi_option_deprecated_segment_reset, 
+  mi_option_eager_commit_delay,       
+  mi_option_purge_delay,              // memory purging is delayed by N milli seconds; use 0 for immediate purging or -1 for no purging at all.
+  mi_option_use_numa_nodes,           // 0 = use all available numa nodes, otherwise use at most N nodes.
+  mi_option_limit_os_alloc,           // 1 = do not use OS memory for allocation (but only programmatically reserved arenas)
+  mi_option_os_tag,                   // tag used for OS logging (macOS only for now)
+  mi_option_max_errors,               // issue at most N error messages
+  mi_option_max_warnings,             // issue at most N warning messages
+  mi_option_max_segment_reclaim,      
+  mi_option_destroy_on_exit,          // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe.
+  mi_option_arena_reserve,            // initial memory size in KiB for arena reservation (1GiB on 64-bit)
+  mi_option_arena_purge_mult,         
+  mi_option_purge_extend_delay,
+  _mi_option_last,
+  // legacy option names
+  mi_option_large_os_pages = mi_option_allow_large_os_pages,
+  mi_option_eager_region_commit = mi_option_arena_eager_commit,
+  mi_option_reset_decommits = mi_option_purge_decommits,
+  mi_option_reset_delay = mi_option_purge_delay,
+  mi_option_abandoned_page_reset = mi_option_abandoned_page_purge
 } mi_option_t;
 
 
@@ -356,8 +361,9 @@ mi_decl_export void mi_option_disable(mi_option_t option);
 mi_decl_export void mi_option_set_enabled(mi_option_t option, bool enable);
 mi_decl_export void mi_option_set_enabled_default(mi_option_t option, bool enable);
 
-mi_decl_nodiscard mi_decl_export long mi_option_get(mi_option_t option);
-mi_decl_nodiscard mi_decl_export long mi_option_get_clamp(mi_option_t option, long min, long max);
+mi_decl_nodiscard mi_decl_export long   mi_option_get(mi_option_t option);
+mi_decl_nodiscard mi_decl_export long   mi_option_get_clamp(mi_option_t option, long min, long max);
+mi_decl_nodiscard mi_decl_export size_t mi_option_get_size(mi_option_t option);
 mi_decl_export void mi_option_set(mi_option_t option, long value);
 mi_decl_export void mi_option_set_default(mi_option_t option, long value);
 
@@ -477,11 +483,13 @@ template<class T1,class T2> bool operator==(const mi_stl_allocator<T1>& , const
 template<class T1,class T2> bool operator!=(const mi_stl_allocator<T1>& , const mi_stl_allocator<T2>& ) mi_attr_noexcept { return false; }
 
 
-#if (__cplusplus >= 201103L) || (_MSC_VER > 1900)  // C++11
+#if (__cplusplus >= 201103L) || (_MSC_VER >= 1900)  // C++11
+#define MI_HAS_HEAP_STL_ALLOCATOR 1
+
 #include <memory>      // std::shared_ptr
 
 // Common base class for STL allocators in a specific heap
-template<class T, bool destroy> struct _mi_heap_stl_allocator_common : public _mi_stl_allocator_common<T> {
+template<class T, bool _mi_destroy> struct _mi_heap_stl_allocator_common : public _mi_stl_allocator_common<T> {
   using typename _mi_stl_allocator_common<T>::size_type;
   using typename _mi_stl_allocator_common<T>::value_type;
   using typename _mi_stl_allocator_common<T>::pointer;
@@ -500,7 +508,7 @@ template<class T, bool destroy> struct _mi_heap_stl_allocator_common : public _m
   #endif
 
   void collect(bool force) { mi_heap_collect(this->heap.get(), force); }
-  template<class U> bool is_equal(const _mi_heap_stl_allocator_common<U, destroy>& x) const { return (this->heap == x.heap); }
+  template<class U> bool is_equal(const _mi_heap_stl_allocator_common<U, _mi_destroy>& x) const { return (this->heap == x.heap); }
 
 protected:
   std::shared_ptr<mi_heap_t> heap;
@@ -508,10 +516,10 @@ template<class T, bool destroy> struct _mi_heap_stl_allocator_common : public _m
   
   _mi_heap_stl_allocator_common() {
     mi_heap_t* hp = mi_heap_new();
-    this->heap.reset(hp, (destroy ? &heap_destroy : &heap_delete));  /* calls heap_delete/destroy when the refcount drops to zero */
+    this->heap.reset(hp, (_mi_destroy ? &heap_destroy : &heap_delete));  /* calls heap_delete/destroy when the refcount drops to zero */
   }
   _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common& x) mi_attr_noexcept : heap(x.heap) { }
-  template<class U> _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common<U, destroy>& x) mi_attr_noexcept : heap(x.heap) { }
+  template<class U> _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common<U, _mi_destroy>& x) mi_attr_noexcept : heap(x.heap) { }
 
 private:
   static void heap_delete(mi_heap_t* hp)  { if (hp != NULL) { mi_heap_delete(hp); } }
diff --git a/Include/mimalloc/mimalloc-atomic.h b/Include/mimalloc/mimalloc/atomic.h
similarity index 87%
rename from Include/mimalloc/mimalloc-atomic.h
rename to Include/mimalloc/mimalloc/atomic.h
index c66f80493321ee..fe418fab314a40 100644
--- a/Include/mimalloc/mimalloc-atomic.h
+++ b/Include/mimalloc/mimalloc/atomic.h
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2021 Microsoft Research, Daan Leijen
+Copyright (c) 2018-2023 Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -39,7 +39,11 @@ terms of the MIT license. A copy of the license can be found in the file
 #include <stdatomic.h>
 #define  mi_atomic(name)        atomic_##name
 #define  mi_memory_order(name)  memory_order_##name
-#define  MI_ATOMIC_VAR_INIT(x)  ATOMIC_VAR_INIT(x)
+#if !defined(ATOMIC_VAR_INIT) || (__STDC_VERSION__ >= 201710L) // c17, see issue #735
+ #define MI_ATOMIC_VAR_INIT(x) x
+#else
+ #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x)
+#endif
 #endif
 
 // Various defines for all used memory orders in mimalloc
@@ -113,11 +117,13 @@ static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) {
 }
 
 // Used by timers
-#define mi_atomic_loadi64_acquire(p)    mi_atomic(load_explicit)(p,mi_memory_order(acquire))
-#define mi_atomic_loadi64_relaxed(p)    mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
-#define mi_atomic_storei64_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release))
-#define mi_atomic_storei64_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_loadi64_acquire(p)            mi_atomic(load_explicit)(p,mi_memory_order(acquire))
+#define mi_atomic_loadi64_relaxed(p)            mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
+#define mi_atomic_storei64_release(p,x)         mi_atomic(store_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_storei64_relaxed(p,x)         mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
 
+#define mi_atomic_casi64_strong_acq_rel(p,e,d)  mi_atomic_cas_strong_acq_rel(p,e,d)
+#define mi_atomic_addi64_acq_rel(p,i)           mi_atomic_add_acq_rel(p,i)
 
 
 #elif defined(_MSC_VER)
@@ -245,6 +251,21 @@ static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t
   } while (current < x && _InterlockedCompareExchange64(p, x, current) != current);
 }
 
+static inline void mi_atomic_addi64_acq_rel(volatile _Atomic(int64_t*)p, int64_t i) {
+  mi_atomic_addi64_relaxed(p, i);
+}
+
+static inline bool mi_atomic_casi64_strong_acq_rel(volatile _Atomic(int64_t*)p, int64_t* exp, int64_t des) {
+  int64_t read = _InterlockedCompareExchange64(p, des, *exp);
+  if (read == *exp) {
+    return true;
+  }
+  else {
+    *exp = read;
+    return false;
+  }
+}
+
 // The pointer macros cast to `uintptr_t`.
 #define mi_atomic_load_ptr_acquire(tp,p)                (tp*)mi_atomic_load_acquire((_Atomic(uintptr_t)*)(p))
 #define mi_atomic_load_ptr_relaxed(tp,p)                (tp*)mi_atomic_load_relaxed((_Atomic(uintptr_t)*)(p))
@@ -275,6 +296,26 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) {
   return (intptr_t)mi_atomic_addi(p, -sub);
 }
 
+typedef _Atomic(uintptr_t) mi_atomic_once_t;
+
+// Returns true only on the first invocation
+static inline bool mi_atomic_once( mi_atomic_once_t* once ) {
+  if (mi_atomic_load_relaxed(once) != 0) return false;     // quick test 
+  uintptr_t expected = 0;
+  return mi_atomic_cas_strong_acq_rel(once, &expected, (uintptr_t)1); // try to set to 1
+}
+
+typedef _Atomic(uintptr_t) mi_atomic_guard_t;
+
+// Allows only one thread to execute at a time
+#define mi_atomic_guard(guard) \
+  uintptr_t _mi_guard_expected = 0; \
+  for(bool _mi_guard_once = true; \
+      _mi_guard_once && mi_atomic_cas_strong_acq_rel(guard,&_mi_guard_expected,(uintptr_t)1); \
+      (mi_atomic_store_release(guard,(uintptr_t)0), _mi_guard_once = false) )
+
+
+
 // Yield
 #if defined(__cplusplus)
 #include <thread>
@@ -294,7 +335,7 @@ static inline void mi_atomic_yield(void) {
 }
 #elif (defined(__GNUC__) || defined(__clang__)) && \
       (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__armel__) || defined(__ARMEL__) || \
-       defined(__aarch64__) || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__))
+       defined(__aarch64__) || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)) || defined(__POWERPC__)
 #if defined(__x86_64__) || defined(__i386__)
 static inline void mi_atomic_yield(void) {
   __asm__ volatile ("pause" ::: "memory");
@@ -307,10 +348,16 @@ static inline void mi_atomic_yield(void) {
 static inline void mi_atomic_yield(void) {
   __asm__ volatile("yield" ::: "memory");
 }
-#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
+#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__)
+#ifdef __APPLE__
+static inline void mi_atomic_yield(void) {
+  __asm__ volatile ("or r27,r27,r27" ::: "memory");
+}
+#else
 static inline void mi_atomic_yield(void) {
   __asm__ __volatile__ ("or 27,27,27" ::: "memory");
 }
+#endif
 #elif defined(__armel__) || defined(__ARMEL__)
 static inline void mi_atomic_yield(void) {
   __asm__ volatile ("nop" ::: "memory");
diff --git a/Include/mimalloc/mimalloc-internal.h b/Include/mimalloc/mimalloc/internal.h
similarity index 74%
rename from Include/mimalloc/mimalloc-internal.h
rename to Include/mimalloc/mimalloc/internal.h
index a68e69662c741d..00d2626095b195 100644
--- a/Include/mimalloc/mimalloc-internal.h
+++ b/Include/mimalloc/mimalloc/internal.h
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -8,8 +8,14 @@ terms of the MIT license. A copy of the license can be found in the file
 #ifndef MIMALLOC_INTERNAL_H
 #define MIMALLOC_INTERNAL_H
 
-#include "mimalloc-types.h"
-#include "mimalloc-track.h"
+
+// --------------------------------------------------------------------------
+// This file contains the interal API's of mimalloc and various utility
+// functions and macros.
+// --------------------------------------------------------------------------
+
+#include "mimalloc/types.h"
+#include "mimalloc/track.h"
 
 #if (MI_DEBUG>0)
 #define mi_trace_message(...)  _mi_trace_message(__VA_ARGS__)
@@ -44,6 +50,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #define mi_decl_externc
 #endif
 
+// pthreads
 #if !defined(_WIN32) && !defined(__wasi__)
 #define  MI_USE_PTHREADS
 #include <pthread.h>
@@ -73,39 +80,52 @@ extern mi_decl_cache_align mi_stats_t       _mi_stats_main;
 extern mi_decl_cache_align const mi_page_t  _mi_page_empty;
 bool       _mi_is_main_thread(void);
 size_t     _mi_current_thread_count(void);
-bool       _mi_preloading(void);  // true while the C runtime is not ready
+bool       _mi_preloading(void);           // true while the C runtime is not initialized yet
+mi_threadid_t _mi_thread_id(void) mi_attr_noexcept;
+mi_heap_t*    _mi_heap_main_get(void);     // statically allocated main backing heap
+void       _mi_thread_done(mi_heap_t* heap);
+void       _mi_thread_data_collect(void);
 
 // os.c
-size_t     _mi_os_page_size(void);
-void       _mi_os_init(void);                                      // called from process init
-void*      _mi_os_alloc(size_t size, mi_stats_t* stats);           // to allocate thread local data
-void       _mi_os_free(void* p, size_t size, mi_stats_t* stats);   // to free thread local data
+void       _mi_os_init(void);                                            // called from process init
+void*      _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats);  
+void       _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats);
+void       _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats);
 
-bool       _mi_os_protect(void* addr, size_t size);
-bool       _mi_os_unprotect(void* addr, size_t size);
-bool       _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* stats);
-bool       _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
-bool       _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
-// bool       _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+size_t     _mi_os_page_size(void);
 size_t     _mi_os_good_alloc_size(size_t size);
 bool       _mi_os_has_overcommit(void);
+bool       _mi_os_has_virtual_reserve(void);
+
+bool       _mi_os_purge(void* p, size_t size, mi_stats_t* stats);
 bool       _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats);
+bool       _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+bool       _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
+bool       _mi_os_protect(void* addr, size_t size);
+bool       _mi_os_unprotect(void* addr, size_t size);
+bool       _mi_os_purge(void* p, size_t size, mi_stats_t* stats);
+bool       _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats);
 
-void*      _mi_os_alloc_aligned_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool* large, mi_stats_t* tld_stats);
-void       _mi_os_free_aligned(void* p, size_t size, size_t alignment, size_t align_offset, bool was_committed, mi_stats_t* tld_stats);
+void*      _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats);
+void*      _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats);
+
+void*      _mi_os_get_aligned_hint(size_t try_alignment, size_t size);
+bool       _mi_os_use_large_page(size_t size, size_t alignment);
+size_t     _mi_os_large_page_size(void);
+
+void*      _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid);
 
 // arena.c
-void*      _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld);
-void*      _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld);
-void       _mi_arena_free(void* p, size_t size, size_t alignment, size_t align_offset, size_t memid, bool all_committed, mi_stats_t* stats);
 mi_arena_id_t _mi_arena_id_none(void);
-bool       _mi_arena_memid_is_suitable(size_t memid, mi_arena_id_t req_arena_id);
-
-// "segment-cache.c"
-void*      _mi_segment_cache_pop(size_t size, mi_commit_mask_t* commit_mask, mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld);
-bool       _mi_segment_cache_push(void* start, size_t size, size_t memid, const mi_commit_mask_t* commit_mask, const mi_commit_mask_t* decommit_mask, bool is_large, bool is_pinned, mi_os_tld_t* tld);
-void       _mi_segment_cache_collect(bool force, mi_os_tld_t* tld);
-void       _mi_segment_cache_free_all(mi_os_tld_t* tld);
+void       _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid, mi_stats_t* stats);
+void*      _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld);
+void*      _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld);
+bool       _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id);
+bool       _mi_arena_contains(const void* p);
+void       _mi_arena_collect(bool force_purge, mi_stats_t* stats);
+void       _mi_arena_unsafe_destroy_all(mi_stats_t* stats);
+
+// "segment-map.c"
 void       _mi_segment_map_allocated_at(const mi_segment_t* segment);
 void       _mi_segment_map_freed_at(const mi_segment_t* segment);
 
@@ -127,8 +147,6 @@ void       _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld);
 void       _mi_abandoned_await_readers(void);
 void       _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld);
 
-
-
 // "page.c"
 void*      _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment)  mi_attr_noexcept mi_attr_malloc;
 
@@ -155,12 +173,11 @@ uint8_t    _mi_bin(size_t size);                // for stats
 void       _mi_heap_destroy_pages(mi_heap_t* heap);
 void       _mi_heap_collect_abandon(mi_heap_t* heap);
 void       _mi_heap_set_default_direct(mi_heap_t* heap);
-bool       _mi_heap_memid_is_suitable(mi_heap_t* heap, size_t memid);
-void       _mi_heap_destroy_all(void);
+bool       _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid);
+void       _mi_heap_unsafe_destroy_all(void);
 
 // "stats.c"
 void       _mi_stats_done(mi_stats_t* stats);
-
 mi_msecs_t  _mi_clock_now(void);
 mi_msecs_t  _mi_clock_end(mi_msecs_t start);
 mi_msecs_t  _mi_clock_start(void);
@@ -173,6 +190,16 @@ void*       _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool
 mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p);
 bool        _mi_free_delayed_block(mi_block_t* block);
 void        _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept;  // for runtime integration
+void        _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size);
+
+// option.c, c primitives
+char        _mi_toupper(char c);
+int         _mi_strnicmp(const char* s, const char* t, size_t n);
+void        _mi_strlcpy(char* dest, const char* src, size_t dest_size);
+void        _mi_strlcat(char* dest, const char* src, size_t dest_size);
+size_t      _mi_strlen(const char* s);
+size_t      _mi_strnlen(const char* s, size_t max_len);
+
 
 #if MI_DEBUG>1
 bool        _mi_page_is_valid(mi_page_t* page);
@@ -242,6 +269,10 @@ bool        _mi_page_is_valid(mi_page_t* page);
 #define MI_INIT256(x) MI_INIT128(x),MI_INIT128(x)
 
 
+#include <string.h>
+// initialize a local variable to zero; use memset as compilers optimize constant sized memset's
+#define _mi_memzero_var(x)  memset(&x,0,sizeof(x))
+
 // Is `x` a power of two? (0 is considered a power of two)
 static inline bool _mi_is_power_of_two(uintptr_t x) {
   return ((x & (x - 1)) == 0);
@@ -284,7 +315,7 @@ static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) {
 }
 
 // Is memory zero initialized?
-static inline bool mi_mem_is_zero(void* p, size_t size) {
+static inline bool mi_mem_is_zero(const void* p, size_t size) {
   for (size_t i = 0; i < size; i++) {
     if (((uint8_t*)p)[i] != 0) return false;
   }
@@ -340,93 +371,11 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot
 }
 
 
-/* ----------------------------------------------------------------------------------------
-The thread local default heap: `_mi_get_default_heap` returns the thread local heap.
-On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a
-__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures
-that the storage will always be available (allocated on the thread stacks).
-On some platforms though we cannot use that when overriding `malloc` since the underlying
-TLS implementation (or the loader) will call itself `malloc` on a first access and recurse.
-We try to circumvent this in an efficient way:
-- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the
-           loader itself calls `malloc` even before the modules are initialized.
-- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
-- DragonFly: defaults are working but seem slow compared to freeBSD (see PR #323)
+/*----------------------------------------------------------------------------------------
+  Heap functions
 ------------------------------------------------------------------------------------------- */
 
 extern const mi_heap_t _mi_heap_empty;  // read-only empty heap, initial value of the thread local default heap
-extern bool _mi_process_is_initialized;
-mi_heap_t*  _mi_heap_main_get(void);    // statically allocated main backing heap
-
-#if defined(MI_MALLOC_OVERRIDE)
-#if defined(__APPLE__) // macOS
-#define MI_TLS_SLOT               89  // seems unused?
-// #define MI_TLS_RECURSE_GUARD 1
-// other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
-// see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
-#elif defined(__OpenBSD__)
-// use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16)
-// see <https://github.com/openbsd/src/blob/master/lib/libc/include/thread_private.h#L371>
-#define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 4*sizeof(void*) + 24)
-// #elif defined(__DragonFly__)
-// #warning "mimalloc is not working correctly on DragonFly yet."
-// #define MI_TLS_PTHREAD_SLOT_OFS   (4 + 1*sizeof(void*))  // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
-#elif defined(__ANDROID__)
-// See issue #381
-#define MI_TLS_PTHREAD
-#endif
-#endif
-
-#if defined(MI_TLS_SLOT)
-static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept;   // forward declaration
-#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
-static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
-  pthread_t self = pthread_self();
-  #if defined(__DragonFly__)
-  if (self==NULL) {
-    mi_heap_t* pheap_main = _mi_heap_main_get();
-    return &pheap_main;
-  }
-  #endif
-  return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
-}
-#elif defined(MI_TLS_PTHREAD)
-extern pthread_key_t _mi_heap_default_key;
-#endif
-
-// Default heap to allocate from (if not using TLS- or pthread slots).
-// Do not use this directly but use through `mi_heap_get_default()` (or the unchecked `mi_get_default_heap`).
-// This thread local variable is only used when neither MI_TLS_SLOT, MI_TLS_PTHREAD, or MI_TLS_PTHREAD_SLOT_OFS are defined.
-// However, on the Apple M1 we do use the address of this variable as the unique thread-id (issue #356).
-extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
-
-static inline mi_heap_t* mi_get_default_heap(void) {
-#if defined(MI_TLS_SLOT)
-  mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT);
-  if mi_unlikely(heap == NULL) {
-    #ifdef __GNUC__
-    __asm(""); // prevent conditional load of the address of _mi_heap_empty
-    #endif
-    heap = (mi_heap_t*)&_mi_heap_empty;
-  }
-  return heap;
-#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
-  mi_heap_t* heap = *mi_tls_pthread_heap_slot();
-  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
-#elif defined(MI_TLS_PTHREAD)
-  mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
-  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
-#else
-  #if defined(MI_TLS_RECURSE_GUARD)
-  if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get();
-  #endif
-  return _mi_heap_default;
-#endif
-}
-
-static inline bool mi_heap_is_default(const mi_heap_t* heap) {
-  return (heap == mi_get_default_heap());
-}
 
 static inline bool mi_heap_is_backing(const mi_heap_t* heap) {
   return (heap->tld->heap_backing == heap);
@@ -454,11 +403,6 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si
   return heap->pages_free_direct[idx];
 }
 
-// Get the page belonging to a certain size class
-static inline mi_page_t* _mi_get_free_small_page(size_t size) {
-  return _mi_heap_get_free_small_page(mi_get_default_heap(), size);
-}
-
 // Segment that contains the pointer
 // Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a huge segment > MI_SEGMENT_SIZE),
 // and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it;
@@ -790,6 +734,29 @@ size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx);
       
 
 
+/* -----------------------------------------------------------
+  memory id's
+----------------------------------------------------------- */
+
+static inline mi_memid_t _mi_memid_create(mi_memkind_t memkind) {
+  mi_memid_t memid;
+  _mi_memzero_var(memid);
+  memid.memkind = memkind;
+  return memid;
+}
+
+static inline mi_memid_t _mi_memid_none(void) {
+  return _mi_memid_create(MI_MEM_NONE);
+}
+
+static inline mi_memid_t _mi_memid_create_os(bool committed, bool is_zero, bool is_large) {
+  mi_memid_t memid = _mi_memid_create(MI_MEM_OS);
+  memid.initially_committed = committed;
+  memid.initially_zero = is_zero;
+  memid.is_pinned = is_large;
+  return memid;
+}
+
 
 // -------------------------------------------------------------------
 // Fast "random" shuffle
@@ -834,107 +801,6 @@ static inline size_t _mi_os_numa_node_count(void) {
 }
 
 
-// -------------------------------------------------------------------
-// Getting the thread id should be performant as it is called in the
-// fast path of `_mi_free` and we specialize for various platforms.
-// We only require _mi_threadid() to return a unique id for each thread.
-// -------------------------------------------------------------------
-#if defined(_WIN32)
-
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
-static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
-  // Windows: works on Intel and ARM in both 32- and 64-bit
-  return (uintptr_t)NtCurrentTeb();
-}
-
-// We use assembly for a fast thread id on the main platforms. The TLS layout depends on
-// both the OS and libc implementation so we use specific tests for each main platform.
-// If you test on another platform and it works please send a PR :-)
-// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
-#elif defined(__GNUC__) && ( \
-           (defined(__GLIBC__)   && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
-        || (defined(__APPLE__)   && (defined(__x86_64__) || defined(__aarch64__))) \
-        || (defined(__BIONIC__)  && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
-        || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
-        || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
-      )
-
-static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept {
-  void* res;
-  const size_t ofs = (slot*sizeof(void*));
-  #if defined(__i386__)
-    __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86 32-bit always uses GS
-  #elif defined(__APPLE__) && defined(__x86_64__)
-    __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 macOSX uses GS
-  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
-    __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x32 ABI
-  #elif defined(__x86_64__)
-    __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 Linux, BSD uses FS
-  #elif defined(__arm__)
-    void** tcb; MI_UNUSED(ofs);
-    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
-    res = tcb[slot];
-  #elif defined(__aarch64__)
-    void** tcb; MI_UNUSED(ofs);
-    #if defined(__APPLE__) // M1, issue #343
-    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
-    #else
-    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
-    #endif
-    res = tcb[slot];
-  #endif
-  return res;
-}
-
-// setting a tls slot is only used on macOS for now
-static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
-  const size_t ofs = (slot*sizeof(void*));
-  #if defined(__i386__)
-    __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // 32-bit always uses GS
-  #elif defined(__APPLE__) && defined(__x86_64__)
-    __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 macOS uses GS
-  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
-    __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x32 ABI
-  #elif defined(__x86_64__)
-    __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 Linux, BSD uses FS
-  #elif defined(__arm__)
-    void** tcb; MI_UNUSED(ofs);
-    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
-    tcb[slot] = value;
-  #elif defined(__aarch64__)
-    void** tcb; MI_UNUSED(ofs);
-    #if defined(__APPLE__) // M1, issue #343
-    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
-    #else
-    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
-    #endif
-    tcb[slot] = value;
-  #endif
-}
-
-static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
-  #if defined(__BIONIC__)
-    // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
-    // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
-    return (uintptr_t)mi_tls_slot(1);
-  #else
-    // in all our other targets, slot 0 is the thread id
-    // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h
-    // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36
-    return (uintptr_t)mi_tls_slot(0);
-  #endif
-}
-
-#else
-
-// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms).
-static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
-  return (uintptr_t)&_mi_heap_default;
-}
-
-#endif
-
 
 // -----------------------------------------------------------------------
 // Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero)
@@ -964,6 +830,7 @@ static inline size_t mi_ctz(uintptr_t x) {
 #elif defined(_MSC_VER)
 
 #include <limits.h>       // LONG_MAX
+#include <intrin.h>       // BitScanReverse64
 #define MI_HAVE_FAST_BITSCAN
 static inline size_t mi_clz(uintptr_t x) {
   if (x==0) return MI_INTPTR_BITS;
@@ -1050,7 +917,6 @@ static inline size_t mi_bsr(uintptr_t x) {
 
 #if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
 #include <intrin.h>
-#include <string.h>
 extern bool _mi_cpu_has_fsrm;
 static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
   if (_mi_cpu_has_fsrm) {
@@ -1069,7 +935,6 @@ static inline void _mi_memzero(void* dst, size_t n) {
   }
 }
 #else
-#include <string.h>
 static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
   memcpy(dst, src, n);
 }
@@ -1078,7 +943,6 @@ static inline void _mi_memzero(void* dst, size_t n) {
 }
 #endif
 
-
 // -------------------------------------------------------------------------------
 // The `_mi_memcpy_aligned` can be used if the pointers are machine-word aligned
 // This is used for example in `mi_realloc`.
@@ -1086,7 +950,6 @@ static inline void _mi_memzero(void* dst, size_t n) {
 
 #if (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__)
 // On GCC/CLang we provide a hint that the pointers are word aligned.
-#include <string.h>
 static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
   mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
   void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE);
diff --git a/Include/mimalloc/mimalloc/prim.h b/Include/mimalloc/mimalloc/prim.h
new file mode 100644
index 00000000000000..9e560696fa4783
--- /dev/null
+++ b/Include/mimalloc/mimalloc/prim.h
@@ -0,0 +1,323 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_PRIM_H
+#define MIMALLOC_PRIM_H
+
+
+// --------------------------------------------------------------------------
+// This file specifies the primitive portability API.
+// Each OS/host needs to implement these primitives, see `src/prim`
+// for implementations on Window, macOS, WASI, and Linux/Unix.
+//
+// note: on all primitive functions, we always have result parameters != NUL, and:
+//  addr != NULL and page aligned
+//  size > 0     and page aligned
+//  return value is an error code an int where 0 is success.
+// --------------------------------------------------------------------------
+
+// OS memory configuration
+typedef struct mi_os_mem_config_s {
+  size_t  page_size;            // 4KiB
+  size_t  large_page_size;      // 2MiB
+  size_t  alloc_granularity;    // smallest allocation size (on Windows 64KiB)
+  bool    has_overcommit;       // can we reserve more memory than can be actually committed?
+  bool    must_free_whole;      // must allocated blocks be freed as a whole (false for mmap, true for VirtualAlloc)
+  bool    has_virtual_reserve;  // supports virtual address space reservation? (if true we can reserve virtual address space without using commit or physical memory)
+} mi_os_mem_config_t;
+
+// Initialize
+void _mi_prim_mem_init( mi_os_mem_config_t* config );
+
+// Free OS memory
+int _mi_prim_free(void* addr, size_t size );
+  
+// Allocate OS memory. Return NULL on error.
+// The `try_alignment` is just a hint and the returned pointer does not have to be aligned.
+// If `commit` is false, the virtual memory range only needs to be reserved (with no access) 
+// which will later be committed explicitly using `_mi_prim_commit`.
+// `is_zero` is set to true if the memory was zero initialized (as on most OS's)
+// pre: !commit => !allow_large
+//      try_alignment >= _mi_os_page_size() and a power of 2
+int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr);
+
+// Commit memory. Returns error code or 0 on success.
+// For example, on Linux this would make the memory PROT_READ|PROT_WRITE.
+// `is_zero` is set to true if the memory was zero initialized (e.g. on Windows)
+int _mi_prim_commit(void* addr, size_t size, bool* is_zero);
+
+// Decommit memory. Returns error code or 0 on success. The `needs_recommit` result is true
+// if the memory would need to be re-committed. For example, on Windows this is always true,
+// but on Linux we could use MADV_DONTNEED to decommit which does not need a recommit.
+// pre: needs_recommit != NULL
+int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit);
+
+// Reset memory. The range keeps being accessible but the content might be reset.
+// Returns error code or 0 on success.
+int _mi_prim_reset(void* addr, size_t size);
+
+// Protect memory. Returns error code or 0 on success.
+int _mi_prim_protect(void* addr, size_t size, bool protect);
+
+// Allocate huge (1GiB) pages possibly associated with a NUMA node.
+// `is_zero` is set to true if the memory was zero initialized (as on most OS's)
+// pre: size > 0  and a multiple of 1GiB.
+//      numa_node is either negative (don't care), or a numa node number.
+int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr);
+
+// Return the current NUMA node
+size_t _mi_prim_numa_node(void);
+
+// Return the number of logical NUMA nodes
+size_t _mi_prim_numa_node_count(void);
+
+// Clock ticks
+mi_msecs_t _mi_prim_clock_now(void);
+
+// Return process information (only for statistics)
+typedef struct mi_process_info_s {
+  mi_msecs_t  elapsed;
+  mi_msecs_t  utime;
+  mi_msecs_t  stime; 
+  size_t      current_rss; 
+  size_t      peak_rss;  
+  size_t      current_commit;
+  size_t      peak_commit; 
+  size_t      page_faults;
+} mi_process_info_t;
+
+void _mi_prim_process_info(mi_process_info_t* pinfo);
+
+// Default stderr output. (only for warnings etc. with verbose enabled)
+// msg != NULL && _mi_strlen(msg) > 0
+void _mi_prim_out_stderr( const char* msg );
+
+// Get an environment variable. (only for options)
+// name != NULL, result != NULL, result_size >= 64
+bool _mi_prim_getenv(const char* name, char* result, size_t result_size);
+
+
+// Fill a buffer with strong randomness; return `false` on error or if
+// there is no strong randomization available.
+bool _mi_prim_random_buf(void* buf, size_t buf_len);
+
+// Called on the first thread start, and should ensure `_mi_thread_done` is called on thread termination.
+void _mi_prim_thread_init_auto_done(void);
+
+// Called on process exit and may take action to clean up resources associated with the thread auto done.
+void _mi_prim_thread_done_auto_done(void);
+
+// Called when the default heap for a thread changes
+void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
+
+
+//-------------------------------------------------------------------
+// Thread id: `_mi_prim_thread_id()`
+// 
+// Getting the thread id should be performant as it is called in the
+// fast path of `_mi_free` and we specialize for various platforms as
+// inlined definitions. Regular code should call `init.c:_mi_thread_id()`.
+// We only require _mi_prim_thread_id() to return a unique id
+// for each thread (unequal to zero).
+//-------------------------------------------------------------------
+
+// defined in `init.c`; do not use these directly
+extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
+extern bool _mi_process_is_initialized;             // has mi_process_init been called?
+
+static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;
+
+#if defined(_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
+  // Windows: works on Intel and ARM in both 32- and 64-bit
+  return (uintptr_t)NtCurrentTeb();
+}
+
+// We use assembly for a fast thread id on the main platforms. The TLS layout depends on
+// both the OS and libc implementation so we use specific tests for each main platform.
+// If you test on another platform and it works please send a PR :-)
+// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
+#elif defined(__GNUC__) && ( \
+           (defined(__GLIBC__)   && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
+        || (defined(__APPLE__)   && (defined(__x86_64__) || defined(__aarch64__))) \
+        || (defined(__BIONIC__)  && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
+        || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
+        || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
+      )
+
+static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {
+  void* res;
+  const size_t ofs = (slot*sizeof(void*));
+  #if defined(__i386__)
+    __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86 32-bit always uses GS
+  #elif defined(__APPLE__) && defined(__x86_64__)
+    __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 macOSX uses GS
+  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
+    __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x32 ABI
+  #elif defined(__x86_64__)
+    __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 Linux, BSD uses FS
+  #elif defined(__arm__)
+    void** tcb; MI_UNUSED(ofs);
+    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+    res = tcb[slot];
+  #elif defined(__aarch64__)
+    void** tcb; MI_UNUSED(ofs);
+    #if defined(__APPLE__) // M1, issue #343
+    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
+    #else
+    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+    #endif
+    res = tcb[slot];
+  #endif
+  return res;
+}
+
+// setting a tls slot is only used on macOS for now
+static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
+  const size_t ofs = (slot*sizeof(void*));
+  #if defined(__i386__)
+    __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // 32-bit always uses GS
+  #elif defined(__APPLE__) && defined(__x86_64__)
+    __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 macOS uses GS
+  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
+    __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x32 ABI
+  #elif defined(__x86_64__)
+    __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 Linux, BSD uses FS
+  #elif defined(__arm__)
+    void** tcb; MI_UNUSED(ofs);
+    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+    tcb[slot] = value;
+  #elif defined(__aarch64__)
+    void** tcb; MI_UNUSED(ofs);
+    #if defined(__APPLE__) // M1, issue #343
+    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
+    #else
+    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+    #endif
+    tcb[slot] = value;
+  #endif
+}
+
+static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
+  #if defined(__BIONIC__)
+    // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
+    // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
+    return (uintptr_t)mi_prim_tls_slot(1);
+  #else
+    // in all our other targets, slot 0 is the thread id
+    // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h
+    // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36
+    return (uintptr_t)mi_prim_tls_slot(0);
+  #endif
+}
+
+#else
+
+// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms).
+static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
+  return (uintptr_t)&_mi_heap_default;
+}
+
+#endif
+
+
+
+/* ----------------------------------------------------------------------------------------
+The thread local default heap: `_mi_prim_get_default_heap()`
+This is inlined here as it is on the fast path for allocation functions.
+
+On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a
+__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures
+that the storage will always be available (allocated on the thread stacks).
+
+On some platforms though we cannot use that when overriding `malloc` since the underlying
+TLS implementation (or the loader) will call itself `malloc` on a first access and recurse.
+We try to circumvent this in an efficient way:
+- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the
+           loader itself calls `malloc` even before the modules are initialized.
+- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
+- DragonFly: defaults are working but seem slow compared to freeBSD (see PR #323)
+------------------------------------------------------------------------------------------- */
+
+static inline mi_heap_t* mi_prim_get_default_heap(void);
+
+#if defined(MI_MALLOC_OVERRIDE)
+#if defined(__APPLE__) // macOS
+  #define MI_TLS_SLOT               89  // seems unused?
+  // #define MI_TLS_RECURSE_GUARD 1
+  // other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
+  // see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
+#elif defined(__OpenBSD__)
+  // use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16)
+  // see <https://github.com/openbsd/src/blob/master/lib/libc/include/thread_private.h#L371>
+  #define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 4*sizeof(void*) + 24)
+  // #elif defined(__DragonFly__)
+  // #warning "mimalloc is not working correctly on DragonFly yet."
+  // #define MI_TLS_PTHREAD_SLOT_OFS   (4 + 1*sizeof(void*))  // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
+#elif defined(__ANDROID__)
+  // See issue #381
+  #define MI_TLS_PTHREAD
+#endif
+#endif
+
+
+#if defined(MI_TLS_SLOT)
+
+static inline mi_heap_t* mi_prim_get_default_heap(void) {
+  mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT);
+  if mi_unlikely(heap == NULL) {
+    #ifdef __GNUC__
+    __asm(""); // prevent conditional load of the address of _mi_heap_empty
+    #endif
+    heap = (mi_heap_t*)&_mi_heap_empty;
+  }
+  return heap;
+}
+
+#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+
+static inline mi_heap_t** mi_prim_tls_pthread_heap_slot(void) {
+  pthread_t self = pthread_self();
+  #if defined(__DragonFly__)
+  if (self==NULL) return NULL;
+  #endif
+  return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
+}
+
+static inline mi_heap_t* mi_prim_get_default_heap(void) {
+  mi_heap_t** pheap = mi_prim_tls_pthread_heap_slot();
+  if mi_unlikely(pheap == NULL) return _mi_heap_main_get();
+  mi_heap_t* heap = *pheap;
+  if mi_unlikely(heap == NULL) return (mi_heap_t*)&_mi_heap_empty;
+  return heap;
+}
+
+#elif defined(MI_TLS_PTHREAD)
+
+extern pthread_key_t _mi_heap_default_key;
+static inline mi_heap_t* mi_prim_get_default_heap(void) {
+  mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+}
+
+#else // default using a thread local variable; used on most platforms.
+
+static inline mi_heap_t* mi_prim_get_default_heap(void) {
+  #if defined(MI_TLS_RECURSE_GUARD)
+  if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get();
+  #endif
+  return _mi_heap_default;
+}
+
+#endif  // mi_prim_get_default_heap()
+
+
+
+#endif  // MIMALLOC_PRIM_H
diff --git a/Include/mimalloc/mimalloc/track.h b/Include/mimalloc/mimalloc/track.h
new file mode 100644
index 00000000000000..9545f7507ac99f
--- /dev/null
+++ b/Include/mimalloc/mimalloc/track.h
@@ -0,0 +1,147 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_TRACK_H
+#define MIMALLOC_TRACK_H
+
+/* ------------------------------------------------------------------------------------------------------
+Track memory ranges with macros for tools like Valgrind address sanitizer, or other memory checkers.
+These can be defined for tracking allocation:
+
+  #define mi_track_malloc_size(p,reqsize,size,zero)
+  #define mi_track_free_size(p,_size)
+
+The macros are set up such that the size passed to `mi_track_free_size`
+always matches the size of `mi_track_malloc_size`. (currently, `size == mi_usable_size(p)`).
+The `reqsize` is what the user requested, and `size >= reqsize`.
+The `size` is either byte precise (and `size==reqsize`) if `MI_PADDING` is enabled,
+or otherwise it is the usable block size which may be larger than the original request.
+Use `_mi_block_size_of(void* p)` to get the full block size that was allocated (including padding etc).
+The `zero` parameter is `true` if the allocated block is zero initialized.
+
+Optional:
+
+  #define mi_track_align(p,alignedp,offset,size)
+  #define mi_track_resize(p,oldsize,newsize)
+  #define mi_track_init()
+
+The `mi_track_align` is called right after a `mi_track_malloc` for aligned pointers in a block.
+The corresponding `mi_track_free` still uses the block start pointer and original size (corresponding to the `mi_track_malloc`).
+The `mi_track_resize` is currently unused but could be called on reallocations within a block.
+`mi_track_init` is called at program start.
+
+The following macros are for tools like asan and valgrind to track whether memory is 
+defined, undefined, or not accessible at all:
+
+  #define mi_track_mem_defined(p,size)
+  #define mi_track_mem_undefined(p,size)
+  #define mi_track_mem_noaccess(p,size)
+
+-------------------------------------------------------------------------------------------------------*/
+
+#if MI_TRACK_VALGRIND
+// valgrind tool
+
+#define MI_TRACK_ENABLED      1
+#define MI_TRACK_HEAP_DESTROY 1           // track free of individual blocks on heap_destroy
+#define MI_TRACK_TOOL         "valgrind"
+
+#include <valgrind/valgrind.h>
+#include <valgrind/memcheck.h>
+
+#define mi_track_malloc_size(p,reqsize,size,zero) VALGRIND_MALLOCLIKE_BLOCK(p,size,MI_PADDING_SIZE /*red zone*/,zero)
+#define mi_track_free_size(p,_size)               VALGRIND_FREELIKE_BLOCK(p,MI_PADDING_SIZE /*red zone*/)
+#define mi_track_resize(p,oldsize,newsize)        VALGRIND_RESIZEINPLACE_BLOCK(p,oldsize,newsize,MI_PADDING_SIZE /*red zone*/)
+#define mi_track_mem_defined(p,size)              VALGRIND_MAKE_MEM_DEFINED(p,size)
+#define mi_track_mem_undefined(p,size)            VALGRIND_MAKE_MEM_UNDEFINED(p,size)
+#define mi_track_mem_noaccess(p,size)             VALGRIND_MAKE_MEM_NOACCESS(p,size)
+
+#elif MI_TRACK_ASAN
+// address sanitizer
+
+#define MI_TRACK_ENABLED      1
+#define MI_TRACK_HEAP_DESTROY 0
+#define MI_TRACK_TOOL         "asan"
+
+#include <sanitizer/asan_interface.h>
+
+#define mi_track_malloc_size(p,reqsize,size,zero) ASAN_UNPOISON_MEMORY_REGION(p,size)
+#define mi_track_free_size(p,size)                ASAN_POISON_MEMORY_REGION(p,size)
+#define mi_track_mem_defined(p,size)              ASAN_UNPOISON_MEMORY_REGION(p,size)
+#define mi_track_mem_undefined(p,size)            ASAN_UNPOISON_MEMORY_REGION(p,size)
+#define mi_track_mem_noaccess(p,size)             ASAN_POISON_MEMORY_REGION(p,size)
+
+#elif MI_TRACK_ETW
+// windows event tracing
+
+#define MI_TRACK_ENABLED      1
+#define MI_TRACK_HEAP_DESTROY 1
+#define MI_TRACK_TOOL         "ETW"
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include "../src/prim/windows/etw.h"
+
+#define mi_track_init()                           EventRegistermicrosoft_windows_mimalloc();
+#define mi_track_malloc_size(p,reqsize,size,zero) EventWriteETW_MI_ALLOC((UINT64)(p), size)
+#define mi_track_free_size(p,size)                EventWriteETW_MI_FREE((UINT64)(p), size)
+
+#else
+// no tracking
+
+#define MI_TRACK_ENABLED      0
+#define MI_TRACK_HEAP_DESTROY 0 
+#define MI_TRACK_TOOL         "none"
+
+#define mi_track_malloc_size(p,reqsize,size,zero)
+#define mi_track_free_size(p,_size)
+
+#endif
+
+// -------------------
+// Utility definitions
+
+#ifndef mi_track_resize
+#define mi_track_resize(p,oldsize,newsize)      mi_track_free_size(p,oldsize); mi_track_malloc(p,newsize,false)
+#endif
+
+#ifndef mi_track_align
+#define mi_track_align(p,alignedp,offset,size)  mi_track_mem_noaccess(p,offset)
+#endif
+
+#ifndef mi_track_init
+#define mi_track_init()
+#endif
+
+#ifndef mi_track_mem_defined
+#define mi_track_mem_defined(p,size)
+#endif
+
+#ifndef mi_track_mem_undefined
+#define mi_track_mem_undefined(p,size)
+#endif
+
+#ifndef mi_track_mem_noaccess
+#define mi_track_mem_noaccess(p,size)
+#endif
+
+
+#if MI_PADDING
+#define mi_track_malloc(p,reqsize,zero) \
+  if ((p)!=NULL) { \
+    mi_assert_internal(mi_usable_size(p)==(reqsize)); \
+    mi_track_malloc_size(p,reqsize,reqsize,zero); \
+  }
+#else
+#define mi_track_malloc(p,reqsize,zero) \
+  if ((p)!=NULL) { \
+    mi_assert_internal(mi_usable_size(p)>=(reqsize)); \
+    mi_track_malloc_size(p,reqsize,mi_usable_size(p),zero); \
+  }
+#endif
+
+#endif
diff --git a/Include/mimalloc/mimalloc-types.h b/Include/mimalloc/mimalloc/types.h
similarity index 85%
rename from Include/mimalloc/mimalloc-types.h
rename to Include/mimalloc/mimalloc/types.h
index f3af528e524b95..2005238a64e9ac 100644
--- a/Include/mimalloc/mimalloc-types.h
+++ b/Include/mimalloc/mimalloc/types.h
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -8,9 +8,20 @@ terms of the MIT license. A copy of the license can be found in the file
 #ifndef MIMALLOC_TYPES_H
 #define MIMALLOC_TYPES_H
 
+// --------------------------------------------------------------------------
+// This file contains the main type definitions for mimalloc:
+// mi_heap_t      : all data for a thread-local heap, contains
+//                  lists of all managed heap pages.
+// mi_segment_t   : a larger chunk of memory (32GiB) from where pages
+//                  are allocated.
+// mi_page_t      : a mimalloc page (usually 64KiB or 512KiB) from
+//                  where objects are allocated.
+// --------------------------------------------------------------------------
+
+
 #include <stddef.h>   // ptrdiff_t
 #include <stdint.h>   // uintptr_t, uint16_t, etc
-#include "mimalloc-atomic.h"  // _Atomic
+#include "mimalloc/atomic.h"  // _Atomic
 
 #ifdef _MSC_VER
 #pragma warning(disable:4214) // bitfield is not int
@@ -29,8 +40,10 @@ terms of the MIT license. A copy of the license can be found in the file
 // Define NDEBUG in the release version to disable assertions.
 // #define NDEBUG
 
-// Define MI_VALGRIND to enable valgrind support
-// #define MI_VALGRIND 1
+// Define MI_TRACK_<tool> to enable tracking support
+// #define MI_TRACK_VALGRIND 1
+// #define MI_TRACK_ASAN     1
+// #define MI_TRACK_ETW      1
 
 // Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance).
 // #define MI_STAT 1
@@ -58,11 +71,16 @@ terms of the MIT license. A copy of the license can be found in the file
 #endif
 
 // Reserve extra padding at the end of each block to be more resilient against heap block overflows.
-// The padding can detect byte-precise buffer overflow on free.
-#if !defined(MI_PADDING) && (MI_DEBUG>=1 || MI_VALGRIND)
+// The padding can detect buffer overflow on free.
+#if !defined(MI_PADDING) && (MI_SECURE>=3 || MI_DEBUG>=1 || (MI_TRACK_VALGRIND || MI_TRACK_ASAN || MI_TRACK_ETW))
 #define MI_PADDING  1
 #endif
 
+// Check padding bytes; allows byte-precise buffer overflow detection
+#if !defined(MI_PADDING_CHECK) && MI_PADDING && (MI_SECURE>=3 || MI_DEBUG>=1)
+#define MI_PADDING_CHECK 1
+#endif
+
 
 // Encoded free lists allow detection of corrupted free lists
 // and can detect buffer overflows, modify after free, and double `free`s.
@@ -154,7 +172,7 @@ typedef int32_t  mi_ssize_t;
 // Derived constants
 #define MI_SEGMENT_SIZE                   (MI_ZU(1)<<MI_SEGMENT_SHIFT)
 #define MI_SEGMENT_ALIGN                  MI_SEGMENT_SIZE
-#define MI_SEGMENT_MASK                   (MI_SEGMENT_ALIGN - 1)
+#define MI_SEGMENT_MASK                   ((uintptr_t)(MI_SEGMENT_ALIGN - 1))
 #define MI_SEGMENT_SLICE_SIZE             (MI_ZU(1)<< MI_SEGMENT_SLICE_SHIFT)
 #define MI_SLICES_PER_SEGMENT             (MI_SEGMENT_SIZE / MI_SEGMENT_SLICE_SIZE) // 1024
 
@@ -273,16 +291,15 @@ typedef uintptr_t mi_thread_free_t;
 typedef struct mi_page_s {
   // "owned" by the segment
   uint32_t              slice_count;       // slices in this page (0 if not a page)
-  uint32_t              slice_offset;      // distance from the actual page data slice (0 if a page)
-  uint8_t               is_reset : 1;      // `true` if the page memory was reset
+  uint32_t              slice_offset;      // distance from the actual page data slice (0 if a page)  
   uint8_t               is_committed : 1;  // `true` if the page virtual memory is committed
-  uint8_t               is_zero_init : 1;  // `true` if the page was zero initialized
+  uint8_t               is_zero_init : 1;  // `true` if the page was initially zero initialized
 
   // layout like this to optimize access in `mi_malloc` and `mi_free`
   uint16_t              capacity;          // number of blocks committed, must be the first field, see `segment.c:page_clear`
   uint16_t              reserved;          // number of blocks reserved in memory
   mi_page_flags_t       flags;             // `in_full` and `has_aligned` flags (8 bits)
-  uint8_t               is_zero : 1;       // `true` if the blocks in the free list are zero initialized
+  uint8_t               free_is_zero : 1;  // `true` if the blocks in the free list are zero initialized
   uint8_t               retire_expire : 7; // expiration count for retired blocks
 
   mi_block_t*           free;              // list of available free blocks (`malloc` allocates from this list)
@@ -290,8 +307,8 @@ typedef struct mi_page_s {
   uint32_t              xblock_size;       // size available in each block (always `>0`)
   mi_block_t*           local_free;        // list of deferred free blocks by this thread (migrates to `free`)
 
-  #ifdef MI_ENCODE_FREELIST
-  uintptr_t             keys[2];           // two random keys to encode the free lists (see `_mi_block_next`)
+  #if (MI_ENCODE_FREELIST || MI_PADDING)
+  uintptr_t             keys[2];           // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
   #endif
 
   _Atomic(mi_thread_free_t) xthread_free;  // list of deferred free blocks freed by other threads
@@ -308,6 +325,10 @@ typedef struct mi_page_s {
 
 
 
+// ------------------------------------------------------
+// Mimalloc segments contain mimalloc pages
+// ------------------------------------------------------
+
 typedef enum mi_page_kind_e {
   MI_PAGE_SMALL,    // small blocks go into 64KiB pages inside a segment
   MI_PAGE_MEDIUM,   // medium blocks go into medium pages inside a segment
@@ -332,7 +353,7 @@ typedef enum mi_segment_kind_e {
 // is still tracked in fine-grained MI_COMMIT_SIZE chunks)
 // ------------------------------------------------------
 
-#define MI_MINIMAL_COMMIT_SIZE      (16*MI_SEGMENT_SLICE_SIZE)           // 1MiB
+#define MI_MINIMAL_COMMIT_SIZE      (1*MI_SEGMENT_SLICE_SIZE)            
 #define MI_COMMIT_SIZE              (MI_SEGMENT_SLICE_SIZE)              // 64KiB
 #define MI_COMMIT_MASK_BITS         (MI_SEGMENT_SIZE / MI_COMMIT_SIZE)  
 #define MI_COMMIT_MASK_FIELD_BITS    MI_SIZE_BITS
@@ -350,20 +371,57 @@ typedef mi_page_t  mi_slice_t;
 typedef int64_t    mi_msecs_t;
 
 
+// Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this.
+typedef enum mi_memkind_e {
+  MI_MEM_NONE,      // not allocated
+  MI_MEM_EXTERNAL,  // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example)
+  MI_MEM_STATIC,    // allocated in a static area and should not be freed (for arena meta data for example)
+  MI_MEM_OS,        // allocated from the OS
+  MI_MEM_OS_HUGE,   // allocated as huge os pages
+  MI_MEM_OS_REMAP,  // allocated in a remapable area (i.e. using `mremap`)
+  MI_MEM_ARENA      // allocated from an arena (the usual case)
+} mi_memkind_t;
+
+static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
+  return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP);
+}
+
+typedef struct mi_memid_os_info {
+  void*         base;               // actual base address of the block (used for offset aligned allocations)
+  size_t        alignment;          // alignment at allocation
+} mi_memid_os_info_t;
+
+typedef struct mi_memid_arena_info {
+  size_t        block_index;        // index in the arena
+  mi_arena_id_t id;                 // arena id (>= 1)
+  bool          is_exclusive;       // the arena can only be used for specific arena allocations
+} mi_memid_arena_info_t;
+
+typedef struct mi_memid_s {
+  union {
+    mi_memid_os_info_t    os;       // only used for MI_MEM_OS
+    mi_memid_arena_info_t arena;    // only used for MI_MEM_ARENA
+  } mem;
+  bool          is_pinned;          // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large OS pages)
+  bool          initially_committed;// `true` if the memory was originally allocated as committed
+  bool          initially_zero;     // `true` if the memory was originally zero initialized
+  mi_memkind_t  memkind;
+} mi_memid_t;
+
+
 // Segments are large allocated memory blocks (8mb on 64 bit) from
 // the OS. Inside segments we allocated fixed size _pages_ that
 // contain blocks.
 typedef struct mi_segment_s {
-  size_t            memid;              // memory id for arena allocation
-  bool              mem_is_pinned;      // `true` if we cannot decommit/reset/protect in this memory (i.e. when allocated using large OS pages)    
-  bool              mem_is_large;       // in large/huge os pages?
-  bool              mem_is_committed;   // `true` if the whole segment is eagerly committed
-  size_t            mem_alignment;      // page alignment for huge pages (only used for alignment > MI_ALIGNMENT_MAX)
-  size_t            mem_align_offset;   // offset for huge page alignment (only used for alignment > MI_ALIGNMENT_MAX)
-
-  bool              allow_decommit;     
-  mi_msecs_t        decommit_expire;
-  mi_commit_mask_t  decommit_mask;
+  // constant fields
+  mi_memid_t        memid;              // memory id for arena allocation
+  bool              allow_decommit;
+  bool              allow_purge;
+  size_t            segment_size;
+
+  // segment fields
+  mi_msecs_t        purge_expire;
+  mi_commit_mask_t  purge_mask;
   mi_commit_mask_t  commit_mask;
 
   _Atomic(struct mi_segment_s*) abandoned_next;
@@ -522,6 +580,7 @@ typedef struct mi_stats_s {
   mi_stat_count_t reserved;
   mi_stat_count_t committed;
   mi_stat_count_t reset;
+  mi_stat_count_t purged;
   mi_stat_count_t page_committed;
   mi_stat_count_t segments_abandoned;
   mi_stat_count_t pages_abandoned;
@@ -534,6 +593,8 @@ typedef struct mi_stats_s {
   mi_stat_counter_t pages_extended;
   mi_stat_counter_t mmap_calls;
   mi_stat_counter_t commit_calls;
+  mi_stat_counter_t reset_calls;
+  mi_stat_counter_t purge_calls;
   mi_stat_counter_t page_no_retire;
   mi_stat_counter_t searches;
   mi_stat_counter_t normal_count;
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 2186c51e623944..6a059121d8c1bf 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -341,11 +341,12 @@ IO_OBJS=	\
 # mimalloc
 
 MIMALLOC_HEADERS= \
-		$(srcdir)/Include/internal/pycore_mimalloc.h \
-		$(srcdir)/Include/internal/mimalloc/mimalloc-atomic.h \
-		$(srcdir)/Include/internal/mimalloc/mimalloc.h \
-		$(srcdir)/Include/internal/mimalloc/mimalloc-internal.h \
-		$(srcdir)/Include/internal/mimalloc/mimalloc-types.h
+		$(srcdir)/Include/mimalloc/mimalloc.h \
+		$(srcdir)/Include/mimalloc/mimalloc/atomic.h \
+		$(srcdir)/Include/mimalloc/mimalloc/internal.h \
+		$(srcdir)/Include/mimalloc/mimalloc/prim.h \
+		$(srcdir)/Include/mimalloc/mimalloc/track.h \
+		$(srcdir)/Include/mimalloc/mimalloc/types.h
 
 MIMALLOC_OBJS= \
 		Objects/mimalloc/alloc-aligned.o \
@@ -360,8 +361,9 @@ MIMALLOC_OBJS= \
 		Objects/mimalloc/page.o \
 		Objects/mimalloc/random.o \
 		Objects/mimalloc/segment.o \
-		Objects/mimalloc/segment-cache.o \
-		Objects/mimalloc/stats.o
+		Objects/mimalloc/segment-map.o \
+		Objects/mimalloc/stats.o \
+		Objects/mimalloc/prim/prim.o
 
 
 ##########################################################################
diff --git a/Objects/mimalloc/alloc-aligned.c b/Objects/mimalloc/alloc-aligned.c
index 9fe82890fa2f55..1cd809f15d613c 100644
--- a/Objects/mimalloc/alloc-aligned.c
+++ b/Objects/mimalloc/alloc-aligned.c
@@ -6,9 +6,10 @@ terms of the MIT license. A copy of the license can be found in the file
 -----------------------------------------------------------------------------*/
 
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"  // mi_prim_get_default_heap
 
-#include <string.h>  // memset
+#include <string.h>     // memset
 
 // ------------------------------------------------------
 // Aligned Allocation
@@ -46,7 +47,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t*
     oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size);
     p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block
     // zero afterwards as only the area from the aligned_p may be committed!
-    if (p == NULL) return NULL;
+    if (p == NULL) return NULL;    
   }
   else {
     // otherwise over-allocate
@@ -61,30 +62,30 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t*
   mi_assert_internal(adjust < alignment);
   void* aligned_p = (void*)((uintptr_t)p + adjust);
   if (aligned_p != p) {
-    mi_page_set_has_aligned(_mi_ptr_page(p), true);
+    mi_page_t* page = _mi_ptr_page(p);
+    mi_page_set_has_aligned(page, true);
+    _mi_padding_shrink(page, (mi_block_t*)p, adjust + size);
   }
+  // todo: expand padding if overallocated ?
 
   mi_assert_internal(mi_page_usable_block_size(_mi_ptr_page(p)) >= adjust + size);
   mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_segment(aligned_p), _mi_ptr_page(aligned_p), aligned_p));
   mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0);
-  mi_assert_internal(mi_page_usable_block_size(_mi_ptr_page(p)) >= adjust + size);
-
+  mi_assert_internal(mi_usable_size(aligned_p)>=size);
+  mi_assert_internal(mi_usable_size(p) == mi_usable_size(aligned_p)+adjust);
+    
   // now zero the block if needed
-  if (zero && alignment > MI_ALIGNMENT_MAX) {
-    const ptrdiff_t diff = (uint8_t*)aligned_p - (uint8_t*)p;
-    const ptrdiff_t zsize = mi_page_usable_block_size(_mi_ptr_page(p)) - diff - MI_PADDING_SIZE;
-    if (zsize > 0) { _mi_memzero(aligned_p, zsize); }
+  if (alignment > MI_ALIGNMENT_MAX) {
+    // for the tracker, on huge aligned allocations only from the start of the large block is defined
+    mi_track_mem_undefined(aligned_p, size);
+    if (zero) {
+      _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p));
+    }
   }
 
-  #if MI_TRACK_ENABLED
   if (p != aligned_p) {
-    mi_track_free_size(p, oversize);
-    mi_track_malloc(aligned_p, size, zero);
-  }
-  else {
-    mi_track_resize(aligned_p, oversize, size);
-  }
-  #endif
+    mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p));
+  }  
   return aligned_p;
 }
 
@@ -92,21 +93,13 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t*
 static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept
 {
   // note: we don't require `size > offset`, we just guarantee that the address at offset is aligned regardless of the allocated size.
-  mi_assert(alignment > 0);
   if mi_unlikely(alignment == 0 || !_mi_is_power_of_two(alignment)) { // require power-of-two (see <https://en.cppreference.com/w/c/memory/aligned_alloc>)
     #if MI_DEBUG > 0
     _mi_error_message(EOVERFLOW, "aligned allocation requires the alignment to be a power-of-two (size %zu, alignment %zu)\n", size, alignment);
     #endif
     return NULL;
   }
-  /*
-  if mi_unlikely(alignment > MI_ALIGNMENT_MAX) {  // we cannot align at a boundary larger than this (or otherwise we cannot find segment headers)
-    #if MI_DEBUG > 0
-    _mi_error_message(EOVERFLOW, "aligned allocation has a maximum alignment of %zu (size %zu, alignment %zu)\n", MI_ALIGNMENT_MAX, size, alignment);
-    #endif
-    return NULL;
-  }
-  */
+
   if mi_unlikely(size > PTRDIFF_MAX) {          // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
     #if MI_DEBUG > 0
     _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment);
@@ -146,9 +139,9 @@ mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* he
 }
 
 mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
+  if mi_unlikely(alignment == 0 || !_mi_is_power_of_two(alignment)) return NULL;
   #if !MI_PADDING
   // without padding, any small sized allocation is naturally aligned (see also `_mi_segment_page_start`)
-  if (!_mi_is_power_of_two(alignment)) return NULL;
   if mi_likely(_mi_is_power_of_two(size) && size >= alignment && size <= MI_SMALL_SIZE_MAX)
   #else
   // with padding, we can only guarantee this for fixed alignments
@@ -164,6 +157,11 @@ mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap,
   }
 }
 
+// ensure a definition is emitted
+#if defined(__cplusplus)
+static void* _mi_heap_malloc_aligned = (void*)&mi_heap_malloc_aligned;
+#endif
+
 // ------------------------------------------------------
 // Aligned Allocation
 // ------------------------------------------------------
@@ -187,27 +185,27 @@ mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap,
 }
 
 mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_malloc_aligned_at(mi_get_default_heap(), size, alignment, offset);
+  return mi_heap_malloc_aligned_at(mi_prim_get_default_heap(), size, alignment, offset);
 }
 
 mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
-  return mi_heap_malloc_aligned(mi_get_default_heap(), size, alignment);
+  return mi_heap_malloc_aligned(mi_prim_get_default_heap(), size, alignment);
 }
 
 mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_zalloc_aligned_at(mi_get_default_heap(), size, alignment, offset);
+  return mi_heap_zalloc_aligned_at(mi_prim_get_default_heap(), size, alignment, offset);
 }
 
 mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
-  return mi_heap_zalloc_aligned(mi_get_default_heap(), size, alignment);
+  return mi_heap_zalloc_aligned(mi_prim_get_default_heap(), size, alignment);
 }
 
 mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_calloc_aligned_at(mi_get_default_heap(), count, size, alignment, offset);
+  return mi_heap_calloc_aligned_at(mi_prim_get_default_heap(), count, size, alignment, offset);
 }
 
 mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept {
-  return mi_heap_calloc_aligned(mi_get_default_heap(), count, size, alignment);
+  return mi_heap_calloc_aligned(mi_prim_get_default_heap(), count, size, alignment);
 }
 
 
@@ -225,19 +223,13 @@ static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t ne
     return p;  // reallocation still fits, is aligned and not more than 50% waste
   }
   else {
+    // note: we don't zero allocate upfront so we only zero initialize the expanded part
     void* newp = mi_heap_malloc_aligned_at(heap,newsize,alignment,offset);
     if (newp != NULL) {
       if (zero && newsize > size) {
-        const mi_page_t* page = _mi_ptr_page(newp);
-        if (page->is_zero) {
-          // already zero initialized
-          mi_assert_expensive(mi_mem_is_zero(newp,newsize));
-        }
-        else {
-          // also set last word in the previous allocation to zero to ensure any padding is zero-initialized
-          size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
-          memset((uint8_t*)newp + start, 0, newsize - start);
-        }
+        // also set last word in the previous allocation to zero to ensure any padding is zero-initialized
+        size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
+        _mi_memzero((uint8_t*)newp + start, newsize - start);
       }
       _mi_memcpy_aligned(newp, p, (newsize > size ? size : newsize));
       mi_free(p); // only free if successful
@@ -282,25 +274,25 @@ mi_decl_nodiscard void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_
 }
 
 mi_decl_nodiscard void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_realloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset);
+  return mi_heap_realloc_aligned_at(mi_prim_get_default_heap(), p, newsize, alignment, offset);
 }
 
 mi_decl_nodiscard void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
-  return mi_heap_realloc_aligned(mi_get_default_heap(), p, newsize, alignment);
+  return mi_heap_realloc_aligned(mi_prim_get_default_heap(), p, newsize, alignment);
 }
 
 mi_decl_nodiscard void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_rezalloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset);
+  return mi_heap_rezalloc_aligned_at(mi_prim_get_default_heap(), p, newsize, alignment, offset);
 }
 
 mi_decl_nodiscard void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
-  return mi_heap_rezalloc_aligned(mi_get_default_heap(), p, newsize, alignment);
+  return mi_heap_rezalloc_aligned(mi_prim_get_default_heap(), p, newsize, alignment);
 }
 
 mi_decl_nodiscard void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
-  return mi_heap_recalloc_aligned_at(mi_get_default_heap(), p, newcount, size, alignment, offset);
+  return mi_heap_recalloc_aligned_at(mi_prim_get_default_heap(), p, newcount, size, alignment, offset);
 }
 
 mi_decl_nodiscard void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
-  return mi_heap_recalloc_aligned(mi_get_default_heap(), p, newcount, size, alignment);
+  return mi_heap_recalloc_aligned(mi_prim_get_default_heap(), p, newcount, size, alignment);
 }
diff --git a/Objects/mimalloc/alloc-override.c b/Objects/mimalloc/alloc-override.c
new file mode 100644
index 00000000000000..873065dc63495c
--- /dev/null
+++ b/Objects/mimalloc/alloc-override.c
@@ -0,0 +1,297 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+#if !defined(MI_IN_ALLOC_C)
+#error "this file should be included from 'alloc.c' (so aliases can work)"
+#endif
+
+#if defined(MI_MALLOC_OVERRIDE) && defined(_WIN32) && !(defined(MI_SHARED_LIB) && defined(_DLL))
+#error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)"
+#endif
+
+#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32))
+
+#if defined(__APPLE__)
+#include <AvailabilityMacros.h>
+mi_decl_externc void   vfree(void* p);
+mi_decl_externc size_t malloc_size(const void* p);
+mi_decl_externc size_t malloc_good_size(size_t size);
+#endif
+
+// helper definition for C override of C++ new
+typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t;
+
+// ------------------------------------------------------
+// Override system malloc
+// ------------------------------------------------------
+
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__) && !MI_TRACK_ENABLED
+  // gcc, clang: use aliasing to alias the exported function to one of our `mi_` functions
+  #if (defined(__GNUC__) && __GNUC__ >= 9)
+    #pragma GCC diagnostic ignored "-Wattributes"  // or we get warnings that nodiscard is ignored on a forward
+    #define MI_FORWARD(fun)      __attribute__((alias(#fun), used, visibility("default"), copy(fun)));
+  #else
+    #define MI_FORWARD(fun)      __attribute__((alias(#fun), used, visibility("default")));
+  #endif
+  #define MI_FORWARD1(fun,x)      MI_FORWARD(fun)
+  #define MI_FORWARD2(fun,x,y)    MI_FORWARD(fun)
+  #define MI_FORWARD3(fun,x,y,z)  MI_FORWARD(fun)
+  #define MI_FORWARD0(fun,x)      MI_FORWARD(fun)
+  #define MI_FORWARD02(fun,x,y)   MI_FORWARD(fun)
+#else
+  // otherwise use forwarding by calling our `mi_` function
+  #define MI_FORWARD1(fun,x)      { return fun(x); }
+  #define MI_FORWARD2(fun,x,y)    { return fun(x,y); }
+  #define MI_FORWARD3(fun,x,y,z)  { return fun(x,y,z); }
+  #define MI_FORWARD0(fun,x)      { fun(x); }
+  #define MI_FORWARD02(fun,x,y)   { fun(x,y); }
+#endif
+
+
+#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_OSX_INTERPOSE)
+  // define MI_OSX_IS_INTERPOSED as we should not provide forwarding definitions for
+  // functions that are interposed (or the interposing does not work)
+  #define MI_OSX_IS_INTERPOSED
+
+  mi_decl_externc size_t mi_malloc_size_checked(void *p) {
+    if (!mi_is_in_heap_region(p)) return 0;
+    return mi_usable_size(p);
+  }
+
+  // use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1`
+  // See: <https://books.google.com/books?id=K8vUkpOXhN4C&pg=PA73>
+  struct mi_interpose_s {
+    const void* replacement;
+    const void* target;
+  };
+  #define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun }
+  #define MI_INTERPOSE_MI(fun)            MI_INTERPOSE_FUN(fun,mi_##fun)
+
+  __attribute__((used)) static struct mi_interpose_s _mi_interposes[]  __attribute__((section("__DATA, __interpose"))) =
+  {
+    MI_INTERPOSE_MI(malloc),
+    MI_INTERPOSE_MI(calloc),
+    MI_INTERPOSE_MI(realloc),
+    MI_INTERPOSE_MI(strdup),
+    MI_INTERPOSE_MI(strndup),
+    MI_INTERPOSE_MI(realpath),
+    MI_INTERPOSE_MI(posix_memalign),
+    MI_INTERPOSE_MI(reallocf),
+    MI_INTERPOSE_MI(valloc),
+    MI_INTERPOSE_FUN(malloc_size,mi_malloc_size_checked),
+    MI_INTERPOSE_MI(malloc_good_size),
+    #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15
+    MI_INTERPOSE_MI(aligned_alloc),
+    #endif
+    #ifdef MI_OSX_ZONE
+    // we interpose malloc_default_zone in alloc-override-osx.c so we can use mi_free safely
+    MI_INTERPOSE_MI(free),
+    MI_INTERPOSE_FUN(vfree,mi_free),
+    #else
+    // sometimes code allocates from default zone but deallocates using plain free :-( (like NxHashResizeToCapacity <https://github.com/nneonneo/osx-10.9-opensource/blob/master/objc4-551.1/runtime/hashtable2.mm>)
+    MI_INTERPOSE_FUN(free,mi_cfree), // use safe free that checks if pointers are from us
+    MI_INTERPOSE_FUN(vfree,mi_cfree),
+    #endif
+  };
+
+  #ifdef __cplusplus
+  extern "C" {
+  #endif
+  void  _ZdlPv(void* p);   // delete
+  void  _ZdaPv(void* p);   // delete[]
+  void  _ZdlPvm(void* p, size_t n);  // delete
+  void  _ZdaPvm(void* p, size_t n);  // delete[]
+  void* _Znwm(size_t n);  // new
+  void* _Znam(size_t n);  // new[]
+  void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag); // new nothrow
+  void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag); // new[] nothrow
+  #ifdef __cplusplus
+  }
+  #endif
+  __attribute__((used)) static struct mi_interpose_s _mi_cxx_interposes[]  __attribute__((section("__DATA, __interpose"))) =
+  {
+    MI_INTERPOSE_FUN(_ZdlPv,mi_free),
+    MI_INTERPOSE_FUN(_ZdaPv,mi_free),
+    MI_INTERPOSE_FUN(_ZdlPvm,mi_free_size),
+    MI_INTERPOSE_FUN(_ZdaPvm,mi_free_size),
+    MI_INTERPOSE_FUN(_Znwm,mi_new),
+    MI_INTERPOSE_FUN(_Znam,mi_new),
+    MI_INTERPOSE_FUN(_ZnwmRKSt9nothrow_t,mi_new_nothrow),
+    MI_INTERPOSE_FUN(_ZnamRKSt9nothrow_t,mi_new_nothrow),
+  };
+
+#elif defined(_MSC_VER)
+  // cannot override malloc unless using a dll.
+  // we just override new/delete which does work in a static library.
+#else
+  // On all other systems forward to our API
+  mi_decl_export void* malloc(size_t size)              MI_FORWARD1(mi_malloc, size)
+  mi_decl_export void* calloc(size_t size, size_t n)    MI_FORWARD2(mi_calloc, size, n)
+  mi_decl_export void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize)
+  mi_decl_export void  free(void* p)                    MI_FORWARD0(mi_free, p)
+#endif
+
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__)
+#pragma GCC visibility push(default)
+#endif
+
+// ------------------------------------------------------
+// Override new/delete
+// This is not really necessary as they usually call
+// malloc/free anyway, but it improves performance.
+// ------------------------------------------------------
+#ifdef __cplusplus
+  // ------------------------------------------------------
+  // With a C++ compiler we override the new/delete operators.
+  // see <https://en.cppreference.com/w/cpp/memory/new/operator_new>
+  // ------------------------------------------------------
+  #include <new>
+
+  #ifndef MI_OSX_IS_INTERPOSED
+    void operator delete(void* p) noexcept              MI_FORWARD0(mi_free,p)
+    void operator delete[](void* p) noexcept            MI_FORWARD0(mi_free,p)
+
+    void* operator new(std::size_t n) noexcept(false)   MI_FORWARD1(mi_new,n)
+    void* operator new[](std::size_t n) noexcept(false) MI_FORWARD1(mi_new,n)
+
+    void* operator new  (std::size_t n, const std::nothrow_t& tag) noexcept { MI_UNUSED(tag); return mi_new_nothrow(n); }
+    void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { MI_UNUSED(tag); return mi_new_nothrow(n); }
+
+    #if (__cplusplus >= 201402L || _MSC_VER >= 1916)
+    void operator delete  (void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n)
+    void operator delete[](void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n)
+    #endif
+  #endif
+
+  #if (__cplusplus > 201402L && defined(__cpp_aligned_new)) && (!defined(__GNUC__) || (__GNUC__ > 5))
+  void operator delete  (void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+  void operator delete[](void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+  void operator delete  (void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); };
+  void operator delete[](void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); };
+  void operator delete  (void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+  void operator delete[](void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
+
+  void* operator new( std::size_t n, std::align_val_t al)   noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); }
+  void* operator new[]( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); }
+  void* operator new  (std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast<size_t>(al)); }
+  void* operator new[](std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast<size_t>(al)); }
+  #endif
+
+#elif (defined(__GNUC__) || defined(__clang__))
+  // ------------------------------------------------------
+  // Override by defining the mangled C++ names of the operators (as
+  // used by GCC and CLang).
+  // See <https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling>
+  // ------------------------------------------------------
+
+  void _ZdlPv(void* p)            MI_FORWARD0(mi_free,p) // delete
+  void _ZdaPv(void* p)            MI_FORWARD0(mi_free,p) // delete[]
+  void _ZdlPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n)
+  void _ZdaPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n)
+  void _ZdlPvSt11align_val_t(void* p, size_t al)            { mi_free_aligned(p,al); }
+  void _ZdaPvSt11align_val_t(void* p, size_t al)            { mi_free_aligned(p,al); }
+  void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); }
+  void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); }
+
+  #if (MI_INTPTR_SIZE==8)
+    void* _Znwm(size_t n)                             MI_FORWARD1(mi_new,n)  // new 64-bit
+    void* _Znam(size_t n)                             MI_FORWARD1(mi_new,n)  // new[] 64-bit
+    void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
+    void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
+    void* _ZnwmSt11align_val_t(size_t n, size_t al)   MI_FORWARD2(mi_new_aligned, n, al)
+    void* _ZnamSt11align_val_t(size_t n, size_t al)   MI_FORWARD2(mi_new_aligned, n, al)
+    void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
+    void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
+  #elif (MI_INTPTR_SIZE==4)
+    void* _Znwj(size_t n)                             MI_FORWARD1(mi_new,n)  // new 64-bit
+    void* _Znaj(size_t n)                             MI_FORWARD1(mi_new,n)  // new[] 64-bit
+    void* _ZnwjRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
+    void* _ZnajRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
+    void* _ZnwjSt11align_val_t(size_t n, size_t al)   MI_FORWARD2(mi_new_aligned, n, al)
+    void* _ZnajSt11align_val_t(size_t n, size_t al)   MI_FORWARD2(mi_new_aligned, n, al)
+    void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
+    void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
+  #else
+    #error "define overloads for new/delete for this platform (just for performance, can be skipped)"
+  #endif
+#endif // __cplusplus
+
+// ------------------------------------------------------
+// Further Posix & Unix functions definitions
+// ------------------------------------------------------
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MI_OSX_IS_INTERPOSED
+  // Forward Posix/Unix calls as well
+  void*  reallocf(void* p, size_t newsize) MI_FORWARD2(mi_reallocf,p,newsize)
+  size_t malloc_size(const void* p)        MI_FORWARD1(mi_usable_size,p)
+  #if !defined(__ANDROID__) && !defined(__FreeBSD__)
+  size_t malloc_usable_size(void *p)       MI_FORWARD1(mi_usable_size,p)
+  #else
+  size_t malloc_usable_size(const void *p) MI_FORWARD1(mi_usable_size,p)
+  #endif
+
+  // No forwarding here due to aliasing/name mangling issues
+  void*  valloc(size_t size)               { return mi_valloc(size); }
+  void   vfree(void* p)                    { mi_free(p); }
+  size_t malloc_good_size(size_t size)     { return mi_malloc_good_size(size); }
+  int    posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p, alignment, size); }
+
+  // `aligned_alloc` is only available when __USE_ISOC11 is defined.
+  // Note: it seems __USE_ISOC11 is not defined in musl (and perhaps other libc's) so we only check
+  // for it if using glibc.
+  // Note: Conda has a custom glibc where `aligned_alloc` is declared `static inline` and we cannot
+  // override it, but both _ISOC11_SOURCE and __USE_ISOC11 are undefined in Conda GCC7 or GCC9.
+  // Fortunately, in the case where `aligned_alloc` is declared as `static inline` it
+  // uses internally `memalign`, `posix_memalign`, or `_aligned_malloc` so we  can avoid overriding it ourselves.
+  #if !defined(__GLIBC__) || __USE_ISOC11
+  void* aligned_alloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); }
+  #endif
+#endif
+
+// no forwarding here due to aliasing/name mangling issues
+void  cfree(void* p)                                    { mi_free(p); }
+void* pvalloc(size_t size)                              { return mi_pvalloc(size); }
+void* reallocarray(void* p, size_t count, size_t size)  { return mi_reallocarray(p, count, size); }
+int   reallocarr(void* p, size_t count, size_t size)    { return mi_reallocarr(p, count, size); }
+void* memalign(size_t alignment, size_t size)           { return mi_memalign(alignment, size); }
+void* _aligned_malloc(size_t alignment, size_t size)    { return mi_aligned_alloc(alignment, size); }
+
+#if defined(__wasi__)
+  // forward __libc interface (see PR #667)
+  void* __libc_malloc(size_t size)                      MI_FORWARD1(mi_malloc, size)
+  void* __libc_calloc(size_t count, size_t size)        MI_FORWARD2(mi_calloc, count, size)
+  void* __libc_realloc(void* p, size_t size)            MI_FORWARD2(mi_realloc, p, size)
+  void  __libc_free(void* p)                            MI_FORWARD0(mi_free, p)
+  void* __libc_memalign(size_t alignment, size_t size)  { return mi_memalign(alignment, size); }
+
+#elif defined(__GLIBC__) && defined(__linux__)
+  // forward __libc interface (needed for glibc-based Linux distributions)
+  void* __libc_malloc(size_t size)                      MI_FORWARD1(mi_malloc,size)
+  void* __libc_calloc(size_t count, size_t size)        MI_FORWARD2(mi_calloc,count,size)
+  void* __libc_realloc(void* p, size_t size)            MI_FORWARD2(mi_realloc,p,size)
+  void  __libc_free(void* p)                            MI_FORWARD0(mi_free,p)
+  void  __libc_cfree(void* p)                           MI_FORWARD0(mi_free,p)
+
+  void* __libc_valloc(size_t size)                      { return mi_valloc(size); }
+  void* __libc_pvalloc(size_t size)                     { return mi_pvalloc(size); }
+  void* __libc_memalign(size_t alignment, size_t size)  { return mi_memalign(alignment,size); }
+  int   __posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p,alignment,size); }
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__)
+#pragma GCC visibility pop
+#endif
+
+#endif // MI_MALLOC_OVERRIDE && !_WIN32
diff --git a/Objects/mimalloc/alloc-posix.c b/Objects/mimalloc/alloc-posix.c
index e6505f2907de22..225752fd8707fe 100644
--- a/Objects/mimalloc/alloc-posix.c
+++ b/Objects/mimalloc/alloc-posix.c
@@ -10,7 +10,7 @@ terms of the MIT license. A copy of the license can be found in the file
 // for convenience and used when overriding these functions.
 // ------------------------------------------------------------------------
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
+#include "mimalloc/internal.h"
 
 // ------------------------------------------------------
 // Posix & Unix functions definitions
@@ -56,7 +56,8 @@ int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept
   // Note: The spec dictates we should not modify `*p` on an error. (issue#27)
   // <http://man7.org/linux/man-pages/man3/posix_memalign.3.html>
   if (p == NULL) return EINVAL;
-  if (alignment % sizeof(void*) != 0) return EINVAL;                   // natural alignment
+  if ((alignment % sizeof(void*)) != 0) return EINVAL;                 // natural alignment
+  // it is also required that alignment is a power of 2 and > 0; this is checked in `mi_malloc_aligned`
   if (alignment==0 || !_mi_is_power_of_two(alignment)) return EINVAL;  // not a power of 2
   void* q = mi_malloc_aligned(size, alignment);
   if (q==NULL && size != 0) return ENOMEM;
@@ -149,7 +150,7 @@ int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept {
   else {
     *buf = mi_strdup(p);
     if (*buf==NULL) return ENOMEM;
-    if (size != NULL) *size = strlen(p);
+    if (size != NULL) *size = _mi_strlen(p);
   }
   return 0;
 }
diff --git a/Objects/mimalloc/alloc.c b/Objects/mimalloc/alloc.c
index 342a4bdf017681..ffc1747d527d11 100644
--- a/Objects/mimalloc/alloc.c
+++ b/Objects/mimalloc/alloc.c
@@ -9,12 +9,16 @@ terms of the MIT license. A copy of the license can be found in the file
 #endif
 
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"   // _mi_prim_thread_id()
 
+#include <string.h>      // memset, strlen (for mi_strdup)
+#include <stdlib.h>      // malloc, abort
 
-#include <string.h>  // memset, strlen
-#include <stdlib.h>  // malloc, exit
+#define MI_IN_ALLOC_C
+#include "alloc-override.c"
+#undef MI_IN_ALLOC_C
 
 // ------------------------------------------------------
 // Allocation
@@ -33,21 +37,32 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz
   page->used++;
   page->free = mi_block_next(page, block);
   mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page);
+  #if MI_DEBUG>3
+  if (page->free_is_zero) {
+    mi_assert_expensive(mi_mem_is_zero(block+1,size - sizeof(*block)));
+  }
+  #endif
 
   // allow use of the block internally
   // note: when tracking we need to avoid ever touching the MI_PADDING since
-  // that is tracked by valgrind etc. as non-accessible (through the red-zone, see `mimalloc-track.h`)
+  // that is tracked by valgrind etc. as non-accessible (through the red-zone, see `mimalloc/track.h`)
   mi_track_mem_undefined(block, mi_page_usable_block_size(page));
 
   // zero the block? note: we need to zero the full block size (issue #63)
   if mi_unlikely(zero) {
     mi_assert_internal(page->xblock_size != 0); // do not call with zero'ing for huge blocks (see _mi_malloc_generic)
-    const size_t zsize = (page->is_zero ? sizeof(block->next) + MI_PADDING_SIZE : page->xblock_size);
-    _mi_memzero_aligned(block, zsize - MI_PADDING_SIZE);
+    mi_assert_internal(page->xblock_size >= MI_PADDING_SIZE);
+    if (page->free_is_zero) {
+      block->next = 0;
+      mi_track_mem_defined(block, page->xblock_size - MI_PADDING_SIZE);
+    }
+    else {
+      _mi_memzero_aligned(block, page->xblock_size - MI_PADDING_SIZE);
+    }    
   }
 
-#if (MI_DEBUG>0) && !MI_TRACK_ENABLED
-  if (!page->is_zero && !zero && !mi_page_is_huge(page)) {
+#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN
+  if (!zero && !mi_page_is_huge(page)) {
     memset(block, MI_DEBUG_UNINIT, mi_page_usable_block_size(page));
   }
 #elif (MI_SECURE!=0)
@@ -66,20 +81,22 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz
   }
 #endif
 
-#if (MI_PADDING > 0) && defined(MI_ENCODE_FREELIST) && !MI_TRACK_ENABLED
+#if MI_PADDING // && !MI_TRACK_ENABLED
   mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page));
   ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE));
-  #if (MI_DEBUG>1)
+  #if (MI_DEBUG>=2)
   mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta));
-  mi_track_mem_defined(padding,sizeof(mi_padding_t));  // note: re-enable since mi_page_usable_block_size may set noaccess
   #endif
+  mi_track_mem_defined(padding,sizeof(mi_padding_t));  // note: re-enable since mi_page_usable_block_size may set noaccess
   padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys));
   padding->delta  = (uint32_t)(delta);
+  #if MI_PADDING_CHECK
   if (!mi_page_is_huge(page)) {
     uint8_t* fill = (uint8_t*)padding - delta;
     const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes
     for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; }
   }
+  #endif
 #endif
 
   return block;
@@ -92,21 +109,23 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap,
   mi_assert(heap->thread_id == 0 || heap->thread_id == tid); // heaps are thread local
   #endif
   mi_assert(size <= MI_SMALL_SIZE_MAX);
-#if (MI_PADDING)
-  if (size == 0) {
-    size = sizeof(void*);
-  }
-#endif
+  #if (MI_PADDING)
+  if (size == 0) { size = sizeof(void*); }
+  #endif
   mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE);
-  void* p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE, zero);
-  mi_assert_internal(p == NULL || mi_usable_size(p) >= size);
-#if MI_STAT>1
+  void* const p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE, zero);  
+  mi_track_malloc(p,size,zero);
+  #if MI_STAT>1
   if (p != NULL) {
-    if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); }
+    if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); }
     mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
   }
-#endif
-  mi_track_malloc(p,size,zero);
+  #endif
+  #if MI_DEBUG>3
+  if (p != NULL && zero) {
+    mi_assert_expensive(mi_mem_is_zero(p, size));
+  }
+  #endif
   return p;
 }
 
@@ -116,7 +135,7 @@ mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc_small(mi_h
 }
 
 mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept {
-  return mi_heap_malloc_small(mi_get_default_heap(), size);
+  return mi_heap_malloc_small(mi_prim_get_default_heap(), size);
 }
 
 // The main allocation function
@@ -129,14 +148,18 @@ extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool z
     mi_assert(heap!=NULL);
     mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id());   // heaps are thread local
     void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment);  // note: size can overflow but it is detected in malloc_generic
-    mi_assert_internal(p == NULL || mi_usable_size(p) >= size);
+    mi_track_malloc(p,size,zero);
     #if MI_STAT>1
     if (p != NULL) {
-      if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); }
+      if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); }
       mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
     }
     #endif
-    mi_track_malloc(p,size,zero);
+    #if MI_DEBUG>3
+    if (p != NULL && zero) {
+      mi_assert_expensive(mi_mem_is_zero(p, size));
+    }
+    #endif
     return p;
   }
 }
@@ -150,12 +173,12 @@ mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc(mi_heap_t*
 }
 
 mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept {
-  return mi_heap_malloc(mi_get_default_heap(), size);
+  return mi_heap_malloc(mi_prim_get_default_heap(), size);
 }
 
 // zero initialized small block
 mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept {
-  return mi_heap_malloc_small_zero(mi_get_default_heap(), size, true);
+  return mi_heap_malloc_small_zero(mi_prim_get_default_heap(), size, true);
 }
 
 mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
@@ -163,7 +186,7 @@ mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_zalloc(mi_heap_t*
 }
 
 mi_decl_nodiscard mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept {
-  return mi_heap_zalloc(mi_get_default_heap(),size);
+  return mi_heap_zalloc(mi_prim_get_default_heap(),size);
 }
 
 
@@ -221,7 +244,7 @@ static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block
 // Check for heap block overflow by setting up padding at the end of the block
 // ---------------------------------------------------------------------------
 
-#if (MI_PADDING>0) && defined(MI_ENCODE_FREELIST) && !MI_TRACK_ENABLED
+#if MI_PADDING // && !MI_TRACK_ENABLED
 static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) {
   *bsize = mi_page_usable_block_size(page);
   const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize);
@@ -245,6 +268,40 @@ static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* bl
   return (ok ? bsize - delta : 0);
 }
 
+// When a non-thread-local block is freed, it becomes part of the thread delayed free
+// list that is freed later by the owning heap. If the exact usable size is too small to
+// contain the pointer for the delayed list, then shrink the padding (by decreasing delta)
+// so it will later not trigger an overflow error in `mi_free_block`.
+void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
+  size_t bsize;
+  size_t delta;
+  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
+  mi_assert_internal(ok);
+  if (!ok || (bsize - delta) >= min_size) return;  // usually already enough space
+  mi_assert_internal(bsize >= min_size);
+  if (bsize < min_size) return;  // should never happen
+  size_t new_delta = (bsize - min_size);
+  mi_assert_internal(new_delta < bsize);
+  mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize);
+  mi_track_mem_defined(padding,sizeof(mi_padding_t));
+  padding->delta = (uint32_t)new_delta;
+  mi_track_mem_noaccess(padding,sizeof(mi_padding_t));
+}
+#else
+static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(block);
+  return mi_page_usable_block_size(page);
+}
+
+void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
+  MI_UNUSED(page);
+  MI_UNUSED(block);
+  MI_UNUSED(min_size);
+}
+#endif
+
+#if MI_PADDING && MI_PADDING_CHECK
+
 static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) {
   size_t bsize;
   size_t delta;
@@ -277,39 +334,13 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
   }
 }
 
-// When a non-thread-local block is freed, it becomes part of the thread delayed free
-// list that is freed later by the owning heap. If the exact usable size is too small to
-// contain the pointer for the delayed list, then shrink the padding (by decreasing delta)
-// so it will later not trigger an overflow error in `mi_free_block`.
-static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
-  size_t bsize;
-  size_t delta;
-  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
-  mi_assert_internal(ok);
-  if (!ok || (bsize - delta) >= min_size) return;  // usually already enough space
-  mi_assert_internal(bsize >= min_size);
-  if (bsize < min_size) return;  // should never happen
-  size_t new_delta = (bsize - min_size);
-  mi_assert_internal(new_delta < bsize);
-  mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize);
-  padding->delta = (uint32_t)new_delta;
-}
 #else
+
 static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
   MI_UNUSED(page);
   MI_UNUSED(block);
 }
 
-static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
-  MI_UNUSED(block);
-  return mi_page_usable_block_size(page);
-}
-
-static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
-  MI_UNUSED(page);
-  MI_UNUSED(block);
-  MI_UNUSED(min_size);
-}
 #endif
 
 // only maintain stats for smaller objects if requested
@@ -373,7 +404,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
   // The padding check may access the non-thread-owned page for the key values.
   // that is safe as these are constant and the page won't be freed (as the block is not freed yet).
   mi_check_padding(page, block);
-  mi_padding_shrink(page, block, sizeof(mi_block_t));       // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection
+  _mi_padding_shrink(page, block, sizeof(mi_block_t));       // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection
   
   // huge page segments are always abandoned and can be freed immediately
   mi_segment_t* segment = _mi_page_segment(page);
@@ -391,8 +422,8 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
     #endif
   }
   
-  #if (MI_DEBUG!=0) && !MI_TRACK_ENABLED                    // note: when tracking, cannot use mi_usable_size with multi-threading
-  if (segment->kind != MI_SEGMENT_HUGE) {                   // not for huge segments as we just reset the content
+  #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN        // note: when tracking, cannot use mi_usable_size with multi-threading
+  if (segment->kind != MI_SEGMENT_HUGE) {                  // not for huge segments as we just reset the content
     memset(block, MI_DEBUG_FREED, mi_usable_size(block));
   }
   #endif
@@ -445,7 +476,7 @@ static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block
     // owning thread can free a block directly
     if mi_unlikely(mi_check_is_double_free(page, block)) return;
     mi_check_padding(page, block);
-    #if (MI_DEBUG!=0) && !MI_TRACK_ENABLED
+    #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN
     if (!mi_page_is_huge(page)) {   // huge page content may be already decommitted
       memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
     }
@@ -477,8 +508,8 @@ mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* p
 
 void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept {
   mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p);
-  mi_stat_free(page, block);                 // stat_free may access the padding
-  mi_track_free(p);
+  mi_stat_free(page, block);    // stat_free may access the padding
+  mi_track_free_size(block, mi_page_usable_size_of(page,block));
   _mi_free_block(page, is_local, block);
 }
 
@@ -531,7 +562,7 @@ void mi_free(void* p) mi_attr_noexcept
 {
   if mi_unlikely(p == NULL) return;
   mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free");
-  const bool          is_local= (_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id));
+  const bool          is_local= (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id));
   mi_page_t* const    page    = _mi_segment_page_of(segment, p);
 
   if mi_likely(is_local) {                       // thread-local free?
@@ -541,10 +572,10 @@ void mi_free(void* p) mi_attr_noexcept
       if mi_unlikely(mi_check_is_double_free(page, block)) return;
       mi_check_padding(page, block);
       mi_stat_free(page, block);
-      #if (MI_DEBUG!=0) && !MI_TRACK_ENABLED
+      #if (MI_DEBUG>0) && !MI_TRACK_ENABLED  && !MI_TSAN
       memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
       #endif
-      mi_track_free(p);
+      mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned
       mi_block_set_next(page, block, page->local_free);
       page->local_free = block;
       if mi_unlikely(--page->used == 0) {   // using this expression generates better code than: page->used--; if (mi_page_all_free(page))
@@ -644,7 +675,7 @@ mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t*
 }
 
 mi_decl_nodiscard mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept {
-  return mi_heap_calloc(mi_get_default_heap(),count,size);
+  return mi_heap_calloc(mi_prim_get_default_heap(),count,size);
 }
 
 // Uninitialized `calloc`
@@ -655,7 +686,7 @@ mi_decl_nodiscard extern mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap,
 }
 
 mi_decl_nodiscard mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept {
-  return mi_heap_mallocn(mi_get_default_heap(),count,size);
+  return mi_heap_mallocn(mi_prim_get_default_heap(),count,size);
 }
 
 // Expand (or shrink) in place (or fail)
@@ -678,9 +709,10 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero)
   // (this means that returning NULL always indicates an error, and `p` will not have been freed in that case.)
   const size_t size = _mi_usable_size(p,"mi_realloc"); // also works if p == NULL (with size 0)
   if mi_unlikely(newsize <= size && newsize >= (size / 2) && newsize > 0) {  // note: newsize must be > 0 or otherwise we return NULL for realloc(NULL,0)
-    // todo: adjust potential padding to reflect the new size?
-    mi_track_free_size(p, size);
-    mi_track_malloc(p,newsize,true);
+    mi_assert_internal(p!=NULL);
+    // todo: do not track as the usable size is still the same in the free; adjust potential padding?
+    // mi_track_resize(p,size,newsize)
+    // if (newsize < size) { mi_track_mem_noaccess((uint8_t*)p + newsize, size - newsize); }
     return p;  // reallocation still fits and not more than 50% waste
   }
   void* newp = mi_heap_malloc(heap,newsize);
@@ -688,14 +720,15 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero)
     if (zero && newsize > size) {
       // also set last word in the previous allocation to zero to ensure any padding is zero-initialized
       const size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
-      memset((uint8_t*)newp + start, 0, newsize - start);
+      _mi_memzero((uint8_t*)newp + start, newsize - start);
+    }
+    else if (newsize == 0) {
+      ((uint8_t*)newp)[0] = 0; // work around for applications that expect zero-reallocation to be zero initialized (issue #725)
     }
     if mi_likely(p != NULL) {
-      if mi_likely(_mi_is_aligned(p, sizeof(uintptr_t))) {  // a client may pass in an arbitrary pointer `p`..
-        const size_t copysize = (newsize > size ? size : newsize);
-        mi_track_mem_defined(p,copysize);  // _mi_useable_size may be too large for byte precise memory tracking..
-        _mi_memcpy_aligned(newp, p, copysize);
-      }
+      const size_t copysize = (newsize > size ? size : newsize);
+      mi_track_mem_defined(p,copysize);  // _mi_useable_size may be too large for byte precise memory tracking..
+      _mi_memcpy(newp, p, copysize);
       mi_free(p); // only free the original pointer if successful
     }
   }
@@ -732,24 +765,24 @@ mi_decl_nodiscard void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count,
 
 
 mi_decl_nodiscard void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept {
-  return mi_heap_realloc(mi_get_default_heap(),p,newsize);
+  return mi_heap_realloc(mi_prim_get_default_heap(),p,newsize);
 }
 
 mi_decl_nodiscard void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept {
-  return mi_heap_reallocn(mi_get_default_heap(),p,count,size);
+  return mi_heap_reallocn(mi_prim_get_default_heap(),p,count,size);
 }
 
 // Reallocate but free `p` on errors
 mi_decl_nodiscard void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept {
-  return mi_heap_reallocf(mi_get_default_heap(),p,newsize);
+  return mi_heap_reallocf(mi_prim_get_default_heap(),p,newsize);
 }
 
 mi_decl_nodiscard void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept {
-  return mi_heap_rezalloc(mi_get_default_heap(), p, newsize);
+  return mi_heap_rezalloc(mi_prim_get_default_heap(), p, newsize);
 }
 
 mi_decl_nodiscard void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept {
-  return mi_heap_recalloc(mi_get_default_heap(), p, count, size);
+  return mi_heap_recalloc(mi_prim_get_default_heap(), p, count, size);
 }
 
 
@@ -770,7 +803,7 @@ mi_decl_nodiscard mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const c
 }
 
 mi_decl_nodiscard mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept {
-  return mi_heap_strdup(mi_get_default_heap(), s);
+  return mi_heap_strdup(mi_prim_get_default_heap(), s);
 }
 
 // `strndup` using mi_malloc
@@ -787,7 +820,7 @@ mi_decl_nodiscard mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const
 }
 
 mi_decl_nodiscard mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept {
-  return mi_heap_strndup(mi_get_default_heap(),s,n);
+  return mi_heap_strndup(mi_prim_get_default_heap(),s,n);
 }
 
 #ifndef __wasi__
@@ -856,7 +889,7 @@ char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name)
 #endif
 
 mi_decl_nodiscard mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept {
-  return mi_heap_realpath(mi_get_default_heap(),fname,resolved_name);
+  return mi_heap_realpath(mi_prim_get_default_heap(),fname,resolved_name);
 }
 #endif
 
@@ -869,7 +902,6 @@ use a C compiler we cannot throw a `bad_alloc` exception
 but we call `exit` instead (i.e. not returning).
 -------------------------------------------------------*/
 
-#if 0
 #ifdef __cplusplus
 #include <new>
 static bool mi_try_new_handler(bool nothrow) {
@@ -924,7 +956,7 @@ static bool mi_try_new_handler(bool nothrow) {
 }
 #endif
 
-static mi_decl_noinline void* mi_heap_try_new(mi_heap_t* heap, size_t size, bool nothrow ) {
+mi_decl_export mi_decl_noinline void* mi_heap_try_new(mi_heap_t* heap, size_t size, bool nothrow ) {
   void* p = NULL;
   while(p == NULL && mi_try_new_handler(nothrow)) {
     p = mi_heap_malloc(heap,size);
@@ -933,22 +965,22 @@ static mi_decl_noinline void* mi_heap_try_new(mi_heap_t* heap, size_t size, bool
 }
 
 static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow) {
-  return mi_heap_try_new(mi_get_default_heap(), size, nothrow);
+  return mi_heap_try_new(mi_prim_get_default_heap(), size, nothrow);
 }
 
 
-mi_decl_nodiscard mi_decl_restrict extern inline void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) {
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) {
   void* p = mi_heap_malloc(heap,size);
   if mi_unlikely(p == NULL) return mi_heap_try_new(heap, size, false);
   return p;
 }
 
 mi_decl_nodiscard mi_decl_restrict void* mi_new(size_t size) {
-  return mi_heap_alloc_new(mi_get_default_heap(), size);
+  return mi_heap_alloc_new(mi_prim_get_default_heap(), size);
 }
 
 
-mi_decl_nodiscard mi_decl_restrict extern inline void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) {
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) {
   size_t total;
   if mi_unlikely(mi_count_size_overflow(count, size, &total)) {
     mi_try_new_handler(false);  // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc
@@ -960,7 +992,7 @@ mi_decl_nodiscard mi_decl_restrict extern inline void* mi_heap_alloc_new_n(mi_he
 }
 
 mi_decl_nodiscard mi_decl_restrict void* mi_new_n(size_t count, size_t size) {
-  return mi_heap_alloc_new_n(mi_get_default_heap(), size, count);
+  return mi_heap_alloc_new_n(mi_prim_get_default_heap(), size, count);
 }
 
 
@@ -1006,7 +1038,6 @@ mi_decl_nodiscard void* mi_new_reallocn(void* p, size_t newcount, size_t size) {
     return mi_new_realloc(p, total);
   }
 }
-#endif
 
 // ------------------------------------------------------
 // ensure explicit external inline definitions are emitted!
@@ -1023,7 +1054,7 @@ void* _mi_externs[] = {
   (void*)&mi_heap_malloc,
   (void*)&mi_heap_zalloc,
   (void*)&mi_heap_malloc_small,
-  (void*)&mi_heap_alloc_new,
-  (void*)&mi_heap_alloc_new_n
+  // (void*)&mi_heap_alloc_new,
+  // (void*)&mi_heap_alloc_new_n
 };
 #endif
diff --git a/Objects/mimalloc/arena.c b/Objects/mimalloc/arena.c
index 80dd47869c9e24..a04a04c8fd3111 100644
--- a/Objects/mimalloc/arena.c
+++ b/Objects/mimalloc/arena.c
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2019-2022, Microsoft Research, Daan Leijen
+Copyright (c) 2019-2023, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -11,36 +11,22 @@ large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 4MiB).
 In contrast to the rest of mimalloc, the arenas are shared between
 threads and need to be accessed using atomic operations.
 
-Currently arenas are only used to for huge OS page (1GiB) reservations,
-or direct OS memory reservations -- otherwise it delegates to direct allocation from the OS.
-In the future, we can expose an API to manually add more kinds of arenas
-which is sometimes needed for embedded devices or shared memory for example.
-(We can also employ this with WASI or `sbrk` systems to reserve large arenas
- on demand and be able to reuse them efficiently).
+Arenas are used to for huge OS page (1GiB) reservations or for reserving
+OS memory upfront which can be improve performance or is sometimes needed
+on embedded devices. We can also employ this with WASI or `sbrk` systems 
+to reserve large arenas upfront and be able to reuse the memory more effectively.
 
 The arena allocation needs to be thread safe and we use an atomic bitmap to allocate.
 -----------------------------------------------------------------------------*/
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
 
 #include <string.h>  // memset
-#include <errno.h> // ENOMEM
+#include <errno.h>   // ENOMEM
 
 #include "bitmap.h"  // atomic bitmap
 
-
-// os.c
-void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* stats);
-void  _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats);
-
-void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize);
-void  _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats);
-
-bool  _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
-bool  _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
-
-
 /* -----------------------------------------------------------
   Arena allocation
 ----------------------------------------------------------- */
@@ -50,22 +36,25 @@ bool  _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
 typedef uintptr_t mi_block_info_t;
 #define MI_ARENA_BLOCK_SIZE   (MI_SEGMENT_SIZE)        // 64MiB  (must be at least MI_SEGMENT_ALIGN)
 #define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2)  // 32MiB
-#define MI_MAX_ARENAS         (64)                     // not more than 126 (since we use 7 bits in the memid and an arena index + 1)
+#define MI_MAX_ARENAS         (112)                    // not more than 126 (since we use 7 bits in the memid and an arena index + 1)
 
 // A memory arena descriptor
 typedef struct mi_arena_s {
   mi_arena_id_t id;                       // arena id; 0 for non-specific
-  bool     exclusive;                     // only allow allocations if specifically for this arena
+  mi_memid_t memid;                       // memid of the memory area
   _Atomic(uint8_t*) start;                // the start of the memory area
   size_t   block_count;                   // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`)
   size_t   field_count;                   // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`)
+  size_t   meta_size;                     // size of the arena structure itself (including its bitmaps)
+  mi_memid_t meta_memid;                  // memid of the arena structure itself (OS or static allocation)
   int      numa_node;                     // associated NUMA node
-  bool     is_zero_init;                  // is the arena zero initialized?
-  bool     allow_decommit;                // is decommit allowed? if true, is_large should be false and blocks_committed != NULL
-  bool     is_large;                      // large- or huge OS pages (always committed)
+  bool     exclusive;                     // only allow allocations if specifically for this arena  
+  bool     is_large;                      // memory area consists of large- or huge OS pages (always committed)
   _Atomic(size_t) search_idx;             // optimization to start the search for free blocks
+  _Atomic(mi_msecs_t) purge_expire;       // expiration time when blocks should be decommitted from `blocks_decommit`.  
   mi_bitmap_field_t* blocks_dirty;        // are the blocks potentially non-zero?
   mi_bitmap_field_t* blocks_committed;    // are the blocks committed? (can be NULL for memory that cannot be decommitted)
+  mi_bitmap_field_t* blocks_purge;        // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted)  
   mi_bitmap_field_t  blocks_inuse[1];     // in-place bitmap of in-use blocks (of size `field_count`)
 } mi_arena_t;
 
@@ -75,9 +64,10 @@ static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS];
 static mi_decl_cache_align _Atomic(size_t)      mi_arena_count; // = 0
 
 
+//static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept;
+
 /* -----------------------------------------------------------
   Arena id's
-  0 is used for non-arena's (like OS memory)
   id = arena_index + 1
 ----------------------------------------------------------- */
 
@@ -87,10 +77,7 @@ static size_t mi_arena_id_index(mi_arena_id_t id) {
 
 static mi_arena_id_t mi_arena_id_create(size_t arena_index) {
   mi_assert_internal(arena_index < MI_MAX_ARENAS);
-  mi_assert_internal(MI_MAX_ARENAS <= 126);
-  int id = (int)arena_index + 1;
-  mi_assert_internal(id >= 1 && id <= 127);
-  return id;
+  return (int)arena_index + 1;
 }
 
 mi_arena_id_t _mi_arena_id_none(void) {
@@ -102,46 +89,123 @@ static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclus
           (arena_id == req_arena_id));
 }
 
+bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) {
+  if (memid.memkind == MI_MEM_ARENA) {
+    return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id);
+  }
+  else {
+    return mi_arena_id_is_suitable(0, false, request_arena_id);
+  }
+}
+
+bool _mi_arena_memid_is_os_allocated(mi_memid_t memid) {
+  return (memid.memkind == MI_MEM_OS);
+}
 
 /* -----------------------------------------------------------
-  Arena allocations get a memory id where the lower 8 bits are
-  the arena id, and the upper bits the block index.
+  Arena allocations get a (currently) 16-bit memory id where the 
+  lower 8 bits are the arena id, and the upper bits the block index.
 ----------------------------------------------------------- */
 
-// Use `0` as a special id for direct OS allocated memory.
-#define MI_MEMID_OS   0
+static size_t mi_block_count_of_size(size_t size) {
+  return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE);
+}
+
+static size_t mi_arena_block_size(size_t bcount) {
+  return (bcount * MI_ARENA_BLOCK_SIZE);
+}
 
-static size_t mi_arena_memid_create(mi_arena_id_t id, bool exclusive, mi_bitmap_index_t bitmap_index) {
-  mi_assert_internal(((bitmap_index << 8) >> 8) == bitmap_index); // no overflow?
-  mi_assert_internal(id >= 0 && id <= 0x7F);
-  return ((bitmap_index << 8) | ((uint8_t)id & 0x7F) | (exclusive ? 0x80 : 0));
+static size_t mi_arena_size(mi_arena_t* arena) {
+  return mi_arena_block_size(arena->block_count);
 }
 
-static bool mi_arena_memid_indices(size_t arena_memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) {
-  *bitmap_index = (arena_memid >> 8);
-  mi_arena_id_t id = (int)(arena_memid & 0x7F);
-  *arena_index = mi_arena_id_index(id);
-  return ((arena_memid & 0x80) != 0);
+static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) {
+  mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA);
+  memid.mem.arena.id = id;
+  memid.mem.arena.block_index = bitmap_index;
+  memid.mem.arena.is_exclusive = is_exclusive;
+  return memid;
 }
 
-bool _mi_arena_memid_is_suitable(size_t arena_memid, mi_arena_id_t request_arena_id) {
-  mi_arena_id_t id = (int)(arena_memid & 0x7F);
-  bool exclusive = ((arena_memid & 0x80) != 0);
-  return mi_arena_id_is_suitable(id, exclusive, request_arena_id);
+static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) {
+  mi_assert_internal(memid.memkind == MI_MEM_ARENA);
+  *arena_index = mi_arena_id_index(memid.mem.arena.id);
+  *bitmap_index = memid.mem.arena.block_index;
+  return memid.mem.arena.is_exclusive;
 }
 
-static size_t mi_block_count_of_size(size_t size) {
-  return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE);
+
+
+/* -----------------------------------------------------------
+  Special static area for mimalloc internal structures
+  to avoid OS calls (for example, for the arena metadata)
+----------------------------------------------------------- */
+
+#define MI_ARENA_STATIC_MAX  (MI_INTPTR_SIZE*MI_KiB)  // 8 KiB on 64-bit
+
+static uint8_t mi_arena_static[MI_ARENA_STATIC_MAX];
+static _Atomic(size_t) mi_arena_static_top;
+
+static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) {
+  *memid = _mi_memid_none();
+  if (size == 0 || size > MI_ARENA_STATIC_MAX) return NULL;
+  if ((mi_atomic_load_relaxed(&mi_arena_static_top) + size) > MI_ARENA_STATIC_MAX) return NULL;
+
+  // try to claim space
+  if (alignment == 0) { alignment = 1; }
+  const size_t oversize = size + alignment - 1;
+  if (oversize > MI_ARENA_STATIC_MAX) return NULL;
+  const size_t oldtop = mi_atomic_add_acq_rel(&mi_arena_static_top, oversize);
+  size_t top = oldtop + oversize;
+  if (top > MI_ARENA_STATIC_MAX) {
+    // try to roll back, ok if this fails
+    mi_atomic_cas_strong_acq_rel(&mi_arena_static_top, &top, oldtop);
+    return NULL;
+  }
+
+  // success
+  *memid = _mi_memid_create(MI_MEM_STATIC);
+  const size_t start = _mi_align_up(oldtop, alignment);
+  uint8_t* const p = &mi_arena_static[start];
+  _mi_memzero(p, size);
+  return p;
+}
+
+static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) {
+  *memid = _mi_memid_none();
+
+  // try static
+  void* p = mi_arena_static_zalloc(size, MI_ALIGNMENT_MAX, memid);
+  if (p != NULL) return p;
+
+  // or fall back to the OS
+  return _mi_os_alloc(size, memid, stats);
+}
+
+static void mi_arena_meta_free(void* p, mi_memid_t memid, size_t size, mi_stats_t* stats) {
+  if (mi_memkind_is_os(memid.memkind)) {
+    _mi_os_free(p, size, memid, stats);
+  }
+  else {
+    mi_assert(memid.memkind == MI_MEM_STATIC);
+  }
 }
 
+static void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) {
+  return (arena->start + mi_arena_block_size(mi_bitmap_index_bit(bindex)));
+}
+
+
 /* -----------------------------------------------------------
   Thread safe allocation in an arena
 ----------------------------------------------------------- */
-static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx)
+
+// claim the `blocks_inuse` bits
+static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx)
 {
   size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx);  // start from last search; ok to be relaxed as the exact start does not matter
   if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx)) {
-    mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx));  // start search from found location next time around
+    mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx));  // start search from found location next time around    
     return true;
   };
   return false;
@@ -152,92 +216,116 @@ static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t*
   Arena Allocation
 ----------------------------------------------------------- */
 
-static mi_decl_noinline void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
-                                                   bool* commit, bool* large, bool* is_pinned, bool* is_zero, 
-                                                   mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld)
+static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
+                                                    bool commit, mi_memid_t* memid, mi_os_tld_t* tld)
 {
   MI_UNUSED(arena_index);
   mi_assert_internal(mi_arena_id_index(arena->id) == arena_index);
-  if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL;
 
   mi_bitmap_index_t bitmap_index;
-  if (!mi_arena_alloc(arena, needed_bcount, &bitmap_index)) return NULL;
-
-  // claimed it! set the dirty bits (todo: no need for an atomic op here?)
-  void* p    = arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE);
-  *memid     = mi_arena_memid_create(arena->id, arena->exclusive, bitmap_index);
-  *is_zero   = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
-  *large     = arena->is_large;
-  *is_pinned = (arena->is_large || !arena->allow_decommit);
+  if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index)) return NULL;
+
+  // claimed it! 
+  void* p = mi_arena_block_start(arena, bitmap_index);
+  *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index);
+  memid->is_pinned = arena->memid.is_pinned;
+
+  // none of the claimed blocks should be scheduled for a decommit
+  if (arena->blocks_purge != NULL) {
+    // this is thread safe as a potential purge only decommits parts that are not yet claimed as used (in `blocks_inuse`).
+    _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, needed_bcount, bitmap_index);
+  }
+
+  // set the dirty bits (todo: no need for an atomic op here?)
+  if (arena->memid.initially_zero && arena->blocks_dirty != NULL) {
+    memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
+  }
+
+  // set commit state
   if (arena->blocks_committed == NULL) {
     // always committed
-    *commit = true;
+    memid->initially_committed = true;
   }
-  else if (*commit) {
-    // arena not committed as a whole, but commit requested: ensure commit now
+  else if (commit) {
+    // commit requested, but the range may not be committed as a whole: ensure it is committed now
+    memid->initially_committed = true;
     bool any_uncommitted;
     _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
     if (any_uncommitted) {
-      bool commit_zero;
-      _mi_os_commit(p, needed_bcount * MI_ARENA_BLOCK_SIZE, &commit_zero, tld->stats);
-      if (commit_zero) *is_zero = true;
+      bool commit_zero = false;
+      if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero, tld->stats)) {
+        memid->initially_committed = false;
+      }
+      else {
+        if (commit_zero) { memid->initially_zero = true; }
+      }
     }
   }
   else {
     // no need to commit, but check if already fully committed
-    *commit = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
+    memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
   }
+  
   return p;
 }
 
-// allocate from an arena with fallback to the OS
-static mi_decl_noinline void* mi_arena_allocate(int numa_node, size_t size, size_t alignment, bool* commit, bool* large,
-                                                bool* is_pinned, bool* is_zero,
-                                                mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld )
+// allocate in a speficic arena
+static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, 
+                                       bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) 
 {
   MI_UNUSED_RELEASE(alignment);
   mi_assert_internal(alignment <= MI_SEGMENT_ALIGN);
+  const size_t bcount = mi_block_count_of_size(size);  
+  const size_t arena_index = mi_arena_id_index(arena_id);
+  mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count));
+  mi_assert_internal(size <= mi_arena_block_size(bcount));
+  
+  // Check arena suitability
+  mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]);
+  if (arena == NULL) return NULL;
+  if (!allow_large && arena->is_large) return NULL;
+  if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL;
+  if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity
+    const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node);
+    if (match_numa_node) { if (!numa_suitable) return NULL; }
+                    else { if (numa_suitable) return NULL; }
+  }
+
+  // try to allocate
+  void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, memid, tld);
+  mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment));
+  return p;
+}
+
+
+// allocate from an arena with fallback to the OS
+static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, 
+                                                  bool commit, bool allow_large,
+                                                  mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld )
+{
+  MI_UNUSED(alignment);
+  mi_assert_internal(alignment <= MI_SEGMENT_ALIGN);
   const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
-  const size_t bcount = mi_block_count_of_size(size);
   if mi_likely(max_arena == 0) return NULL;
-  mi_assert_internal(size <= bcount * MI_ARENA_BLOCK_SIZE);
-
-  size_t arena_index = mi_arena_id_index(req_arena_id);
-  if (arena_index < MI_MAX_ARENAS) {
-    // try a specific arena if requested
-    mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[arena_index]);
-    if ((arena != NULL) &&
-        (arena->numa_node < 0 || arena->numa_node == numa_node) && // numa local?
-        (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
-    {
-      void* p = mi_arena_alloc_from(arena, arena_index, bcount, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
-      mi_assert_internal((uintptr_t)p % alignment == 0);
+  
+  if (req_arena_id != _mi_arena_id_none()) {
+    // try a specific arena if requested 
+    if (mi_arena_id_index(req_arena_id) < max_arena) {
+      void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld);
       if (p != NULL) return p;
     }
   }
   else {
     // try numa affine allocation
-    for (size_t i = 0; i < max_arena; i++) {
-      mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
-      if (arena == NULL) break; // end reached
-      if ((arena->numa_node < 0 || arena->numa_node == numa_node) && // numa local?
-          (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
-      {
-        void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
-        mi_assert_internal((uintptr_t)p % alignment == 0);
-        if (p != NULL) return p;
-      }
+    for (size_t i = 0; i < max_arena; i++) {    
+      void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld);
+      if (p != NULL) return p;
     }
 
     // try from another numa node instead..
-    for (size_t i = 0; i < max_arena; i++) {
-      mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
-      if (arena == NULL) break; // end reached
-      if ((arena->numa_node >= 0 && arena->numa_node != numa_node) && // not numa local!
-          (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
-      {
-        void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
-        mi_assert_internal((uintptr_t)p % alignment == 0);
+    if (numa_node >= 0) {  // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already
+      for (size_t i = 0; i < max_arena; i++) {
+        void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld);
         if (p != NULL) return p;
       }
     }
@@ -245,75 +333,294 @@ static mi_decl_noinline void* mi_arena_allocate(int numa_node, size_t size, size
   return NULL;
 }
 
-void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool* commit, bool* large, bool* is_pinned, bool* is_zero,
-                              mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld)
+// try to reserve a fresh arena space
+static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t *arena_id)
+{
+  if (_mi_preloading()) return false;  // use OS only while pre loading
+  if (req_arena_id != _mi_arena_id_none()) return false;
+
+  const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count);
+  if (arena_count > (MI_MAX_ARENAS - 4)) return false;
+
+  size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve);
+  if (arena_reserve == 0) return false;
+
+  if (!_mi_os_has_virtual_reserve()) { 
+    arena_reserve = arena_reserve/4;  // be conservative if virtual reserve is not supported (for some embedded systems for example)
+  }
+  arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE);
+  if (arena_count >= 8 && arena_count <= 128) {
+    arena_reserve = ((size_t)1<<(arena_count/8)) * arena_reserve;  // scale up the arena sizes exponentially
+  }    
+  if (arena_reserve < req_size) return false;  // should be able to at least handle the current allocation size
+      
+  // commit eagerly?
+  bool arena_commit = false;
+  if (mi_option_get(mi_option_arena_eager_commit) == 2)      { arena_commit = _mi_os_has_overcommit(); }
+  else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; }
+
+  return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive */, arena_id) == 0);
+}    
+
+
+void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large,
+                              mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld)
 {
-  mi_assert_internal(commit != NULL && is_pinned != NULL && is_zero != NULL && memid != NULL && tld != NULL);
+  mi_assert_internal(memid != NULL && tld != NULL);
   mi_assert_internal(size > 0);
-  *memid   = MI_MEMID_OS;
-  *is_zero = false;
-  *is_pinned = false;
+  *memid = _mi_memid_none();
 
-  bool default_large = false;
-  if (large == NULL) large = &default_large;   // ensure `large != NULL`
   const int numa_node = _mi_os_numa_node(tld); // current numa node
 
   // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data)
   if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) {
-    void* p = mi_arena_allocate(numa_node, size, alignment, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
-    if (p != NULL) return p;
+    void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld);
+    if (p != NULL) return p;    
+
+    // otherwise, try to first eagerly reserve a new arena 
+    if (req_arena_id == _mi_arena_id_none()) {
+      mi_arena_id_t arena_id = 0;
+      if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) {
+        // and try allocate in there
+        mi_assert_internal(req_arena_id == _mi_arena_id_none());
+        p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld);
+        if (p != NULL) return p;
+      }
+    }
   }
 
-  // finally, fall back to the OS
+  // if we cannot use OS allocation, return NULL
   if (mi_option_is_enabled(mi_option_limit_os_alloc) || req_arena_id != _mi_arena_id_none()) {
     errno = ENOMEM;
     return NULL;
   }
-  *is_zero = true;
-  *memid   = MI_MEMID_OS;
-  void* p = _mi_os_alloc_aligned_offset(size, alignment, align_offset, *commit, large, tld->stats);
-  if (p != NULL) { *is_pinned = *large; }
-  return p;
+   
+  // finally, fall back to the OS
+  if (align_offset > 0) {
+    return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats);
+  }
+  else {
+    return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats);
+  }  
 }
 
-void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld)
+void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld)
 {
-  return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
+  return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld);
 }
 
+
 void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) {
   if (size != NULL) *size = 0;
   size_t arena_index = mi_arena_id_index(arena_id);
   if (arena_index >= MI_MAX_ARENAS) return NULL;
-  mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[arena_index]);
+  mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]);
   if (arena == NULL) return NULL;
-  if (size != NULL) *size = arena->block_count * MI_ARENA_BLOCK_SIZE;
+  if (size != NULL) { *size = mi_arena_block_size(arena->block_count); }
   return arena->start;
 }
 
+
+/* -----------------------------------------------------------
+  Arena purge
+----------------------------------------------------------- */
+
+static long mi_arena_purge_delay(void) {
+  // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay
+  return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult));
+}
+
+// reset or decommit in an arena and update the committed/decommit bitmaps
+// assumes we own the area (i.e. blocks_in_use is claimed by us)
+static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) {
+  mi_assert_internal(arena->blocks_committed != NULL);
+  mi_assert_internal(arena->blocks_purge != NULL);
+  mi_assert_internal(!arena->memid.is_pinned);
+  const size_t size = mi_arena_block_size(blocks);
+  void* const p = mi_arena_block_start(arena, bitmap_idx); 
+  bool needs_recommit;
+  if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) {
+    // all blocks are committed, we can purge freely
+    needs_recommit = _mi_os_purge(p, size, stats);
+  }
+  else {
+    // some blocks are not committed -- this can happen when a partially committed block is freed 
+    // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge
+    // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), 
+    // and also undo the decommit stats (as it was already adjusted)
+    mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits));
+    needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats);
+    _mi_stat_increase(&stats->committed, size);
+  }
+  
+  // clear the purged blocks
+  _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx);
+  // update committed bitmap
+  if (needs_recommit) {
+    _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
+  }
+}
+
+// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls.
+// Note: assumes we (still) own the area as we may purge immediately
+static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) {
+  mi_assert_internal(arena->blocks_purge != NULL);
+  const long delay = mi_arena_purge_delay();
+  if (delay < 0) return;  // is purging allowed at all?
+
+  if (_mi_preloading() || delay == 0) {
+    // decommit directly
+    mi_arena_purge(arena, bitmap_idx, blocks, stats);    
+  }
+  else {
+    // schedule decommit
+    mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire);
+    if (expire != 0) {
+      mi_atomic_addi64_acq_rel(&arena->purge_expire, delay/10);  // add smallish extra delay
+    }
+    else {
+      mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay);
+    }
+    _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL);
+  }
+}
+
+// purge a range of blocks
+// return true if the full range was purged.
+// assumes we own the area (i.e. blocks_in_use is claimed by us)
+static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitlen, size_t purge, mi_stats_t* stats) {
+  const size_t endidx = startidx + bitlen;
+  size_t bitidx = startidx;
+  bool all_purged = false;
+  while (bitidx < endidx) {
+    // count consequetive ones in the purge mask
+    size_t count = 0;
+    while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) {
+      count++;
+    }
+    if (count > 0) {
+      // found range to be purged
+      const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitidx);
+      mi_arena_purge(arena, range_idx, count, stats);
+      if (count == bitlen) {
+        all_purged = true;
+      }
+    }
+    bitidx += (count+1); // +1 to skip the zero bit (or end)
+  }
+  return all_purged;
+}
+
+// returns true if anything was purged
+static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) 
+{
+  if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false;
+  mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire);
+  if (expire == 0) return false;
+  if (!force && expire > now) return false;
+
+  // reset expire (if not already set concurrently)
+  mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, 0);
+  
+  // potential purges scheduled, walk through the bitmap
+  bool any_purged = false;
+  bool full_purge = true;  
+  for (size_t i = 0; i < arena->field_count; i++) {
+    size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]);
+    if (purge != 0) {
+      size_t bitidx = 0;
+      while (bitidx < MI_BITMAP_FIELD_BITS) {
+        // find consequetive range of ones in the purge mask
+        size_t bitlen = 0;
+        while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) {
+          bitlen++;
+        }
+        // try to claim the longest range of corresponding in_use bits
+        const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx);
+        while( bitlen > 0 ) {
+          if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index)) {
+            break;
+          }
+          bitlen--;
+        }
+        // actual claimed bits at `in_use`
+        if (bitlen > 0) {
+          // read purge again now that we have the in_use bits
+          purge = mi_atomic_load_acquire(&arena->blocks_purge[i]);
+          if (!mi_arena_purge_range(arena, i, bitidx, bitlen, purge, stats)) {
+            full_purge = false;
+          }
+          any_purged = true;
+          // release the claimed `in_use` bits again
+          _mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index);
+        }
+        bitidx += (bitlen+1);  // +1 to skip the zero (or end)
+      } // while bitidx
+    } // purge != 0
+  }
+  // if not fully purged, make sure to purge again in the future
+  if (!full_purge) {
+    const long delay = mi_arena_purge_delay();
+    mi_msecs_t expected = 0;
+    mi_atomic_casi64_strong_acq_rel(&arena->purge_expire,&expected,_mi_clock_now() + delay);
+  }
+  return any_purged;
+}
+
+static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) {
+  if (_mi_preloading() || mi_arena_purge_delay() <= 0) return;  // nothing will be scheduled
+
+  const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count);
+  if (max_arena == 0) return;
+
+  // allow only one thread to purge at a time
+  static mi_atomic_guard_t purge_guard;
+  mi_atomic_guard(&purge_guard) 
+  {
+    mi_msecs_t now = _mi_clock_now();
+    size_t max_purge_count = (visit_all ? max_arena : 1);
+    for (size_t i = 0; i < max_arena; i++) {
+      mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
+      if (arena != NULL) {
+        if (mi_arena_try_purge(arena, now, force, stats)) {
+          if (max_purge_count <= 1) break;
+          max_purge_count--;
+        }
+      }
+    }
+  }  
+}
+
+
 /* -----------------------------------------------------------
   Arena free
 ----------------------------------------------------------- */
 
-void _mi_arena_free(void* p, size_t size, size_t alignment, size_t align_offset, size_t memid, bool all_committed, mi_stats_t* stats) {
+void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) {
   mi_assert_internal(size > 0 && stats != NULL);
+  mi_assert_internal(committed_size <= size);
   if (p==NULL) return;
   if (size==0) return;
-
-  if (memid == MI_MEMID_OS) {
+  const bool all_committed = (committed_size == size);
+  
+  if (mi_memkind_is_os(memid.memkind)) {
     // was a direct OS allocation, pass through
-    _mi_os_free_aligned(p, size, alignment, align_offset, all_committed, stats);
+    if (!all_committed && committed_size > 0) {
+      // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size)
+      _mi_stat_decrease(&stats->committed, committed_size);
+    }
+    _mi_os_free(p, size, memid, stats);
   }
-  else {
+  else if (memid.memkind == MI_MEM_ARENA) {
     // allocated in an arena
-    mi_assert_internal(align_offset == 0);
     size_t arena_idx;
     size_t bitmap_idx;
     mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx);
     mi_assert_internal(arena_idx < MI_MAX_ARENAS);
-    mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t,&mi_arenas[arena_idx]);
+    mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]);
     mi_assert_internal(arena != NULL);
     const size_t blocks = mi_block_count_of_size(size);
+    
     // checks
     if (arena == NULL) {
       _mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
@@ -324,24 +631,100 @@ void _mi_arena_free(void* p, size_t size, size_t alignment, size_t align_offset,
       _mi_error_message(EINVAL, "trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
       return;
     }
+
+    // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.)
+    mi_track_mem_undefined(p,size);
+
     // potentially decommit
-    if (!arena->allow_decommit || arena->blocks_committed == NULL) {
-      mi_assert_internal(all_committed); // note: may be not true as we may "pretend" to be not committed (in segment.c)
+    if (arena->memid.is_pinned || arena->blocks_committed == NULL) {
+      mi_assert_internal(all_committed);
     }
     else {
       mi_assert_internal(arena->blocks_committed != NULL);
-      _mi_os_decommit(p, blocks * MI_ARENA_BLOCK_SIZE, stats); // ok if this fails
-      _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
+      mi_assert_internal(arena->blocks_purge != NULL);
+      
+      if (!all_committed) {
+        // mark the entire range as no longer committed (so we recommit the full range when re-using)
+        _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
+        mi_track_mem_noaccess(p,size);
+        if (committed_size > 0) {
+          // if partially committed, adjust the committed stats (is it will be recommitted when re-using)
+          // in the delayed purge, we now need to not count a decommit if the range is not marked as committed.
+          _mi_stat_decrease(&stats->committed, committed_size);
+        }
+        // note: if not all committed, it may be that the purge will reset/decommit the entire range
+        // that contains already decommitted parts. Since purge consistently uses reset or decommit that
+        // works (as we should never reset decommitted parts).
+      }
+      // (delay) purge the entire range
+      mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats);      
     }
+    
     // and make it available to others again
     bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx);
     if (!all_inuse) {
-      _mi_error_message(EAGAIN, "trying to free an already freed block: %p, size %zu\n", p, size);
+      _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size);
       return;
     };
   }
+  else {
+    // arena was none, external, or static; nothing to do
+    mi_assert_internal(memid.memkind < MI_MEM_OS);
+  }
+
+  // purge expired decommits
+  mi_arenas_try_purge(false, false, stats);
+}
+
+// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit`
+// for dynamic libraries that are unloaded and need to release all their allocated memory.
+static void mi_arenas_unsafe_destroy(void) {
+  const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+  size_t new_max_arena = 0;
+  for (size_t i = 0; i < max_arena; i++) {
+    mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
+    if (arena != NULL) {
+      if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) {      
+        mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL);
+        _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); 
+      }
+      else {
+        new_max_arena = i;
+      }
+      mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size, &_mi_stats_main);
+    }
+  }
+
+  // try to lower the max arena.
+  size_t expected = max_arena;
+  mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena);
+}
+
+// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired
+void _mi_arena_collect(bool force_purge, mi_stats_t* stats) {
+  mi_arenas_try_purge(force_purge, true /* visit all */, stats);
+}
+
+// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit`
+// for dynamic libraries that are unloaded and need to release all their allocated memory.
+void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) {
+  mi_arenas_unsafe_destroy();
+  _mi_arena_collect(true /* force purge */, stats);  // purge non-owned arenas  
+}
+
+// Is a pointer inside any of our arenas?
+bool _mi_arena_contains(const void* p) {
+  const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+  for (size_t i = 0; i < max_arena; i++) {
+    mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
+    if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { 
+      return true;      
+    }
+  }
+  return false;
 }
 
+
 /* -----------------------------------------------------------
   Add an arena.
 ----------------------------------------------------------- */
@@ -350,53 +733,58 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id) {
   mi_assert_internal(arena != NULL);
   mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0);
   mi_assert_internal(arena->block_count > 0);
-  if (arena_id != NULL) *arena_id = -1;
+  if (arena_id != NULL) { *arena_id = -1; }
 
   size_t i = mi_atomic_increment_acq_rel(&mi_arena_count);
   if (i >= MI_MAX_ARENAS) {
     mi_atomic_decrement_acq_rel(&mi_arena_count);
     return false;
   }
-  mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena);
   arena->id = mi_arena_id_create(i);
-  if (arena_id != NULL) *arena_id = arena->id;
+  mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena);
+  if (arena_id != NULL) { *arena_id = arena->id; }
   return true;
 }
 
-bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept
+static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept
 {
   if (arena_id != NULL) *arena_id = _mi_arena_id_none();
   if (size < MI_ARENA_BLOCK_SIZE) return false;
 
   if (is_large) {
-    mi_assert_internal(is_committed);
-    is_committed = true;
+    mi_assert_internal(memid.initially_committed && memid.is_pinned);
   }
 
   const size_t bcount = size / MI_ARENA_BLOCK_SIZE;
   const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS);
-  const size_t bitmaps = (is_committed ? 2 : 3);
+  const size_t bitmaps = (memid.is_pinned ? 2 : 4);
   const size_t asize  = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t));
-  mi_arena_t* arena   = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
+  mi_memid_t meta_memid;
+  mi_arena_t* arena   = (mi_arena_t*)mi_arena_meta_zalloc(asize, &meta_memid, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
   if (arena == NULL) return false;
-
+  
+  // already zero'd due to os_alloc
+  // _mi_memzero(arena, asize);
   arena->id = _mi_arena_id_none();
+  arena->memid = memid;
   arena->exclusive = exclusive;
+  arena->meta_size = asize;
+  arena->meta_memid = meta_memid;
   arena->block_count = bcount;
   arena->field_count = fields;
   arena->start = (uint8_t*)start;
   arena->numa_node    = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
   arena->is_large     = is_large;
-  arena->is_zero_init = is_zero;
-  arena->allow_decommit = !is_large && !is_committed; // only allow decommit for initially uncommitted memory
+  arena->purge_expire = 0;
   arena->search_idx   = 0;
   arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap
-  arena->blocks_committed = (!arena->allow_decommit ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap
-  // the bitmaps are already zero initialized due to os_alloc
+  arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap
+  arena->blocks_purge  = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after committed bitmap  
   // initialize committed bitmap?
-  if (arena->blocks_committed != NULL && is_committed) {
+  if (arena->blocks_committed != NULL && arena->memid.initially_committed) {
     memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning
   }
+  
   // and claim leftover blocks if needed (so we never allocate there)
   ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
   mi_assert_internal(post >= 0);
@@ -405,32 +793,42 @@ bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is
     mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post);
     _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL);
   }
-
   return mi_arena_add(arena, arena_id);
 
 }
 
+bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
+  mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL);
+  memid.initially_committed = is_committed;
+  memid.initially_zero = is_zero;
+  memid.is_pinned = is_large;
+  return mi_manage_os_memory_ex2(start,size,is_large,numa_node,exclusive,memid, arena_id);
+}
+
 // Reserve a range of regular OS memory
-int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept
-{
+int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
   if (arena_id != NULL) *arena_id = _mi_arena_id_none();
   size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block
-  bool large = allow_large;
-  void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, &large, &_mi_stats_main);
-  if (start==NULL) return ENOMEM;
-  if (!mi_manage_os_memory_ex(start, size, (large || commit), large, true, -1, exclusive, arena_id)) {
-    _mi_os_free_ex(start, size, commit, &_mi_stats_main);
-    _mi_verbose_message("failed to reserve %zu k memory\n", _mi_divide_up(size,1024));
+  mi_memid_t memid;
+  void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main);
+  if (start == NULL) return ENOMEM;
+  const bool is_large = memid.is_pinned; // todo: use separate is_large field?
+  if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) {
+    _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main);
+    _mi_verbose_message("failed to reserve %zu k memory\n", _mi_divide_up(size, 1024));
     return ENOMEM;
   }
-  _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size,1024), large ? " (in large os pages)" : "");
+  _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : "");
   return 0;
 }
 
+
+// Manage a range of regular OS memory
 bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept {
-  return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false, NULL);
+  return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL);
 }
 
+// Reserve a range of regular OS memory
 int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept {
   return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL);
 }
@@ -480,15 +878,16 @@ int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_m
   if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count();
   size_t hsize = 0;
   size_t pages_reserved = 0;
-  void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize);
+  mi_memid_t memid;
+  void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid);
   if (p==NULL || pages_reserved==0) {
     _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages);
     return ENOMEM;
   }
   _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages);
 
-  if (!mi_manage_os_memory_ex(p, hsize, true, true, true, numa_node, exclusive, arena_id)) {
-    _mi_os_free_huge_pages(p, hsize, &_mi_stats_main);
+  if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) {
+    _mi_os_free(p, hsize, memid, &_mi_stats_main);
     return ENOMEM;
   }
   return 0;
@@ -534,3 +933,4 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv
   if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
   return err;
 }
+
diff --git a/Objects/mimalloc/bitmap.c b/Objects/mimalloc/bitmap.c
index 4ea9f4afa7322e..a13dbe15bf33e8 100644
--- a/Objects/mimalloc/bitmap.c
+++ b/Objects/mimalloc/bitmap.c
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2019-2021 Microsoft Research, Daan Leijen
+Copyright (c) 2019-2023 Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -11,14 +11,13 @@ represeted as an array of fields where each field is a machine word (`size_t`)
 
 There are two api's; the standard one cannot have sequences that cross
 between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
-(this is used in region allocation)
 
 The `_across` postfixed functions do allow sequences that can cross over
 between the fields. (This is used in arena allocation)
 ---------------------------------------------------------------------------- */
 
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
+#include "mimalloc/internal.h"
 #include "bitmap.h"
 
 /* -----------------------------------------------------------
@@ -63,12 +62,12 @@ inline bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, cons
 
   // scan linearly for a free range of zero bits
   while (bitidx <= bitidx_max) {
-    const size_t mapm = map & m;
+    const size_t mapm = (map & m);
     if (mapm == 0) {  // are the mask bits free at bitidx?
       mi_assert_internal((m >> bitidx) == mask); // no overflow?
-      const size_t newmap = map | m;
+      const size_t newmap = (map | m);
       mi_assert_internal((newmap^map) >> bitidx == mask);
-      if (!mi_atomic_cas_weak_acq_rel(field, &map, newmap)) {  // TODO: use strong cas here?
+      if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) {  // TODO: use weak cas here?
         // no success, another thread claimed concurrently.. keep going (with updated `map`)
         continue;
       }
@@ -81,7 +80,8 @@ inline bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, cons
     else {
       // on to the next bit range
 #ifdef MI_HAVE_FAST_BITSCAN
-      const size_t shift = (count == 1 ? 1 : mi_bsr(mapm) - bitidx + 1);
+      mi_assert_internal(mapm != 0);
+      const size_t shift = (count == 1 ? 1 : (MI_INTPTR_BITS - mi_clz(mapm) - bitidx));
       mi_assert_internal(shift > 0 && shift <= count);
 #else
       const size_t shift = 1;
@@ -100,7 +100,7 @@ inline bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, cons
 bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
   size_t idx = start_field_idx;
   for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
-    if (idx >= bitmap_fields) idx = 0; // wrap
+    if (idx >= bitmap_fields) { idx = 0; } // wrap
     if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
       return true;
     }
@@ -127,14 +127,6 @@ bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap
   return false;
 }
 
-/*
-// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
-// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never span fields.
-bool _mi_bitmap_try_find_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t count, mi_bitmap_index_t* bitmap_idx) {
-  return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, 0, count, bitmap_idx);
-}
-*/
-
 // Set `count` bits at `bitmap_idx` to 0 atomically
 // Returns `true` if all `count` bits were 1 previously.
 bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
@@ -143,7 +135,7 @@ bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count,
   const size_t mask = mi_bitmap_mask_(count, bitidx);
   mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
   // mi_assert_internal((bitmap[idx] & mask) == mask);
-  size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask);
+  const size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask);
   return ((prev & mask) == mask);
 }
 
@@ -157,7 +149,7 @@ bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi
   mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
   //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0);
   size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask);
-  if (any_zero != NULL) *any_zero = ((prev & mask) != mask);
+  if (any_zero != NULL) { *any_zero = ((prev & mask) != mask); }
   return ((prev & mask) == 0);
 }
 
@@ -167,11 +159,28 @@ static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size
   const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
   const size_t mask = mi_bitmap_mask_(count, bitidx);
   mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
-  size_t field = mi_atomic_load_relaxed(&bitmap[idx]);
-  if (any_ones != NULL) *any_ones = ((field & mask) != 0);
+  const size_t field = mi_atomic_load_relaxed(&bitmap[idx]);
+  if (any_ones != NULL) { *any_ones = ((field & mask) != 0); }
   return ((field & mask) == mask);
 }
 
+// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. 
+// Returns `true` if successful when all previous `count` bits were 0.
+bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  const size_t idx = mi_bitmap_index_field(bitmap_idx);
+  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+  const size_t mask = mi_bitmap_mask_(count, bitidx);
+  mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
+  size_t expected = mi_atomic_load_relaxed(&bitmap[idx]);
+  do  {    
+    if ((expected & mask) != 0) return false;
+  } 
+  while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask));
+  mi_assert_internal((expected & mask) == 0);
+  return true;
+}
+
+
 bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
   return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL);
 }
@@ -190,6 +199,7 @@ bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t
 
 // Try to atomically claim a sequence of `count` bits starting from the field
 // at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success.
+// Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`)
 static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx)
 {
   mi_assert_internal(bitmap_idx != NULL);
@@ -200,9 +210,9 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit
   const size_t initial = mi_clz(map);  // count of initial zeros starting at idx
   mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS);
   if (initial == 0)     return false;
-  if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx);     // no need to cross fields
+  if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx);    // no need to cross fields (this case won't happen for us)
   if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries
-
+  
   // scan ahead
   size_t found = initial;
   size_t mask = 0;     // mask bits for the final field
@@ -210,25 +220,27 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit
     field++;
     map = mi_atomic_load_relaxed(field);
     const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found));
+    mi_assert_internal(mask_bits > 0 && mask_bits <= MI_BITMAP_FIELD_BITS);
     mask = mi_bitmap_mask_(mask_bits, 0);
-    if ((map & mask) != 0) return false;
+    if ((map & mask) != 0) return false;  // some part is already claimed
     found += mask_bits;
   }
   mi_assert_internal(field < &bitmap[bitmap_fields]);
 
-  // found range of zeros up to the final field; mask contains mask in the final field
-  // now claim it atomically
+  // we found a range of contiguous zeros up to the final field; mask contains mask in the final field
+  // now try to claim the range atomically
   mi_bitmap_field_t* const final_field = field;
   const size_t final_mask = mask;
   mi_bitmap_field_t* const initial_field = &bitmap[idx];
-  const size_t initial_mask = mi_bitmap_mask_(initial, MI_BITMAP_FIELD_BITS - initial);
+  const size_t initial_idx = MI_BITMAP_FIELD_BITS - initial;
+  const size_t initial_mask = mi_bitmap_mask_(initial, initial_idx);
 
   // initial field
   size_t newmap;
   field = initial_field;
   map = mi_atomic_load_relaxed(field);
   do {
-    newmap = map | initial_mask;
+    newmap = (map | initial_mask);
     if ((map & initial_mask) != 0) { goto rollback; };
   } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
 
@@ -243,31 +255,32 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit
   mi_assert_internal(field == final_field);
   map = mi_atomic_load_relaxed(field);
   do {
-    newmap = map | final_mask;
+    newmap = (map | final_mask);
     if ((map & final_mask) != 0) { goto rollback; }
   } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
 
   // claimed!
-  *bitmap_idx = mi_bitmap_index_create(idx, MI_BITMAP_FIELD_BITS - initial);
+  *bitmap_idx = mi_bitmap_index_create(idx, initial_idx);
   return true;
 
 rollback:
   // roll back intermediate fields
+  // (we just failed to claim `field` so decrement first)
   while (--field > initial_field) {
     newmap = 0;
     map = MI_BITMAP_FIELD_FULL;
     mi_assert_internal(mi_atomic_load_relaxed(field) == map);
     mi_atomic_store_release(field, newmap);
   }
-  if (field == initial_field) {
+  if (field == initial_field) {               // (if we failed on the initial field, `field + 1 == initial_field`)
     map = mi_atomic_load_relaxed(field);
     do {
       mi_assert_internal((map & initial_mask) == initial_mask);
-      newmap = map & ~initial_mask;
+      newmap = (map & ~initial_mask);
     } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
   }
   // retry? (we make a recursive call instead of goto to be able to use const declarations)
-  if (retries < 4) {
+  if (retries <= 2) {
     return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx);
   }
   else {
@@ -280,17 +293,22 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit
 // Starts at idx, and wraps around to search in all `bitmap_fields` fields.
 bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
   mi_assert_internal(count > 0);
-  if (count==1) return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx);
+  if (count <= 2) {
+    // we don't bother with crossover fields for small counts
+    return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx);
+  }
+
+  // visit the fields
   size_t idx = start_field_idx;
   for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
-    if (idx >= bitmap_fields) idx = 0; // wrap
-    // try to claim inside the field
+    if (idx >= bitmap_fields) { idx = 0; } // wrap
+    // first try to claim inside a field
     if (count <= MI_BITMAP_FIELD_BITS) {
       if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
         return true;
       }
     }
-    // try to claim across fields
+    // if that fails, then try to claim across fields
     if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx)) {
       return true;
     }
@@ -300,7 +318,7 @@ bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitm
 
 // Helper for masks across fields; returns the mid count, post_mask may be 0
 static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) {
-  MI_UNUSED_RELEASE(bitmap_fields);
+  MI_UNUSED(bitmap_fields);
   const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
   if mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS) {
     *pre_mask = mi_bitmap_mask_(count, bitidx);
@@ -333,14 +351,14 @@ bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t
   size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
   bool all_one = true;
   mi_bitmap_field_t* field = &bitmap[idx];
-  size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask);
+  size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask);   // clear first part
   if ((prev & pre_mask) != pre_mask) all_one = false;
   while(mid_count-- > 0) {
-    prev = mi_atomic_and_acq_rel(field++, ~mid_mask);
+    prev = mi_atomic_and_acq_rel(field++, ~mid_mask);        // clear mid part
     if ((prev & mid_mask) != mid_mask) all_one = false;
   }
   if (post_mask!=0) {
-    prev = mi_atomic_and_acq_rel(field, ~post_mask);
+    prev = mi_atomic_and_acq_rel(field, ~post_mask);         // clear end part
     if ((prev & post_mask) != post_mask) all_one = false;
   }
   return all_one;
@@ -370,7 +388,7 @@ bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t co
     if ((prev & post_mask) != 0) all_zero = false;
     if ((prev & post_mask) != post_mask) any_zero = true;
   }
-  if (pany_zero != NULL) *pany_zero = any_zero;
+  if (pany_zero != NULL) { *pany_zero = any_zero; }
   return all_zero;
 }
 
@@ -399,7 +417,7 @@ static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_field
     if ((prev & post_mask) != post_mask) all_ones = false;
     if ((prev & post_mask) != 0) any_ones = true;
   }
-  if (pany_ones != NULL) *pany_ones = any_ones;
+  if (pany_ones != NULL) { *pany_ones = any_ones; }
   return all_ones;
 }
 
diff --git a/Objects/mimalloc/bitmap.h b/Objects/mimalloc/bitmap.h
index 0c501ec1feac3b..0a765c714bae09 100644
--- a/Objects/mimalloc/bitmap.h
+++ b/Objects/mimalloc/bitmap.h
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2019-2020 Microsoft Research, Daan Leijen
+Copyright (c) 2019-2023 Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -80,6 +80,10 @@ bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap
 // Returns `true` if all `count` bits were 1 previously.
 bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
 
+// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. 
+// Returns `true` if successful when all previous `count` bits were 0.
+bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
 // Set `count` bits at `bitmap_idx` to 1 atomically
 // Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
 bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero);
diff --git a/Objects/mimalloc/heap.c b/Objects/mimalloc/heap.c
index ac2d042bfd20cf..58520ddf631ede 100644
--- a/Objects/mimalloc/heap.c
+++ b/Objects/mimalloc/heap.c
@@ -6,8 +6,9 @@ terms of the MIT license. A copy of the license can be found in the file
 -----------------------------------------------------------------------------*/
 
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"  // mi_prim_get_default_heap
 
 #include <string.h>  // memset, memcpy
 
@@ -30,15 +31,18 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void
   // visit all pages
   #if MI_DEBUG>1
   size_t total = heap->page_count;
-  #endif
   size_t count = 0;
+  #endif  
+
   for (size_t i = 0; i <= MI_BIN_FULL; i++) {
     mi_page_queue_t* pq = &heap->pages[i];
     mi_page_t* page = pq->first;
     while(page != NULL) {
       mi_page_t* next = page->next; // save next in case the page gets removed from the queue
       mi_assert_internal(mi_page_heap(page) == heap);
+      #if MI_DEBUG>1
       count++;
+      #endif
       if (!fn(heap, pq, page, arg1, arg2)) return false;
       page = next; // and continue
     }
@@ -150,8 +154,8 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
   mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
   mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL );
 
-  // collect abandoned segments (in particular, decommit expired parts of segments in the abandoned segment list)
-  // note: forced decommit can be quite expensive if many threads are created/destroyed so we do not force on abandonment
+  // collect abandoned segments (in particular, purge expired parts of segments in the abandoned segment list)
+  // note: forced purge can be quite expensive if many threads are created/destroyed so we do not force on abandonment
   _mi_abandoned_collect(heap, collect == MI_FORCE /* force? */, &heap->tld->segments);
 
   // collect segment local caches
@@ -159,13 +163,10 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
     _mi_segment_thread_collect(&heap->tld->segments);
   }
 
-  // decommit in global segment caches
-  // note: forced decommit can be quite expensive if many threads are created/destroyed so we do not force on abandonment
-  _mi_segment_cache_collect( collect == MI_FORCE, &heap->tld->os);  
-
   // collect regions on program-exit (or shared library unload)
   if (force && _mi_is_main_thread() && mi_heap_is_backing(heap)) {
-    //_mi_mem_collect(&heap->tld->os);
+    _mi_thread_data_collect();  // collect thread data cache
+    _mi_arena_collect(true /* force purge */, &heap->tld->stats);
   }
 }
 
@@ -178,7 +179,7 @@ void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept {
 }
 
 void mi_collect(bool force) mi_attr_noexcept {
-  mi_heap_collect(mi_get_default_heap(), force);
+  mi_heap_collect(mi_prim_get_default_heap(), force);
 }
 
 
@@ -188,9 +189,14 @@ void mi_collect(bool force) mi_attr_noexcept {
 
 mi_heap_t* mi_heap_get_default(void) {
   mi_thread_init();
-  return mi_get_default_heap();
+  return mi_prim_get_default_heap();
+}
+
+static bool mi_heap_is_default(const mi_heap_t* heap) {
+  return (heap == mi_prim_get_default_heap());
 }
 
+
 mi_heap_t* mi_heap_get_backing(void) {
   mi_heap_t* heap = mi_heap_get_default();
   mi_assert_internal(heap!=NULL);
@@ -200,16 +206,16 @@ mi_heap_t* mi_heap_get_backing(void) {
   return bheap;
 }
 
-mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena( mi_arena_id_t arena_id ) {
+mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) {
   mi_heap_t* bheap = mi_heap_get_backing();
   mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t);  // todo: OS allocate in secure mode?
-  if (heap==NULL) return NULL;
+  if (heap == NULL) return NULL;
   _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
   heap->tld = bheap->tld;
   heap->thread_id = _mi_thread_id();
   heap->arena_id = arena_id;
   _mi_random_split(&bheap->random, &heap->random);
-  heap->cookie  = _mi_heap_random_next(heap) | 1;
+  heap->cookie = _mi_heap_random_next(heap) | 1;
   heap->keys[0] = _mi_heap_random_next(heap);
   heap->keys[1] = _mi_heap_random_next(heap);
   heap->no_reclaim = true;  // don't reclaim abandoned pages or otherwise destroy is unsafe
@@ -223,7 +229,7 @@ mi_decl_nodiscard mi_heap_t* mi_heap_new(void) {
   return mi_heap_new_in_arena(_mi_arena_id_none());
 }
 
-bool _mi_heap_memid_is_suitable(mi_heap_t* heap, size_t memid) {
+bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) {
   return _mi_arena_memid_is_suitable(memid, heap->arena_id);
 }
 
@@ -237,9 +243,6 @@ static void mi_heap_reset_pages(mi_heap_t* heap) {
   mi_assert_internal(mi_heap_is_initialized(heap));
   // TODO: copy full empty heap instead?
   memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
-#ifdef MI_MEDIUM_DIRECT
-  memset(&heap->pages_free_medium, 0, sizeof(heap->pages_free_medium));
-#endif
   _mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
   heap->thread_delayed_free = NULL;
   heap->page_count = 0;
@@ -330,6 +333,14 @@ void _mi_heap_destroy_pages(mi_heap_t* heap) {
   mi_heap_reset_pages(heap);
 }
 
+#if MI_TRACK_HEAP_DESTROY
+static bool mi_cdecl mi_heap_track_block_free(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) {
+  MI_UNUSED(heap); MI_UNUSED(area);  MI_UNUSED(arg); MI_UNUSED(block_size);
+  mi_track_free_size(block,mi_usable_size(block));
+  return true;
+}
+#endif
+
 void mi_heap_destroy(mi_heap_t* heap) {
   mi_assert(heap != NULL);
   mi_assert(mi_heap_is_initialized(heap));
@@ -341,13 +352,18 @@ void mi_heap_destroy(mi_heap_t* heap) {
     mi_heap_delete(heap);
   }
   else {
+    // track all blocks as freed
+    #if MI_TRACK_HEAP_DESTROY
+    mi_heap_visit_blocks(heap, true, mi_heap_track_block_free, NULL);
+    #endif
     // free all pages
     _mi_heap_destroy_pages(heap);
     mi_heap_free(heap);
   }
 }
 
-void _mi_heap_destroy_all(void) {
+// forcefully destroy all heaps in the current thread
+void _mi_heap_unsafe_destroy_all(void) {
   mi_heap_t* bheap = mi_heap_get_backing();
   mi_heap_t* curr = bheap->tld->heaps;
   while (curr != NULL) {
@@ -425,7 +441,7 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
   mi_assert(mi_heap_is_initialized(heap));
   if (heap==NULL || !mi_heap_is_initialized(heap)) return NULL;
   mi_assert_expensive(mi_heap_is_valid(heap));
-  mi_heap_t* old = mi_get_default_heap();
+  mi_heap_t* old = mi_prim_get_default_heap();
   _mi_heap_set_default_direct(heap);
   return old;
 }
@@ -475,7 +491,7 @@ bool mi_heap_check_owned(mi_heap_t* heap, const void* p) {
 }
 
 bool mi_check_owned(const void* p) {
-  return mi_heap_check_owned(mi_get_default_heap(), p);
+  return mi_heap_check_owned(mi_prim_get_default_heap(), p);
 }
 
 /* -----------------------------------------------------------
@@ -518,9 +534,13 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v
   uintptr_t free_map[MI_MAX_BLOCKS / sizeof(uintptr_t)];
   memset(free_map, 0, sizeof(free_map));
 
+  #if MI_DEBUG>1
   size_t free_count = 0;
+  #endif
   for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) {
+    #if MI_DEBUG>1
     free_count++;
+    #endif
     mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize));
     size_t offset = (uint8_t*)block - pstart;
     mi_assert_internal(offset % bsize == 0);
@@ -533,7 +553,9 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v
   mi_assert_internal(page->capacity == (free_count + page->used));
 
   // walk through all blocks skipping the free ones
+  #if MI_DEBUG>1
   size_t used_count = 0;
+  #endif
   for (size_t i = 0; i < page->capacity; i++) {
     size_t bitidx = (i / sizeof(uintptr_t));
     size_t bit = i - (bitidx * sizeof(uintptr_t));
@@ -542,7 +564,9 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v
       i += (sizeof(uintptr_t) - 1); // skip a run of free blocks
     }
     else if ((m & ((uintptr_t)1 << bit)) == 0) {
+      #if MI_DEBUG>1
       used_count++;
+      #endif
       uint8_t* block = pstart + (i * bsize);
       if (!visitor(mi_page_heap(page), area, block, ubsize, arg)) return false;
     }
diff --git a/Objects/mimalloc/init.c b/Objects/mimalloc/init.c
index c416208cfdd625..24e8333f42d3e5 100644
--- a/Objects/mimalloc/init.c
+++ b/Objects/mimalloc/init.c
@@ -5,14 +5,16 @@ terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"
 
 #include <string.h>  // memcpy, memset
 #include <stdlib.h>  // atexit
 
+
 // Empty page used to initialize the small free pages array
 const mi_page_t _mi_page_empty = {
-  0, false, false, false, false,
+  0, false, false, false,
   0,       // capacity
   0,       // reserved capacity
   { 0 },   // flags
@@ -22,7 +24,7 @@ const mi_page_t _mi_page_empty = {
   0,       // used
   0,       // xblock_size
   NULL,    // local_free
-  #if MI_ENCODE_FREELIST
+  #if (MI_PADDING || MI_ENCODE_FREELIST)
   { 0, 0 },
   #endif
   MI_ATOMIC_VAR_INIT(0), // xthread_free
@@ -35,6 +37,7 @@ const mi_page_t _mi_page_empty = {
 
 #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
 
+#if (MI_SMALL_WSIZE_MAX==128)
 #if (MI_PADDING>0) && (MI_INTPTR_SIZE >= 8)
 #define MI_SMALL_PAGES_EMPTY  { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() }
 #elif (MI_PADDING>0)
@@ -42,7 +45,9 @@ const mi_page_t _mi_page_empty = {
 #else
 #define MI_SMALL_PAGES_EMPTY  { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY() }
 #endif
-
+#else
+#error "define right initialization sizes corresponding to MI_SMALL_WSIZE_MAX"
+#endif
 
 // Empty page queues for every bin
 #define QNULL(sz)  { NULL, NULL, (sz)*sizeof(uintptr_t) }
@@ -77,8 +82,9 @@ const mi_page_t _mi_page_empty = {
   MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
   MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
   MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
-  { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },     \
-  { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \
+  MI_STAT_COUNT_NULL(), \
+  { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
+  { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \
   MI_STAT_COUNT_END_NULL()
 
 
@@ -130,6 +136,10 @@ mi_decl_cache_align static const mi_tld_t tld_empty = {
   { MI_STATS_NULL }       // stats
 };
 
+mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
+  return _mi_prim_thread_id();
+}
+
 // the thread-local default heap for allocation
 mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
 
@@ -193,6 +203,7 @@ mi_heap_t* _mi_heap_main_get(void) {
 typedef struct mi_thread_data_s {
   mi_heap_t  heap;  // must come first due to cast in `_mi_heap_done`
   mi_tld_t   tld;
+  mi_memid_t memid;
 } mi_thread_data_t;
 
 
@@ -201,30 +212,44 @@ typedef struct mi_thread_data_s {
 // destroy many OS threads, this may causes too much overhead
 // per thread so we maintain a small cache of recently freed metadata.
 
-#define TD_CACHE_SIZE (8)
+#define TD_CACHE_SIZE (16)
 static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE];
 
-static mi_thread_data_t* mi_thread_data_alloc(void) {
+static mi_thread_data_t* mi_thread_data_zalloc(void) {
   // try to find thread metadata in the cache
-  mi_thread_data_t* td;
+  bool is_zero = false;
+  mi_thread_data_t* td = NULL;
   for (int i = 0; i < TD_CACHE_SIZE; i++) {
     td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
     if (td != NULL) {
+      // found cached allocation, try use it
       td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
       if (td != NULL) {
-        return td;
+        break;
       }
     }
   }
-  // if that fails, allocate directly from the OS
-  td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main);
+
+  // if that fails, allocate as meta data
   if (td == NULL) {
-    // if this fails, try once more. (issue #257)
-    td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main);
+    mi_memid_t memid;
+    td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid, &_mi_stats_main);
     if (td == NULL) {
-      // really out of memory
-      _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t));
+      // if this fails, try once more. (issue #257)
+      td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid, &_mi_stats_main);
+      if (td == NULL) {
+        // really out of memory
+        _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t));
+      }
     }
+    if (td != NULL) {
+      td->memid = memid;
+      is_zero = memid.initially_zero;
+    }
+  }
+  
+  if (td != NULL && !is_zero) {
+    _mi_memzero_aligned(td, sizeof(*td));
   }
   return td;
 }
@@ -241,17 +266,17 @@ static void mi_thread_data_free( mi_thread_data_t* tdfree ) {
     }
   }
   // if that fails, just free it directly
-  _mi_os_free(tdfree, sizeof(mi_thread_data_t), &_mi_stats_main);
+  _mi_os_free(tdfree, sizeof(mi_thread_data_t), tdfree->memid, &_mi_stats_main);
 }
 
-static void mi_thread_data_collect(void) {
+void _mi_thread_data_collect(void) {
   // free all thread metadata from the cache
   for (int i = 0; i < TD_CACHE_SIZE; i++) {
     mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
     if (td != NULL) {
       td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
       if (td != NULL) {
-        _mi_os_free( td, sizeof(mi_thread_data_t), &_mi_stats_main );
+        _mi_os_free(td, sizeof(mi_thread_data_t), td->memid, &_mi_stats_main);
       }
     }
   }
@@ -259,20 +284,19 @@ static void mi_thread_data_collect(void) {
 
 // Initialize the thread local default heap, called from `mi_thread_init`
 static bool _mi_heap_init(void) {
-  if (mi_heap_is_initialized(mi_get_default_heap())) return true;
+  if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true;
   if (_mi_is_main_thread()) {
     // mi_assert_internal(_mi_heap_main.thread_id != 0);  // can happen on freeBSD where alloc is called before any initialization
     // the main heap is statically allocated
     mi_heap_main_init();
     _mi_heap_set_default_direct(&_mi_heap_main);
-    //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap());
+    //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap());
   }
   else {
     // use `_mi_os_alloc` to allocate directly from the OS
-    mi_thread_data_t* td = mi_thread_data_alloc();
+    mi_thread_data_t* td = mi_thread_data_zalloc();
     if (td == NULL) return false;
 
-    // OS allocated so already zero initialized
     mi_tld_t*  tld = &td->tld;
     mi_heap_t* heap = &td->heap;
     _mi_memcpy_aligned(tld, &tld_empty, sizeof(*tld));
@@ -334,7 +358,6 @@ static bool _mi_heap_done(mi_heap_t* heap) {
     mi_thread_data_free((mi_thread_data_t*)heap);
   }
   else {
-    mi_thread_data_collect(); // free cached thread metadata
     #if 0
     // never free the main thread even in debug mode; if a dll is linked statically with mimalloc,
     // there may still be delete/free calls after the mi_fls_done is called. Issue #207
@@ -363,54 +386,12 @@ static bool _mi_heap_done(mi_heap_t* heap) {
 // to set up the thread local keys.
 // --------------------------------------------------------
 
-static void _mi_thread_done(mi_heap_t* default_heap);
-
-#if defined(_WIN32) && defined(MI_SHARED_LIB)
-  // nothing to do as it is done in DllMain
-#elif defined(_WIN32) && !defined(MI_SHARED_LIB)
-  // use thread local storage keys to detect thread ending
-  #include <windows.h>
-  #include <fibersapi.h>
-  #if (_WIN32_WINNT < 0x600)  // before Windows Vista
-  WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback );
-  WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex );
-  WINBASEAPI BOOL  WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData );
-  WINBASEAPI BOOL  WINAPI FlsFree(_In_ DWORD dwFlsIndex);
-  #endif
-  static DWORD mi_fls_key = (DWORD)(-1);
-  static void NTAPI mi_fls_done(PVOID value) {
-    mi_heap_t* heap = (mi_heap_t*)value;
-    if (heap != NULL) {
-      _mi_thread_done(heap);
-      FlsSetValue(mi_fls_key, NULL);  // prevent recursion as _mi_thread_done may set it back to the main heap, issue #672
-    }
-  }
-#elif defined(MI_USE_PTHREADS)
-  // use pthread local storage keys to detect thread ending
-  // (and used with MI_TLS_PTHREADS for the default heap)
-  pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1);
-  static void mi_pthread_done(void* value) {
-    if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
-  }
-#elif defined(__wasi__)
-// no pthreads in the WebAssembly Standard Interface
-#else
-  #pragma message("define a way to call mi_thread_done when a thread is done")
-#endif
-
 // Set up handlers so `mi_thread_done` is called automatically
 static void mi_process_setup_auto_thread_done(void) {
   static bool tls_initialized = false; // fine if it races
   if (tls_initialized) return;
   tls_initialized = true;
-  #if defined(_WIN32) && defined(MI_SHARED_LIB)
-    // nothing to do as it is done in DllMain
-  #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
-    mi_fls_key = FlsAlloc(&mi_fls_done);
-  #elif defined(MI_USE_PTHREADS)
-    mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1));
-    pthread_key_create(&_mi_heap_default_key, &mi_pthread_done);
-  #endif
+  _mi_prim_thread_init_auto_done();
   _mi_heap_set_default_direct(&_mi_heap_main);
 }
 
@@ -442,13 +423,26 @@ void mi_thread_init(void) mi_attr_noexcept
 }
 
 void mi_thread_done(void) mi_attr_noexcept {
-  _mi_thread_done(mi_get_default_heap());
+  _mi_thread_done(NULL);
 }
 
-static void _mi_thread_done(mi_heap_t* heap) {
+void _mi_thread_done(mi_heap_t* heap) 
+{
+  // calling with NULL implies using the default heap
+  if (heap == NULL) { 
+    heap = mi_prim_get_default_heap(); 
+    if (heap == NULL) return;
+  }
+
+  // prevent re-entrancy through heap_done/heap_set_default_direct (issue #699)
+  if (!mi_heap_is_initialized(heap)) {
+    return; 
+  }
+
+  // adjust stats
   mi_atomic_decrement_relaxed(&thread_count);
   _mi_stat_decrease(&_mi_stats_main.threads, 1);
-
+  
   // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps...
   if (heap->thread_id != _mi_thread_id()) return;
 
@@ -459,7 +453,7 @@ static void _mi_thread_done(mi_heap_t* heap) {
 void _mi_heap_set_default_direct(mi_heap_t* heap)  {
   mi_assert_internal(heap != NULL);
   #if defined(MI_TLS_SLOT)
-  mi_tls_slot_set(MI_TLS_SLOT,heap);
+  mi_prim_tls_slot_set(MI_TLS_SLOT,heap);
   #elif defined(MI_TLS_PTHREAD_SLOT_OFS)
   *mi_tls_pthread_heap_slot() = heap;
   #elif defined(MI_TLS_PTHREAD)
@@ -470,16 +464,7 @@ void _mi_heap_set_default_direct(mi_heap_t* heap)  {
 
   // ensure the default heap is passed to `_mi_thread_done`
   // setting to a non-NULL value also ensures `mi_thread_done` is called.
-  #if defined(_WIN32) && defined(MI_SHARED_LIB)
-    // nothing to do as it is done in DllMain
-  #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
-    mi_assert_internal(mi_fls_key != 0);
-    FlsSetValue(mi_fls_key, heap);
-  #elif defined(MI_USE_PTHREADS)
-  if (_mi_heap_default_key != (pthread_key_t)(-1)) {  // can happen during recursive invocation on freeBSD
-    pthread_setspecific(_mi_heap_default_key, heap);
-  }
-  #endif
+  _mi_prim_thread_associate_default_heap(heap);    
 }
 
 
@@ -492,7 +477,7 @@ static bool os_preloading = true;    // true until this module is initialized
 static bool mi_redirected = false;   // true if malloc redirects to mi_malloc
 
 // Returns true if this module has not been initialized; Don't use C runtime routines until it returns false.
-bool _mi_preloading(void) {
+bool mi_decl_noinline _mi_preloading(void) {
   return os_preloading;
 }
 
@@ -535,9 +520,9 @@ static void mi_allocator_done(void) {
 // Called once by the process loader
 static void mi_process_load(void) {
   mi_heap_main_init();
-  #if defined(MI_TLS_RECURSE_GUARD)
+  #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
   volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true;
-  MI_UNUSED(dummy);
+  if (dummy == NULL) return;                    // use dummy or otherwise the access may get optimized away (issue #697)
   #endif
   os_preloading = false;
   mi_assert_internal(_mi_is_main_thread());
@@ -568,7 +553,7 @@ static void mi_detect_cpu_features(void) {
   // FSRM for fast rep movsb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017))
   int32_t cpu_info[4];
   __cpuid(cpu_info, 7);
-  _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see <https ://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
+  _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see <https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
 }
 #else
 static void mi_detect_cpu_features(void) {
@@ -579,29 +564,37 @@ static void mi_detect_cpu_features(void) {
 // Initialize the process; called by thread_init or the process loader
 void mi_process_init(void) mi_attr_noexcept {
   // ensure we are called once
-  if (_mi_process_is_initialized) return;
-  _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
+  static mi_atomic_once_t process_init;
+        #if _MSC_VER < 1920
+        mi_heap_main_init(); // vs2017 can dynamically re-initialize _mi_heap_main
+        #endif
+  if (!mi_atomic_once(&process_init)) return;
   _mi_process_is_initialized = true;
+  _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
   mi_process_setup_auto_thread_done();
 
   mi_detect_cpu_features();
   _mi_os_init();
   mi_heap_main_init();
-  #if (MI_DEBUG)
+  #if MI_DEBUG
   _mi_verbose_message("debug level : %d\n", MI_DEBUG);
   #endif
   _mi_verbose_message("secure level: %d\n", MI_SECURE);
   _mi_verbose_message("mem tracking: %s\n", MI_TRACK_TOOL);
+  #if MI_TSAN
+  _mi_verbose_message("thread santizer enabled\n");
+  #endif
   mi_thread_init();
 
-  #if defined(_WIN32) && !defined(MI_SHARED_LIB)
-  // When building as a static lib the FLS cleanup happens to early for the main thread.
+  #if defined(_WIN32)
+  // On windows, when building as a static lib the FLS cleanup happens to early for the main thread.
   // To avoid this, set the FLS value for the main thread to NULL so the fls cleanup
   // will not call _mi_thread_done on the (still executing) main thread. See issue #508.
-  FlsSetValue(mi_fls_key, NULL);
+  _mi_prim_thread_associate_default_heap(NULL);
   #endif
 
   mi_stats_reset();  // only call stat reset *after* thread init (or the heap tld == NULL)
+  mi_track_init();
 
   if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
     size_t pages = mi_option_get_clamp(mi_option_reserve_huge_os_pages, 0, 128*1024);
@@ -629,12 +622,11 @@ static void mi_cdecl mi_process_done(void) {
   if (process_done) return;
   process_done = true;
 
-  #if defined(_WIN32) && !defined(MI_SHARED_LIB)
-  FlsFree(mi_fls_key);  // call thread-done on all threads (except the main thread) to prevent dangling callback pointer if statically linked with a DLL; Issue #208
-  #endif
-
+  // release any thread specific resources and ensure _mi_thread_done is called on all but the main thread
+  _mi_prim_thread_done_auto_done();
+  
   #ifndef MI_SKIP_COLLECT_ON_EXIT
-    #if (MI_DEBUG != 0) || !defined(MI_SHARED_LIB)
+    #if (MI_DEBUG || !defined(MI_SHARED_LIB))
     // free all memory if possible on process exit. This is not needed for a stand-alone process
     // but should be done if mimalloc is statically linked into another shared library which
     // is repeatedly loaded/unloaded, see issue #281.
@@ -646,8 +638,9 @@ static void mi_cdecl mi_process_done(void) {
   // since after process_done there might still be other code running that calls `free` (like at_exit routines,
   // or C-runtime termination code.
   if (mi_option_is_enabled(mi_option_destroy_on_exit)) {
-    _mi_heap_destroy_all();                          // forcefully release all memory held by all heaps (of this thread only!)
-    _mi_segment_cache_free_all(&_mi_heap_main_get()->tld->os);  // release all cached segments
+    mi_collect(true /* force */);
+    _mi_heap_unsafe_destroy_all();     // forcefully release all memory held by all heaps (of this thread only!)
+    _mi_arena_unsafe_destroy_all(& _mi_heap_main_get()->tld->stats);
   }
 
   if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) {
diff --git a/Objects/mimalloc/options.c b/Objects/mimalloc/options.c
index 0a82ca65edd8d9..345b560e3e7f4c 100644
--- a/Objects/mimalloc/options.c
+++ b/Objects/mimalloc/options.c
@@ -5,19 +5,14 @@ terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"  // mi_prim_out_stderr
 
-#include <stdio.h>
-#include <stdlib.h> // strtol
-#include <string.h> // strncpy, strncat, strlen, strstr
-#include <ctype.h>  // toupper
+#include <stdio.h>      // FILE
+#include <stdlib.h>     // abort
 #include <stdarg.h>
 
-#ifdef _MSC_VER
-#pragma warning(disable:4996)   // strncpy, strncat
-#endif
-
 
 static long mi_max_error_count   = 16; // stop outputting errors after this (use < 0 for no limit)
 static long mi_max_warning_count = 16; // stop outputting warnings after this (use < 0 for no limit)
@@ -28,9 +23,6 @@ int mi_version(void) mi_attr_noexcept {
   return MI_MALLOC_VERSION;
 }
 
-#ifdef _WIN32
-#include <conio.h>
-#endif
 
 // --------------------------------------------------------
 // Options
@@ -49,7 +41,7 @@ typedef struct mi_option_desc_s {
   mi_init_t   init;   // is it initialized yet? (from the environment)
   mi_option_t option; // for debugging: the option index should match the option
   const char* name;   // option name without `mimalloc_` prefix
-  const char* legacy_name; // potential legacy v1.x option name
+  const char* legacy_name; // potential legacy option name
 } mi_option_desc_t;
 
 #define MI_OPTION(opt)                  mi_option_##opt, #opt, NULL
@@ -66,36 +58,38 @@ static mi_option_desc_t options[_mi_option_last] =
   { 0, UNINIT, MI_OPTION(show_stats) },
   { 0, UNINIT, MI_OPTION(verbose) },
 
-  // Some of the following options are experimental and not all combinations are valid. Use with care.
-  { 1, UNINIT, MI_OPTION(eager_commit) },        // commit per segment directly (8MiB)  (but see also `eager_commit_delay`)
-  { 0, UNINIT, MI_OPTION(deprecated_eager_region_commit) },
-  { 0, UNINIT, MI_OPTION(deprecated_reset_decommits) },
-  { 0, UNINIT, MI_OPTION(large_os_pages) },      // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
-  { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) },  // per 1GiB huge pages
-  { -1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) }, // reserve huge pages at node N
+  // the following options are experimental and not all combinations make sense.
+  { 1, UNINIT, MI_OPTION(eager_commit) },               // commit per segment directly (4MiB)  (but see also `eager_commit_delay`)
+  { 2, UNINIT, MI_OPTION_LEGACY(arena_eager_commit,eager_region_commit) }, // eager commit arena's? 2 is used to enable this only on an OS that has overcommit (i.e. linux)
+  { 1, UNINIT, MI_OPTION_LEGACY(purge_decommits,reset_decommits) },        // purge decommits memory (instead of reset) (note: on linux this uses MADV_DONTNEED for decommit)
+  { 0, UNINIT, MI_OPTION_LEGACY(allow_large_os_pages,large_os_pages) },    // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
+  { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) },      // per 1GiB huge pages
+  {-1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) },   // reserve huge pages at node N
   { 0, UNINIT, MI_OPTION(reserve_os_memory)     },
-  { 0, UNINIT, MI_OPTION(deprecated_segment_cache) },  // cache N segments per thread
-  { 0, UNINIT, MI_OPTION(page_reset) },          // reset page memory on free
-  { 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_decommit, abandoned_page_reset) },// decommit free page memory when a thread terminates  
-  { 0, UNINIT, MI_OPTION(deprecated_segment_reset) },
-  #if defined(__NetBSD__)
-  { 0, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed
-  #elif defined(_WIN32)
-  { 4, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
+  { 0, UNINIT, MI_OPTION(deprecated_segment_cache) },   // cache N segments per thread
+  { 0, UNINIT, MI_OPTION(deprecated_page_reset) },      // reset page memory on free
+  { 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_purge,abandoned_page_reset) },       // reset free page memory when a thread terminates
+  { 0, UNINIT, MI_OPTION(deprecated_segment_reset) },   // reset segment memory on free (needs eager commit)
+#if defined(__NetBSD__)
+  { 0, UNINIT, MI_OPTION(eager_commit_delay) },         // the first N segments per thread are not eagerly committed
+#else
+  { 1, UNINIT, MI_OPTION(eager_commit_delay) },         // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
+#endif
+  { 10,  UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) },  // purge delay in milli-seconds
+  { 0,   UNINIT, MI_OPTION(use_numa_nodes) },           // 0 = use available numa nodes, otherwise use at most N nodes.
+  { 0,   UNINIT, MI_OPTION(limit_os_alloc) },           // 1 = do not use OS memory for allocation (but only reserved arenas)
+  { 100, UNINIT, MI_OPTION(os_tag) },                   // only apple specific for now but might serve more or less related purpose
+  { 16,  UNINIT, MI_OPTION(max_errors) },               // maximum errors that are output
+  { 16,  UNINIT, MI_OPTION(max_warnings) },             // maximum warnings that are output
+  { 8,   UNINIT, MI_OPTION(max_segment_reclaim)},       // max. number of segment reclaims from the abandoned segments per try.
+  { 0,   UNINIT, MI_OPTION(destroy_on_exit)},           // release all OS memory on process exit; careful with dangling pointer or after-exit frees!
+  #if (MI_INTPTR_SIZE>4)
+  { 1024L * 1024L, UNINIT, MI_OPTION(arena_reserve) },  // reserve memory N KiB at a time
   #else
-  { 1, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
+  {  128L * 1024L, UNINIT, MI_OPTION(arena_reserve) },
   #endif
-  { 25,   UNINIT, MI_OPTION_LEGACY(decommit_delay, reset_delay) }, // page decommit delay in milli-seconds
-  { 0,    UNINIT, MI_OPTION(use_numa_nodes) },    // 0 = use available numa nodes, otherwise use at most N nodes. 
-  { 0,    UNINIT, MI_OPTION(limit_os_alloc) },    // 1 = do not use OS memory for allocation (but only reserved arenas)
-  { 100,  UNINIT, MI_OPTION(os_tag) },            // only apple specific for now but might serve more or less related purpose
-  { 16,   UNINIT, MI_OPTION(max_errors) },        // maximum errors that are output
-  { 16,   UNINIT, MI_OPTION(max_warnings) },      // maximum warnings that are output
-  { 8,    UNINIT, MI_OPTION(max_segment_reclaim)},// max. number of segment reclaims from the abandoned segments per try.  
-  { 1,    UNINIT, MI_OPTION(allow_decommit) },    // decommit slices when no longer used (after decommit_delay milli-seconds)
-  { 500,  UNINIT, MI_OPTION(segment_decommit_delay) }, // decommit delay in milli-seconds for freed segments
-  { 1,    UNINIT, MI_OPTION(decommit_extend_delay) },
-  { 0,    UNINIT, MI_OPTION(destroy_on_exit)}     // release all OS memory on process exit; careful with dangling pointer or after-exit frees!
+  { 10,  UNINIT, MI_OPTION(arena_purge_mult) },        // purge delay multiplier for arena's
+  { 1,   UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) },
 };
 
 static void mi_option_init(mi_option_desc_t* desc);
@@ -133,6 +127,12 @@ mi_decl_nodiscard long mi_option_get_clamp(mi_option_t option, long min, long ma
   return (x < min ? min : (x > max ? max : x));
 }
 
+mi_decl_nodiscard size_t mi_option_get_size(mi_option_t option) {
+  mi_assert_internal(option == mi_option_reserve_os_memory || option == mi_option_arena_reserve);
+  long x = mi_option_get(option);
+  return (x < 0 ? 0 : (size_t)x * MI_KiB);
+}
+
 void mi_option_set(mi_option_t option, long value) {
   mi_assert(option >= 0 && option < _mi_option_last);
   if (option < 0 || option >= _mi_option_last) return;
@@ -171,41 +171,11 @@ void mi_option_disable(mi_option_t option) {
   mi_option_set_enabled(option,false);
 }
 
-
 static void mi_cdecl mi_out_stderr(const char* msg, void* arg) {
   MI_UNUSED(arg);
-  if (msg == NULL) return;
-  #ifdef _WIN32
-  // on windows with redirection, the C runtime cannot handle locale dependent output
-  // after the main thread closes so we use direct console output.
-  if (!_mi_preloading()) {
-    // _cputs(msg);  // _cputs cannot be used at is aborts if it fails to lock the console
-    static HANDLE hcon = INVALID_HANDLE_VALUE;
-    static bool hconIsConsole;
-    if (hcon == INVALID_HANDLE_VALUE) {
-      CONSOLE_SCREEN_BUFFER_INFO sbi;
-      hcon = GetStdHandle(STD_ERROR_HANDLE);
-      hconIsConsole = ((hcon != INVALID_HANDLE_VALUE) && GetConsoleScreenBufferInfo(hcon, &sbi));
-    }
-    const size_t len = strlen(msg);
-    if (len > 0 && len < UINT32_MAX) {
-      DWORD written = 0;
-      if (hconIsConsole) {
-        WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL);
-      }
-      else if (hcon != INVALID_HANDLE_VALUE) {
-        // use direct write if stderr was redirected
-        WriteFile(hcon, msg, (DWORD)len, &written, NULL);
-      }
-      else {
-        // finally fall back to fputs after all
-        fputs(msg, stderr);
-      }
-    }
+  if (msg != NULL && msg[0] != 0) {
+    _mi_prim_out_stderr(msg);
   }
-  #else
-  fputs(msg, stderr);
-  #endif
 }
 
 // Since an output function can be registered earliest in the `main`
@@ -222,7 +192,7 @@ static void mi_cdecl mi_out_buf(const char* msg, void* arg) {
   MI_UNUSED(arg);
   if (msg==NULL) return;
   if (mi_atomic_load_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return;
-  size_t n = strlen(msg);
+  size_t n = _mi_strlen(msg);
   if (n==0) return;
   // claim space
   size_t start = mi_atomic_add_acq_rel(&out_len, n);
@@ -279,7 +249,7 @@ void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept {
 }
 
 // add stderr to the delayed output after the module is loaded
-static void mi_add_stderr_output() {
+static void mi_add_stderr_output(void) {
   mi_assert_internal(mi_out_default == NULL);
   mi_out_buf_flush(&mi_out_stderr, false, NULL); // flush current contents to stderr
   mi_out_default = &mi_out_buf_stderr;           // and add stderr to the delayed output
@@ -314,7 +284,7 @@ static mi_decl_noinline void mi_recurse_exit_prim(void) {
 
 static bool mi_recurse_enter(void) {
   #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
-  if (_mi_preloading()) return true;
+  if (_mi_preloading()) return false;
   #endif
   return mi_recurse_enter_prim();
 }
@@ -327,7 +297,7 @@ static void mi_recurse_exit(void) {
 }
 
 void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message) {
-  if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) { // TODO: use mi_out_stderr for stderr?
+  if (out==NULL || (void*)out==(void*)stdout || (void*)out==(void*)stderr) { // TODO: use mi_out_stderr for stderr?
     if (!mi_recurse_enter()) return;
     out = mi_out_get_default(&arg);
     if (prefix != NULL) out(prefix, arg);
@@ -359,7 +329,7 @@ void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... ) {
 }
 
 static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args) {
-  if (prefix != NULL && strlen(prefix) <= 32 && !_mi_is_main_thread()) {
+  if (prefix != NULL && _mi_strnlen(prefix,33) <= 32 && !_mi_is_main_thread()) {
     char tprefix[64];
     snprintf(tprefix, sizeof(tprefix), "%sthread 0x%llx: ", prefix, (unsigned long long)_mi_thread_id());
     mi_vfprintf(out, arg, tprefix, fmt, args);
@@ -464,8 +434,20 @@ void _mi_error_message(int err, const char* fmt, ...) {
 // --------------------------------------------------------
 // Initialize options by checking the environment
 // --------------------------------------------------------
+char _mi_toupper(char c) {
+  if (c >= 'a' && c <= 'z') return (c - 'a' + 'A');
+                       else return c;
+}
+
+int _mi_strnicmp(const char* s, const char* t, size_t n) {
+  if (n == 0) return 0;
+  for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) {
+    if (_mi_toupper(*s) != _mi_toupper(*t)) break;
+  }
+  return (n == 0 ? 0 : *s - *t);
+}
 
-static void mi_strlcpy(char* dest, const char* src, size_t dest_size) {
+void _mi_strlcpy(char* dest, const char* src, size_t dest_size) {
   if (dest==NULL || src==NULL || dest_size == 0) return;
   // copy until end of src, or when dest is (almost) full
   while (*src != 0 && dest_size > 1) {
@@ -476,7 +458,7 @@ static void mi_strlcpy(char* dest, const char* src, size_t dest_size) {
   *dest = 0;
 }
 
-static void mi_strlcat(char* dest, const char* src, size_t dest_size) {
+void _mi_strlcat(char* dest, const char* src, size_t dest_size) {
   if (dest==NULL || src==NULL || dest_size == 0) return;
   // find end of string in the dest buffer
   while (*dest != 0 && dest_size > 1) {
@@ -484,7 +466,21 @@ static void mi_strlcat(char* dest, const char* src, size_t dest_size) {
     dest_size--;
   }
   // and catenate
-  mi_strlcpy(dest, src, dest_size);
+  _mi_strlcpy(dest, src, dest_size);
+}
+
+size_t _mi_strlen(const char* s) {
+  if (s==NULL) return 0;
+  size_t len = 0;
+  while(s[len] != 0) { len++; }
+  return len;
+}
+
+size_t _mi_strnlen(const char* s, size_t max_len) {
+  if (s==NULL) return 0;
+  size_t len = 0;
+  while(s[len] != 0 && len < max_len) { len++; }
+  return len;
 }
 
 #ifdef MI_NO_GETENV
@@ -495,107 +491,40 @@ static bool mi_getenv(const char* name, char* result, size_t result_size) {
   return false;
 }
 #else
-#if defined _WIN32
-// On Windows use GetEnvironmentVariable instead of getenv to work
-// reliably even when this is invoked before the C runtime is initialized.
-// i.e. when `_mi_preloading() == true`.
-// Note: on windows, environment names are not case sensitive.
-#include <windows.h>
 static bool mi_getenv(const char* name, char* result, size_t result_size) {
-  result[0] = 0;
-  size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size);
-  return (len > 0 && len < result_size);
-}
-#elif !defined(MI_USE_ENVIRON) || (MI_USE_ENVIRON!=0)
-// On Posix systemsr use `environ` to acces environment variables
-// even before the C runtime is initialized.
-#if defined(__APPLE__) && defined(__has_include) && __has_include(<crt_externs.h>)
-#include <crt_externs.h>
-static char** mi_get_environ(void) {
-  return (*_NSGetEnviron());
-}
-#else
-extern char** environ;
-static char** mi_get_environ(void) {
-  return environ;
+  if (name==NULL || result == NULL || result_size < 64) return false;
+  return _mi_prim_getenv(name,result,result_size);
 }
 #endif
-static int mi_strnicmp(const char* s, const char* t, size_t n) {
-  if (n == 0) return 0;
-  for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) {
-    if (toupper(*s) != toupper(*t)) break;
-  }
-  return (n == 0 ? 0 : *s - *t);
-}
-static bool mi_getenv(const char* name, char* result, size_t result_size) {
-  if (name==NULL) return false;
-  const size_t len = strlen(name);
-  if (len == 0) return false;
-  char** env = mi_get_environ();
-  if (env == NULL) return false;
-  // compare up to 256 entries
-  for (int i = 0; i < 256 && env[i] != NULL; i++) {
-    const char* s = env[i];
-    if (mi_strnicmp(name, s, len) == 0 && s[len] == '=') { // case insensitive
-      // found it
-      mi_strlcpy(result, s + len + 1, result_size);
-      return true;
-    }
-  }
-  return false;
-}
-#else
-// fallback: use standard C `getenv` but this cannot be used while initializing the C runtime
-static bool mi_getenv(const char* name, char* result, size_t result_size) {
-  // cannot call getenv() when still initializing the C runtime.
-  if (_mi_preloading()) return false;
-  const char* s = getenv(name);
-  if (s == NULL) {
-    // we check the upper case name too.
-    char buf[64+1];
-    size_t len = strlen(name);
-    if (len >= sizeof(buf)) len = sizeof(buf) - 1;
-    for (size_t i = 0; i < len; i++) {
-      buf[i] = toupper(name[i]);
-    }
-    buf[len] = 0;
-    s = getenv(buf);
-  }
-  if (s != NULL && strlen(s) < result_size) {
-    mi_strlcpy(result, s, result_size);
-    return true;
-  }
-  else {
-    return false;
-  }
-}
-#endif  // !MI_USE_ENVIRON
-#endif  // !MI_NO_GETENV
+
+// TODO: implement ourselves to reduce dependencies on the C runtime
+#include <stdlib.h> // strtol
+#include <string.h> // strstr
+
 
 static void mi_option_init(mi_option_desc_t* desc) {
   // Read option value from the environment
-  char s[64+1];
+  char s[64 + 1];
   char buf[64+1];
-  mi_strlcpy(buf, "mimalloc_", sizeof(buf));
-  mi_strlcat(buf, desc->name, sizeof(buf));
-  bool found = mi_getenv(buf,s,sizeof(s));
+  _mi_strlcpy(buf, "mimalloc_", sizeof(buf));
+  _mi_strlcat(buf, desc->name, sizeof(buf));
+  bool found = mi_getenv(buf, s, sizeof(s));
   if (!found && desc->legacy_name != NULL) {
-    mi_strlcpy(buf, "mimalloc_", sizeof(buf));
-    mi_strlcat(buf, desc->legacy_name, sizeof(buf));
-    found = mi_getenv(buf,s,sizeof(s));
+    _mi_strlcpy(buf, "mimalloc_", sizeof(buf));
+    _mi_strlcat(buf, desc->legacy_name, sizeof(buf));
+    found = mi_getenv(buf, s, sizeof(s));
     if (found) {
-      _mi_warning_message("environment option \"mimalloc_%s\" is deprecated -- use \"mimalloc_%s\" instead.\n", desc->legacy_name, desc->name );
-    }    
+      _mi_warning_message("environment option \"mimalloc_%s\" is deprecated -- use \"mimalloc_%s\" instead.\n", desc->legacy_name, desc->name);
+    }
   }
 
   if (found) {
-    size_t len = strlen(s);
-    if (len >= sizeof(buf)) len = sizeof(buf) - 1;
+    size_t len = _mi_strnlen(s, sizeof(buf) - 1);
     for (size_t i = 0; i < len; i++) {
-      buf[i] = (char)toupper(s[i]);
+      buf[i] = _mi_toupper(s[i]);
     }
     buf[len] = 0;
-    if (buf[0]==0 || strstr("1;TRUE;YES;ON", buf) != NULL) {
+    if (buf[0] == 0 || strstr("1;TRUE;YES;ON", buf) != NULL) {
       desc->value = 1;
       desc->init = INITIALIZED;
     }
@@ -606,7 +535,7 @@ static void mi_option_init(mi_option_desc_t* desc) {
     else {
       char* end = buf;
       long value = strtol(buf, &end, 10);
-      if (desc->option == mi_option_reserve_os_memory) {
+      if (desc->option == mi_option_reserve_os_memory || desc->option == mi_option_arena_reserve) {
         // this option is interpreted in KiB to prevent overflow of `long`
         if (*end == 'K') { end++; }
         else if (*end == 'M') { value *= MI_KiB; end++; }
@@ -626,11 +555,11 @@ static void mi_option_init(mi_option_desc_t* desc) {
           // if the 'mimalloc_verbose' env var has a bogus value we'd never know
           // (since the value defaults to 'off') so in that case briefly enable verbose
           desc->value = 1;
-          _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name );
+          _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name);
           desc->value = 0;
         }
         else {
-          _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name );
+          _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name);
         }
       }
     }
diff --git a/Objects/mimalloc/os.c b/Objects/mimalloc/os.c
index 0f98474170e9e8..4928186b980f26 100644
--- a/Objects/mimalloc/os.c
+++ b/Objects/mimalloc/os.c
@@ -1,118 +1,54 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
-#ifndef _DEFAULT_SOURCE
-#define _DEFAULT_SOURCE   // ensure mmap flags are defined
-#endif
-
-#if defined(__sun)
-// illumos provides new mman.h api when any of these are defined
-// otherwise the old api based on caddr_t which predates the void pointers one.
-// stock solaris provides only the former, chose to atomically to discard those
-// flags only here rather than project wide tough.
-#undef _XOPEN_SOURCE
-#undef _POSIX_C_SOURCE
-#endif
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"
 
-#include <string.h>  // strerror
-
-#ifdef _MSC_VER
-#pragma warning(disable:4996)  // strerror
-#endif
-
-#if defined(__wasi__)
-#define MI_USE_SBRK
-#endif
-
-#if defined(_WIN32)
-#include <windows.h>
-#elif defined(__wasi__)
-#include <unistd.h>    // sbrk
-#else
-#include <sys/mman.h>  // mmap
-#include <unistd.h>    // sysconf
-#if defined(__linux__)
-#include <features.h>
-#include <fcntl.h>
-#if defined(__GLIBC__)
-#include <linux/mman.h> // linux mmap flags
-#else
-#include <sys/mman.h>
-#endif
-#endif
-#if defined(__APPLE__)
-#include <TargetConditionals.h>
-#if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR
-#include <mach/vm_statistics.h>
-#endif
-#endif
-#if defined(__FreeBSD__) || defined(__DragonFly__)
-#include <sys/param.h>
-#if __FreeBSD_version >= 1200000
-#include <sys/cpuset.h>
-#include <sys/domainset.h>
-#endif
-#include <sys/sysctl.h>
-#endif
-#endif
 
 /* -----------------------------------------------------------
   Initialization.
   On windows initializes support for aligned allocation and
   large OS pages (if MIMALLOC_LARGE_OS_PAGES is true).
 ----------------------------------------------------------- */
-bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
-bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats);
 
-static void* mi_align_up_ptr(void* p, size_t alignment) {
-  return (void*)_mi_align_up((uintptr_t)p, alignment);
-}
+static mi_os_mem_config_t mi_os_mem_config = {
+  4096,   // page size
+  0,      // large page size (usually 2MiB)
+  4096,   // allocation granularity
+  true,   // has overcommit?  (if true we use MAP_NORESERVE on mmap systems)
+  false,  // must free whole? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span)
+  true    // has virtual reserve? (if true we can reserve virtual address space without using commit or physical memory)
+};
 
-static void* mi_align_down_ptr(void* p, size_t alignment) {
-  return (void*)_mi_align_down((uintptr_t)p, alignment);
+bool _mi_os_has_overcommit(void) {
+  return mi_os_mem_config.has_overcommit;
 }
 
-
-// page size (initialized properly in `os_init`)
-static size_t os_page_size = 4096;
-
-// minimal allocation granularity
-static size_t os_alloc_granularity = 4096;
-
-// if non-zero, use large page allocation
-static size_t large_os_page_size = 0;
-
-// is memory overcommit allowed?
-// set dynamically in _mi_os_init (and if true we use MAP_NORESERVE)
-static bool os_overcommit = true;
-
-bool _mi_os_has_overcommit(void) {
-  return os_overcommit;
+bool _mi_os_has_virtual_reserve(void) { 
+  return mi_os_mem_config.has_virtual_reserve;
 }
 
+
 // OS (small) page size
 size_t _mi_os_page_size(void) {
-  return os_page_size;
+  return mi_os_mem_config.page_size;
 }
 
 // if large OS pages are supported (2 or 4MiB), then return the size, otherwise return the small page size (4KiB)
 size_t _mi_os_large_page_size(void) {
-  return (large_os_page_size != 0 ? large_os_page_size : _mi_os_page_size());
+  return (mi_os_mem_config.large_page_size != 0 ? mi_os_mem_config.large_page_size : _mi_os_page_size());
 }
 
-#if !defined(MI_USE_SBRK) && !defined(__wasi__)
-static bool use_large_os_page(size_t size, size_t alignment) {
+bool _mi_os_use_large_page(size_t size, size_t alignment) {
   // if we have access, check the size and alignment requirements
-  if (large_os_page_size == 0 || !mi_option_is_enabled(mi_option_large_os_pages)) return false;
-  return ((size % large_os_page_size) == 0 && (alignment % large_os_page_size) == 0);
+  if (mi_os_mem_config.large_page_size == 0 || !mi_option_is_enabled(mi_option_allow_large_os_pages)) return false;
+  return ((size % mi_os_mem_config.large_page_size) == 0 && (alignment % mi_os_mem_config.large_page_size) == 0);
 }
-#endif
 
 // round to a good OS allocation size (bounded by max 12.5% waste)
 size_t _mi_os_good_alloc_size(size_t size) {
@@ -126,177 +62,24 @@ size_t _mi_os_good_alloc_size(size_t size) {
   return _mi_align_up(size, align_size);
 }
 
-#if defined(_WIN32)
-// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016.
-// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility)
-// NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB)
-// We define a minimal MEM_EXTENDED_PARAMETER ourselves in order to be able to compile with older SDK's.
-typedef enum MI_MEM_EXTENDED_PARAMETER_TYPE_E {
-  MiMemExtendedParameterInvalidType = 0,
-  MiMemExtendedParameterAddressRequirements,
-  MiMemExtendedParameterNumaNode,
-  MiMemExtendedParameterPartitionHandle,
-  MiMemExtendedParameterUserPhysicalHandle,
-  MiMemExtendedParameterAttributeFlags,
-  MiMemExtendedParameterMax
-} MI_MEM_EXTENDED_PARAMETER_TYPE;
-
-typedef struct DECLSPEC_ALIGN(8) MI_MEM_EXTENDED_PARAMETER_S {
-  struct { DWORD64 Type : 8; DWORD64 Reserved : 56; } Type;
-  union  { DWORD64 ULong64; PVOID Pointer; SIZE_T Size; HANDLE Handle; DWORD ULong; } Arg;
-} MI_MEM_EXTENDED_PARAMETER;
-
-typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S {
-  PVOID  LowestStartingAddress;
-  PVOID  HighestEndingAddress;
-  SIZE_T Alignment;
-} MI_MEM_ADDRESS_REQUIREMENTS;
-
-#define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE   0x00000010
-
-#include <winternl.h>
-typedef PVOID    (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
-typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
-static PVirtualAlloc2 pVirtualAlloc2 = NULL;
-static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;
-
-// Similarly, GetNumaProcesorNodeEx is only supported since Windows 7
-typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER;
-
-typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber);
-typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber);
-typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask);
-typedef BOOL (__stdcall *PGetNumaProcessorNode)(UCHAR Processor, PUCHAR NodeNumber);
-static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL;
-static PGetNumaProcessorNodeEx      pGetNumaProcessorNodeEx = NULL;
-static PGetNumaNodeProcessorMaskEx  pGetNumaNodeProcessorMaskEx = NULL;
-static PGetNumaProcessorNode        pGetNumaProcessorNode = NULL;
-
-static bool mi_win_enable_large_os_pages(void)
-{
-  if (large_os_page_size > 0) return true;
-
-  // Try to see if large OS pages are supported
-  // To use large pages on Windows, we first need access permission
-  // Set "Lock pages in memory" permission in the group policy editor
-  // <https://devblogs.microsoft.com/oldnewthing/20110128-00/?p=11643>
-  unsigned long err = 0;
-  HANDLE token = NULL;
-  BOOL ok = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token);
-  if (ok) {
-    TOKEN_PRIVILEGES tp;
-    ok = LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid);
-    if (ok) {
-      tp.PrivilegeCount = 1;
-      tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
-      ok = AdjustTokenPrivileges(token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);
-      if (ok) {
-        err = GetLastError();
-        ok = (err == ERROR_SUCCESS);
-        if (ok) {
-          large_os_page_size = GetLargePageMinimum();
-        }
-      }
-    }
-    CloseHandle(token);
-  }
-  if (!ok) {
-    if (err == 0) err = GetLastError();
-    _mi_warning_message("cannot enable large OS page support, error %lu\n", err);
-  }
-  return (ok!=0);
-}
-
-void _mi_os_init(void)
-{
-  os_overcommit = false;
-  // get the page size
-  SYSTEM_INFO si;
-  GetSystemInfo(&si);
-  if (si.dwPageSize > 0) os_page_size = si.dwPageSize;
-  if (si.dwAllocationGranularity > 0) os_alloc_granularity = si.dwAllocationGranularity;
-  // get the VirtualAlloc2 function
-  HINSTANCE  hDll;
-  hDll = LoadLibrary(TEXT("kernelbase.dll"));
-  if (hDll != NULL) {
-    // use VirtualAlloc2FromApp if possible as it is available to Windows store apps
-    pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp");
-    if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2");
-    FreeLibrary(hDll);
-  }
-  // NtAllocateVirtualMemoryEx is used for huge page allocation
-  hDll = LoadLibrary(TEXT("ntdll.dll"));
-  if (hDll != NULL) {
-    pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx");
-    FreeLibrary(hDll);
-  }
-  // Try to use Win7+ numa API
-  hDll = LoadLibrary(TEXT("kernel32.dll"));
-  if (hDll != NULL) {
-    pGetCurrentProcessorNumberEx = (PGetCurrentProcessorNumberEx)(void (*)(void))GetProcAddress(hDll, "GetCurrentProcessorNumberEx");
-    pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx");
-    pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx");
-    pGetNumaProcessorNode = (PGetNumaProcessorNode)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNode");
-    FreeLibrary(hDll);
-  }
-  if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
-    mi_win_enable_large_os_pages();
-  }
-}
-#elif defined(__wasi__)
 void _mi_os_init(void) {
-  os_overcommit = false;
-  os_page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB
-  os_alloc_granularity = 16;
+  _mi_prim_mem_init(&mi_os_mem_config);
 }
 
-#else  // generic unix
-
-static void os_detect_overcommit(void) {
-#if defined(__linux__)
-  int fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
-	if (fd < 0) return;
-  char buf[32];
-  ssize_t nread = read(fd, &buf, sizeof(buf));
-	close(fd);
-  // <https://www.kernel.org/doc/Documentation/vm/overcommit-accounting>
-  // 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE)
-  if (nread >= 1) {
-    os_overcommit = (buf[0] == '0' || buf[0] == '1');
-  }
-#elif defined(__FreeBSD__)
-  int val = 0;
-  size_t olen = sizeof(val);
-  if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) {
-    os_overcommit = (val != 0);
-  }
-#else
-  // default: overcommit is true
-#endif
-}
 
-void _mi_os_init(void) {
-  // get the page size
-  long result = sysconf(_SC_PAGESIZE);
-  if (result > 0) {
-    os_page_size = (size_t)result;
-    os_alloc_granularity = os_page_size;
-  }
-  large_os_page_size = 2*MI_MiB; // TODO: can we query the OS for this?
-  os_detect_overcommit();
-}
-#endif
+/* -----------------------------------------------------------
+  Util
+-------------------------------------------------------------- */
+bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
+bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats);
 
+static void* mi_align_up_ptr(void* p, size_t alignment) {
+  return (void*)_mi_align_up((uintptr_t)p, alignment);
+}
 
-#if defined(MADV_NORMAL)
-static int mi_madvise(void* addr, size_t length, int advice) {
-  #if defined(__sun)
-  return madvise((caddr_t)addr, length, advice);  // Solaris needs cast (issue #520)
-  #else
-  return madvise(addr, length, advice);
-  #endif
+static void* mi_align_down_ptr(void* p, size_t alignment) {
+  return (void*)_mi_align_down((uintptr_t)p, alignment);
 }
-#endif
 
 
 /* -----------------------------------------------------------
@@ -319,7 +102,7 @@ static mi_decl_cache_align _Atomic(uintptr_t)aligned_base;
 #define MI_HINT_AREA ((uintptr_t)4 << 40)  // upto 6TiB   (since before win8 there is "only" 8TiB available to processes)
 #define MI_HINT_MAX  ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages)
 
-static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size)
+void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size)
 {
   if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL;
   size = _mi_align_up(size, MI_SEGMENT_SIZE);
@@ -332,7 +115,7 @@ static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size)
   if (hint == 0 || hint > MI_HINT_MAX) {   // wrap or initialize
     uintptr_t init = MI_HINT_BASE;
     #if (MI_SECURE>0 || MI_DEBUG==0)       // security: randomize start of aligned allocations unless in debug mode
-    uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
+    uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap());
     init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA);  // (randomly 20 bits)*4MiB == 0 to 4TiB
     #endif
     uintptr_t expected = hint + size;
@@ -343,361 +126,61 @@ static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size)
   return (void*)hint;
 }
 #else
-static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
+void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
   MI_UNUSED(try_alignment); MI_UNUSED(size);
   return NULL;
 }
 #endif
 
+
 /* -----------------------------------------------------------
   Free memory
 -------------------------------------------------------------- */
 
-static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats_t* stats)
-{
-  if (addr == NULL || size == 0) return true; // || _mi_os_is_huge_reserved(addr)
-  bool err = false;
-#if defined(_WIN32)
-  DWORD errcode = 0;
-  err = (VirtualFree(addr, 0, MEM_RELEASE) == 0);
-  if (err) { errcode = GetLastError(); }
-  if (errcode == ERROR_INVALID_ADDRESS) {
-    // In mi_os_mem_alloc_aligned the fallback path may have returned a pointer inside
-    // the memory region returned by VirtualAlloc; in that case we need to free using
-    // the start of the region.
-    MEMORY_BASIC_INFORMATION info = { 0 };
-    VirtualQuery(addr, &info, sizeof(info));
-    if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) {
-      errcode = 0;
-      err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0);
-      if (err) { errcode = GetLastError(); }
-    }
-  }
-  if (errcode != 0) {
-    _mi_warning_message("unable to release OS memory: error code 0x%x, addr: %p, size: %zu\n", errcode, addr, size);
-  }
-#elif defined(MI_USE_SBRK) || defined(__wasi__)
-  err = false; // sbrk heap cannot be shrunk
-#else
-  err = (munmap(addr, size) == -1);
-  if (err) {
-    _mi_warning_message("unable to release OS memory: %s, addr: %p, size: %zu\n", strerror(errno), addr, size);
+static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats);
+
+static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_stats_t* tld_stats) {
+  MI_UNUSED(tld_stats);
+  mi_assert_internal((size % _mi_os_page_size()) == 0);
+  if (addr == NULL || size == 0) return; // || _mi_os_is_huge_reserved(addr)
+  int err = _mi_prim_free(addr, size);
+  if (err != 0) {
+    _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr);
   }
-#endif
-  if (was_committed) { _mi_stat_decrease(&stats->committed, size); }
+  mi_stats_t* stats = &_mi_stats_main;
+  if (still_committed) { _mi_stat_decrease(&stats->committed, size); }
   _mi_stat_decrease(&stats->reserved, size);
-  return !err;
 }
 
-
-/* -----------------------------------------------------------
-  Raw allocation on Windows (VirtualAlloc)
--------------------------------------------------------------- */
-
-#ifdef _WIN32
- 
-#define MEM_COMMIT_RESERVE  (MEM_COMMIT|MEM_RESERVE)
-
-static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) {
-#if (MI_INTPTR_SIZE >= 8)
-  // on 64-bit systems, try to use the virtual address area after 2TiB for 4MiB aligned allocations
-  if (addr == NULL) {
-    void* hint = mi_os_get_aligned_hint(try_alignment,size);
-    if (hint != NULL) {
-      void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE);
-      if (p != NULL) return p;
-      _mi_verbose_message("warning: unable to allocate hinted aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), hint, try_alignment, flags);
-      // fall through on error
+void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* tld_stats) {
+  if (mi_memkind_is_os(memid.memkind)) {
+    size_t csize = _mi_os_good_alloc_size(size);
+    void* base = addr;
+    // different base? (due to alignment)
+    if (memid.mem.os.base != NULL) {
+      mi_assert(memid.mem.os.base <= addr);
+      mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr);
+      base = memid.mem.os.base;
+      csize += ((uint8_t*)addr - (uint8_t*)memid.mem.os.base);
     }
-  }
-#endif
-  // on modern Windows try use VirtualAlloc2 for aligned allocation
-  if (try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
-    MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 };
-    reqs.Alignment = try_alignment;
-    MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} };
-    param.Type.Type = MiMemExtendedParameterAddressRequirements;
-    param.Arg.Pointer = &reqs;
-    void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, &param, 1);
-    if (p != NULL) return p;
-    _mi_warning_message("unable to allocate aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags);
-    // fall through on error
-  }
-  // last resort
-  return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
-}
-
-static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) {
-  mi_assert_internal(!(large_only && !allow_large));
-  static _Atomic(size_t) large_page_try_ok; // = 0;
-  void* p = NULL;
-  // Try to allocate large OS pages (2MiB) if allowed or required.
-  if ((large_only || use_large_os_page(size, try_alignment))
-      && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) {
-    size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
-    if (!large_only && try_ok > 0) {
-      // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive.
-      // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times.
-      mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
+    // free it
+    if (memid.memkind == MI_MEM_OS_HUGE) {
+      mi_assert(memid.is_pinned);
+      mi_os_free_huge_os_pages(base, csize, tld_stats);
     }
     else {
-      // large OS pages must always reserve and commit.
-      *is_large = true;
-      p = mi_win_virtual_allocx(addr, size, try_alignment, flags | MEM_LARGE_PAGES);
-      if (large_only) return p;
-      // fall back to non-large page allocation on error (`p == NULL`).
-      if (p == NULL) {
-        mi_atomic_store_release(&large_page_try_ok,10UL);  // on error, don't try again for the next N allocations
-      }
+      mi_os_prim_free(base, csize, still_committed, tld_stats);
     }
   }
-  // Fall back to regular page allocation
-  if (p == NULL) {
-    *is_large = ((flags&MEM_LARGE_PAGES) != 0);
-    p = mi_win_virtual_allocx(addr, size, try_alignment, flags);
-  }
-  if (p == NULL) {
-    _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x, large only: %d, allow large: %d)\n", size, GetLastError(), addr, try_alignment, flags, large_only, allow_large);
-  }
-  return p;
-}
-
-/* -----------------------------------------------------------
-  Raw allocation using `sbrk` or `wasm_memory_grow`
--------------------------------------------------------------- */
-
-#elif defined(MI_USE_SBRK) || defined(__wasi__)
-#if defined(MI_USE_SBRK)
-  static void* mi_memory_grow( size_t size ) {
-    void* p = sbrk(size);
-    if (p == (void*)(-1)) return NULL;
-    #if !defined(__wasi__) // on wasi this is always zero initialized already (?)
-    memset(p,0,size);
-    #endif
-    return p;
-  }
-#elif defined(__wasi__)
-  static void* mi_memory_grow( size_t size ) {
-    size_t base = (size > 0 ? __builtin_wasm_memory_grow(0,_mi_divide_up(size, _mi_os_page_size()))
-                            : __builtin_wasm_memory_size(0));
-    if (base == SIZE_MAX) return NULL;
-    return (void*)(base * _mi_os_page_size());
-  }
-#endif
-
-#if defined(MI_USE_PTHREADS)
-static pthread_mutex_t mi_heap_grow_mutex = PTHREAD_MUTEX_INITIALIZER;
-#endif
-
-static void* mi_heap_grow(size_t size, size_t try_alignment) {
-  void* p = NULL;
-  if (try_alignment <= 1) {
-    // `sbrk` is not thread safe in general so try to protect it (we could skip this on WASM but leave it in for now)
-    #if defined(MI_USE_PTHREADS)
-    pthread_mutex_lock(&mi_heap_grow_mutex);
-    #endif
-    p = mi_memory_grow(size);
-    #if defined(MI_USE_PTHREADS)
-    pthread_mutex_unlock(&mi_heap_grow_mutex);
-    #endif
-  }
   else {
-    void* base = NULL;
-    size_t alloc_size = 0;
-    // to allocate aligned use a lock to try to avoid thread interaction
-    // between getting the current size and actual allocation
-    // (also, `sbrk` is not thread safe in general)
-    #if defined(MI_USE_PTHREADS)
-    pthread_mutex_lock(&mi_heap_grow_mutex);
-    #endif
-    {
-      void* current = mi_memory_grow(0);  // get current size
-      if (current != NULL) {
-        void* aligned_current = mi_align_up_ptr(current, try_alignment);  // and align from there to minimize wasted space
-        alloc_size = _mi_align_up( ((uint8_t*)aligned_current - (uint8_t*)current) + size, _mi_os_page_size());
-        base = mi_memory_grow(alloc_size);
-      }
-    }
-    #if defined(MI_USE_PTHREADS)
-    pthread_mutex_unlock(&mi_heap_grow_mutex);
-    #endif
-    if (base != NULL) {
-      p = mi_align_up_ptr(base, try_alignment);
-      if ((uint8_t*)p + size > (uint8_t*)base + alloc_size) {
-        // another thread used wasm_memory_grow/sbrk in-between and we do not have enough
-        // space after alignment. Give up (and waste the space as we cannot shrink :-( )
-        // (in `mi_os_mem_alloc_aligned` this will fall back to overallocation to align)
-        p = NULL;
-      }
-    }
-  }
-  if (p == NULL) {
-    _mi_warning_message("unable to allocate sbrk/wasm_memory_grow OS memory (%zu bytes, %zu alignment)\n", size, try_alignment);
-    errno = ENOMEM;
-    return NULL;
+    // nothing to do 
+    mi_assert(memid.memkind < MI_MEM_OS);
   }
-  mi_assert_internal( try_alignment == 0 || (uintptr_t)p % try_alignment == 0 );
-  return p;
 }
 
-/* -----------------------------------------------------------
-  Raw allocation on Unix's (mmap)
--------------------------------------------------------------- */
-#else
-#define MI_OS_USE_MMAP
-static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) {
-  MI_UNUSED(try_alignment);
-  #if defined(MAP_ALIGNED)  // BSD
-  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
-    size_t n = mi_bsr(try_alignment);
-    if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) {  // alignment is a power of 2 and 4096 <= alignment <= 1GiB
-      flags |= MAP_ALIGNED(n);
-      void* p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0);
-      if (p!=MAP_FAILED) return p;
-      // fall back to regular mmap
-    }
-  }
-  #elif defined(MAP_ALIGN)  // Solaris
-  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
-    void* p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0);  // addr parameter is the required alignment
-    if (p!=MAP_FAILED) return p;
-    // fall back to regular mmap
-  }
-  #endif
-  #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED)
-  // on 64-bit systems, use the virtual address area after 2TiB for 4MiB aligned allocations
-  if (addr == NULL) {
-    void* hint = mi_os_get_aligned_hint(try_alignment, size);
-    if (hint != NULL) {
-      void* p = mmap(hint, size, protect_flags, flags, fd, 0);
-      if (p!=MAP_FAILED) return p;
-      // fall back to regular mmap
-    }
-  }
-  #endif
-  // regular mmap
-  void* p = mmap(addr, size, protect_flags, flags, fd, 0);
-  if (p!=MAP_FAILED) return p;
-  // failed to allocate
-  return NULL;
-}
-
-static int mi_unix_mmap_fd(void) {
-#if defined(VM_MAKE_TAG)
-  // macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99)
-  int os_tag = (int)mi_option_get(mi_option_os_tag);
-  if (os_tag < 100 || os_tag > 255) os_tag = 100;
-  return VM_MAKE_TAG(os_tag);
-#else
-  return -1;
-#endif
-}
-
-static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) {
-  void* p = NULL;
-  #if !defined(MAP_ANONYMOUS)
-  #define MAP_ANONYMOUS  MAP_ANON
-  #endif
-  #if !defined(MAP_NORESERVE)
-  #define MAP_NORESERVE  0
-  #endif
-  const int fd = mi_unix_mmap_fd();
-  int flags = MAP_PRIVATE | MAP_ANONYMOUS;
-  if (_mi_os_has_overcommit()) {
-    flags |= MAP_NORESERVE;
-  }
-  #if defined(PROT_MAX)
-  protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD
-  #endif    
-  // huge page allocation
-  if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) {
-    static _Atomic(size_t) large_page_try_ok; // = 0;
-    size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
-    if (!large_only && try_ok > 0) {
-      // If the OS is not configured for large OS pages, or the user does not have
-      // enough permission, the `mmap` will always fail (but it might also fail for other reasons).
-      // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times
-      // to avoid too many failing calls to mmap.
-      mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
-    }
-    else {
-      int lflags = flags & ~MAP_NORESERVE;  // using NORESERVE on huge pages seems to fail on Linux
-      int lfd = fd;
-      #ifdef MAP_ALIGNED_SUPER
-      lflags |= MAP_ALIGNED_SUPER;
-      #endif
-      #ifdef MAP_HUGETLB
-      lflags |= MAP_HUGETLB;
-      #endif
-      #ifdef MAP_HUGE_1GB
-      static bool mi_huge_pages_available = true;
-      if ((size % MI_GiB) == 0 && mi_huge_pages_available) {
-        lflags |= MAP_HUGE_1GB;
-      }
-      else
-      #endif
-      {
-        #ifdef MAP_HUGE_2MB
-        lflags |= MAP_HUGE_2MB;
-        #endif
-      }
-      #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB
-      lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
-      #endif
-      if (large_only || lflags != flags) {
-        // try large OS page allocation
-        *is_large = true;
-        p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd);
-        #ifdef MAP_HUGE_1GB
-        if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) {
-          mi_huge_pages_available = false; // don't try huge 1GiB pages again
-          _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %i)\n", errno);
-          lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB);
-          p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd);
-        }
-        #endif
-        if (large_only) return p;
-        if (p == NULL) {
-          mi_atomic_store_release(&large_page_try_ok, (size_t)8);  // on error, don't try again for the next N allocations
-        }
-      }
-    }
-  }
-  // regular allocation
-  if (p == NULL) {
-    *is_large = false;
-    p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd);
-    if (p != NULL) {
-      #if defined(MADV_HUGEPAGE)
-      // Many Linux systems don't allow MAP_HUGETLB but they support instead
-      // transparent huge pages (THP). Generally, it is not required to call `madvise` with MADV_HUGE
-      // though since properly aligned allocations will already use large pages if available
-      // in that case -- in particular for our large regions (in `memory.c`).
-      // However, some systems only allow THP if called with explicit `madvise`, so
-      // when large OS pages are enabled for mimalloc, we call `madvise` anyways.
-      if (allow_large && use_large_os_page(size, try_alignment)) {
-        if (mi_madvise(p, size, MADV_HUGEPAGE) == 0) {
-          *is_large = true; // possibly
-        };
-      }
-      #elif defined(__sun)
-      if (allow_large && use_large_os_page(size, try_alignment)) {
-        struct memcntl_mha cmd = {0};
-        cmd.mha_pagesize = large_os_page_size;
-        cmd.mha_cmd = MHA_MAPSIZE_VA;
-        if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) {
-          *is_large = true;
-        }
-      }
-      #endif
-    }
-  }
-  if (p == NULL) {
-    _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: %i, address: %p, large only: %d, allow large: %d)\n", size, errno, addr, large_only, allow_large);
-  }
-  return p;
+void  _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* tld_stats) {
+  _mi_os_free_ex(p, size, true, memid, tld_stats);
 }
-#endif
 
 
 /* -----------------------------------------------------------
@@ -705,39 +188,31 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
 -------------------------------------------------------------- */
 
 // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
-static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) {
+static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* stats) {
   mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
+  mi_assert_internal(is_zero != NULL);
+  mi_assert_internal(is_large != NULL);
   if (size == 0) return NULL;
-  if (!commit) allow_large = false;
-  if (try_alignment == 0) try_alignment = 1; // avoid 0 to ensure there will be no divide by zero when aligning
+  if (!commit) { allow_large = false; }
+  if (try_alignment == 0) { try_alignment = 1; } // avoid 0 to ensure there will be no divide by zero when aligning
 
-  void* p = NULL;
-  /*
-  if (commit && allow_large) {
-    p = _mi_os_try_alloc_from_huge_reserved(size, try_alignment);
-    if (p != NULL) {
-      *is_large = true;
-      return p;
-    }
+  *is_zero = false;
+  void* p = NULL; 
+  int err = _mi_prim_alloc(size, try_alignment, commit, allow_large, is_large, is_zero, &p);
+  if (err != 0) {
+    _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, size, try_alignment, commit, allow_large);
   }
-  */
-
-  #if defined(_WIN32)
-    int flags = MEM_RESERVE;
-    if (commit) { flags |= MEM_COMMIT; }
-    p = mi_win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large);
-  #elif defined(MI_USE_SBRK) || defined(__wasi__)
-    MI_UNUSED(allow_large);
-    *is_large = false;
-    p = mi_heap_grow(size, try_alignment);
-  #else
-    int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
-    p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large);
-  #endif
   mi_stat_counter_increase(stats->mmap_calls, 1);
   if (p != NULL) {
     _mi_stat_increase(&stats->reserved, size);
-    if (commit) { _mi_stat_increase(&stats->committed, size); }
+    if (commit) { 
+      _mi_stat_increase(&stats->committed, size); 
+      // seems needed for asan (or `mimalloc-test-api` fails)
+      #ifdef MI_TRACK_ASAN
+      if (*is_zero) { mi_track_mem_defined(p,size); }
+               else { mi_track_mem_undefined(p,size); }
+      #endif
+    }    
   }
   return p;
 }
@@ -745,99 +220,109 @@ static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, boo
 
 // Primitive aligned allocation from the OS.
 // This function guarantees the allocated memory is aligned.
-static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) {
+static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** base, mi_stats_t* stats) {
   mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0));
   mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
   mi_assert_internal(is_large != NULL);
+  mi_assert_internal(is_zero != NULL);
+  mi_assert_internal(base != NULL);
   if (!commit) allow_large = false;
   if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL;
   size = _mi_align_up(size, _mi_os_page_size());
 
   // try first with a hint (this will be aligned directly on Win 10+ or BSD)
-  void* p = mi_os_mem_alloc(size, alignment, commit, allow_large, is_large, stats);
+  void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero, stats);
   if (p == NULL) return NULL;
 
-  // if not aligned, free it, overallocate, and unmap around it
-  if (((uintptr_t)p % alignment != 0)) {
-    mi_os_mem_free(p, size, commit, stats);
-    _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (%zu bytes, address: %p, alignment: %zu, commit: %d)\n", size, p, alignment, commit);
+  // aligned already?
+  if (((uintptr_t)p % alignment) == 0) {
+    *base = p;
+  }
+  else {
+    // if not aligned, free it, overallocate, and unmap around it
+    _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit);
+    mi_os_prim_free(p, size, commit, stats);
     if (size >= (SIZE_MAX - alignment)) return NULL; // overflow
     const size_t over_size = size + alignment;
 
-#if _WIN32
-    // over-allocate uncommitted (virtual) memory
-    p = mi_os_mem_alloc(over_size, 0 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, stats);
-    if (p == NULL) return NULL;
-
-    // set p to the aligned part in the full region
-    // note: this is dangerous on Windows as VirtualFree needs the actual region pointer
-    // but in mi_os_mem_free we handle this (hopefully exceptional) situation.
-    p = mi_align_up_ptr(p, alignment);
-
-    // explicitly commit only the aligned part
-    if (commit) {
-      _mi_os_commit(p, size, NULL, stats);
+    if (mi_os_mem_config.must_free_whole) {  // win32 virtualAlloc cannot free parts of an allocate block
+      // over-allocate uncommitted (virtual) memory
+      p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero, stats);
+      if (p == NULL) return NULL;
+      
+      // set p to the aligned part in the full region
+      // note: this is dangerous on Windows as VirtualFree needs the actual base pointer
+      // this is handled though by having the `base` field in the memid's
+      *base = p; // remember the base
+      p = mi_align_up_ptr(p, alignment);
+
+      // explicitly commit only the aligned part
+      if (commit) {
+        _mi_os_commit(p, size, NULL, stats);
+      }
+    }
+    else  { // mmap can free inside an allocation
+      // overallocate...
+      p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero, stats);
+      if (p == NULL) return NULL;
+      
+      // and selectively unmap parts around the over-allocated area. (noop on sbrk)
+      void* aligned_p = mi_align_up_ptr(p, alignment);
+      size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
+      size_t mid_size = _mi_align_up(size, _mi_os_page_size());
+      size_t post_size = over_size - pre_size - mid_size;
+      mi_assert_internal(pre_size < over_size&& post_size < over_size&& mid_size >= size);
+      if (pre_size > 0)  { mi_os_prim_free(p, pre_size, commit, stats); }
+      if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); }
+      // we can return the aligned pointer on `mmap` (and sbrk) systems
+      p = aligned_p;
+      *base = aligned_p; // since we freed the pre part, `*base == p`.      
     }
-#else
-    // overallocate...
-    p = mi_os_mem_alloc(over_size, 1, commit, false, is_large, stats);
-    if (p == NULL) return NULL;
-    // and selectively unmap parts around the over-allocated area. (noop on sbrk)
-    void* aligned_p = mi_align_up_ptr(p, alignment);
-    size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
-    size_t mid_size = _mi_align_up(size, _mi_os_page_size());
-    size_t post_size = over_size - pre_size - mid_size;
-    mi_assert_internal(pre_size < over_size && post_size < over_size && mid_size >= size);
-    if (pre_size > 0)  mi_os_mem_free(p, pre_size, commit, stats);
-    if (post_size > 0) mi_os_mem_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats);
-    // we can return the aligned pointer on `mmap` (and sbrk) systems
-    p = aligned_p;
-#endif
   }
 
-  mi_assert_internal(p == NULL || (p != NULL && ((uintptr_t)p % alignment) == 0));
+  mi_assert_internal(p == NULL || (p != NULL && *base != NULL && ((uintptr_t)p % alignment) == 0));
   return p;
 }
 
 
 /* -----------------------------------------------------------
-  OS API: alloc, free, alloc_aligned
+  OS API: alloc and alloc_aligned
 ----------------------------------------------------------- */
 
-void* _mi_os_alloc(size_t size, mi_stats_t* tld_stats) {
+void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* tld_stats) {
   MI_UNUSED(tld_stats);
+  *memid = _mi_memid_none();
   mi_stats_t* stats = &_mi_stats_main;
   if (size == 0) return NULL;
   size = _mi_os_good_alloc_size(size);
-  bool is_large = false;
-  return mi_os_mem_alloc(size, 0, true, false, &is_large, stats);
-}
-
-void  _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* tld_stats) {
-  MI_UNUSED(tld_stats);
-  mi_stats_t* stats = &_mi_stats_main;
-  if (size == 0 || p == NULL) return;
-  size = _mi_os_good_alloc_size(size);
-  mi_os_mem_free(p, size, was_committed, stats);
-}
-
-void  _mi_os_free(void* p, size_t size, mi_stats_t* stats) {
-  _mi_os_free_ex(p, size, true, stats);
+  bool os_is_large = false;
+  bool os_is_zero  = false;
+  void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero, stats);
+  if (p != NULL) {
+    *memid = _mi_memid_create_os(true, os_is_zero, os_is_large);
+  }  
+  return p;
 }
 
-void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* tld_stats)
+void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats)
 {
-  MI_UNUSED(&mi_os_get_aligned_hint); // suppress unused warnings
+  MI_UNUSED(&_mi_os_get_aligned_hint); // suppress unused warnings
   MI_UNUSED(tld_stats);
+  *memid = _mi_memid_none();
   if (size == 0) return NULL;
   size = _mi_os_good_alloc_size(size);
   alignment = _mi_align_up(alignment, _mi_os_page_size());
-  bool allow_large = false;
-  if (large != NULL) {
-    allow_large = *large;
-    *large = false;
+  
+  bool os_is_large = false;
+  bool os_is_zero  = false;
+  void* os_base = NULL;
+  void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base, &_mi_stats_main /*tld->stats*/ );
+  if (p != NULL) {
+    *memid = _mi_memid_create_os(commit, os_is_zero, os_is_large);
+    memid->mem.os.base = os_base;
+    memid->mem.os.alignment = alignment;
   }
-  return mi_os_mem_alloc_aligned(size, alignment, commit, allow_large, (large!=NULL?large:&allow_large), &_mi_stats_main /*tld->stats*/ );
+  return p;
 }
 
 /* -----------------------------------------------------------
@@ -848,22 +333,24 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* lar
   to use the actual start of the memory region.
 ----------------------------------------------------------- */
 
-void* _mi_os_alloc_aligned_offset(size_t size, size_t alignment, size_t offset, bool commit, bool* large, mi_stats_t* tld_stats) {
+void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats) {
   mi_assert(offset <= MI_SEGMENT_SIZE);
   mi_assert(offset <= size);
   mi_assert((alignment % _mi_os_page_size()) == 0);
+  *memid = _mi_memid_none();
   if (offset > MI_SEGMENT_SIZE) return NULL;
   if (offset == 0) {
     // regular aligned allocation
-    return _mi_os_alloc_aligned(size, alignment, commit, large, tld_stats);
+    return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld_stats);
   }
   else {
     // overallocate to align at an offset
     const size_t extra = _mi_align_up(offset, alignment) - offset;
     const size_t oversize = size + extra;
-    void* start = _mi_os_alloc_aligned(oversize, alignment, commit, large, tld_stats);
+    void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid, tld_stats);
     if (start == NULL) return NULL;
-    void* p = (uint8_t*)start + extra;
+
+    void* const p = (uint8_t*)start + extra;
     mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment));
     // decommit the overallocation at the start
     if (commit && extra > _mi_os_page_size()) {
@@ -873,18 +360,10 @@ void* _mi_os_alloc_aligned_offset(size_t size, size_t alignment, size_t offset,
   }
 }
 
-void _mi_os_free_aligned(void* p, size_t size, size_t alignment, size_t align_offset, bool was_committed, mi_stats_t* tld_stats) {
-  mi_assert(align_offset <= MI_SEGMENT_SIZE);
-  const size_t extra = _mi_align_up(align_offset, alignment) - align_offset;
-  void* start = (uint8_t*)p - extra;
-  _mi_os_free_ex(start, size + extra, was_committed, tld_stats);
-}
-
 /* -----------------------------------------------------------
   OS memory API: reset, commit, decommit, protect, unprotect.
 ----------------------------------------------------------- */
 
-
 // OS page align within a given area, either conservative (pages inside the area only),
 // or not (straddling pages outside the area is possible)
 static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, size_t* newsize) {
@@ -909,183 +388,116 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t*
   return mi_os_page_align_areax(true, addr, size, newsize);
 }
 
-static void mi_mprotect_hint(int err) {
-#if defined(MI_OS_USE_MMAP) && (MI_SECURE>=2) // guard page around every mimalloc page
-  if (err == ENOMEM) {
-    _mi_warning_message("the previous warning may have been caused by a low memory map limit.\n"
-                        "  On Linux this is controlled by the vm.max_map_count. For example:\n"
-                        "  > sudo sysctl -w vm.max_map_count=262144\n");
-  }
-#else
-  MI_UNUSED(err);
-#endif
-}
-
-// Commit/Decommit memory.
-// Usually commit is aligned liberal, while decommit is aligned conservative.
-// (but not for the reset version where we want commit to be conservative as well)
-static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservative, bool* is_zero, mi_stats_t* stats) {
-  // page align in the range, commit liberally, decommit conservative
+bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
+  MI_UNUSED(tld_stats);
+  mi_stats_t* stats = &_mi_stats_main;  
   if (is_zero != NULL) { *is_zero = false; }
+  _mi_stat_increase(&stats->committed, size);  // use size for precise commit vs. decommit
+  _mi_stat_counter_increase(&stats->commit_calls, 1);
+
+  // page align range
   size_t csize;
-  void* start = mi_os_page_align_areax(conservative, addr, size, &csize);
-  if (csize == 0) return true;  // || _mi_os_is_huge_reserved(addr))
-  int err = 0;
-  if (commit) {
-    _mi_stat_increase(&stats->committed, size);  // use size for precise commit vs. decommit
-    _mi_stat_counter_increase(&stats->commit_calls, 1);
-  }
-  else {
-    _mi_stat_decrease(&stats->committed, size);
-  }
+  void* start = mi_os_page_align_areax(false /* conservative? */, addr, size, &csize);
+  if (csize == 0) return true;
 
-  #if defined(_WIN32)
-  if (commit) {
-    // *is_zero = true;  // note: if the memory was already committed, the call succeeds but the memory is not zero'd
-    void* p = VirtualAlloc(start, csize, MEM_COMMIT, PAGE_READWRITE);
-    err = (p == start ? 0 : GetLastError());
-  }
-  else {
-    BOOL ok = VirtualFree(start, csize, MEM_DECOMMIT);
-    err = (ok ? 0 : GetLastError());
-  }
-  #elif defined(__wasi__)
-  // WebAssembly guests can't control memory protection
-  #elif 0 && defined(MAP_FIXED) && !defined(__APPLE__)
-  // Linux: disabled for now as mmap fixed seems much more expensive than MADV_DONTNEED (and splits VMA's?)
-  if (commit) {
-    // commit: just change the protection
-    err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
-    if (err != 0) { err = errno; }
-  }
-  else {
-    // decommit: use mmap with MAP_FIXED to discard the existing memory (and reduce rss)
-    const int fd = mi_unix_mmap_fd();
-    void* p = mmap(start, csize, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0);
-    if (p != start) { err = errno; }
-  }
-  #else
-  // Linux, macOSX and others.
-  if (commit) {
-    // commit: ensure we can access the area
-    err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
-    if (err != 0) { err = errno; }
+  // commit  
+  bool os_is_zero = false;
+  int err = _mi_prim_commit(start, csize, &os_is_zero); 
+  if (err != 0) {
+    _mi_warning_message("cannot commit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize);
+    return false;
   }
-  else {
-    #if defined(MADV_DONTNEED) && MI_DEBUG == 0 && MI_SECURE == 0
-    // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
-    // (on the other hand, MADV_FREE would be good enough.. it is just not reflected in the stats :-( )
-    err = madvise(start, csize, MADV_DONTNEED);
-    #else
-    // decommit: just disable access (also used in debug and secure mode to trap on illegal access)
-    err = mprotect(start, csize, PROT_NONE);
-    if (err != 0) { err = errno; }
-    #endif
-    //#if defined(MADV_FREE_REUSE)
-    //  while ((err = mi_madvise(start, csize, MADV_FREE_REUSE)) != 0 && errno == EAGAIN) { errno = 0; }
-    //#endif
+  if (os_is_zero && is_zero != NULL) { 
+    *is_zero = true;
+    mi_assert_expensive(mi_mem_is_zero(start, csize));
   }
+  // note: the following seems required for asan (otherwise `mimalloc-test-stress` fails)
+  #ifdef MI_TRACK_ASAN
+  if (os_is_zero) { mi_track_mem_defined(start,csize); }
+             else { mi_track_mem_undefined(start,csize); } 
   #endif
-  if (err != 0) {
-    _mi_warning_message("%s error: start: %p, csize: 0x%zx, err: %i\n", commit ? "commit" : "decommit", start, csize, err);
-    mi_mprotect_hint(err);
-  }
-  mi_assert_internal(err == 0);
-  return (err == 0);
+  return true;
 }
 
-bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
+static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_stats_t* tld_stats) {
   MI_UNUSED(tld_stats);
   mi_stats_t* stats = &_mi_stats_main;
-  return mi_os_commitx(addr, size, true, false /* liberal */, is_zero, stats);
+  mi_assert_internal(needs_recommit!=NULL);
+  _mi_stat_decrease(&stats->committed, size);
+
+  // page align
+  size_t csize;
+  void* start = mi_os_page_align_area_conservative(addr, size, &csize);
+  if (csize == 0) return true; 
+
+  // decommit
+  *needs_recommit = true;
+  int err = _mi_prim_decommit(start,csize,needs_recommit);  
+  if (err != 0) {
+    _mi_warning_message("cannot decommit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize);
+  }
+  mi_assert_internal(err == 0);
+  return (err == 0);
 }
 
 bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) {
-  MI_UNUSED(tld_stats);
-  mi_stats_t* stats = &_mi_stats_main;
-  bool is_zero;
-  return mi_os_commitx(addr, size, false, true /* conservative */, &is_zero, stats);
+  bool needs_recommit;
+  return mi_os_decommit_ex(addr, size, &needs_recommit, tld_stats);
 }
 
-/*
-static bool mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) {  
-  return mi_os_commitx(addr, size, true, true // conservative
-                      , is_zero, stats);
-}
-*/
 
 // Signal to the OS that the address range is no longer in use
 // but may be used later again. This will release physical memory
 // pages and reduce swapping while keeping the memory committed.
 // We page align to a conservative area inside the range to reset.
-static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) {
+bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { 
   // page align conservatively within the range
   size_t csize;
   void* start = mi_os_page_align_area_conservative(addr, size, &csize);
   if (csize == 0) return true;  // || _mi_os_is_huge_reserved(addr)
-  if (reset) _mi_stat_increase(&stats->reset, csize);
-        else _mi_stat_decrease(&stats->reset, csize);
-  if (!reset) return true; // nothing to do on unreset!
+  _mi_stat_increase(&stats->reset, csize);
+  _mi_stat_counter_increase(&stats->reset_calls, 1);
 
-  #if (MI_DEBUG>1) && !MI_TRACK_ENABLED
-  if (MI_SECURE==0) {
-    memset(start, 0, csize); // pretend it is eagerly reset
-  }
+  #if (MI_DEBUG>1) && !MI_SECURE && !MI_TRACK_ENABLED // && !MI_TSAN
+  memset(start, 0, csize); // pretend it is eagerly reset
   #endif
 
-#if defined(_WIN32)
-  // Testing shows that for us (on `malloc-large`) MEM_RESET is 2x faster than DiscardVirtualMemory
-  void* p = VirtualAlloc(start, csize, MEM_RESET, PAGE_READWRITE);
-  mi_assert_internal(p == start);
-  #if 1
-  if (p == start && start != NULL) {
-    VirtualUnlock(start,csize); // VirtualUnlock after MEM_RESET removes the memory from the working set
-  }
-  #endif
-  if (p != start) return false;
-#else
-#if defined(MADV_FREE)
-  static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE);
-  int oadvice = (int)mi_atomic_load_relaxed(&advice);
-  int err;
-  while ((err = mi_madvise(start, csize, oadvice)) != 0 && errno == EAGAIN) { errno = 0;  };
-  if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) {
-    // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on
-    mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED);
-    err = mi_madvise(start, csize, MADV_DONTNEED);
-  }
-#elif defined(__wasi__)
-  int err = 0;
-#else
-  int err = mi_madvise(start, csize, MADV_DONTNEED);
-#endif
+  int err = _mi_prim_reset(start, csize);
   if (err != 0) {
-    _mi_warning_message("madvise reset error: start: %p, csize: 0x%zx, errno: %i\n", start, csize, errno);
+    _mi_warning_message("cannot reset OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize);
   }
-  //mi_assert(err == 0);
-  if (err != 0) return false;
-#endif
-  return true;
+  return (err == 0);
 }
 
-// Signal to the OS that the address range is no longer in use
-// but may be used later again. This will release physical memory
-// pages and reduce swapping while keeping the memory committed.
-// We page align to a conservative area inside the range to reset.
-bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats) {
-  MI_UNUSED(tld_stats);
-  mi_stats_t* stats = &_mi_stats_main;
-  return mi_os_resetx(addr, size, true, stats);
+
+// either resets or decommits memory, returns true if the memory needs 
+// to be recommitted if it is to be re-used later on.
+bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats)
+{
+  if (mi_option_get(mi_option_purge_delay) < 0) return false;  // is purging allowed?
+  _mi_stat_counter_increase(&stats->purge_calls, 1);
+  _mi_stat_increase(&stats->purged, size);
+
+  if (mi_option_is_enabled(mi_option_purge_decommits) &&   // should decommit?
+      !_mi_preloading())                                   // don't decommit during preloading (unsafe)
+  {
+    bool needs_recommit = true;
+    mi_os_decommit_ex(p, size, &needs_recommit, stats);
+    return needs_recommit;   
+  }
+  else {
+    if (allow_reset) {  // this can sometimes be not allowed if the range is not fully committed
+      _mi_os_reset(p, size, stats);
+    }
+    return false;  // needs no recommit
+  }
 }
 
-/*
-bool _mi_os_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
-  MI_UNUSED(tld_stats);
-  mi_stats_t* stats = &_mi_stats_main;
-  *is_zero = false;
-  return mi_os_resetx(addr, size, false, stats);
+// either resets or decommits memory, returns true if the memory needs 
+// to be recommitted if it is to be re-used later on.
+bool _mi_os_purge(void* p, size_t size, mi_stats_t * stats) {
+  return _mi_os_purge_ex(p, size, true, stats);
 }
-*/
 
 // Protect a region in memory to be not accessible.
 static  bool mi_os_protectx(void* addr, size_t size, bool protect) {
@@ -1095,23 +507,12 @@ static  bool mi_os_protectx(void* addr, size_t size, bool protect) {
   if (csize == 0) return false;
   /*
   if (_mi_os_is_huge_reserved(addr)) {
-	  _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n");
+          _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n");
   }
   */
-  int err = 0;
-#ifdef _WIN32
-  DWORD oldprotect = 0;
-  BOOL ok = VirtualProtect(start, csize, protect ? PAGE_NOACCESS : PAGE_READWRITE, &oldprotect);
-  err = (ok ? 0 : GetLastError());
-#elif defined(__wasi__)
-  err = 0;
-#else
-  err = mprotect(start, csize, protect ? PROT_NONE : (PROT_READ | PROT_WRITE));
-  if (err != 0) { err = errno; }
-#endif
+  int err = _mi_prim_protect(start,csize,protect);
   if (err != 0) {
-    _mi_warning_message("mprotect error: start: %p, csize: 0x%zx, err: %i\n", start, csize, err);
-    mi_mprotect_hint(err);
+    _mi_warning_message("cannot %s OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", (protect ? "protect" : "unprotect"), err, err, start, csize);
   }
   return (err == 0);
 }
@@ -1126,115 +527,12 @@ bool _mi_os_unprotect(void* addr, size_t size) {
 
 
 
-bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) {
-  // page align conservatively within the range
-  mi_assert_internal(oldsize > newsize && p != NULL);
-  if (oldsize < newsize || p == NULL) return false;
-  if (oldsize == newsize) return true;
-
-  // oldsize and newsize should be page aligned or we cannot shrink precisely
-  void* addr = (uint8_t*)p + newsize;
-  size_t size = 0;
-  void* start = mi_os_page_align_area_conservative(addr, oldsize - newsize, &size);
-  if (size == 0 || start != addr) return false;
-
-#ifdef _WIN32
-  // we cannot shrink on windows, but we can decommit
-  return _mi_os_decommit(start, size, stats);
-#else
-  return mi_os_mem_free(start, size, true, stats);
-#endif
-}
-
-
 /* ----------------------------------------------------------------------------
 Support for allocating huge OS pages (1Gib) that are reserved up-front
 and possibly associated with a specific NUMA node. (use `numa_node>=0`)
 -----------------------------------------------------------------------------*/
 #define MI_HUGE_OS_PAGE_SIZE  (MI_GiB)
 
-#if defined(_WIN32) && (MI_INTPTR_SIZE >= 8)
-static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
-{
-  mi_assert_internal(size%MI_GiB == 0);
-  mi_assert_internal(addr != NULL);
-  const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE;
-
-  mi_win_enable_large_os_pages();
-
-  MI_MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} };
-  // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
-  static bool mi_huge_pages_available = true;
-  if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) {
-    params[0].Type.Type = MiMemExtendedParameterAttributeFlags;
-    params[0].Arg.ULong64 = MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE;
-    ULONG param_count = 1;
-    if (numa_node >= 0) {
-      param_count++;
-      params[1].Type.Type = MiMemExtendedParameterNumaNode;
-      params[1].Arg.ULong = (unsigned)numa_node;
-    }
-    SIZE_T psize = size;
-    void* base = addr;
-    NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count);
-    if (err == 0 && base != NULL) {
-      return base;
-    }
-    else {
-      // fall back to regular large pages
-      mi_huge_pages_available = false; // don't try further huge pages
-      _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err);
-    }
-  }
-  // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation
-  if (pVirtualAlloc2 != NULL && numa_node >= 0) {
-    params[0].Type.Type = MiMemExtendedParameterNumaNode;
-    params[0].Arg.ULong = (unsigned)numa_node;
-    return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1);
-  }
-
-  // otherwise use regular virtual alloc on older windows
-  return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
-}
-
-#elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__)
-#include <sys/syscall.h>
-#ifndef MPOL_PREFERRED
-#define MPOL_PREFERRED 1
-#endif
-#if defined(SYS_mbind)
-static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
-  return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags);
-}
-#else
-static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
-  MI_UNUSED(start); MI_UNUSED(len); MI_UNUSED(mode); MI_UNUSED(nmask); MI_UNUSED(maxnode); MI_UNUSED(flags);
-  return 0;
-}
-#endif
-static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
-  mi_assert_internal(size%MI_GiB == 0);
-  bool is_large = true;
-  void* p = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
-  if (p == NULL) return NULL;
-  if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
-    unsigned long numa_mask = (1UL << numa_node);
-    // TODO: does `mbind` work correctly for huge OS pages? should we
-    // use `set_mempolicy` before calling mmap instead?
-    // see: <https://lkml.org/lkml/2017/2/9/875>
-    long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
-    if (err != 0) {
-      _mi_warning_message("failed to bind huge (1GiB) pages to numa node %d: %s\n", numa_node, strerror(errno));
-    }
-  }
-  return p;
-}
-#else
-static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
-  MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(numa_node);
-  return NULL;
-}
-#endif
 
 #if (MI_INTPTR_SIZE >= 8)
 // To ensure proper alignment, use our own area for huge OS pages
@@ -1253,10 +551,10 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
     if (start == 0) {
       // Initialize the start address after the 32TiB area
       start = ((uintptr_t)32 << 40);  // 32TiB virtual start address
-#if (MI_SECURE>0 || MI_DEBUG==0)      // security: randomize start of huge pages unless in debug mode
-      uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
+    #if (MI_SECURE>0 || MI_DEBUG==0)      // security: randomize start of huge pages unless in debug mode
+      uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap());
       start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF));  // (randomly 12bits)*1GiB == between 0 to 4TiB
-#endif
+    #endif
     }
     end = start + size;
     mi_assert_internal(end % MI_SEGMENT_SIZE == 0);
@@ -1274,7 +572,8 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
 #endif
 
 // Allocate MI_SEGMENT_SIZE aligned huge pages
-void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize) {
+void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid) {
+  *memid = _mi_memid_none();
   if (psize != NULL) *psize = 0;
   if (pages_reserved != NULL) *pages_reserved = 0;
   size_t size = 0;
@@ -1285,23 +584,32 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
   // We allocate one page at the time to be able to abort if it takes too long
   // or to at least allocate as many as available on the system.
   mi_msecs_t start_t = _mi_clock_start();
-  size_t page;
-  for (page = 0; page < pages; page++) {
+  size_t page = 0;
+  bool all_zero = true;
+  while (page < pages) {
     // allocate a page
+    bool is_zero = false;
     void* addr = start + (page * MI_HUGE_OS_PAGE_SIZE);
-    void* p = mi_os_alloc_huge_os_pagesx(addr, MI_HUGE_OS_PAGE_SIZE, numa_node);
+    void* p = NULL;
+    int err = _mi_prim_alloc_huge_os_pages(addr, MI_HUGE_OS_PAGE_SIZE, numa_node, &is_zero, &p);
+    if (!is_zero) { all_zero = false;  }
+    if (err != 0) {
+      _mi_warning_message("unable to allocate huge OS page (error: %d (0x%x), address: %p, size: %zx bytes)\n", err, err, addr, MI_HUGE_OS_PAGE_SIZE);
+      break;
+    }
 
     // Did we succeed at a contiguous address?
     if (p != addr) {
       // no success, issue a warning and break
       if (p != NULL) {
-        _mi_warning_message("could not allocate contiguous huge page %zu at %p\n", page, addr);
-        _mi_os_free(p, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main);
+        _mi_warning_message("could not allocate contiguous huge OS page %zu at %p\n", page, addr);
+        mi_os_prim_free(p, MI_HUGE_OS_PAGE_SIZE, true, &_mi_stats_main);
       }
       break;
     }
 
     // success, record it
+    page++;  // increase before timeout check (see issue #711)
     _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE);
     _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE);
 
@@ -1315,7 +623,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
         }
       }
       if (elapsed > max_msecs) {
-        _mi_warning_message("huge page allocation timed out\n");
+        _mi_warning_message("huge OS page allocation timed out (after allocating %zu page(s))\n", page);
         break;
       }
     }
@@ -1323,16 +631,25 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
   mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size);
   if (pages_reserved != NULL) { *pages_reserved = page; }
   if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; }
+  if (page != 0) {
+    mi_assert(start != NULL);
+    *memid = _mi_memid_create_os(true /* is committed */, all_zero, true /* is_large */);
+    memid->memkind = MI_MEM_OS_HUGE;
+    mi_assert(memid->is_pinned);
+    #ifdef MI_TRACK_ASAN
+    if (all_zero) { mi_track_mem_defined(start,size); }
+    #endif
+  }
   return (page == 0 ? NULL : start);
 }
 
 // free every huge page in a range individually (as we allocated per page)
 // note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems.
-void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) {
+static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats) {
   if (p==NULL || size==0) return;
   uint8_t* base = (uint8_t*)p;
   while (size >= MI_HUGE_OS_PAGE_SIZE) {
-    _mi_os_free(base, MI_HUGE_OS_PAGE_SIZE, stats);
+    mi_os_prim_free(base, MI_HUGE_OS_PAGE_SIZE, true, stats);
     size -= MI_HUGE_OS_PAGE_SIZE;
     base += MI_HUGE_OS_PAGE_SIZE;
   }
@@ -1341,113 +658,6 @@ void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) {
 /* ----------------------------------------------------------------------------
 Support NUMA aware allocation
 -----------------------------------------------------------------------------*/
-#ifdef _WIN32
-static size_t mi_os_numa_nodex(void) {
-  USHORT numa_node = 0;
-  if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) {
-    // Extended API is supported
-    MI_PROCESSOR_NUMBER pnum;
-    (*pGetCurrentProcessorNumberEx)(&pnum);
-    USHORT nnode = 0;
-    BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode);
-    if (ok) { numa_node = nnode; }
-  }
-  else if (pGetNumaProcessorNode != NULL) {
-    // Vista or earlier, use older API that is limited to 64 processors. Issue #277
-    DWORD pnum = GetCurrentProcessorNumber();
-    UCHAR nnode = 0;
-    BOOL ok = pGetNumaProcessorNode((UCHAR)pnum, &nnode);
-    if (ok) { numa_node = nnode; }
-  }
-  return numa_node;
-}
-
-static size_t mi_os_numa_node_countx(void) {
-  ULONG numa_max = 0;
-  GetNumaHighestNodeNumber(&numa_max);
-  // find the highest node number that has actual processors assigned to it. Issue #282
-  while(numa_max > 0) {
-    if (pGetNumaNodeProcessorMaskEx != NULL) {
-      // Extended API is supported
-      GROUP_AFFINITY affinity;
-      if ((*pGetNumaNodeProcessorMaskEx)((USHORT)numa_max, &affinity)) {
-        if (affinity.Mask != 0) break;  // found the maximum non-empty node
-      }
-    }
-    else {
-      // Vista or earlier, use older API that is limited to 64 processors.
-      ULONGLONG mask;
-      if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) {
-        if (mask != 0) break; // found the maximum non-empty node
-      };
-    }
-    // max node was invalid or had no processor assigned, try again
-    numa_max--;
-  }
-  return ((size_t)numa_max + 1);
-}
-#elif defined(__linux__)
-#include <sys/syscall.h>  // getcpu
-#include <stdio.h>        // access
-
-static size_t mi_os_numa_nodex(void) {
-#ifdef SYS_getcpu
-  unsigned long node = 0;
-  unsigned long ncpu = 0;
-  long err = syscall(SYS_getcpu, &ncpu, &node, NULL);
-  if (err != 0) return 0;
-  return node;
-#else
-  return 0;
-#endif
-}
-static size_t mi_os_numa_node_countx(void) {
-  char buf[128];
-  unsigned node = 0;
-  for(node = 0; node < 256; node++) {
-    // enumerate node entries -- todo: it there a more efficient way to do this? (but ensure there is no allocation)
-    snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1);
-    if (access(buf,R_OK) != 0) break;
-  }
-  return (node+1);
-}
-#elif defined(__FreeBSD__) && __FreeBSD_version >= 1200000
-static size_t mi_os_numa_nodex(void) {
-  domainset_t dom;
-  size_t node;
-  int policy;
-  if (cpuset_getdomain(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(dom), &dom, &policy) == -1) return 0ul;
-  for (node = 0; node < MAXMEMDOM; node++) {
-    if (DOMAINSET_ISSET(node, &dom)) return node;
-  }
-  return 0ul;
-}
-static size_t mi_os_numa_node_countx(void) {
-  size_t ndomains = 0;
-  size_t len = sizeof(ndomains);
-  if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) return 0ul;
-  return ndomains;
-}
-#elif defined(__DragonFly__)
-static size_t mi_os_numa_nodex(void) {
-  // TODO: DragonFly does not seem to provide any userland means to get this information.
-  return 0ul;
-}
-static size_t mi_os_numa_node_countx(void) {
-  size_t ncpus = 0, nvirtcoresperphys = 0;
-  size_t len = sizeof(size_t);
-  if (sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0) == -1) return 0ul;
-  if (sysctlbyname("hw.cpu_topology_ht_ids", &nvirtcoresperphys, &len, NULL, 0) == -1) return 0ul;
-  return nvirtcoresperphys * ncpus;
-}
-#else
-static size_t mi_os_numa_nodex(void) {
-  return 0;
-}
-static size_t mi_os_numa_node_countx(void) {
-  return 1;
-}
-#endif
 
 _Atomic(size_t)  _mi_numa_node_count; // = 0   // cache the node count
 
@@ -1459,7 +669,7 @@ size_t _mi_os_numa_node_count_get(void) {
       count = (size_t)ncount;
     }
     else {
-      count = mi_os_numa_node_countx(); // or detect dynamically
+      count = _mi_prim_numa_node_count(); // or detect dynamically
       if (count == 0) count = 1;
     }
     mi_atomic_store_release(&_mi_numa_node_count, count); // save it
@@ -1473,7 +683,7 @@ int _mi_os_numa_node_get(mi_os_tld_t* tld) {
   size_t numa_count = _mi_os_numa_node_count();
   if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0
   // never more than the node count and >= 0
-  size_t numa_node = mi_os_numa_nodex();
+  size_t numa_node = _mi_prim_numa_node();
   if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
   return (int)numa_node;
 }
diff --git a/Objects/mimalloc/page.c b/Objects/mimalloc/page.c
index 4250ff358b4a0b..8ac0a715e76e5a 100644
--- a/Objects/mimalloc/page.c
+++ b/Objects/mimalloc/page.c
@@ -12,8 +12,8 @@ terms of the MIT license. A copy of the license can be found in the file
 ----------------------------------------------------------- */
 
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
 
 /* -----------------------------------------------------------
   Definition of page queues for each block size
@@ -66,6 +66,14 @@ static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) {
     if (p < start || p >= end) return false;
     p = mi_block_next(page, p);
   }
+#if MI_DEBUG>3 // generally too expensive to check this
+  if (page->free_is_zero) {
+    const size_t ubsize = mi_page_usable_block_size(page);
+    for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page, block)) {
+      mi_assert_expensive(mi_mem_is_zero(block + 1, ubsize - sizeof(mi_block_t)));
+    }
+  }
+#endif
   return true;
 }
 
@@ -84,7 +92,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
   mi_assert_internal(mi_page_list_is_valid(page,page->local_free));
 
   #if MI_DEBUG>3 // generally too expensive to check this
-  if (page->is_zero) {
+  if (page->free_is_zero) {
     const size_t ubsize = mi_page_usable_block_size(page);
     for(mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) {
       mi_assert_expensive(mi_mem_is_zero(block + 1, ubsize - sizeof(mi_block_t)));
@@ -92,10 +100,12 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
   }
   #endif
 
+  #if !MI_TRACK_ENABLED && !MI_TSAN
   mi_block_t* tfree = mi_page_thread_free(page);
   mi_assert_internal(mi_page_list_is_valid(page, tfree));
   //size_t tfree_count = mi_page_list_count(page, tfree);
   //mi_assert_internal(tfree_count <= page->thread_freed + 1);
+  #endif
 
   size_t free_count = mi_page_list_count(page, page->free) + mi_page_list_count(page, page->local_free);
   mi_assert_internal(page->used + free_count == page->capacity);
@@ -103,6 +113,8 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
   return true;
 }
 
+extern bool _mi_process_is_initialized;             // has mi_process_init been called?
+
 bool _mi_page_is_valid(mi_page_t* page) {
   mi_assert_internal(mi_page_is_valid_init(page));
   #if MI_SECURE
@@ -217,7 +229,7 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
       // usual case
       page->free = page->local_free;
       page->local_free = NULL;
-      page->is_zero = false;
+      page->free_is_zero = false;
     }
     else if (force) {
       // append -- only on shutdown (force) as this is a linear operation
@@ -229,7 +241,7 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
       mi_block_set_next(page, tail, page->free);
       page->free = page->local_free;
       page->local_free = NULL;
-      page->is_zero = false;
+      page->free_is_zero = false;
     }
   }
 
@@ -251,7 +263,7 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
   #if MI_HUGE_PAGE_ABANDON
   mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
   #endif
-  mi_assert_internal(!page->is_reset);
+  
   // TODO: push on full queue immediately if it is full?
   mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page));
   mi_page_queue_push(heap, pq, page);
@@ -379,7 +391,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
   mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
   mi_page_set_heap(page, NULL);
 
-#if MI_DEBUG>1
+#if (MI_DEBUG>1) && !MI_TRACK_ENABLED
   // check there are no references left..
   for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) {
     mi_assert_internal(_mi_ptr_page(block) != page);
@@ -417,7 +429,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
 
 // Retire parameters
 #define MI_MAX_RETIRE_SIZE    (MI_MEDIUM_OBJ_SIZE_MAX)
-#define MI_RETIRE_CYCLES      (8)
+#define MI_RETIRE_CYCLES      (16)
 
 // Retire a page with no more used blocks
 // Important to not retire too quickly though as new
@@ -637,11 +649,6 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
   // enable the new free list
   page->capacity += (uint16_t)extend;
   mi_stat_increase(tld->stats.page_committed, extend * bsize);
-
-  // extension into zero initialized memory preserves the zero'd free list
-  if (!page->is_zero_init) {
-    page->is_zero = false;
-  }
   mi_assert_expensive(mi_page_is_valid_init(page));
 }
 
@@ -663,18 +670,19 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
   mi_assert_internal(page_size / block_size < (1L<<16));
   page->reserved = (uint16_t)(page_size / block_size);
   mi_assert_internal(page->reserved > 0);
-  #ifdef MI_ENCODE_FREELIST
+  #if (MI_PADDING || MI_ENCODE_FREELIST)
   page->keys[0] = _mi_heap_random_next(heap);
   page->keys[1] = _mi_heap_random_next(heap);
   #endif
-  #if MI_DEBUG > 0
-  page->is_zero = false; // ensure in debug mode we initialize with MI_DEBUG_UNINIT, see issue #501
-  #else
-  page->is_zero = page->is_zero_init;
+  page->free_is_zero = page->is_zero_init;
+  #if MI_DEBUG>2
+  if (page->is_zero_init) {
+    mi_track_mem_defined(page_start, page_size);
+    mi_assert_expensive(mi_mem_is_zero(page_start, page_size));
+  }
   #endif
-
+  
   mi_assert_internal(page->is_committed);
-  mi_assert_internal(!page->is_reset);
   mi_assert_internal(page->capacity == 0);
   mi_assert_internal(page->free == NULL);
   mi_assert_internal(page->used == 0);
@@ -683,7 +691,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
   mi_assert_internal(page->prev == NULL);
   mi_assert_internal(page->retire_expire == 0);
   mi_assert_internal(!mi_page_has_aligned(page));
-  #if (MI_ENCODE_FREELIST)
+  #if (MI_PADDING || MI_ENCODE_FREELIST)
   mi_assert_internal(page->keys[0] != 0);
   mi_assert_internal(page->keys[1] != 0);
   #endif
@@ -703,12 +711,16 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
 static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try)
 {
   // search through the pages in "next fit" order
+  #if MI_STAT
   size_t count = 0;
+  #endif
   mi_page_t* page = pq->first;
   while (page != NULL)
   {
     mi_page_t* next = page->next; // remember next
+    #if MI_STAT    
     count++;
+    #endif
 
     // 0. collect freed blocks by us and other threads
     _mi_page_free_collect(page, false);
@@ -869,7 +881,9 @@ static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignme
   }
   else {
     // otherwise find a page with free blocks in our size segregated queues
-    mi_assert_internal(size >= MI_PADDING_SIZE);
+    #if MI_PADDING
+    mi_assert_internal(size >= MI_PADDING_SIZE); 
+    #endif
     return mi_find_free_page(heap, size);
   }
 }
@@ -884,8 +898,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al
 
   // initialize if necessary
   if mi_unlikely(!mi_heap_is_initialized(heap)) {
-    mi_thread_init(); // calls `_mi_heap_init` in turn
-    heap = mi_get_default_heap();
+    heap = mi_heap_get_default(); // calls mi_thread_init 
     if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; }
   }
   mi_assert_internal(mi_heap_is_initialized(heap));
diff --git a/Objects/mimalloc/prim/osx/alloc-override-zone.c b/Objects/mimalloc/prim/osx/alloc-override-zone.c
new file mode 100644
index 00000000000000..0e0a99d9388c8a
--- /dev/null
+++ b/Objects/mimalloc/prim/osx/alloc-override-zone.c
@@ -0,0 +1,458 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+
+#if defined(MI_MALLOC_OVERRIDE)
+
+#if !defined(__APPLE__)
+#error "this file should only be included on macOS"
+#endif
+
+/* ------------------------------------------------------
+   Override system malloc on macOS
+   This is done through the malloc zone interface.
+   It seems to be most robust in combination with interposing
+   though or otherwise we may get zone errors as there are could
+   be allocations done by the time we take over the
+   zone.
+------------------------------------------------------ */
+
+#include <AvailabilityMacros.h>
+#include <malloc/malloc.h>
+#include <string.h>  // memset
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
+// only available from OSX 10.6
+extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_import));
+#endif
+
+/* ------------------------------------------------------
+   malloc zone members
+------------------------------------------------------ */
+
+static size_t zone_size(malloc_zone_t* zone, const void* p) {
+  MI_UNUSED(zone);
+  if (!mi_is_in_heap_region(p)){ return 0; } // not our pointer, bail out
+  return mi_usable_size(p);
+}
+
+static void* zone_malloc(malloc_zone_t* zone, size_t size) {
+  MI_UNUSED(zone);
+  return mi_malloc(size);
+}
+
+static void* zone_calloc(malloc_zone_t* zone, size_t count, size_t size) {
+  MI_UNUSED(zone);
+  return mi_calloc(count, size);
+}
+
+static void* zone_valloc(malloc_zone_t* zone, size_t size) {
+  MI_UNUSED(zone);
+  return mi_malloc_aligned(size, _mi_os_page_size());
+}
+
+static void zone_free(malloc_zone_t* zone, void* p) {
+  MI_UNUSED(zone);
+  mi_cfree(p);
+}
+
+static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) {
+  MI_UNUSED(zone);
+  return mi_realloc(p, newsize);
+}
+
+static void* zone_memalign(malloc_zone_t* zone, size_t alignment, size_t size) {
+  MI_UNUSED(zone);
+  return mi_malloc_aligned(size,alignment);
+}
+
+static void zone_destroy(malloc_zone_t* zone) {
+  MI_UNUSED(zone);
+  // todo: ignore for now?
+}
+
+static unsigned zone_batch_malloc(malloc_zone_t* zone, size_t size, void** ps, unsigned count) {
+  size_t i;
+  for (i = 0; i < count; i++) {
+    ps[i] = zone_malloc(zone, size);
+    if (ps[i] == NULL) break;
+  }
+  return i;
+}
+
+static void zone_batch_free(malloc_zone_t* zone, void** ps, unsigned count) {
+  for(size_t i = 0; i < count; i++) {
+    zone_free(zone, ps[i]);
+    ps[i] = NULL;
+  }
+}
+
+static size_t zone_pressure_relief(malloc_zone_t* zone, size_t size) {
+  MI_UNUSED(zone); MI_UNUSED(size);
+  mi_collect(false);
+  return 0;
+}
+
+static void zone_free_definite_size(malloc_zone_t* zone, void* p, size_t size) {
+  MI_UNUSED(size);
+  zone_free(zone,p);
+}
+
+static boolean_t zone_claimed_address(malloc_zone_t* zone, void* p) {
+  MI_UNUSED(zone);
+  return mi_is_in_heap_region(p);
+}
+
+
+/* ------------------------------------------------------
+   Introspection members
+------------------------------------------------------ */
+
+static kern_return_t intro_enumerator(task_t task, void* p,
+                            unsigned type_mask, vm_address_t zone_address,
+                            memory_reader_t reader,
+                            vm_range_recorder_t recorder)
+{
+  // todo: enumerate all memory
+  MI_UNUSED(task); MI_UNUSED(p); MI_UNUSED(type_mask); MI_UNUSED(zone_address);
+  MI_UNUSED(reader); MI_UNUSED(recorder);
+  return KERN_SUCCESS;
+}
+
+static size_t intro_good_size(malloc_zone_t* zone, size_t size) {
+  MI_UNUSED(zone);
+  return mi_good_size(size);
+}
+
+static boolean_t intro_check(malloc_zone_t* zone) {
+  MI_UNUSED(zone);
+  return true;
+}
+
+static void intro_print(malloc_zone_t* zone, boolean_t verbose) {
+  MI_UNUSED(zone); MI_UNUSED(verbose);
+  mi_stats_print(NULL);
+}
+
+static void intro_log(malloc_zone_t* zone, void* p) {
+  MI_UNUSED(zone); MI_UNUSED(p);
+  // todo?
+}
+
+static void intro_force_lock(malloc_zone_t* zone) {
+  MI_UNUSED(zone);
+  // todo?
+}
+
+static void intro_force_unlock(malloc_zone_t* zone) {
+  MI_UNUSED(zone);
+  // todo?
+}
+
+static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) {
+  MI_UNUSED(zone);
+  // todo...
+  stats->blocks_in_use = 0;
+  stats->size_in_use = 0;
+  stats->max_size_in_use = 0;
+  stats->size_allocated = 0;
+}
+
+static boolean_t intro_zone_locked(malloc_zone_t* zone) {
+  MI_UNUSED(zone);
+  return false;
+}
+
+
+/* ------------------------------------------------------
+  At process start, override the default allocator
+------------------------------------------------------ */
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+#endif
+
+#if defined(__clang__)
+#pragma clang diagnostic ignored "-Wc99-extensions"
+#endif
+
+static malloc_introspection_t mi_introspect = {
+  .enumerator = &intro_enumerator,
+  .good_size = &intro_good_size,
+  .check = &intro_check,
+  .print = &intro_print,
+  .log = &intro_log,
+  .force_lock = &intro_force_lock,
+  .force_unlock = &intro_force_unlock,
+#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6) && !defined(__ppc__)
+  .statistics = &intro_statistics,
+  .zone_locked = &intro_zone_locked,
+#endif
+};
+
+static malloc_zone_t mi_malloc_zone = {
+  // note: even with designators, the order is important for C++ compilation
+  //.reserved1 = NULL,
+  //.reserved2 = NULL,
+  .size = &zone_size,
+  .malloc = &zone_malloc,
+  .calloc = &zone_calloc,
+  .valloc = &zone_valloc,
+  .free = &zone_free,
+  .realloc = &zone_realloc,
+  .destroy = &zone_destroy,
+  .zone_name = "mimalloc",
+  .batch_malloc = &zone_batch_malloc,
+  .batch_free = &zone_batch_free,
+  .introspect = &mi_introspect,
+#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6) && !defined(__ppc__)
+  #if defined(MAC_OS_X_VERSION_10_14) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_14)
+  .version = 10,
+  #else
+  .version = 9,
+  #endif
+  // switch to version 9+ on OSX 10.6 to support memalign.
+  .memalign = &zone_memalign,
+  .free_definite_size = &zone_free_definite_size,
+  .pressure_relief = &zone_pressure_relief,
+  #if defined(MAC_OS_X_VERSION_10_14) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_14)
+  .claimed_address = &zone_claimed_address,
+  #endif
+#else
+  .version = 4,
+#endif
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#if defined(MI_OSX_INTERPOSE) && defined(MI_SHARED_LIB_EXPORT)
+
+// ------------------------------------------------------
+// Override malloc_xxx and malloc_zone_xxx api's to use only
+// our mimalloc zone. Since even the loader uses malloc
+// on macOS, this ensures that all allocations go through
+// mimalloc (as all calls are interposed).
+// The main `malloc`, `free`, etc calls are interposed in `alloc-override.c`,
+// Here, we also override macOS specific API's like
+// `malloc_zone_calloc` etc. see <https://github.com/aosm/libmalloc/blob/master/man/malloc_zone_malloc.3>
+// ------------------------------------------------------
+
+static inline malloc_zone_t* mi_get_default_zone(void)
+{
+  static bool init;
+  if mi_unlikely(!init) {
+    init = true;
+    malloc_zone_register(&mi_malloc_zone);  // by calling register we avoid a zone error on free (see <http://eatmyrandom.blogspot.com/2010/03/mallocfree-interception-on-mac-os-x.html>)
+  }
+  return &mi_malloc_zone;
+}
+
+mi_decl_externc int  malloc_jumpstart(uintptr_t cookie);
+mi_decl_externc void _malloc_fork_prepare(void);
+mi_decl_externc void _malloc_fork_parent(void);
+mi_decl_externc void _malloc_fork_child(void);
+
+
+static malloc_zone_t* mi_malloc_create_zone(vm_size_t size, unsigned flags) {
+  MI_UNUSED(size); MI_UNUSED(flags);
+  return mi_get_default_zone();
+}
+
+static malloc_zone_t* mi_malloc_default_zone (void) {
+  return mi_get_default_zone();
+}
+
+static malloc_zone_t* mi_malloc_default_purgeable_zone(void) {
+  return mi_get_default_zone();
+}
+
+static void mi_malloc_destroy_zone(malloc_zone_t* zone) {
+  MI_UNUSED(zone);
+  // nothing.
+}
+
+static kern_return_t mi_malloc_get_all_zones (task_t task, memory_reader_t mr, vm_address_t** addresses, unsigned* count) {
+  MI_UNUSED(task); MI_UNUSED(mr);
+  if (addresses != NULL) *addresses = NULL;
+  if (count != NULL) *count = 0;
+  return KERN_SUCCESS;
+}
+
+static const char* mi_malloc_get_zone_name(malloc_zone_t* zone) {
+  return (zone == NULL ? mi_malloc_zone.zone_name : zone->zone_name);
+}
+
+static void mi_malloc_set_zone_name(malloc_zone_t* zone, const char* name) {
+  MI_UNUSED(zone); MI_UNUSED(name);
+}
+
+static int mi_malloc_jumpstart(uintptr_t cookie) {
+  MI_UNUSED(cookie);
+  return 1; // or 0 for no error?
+}
+
+static void mi__malloc_fork_prepare(void) {
+  // nothing
+}
+static void mi__malloc_fork_parent(void) {
+  // nothing
+}
+static void mi__malloc_fork_child(void) {
+  // nothing
+}
+
+static void mi_malloc_printf(const char* fmt, ...) {
+  MI_UNUSED(fmt);
+}
+
+static bool zone_check(malloc_zone_t* zone) {
+  MI_UNUSED(zone);
+  return true;
+}
+
+static malloc_zone_t* zone_from_ptr(const void* p) {
+  MI_UNUSED(p);
+  return mi_get_default_zone();
+}
+
+static void zone_log(malloc_zone_t* zone, void* p) {
+  MI_UNUSED(zone); MI_UNUSED(p);
+}
+
+static void zone_print(malloc_zone_t* zone, bool b) {
+  MI_UNUSED(zone); MI_UNUSED(b);
+}
+
+static void zone_print_ptr_info(void* p) {
+  MI_UNUSED(p);
+}
+
+static void zone_register(malloc_zone_t* zone) {
+  MI_UNUSED(zone);
+}
+
+static void zone_unregister(malloc_zone_t* zone) {
+  MI_UNUSED(zone);
+}
+
+// use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1`
+// See: <https://books.google.com/books?id=K8vUkpOXhN4C&pg=PA73>
+struct mi_interpose_s {
+  const void* replacement;
+  const void* target;
+};
+#define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun }
+#define MI_INTERPOSE_MI(fun)            MI_INTERPOSE_FUN(fun,mi_##fun)
+#define MI_INTERPOSE_ZONE(fun)          MI_INTERPOSE_FUN(malloc_##fun,fun)
+__attribute__((used)) static const struct mi_interpose_s _mi_zone_interposes[]  __attribute__((section("__DATA, __interpose"))) =
+{
+
+  MI_INTERPOSE_MI(malloc_create_zone),
+  MI_INTERPOSE_MI(malloc_default_purgeable_zone),
+  MI_INTERPOSE_MI(malloc_default_zone),
+  MI_INTERPOSE_MI(malloc_destroy_zone),
+  MI_INTERPOSE_MI(malloc_get_all_zones),
+  MI_INTERPOSE_MI(malloc_get_zone_name),
+  MI_INTERPOSE_MI(malloc_jumpstart),
+  MI_INTERPOSE_MI(malloc_printf),
+  MI_INTERPOSE_MI(malloc_set_zone_name),
+  MI_INTERPOSE_MI(_malloc_fork_child),
+  MI_INTERPOSE_MI(_malloc_fork_parent),
+  MI_INTERPOSE_MI(_malloc_fork_prepare),
+
+  MI_INTERPOSE_ZONE(zone_batch_free),
+  MI_INTERPOSE_ZONE(zone_batch_malloc),
+  MI_INTERPOSE_ZONE(zone_calloc),
+  MI_INTERPOSE_ZONE(zone_check),
+  MI_INTERPOSE_ZONE(zone_free),
+  MI_INTERPOSE_ZONE(zone_from_ptr),
+  MI_INTERPOSE_ZONE(zone_log),
+  MI_INTERPOSE_ZONE(zone_malloc),
+  MI_INTERPOSE_ZONE(zone_memalign),
+  MI_INTERPOSE_ZONE(zone_print),
+  MI_INTERPOSE_ZONE(zone_print_ptr_info),
+  MI_INTERPOSE_ZONE(zone_realloc),
+  MI_INTERPOSE_ZONE(zone_register),
+  MI_INTERPOSE_ZONE(zone_unregister),
+  MI_INTERPOSE_ZONE(zone_valloc)
+};
+
+
+#else
+
+// ------------------------------------------------------
+// hook into the zone api's without interposing
+// This is the official way of adding an allocator but
+// it seems less robust than using interpose.
+// ------------------------------------------------------
+
+static inline malloc_zone_t* mi_get_default_zone(void)
+{
+  // The first returned zone is the real default
+  malloc_zone_t** zones = NULL;
+  unsigned count = 0;
+  kern_return_t ret = malloc_get_all_zones(0, NULL, (vm_address_t**)&zones, &count);
+  if (ret == KERN_SUCCESS && count > 0) {
+    return zones[0];
+  }
+  else {
+    // fallback
+    return malloc_default_zone();
+  }
+}
+
+#if defined(__clang__)
+__attribute__((constructor(0)))
+#else
+__attribute__((constructor))      // seems not supported by g++-11 on the M1
+#endif
+static void _mi_macos_override_malloc(void) {
+  malloc_zone_t* purgeable_zone = NULL;
+
+  #if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
+  // force the purgeable zone to exist to avoid strange bugs
+  if (malloc_default_purgeable_zone) {
+    purgeable_zone = malloc_default_purgeable_zone();
+  }
+  #endif
+
+  // Register our zone.
+  // thomcc: I think this is still needed to put us in the zone list.
+  malloc_zone_register(&mi_malloc_zone);
+  // Unregister the default zone, this makes our zone the new default
+  // as that was the last registered.
+  malloc_zone_t *default_zone = mi_get_default_zone();
+  // thomcc: Unsure if the next test is *always* false or just false in the
+  // cases I've tried. I'm also unsure if the code inside is needed. at all
+  if (default_zone != &mi_malloc_zone) {
+    malloc_zone_unregister(default_zone);
+
+    // Reregister the default zone so free and realloc in that zone keep working.
+    malloc_zone_register(default_zone);
+  }
+
+  // Unregister, and re-register the purgeable_zone to avoid bugs if it occurs
+  // earlier than the default zone.
+  if (purgeable_zone != NULL) {
+    malloc_zone_unregister(purgeable_zone);
+    malloc_zone_register(purgeable_zone);
+  }
+
+}
+#endif  // MI_OSX_INTERPOSE
+
+#endif // MI_MALLOC_OVERRIDE
diff --git a/Objects/mimalloc/prim/osx/prim.c b/Objects/mimalloc/prim/osx/prim.c
new file mode 100644
index 00000000000000..8a2f4e8aa47316
--- /dev/null
+++ b/Objects/mimalloc/prim/osx/prim.c
@@ -0,0 +1,9 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// We use the unix/prim.c with the mmap API on macOSX
+#include "../unix/prim.c"
diff --git a/Objects/mimalloc/prim/prim.c b/Objects/mimalloc/prim/prim.c
new file mode 100644
index 00000000000000..9a597d8eb603e9
--- /dev/null
+++ b/Objects/mimalloc/prim/prim.c
@@ -0,0 +1,24 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// Select the implementation of the primitives
+// depending on the OS.
+
+#if defined(_WIN32)
+#include "windows/prim.c"  // VirtualAlloc (Windows)
+
+#elif defined(__APPLE__)
+#include "osx/prim.c"      // macOSX (actually defers to mmap in unix/prim.c)
+
+#elif defined(__wasi__)
+#define MI_USE_SBRK
+#include "wasi/prim.c"     // memory-grow or sbrk (Wasm)
+
+#else
+#include "unix/prim.c"     // mmap() (Linux, macOSX, BSD, Illumnos, Haiku, DragonFly, etc.)
+
+#endif
diff --git a/Objects/mimalloc/prim/readme.md b/Objects/mimalloc/prim/readme.md
new file mode 100644
index 00000000000000..380dd3a7178419
--- /dev/null
+++ b/Objects/mimalloc/prim/readme.md
@@ -0,0 +1,9 @@
+## Portability Primitives
+
+This is the portability layer where all primitives needed from the OS are defined.
+
+- `include/mimalloc/prim.h`: primitive portability API definition.
+- `prim.c`: Selects one of `unix/prim.c`, `wasi/prim.c`, or `windows/prim.c` depending on the host platform
+            (and on macOS, `osx/prim.c` defers to `unix/prim.c`).
+
+Note: still work in progress, there may still be places in the sources that still depend on OS ifdef's.
\ No newline at end of file
diff --git a/Objects/mimalloc/prim/unix/prim.c b/Objects/mimalloc/prim/unix/prim.c
new file mode 100644
index 00000000000000..6ca8d12c40f833
--- /dev/null
+++ b/Objects/mimalloc/prim/unix/prim.c
@@ -0,0 +1,859 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// This file is included in `src/prim/prim.c`
+
+#ifndef _DEFAULT_SOURCE
+#define _DEFAULT_SOURCE   // ensure mmap flags and syscall are defined
+#endif
+
+#if defined(__sun)
+// illumos provides new mman.h api when any of these are defined
+// otherwise the old api based on caddr_t which predates the void pointers one.
+// stock solaris provides only the former, chose to atomically to discard those
+// flags only here rather than project wide tough.
+#undef _XOPEN_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"
+
+#include <sys/mman.h>  // mmap
+#include <unistd.h>    // sysconf
+
+#if defined(__linux__)
+  #include <features.h>
+  #include <fcntl.h>
+  #if defined(__GLIBC__)
+  #include <linux/mman.h> // linux mmap flags
+  #else
+  #include <sys/mman.h>
+  #endif
+#elif defined(__APPLE__)
+  #include <TargetConditionals.h>
+  #if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR
+  #include <mach/vm_statistics.h>
+  #endif
+#elif defined(__FreeBSD__) || defined(__DragonFly__)
+  #include <sys/param.h>
+  #if __FreeBSD_version >= 1200000
+  #include <sys/cpuset.h>
+  #include <sys/domainset.h>
+  #endif
+  #include <sys/sysctl.h>
+#endif
+
+#if !defined(__HAIKU__) && !defined(__APPLE__) && !defined(__CYGWIN__)
+  #define MI_HAS_SYSCALL_H
+  #include <sys/syscall.h>
+#endif
+
+//------------------------------------------------------------------------------------
+// Use syscalls for some primitives to allow for libraries that override open/read/close etc.
+// and do allocation themselves; using syscalls prevents recursion when mimalloc is 
+// still initializing (issue #713)
+//------------------------------------------------------------------------------------
+
+#if defined(MI_HAS_SYSCALL_H) && defined(SYS_open) && defined(SYS_close) && defined(SYS_read) && defined(SYS_access)
+
+static int mi_prim_open(const char* fpath, int open_flags) {
+  return syscall(SYS_open,fpath,open_flags,0);
+}
+static ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) {
+  return syscall(SYS_read,fd,buf,bufsize);
+}
+static int mi_prim_close(int fd) {
+  return syscall(SYS_close,fd);
+}
+static int mi_prim_access(const char *fpath, int mode) {
+  return syscall(SYS_access,fpath,mode);
+}
+
+#elif !defined(__APPLE__)  // avoid unused warnings
+
+static int mi_prim_open(const char* fpath, int open_flags) {
+  return open(fpath,open_flags);
+}
+static ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) {
+  return read(fd,buf,bufsize);
+}
+static int mi_prim_close(int fd) {
+  return close(fd);
+}
+static int mi_prim_access(const char *fpath, int mode) {
+  return access(fpath,mode);
+}
+
+#endif
+
+
+
+//---------------------------------------------
+// init
+//---------------------------------------------
+
+static bool unix_detect_overcommit(void) {
+  bool os_overcommit = true;
+#if defined(__linux__)
+  int fd = mi_prim_open("/proc/sys/vm/overcommit_memory", O_RDONLY);
+        if (fd >= 0) {
+    char buf[32];
+    ssize_t nread = mi_prim_read(fd, &buf, sizeof(buf));
+    mi_prim_close(fd);
+    // <https://www.kernel.org/doc/Documentation/vm/overcommit-accounting>
+    // 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE)
+    if (nread >= 1) {
+      os_overcommit = (buf[0] == '0' || buf[0] == '1');
+    }
+  }
+#elif defined(__FreeBSD__)
+  int val = 0;
+  size_t olen = sizeof(val);
+  if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) {
+    os_overcommit = (val != 0);
+  }
+#else
+  // default: overcommit is true  
+#endif
+  return os_overcommit;
+}
+
+void _mi_prim_mem_init( mi_os_mem_config_t* config ) {
+  long psize = sysconf(_SC_PAGESIZE);
+  if (psize > 0) {
+    config->page_size = (size_t)psize;
+    config->alloc_granularity = (size_t)psize;
+  }
+  config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this?
+  config->has_overcommit = unix_detect_overcommit();
+  config->must_free_whole = false;    // mmap can free in parts
+  config->has_virtual_reserve = true; // todo: check if this true for NetBSD?  (for anonymous mmap with PROT_NONE)
+}
+
+
+//---------------------------------------------
+// free
+//---------------------------------------------
+
+int _mi_prim_free(void* addr, size_t size ) {
+  bool err = (munmap(addr, size) == -1);
+  return (err ? errno : 0);
+}
+
+
+//---------------------------------------------
+// mmap
+//---------------------------------------------
+
+static int unix_madvise(void* addr, size_t size, int advice) {
+  #if defined(__sun)
+  return madvise((caddr_t)addr, size, advice);  // Solaris needs cast (issue #520)
+  #else
+  return madvise(addr, size, advice);
+  #endif
+}
+
+static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) {
+  MI_UNUSED(try_alignment);
+  void* p = NULL;
+  #if defined(MAP_ALIGNED)  // BSD
+  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
+    size_t n = mi_bsr(try_alignment);
+    if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) {  // alignment is a power of 2 and 4096 <= alignment <= 1GiB
+      p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0);
+      if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { 
+        int err = errno;
+        _mi_warning_message("unable to directly request aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, addr);
+      }
+      if (p!=MAP_FAILED) return p;
+      // fall back to regular mmap      
+    }
+  }
+  #elif defined(MAP_ALIGN)  // Solaris
+  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
+    p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0);  // addr parameter is the required alignment
+    if (p!=MAP_FAILED) return p;
+    // fall back to regular mmap
+  }
+  #endif
+  #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED)
+  // on 64-bit systems, use the virtual address area after 2TiB for 4MiB aligned allocations
+  if (addr == NULL) {
+    void* hint = _mi_os_get_aligned_hint(try_alignment, size);
+    if (hint != NULL) {
+      p = mmap(hint, size, protect_flags, flags, fd, 0);
+      if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { 
+        #if MI_TRACK_ENABLED  // asan sometimes does not instrument errno correctly?
+        int err = 0;
+        #else
+        int err = errno;
+        #endif
+        _mi_warning_message("unable to directly request hinted aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, hint);
+      }
+      if (p!=MAP_FAILED) return p;
+      // fall back to regular mmap      
+    }
+  }
+  #endif
+  // regular mmap
+  p = mmap(addr, size, protect_flags, flags, fd, 0);
+  if (p!=MAP_FAILED) return p;
+  // failed to allocate
+  return NULL;
+}
+
+static int unix_mmap_fd(void) {
+  #if defined(VM_MAKE_TAG)
+  // macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99)
+  int os_tag = (int)mi_option_get(mi_option_os_tag);
+  if (os_tag < 100 || os_tag > 255) { os_tag = 100; }
+  return VM_MAKE_TAG(os_tag);
+  #else
+  return -1;
+  #endif
+}
+
+static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) {
+  #if !defined(MAP_ANONYMOUS)
+  #define MAP_ANONYMOUS  MAP_ANON
+  #endif
+  #if !defined(MAP_NORESERVE)
+  #define MAP_NORESERVE  0
+  #endif
+  void* p = NULL;
+  const int fd = unix_mmap_fd();
+  int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+  if (_mi_os_has_overcommit()) {
+    flags |= MAP_NORESERVE;
+  }
+  #if defined(PROT_MAX)
+  protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD
+  #endif
+  // huge page allocation
+  if ((large_only || _mi_os_use_large_page(size, try_alignment)) && allow_large) {
+    static _Atomic(size_t) large_page_try_ok; // = 0;
+    size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
+    if (!large_only && try_ok > 0) {
+      // If the OS is not configured for large OS pages, or the user does not have
+      // enough permission, the `mmap` will always fail (but it might also fail for other reasons).
+      // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times
+      // to avoid too many failing calls to mmap.
+      mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
+    }
+    else {
+      int lflags = flags & ~MAP_NORESERVE;  // using NORESERVE on huge pages seems to fail on Linux
+      int lfd = fd;
+      #ifdef MAP_ALIGNED_SUPER
+      lflags |= MAP_ALIGNED_SUPER;
+      #endif
+      #ifdef MAP_HUGETLB
+      lflags |= MAP_HUGETLB;
+      #endif
+      #ifdef MAP_HUGE_1GB
+      static bool mi_huge_pages_available = true;
+      if ((size % MI_GiB) == 0 && mi_huge_pages_available) {
+        lflags |= MAP_HUGE_1GB;
+      }
+      else
+      #endif
+      {
+        #ifdef MAP_HUGE_2MB
+        lflags |= MAP_HUGE_2MB;
+        #endif
+      }
+      #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB
+      lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
+      #endif
+      if (large_only || lflags != flags) {
+        // try large OS page allocation
+        *is_large = true;
+        p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd);
+        #ifdef MAP_HUGE_1GB
+        if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) {
+          mi_huge_pages_available = false; // don't try huge 1GiB pages again
+          _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno);
+          lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB);
+          p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd);
+        }
+        #endif
+        if (large_only) return p;
+        if (p == NULL) {
+          mi_atomic_store_release(&large_page_try_ok, (size_t)8);  // on error, don't try again for the next N allocations
+        }
+      }
+    }
+  }
+  // regular allocation
+  if (p == NULL) {
+    *is_large = false;
+    p = unix_mmap_prim(addr, size, try_alignment, protect_flags, flags, fd);
+    if (p != NULL) {
+      #if defined(MADV_HUGEPAGE)
+      // Many Linux systems don't allow MAP_HUGETLB but they support instead
+      // transparent huge pages (THP). Generally, it is not required to call `madvise` with MADV_HUGE
+      // though since properly aligned allocations will already use large pages if available
+      // in that case -- in particular for our large regions (in `memory.c`).
+      // However, some systems only allow THP if called with explicit `madvise`, so
+      // when large OS pages are enabled for mimalloc, we call `madvise` anyways.
+      if (allow_large && _mi_os_use_large_page(size, try_alignment)) {
+        if (unix_madvise(p, size, MADV_HUGEPAGE) == 0) {
+          *is_large = true; // possibly
+        };
+      }
+      #elif defined(__sun)
+      if (allow_large && _mi_os_use_large_page(size, try_alignment)) {
+        struct memcntl_mha cmd = {0};
+        cmd.mha_pagesize = large_os_page_size;
+        cmd.mha_cmd = MHA_MAPSIZE_VA;
+        if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) {
+          *is_large = true;
+        }
+      }
+      #endif
+    }
+  }
+  return p;
+}
+
+// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
+int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
+  mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
+  mi_assert_internal(commit || !allow_large);
+  mi_assert_internal(try_alignment > 0);
+  
+  *is_zero = true;
+  int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);  
+  *addr = unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large);
+  return (*addr != NULL ? 0 : errno);
+}
+
+
+//---------------------------------------------
+// Commit/Reset
+//---------------------------------------------
+
+static void unix_mprotect_hint(int err) {
+  #if defined(__linux__) && (MI_SECURE>=2) // guard page around every mimalloc page
+  if (err == ENOMEM) {
+    _mi_warning_message("The next warning may be caused by a low memory map limit.\n"
+                        "  On Linux this is controlled by the vm.max_map_count -- maybe increase it?\n"
+                        "  For example: sudo sysctl -w vm.max_map_count=262144\n");
+  }
+  #else
+  MI_UNUSED(err);
+  #endif
+}
+
+int _mi_prim_commit(void* start, size_t size, bool* is_zero) {
+  // commit: ensure we can access the area
+  // note: we may think that *is_zero can be true since the memory
+  // was either from mmap PROT_NONE, or from decommit MADV_DONTNEED, but
+  // we sometimes call commit on a range with still partially committed
+  // memory and `mprotect` does not zero the range.
+  *is_zero = false;  
+  int err = mprotect(start, size, (PROT_READ | PROT_WRITE));
+  if (err != 0) { 
+    err = errno; 
+    unix_mprotect_hint(err);
+  }
+  return err;
+}
+
+int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
+  int err = 0;  
+  // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
+  err = unix_madvise(start, size, MADV_DONTNEED);    
+  #if !MI_DEBUG && !MI_SECURE
+    *needs_recommit = false;
+  #else
+    *needs_recommit = true;
+    mprotect(start, size, PROT_NONE);
+  #endif
+  /*
+  // decommit: use mmap with MAP_FIXED and PROT_NONE to discard the existing memory (and reduce rss)
+  *needs_recommit = true;
+  const int fd = unix_mmap_fd();
+  void* p = mmap(start, size, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0);
+  if (p != start) { err = errno; }    
+  */
+  return err;
+}
+
+int _mi_prim_reset(void* start, size_t size) {
+  // We try to use `MADV_FREE` as that is the fastest. A drawback though is that it 
+  // will not reduce the `rss` stats in tools like `top` even though the memory is available
+  // to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by 
+  // default `MADV_DONTNEED` is used though.
+  #if defined(MADV_FREE)
+  static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE);
+  int oadvice = (int)mi_atomic_load_relaxed(&advice);
+  int err;
+  while ((err = unix_madvise(start, size, oadvice)) != 0 && errno == EAGAIN) { errno = 0;  };
+  if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) {
+    // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on
+    mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED);
+    err = unix_madvise(start, size, MADV_DONTNEED);
+  }
+  #else
+  int err = unix_madvise(start, size, MADV_DONTNEED);
+  #endif
+  return err;
+}
+
+int _mi_prim_protect(void* start, size_t size, bool protect) {
+  int err = mprotect(start, size, protect ? PROT_NONE : (PROT_READ | PROT_WRITE));
+  if (err != 0) { err = errno; }  
+  unix_mprotect_hint(err);
+  return err;
+}
+
+
+
+//---------------------------------------------
+// Huge page allocation
+//---------------------------------------------
+
+#if (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__) && !defined(__CYGWIN__)
+
+#ifndef MPOL_PREFERRED
+#define MPOL_PREFERRED 1
+#endif
+
+#if defined(MI_HAS_SYSCALL_H) && defined(SYS_mbind)
+static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
+  return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags);
+}
+#else
+static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
+  MI_UNUSED(start); MI_UNUSED(len); MI_UNUSED(mode); MI_UNUSED(nmask); MI_UNUSED(maxnode); MI_UNUSED(flags);
+  return 0;
+}
+#endif
+
+int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) {
+  bool is_large = true;
+  *is_zero = true;
+  *addr = unix_mmap(hint_addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
+  if (*addr != NULL && numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
+    unsigned long numa_mask = (1UL << numa_node);
+    // TODO: does `mbind` work correctly for huge OS pages? should we
+    // use `set_mempolicy` before calling mmap instead?
+    // see: <https://lkml.org/lkml/2017/2/9/875>
+    long err = mi_prim_mbind(*addr, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
+    if (err != 0) {
+      err = errno;
+      _mi_warning_message("failed to bind huge (1GiB) pages to numa node %d (error: %d (0x%x))\n", numa_node, err, err);
+    }    
+  }
+  return (*addr != NULL ? 0 : errno);
+}
+
+#else
+
+int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) {
+  MI_UNUSED(hint_addr); MI_UNUSED(size); MI_UNUSED(numa_node);
+  *is_zero = false;
+  *addr = NULL;
+  return ENOMEM;
+}
+
+#endif
+
+//---------------------------------------------
+// NUMA nodes
+//---------------------------------------------
+
+#if defined(__linux__)
+
+#include <stdio.h>    // snprintf
+
+size_t _mi_prim_numa_node(void) {
+  #if defined(MI_HAS_SYSCALL_H) && defined(SYS_getcpu)
+    unsigned long node = 0;
+    unsigned long ncpu = 0;
+    long err = syscall(SYS_getcpu, &ncpu, &node, NULL);
+    if (err != 0) return 0;
+    return node;
+  #else
+    return 0;
+  #endif
+}
+
+size_t _mi_prim_numa_node_count(void) {
+  char buf[128];
+  unsigned node = 0;
+  for(node = 0; node < 256; node++) {
+    // enumerate node entries -- todo: it there a more efficient way to do this? (but ensure there is no allocation)
+    snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1);
+    if (mi_prim_access(buf,R_OK) != 0) break;
+  }
+  return (node+1);
+}
+
+#elif defined(__FreeBSD__) && __FreeBSD_version >= 1200000
+
+size_t _mi_prim_numa_node(void) {
+  domainset_t dom;
+  size_t node;
+  int policy;
+  if (cpuset_getdomain(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(dom), &dom, &policy) == -1) return 0ul;
+  for (node = 0; node < MAXMEMDOM; node++) {
+    if (DOMAINSET_ISSET(node, &dom)) return node;
+  }
+  return 0ul;
+}
+
+size_t _mi_prim_numa_node_count(void) {
+  size_t ndomains = 0;
+  size_t len = sizeof(ndomains);
+  if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) return 0ul;
+  return ndomains;
+}
+
+#elif defined(__DragonFly__)
+
+size_t _mi_prim_numa_node(void) {
+  // TODO: DragonFly does not seem to provide any userland means to get this information.
+  return 0ul;
+}
+
+size_t _mi_prim_numa_node_count(void) {
+  size_t ncpus = 0, nvirtcoresperphys = 0;
+  size_t len = sizeof(size_t);
+  if (sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0) == -1) return 0ul;
+  if (sysctlbyname("hw.cpu_topology_ht_ids", &nvirtcoresperphys, &len, NULL, 0) == -1) return 0ul;
+  return nvirtcoresperphys * ncpus;
+}
+
+#else
+
+size_t _mi_prim_numa_node(void) {
+  return 0;
+}
+
+size_t _mi_prim_numa_node_count(void) {
+  return 1;
+}
+
+#endif
+
+// ----------------------------------------------------------------
+// Clock
+// ----------------------------------------------------------------
+
+#include <time.h>
+
+#if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC)
+
+mi_msecs_t _mi_prim_clock_now(void) {
+  struct timespec t;
+  #ifdef CLOCK_MONOTONIC
+  clock_gettime(CLOCK_MONOTONIC, &t);
+  #else
+  clock_gettime(CLOCK_REALTIME, &t);
+  #endif
+  return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000);
+}
+
+#else
+
+// low resolution timer
+mi_msecs_t _mi_prim_clock_now(void) {
+  #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0)
+  return (mi_msecs_t)clock();  
+  #elif (CLOCKS_PER_SEC < 1000)
+  return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC);  
+  #else
+  return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000);
+  #endif
+}
+
+#endif
+
+
+
+
+//----------------------------------------------------------------
+// Process info
+//----------------------------------------------------------------
+
+#if defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__)
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/resource.h>
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#endif
+
+#if defined(__HAIKU__)
+#include <kernel/OS.h>
+#endif
+
+static mi_msecs_t timeval_secs(const struct timeval* tv) {
+  return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L);
+}
+
+void _mi_prim_process_info(mi_process_info_t* pinfo)
+{
+  struct rusage rusage;
+  getrusage(RUSAGE_SELF, &rusage);
+  pinfo->utime = timeval_secs(&rusage.ru_utime);
+  pinfo->stime = timeval_secs(&rusage.ru_stime);
+#if !defined(__HAIKU__)
+  pinfo->page_faults = rusage.ru_majflt;
+#endif  
+#if defined(__HAIKU__)
+  // Haiku does not have (yet?) a way to
+  // get these stats per process
+  thread_info tid;
+  area_info mem;
+  ssize_t c;
+  get_thread_info(find_thread(0), &tid);
+  while (get_next_area_info(tid.team, &c, &mem) == B_OK) {
+    pinfo->peak_rss += mem.ram_size;
+  }
+  pinfo->page_faults = 0;
+#elif defined(__APPLE__)
+  pinfo->peak_rss = rusage.ru_maxrss;         // macos reports in bytes
+  #ifdef MACH_TASK_BASIC_INFO
+  struct mach_task_basic_info info;
+  mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
+  if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) {
+    pinfo->current_rss = (size_t)info.resident_size;
+  }
+  #else
+  struct task_basic_info info;
+  mach_msg_type_number_t infoCount = TASK_BASIC_INFO_COUNT;
+  if (task_info(mach_task_self(), TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) {
+    pinfo->current_rss = (size_t)info.resident_size;
+  }
+  #endif
+#else
+  pinfo->peak_rss = rusage.ru_maxrss * 1024;  // Linux/BSD report in KiB
+#endif
+  // use defaults for commit
+}
+
+#else
+
+#ifndef __wasi__
+// WebAssembly instances are not processes
+#pragma message("define a way to get process info")
+#endif
+
+void _mi_prim_process_info(mi_process_info_t* pinfo)
+{
+  // use defaults
+  MI_UNUSED(pinfo);
+}
+
+#endif
+
+
+//----------------------------------------------------------------
+// Output
+//----------------------------------------------------------------
+
+void _mi_prim_out_stderr( const char* msg ) {
+  fputs(msg,stderr);
+}
+
+
+//----------------------------------------------------------------
+// Environment
+//----------------------------------------------------------------
+
+#if !defined(MI_USE_ENVIRON) || (MI_USE_ENVIRON!=0)
+// On Posix systemsr use `environ` to access environment variables
+// even before the C runtime is initialized.
+#if defined(__APPLE__) && defined(__has_include) && __has_include(<crt_externs.h>)
+#include <crt_externs.h>
+static char** mi_get_environ(void) {
+  return (*_NSGetEnviron());
+}
+#else
+extern char** environ;
+static char** mi_get_environ(void) {
+  return environ;
+}
+#endif
+bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
+  if (name==NULL) return false;
+  const size_t len = _mi_strlen(name);
+  if (len == 0) return false;
+  char** env = mi_get_environ();
+  if (env == NULL) return false;
+  // compare up to 10000 entries
+  for (int i = 0; i < 10000 && env[i] != NULL; i++) {
+    const char* s = env[i];
+    if (_mi_strnicmp(name, s, len) == 0 && s[len] == '=') { // case insensitive
+      // found it
+      _mi_strlcpy(result, s + len + 1, result_size);
+      return true;
+    }
+  }
+  return false;
+}
+#else
+// fallback: use standard C `getenv` but this cannot be used while initializing the C runtime
+bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
+  // cannot call getenv() when still initializing the C runtime.
+  if (_mi_preloading()) return false;
+  const char* s = getenv(name);
+  if (s == NULL) {
+    // we check the upper case name too.
+    char buf[64+1];
+    size_t len = _mi_strnlen(name,sizeof(buf)-1);
+    for (size_t i = 0; i < len; i++) {
+      buf[i] = _mi_toupper(name[i]);
+    }
+    buf[len] = 0;
+    s = getenv(buf);
+  }
+  if (s == NULL || _mi_strnlen(s,result_size) >= result_size)  return false;
+  _mi_strlcpy(result, s, result_size);
+  return true;
+}
+#endif  // !MI_USE_ENVIRON
+
+
+//----------------------------------------------------------------
+// Random
+//----------------------------------------------------------------
+
+#if defined(__APPLE__)
+
+#include <AvailabilityMacros.h>
+#if defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10
+#include <CommonCrypto/CommonCryptoError.h>
+#include <CommonCrypto/CommonRandom.h>
+#endif
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {
+  #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15
+    // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf
+    // may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html>
+    return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);
+  #else
+    // fall back on older macOS
+    arc4random_buf(buf, buf_len);
+    return true;
+  #endif
+}
+
+#elif defined(__ANDROID__) || defined(__DragonFly__) || \
+      defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+      defined(__sun) 
+
+#include <stdlib.h>
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {
+  arc4random_buf(buf, buf_len);
+  return true;
+}
+
+#elif defined(__linux__) || defined(__HAIKU__)
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {
+  // Modern Linux provides `getrandom` but different distributions either use `sys/random.h` or `linux/random.h`
+  // and for the latter the actual `getrandom` call is not always defined.
+  // (see <https://stackoverflow.com/questions/45237324/why-doesnt-getrandom-compile>)
+  // We therefore use a syscall directly and fall back dynamically to /dev/urandom when needed.
+  #if defined(MI_HAS_SYSCALL_H) && defined(SYS_getrandom)
+    #ifndef GRND_NONBLOCK
+    #define GRND_NONBLOCK (1)
+    #endif
+    static _Atomic(uintptr_t) no_getrandom; // = 0
+    if (mi_atomic_load_acquire(&no_getrandom)==0) {
+      ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK);
+      if (ret >= 0) return (buf_len == (size_t)ret);
+      if (errno != ENOSYS) return false;
+      mi_atomic_store_release(&no_getrandom, (uintptr_t)1); // don't call again, and fall back to /dev/urandom
+    }
+  #endif
+  int flags = O_RDONLY;
+  #if defined(O_CLOEXEC)
+  flags |= O_CLOEXEC;
+  #endif
+  int fd = mi_prim_open("/dev/urandom", flags);
+  if (fd < 0) return false;
+  size_t count = 0;
+  while(count < buf_len) {
+    ssize_t ret = mi_prim_read(fd, (char*)buf + count, buf_len - count);
+    if (ret<=0) {
+      if (errno!=EAGAIN && errno!=EINTR) break;
+    }
+    else {
+      count += ret;
+    }
+  }
+  mi_prim_close(fd);
+  return (count==buf_len);
+}
+
+#else
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {
+  return false;
+}
+
+#endif
+
+
+//----------------------------------------------------------------
+// Thread init/done
+//----------------------------------------------------------------
+
+#if defined(MI_USE_PTHREADS)
+
+// use pthread local storage keys to detect thread ending
+// (and used with MI_TLS_PTHREADS for the default heap)
+pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1);
+
+static void mi_pthread_done(void* value) {
+  if (value!=NULL) {
+    _mi_thread_done((mi_heap_t*)value);
+  }
+}
+
+void _mi_prim_thread_init_auto_done(void) {
+  mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1));
+  pthread_key_create(&_mi_heap_default_key, &mi_pthread_done);
+}
+
+void _mi_prim_thread_done_auto_done(void) {
+  // nothing to do
+}
+
+void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
+  if (_mi_heap_default_key != (pthread_key_t)(-1)) {  // can happen during recursive invocation on freeBSD
+    pthread_setspecific(_mi_heap_default_key, heap);
+  }
+}
+
+#else 
+
+void _mi_prim_thread_init_auto_done(void) {
+  // nothing
+}
+
+void _mi_prim_thread_done_auto_done(void) {
+  // nothing
+}
+
+void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
+  MI_UNUSED(heap);
+}
+
+#endif
diff --git a/Objects/mimalloc/prim/wasi/prim.c b/Objects/mimalloc/prim/wasi/prim.c
new file mode 100644
index 00000000000000..50511f0b5bf0b4
--- /dev/null
+++ b/Objects/mimalloc/prim/wasi/prim.c
@@ -0,0 +1,275 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// This file is included in `src/prim/prim.c`
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"
+
+//---------------------------------------------
+// Initialize
+//---------------------------------------------
+
+void _mi_prim_mem_init( mi_os_mem_config_t* config ) {
+  config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB
+  config->alloc_granularity = 16;
+  config->has_overcommit = false;  
+  config->must_free_whole = true;
+  config->has_virtual_reserve = false;
+}
+
+//---------------------------------------------
+// Free
+//---------------------------------------------
+
+int _mi_prim_free(void* addr, size_t size ) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  // wasi heap cannot be shrunk
+  return 0;
+}
+
+
+//---------------------------------------------
+// Allocation: sbrk or memory_grow
+//---------------------------------------------
+
+#if defined(MI_USE_SBRK)
+  static void* mi_memory_grow( size_t size ) {
+    void* p = sbrk(size);
+    if (p == (void*)(-1)) return NULL;
+    #if !defined(__wasi__) // on wasi this is always zero initialized already (?)
+    memset(p,0,size);
+    #endif
+    return p;
+  }
+#elif defined(__wasi__)
+  static void* mi_memory_grow( size_t size ) {
+    size_t base = (size > 0 ? __builtin_wasm_memory_grow(0,_mi_divide_up(size, _mi_os_page_size()))
+                            : __builtin_wasm_memory_size(0));
+    if (base == SIZE_MAX) return NULL;
+    return (void*)(base * _mi_os_page_size());
+  }
+#endif
+
+#if defined(MI_USE_PTHREADS)
+static pthread_mutex_t mi_heap_grow_mutex = PTHREAD_MUTEX_INITIALIZER;
+#endif
+
+static void* mi_prim_mem_grow(size_t size, size_t try_alignment) {
+  void* p = NULL;
+  if (try_alignment <= 1) {
+    // `sbrk` is not thread safe in general so try to protect it (we could skip this on WASM but leave it in for now)
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_lock(&mi_heap_grow_mutex);
+    #endif
+    p = mi_memory_grow(size);
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_unlock(&mi_heap_grow_mutex);
+    #endif
+  }
+  else {
+    void* base = NULL;
+    size_t alloc_size = 0;
+    // to allocate aligned use a lock to try to avoid thread interaction
+    // between getting the current size and actual allocation
+    // (also, `sbrk` is not thread safe in general)
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_lock(&mi_heap_grow_mutex);
+    #endif
+    {
+      void* current = mi_memory_grow(0);  // get current size
+      if (current != NULL) {
+        void* aligned_current = mi_align_up_ptr(current, try_alignment);  // and align from there to minimize wasted space
+        alloc_size = _mi_align_up( ((uint8_t*)aligned_current - (uint8_t*)current) + size, _mi_os_page_size());
+        base = mi_memory_grow(alloc_size);
+      }
+    }
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_unlock(&mi_heap_grow_mutex);
+    #endif
+    if (base != NULL) {
+      p = mi_align_up_ptr(base, try_alignment);
+      if ((uint8_t*)p + size > (uint8_t*)base + alloc_size) {
+        // another thread used wasm_memory_grow/sbrk in-between and we do not have enough
+        // space after alignment. Give up (and waste the space as we cannot shrink :-( )
+        // (in `mi_os_mem_alloc_aligned` this will fall back to overallocation to align)
+        p = NULL;
+      }
+    }
+  }
+  /*
+  if (p == NULL) {
+    _mi_warning_message("unable to allocate sbrk/wasm_memory_grow OS memory (%zu bytes, %zu alignment)\n", size, try_alignment);
+    errno = ENOMEM;
+    return NULL;
+  }
+  */
+  mi_assert_internal( p == NULL || try_alignment == 0 || (uintptr_t)p % try_alignment == 0 );
+  return p;
+}
+
+// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
+int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
+  MI_UNUSED(allow_large); MI_UNUSED(commit);
+  *is_large = false;
+  *is_zero = false;
+  *addr = mi_prim_mem_grow(size, try_alignment);
+  return (*addr != NULL ? 0 : ENOMEM);
+}
+
+
+//---------------------------------------------
+// Commit/Reset/Protect
+//---------------------------------------------
+
+int _mi_prim_commit(void* addr, size_t size, bool* is_zero) {
+  MI_UNUSED(addr); MI_UNUSED(size); 
+  *is_zero = false;
+  return 0;
+}
+
+int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  *needs_recommit = false;
+  return 0;
+}
+
+int _mi_prim_reset(void* addr, size_t size) {
+  MI_UNUSED(addr); MI_UNUSED(size);
+  return 0;
+}
+
+int _mi_prim_protect(void* addr, size_t size, bool protect) {
+  MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect);
+  return 0;
+}
+
+
+//---------------------------------------------
+// Huge pages and NUMA nodes
+//---------------------------------------------
+
+int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) {
+  MI_UNUSED(hint_addr); MI_UNUSED(size); MI_UNUSED(numa_node);
+  *is_zero = true;
+  *addr = NULL;
+  return ENOSYS;
+}
+
+size_t _mi_prim_numa_node(void) {
+  return 0;
+}
+
+size_t _mi_prim_numa_node_count(void) {
+  return 1;
+}
+
+
+//----------------------------------------------------------------
+// Clock
+//----------------------------------------------------------------
+
+#include <time.h>
+
+#if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC)
+
+mi_msecs_t _mi_prim_clock_now(void) {
+  struct timespec t;
+  #ifdef CLOCK_MONOTONIC
+  clock_gettime(CLOCK_MONOTONIC, &t);
+  #else
+  clock_gettime(CLOCK_REALTIME, &t);
+  #endif
+  return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000);
+}
+
+#else
+
+// low resolution timer
+mi_msecs_t _mi_prim_clock_now(void) {
+  #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0)
+  return (mi_msecs_t)clock();  
+  #elif (CLOCKS_PER_SEC < 1000)
+  return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC);  
+  #else
+  return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000);
+  #endif
+}
+
+#endif
+
+
+//----------------------------------------------------------------
+// Process info
+//----------------------------------------------------------------
+
+void _mi_prim_process_info(mi_process_info_t* pinfo)
+{
+  // use defaults
+  MI_UNUSED(pinfo);
+}
+
+
+//----------------------------------------------------------------
+// Output
+//----------------------------------------------------------------
+
+void _mi_prim_out_stderr( const char* msg ) {
+  fputs(msg,stderr);
+}
+
+
+//----------------------------------------------------------------
+// Environment
+//----------------------------------------------------------------
+
+bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
+  // cannot call getenv() when still initializing the C runtime.
+  if (_mi_preloading()) return false;
+  const char* s = getenv(name);
+  if (s == NULL) {
+    // we check the upper case name too.
+    char buf[64+1];
+    size_t len = _mi_strnlen(name,sizeof(buf)-1);
+    for (size_t i = 0; i < len; i++) {
+      buf[i] = _mi_toupper(name[i]);
+    }
+    buf[len] = 0;
+    s = getenv(buf);
+  }
+  if (s == NULL || _mi_strnlen(s,result_size) >= result_size)  return false;
+  _mi_strlcpy(result, s, result_size);
+  return true;
+}
+
+
+//----------------------------------------------------------------
+// Random
+//----------------------------------------------------------------
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {
+  return false;
+}
+
+
+//----------------------------------------------------------------
+// Thread init/done
+//----------------------------------------------------------------
+
+void _mi_prim_thread_init_auto_done(void) {
+  // nothing
+}
+
+void _mi_prim_thread_done_auto_done(void) {
+  // nothing
+}
+
+void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
+  MI_UNUSED(heap);
+}
diff --git a/Objects/mimalloc/prim/windows/etw-mimalloc.wprp b/Objects/mimalloc/prim/windows/etw-mimalloc.wprp
new file mode 100644
index 00000000000000..b00cd7adf2285c
--- /dev/null
+++ b/Objects/mimalloc/prim/windows/etw-mimalloc.wprp
@@ -0,0 +1,61 @@
+<WindowsPerformanceRecorder Version="1.0">
+  <Profiles>
+    <SystemCollector Id="WPR_initiated_WprApp_WPR_System_Collector" Name="WPR_initiated_WprApp_WPR System Collector">
+      <BufferSize Value="1024" />
+      <Buffers Value="100" />
+    </SystemCollector>
+    <EventCollector Id="Mimalloc_Collector" Name="Mimalloc Collector">
+      <BufferSize Value="1024" />
+      <Buffers Value="100" />
+    </EventCollector>
+    <SystemProvider Id="WPR_initiated_WprApp_WPR_System_Collector_Provider">
+      <Keywords>
+        <Keyword Value="Loader" />
+      </Keywords>
+    </SystemProvider>
+    <EventProvider Id="MimallocEventProvider" Name="138f4dbb-ee04-4899-aa0a-572ad4475779" NonPagedMemory="true" Stack="true">
+      <EventFilters FilterIn="true">
+        <EventId Value="100" />
+        <EventId Value="101" />
+      </EventFilters>
+    </EventProvider>
+    <Profile Id="CustomHeap.Verbose.File" Name="CustomHeap" Description="RunningProfile:CustomHeap.Verbose.File" LoggingMode="File" DetailLevel="Verbose">
+      <ProblemCategories>
+        <ProblemCategory Value="Resource Analysis" />
+      </ProblemCategories>
+      <Collectors>
+        <SystemCollectorId Value="WPR_initiated_WprApp_WPR_System_Collector">
+          <SystemProviderId Value="WPR_initiated_WprApp_WPR_System_Collector_Provider" />
+        </SystemCollectorId>
+        <EventCollectorId Value="Mimalloc_Collector">
+          <EventProviders>
+            <EventProviderId Value="MimallocEventProvider" >
+              <Keywords>
+                <Keyword Value="100"/>
+                <Keyword Value="101"/>
+              </Keywords>
+            </EventProviderId>
+          </EventProviders>
+        </EventCollectorId>
+      </Collectors>
+      <TraceMergeProperties>
+        <TraceMergeProperty Id="BaseVerboseTraceMergeProperties" Name="BaseTraceMergeProperties">
+          <DeletePreMergedTraceFiles Value="true" />
+          <FileCompression Value="false" />
+          <InjectOnly Value="false" />
+          <SkipMerge Value="false" />
+          <CustomEvents>
+            <CustomEvent Value="ImageId" />
+            <CustomEvent Value="BuildInfo" />
+            <CustomEvent Value="VolumeMapping" />
+            <CustomEvent Value="EventMetadata" />
+            <CustomEvent Value="PerfTrackMetadata" />
+            <CustomEvent Value="WinSAT" />
+            <CustomEvent Value="NetworkInterface" />
+          </CustomEvents>
+        </TraceMergeProperty>
+      </TraceMergeProperties>
+    </Profile>
+  </Profiles>
+</WindowsPerformanceRecorder>
+
diff --git a/Objects/mimalloc/prim/windows/etw.h b/Objects/mimalloc/prim/windows/etw.h
new file mode 100644
index 00000000000000..4e0a092a10f4ba
--- /dev/null
+++ b/Objects/mimalloc/prim/windows/etw.h
@@ -0,0 +1,905 @@
+//**********************************************************************`
+//* This is an include file generated by Message Compiler.             *`
+//*                                                                    *`
+//* Copyright (c) Microsoft Corporation. All Rights Reserved.          *`
+//**********************************************************************`
+#pragma once
+
+//*****************************************************************************
+//
+// Notes on the ETW event code generated by MC:
+//
+// - Structures and arrays of structures are treated as an opaque binary blob.
+//   The caller is responsible for packing the data for the structure into a
+//   single region of memory, with no padding between values. The macro will
+//   have an extra parameter for the length of the blob.
+// - Arrays of nul-terminated strings must be packed by the caller into a
+//   single binary blob containing the correct number of strings, with a nul
+//   after each string. The size of the blob is specified in characters, and
+//   includes the final nul.
+// - Arrays of SID are treated as a single binary blob. The caller is
+//   responsible for packing the SID values into a single region of memory with
+//   no padding.
+// - The length attribute on the data element in the manifest is significant
+//   for values with intype win:UnicodeString, win:AnsiString, or win:Binary.
+//   The length attribute must be specified for win:Binary, and is optional for
+//   win:UnicodeString and win:AnsiString (if no length is given, the strings
+//   are assumed to be nul-terminated). For win:UnicodeString, the length is
+//   measured in characters, not bytes.
+// - For an array of win:UnicodeString, win:AnsiString, or win:Binary, the
+//   length attribute applies to every value in the array, so every value in
+//   the array must have the same length. The values in the array are provided
+//   to the macro via a single pointer -- the caller is responsible for packing
+//   all of the values into a single region of memory with no padding between
+//   values.
+// - Values of type win:CountedUnicodeString, win:CountedAnsiString, and
+//   win:CountedBinary can be generated and collected on Vista or later.
+//   However, they may not decode properly without the Windows 10 2018 Fall
+//   Update.
+// - Arrays of type win:CountedUnicodeString, win:CountedAnsiString, and
+//   win:CountedBinary must be packed by the caller into a single region of
+//   memory. The format for each item is a UINT16 byte-count followed by that
+//   many bytes of data. When providing the array to the generated macro, you
+//   must provide the total size of the packed array data, including the UINT16
+//   sizes for each item. In the case of win:CountedUnicodeString, the data
+//   size is specified in WCHAR (16-bit) units. In the case of
+//   win:CountedAnsiString and win:CountedBinary, the data size is specified in
+//   bytes.
+//
+//*****************************************************************************
+
+#include <wmistr.h>
+#include <evntrace.h>
+#include <evntprov.h>
+
+#ifndef ETW_INLINE
+  #ifdef _ETW_KM_
+    // In kernel mode, save stack space by never inlining templates.
+    #define ETW_INLINE DECLSPEC_NOINLINE __inline
+  #else
+    // In user mode, save code size by inlining templates as appropriate.
+    #define ETW_INLINE __inline
+  #endif
+#endif // ETW_INLINE
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+//
+// MCGEN_DISABLE_PROVIDER_CODE_GENERATION macro:
+// Define this macro to have the compiler skip the generated functions in this
+// header.
+//
+#ifndef MCGEN_DISABLE_PROVIDER_CODE_GENERATION
+
+//
+// MCGEN_USE_KERNEL_MODE_APIS macro:
+// Controls whether the generated code uses kernel-mode or user-mode APIs.
+// - Set to 0 to use Windows user-mode APIs such as EventRegister.
+// - Set to 1 to use Windows kernel-mode APIs such as EtwRegister.
+// Default is based on whether the _ETW_KM_ macro is defined (i.e. by wdm.h).
+// Note that the APIs can also be overridden directly, e.g. by setting the
+// MCGEN_EVENTWRITETRANSFER or MCGEN_EVENTREGISTER macros.
+//
+#ifndef MCGEN_USE_KERNEL_MODE_APIS
+  #ifdef _ETW_KM_
+    #define MCGEN_USE_KERNEL_MODE_APIS 1
+  #else
+    #define MCGEN_USE_KERNEL_MODE_APIS 0
+  #endif
+#endif // MCGEN_USE_KERNEL_MODE_APIS
+
+//
+// MCGEN_HAVE_EVENTSETINFORMATION macro:
+// Controls how McGenEventSetInformation uses the EventSetInformation API.
+// - Set to 0 to disable the use of EventSetInformation
+//   (McGenEventSetInformation will always return an error).
+// - Set to 1 to directly invoke MCGEN_EVENTSETINFORMATION.
+// - Set to 2 to to locate EventSetInformation at runtime via GetProcAddress
+//   (user-mode) or MmGetSystemRoutineAddress (kernel-mode).
+// Default is determined as follows:
+// - If MCGEN_EVENTSETINFORMATION has been customized, set to 1
+//   (i.e. use MCGEN_EVENTSETINFORMATION).
+// - Else if the target OS version has EventSetInformation, set to 1
+//   (i.e. use MCGEN_EVENTSETINFORMATION).
+// - Else set to 2 (i.e. try to dynamically locate EventSetInformation).
+// Note that an McGenEventSetInformation function will only be generated if one
+// or more provider in a manifest has provider traits.
+//
+#ifndef MCGEN_HAVE_EVENTSETINFORMATION
+  #ifdef MCGEN_EVENTSETINFORMATION             // if MCGEN_EVENTSETINFORMATION has been customized,
+    #define MCGEN_HAVE_EVENTSETINFORMATION   1 //   directly invoke MCGEN_EVENTSETINFORMATION(...).
+  #elif MCGEN_USE_KERNEL_MODE_APIS             // else if using kernel-mode APIs,
+    #if NTDDI_VERSION >= 0x06040000            //   if target OS is Windows 10 or later,
+      #define MCGEN_HAVE_EVENTSETINFORMATION 1 //     directly invoke MCGEN_EVENTSETINFORMATION(...).
+    #else                                      //   else
+      #define MCGEN_HAVE_EVENTSETINFORMATION 2 //     find "EtwSetInformation" via MmGetSystemRoutineAddress.
+    #endif                                     // else (using user-mode APIs)
+  #else                                        //   if target OS and SDK is Windows 8 or later,
+    #if WINVER >= 0x0602 && defined(EVENT_FILTER_TYPE_SCHEMATIZED)
+      #define MCGEN_HAVE_EVENTSETINFORMATION 1 //     directly invoke MCGEN_EVENTSETINFORMATION(...).
+    #else                                      //   else
+      #define MCGEN_HAVE_EVENTSETINFORMATION 2 //     find "EventSetInformation" via GetModuleHandleExW/GetProcAddress.
+    #endif
+  #endif
+#endif // MCGEN_HAVE_EVENTSETINFORMATION
+
+//
+// MCGEN Override Macros
+//
+// The following override macros may be defined before including this header
+// to control the APIs used by this header:
+//
+// - MCGEN_EVENTREGISTER
+// - MCGEN_EVENTUNREGISTER
+// - MCGEN_EVENTSETINFORMATION
+// - MCGEN_EVENTWRITETRANSFER
+//
+// If the the macro is undefined, the MC implementation will default to the
+// corresponding ETW APIs. For example, if the MCGEN_EVENTREGISTER macro is
+// undefined, the EventRegister[MyProviderName] macro will use EventRegister
+// in user mode and will use EtwRegister in kernel mode.
+//
+// To prevent issues from conflicting definitions of these macros, the value
+// of the override macro will be used as a suffix in certain internal function
+// names. Because of this, the override macros must follow certain rules:
+//
+// - The macro must be defined before any MC-generated header is included and
+//   must not be undefined or redefined after any MC-generated header is
+//   included. Different translation units (i.e. different .c or .cpp files)
+//   may set the macros to different values, but within a translation unit
+//   (within a single .c or .cpp file), the macro must be set once and not
+//   changed.
+// - The override must be an object-like macro, not a function-like macro
+//   (i.e. the override macro must not have a parameter list).
+// - The override macro's value must be a simple identifier, i.e. must be
+//   something that starts with a letter or '_' and contains only letters,
+//   numbers, and '_' characters.
+// - If the override macro's value is the name of a second object-like macro,
+//   the second object-like macro must follow the same rules. (The override
+//   macro's value can also be the name of a function-like macro, in which
+//   case the function-like macro does not need to follow the same rules.)
+//
+// For example, the following will cause compile errors:
+//
+//   #define MCGEN_EVENTWRITETRANSFER MyNamespace::MyClass::MyFunction // Value has non-identifier characters (colon).
+//   #define MCGEN_EVENTWRITETRANSFER GetEventWriteFunctionPointer(7)  // Value has non-identifier characters (parentheses).
+//   #define MCGEN_EVENTWRITETRANSFER(h,e,a,r,c,d) EventWrite(h,e,c,d) // Override is defined as a function-like macro.
+//   #define MY_OBJECT_LIKE_MACRO     MyNamespace::MyClass::MyEventWriteFunction
+//   #define MCGEN_EVENTWRITETRANSFER MY_OBJECT_LIKE_MACRO // Evaluates to something with non-identifier characters (colon).
+//
+// The following would be ok:
+//
+//   #define MCGEN_EVENTWRITETRANSFER  MyEventWriteFunction1  // OK, suffix will be "MyEventWriteFunction1".
+//   #define MY_OBJECT_LIKE_MACRO      MyEventWriteFunction2
+//   #define MCGEN_EVENTWRITETRANSFER  MY_OBJECT_LIKE_MACRO   // OK, suffix will be "MyEventWriteFunction2".
+//   #define MY_FUNCTION_LIKE_MACRO(h,e,a,r,c,d) MyNamespace::MyClass::MyEventWriteFunction3(h,e,c,d)
+//   #define MCGEN_EVENTWRITETRANSFER  MY_FUNCTION_LIKE_MACRO // OK, suffix will be "MY_FUNCTION_LIKE_MACRO".
+//
+#ifndef MCGEN_EVENTREGISTER
+  #if MCGEN_USE_KERNEL_MODE_APIS
+    #define MCGEN_EVENTREGISTER        EtwRegister
+  #else
+    #define MCGEN_EVENTREGISTER        EventRegister
+  #endif
+#endif // MCGEN_EVENTREGISTER
+#ifndef MCGEN_EVENTUNREGISTER
+  #if MCGEN_USE_KERNEL_MODE_APIS
+    #define MCGEN_EVENTUNREGISTER      EtwUnregister
+  #else
+    #define MCGEN_EVENTUNREGISTER      EventUnregister
+  #endif
+#endif // MCGEN_EVENTUNREGISTER
+#ifndef MCGEN_EVENTSETINFORMATION
+  #if MCGEN_USE_KERNEL_MODE_APIS
+    #define MCGEN_EVENTSETINFORMATION  EtwSetInformation
+  #else
+    #define MCGEN_EVENTSETINFORMATION  EventSetInformation
+  #endif
+#endif // MCGEN_EVENTSETINFORMATION
+#ifndef MCGEN_EVENTWRITETRANSFER
+  #if MCGEN_USE_KERNEL_MODE_APIS
+    #define MCGEN_EVENTWRITETRANSFER   EtwWriteTransfer
+  #else
+    #define MCGEN_EVENTWRITETRANSFER   EventWriteTransfer
+  #endif
+#endif // MCGEN_EVENTWRITETRANSFER
+
+//
+// MCGEN_EVENT_ENABLED macro:
+// Override to control how the EventWrite[EventName] macros determine whether
+// an event is enabled. The default behavior is for EventWrite[EventName] to
+// use the EventEnabled[EventName] macros.
+//
+#ifndef MCGEN_EVENT_ENABLED
+#define MCGEN_EVENT_ENABLED(EventName) EventEnabled##EventName()
+#endif
+
+//
+// MCGEN_EVENT_ENABLED_FORCONTEXT macro:
+// Override to control how the EventWrite[EventName]_ForContext macros
+// determine whether an event is enabled. The default behavior is for
+// EventWrite[EventName]_ForContext to use the
+// EventEnabled[EventName]_ForContext macros.
+//
+#ifndef MCGEN_EVENT_ENABLED_FORCONTEXT
+#define MCGEN_EVENT_ENABLED_FORCONTEXT(pContext, EventName) EventEnabled##EventName##_ForContext(pContext)
+#endif
+
+//
+// MCGEN_ENABLE_CHECK macro:
+// Determines whether the specified event would be considered as enabled
+// based on the state of the specified context. Slightly faster than calling
+// McGenEventEnabled directly.
+//
+#ifndef MCGEN_ENABLE_CHECK
+#define MCGEN_ENABLE_CHECK(Context, Descriptor) (Context.IsEnabled && McGenEventEnabled(&Context, &Descriptor))
+#endif
+
+#if !defined(MCGEN_TRACE_CONTEXT_DEF)
+#define MCGEN_TRACE_CONTEXT_DEF
+// This structure is for use by MC-generated code and should not be used directly.
+typedef struct _MCGEN_TRACE_CONTEXT
+{
+    TRACEHANDLE            RegistrationHandle;
+    TRACEHANDLE            Logger;      // Used as pointer to provider traits.
+    ULONGLONG              MatchAnyKeyword;
+    ULONGLONG              MatchAllKeyword;
+    ULONG                  Flags;
+    ULONG                  IsEnabled;
+    UCHAR                  Level;
+    UCHAR                  Reserve;
+    USHORT                 EnableBitsCount;
+    PULONG                 EnableBitMask;
+    const ULONGLONG*       EnableKeyWords;
+    const UCHAR*           EnableLevel;
+} MCGEN_TRACE_CONTEXT, *PMCGEN_TRACE_CONTEXT;
+#endif // MCGEN_TRACE_CONTEXT_DEF
+
+#if !defined(MCGEN_LEVEL_KEYWORD_ENABLED_DEF)
+#define MCGEN_LEVEL_KEYWORD_ENABLED_DEF
+//
+// Determines whether an event with a given Level and Keyword would be
+// considered as enabled based on the state of the specified context.
+// Note that you may want to use MCGEN_ENABLE_CHECK instead of calling this
+// function directly.
+//
+FORCEINLINE
+BOOLEAN
+McGenLevelKeywordEnabled(
+    _In_ PMCGEN_TRACE_CONTEXT EnableInfo,
+    _In_ UCHAR Level,
+    _In_ ULONGLONG Keyword
+    )
+{
+    //
+    // Check if the event Level is lower than the level at which
+    // the channel is enabled.
+    // If the event Level is 0 or the channel is enabled at level 0,
+    // all levels are enabled.
+    //
+
+    if ((Level <= EnableInfo->Level) || // This also covers the case of Level == 0.
+        (EnableInfo->Level == 0)) {
+
+        //
+        // Check if Keyword is enabled
+        //
+
+        if ((Keyword == (ULONGLONG)0) ||
+            ((Keyword & EnableInfo->MatchAnyKeyword) &&
+             ((Keyword & EnableInfo->MatchAllKeyword) == EnableInfo->MatchAllKeyword))) {
+            return TRUE;
+        }
+    }
+
+    return FALSE;
+}
+#endif // MCGEN_LEVEL_KEYWORD_ENABLED_DEF
+
+#if !defined(MCGEN_EVENT_ENABLED_DEF)
+#define MCGEN_EVENT_ENABLED_DEF
+//
+// Determines whether the specified event would be considered as enabled based
+// on the state of the specified context. Note that you may want to use
+// MCGEN_ENABLE_CHECK instead of calling this function directly.
+//
+FORCEINLINE
+BOOLEAN
+McGenEventEnabled(
+    _In_ PMCGEN_TRACE_CONTEXT EnableInfo,
+    _In_ PCEVENT_DESCRIPTOR EventDescriptor
+    )
+{
+    return McGenLevelKeywordEnabled(EnableInfo, EventDescriptor->Level, EventDescriptor->Keyword);
+}
+#endif // MCGEN_EVENT_ENABLED_DEF
+
+#if !defined(MCGEN_CONTROL_CALLBACK)
+#define MCGEN_CONTROL_CALLBACK
+
+// This function is for use by MC-generated code and should not be used directly.
+DECLSPEC_NOINLINE __inline
+VOID
+__stdcall
+McGenControlCallbackV2(
+    _In_ LPCGUID SourceId,
+    _In_ ULONG ControlCode,
+    _In_ UCHAR Level,
+    _In_ ULONGLONG MatchAnyKeyword,
+    _In_ ULONGLONG MatchAllKeyword,
+    _In_opt_ PEVENT_FILTER_DESCRIPTOR FilterData,
+    _Inout_opt_ PVOID CallbackContext
+    )
+/*++
+
+Routine Description:
+
+    This is the notification callback for Windows Vista and later.
+
+Arguments:
+
+    SourceId - The GUID that identifies the session that enabled the provider.
+
+    ControlCode - The parameter indicates whether the provider
+                  is being enabled or disabled.
+
+    Level - The level at which the event is enabled.
+
+    MatchAnyKeyword - The bitmask of keywords that the provider uses to
+                      determine the category of events that it writes.
+
+    MatchAllKeyword - This bitmask additionally restricts the category
+                      of events that the provider writes.
+
+    FilterData - The provider-defined data.
+
+    CallbackContext - The context of the callback that is defined when the provider
+                      called EtwRegister to register itself.
+
+Remarks:
+
+    ETW calls this function to notify provider of enable/disable
+
+--*/
+{
+    PMCGEN_TRACE_CONTEXT Ctx = (PMCGEN_TRACE_CONTEXT)CallbackContext;
+    ULONG Ix;
+#ifndef MCGEN_PRIVATE_ENABLE_CALLBACK_V2
+    UNREFERENCED_PARAMETER(SourceId);
+    UNREFERENCED_PARAMETER(FilterData);
+#endif
+
+    if (Ctx == NULL) {
+        return;
+    }
+
+    switch (ControlCode) {
+
+        case EVENT_CONTROL_CODE_ENABLE_PROVIDER:
+            Ctx->Level = Level;
+            Ctx->MatchAnyKeyword = MatchAnyKeyword;
+            Ctx->MatchAllKeyword = MatchAllKeyword;
+            Ctx->IsEnabled = EVENT_CONTROL_CODE_ENABLE_PROVIDER;
+
+            for (Ix = 0; Ix < Ctx->EnableBitsCount; Ix += 1) {
+                if (McGenLevelKeywordEnabled(Ctx, Ctx->EnableLevel[Ix], Ctx->EnableKeyWords[Ix]) != FALSE) {
+                    Ctx->EnableBitMask[Ix >> 5] |= (1 << (Ix % 32));
+                } else {
+                    Ctx->EnableBitMask[Ix >> 5] &= ~(1 << (Ix % 32));
+                }
+            }
+            break;
+
+        case EVENT_CONTROL_CODE_DISABLE_PROVIDER:
+            Ctx->IsEnabled = EVENT_CONTROL_CODE_DISABLE_PROVIDER;
+            Ctx->Level = 0;
+            Ctx->MatchAnyKeyword = 0;
+            Ctx->MatchAllKeyword = 0;
+            if (Ctx->EnableBitsCount > 0) {
+#pragma warning(suppress: 26451) // Arithmetic overflow cannot occur, no matter the value of EnableBitCount
+                RtlZeroMemory(Ctx->EnableBitMask, (((Ctx->EnableBitsCount - 1) / 32) + 1) * sizeof(ULONG));
+            }
+            break;
+
+        default:
+            break;
+    }
+
+#ifdef MCGEN_PRIVATE_ENABLE_CALLBACK_V2
+    //
+    // Call user defined callback
+    //
+    MCGEN_PRIVATE_ENABLE_CALLBACK_V2(
+        SourceId,
+        ControlCode,
+        Level,
+        MatchAnyKeyword,
+        MatchAllKeyword,
+        FilterData,
+        CallbackContext
+        );
+#endif // MCGEN_PRIVATE_ENABLE_CALLBACK_V2
+
+    return;
+}
+
+#endif // MCGEN_CONTROL_CALLBACK
+
+#ifndef _mcgen_PENABLECALLBACK
+  #if MCGEN_USE_KERNEL_MODE_APIS
+    #define _mcgen_PENABLECALLBACK      PETWENABLECALLBACK
+  #else
+    #define _mcgen_PENABLECALLBACK      PENABLECALLBACK
+  #endif
+#endif // _mcgen_PENABLECALLBACK
+
+#if !defined(_mcgen_PASTE2)
+// This macro is for use by MC-generated code and should not be used directly.
+#define _mcgen_PASTE2(a, b) _mcgen_PASTE2_imp(a, b)
+#define _mcgen_PASTE2_imp(a, b) a##b
+#endif // _mcgen_PASTE2
+
+#if !defined(_mcgen_PASTE3)
+// This macro is for use by MC-generated code and should not be used directly.
+#define _mcgen_PASTE3(a, b, c) _mcgen_PASTE3_imp(a, b, c)
+#define _mcgen_PASTE3_imp(a, b, c) a##b##_##c
+#endif // _mcgen_PASTE3
+
+//
+// Macro validation
+//
+
+// Validate MCGEN_EVENTREGISTER:
+
+// Trigger an error if MCGEN_EVENTREGISTER is not an unqualified (simple) identifier:
+struct _mcgen_PASTE2(MCGEN_EVENTREGISTER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTREGISTER);
+
+// Trigger an error if MCGEN_EVENTREGISTER is redefined:
+typedef struct _mcgen_PASTE2(MCGEN_EVENTREGISTER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTREGISTER)
+    MCGEN_EVENTREGISTER_must_not_be_redefined_between_headers;
+
+// Trigger an error if MCGEN_EVENTREGISTER is defined as a function-like macro:
+typedef void MCGEN_EVENTREGISTER_must_not_be_a_functionLike_macro_MCGEN_EVENTREGISTER;
+typedef int _mcgen_PASTE2(MCGEN_EVENTREGISTER_must_not_be_a_functionLike_macro_, MCGEN_EVENTREGISTER);
+
+// Validate MCGEN_EVENTUNREGISTER:
+
+// Trigger an error if MCGEN_EVENTUNREGISTER is not an unqualified (simple) identifier:
+struct _mcgen_PASTE2(MCGEN_EVENTUNREGISTER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTUNREGISTER);
+
+// Trigger an error if MCGEN_EVENTUNREGISTER is redefined:
+typedef struct _mcgen_PASTE2(MCGEN_EVENTUNREGISTER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTUNREGISTER)
+    MCGEN_EVENTUNREGISTER_must_not_be_redefined_between_headers;
+
+// Trigger an error if MCGEN_EVENTUNREGISTER is defined as a function-like macro:
+typedef void MCGEN_EVENTUNREGISTER_must_not_be_a_functionLike_macro_MCGEN_EVENTUNREGISTER;
+typedef int _mcgen_PASTE2(MCGEN_EVENTUNREGISTER_must_not_be_a_functionLike_macro_, MCGEN_EVENTUNREGISTER);
+
+// Validate MCGEN_EVENTSETINFORMATION:
+
+// Trigger an error if MCGEN_EVENTSETINFORMATION is not an unqualified (simple) identifier:
+struct _mcgen_PASTE2(MCGEN_EVENTSETINFORMATION_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTSETINFORMATION);
+
+// Trigger an error if MCGEN_EVENTSETINFORMATION is redefined:
+typedef struct _mcgen_PASTE2(MCGEN_EVENTSETINFORMATION_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTSETINFORMATION)
+    MCGEN_EVENTSETINFORMATION_must_not_be_redefined_between_headers;
+
+// Trigger an error if MCGEN_EVENTSETINFORMATION is defined as a function-like macro:
+typedef void MCGEN_EVENTSETINFORMATION_must_not_be_a_functionLike_macro_MCGEN_EVENTSETINFORMATION;
+typedef int _mcgen_PASTE2(MCGEN_EVENTSETINFORMATION_must_not_be_a_functionLike_macro_, MCGEN_EVENTSETINFORMATION);
+
+// Validate MCGEN_EVENTWRITETRANSFER:
+
+// Trigger an error if MCGEN_EVENTWRITETRANSFER is not an unqualified (simple) identifier:
+struct _mcgen_PASTE2(MCGEN_EVENTWRITETRANSFER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTWRITETRANSFER);
+
+// Trigger an error if MCGEN_EVENTWRITETRANSFER is redefined:
+typedef struct _mcgen_PASTE2(MCGEN_EVENTWRITETRANSFER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTWRITETRANSFER)
+    MCGEN_EVENTWRITETRANSFER_must_not_be_redefined_between_headers;;
+
+// Trigger an error if MCGEN_EVENTWRITETRANSFER is defined as a function-like macro:
+typedef void MCGEN_EVENTWRITETRANSFER_must_not_be_a_functionLike_macro_MCGEN_EVENTWRITETRANSFER;
+typedef int _mcgen_PASTE2(MCGEN_EVENTWRITETRANSFER_must_not_be_a_functionLike_macro_, MCGEN_EVENTWRITETRANSFER);
+
+#ifndef McGenEventWrite_def
+#define McGenEventWrite_def
+
+// This macro is for use by MC-generated code and should not be used directly.
+#define McGenEventWrite _mcgen_PASTE2(McGenEventWrite_, MCGEN_EVENTWRITETRANSFER)
+
+// This function is for use by MC-generated code and should not be used directly.
+DECLSPEC_NOINLINE __inline
+ULONG __stdcall
+McGenEventWrite(
+    _In_ PMCGEN_TRACE_CONTEXT Context,
+    _In_ PCEVENT_DESCRIPTOR Descriptor,
+    _In_opt_ LPCGUID ActivityId,
+    _In_range_(1, 128) ULONG EventDataCount,
+    _Pre_cap_(EventDataCount) EVENT_DATA_DESCRIPTOR* EventData
+    )
+{
+    const USHORT UNALIGNED* Traits;
+
+    // Some customized MCGEN_EVENTWRITETRANSFER macros might ignore ActivityId.
+    UNREFERENCED_PARAMETER(ActivityId);
+
+    Traits = (const USHORT UNALIGNED*)(UINT_PTR)Context->Logger;
+
+    if (Traits == NULL) {
+        EventData[0].Ptr = 0;
+        EventData[0].Size = 0;
+        EventData[0].Reserved = 0;
+    } else {
+        EventData[0].Ptr = (ULONG_PTR)Traits;
+        EventData[0].Size = *Traits;
+        EventData[0].Reserved = 2; // EVENT_DATA_DESCRIPTOR_TYPE_PROVIDER_METADATA
+    }
+
+    return MCGEN_EVENTWRITETRANSFER(
+        Context->RegistrationHandle,
+        Descriptor,
+        ActivityId,
+        NULL,
+        EventDataCount,
+        EventData);
+}
+#endif // McGenEventWrite_def
+
+#if !defined(McGenEventRegisterUnregister)
+#define McGenEventRegisterUnregister
+
+// This macro is for use by MC-generated code and should not be used directly.
+#define McGenEventRegister _mcgen_PASTE2(McGenEventRegister_, MCGEN_EVENTREGISTER)
+
+#pragma warning(push)
+#pragma warning(disable:6103)
+// This function is for use by MC-generated code and should not be used directly.
+DECLSPEC_NOINLINE __inline
+ULONG __stdcall
+McGenEventRegister(
+    _In_ LPCGUID ProviderId,
+    _In_opt_ _mcgen_PENABLECALLBACK EnableCallback,
+    _In_opt_ PVOID CallbackContext,
+    _Inout_ PREGHANDLE RegHandle
+    )
+/*++
+
+Routine Description:
+
+    This function registers the provider with ETW.
+
+Arguments:
+
+    ProviderId - Provider ID to register with ETW.
+
+    EnableCallback - Callback to be used.
+
+    CallbackContext - Context for the callback.
+
+    RegHandle - Pointer to registration handle.
+
+Remarks:
+
+    Should not be called if the provider is already registered (i.e. should not
+    be called if *RegHandle != 0). Repeatedly registering a provider is a bug
+    and may indicate a race condition. However, for compatibility with previous
+    behavior, this function will return SUCCESS in this case.
+
+--*/
+{
+    ULONG Error;
+
+    if (*RegHandle != 0)
+    {
+        Error = 0; // ERROR_SUCCESS
+    }
+    else
+    {
+        Error = MCGEN_EVENTREGISTER(ProviderId, EnableCallback, CallbackContext, RegHandle);
+    }
+
+    return Error;
+}
+#pragma warning(pop)
+
+// This macro is for use by MC-generated code and should not be used directly.
+#define McGenEventUnregister _mcgen_PASTE2(McGenEventUnregister_, MCGEN_EVENTUNREGISTER)
+
+// This function is for use by MC-generated code and should not be used directly.
+DECLSPEC_NOINLINE __inline
+ULONG __stdcall
+McGenEventUnregister(_Inout_ PREGHANDLE RegHandle)
+/*++
+
+Routine Description:
+
+    Unregister from ETW and set *RegHandle = 0.
+
+Arguments:
+
+    RegHandle - the pointer to the provider registration handle
+
+Remarks:
+
+    If provider has not been registered (i.e. if *RegHandle == 0),
+    return SUCCESS. It is safe to call McGenEventUnregister even if the
+    call to McGenEventRegister returned an error.
+
+--*/
+{
+    ULONG Error;
+
+    if(*RegHandle == 0)
+    {
+        Error = 0; // ERROR_SUCCESS
+    }
+    else
+    {
+        Error = MCGEN_EVENTUNREGISTER(*RegHandle);
+        *RegHandle = (REGHANDLE)0;
+    }
+
+    return Error;
+}
+
+#endif // McGenEventRegisterUnregister
+
+#ifndef _mcgen_EVENT_BIT_SET
+  #if defined(_M_IX86) || defined(_M_X64)
+    // This macro is for use by MC-generated code and should not be used directly.
+    #define _mcgen_EVENT_BIT_SET(EnableBits, BitPosition) ((((const unsigned char*)EnableBits)[BitPosition >> 3] & (1u << (BitPosition & 7))) != 0)
+  #else // CPU type
+    // This macro is for use by MC-generated code and should not be used directly.
+    #define _mcgen_EVENT_BIT_SET(EnableBits, BitPosition) ((EnableBits[BitPosition >> 5] & (1u << (BitPosition & 31))) != 0)
+  #endif // CPU type
+#endif // _mcgen_EVENT_BIT_SET
+
+#endif // MCGEN_DISABLE_PROVIDER_CODE_GENERATION
+
+//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+// Provider "microsoft-windows-mimalloc" event count 2
+//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+// Provider GUID = 138f4dbb-ee04-4899-aa0a-572ad4475779
+EXTERN_C __declspec(selectany) const GUID ETW_MI_Provider = {0x138f4dbb, 0xee04, 0x4899, {0xaa, 0x0a, 0x57, 0x2a, 0xd4, 0x47, 0x57, 0x79}};
+
+#ifndef ETW_MI_Provider_Traits
+#define ETW_MI_Provider_Traits NULL
+#endif // ETW_MI_Provider_Traits
+
+//
+// Event Descriptors
+//
+EXTERN_C __declspec(selectany) const EVENT_DESCRIPTOR ETW_MI_ALLOC = {0x64, 0x1, 0x0, 0x4, 0x0, 0x0, 0x0};
+#define ETW_MI_ALLOC_value 0x64
+EXTERN_C __declspec(selectany) const EVENT_DESCRIPTOR ETW_MI_FREE = {0x65, 0x1, 0x0, 0x4, 0x0, 0x0, 0x0};
+#define ETW_MI_FREE_value 0x65
+
+//
+// MCGEN_DISABLE_PROVIDER_CODE_GENERATION macro:
+// Define this macro to have the compiler skip the generated functions in this
+// header.
+//
+#ifndef MCGEN_DISABLE_PROVIDER_CODE_GENERATION
+
+//
+// Event Enablement Bits
+// These variables are for use by MC-generated code and should not be used directly.
+//
+EXTERN_C __declspec(selectany) DECLSPEC_CACHEALIGN ULONG microsoft_windows_mimallocEnableBits[1];
+EXTERN_C __declspec(selectany) const ULONGLONG microsoft_windows_mimallocKeywords[1] = {0x0};
+EXTERN_C __declspec(selectany) const unsigned char microsoft_windows_mimallocLevels[1] = {4};
+
+//
+// Provider context
+//
+EXTERN_C __declspec(selectany) MCGEN_TRACE_CONTEXT ETW_MI_Provider_Context = {0, (ULONG_PTR)ETW_MI_Provider_Traits, 0, 0, 0, 0, 0, 0, 1, microsoft_windows_mimallocEnableBits, microsoft_windows_mimallocKeywords, microsoft_windows_mimallocLevels};
+
+//
+// Provider REGHANDLE
+//
+#define microsoft_windows_mimallocHandle (ETW_MI_Provider_Context.RegistrationHandle)
+
+//
+// This macro is set to 0, indicating that the EventWrite[Name] macros do not
+// have an Activity parameter. This is controlled by the -km and -um options.
+//
+#define ETW_MI_Provider_EventWriteActivity 0
+
+//
+// Register with ETW using the control GUID specified in the manifest.
+// Invoke this macro during module initialization (i.e. program startup,
+// DLL process attach, or driver load) to initialize the provider.
+// Note that if this function returns an error, the error means that
+// will not work, but no action needs to be taken -- even if EventRegister
+// returns an error, it is generally safe to use EventWrite and
+// EventUnregister macros (they will be no-ops if EventRegister failed).
+//
+#ifndef EventRegistermicrosoft_windows_mimalloc
+#define EventRegistermicrosoft_windows_mimalloc() McGenEventRegister(&ETW_MI_Provider, McGenControlCallbackV2, &ETW_MI_Provider_Context, &microsoft_windows_mimallocHandle)
+#endif
+
+//
+// Register with ETW using a specific control GUID (i.e. a GUID other than what
+// is specified in the manifest). Advanced scenarios only.
+//
+#ifndef EventRegisterByGuidmicrosoft_windows_mimalloc
+#define EventRegisterByGuidmicrosoft_windows_mimalloc(Guid) McGenEventRegister(&(Guid), McGenControlCallbackV2, &ETW_MI_Provider_Context, &microsoft_windows_mimallocHandle)
+#endif
+
+//
+// Unregister with ETW and close the provider.
+// Invoke this macro during module shutdown (i.e. program exit, DLL process
+// detach, or driver unload) to unregister the provider.
+// Note that you MUST call EventUnregister before DLL or driver unload
+// (not optional): failure to unregister a provider before DLL or driver unload
+// will result in crashes.
+//
+#ifndef EventUnregistermicrosoft_windows_mimalloc
+#define EventUnregistermicrosoft_windows_mimalloc() McGenEventUnregister(&microsoft_windows_mimallocHandle)
+#endif
+
+//
+// MCGEN_ENABLE_FORCONTEXT_CODE_GENERATION macro:
+// Define this macro to enable support for caller-allocated provider context.
+//
+#ifdef MCGEN_ENABLE_FORCONTEXT_CODE_GENERATION
+
+//
+// Advanced scenarios: Caller-allocated provider context.
+// Use when multiple differently-configured provider handles are needed,
+// e.g. for container-aware drivers, one context per container.
+//
+// Usage:
+//
+// - Caller enables the feature before including this header, e.g.
+//   #define MCGEN_ENABLE_FORCONTEXT_CODE_GENERATION 1
+// - Caller allocates memory, e.g. pContext = malloc(sizeof(McGenContext_microsoft_windows_mimalloc));
+// - Caller registers the provider, e.g. EventRegistermicrosoft_windows_mimalloc_ForContext(pContext);
+// - Caller writes events, e.g. EventWriteMyEvent_ForContext(pContext, ...);
+// - Caller unregisters, e.g. EventUnregistermicrosoft_windows_mimalloc_ForContext(pContext);
+// - Caller frees memory, e.g. free(pContext);
+//
+
+typedef struct tagMcGenContext_microsoft_windows_mimalloc {
+    // The fields of this structure are subject to change and should
+    // not be accessed directly. To access the provider's REGHANDLE,
+    // use microsoft_windows_mimallocHandle_ForContext(pContext).
+    MCGEN_TRACE_CONTEXT Context;
+    ULONG EnableBits[1];
+} McGenContext_microsoft_windows_mimalloc;
+
+#define EventRegistermicrosoft_windows_mimalloc_ForContext(pContext)             _mcgen_PASTE2(_mcgen_RegisterForContext_microsoft_windows_mimalloc_, MCGEN_EVENTREGISTER)(&ETW_MI_Provider, pContext)
+#define EventRegisterByGuidmicrosoft_windows_mimalloc_ForContext(Guid, pContext) _mcgen_PASTE2(_mcgen_RegisterForContext_microsoft_windows_mimalloc_, MCGEN_EVENTREGISTER)(&(Guid), pContext)
+#define EventUnregistermicrosoft_windows_mimalloc_ForContext(pContext)           McGenEventUnregister(&(pContext)->Context.RegistrationHandle)
+
+//
+// Provider REGHANDLE for caller-allocated context.
+//
+#define microsoft_windows_mimallocHandle_ForContext(pContext) ((pContext)->Context.RegistrationHandle)
+
+// This function is for use by MC-generated code and should not be used directly.
+// Initialize and register the caller-allocated context.
+__inline
+ULONG __stdcall
+_mcgen_PASTE2(_mcgen_RegisterForContext_microsoft_windows_mimalloc_, MCGEN_EVENTREGISTER)(
+    _In_ LPCGUID pProviderId,
+    _Out_ McGenContext_microsoft_windows_mimalloc* pContext)
+{
+    RtlZeroMemory(pContext, sizeof(*pContext));
+    pContext->Context.Logger = (ULONG_PTR)ETW_MI_Provider_Traits;
+    pContext->Context.EnableBitsCount = 1;
+    pContext->Context.EnableBitMask = pContext->EnableBits;
+    pContext->Context.EnableKeyWords = microsoft_windows_mimallocKeywords;
+    pContext->Context.EnableLevel = microsoft_windows_mimallocLevels;
+    return McGenEventRegister(
+        pProviderId,
+        McGenControlCallbackV2,
+        &pContext->Context,
+        &pContext->Context.RegistrationHandle);
+}
+
+// This function is for use by MC-generated code and should not be used directly.
+// Trigger a compile error if called with the wrong parameter type.
+FORCEINLINE
+_Ret_ McGenContext_microsoft_windows_mimalloc*
+_mcgen_CheckContextType_microsoft_windows_mimalloc(_In_ McGenContext_microsoft_windows_mimalloc* pContext)
+{
+    return pContext;
+}
+
+#endif // MCGEN_ENABLE_FORCONTEXT_CODE_GENERATION
+
+//
+// Enablement check macro for event "ETW_MI_ALLOC"
+//
+#define EventEnabledETW_MI_ALLOC() _mcgen_EVENT_BIT_SET(microsoft_windows_mimallocEnableBits, 0)
+#define EventEnabledETW_MI_ALLOC_ForContext(pContext) _mcgen_EVENT_BIT_SET(_mcgen_CheckContextType_microsoft_windows_mimalloc(pContext)->EnableBits, 0)
+
+//
+// Event write macros for event "ETW_MI_ALLOC"
+//
+#define EventWriteETW_MI_ALLOC(Address, Size) \
+        MCGEN_EVENT_ENABLED(ETW_MI_ALLOC) \
+        ? _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC(&ETW_MI_Provider_Context, &ETW_MI_ALLOC, Address, Size) : 0
+#define EventWriteETW_MI_ALLOC_AssumeEnabled(Address, Size) \
+        _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC(&ETW_MI_Provider_Context, &ETW_MI_ALLOC, Address, Size)
+#define EventWriteETW_MI_ALLOC_ForContext(pContext, Address, Size) \
+        MCGEN_EVENT_ENABLED_FORCONTEXT(pContext, ETW_MI_ALLOC) \
+        ? _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC(&(pContext)->Context, &ETW_MI_ALLOC, Address, Size) : 0
+#define EventWriteETW_MI_ALLOC_ForContextAssumeEnabled(pContext, Address, Size) \
+        _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC(&_mcgen_CheckContextType_microsoft_windows_mimalloc(pContext)->Context, &ETW_MI_ALLOC, Address, Size)
+
+// This macro is for use by MC-generated code and should not be used directly.
+#define _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC _mcgen_PASTE2(McTemplateU0xx_, MCGEN_EVENTWRITETRANSFER)
+
+//
+// Enablement check macro for event "ETW_MI_FREE"
+//
+#define EventEnabledETW_MI_FREE() _mcgen_EVENT_BIT_SET(microsoft_windows_mimallocEnableBits, 0)
+#define EventEnabledETW_MI_FREE_ForContext(pContext) _mcgen_EVENT_BIT_SET(_mcgen_CheckContextType_microsoft_windows_mimalloc(pContext)->EnableBits, 0)
+
+//
+// Event write macros for event "ETW_MI_FREE"
+//
+#define EventWriteETW_MI_FREE(Address, Size) \
+        MCGEN_EVENT_ENABLED(ETW_MI_FREE) \
+        ? _mcgen_TEMPLATE_FOR_ETW_MI_FREE(&ETW_MI_Provider_Context, &ETW_MI_FREE, Address, Size) : 0
+#define EventWriteETW_MI_FREE_AssumeEnabled(Address, Size) \
+        _mcgen_TEMPLATE_FOR_ETW_MI_FREE(&ETW_MI_Provider_Context, &ETW_MI_FREE, Address, Size)
+#define EventWriteETW_MI_FREE_ForContext(pContext, Address, Size) \
+        MCGEN_EVENT_ENABLED_FORCONTEXT(pContext, ETW_MI_FREE) \
+        ? _mcgen_TEMPLATE_FOR_ETW_MI_FREE(&(pContext)->Context, &ETW_MI_FREE, Address, Size) : 0
+#define EventWriteETW_MI_FREE_ForContextAssumeEnabled(pContext, Address, Size) \
+        _mcgen_TEMPLATE_FOR_ETW_MI_FREE(&_mcgen_CheckContextType_microsoft_windows_mimalloc(pContext)->Context, &ETW_MI_FREE, Address, Size)
+
+// This macro is for use by MC-generated code and should not be used directly.
+#define _mcgen_TEMPLATE_FOR_ETW_MI_FREE _mcgen_PASTE2(McTemplateU0xx_, MCGEN_EVENTWRITETRANSFER)
+
+#endif // MCGEN_DISABLE_PROVIDER_CODE_GENERATION
+
+//
+// MCGEN_DISABLE_PROVIDER_CODE_GENERATION macro:
+// Define this macro to have the compiler skip the generated functions in this
+// header.
+//
+#ifndef MCGEN_DISABLE_PROVIDER_CODE_GENERATION
+
+//
+// Template Functions
+//
+
+//
+// Function for template "ETW_CUSTOM_HEAP_ALLOC_DATA" (and possibly others).
+// This function is for use by MC-generated code and should not be used directly.
+//
+#ifndef McTemplateU0xx_def
+#define McTemplateU0xx_def
+ETW_INLINE
+ULONG
+_mcgen_PASTE2(McTemplateU0xx_, MCGEN_EVENTWRITETRANSFER)(
+    _In_ PMCGEN_TRACE_CONTEXT Context,
+    _In_ PCEVENT_DESCRIPTOR Descriptor,
+    _In_ const unsigned __int64  _Arg0,
+    _In_ const unsigned __int64  _Arg1
+    )
+{
+#define McTemplateU0xx_ARGCOUNT 2
+
+    EVENT_DATA_DESCRIPTOR EventData[McTemplateU0xx_ARGCOUNT + 1];
+
+    EventDataDescCreate(&EventData[1],&_Arg0, sizeof(const unsigned __int64)  );
+
+    EventDataDescCreate(&EventData[2],&_Arg1, sizeof(const unsigned __int64)  );
+
+    return McGenEventWrite(Context, Descriptor, NULL, McTemplateU0xx_ARGCOUNT + 1, EventData);
+}
+#endif // McTemplateU0xx_def
+
+#endif // MCGEN_DISABLE_PROVIDER_CODE_GENERATION
+
+#if defined(__cplusplus)
+}
+#endif
diff --git a/Objects/mimalloc/prim/windows/etw.man b/Objects/mimalloc/prim/windows/etw.man
new file mode 100644
index 0000000000000000000000000000000000000000..cfd1f8a9eaacd50af63f1e28f9540aa88c20f90c
GIT binary patch
literal 3926
zcmeH~T~8B16o${WiT`2c+NGc<*i;EYh$d8xl<0*CS-Mb~v<qPu>PP(R>hsQY*nU!q
z$b})3?##}d?{nTW+uy%xwr*doYaNU1!Vc}sa%<a(W%kk*Y}poVi8<mVvnKl~r<t|f
z>7F%g+hVAmL$hwL?4dodnmuAKhUXm0)X9|WHj>XRahh@~SWDIkbduX;B#u6^Q>@U=
zDO8U+KXa0*th&%f*z^Udh4ol@uE=Q&`emUsh_CA`FOXgI{i-`XZ9C#bR1yBm=PJ*p
z9kVN$J6O;h;8HY>p)RnhY8A#Hb?z)_!y(Iaen(I)@-9CrSSp(;_Jn9I*$S&ATjP1?
zVxB>pV@LVsy;^jZr7r$HNAm06TcUiI`l@~F$Mt$E%Shfd3O+h1vFhR9a8yQZ@wpne
zr3bI-p=VEdo{)#uWxSVJeYQF|-5tnq>~f+CP~A0&{v=(up=ngEDl>5!$EDwPRaNjW
zXj|wbG$OwmwaW-hMvBK%pbm3wpic7<iGe1}wLX@G{?G|Bd>1O^dzbxT%*13+SP9h-
zI~rA5hapTVnk|qmiIVYy{__+x9f7OV4j3`g4;{{8_SWnLBSu2PUc%~`t%Ae^>J`SS
zdtZg-r<0xAH*_ALtK;Nv(d9nbKK1jK=Ld)I(jQrKhBjgToR#Wm8{0a}@6ZuEO<gbQ
zB3m5V%^3-v%%-25hY;g1&y$zH6XrqQ2)nL|zy>(lvG=y=Jh{M!4!-$(E)!vYUrf47
z<hkOaGap-@>nf4W$e&QFOovV_$>J%X*KN>oXI@jt%BJms>IU}I$<7<wvJG;y&ofxL
zZ?Ac4@EfiF;Qh3@Hq?u*mxUl}o``PSN9^9363^xS{`lRzaEI{>Hr{PChchs%+mx{%
zt(fa_PM4r63?1h#YOk~;byc5`>%q>sLHA1gohNq{qOREhxu<<B25jv)bQOG~tvEXM
zsbYvycWRYgeO<KaNEs4R@T0H57CiG+erb6HDME8{V+nzO*!Nwm{Em&P|NqBp)%s}N
z&KLRG82>y~`}YVhGe0?R_ceQ8vt^BpSNp6kErj_0hUNFyWQ1IOZ|GEgWBPwYFLgFu
Oo!*uqEBu%Ae18C_t1cS=

literal 0
HcmV?d00001

diff --git a/Objects/mimalloc/prim/windows/prim.c b/Objects/mimalloc/prim/windows/prim.c
new file mode 100644
index 00000000000000..e6b61079299cbd
--- /dev/null
+++ b/Objects/mimalloc/prim/windows/prim.c
@@ -0,0 +1,622 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// This file is included in `src/prim/prim.c`
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"
+#include <stdio.h>   // fputs, stderr
+
+
+//---------------------------------------------
+// Dynamically bind Windows API points for portability
+//---------------------------------------------
+
+// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016.
+// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility)
+// NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB)
+// We define a minimal MEM_EXTENDED_PARAMETER ourselves in order to be able to compile with older SDK's.
+typedef enum MI_MEM_EXTENDED_PARAMETER_TYPE_E {
+  MiMemExtendedParameterInvalidType = 0,
+  MiMemExtendedParameterAddressRequirements,
+  MiMemExtendedParameterNumaNode,
+  MiMemExtendedParameterPartitionHandle,
+  MiMemExtendedParameterUserPhysicalHandle,
+  MiMemExtendedParameterAttributeFlags,
+  MiMemExtendedParameterMax
+} MI_MEM_EXTENDED_PARAMETER_TYPE;
+
+typedef struct DECLSPEC_ALIGN(8) MI_MEM_EXTENDED_PARAMETER_S {
+  struct { DWORD64 Type : 8; DWORD64 Reserved : 56; } Type;
+  union  { DWORD64 ULong64; PVOID Pointer; SIZE_T Size; HANDLE Handle; DWORD ULong; } Arg;
+} MI_MEM_EXTENDED_PARAMETER;
+
+typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S {
+  PVOID  LowestStartingAddress;
+  PVOID  HighestEndingAddress;
+  SIZE_T Alignment;
+} MI_MEM_ADDRESS_REQUIREMENTS;
+
+#define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE   0x00000010
+
+#include <winternl.h>
+typedef PVOID    (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
+typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
+static PVirtualAlloc2 pVirtualAlloc2 = NULL;
+static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;
+
+// Similarly, GetNumaProcesorNodeEx is only supported since Windows 7
+typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER;
+
+typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber);
+typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber);
+typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask);
+typedef BOOL (__stdcall *PGetNumaProcessorNode)(UCHAR Processor, PUCHAR NodeNumber);
+static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL;
+static PGetNumaProcessorNodeEx      pGetNumaProcessorNodeEx = NULL;
+static PGetNumaNodeProcessorMaskEx  pGetNumaNodeProcessorMaskEx = NULL;
+static PGetNumaProcessorNode        pGetNumaProcessorNode = NULL;
+
+//---------------------------------------------
+// Enable large page support dynamically (if possible)
+//---------------------------------------------
+
+static bool win_enable_large_os_pages(size_t* large_page_size)
+{
+  static bool large_initialized = false;
+  if (large_initialized) return (_mi_os_large_page_size() > 0);
+  large_initialized = true;
+
+  // Try to see if large OS pages are supported
+  // To use large pages on Windows, we first need access permission
+  // Set "Lock pages in memory" permission in the group policy editor
+  // <https://devblogs.microsoft.com/oldnewthing/20110128-00/?p=11643>
+  unsigned long err = 0;
+  HANDLE token = NULL;
+  BOOL ok = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token);
+  if (ok) {
+    TOKEN_PRIVILEGES tp;
+    ok = LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid);
+    if (ok) {
+      tp.PrivilegeCount = 1;
+      tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+      ok = AdjustTokenPrivileges(token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);
+      if (ok) {
+        err = GetLastError();
+        ok = (err == ERROR_SUCCESS);
+        if (ok && large_page_size != NULL) {
+          *large_page_size = GetLargePageMinimum();
+        }
+      }
+    }
+    CloseHandle(token);
+  }
+  if (!ok) {
+    if (err == 0) err = GetLastError();
+    _mi_warning_message("cannot enable large OS page support, error %lu\n", err);
+  }
+  return (ok!=0);
+}
+
+
+//---------------------------------------------
+// Initialize
+//---------------------------------------------
+
+void _mi_prim_mem_init( mi_os_mem_config_t* config )
+{
+  config->has_overcommit = false;
+  config->must_free_whole = true;
+  config->has_virtual_reserve = true;
+  // get the page size
+  SYSTEM_INFO si;
+  GetSystemInfo(&si);
+  if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; }
+  if (si.dwAllocationGranularity > 0) { config->alloc_granularity = si.dwAllocationGranularity; }
+  // get the VirtualAlloc2 function
+  HINSTANCE  hDll;
+  hDll = LoadLibrary(TEXT("kernelbase.dll"));
+  if (hDll != NULL) {
+    // use VirtualAlloc2FromApp if possible as it is available to Windows store apps
+    pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp");
+    if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2");
+    FreeLibrary(hDll);
+  }
+  // NtAllocateVirtualMemoryEx is used for huge page allocation
+  hDll = LoadLibrary(TEXT("ntdll.dll"));
+  if (hDll != NULL) {
+    pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx");
+    FreeLibrary(hDll);
+  }
+  // Try to use Win7+ numa API
+  hDll = LoadLibrary(TEXT("kernel32.dll"));
+  if (hDll != NULL) {
+    pGetCurrentProcessorNumberEx = (PGetCurrentProcessorNumberEx)(void (*)(void))GetProcAddress(hDll, "GetCurrentProcessorNumberEx");
+    pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx");
+    pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx");
+    pGetNumaProcessorNode = (PGetNumaProcessorNode)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNode");
+    FreeLibrary(hDll);
+  }
+  if (mi_option_is_enabled(mi_option_allow_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
+    win_enable_large_os_pages(&config->large_page_size);
+  }
+}
+
+
+//---------------------------------------------
+// Free
+//---------------------------------------------
+
+int _mi_prim_free(void* addr, size_t size ) {
+  MI_UNUSED(size);
+  DWORD errcode = 0;
+  bool err = (VirtualFree(addr, 0, MEM_RELEASE) == 0);
+  if (err) { errcode = GetLastError(); }
+  if (errcode == ERROR_INVALID_ADDRESS) {
+    // In mi_os_mem_alloc_aligned the fallback path may have returned a pointer inside
+    // the memory region returned by VirtualAlloc; in that case we need to free using
+    // the start of the region.
+    MEMORY_BASIC_INFORMATION info = { 0 };
+    VirtualQuery(addr, &info, sizeof(info));
+    if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) {
+      errcode = 0;
+      err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0);
+      if (err) { errcode = GetLastError(); }
+    }
+  }
+  return (int)errcode;
+}
+
+
+//---------------------------------------------
+// VirtualAlloc
+//---------------------------------------------
+
+static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignment, DWORD flags) {
+  #if (MI_INTPTR_SIZE >= 8)
+  // on 64-bit systems, try to use the virtual address area after 2TiB for 4MiB aligned allocations
+  if (addr == NULL) {
+    void* hint = _mi_os_get_aligned_hint(try_alignment,size);
+    if (hint != NULL) {
+      void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE);
+      if (p != NULL) return p;
+      _mi_verbose_message("warning: unable to allocate hinted aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), hint, try_alignment, flags);
+      // fall through on error
+    }
+  }
+  #endif
+  // on modern Windows try use VirtualAlloc2 for aligned allocation
+  if (try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
+    MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 };
+    reqs.Alignment = try_alignment;
+    MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} };
+    param.Type.Type = MiMemExtendedParameterAddressRequirements;
+    param.Arg.Pointer = &reqs;
+    void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, &param, 1);
+    if (p != NULL) return p;
+    _mi_warning_message("unable to allocate aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags);
+    // fall through on error
+  }
+  // last resort
+  return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
+}
+
+static void* win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) {
+  mi_assert_internal(!(large_only && !allow_large));
+  static _Atomic(size_t) large_page_try_ok; // = 0;
+  void* p = NULL;
+  // Try to allocate large OS pages (2MiB) if allowed or required.
+  if ((large_only || _mi_os_use_large_page(size, try_alignment))
+      && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) {
+    size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
+    if (!large_only && try_ok > 0) {
+      // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive.
+      // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times.
+      mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
+    }
+    else {
+      // large OS pages must always reserve and commit.
+      *is_large = true;
+      p = win_virtual_alloc_prim(addr, size, try_alignment, flags | MEM_LARGE_PAGES);
+      if (large_only) return p;
+      // fall back to non-large page allocation on error (`p == NULL`).
+      if (p == NULL) {
+        mi_atomic_store_release(&large_page_try_ok,10UL);  // on error, don't try again for the next N allocations
+      }
+    }
+  }
+  // Fall back to regular page allocation
+  if (p == NULL) {
+    *is_large = ((flags&MEM_LARGE_PAGES) != 0);
+    p = win_virtual_alloc_prim(addr, size, try_alignment, flags);
+  }
+  //if (p == NULL) { _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x, large only: %d, allow large: %d)\n", size, GetLastError(), addr, try_alignment, flags, large_only, allow_large); }
+  return p;
+}
+
+int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
+  mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
+  mi_assert_internal(commit || !allow_large);
+  mi_assert_internal(try_alignment > 0);
+  *is_zero = true;
+  int flags = MEM_RESERVE;
+  if (commit) { flags |= MEM_COMMIT; }
+  *addr = win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large);
+  return (*addr != NULL ? 0 : (int)GetLastError());
+}
+
+
+//---------------------------------------------
+// Commit/Reset/Protect
+//---------------------------------------------
+#ifdef _MSC_VER
+#pragma warning(disable:6250)   // suppress warning calling VirtualFree without MEM_RELEASE (for decommit)
+#endif
+
+int _mi_prim_commit(void* addr, size_t size, bool* is_zero) {
+  *is_zero = false;
+  /*
+  // zero'ing only happens on an initial commit... but checking upfront seems expensive..
+  _MEMORY_BASIC_INFORMATION meminfo; _mi_memzero_var(meminfo);
+  if (VirtualQuery(addr, &meminfo, size) > 0) {
+    if ((meminfo.State & MEM_COMMIT) == 0) {
+      *is_zero = true;
+    }
+  }
+  */
+  // commit
+  void* p = VirtualAlloc(addr, size, MEM_COMMIT, PAGE_READWRITE);
+  if (p == NULL) return (int)GetLastError();
+  return 0;
+}
+
+int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) {  
+  BOOL ok = VirtualFree(addr, size, MEM_DECOMMIT);
+  *needs_recommit = true;  // for safety, assume always decommitted even in the case of an error.
+  return (ok ? 0 : (int)GetLastError());
+}
+
+int _mi_prim_reset(void* addr, size_t size) {
+  void* p = VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
+  mi_assert_internal(p == addr);
+  #if 0
+  if (p != NULL) {
+    VirtualUnlock(addr,size); // VirtualUnlock after MEM_RESET removes the memory directly from the working set
+  }
+  #endif
+  return (p != NULL ? 0 : (int)GetLastError());
+}
+
+int _mi_prim_protect(void* addr, size_t size, bool protect) {
+  DWORD oldprotect = 0;
+  BOOL ok = VirtualProtect(addr, size, protect ? PAGE_NOACCESS : PAGE_READWRITE, &oldprotect);
+  return (ok ? 0 : (int)GetLastError());
+}
+
+
+//---------------------------------------------
+// Huge page allocation
+//---------------------------------------------
+
+static void* _mi_prim_alloc_huge_os_pagesx(void* hint_addr, size_t size, int numa_node)
+{
+  const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE;
+
+  win_enable_large_os_pages(NULL);
+
+  MI_MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} };
+  // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
+  static bool mi_huge_pages_available = true;
+  if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) {
+    params[0].Type.Type = MiMemExtendedParameterAttributeFlags;
+    params[0].Arg.ULong64 = MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE;
+    ULONG param_count = 1;
+    if (numa_node >= 0) {
+      param_count++;
+      params[1].Type.Type = MiMemExtendedParameterNumaNode;
+      params[1].Arg.ULong = (unsigned)numa_node;
+    }
+    SIZE_T psize = size;
+    void* base = hint_addr;
+    NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count);
+    if (err == 0 && base != NULL) {
+      return base;
+    }
+    else {
+      // fall back to regular large pages
+      mi_huge_pages_available = false; // don't try further huge pages
+      _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err);
+    }
+  }
+  // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation
+  if (pVirtualAlloc2 != NULL && numa_node >= 0) {
+    params[0].Type.Type = MiMemExtendedParameterNumaNode;
+    params[0].Arg.ULong = (unsigned)numa_node;
+    return (*pVirtualAlloc2)(GetCurrentProcess(), hint_addr, size, flags, PAGE_READWRITE, params, 1);
+  }
+
+  // otherwise use regular virtual alloc on older windows
+  return VirtualAlloc(hint_addr, size, flags, PAGE_READWRITE);
+}
+
+int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) {
+  *is_zero = true;
+  *addr = _mi_prim_alloc_huge_os_pagesx(hint_addr,size,numa_node);
+  return (*addr != NULL ? 0 : (int)GetLastError());
+}
+
+
+//---------------------------------------------
+// Numa nodes
+//---------------------------------------------
+
+size_t _mi_prim_numa_node(void) {
+  USHORT numa_node = 0;
+  if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) {
+    // Extended API is supported
+    MI_PROCESSOR_NUMBER pnum;
+    (*pGetCurrentProcessorNumberEx)(&pnum);
+    USHORT nnode = 0;
+    BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode);
+    if (ok) { numa_node = nnode; }
+  }
+  else if (pGetNumaProcessorNode != NULL) {
+    // Vista or earlier, use older API that is limited to 64 processors. Issue #277
+    DWORD pnum = GetCurrentProcessorNumber();
+    UCHAR nnode = 0;
+    BOOL ok = pGetNumaProcessorNode((UCHAR)pnum, &nnode);
+    if (ok) { numa_node = nnode; }
+  }
+  return numa_node;
+}
+
+size_t _mi_prim_numa_node_count(void) {
+  ULONG numa_max = 0;
+  GetNumaHighestNodeNumber(&numa_max);
+  // find the highest node number that has actual processors assigned to it. Issue #282
+  while(numa_max > 0) {
+    if (pGetNumaNodeProcessorMaskEx != NULL) {
+      // Extended API is supported
+      GROUP_AFFINITY affinity;
+      if ((*pGetNumaNodeProcessorMaskEx)((USHORT)numa_max, &affinity)) {
+        if (affinity.Mask != 0) break;  // found the maximum non-empty node
+      }
+    }
+    else {
+      // Vista or earlier, use older API that is limited to 64 processors.
+      ULONGLONG mask;
+      if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) {
+        if (mask != 0) break; // found the maximum non-empty node
+      };
+    }
+    // max node was invalid or had no processor assigned, try again
+    numa_max--;
+  }
+  return ((size_t)numa_max + 1);
+}
+
+
+//----------------------------------------------------------------
+// Clock
+//----------------------------------------------------------------
+
+static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) {
+  static LARGE_INTEGER mfreq; // = 0
+  if (mfreq.QuadPart == 0LL) {
+    LARGE_INTEGER f;
+    QueryPerformanceFrequency(&f);
+    mfreq.QuadPart = f.QuadPart/1000LL;
+    if (mfreq.QuadPart == 0) mfreq.QuadPart = 1;
+  }
+  return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart);
+}
+
+mi_msecs_t _mi_prim_clock_now(void) {
+  LARGE_INTEGER t;
+  QueryPerformanceCounter(&t);
+  return mi_to_msecs(t);
+}
+
+
+//----------------------------------------------------------------
+// Process Info
+//----------------------------------------------------------------
+
+#include <windows.h>
+#include <psapi.h>
+
+static mi_msecs_t filetime_msecs(const FILETIME* ftime) {
+  ULARGE_INTEGER i;
+  i.LowPart = ftime->dwLowDateTime;
+  i.HighPart = ftime->dwHighDateTime;
+  mi_msecs_t msecs = (i.QuadPart / 10000); // FILETIME is in 100 nano seconds
+  return msecs;
+}
+
+typedef BOOL (WINAPI *PGetProcessMemoryInfo)(HANDLE, PPROCESS_MEMORY_COUNTERS, DWORD);
+static PGetProcessMemoryInfo pGetProcessMemoryInfo = NULL;
+
+void _mi_prim_process_info(mi_process_info_t* pinfo)
+{
+  FILETIME ct;
+  FILETIME ut;
+  FILETIME st;
+  FILETIME et;
+  GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut);
+  pinfo->utime = filetime_msecs(&ut);
+  pinfo->stime = filetime_msecs(&st);
+  
+  // load psapi on demand
+  if (pGetProcessMemoryInfo == NULL) {
+    HINSTANCE hDll = LoadLibrary(TEXT("psapi.dll"));
+    if (hDll != NULL) {
+      pGetProcessMemoryInfo = (PGetProcessMemoryInfo)(void (*)(void))GetProcAddress(hDll, "GetProcessMemoryInfo");
+    }
+  }
+
+  // get process info
+  PROCESS_MEMORY_COUNTERS info;
+  memset(&info, 0, sizeof(info));
+  if (pGetProcessMemoryInfo != NULL) {
+    pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
+  } 
+  pinfo->current_rss    = (size_t)info.WorkingSetSize;
+  pinfo->peak_rss       = (size_t)info.PeakWorkingSetSize;
+  pinfo->current_commit = (size_t)info.PagefileUsage;
+  pinfo->peak_commit    = (size_t)info.PeakPagefileUsage;
+  pinfo->page_faults    = (size_t)info.PageFaultCount;
+}
+
+//----------------------------------------------------------------
+// Output
+//----------------------------------------------------------------
+
+void _mi_prim_out_stderr( const char* msg ) 
+{
+  // on windows with redirection, the C runtime cannot handle locale dependent output
+  // after the main thread closes so we use direct console output.
+  if (!_mi_preloading()) {
+    // _cputs(msg);  // _cputs cannot be used at is aborts if it fails to lock the console
+    static HANDLE hcon = INVALID_HANDLE_VALUE;
+    static bool hconIsConsole;
+    if (hcon == INVALID_HANDLE_VALUE) {
+      CONSOLE_SCREEN_BUFFER_INFO sbi;
+      hcon = GetStdHandle(STD_ERROR_HANDLE);
+      hconIsConsole = ((hcon != INVALID_HANDLE_VALUE) && GetConsoleScreenBufferInfo(hcon, &sbi));
+    }
+    const size_t len = _mi_strlen(msg);
+    if (len > 0 && len < UINT32_MAX) {
+      DWORD written = 0;
+      if (hconIsConsole) {
+        WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL);
+      }
+      else if (hcon != INVALID_HANDLE_VALUE) {
+        // use direct write if stderr was redirected
+        WriteFile(hcon, msg, (DWORD)len, &written, NULL);
+      }
+      else {
+        // finally fall back to fputs after all
+        fputs(msg, stderr);
+      }
+    }
+  }
+}
+
+
+//----------------------------------------------------------------
+// Environment
+//----------------------------------------------------------------
+
+// On Windows use GetEnvironmentVariable instead of getenv to work
+// reliably even when this is invoked before the C runtime is initialized.
+// i.e. when `_mi_preloading() == true`.
+// Note: on windows, environment names are not case sensitive.
+bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
+  result[0] = 0;
+  size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size);
+  return (len > 0 && len < result_size);
+}
+
+
+
+//----------------------------------------------------------------
+// Random
+//----------------------------------------------------------------
+
+#if defined(MI_USE_RTLGENRANDOM) // || defined(__cplusplus)
+// We prefer to use BCryptGenRandom instead of (the unofficial) RtlGenRandom but when using
+// dynamic overriding, we observed it can raise an exception when compiled with C++, and
+// sometimes deadlocks when also running under the VS debugger.
+// In contrast, issue #623 implies that on Windows Server 2019 we need to use BCryptGenRandom.
+// To be continued..
+#pragma comment (lib,"advapi32.lib")
+#define RtlGenRandom  SystemFunction036
+mi_decl_externc BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength);
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {
+  return (RtlGenRandom(buf, (ULONG)buf_len) != 0);
+}
+
+#else
+
+#ifndef BCRYPT_USE_SYSTEM_PREFERRED_RNG
+#define BCRYPT_USE_SYSTEM_PREFERRED_RNG 0x00000002
+#endif
+
+typedef LONG (NTAPI *PBCryptGenRandom)(HANDLE, PUCHAR, ULONG, ULONG);
+static  PBCryptGenRandom pBCryptGenRandom = NULL;
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {
+  if (pBCryptGenRandom == NULL) {
+    HINSTANCE hDll = LoadLibrary(TEXT("bcrypt.dll"));
+    if (hDll != NULL) {
+      pBCryptGenRandom = (PBCryptGenRandom)(void (*)(void))GetProcAddress(hDll, "BCryptGenRandom");
+    }
+    if (pBCryptGenRandom == NULL) return false;
+  }
+  return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0);  
+}
+
+#endif  // MI_USE_RTLGENRANDOM
+
+//----------------------------------------------------------------
+// Thread init/done
+//----------------------------------------------------------------
+
+#if !defined(MI_SHARED_LIB)
+
+// use thread local storage keys to detect thread ending
+#include <fibersapi.h>
+#if (_WIN32_WINNT < 0x600)  // before Windows Vista
+WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback );
+WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex );
+WINBASEAPI BOOL  WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData );
+WINBASEAPI BOOL  WINAPI FlsFree(_In_ DWORD dwFlsIndex);
+#endif
+
+static DWORD mi_fls_key = (DWORD)(-1);
+
+static void NTAPI mi_fls_done(PVOID value) {
+  mi_heap_t* heap = (mi_heap_t*)value;
+  if (heap != NULL) {
+    _mi_thread_done(heap);
+    FlsSetValue(mi_fls_key, NULL);  // prevent recursion as _mi_thread_done may set it back to the main heap, issue #672
+  }
+}
+
+void _mi_prim_thread_init_auto_done(void) {
+  mi_fls_key = FlsAlloc(&mi_fls_done);
+}
+
+void _mi_prim_thread_done_auto_done(void) {
+  // call thread-done on all threads (except the main thread) to prevent 
+  // dangling callback pointer if statically linked with a DLL; Issue #208
+  FlsFree(mi_fls_key);  
+}
+
+void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
+  mi_assert_internal(mi_fls_key != (DWORD)(-1));
+  FlsSetValue(mi_fls_key, heap);
+}
+
+#else
+
+// Dll; nothing to do as in that case thread_done is handled through the DLL_THREAD_DETACH event.
+
+void _mi_prim_thread_init_auto_done(void) {
+}
+
+void _mi_prim_thread_done_auto_done(void) {
+}
+
+void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
+  MI_UNUSED(heap);
+}
+
+#endif
diff --git a/Objects/mimalloc/prim/windows/readme.md b/Objects/mimalloc/prim/windows/readme.md
new file mode 100644
index 00000000000000..217c3d174db473
--- /dev/null
+++ b/Objects/mimalloc/prim/windows/readme.md
@@ -0,0 +1,17 @@
+## Primitives:
+
+- `prim.c` contains Windows primitives for OS allocation.
+
+## Event Tracing for Windows (ETW)
+
+- `etw.h` is generated from `etw.man` which contains the manifest for mimalloc events.
+  (100 is an allocation, 101 is for a free)
+
+- `etw-mimalloc.wprp` is a profile for the Windows Performance Recorder (WPR).
+  In an admin prompt, you can use:
+  ```
+  > wpr -start src\prim\windows\etw-mimalloc.wprp -filemode
+  > <my mimalloc program>
+  > wpr -stop test.etl
+  ``` 
+  and then open `test.etl` in the Windows Performance Analyzer (WPA).
\ No newline at end of file
diff --git a/Objects/mimalloc/random.c b/Objects/mimalloc/random.c
index 06d4ba4ad67a98..4fc8b2f8fb0bc3 100644
--- a/Objects/mimalloc/random.c
+++ b/Objects/mimalloc/random.c
@@ -4,14 +4,10 @@ This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
-#ifndef _DEFAULT_SOURCE
-#define _DEFAULT_SOURCE   // for syscall() on Linux
-#endif
-
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-
-#include <string.h> // memset
+#include "mimalloc/internal.h"
+#include "mimalloc/prim.h"    // _mi_prim_random_buf
+#include <string.h>       // memset
 
 /* ----------------------------------------------------------------------------
 We use our own PRNG to keep predictable performance of random number generation
@@ -158,159 +154,13 @@ uintptr_t _mi_random_next(mi_random_ctx_t* ctx) {
 
 
 /* ----------------------------------------------------------------------------
-To initialize a fresh random context we rely on the OS:
-- Windows     : BCryptGenRandom (or RtlGenRandom)
-- macOS       : CCRandomGenerateBytes, arc4random_buf
-- bsd,wasi    : arc4random_buf
-- Linux       : getrandom,/dev/urandom
+To initialize a fresh random context.
 If we cannot get good randomness, we fall back to weak randomness based on a timer and ASLR.
 -----------------------------------------------------------------------------*/
 
-#if defined(_WIN32)
-
-#if defined(MI_USE_RTLGENRANDOM) // || defined(__cplusplus)
-// We prefer to use BCryptGenRandom instead of (the unofficial) RtlGenRandom but when using
-// dynamic overriding, we observed it can raise an exception when compiled with C++, and
-// sometimes deadlocks when also running under the VS debugger.
-// In contrast, issue #623 implies that on Windows Server 2019 we need to use BCryptGenRandom.
-// To be continued..
-#pragma comment (lib,"advapi32.lib")
-#define RtlGenRandom  SystemFunction036
-#ifdef __cplusplus
-extern "C" {
-#endif
-BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength);
-#ifdef __cplusplus
-}
-#endif
-static bool os_random_buf(void* buf, size_t buf_len) {
-  return (RtlGenRandom(buf, (ULONG)buf_len) != 0);
-}
-#else
-
-#ifndef BCRYPT_USE_SYSTEM_PREFERRED_RNG
-#define BCRYPT_USE_SYSTEM_PREFERRED_RNG 0x00000002
-#endif
-
-typedef LONG (NTAPI *PBCryptGenRandom)(HANDLE, PUCHAR, ULONG, ULONG);
-static  PBCryptGenRandom pBCryptGenRandom = NULL;
-
-static bool os_random_buf(void* buf, size_t buf_len) {
-  if (pBCryptGenRandom == NULL) {
-    HINSTANCE hDll = LoadLibrary(TEXT("bcrypt.dll"));
-    if (hDll != NULL) {
-      pBCryptGenRandom = (PBCryptGenRandom)(void (*)(void))GetProcAddress(hDll, "BCryptGenRandom");
-    }
-  }
-  if (pBCryptGenRandom == NULL) {
-    return false;
-  }
-  else {
-    return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0);
-  }
-}
-#endif
-
-#elif defined(__APPLE__)
-#include <AvailabilityMacros.h>
-#if defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10
-#include <CommonCrypto/CommonCryptoError.h>
-#include <CommonCrypto/CommonRandom.h>
-#endif
-static bool os_random_buf(void* buf, size_t buf_len) {
-  #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15
-    // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf
-    // may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html>
-    return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);
-  #else
-    // fall back on older macOS
-    arc4random_buf(buf, buf_len);
-    return true;
-  #endif
-}
-
-#elif defined(__ANDROID__) || defined(__DragonFly__) || \
-      defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
-      defined(__sun) // todo: what to use with __wasi__?
-#include <stdlib.h>
-static bool os_random_buf(void* buf, size_t buf_len) {
-  arc4random_buf(buf, buf_len);
-  return true;
-}
-#elif defined(__linux__) || defined(__HAIKU__)
-#if defined(__linux__)
-#include <sys/syscall.h>
-#endif
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <errno.h>
-static bool os_random_buf(void* buf, size_t buf_len) {
-  // Modern Linux provides `getrandom` but different distributions either use `sys/random.h` or `linux/random.h`
-  // and for the latter the actual `getrandom` call is not always defined.
-  // (see <https://stackoverflow.com/questions/45237324/why-doesnt-getrandom-compile>)
-  // We therefore use a syscall directly and fall back dynamically to /dev/urandom when needed.
-#ifdef SYS_getrandom
-  #ifndef GRND_NONBLOCK
-  #define GRND_NONBLOCK (1)
-  #endif
-  static _Atomic(uintptr_t) no_getrandom; // = 0
-  if (mi_atomic_load_acquire(&no_getrandom)==0) {
-    ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK);
-    if (ret >= 0) return (buf_len == (size_t)ret);
-    if (errno != ENOSYS) return false;
-    mi_atomic_store_release(&no_getrandom, 1UL); // don't call again, and fall back to /dev/urandom
-  }
-#endif
-  int flags = O_RDONLY;
-  #if defined(O_CLOEXEC)
-  flags |= O_CLOEXEC;
-  #endif
-  int fd = open("/dev/urandom", flags, 0);
-  if (fd < 0) return false;
-  size_t count = 0;
-  while(count < buf_len) {
-    ssize_t ret = read(fd, (char*)buf + count, buf_len - count);
-    if (ret<=0) {
-      if (errno!=EAGAIN && errno!=EINTR) break;
-    }
-    else {
-      count += ret;
-    }
-  }
-  close(fd);
-  return (count==buf_len);
-}
-#else
-static bool os_random_buf(void* buf, size_t buf_len) {
-  return false;
-}
-#endif
-
-#if defined(_WIN32)
-#include <windows.h>
-#elif defined(__APPLE__)
-#include <mach/mach_time.h>
-#else
-#include <time.h>
-#endif
-
 uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
   uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random
-
-  #if defined(_WIN32)
-    LARGE_INTEGER pcount;
-    QueryPerformanceCounter(&pcount);
-    x ^= (uintptr_t)(pcount.QuadPart);
-  #elif defined(__APPLE__)
-    x ^= (uintptr_t)mach_absolute_time();
-  #else
-    struct timespec time;
-    clock_gettime(CLOCK_MONOTONIC, &time);
-    x ^= (uintptr_t)time.tv_sec;
-    x ^= (uintptr_t)time.tv_nsec;
-  #endif
+  x ^= _mi_prim_clock_now();  
   // and do a few randomization steps
   uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
   for (uintptr_t i = 0; i < max; i++) {
@@ -322,7 +172,7 @@ uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
 
 static void mi_random_init_ex(mi_random_ctx_t* ctx, bool use_weak) {
   uint8_t key[32];
-  if (use_weak || !os_random_buf(key, sizeof(key))) {
+  if (use_weak || !_mi_prim_random_buf(key, sizeof(key))) {
     // if we fail to get random data from the OS, we fall back to a
     // weak random source based on the current time
     #if !defined(__wasi__)
diff --git a/Objects/mimalloc/region.c b/Objects/mimalloc/region.c
deleted file mode 100644
index 3571abb602bb98..00000000000000
--- a/Objects/mimalloc/region.c
+++ /dev/null
@@ -1,516 +0,0 @@
-/* ----------------------------------------------------------------------------
-Copyright (c) 2019-2020, Microsoft Research, Daan Leijen
-This is free software; you can redistribute it and/or modify it under the
-terms of the MIT license. A copy of the license can be found in the file
-"LICENSE" at the root of this distribution.
------------------------------------------------------------------------------*/
-
-/* ----------------------------------------------------------------------------
-This implements a layer between the raw OS memory (VirtualAlloc/mmap/sbrk/..)
-and the segment and huge object allocation by mimalloc. There may be multiple
-implementations of this (one could be the identity going directly to the OS,
-another could be a simple cache etc), but the current one uses large "regions".
-In contrast to the rest of mimalloc, the "regions" are shared between threads and
-need to be accessed using atomic operations.
-We need this memory layer between the raw OS calls because of:
-1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order
-   to reuse memory effectively.
-2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of
-   an OS allocation/free is still (much) too expensive relative to the accesses
-   in that object :-( (`malloc-large` tests this). This means we need a cheaper
-   way to reuse memory.
-3. This layer allows for NUMA aware allocation.
-
-Possible issues:
-- (2) can potentially be addressed too with a small cache per thread which is much
-  simpler. Generally though that requires shrinking of huge pages, and may overuse
-  memory per thread. (and is not compatible with `sbrk`).
-- Since the current regions are per-process, we need atomic operations to
-  claim blocks which may be contended
-- In the worst case, we need to search the whole region map (16KiB for 256GiB)
-  linearly. At what point will direct OS calls be faster? Is there a way to
-  do this better without adding too much complexity?
------------------------------------------------------------------------------*/
-#include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
-
-#include <string.h>  // memset
-
-#include "bitmap.h"
-
-// Internal raw OS interface
-size_t  _mi_os_large_page_size(void);
-bool    _mi_os_protect(void* addr, size_t size);
-bool    _mi_os_unprotect(void* addr, size_t size);
-bool    _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
-bool    _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
-bool    _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
-bool    _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
-bool    _mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats);
-
-// arena.c
-mi_arena_id_t _mi_arena_id_none(void);
-void    _mi_arena_free(void* p, size_t size, size_t alignment, size_t align_offset, size_t memid, bool all_committed, mi_stats_t* stats);
-void*   _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld);
-void*   _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld);
-
-
-
-// Constants
-#if (MI_INTPTR_SIZE==8)
-#define MI_HEAP_REGION_MAX_SIZE    (256 * MI_GiB)  // 64KiB for the region map
-#elif (MI_INTPTR_SIZE==4)
-#define MI_HEAP_REGION_MAX_SIZE    (3 * MI_GiB)    // ~ KiB for the region map
-#else
-#error "define the maximum heap space allowed for regions on this platform"
-#endif
-
-#define MI_REGION_MAX_BLOCKS      MI_BITMAP_FIELD_BITS
-#define MI_REGION_SIZE            (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS)    // 256MiB  (64MiB on 32 bits)
-#define MI_REGION_MAX             (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE)  // 1024  (48 on 32 bits)
-#define MI_REGION_MAX_OBJ_BLOCKS  (MI_REGION_MAX_BLOCKS/4)                    // 64MiB
-#define MI_REGION_MAX_OBJ_SIZE    (MI_REGION_MAX_OBJ_BLOCKS*MI_SEGMENT_SIZE)
-
-// Region info
-typedef union mi_region_info_u {
-  size_t value;
-  struct {
-    bool  valid;        // initialized?
-    bool  is_large:1;   // allocated in fixed large/huge OS pages
-    bool  is_pinned:1;  // pinned memory cannot be decommitted
-    short numa_node;    // the associated NUMA node (where -1 means no associated node)
-  } x;
-} mi_region_info_t;
-
-
-// A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with
-// a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block.
-typedef struct mem_region_s {
-  _Atomic(size_t)           info;        // mi_region_info_t.value
-  _Atomic(void*)            start;       // start of the memory area
-  mi_bitmap_field_t         in_use;      // bit per in-use block
-  mi_bitmap_field_t         dirty;       // track if non-zero per block
-  mi_bitmap_field_t         commit;      // track if committed per block
-  mi_bitmap_field_t         reset;       // track if reset per block
-  _Atomic(size_t)           arena_memid; // if allocated from a (huge page) arena
-  _Atomic(size_t)           padding;     // round to 8 fields (needs to be atomic for msvc, see issue #508)
-} mem_region_t;
-
-// The region map
-static mem_region_t regions[MI_REGION_MAX];
-
-// Allocated regions
-static _Atomic(size_t) regions_count; // = 0;
-
-
-/* ----------------------------------------------------------------------------
-Utility functions
------------------------------------------------------------------------------*/
-
-// Blocks (of 4MiB) needed for the given size.
-static size_t mi_region_block_count(size_t size) {
-  return _mi_divide_up(size, MI_SEGMENT_SIZE);
-}
-
-/*
-// Return a rounded commit/reset size such that we don't fragment large OS pages into small ones.
-static size_t mi_good_commit_size(size_t size) {
-  if (size > (SIZE_MAX - _mi_os_large_page_size())) return size;
-  return _mi_align_up(size, _mi_os_large_page_size());
-}
-*/
-
-// Return if a pointer points into a region reserved by us.
-mi_decl_nodiscard bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
-  if (p==NULL) return false;
-  size_t count = mi_atomic_load_relaxed(&regions_count);
-  for (size_t i = 0; i < count; i++) {
-    uint8_t* start = (uint8_t*)mi_atomic_load_ptr_relaxed(uint8_t, &regions[i].start);
-    if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true;
-  }
-  return false;
-}
-
-
-static void* mi_region_blocks_start(const mem_region_t* region, mi_bitmap_index_t bit_idx) {
-  uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t, &((mem_region_t*)region)->start);
-  mi_assert_internal(start != NULL);
-  return (start + (bit_idx * MI_SEGMENT_SIZE));
-}
-
-static size_t mi_memid_create(mem_region_t* region, mi_bitmap_index_t bit_idx) {
-  mi_assert_internal(bit_idx < MI_BITMAP_FIELD_BITS);
-  size_t idx = region - regions;
-  mi_assert_internal(&regions[idx] == region);
-  return (idx*MI_BITMAP_FIELD_BITS + bit_idx)<<1;
-}
-
-static size_t mi_memid_create_from_arena(size_t arena_memid) {
-  return (arena_memid << 1) | 1;
-}
-
-
-static bool mi_memid_is_arena(size_t id, mem_region_t** region, mi_bitmap_index_t* bit_idx, size_t* arena_memid) {
-  if ((id&1)==1) {
-    if (arena_memid != NULL) *arena_memid = (id>>1);
-    return true;
-  }
-  else {
-    size_t idx = (id >> 1) / MI_BITMAP_FIELD_BITS;
-    *bit_idx   = (mi_bitmap_index_t)(id>>1) % MI_BITMAP_FIELD_BITS;
-    *region    = &regions[idx];
-    return false;
-  }
-}
-
-
-/* ----------------------------------------------------------------------------
-  Allocate a region is allocated from the OS (or an arena)
------------------------------------------------------------------------------*/
-
-static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
-{
-  // not out of regions yet?
-  if (mi_atomic_load_relaxed(&regions_count) >= MI_REGION_MAX - 1) return false;
-
-  // try to allocate a fresh region from the OS
-  bool region_commit = (commit && mi_option_is_enabled(mi_option_eager_region_commit));
-  bool region_large = (commit && allow_large);
-  bool is_zero = false;
-  bool is_pinned = false;
-  size_t arena_memid = 0;
-  void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, 0, &region_commit, &region_large, &is_pinned, &is_zero, _mi_arena_id_none(),  & arena_memid, tld);
-  if (start == NULL) return false;
-  mi_assert_internal(!(region_large && !allow_large));
-  mi_assert_internal(!region_large || region_commit);
-
-  // claim a fresh slot
-  const size_t idx = mi_atomic_increment_acq_rel(&regions_count);
-  if (idx >= MI_REGION_MAX) {
-    mi_atomic_decrement_acq_rel(&regions_count);
-    _mi_arena_free(start, MI_REGION_SIZE, MI_SEGMENT_ALIGN, 0, arena_memid, region_commit, tld->stats);
-    _mi_warning_message("maximum regions used: %zu GiB (perhaps recompile with a larger setting for MI_HEAP_REGION_MAX_SIZE)", _mi_divide_up(MI_HEAP_REGION_MAX_SIZE, MI_GiB));
-    return false;
-  }
-
-  // allocated, initialize and claim the initial blocks
-  mem_region_t* r = &regions[idx];
-  r->arena_memid  = arena_memid;
-  mi_atomic_store_release(&r->in_use, (size_t)0);
-  mi_atomic_store_release(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL));
-  mi_atomic_store_release(&r->commit, (region_commit ? MI_BITMAP_FIELD_FULL : 0));
-  mi_atomic_store_release(&r->reset, (size_t)0);
-  *bit_idx = 0;
-  _mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL);
-  mi_atomic_store_ptr_release(void,&r->start, start);
-
-  // and share it
-  mi_region_info_t info;
-  info.value = 0;                        // initialize the full union to zero
-  info.x.valid = true;
-  info.x.is_large = region_large;
-  info.x.is_pinned = is_pinned;
-  info.x.numa_node = (short)_mi_os_numa_node(tld);
-  mi_atomic_store_release(&r->info, info.value); // now make it available to others
-  *region = r;
-  return true;
-}
-
-/* ----------------------------------------------------------------------------
-  Try to claim blocks in suitable regions
------------------------------------------------------------------------------*/
-
-static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool allow_large ) {
-  // initialized at all?
-  mi_region_info_t info;
-  info.value = mi_atomic_load_relaxed(&((mem_region_t*)region)->info);
-  if (info.value==0) return false;
-
-  // numa correct
-  if (numa_node >= 0) {  // use negative numa node to always succeed
-    int rnode = info.x.numa_node;
-    if (rnode >= 0 && rnode != numa_node) return false;
-  }
-
-  // check allow-large
-  if (!allow_large && info.x.is_large) return false;
-
-  return true;
-}
-
-
-static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
-{
-  // try all regions for a free slot
-  const size_t count = mi_atomic_load_relaxed(&regions_count); // monotonic, so ok to be relaxed
-  size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? Starting at 0 seems to increase latency though
-  for (size_t visited = 0; visited < count; visited++, idx++) {
-    if (idx >= count) idx = 0;  // wrap around
-    mem_region_t* r = &regions[idx];
-    // if this region suits our demand (numa node matches, large OS page matches)
-    if (mi_region_is_suitable(r, numa_node, allow_large)) {
-      // then try to atomically claim a segment(s) in this region
-      if (_mi_bitmap_try_find_claim_field(&r->in_use, 0, blocks, bit_idx)) {
-        tld->region_idx = idx;    // remember the last found position
-        *region = r;
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-
-static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
-{
-  mi_assert_internal(blocks <= MI_BITMAP_FIELD_BITS);
-  mem_region_t* region;
-  mi_bitmap_index_t bit_idx;
-  const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld));
-  // try to claim in existing regions
-  if (!mi_region_try_claim(numa_node, blocks, *large, &region, &bit_idx, tld)) {
-    // otherwise try to allocate a fresh region and claim in there
-    if (!mi_region_try_alloc_os(blocks, *commit, *large, &region, &bit_idx, tld)) {
-      // out of regions or memory
-      return NULL;
-    }
-  }
-
-  // ------------------------------------------------
-  // found a region and claimed `blocks` at `bit_idx`, initialize them now
-  mi_assert_internal(region != NULL);
-  mi_assert_internal(_mi_bitmap_is_claimed(&region->in_use, 1, blocks, bit_idx));
-
-  mi_region_info_t info;
-  info.value = mi_atomic_load_acquire(&region->info);
-  uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,&region->start);
-  mi_assert_internal(!(info.x.is_large && !*large));
-  mi_assert_internal(start != NULL);
-
-  *is_zero   = _mi_bitmap_claim(&region->dirty, 1, blocks, bit_idx, NULL);
-  *large     = info.x.is_large;
-  *is_pinned = info.x.is_pinned;
-  *memid     = mi_memid_create(region, bit_idx);
-  void* p = start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE);
-
-  // commit
-  if (*commit) {
-    // ensure commit
-    bool any_uncommitted;
-    _mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, &any_uncommitted);
-    if (any_uncommitted) {
-      mi_assert_internal(!info.x.is_large && !info.x.is_pinned);
-      bool commit_zero = false;
-      if (!_mi_mem_commit(p, blocks * MI_SEGMENT_SIZE, &commit_zero, tld)) {
-        // failed to commit! unclaim and return
-        mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx);
-        return NULL;
-      }
-      if (commit_zero) *is_zero = true;
-    }
-  }
-  else {
-    // no need to commit, but check if already fully committed
-    *commit = _mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx);
-  }
-  mi_assert_internal(!*commit || _mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx));
-
-  // unreset reset blocks
-  if (_mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx)) {
-    // some blocks are still reset
-    mi_assert_internal(!info.x.is_large && !info.x.is_pinned);
-    mi_assert_internal(!mi_option_is_enabled(mi_option_eager_commit) || *commit || mi_option_get(mi_option_eager_commit_delay) > 0);
-    mi_bitmap_unclaim(&region->reset, 1, blocks, bit_idx);
-    if (*commit || !mi_option_is_enabled(mi_option_reset_decommits)) { // only if needed
-      bool reset_zero = false;
-      _mi_mem_unreset(p, blocks * MI_SEGMENT_SIZE, &reset_zero, tld);
-      if (reset_zero) *is_zero = true;
-    }
-  }
-  mi_assert_internal(!_mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx));
-
-  #if (MI_DEBUG>=2) && !MI_TRACK_ENABLED
-  if (*commit) { ((uint8_t*)p)[0] = 0; }
-  #endif
-
-  // and return the allocation
-  mi_assert_internal(p != NULL);
-  return p;
-}
-
-
-/* ----------------------------------------------------------------------------
- Allocation
------------------------------------------------------------------------------*/
-
-// Allocate `size` memory aligned at `alignment`. Return non NULL on success, with a given memory `id`.
-// (`id` is abstract, but `id = idx*MI_REGION_MAP_BITS + bitidx`)
-void* _mi_mem_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
-{
-  mi_assert_internal(memid != NULL && tld != NULL);
-  mi_assert_internal(size > 0);
-  *memid = 0;
-  *is_zero = false;
-  *is_pinned = false;
-  bool default_large = false;
-  if (large==NULL) large = &default_large;  // ensure `large != NULL`
-  if (size == 0) return NULL;
-  size = _mi_align_up(size, _mi_os_page_size());
-
-  // allocate from regions if possible
-  void* p = NULL;
-  size_t arena_memid;
-  const size_t blocks = mi_region_block_count(size);
-  if (blocks <= MI_REGION_MAX_OBJ_BLOCKS && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) {
-    p = mi_region_try_alloc(blocks, commit, large, is_pinned, is_zero, memid, tld);
-    if (p == NULL) {
-      _mi_warning_message("unable to allocate from region: size %zu\n", size);
-    }
-  }
-  if (p == NULL) {
-    // and otherwise fall back to the OS
-    p = _mi_arena_alloc_aligned(size, alignment, align_offset, commit, large, is_pinned, is_zero, _mi_arena_id_none(),  & arena_memid, tld);
-    *memid = mi_memid_create_from_arena(arena_memid);
-  }
-
-  if (p != NULL) {
-    mi_assert_internal(((uintptr_t)p + align_offset) % alignment == 0);
-    #if (MI_DEBUG>=2) && !MI_TRACK_ENABLED
-    if (*commit) { ((uint8_t*)p)[0] = 0; } // ensure the memory is committed
-    #endif
-  }
-  return p;
-}
-
-
-
-/* ----------------------------------------------------------------------------
-Free
------------------------------------------------------------------------------*/
-
-// Free previously allocated memory with a given id.
-void _mi_mem_free(void* p, size_t size, size_t alignment, size_t align_offset, size_t id, bool full_commit, bool any_reset, mi_os_tld_t* tld) {
-  mi_assert_internal(size > 0 && tld != NULL);
-  if (p==NULL) return;
-  if (size==0) return;
-  size = _mi_align_up(size, _mi_os_page_size());
-
-  size_t arena_memid = 0;
-  mi_bitmap_index_t bit_idx;
-  mem_region_t* region;
-  if (mi_memid_is_arena(id,&region,&bit_idx,&arena_memid)) {
-   // was a direct arena allocation, pass through
-    _mi_arena_free(p, size, alignment, align_offset, arena_memid, full_commit, tld->stats);
-  }
-  else {
-    // allocated in a region
-    mi_assert_internal(align_offset == 0);
-    mi_assert_internal(size <= MI_REGION_MAX_OBJ_SIZE); if (size > MI_REGION_MAX_OBJ_SIZE) return;
-    const size_t blocks = mi_region_block_count(size);
-    mi_assert_internal(blocks + bit_idx <= MI_BITMAP_FIELD_BITS);
-    mi_region_info_t info;
-    info.value = mi_atomic_load_acquire(&region->info);
-    mi_assert_internal(info.value != 0);
-    void* blocks_start = mi_region_blocks_start(region, bit_idx);
-    mi_assert_internal(blocks_start == p); // not a pointer in our area?
-    mi_assert_internal(bit_idx + blocks <= MI_BITMAP_FIELD_BITS);
-    if (blocks_start != p || bit_idx + blocks > MI_BITMAP_FIELD_BITS) return; // or `abort`?
-
-    // committed?
-    if (full_commit && (size % MI_SEGMENT_SIZE) == 0) {
-      _mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, NULL);
-    }
-
-    if (any_reset) {
-      // set the is_reset bits if any pages were reset
-      _mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, NULL);
-    }
-
-    // reset the blocks to reduce the working set.
-    if (!info.x.is_large && !info.x.is_pinned && mi_option_is_enabled(mi_option_segment_reset)
-       && (mi_option_is_enabled(mi_option_eager_commit) ||
-           mi_option_is_enabled(mi_option_reset_decommits))) // cannot reset halfway committed segments, use only `option_page_reset` instead
-    {
-      bool any_unreset;
-      _mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, &any_unreset);
-      if (any_unreset) {
-        _mi_abandoned_await_readers(); // ensure no more pending write (in case reset = decommit)
-        _mi_mem_reset(p, blocks * MI_SEGMENT_SIZE, tld);
-      }
-    }
-
-    // and unclaim
-    bool all_unclaimed = mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx);
-    mi_assert_internal(all_unclaimed); MI_UNUSED(all_unclaimed);
-  }
-}
-
-
-/* ----------------------------------------------------------------------------
-  collection
------------------------------------------------------------------------------*/
-void _mi_mem_collect(mi_os_tld_t* tld) {
-  // free every region that has no segments in use.
-  size_t rcount = mi_atomic_load_relaxed(&regions_count);
-  for (size_t i = 0; i < rcount; i++) {
-    mem_region_t* region = &regions[i];
-    if (mi_atomic_load_relaxed(&region->info) != 0) {
-      // if no segments used, try to claim the whole region
-      size_t m = mi_atomic_load_relaxed(&region->in_use);
-      while (m == 0 && !mi_atomic_cas_weak_release(&region->in_use, &m, MI_BITMAP_FIELD_FULL)) { /* nothing */ };
-      if (m == 0) {
-        // on success, free the whole region
-        uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,&regions[i].start);
-        size_t arena_memid = mi_atomic_load_relaxed(&regions[i].arena_memid);
-        size_t commit = mi_atomic_load_relaxed(&regions[i].commit);
-        memset((void*)&regions[i], 0, sizeof(mem_region_t));  // cast to void* to avoid atomic warning
-        // and release the whole region
-        mi_atomic_store_release(&region->info, (size_t)0);
-        if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) {
-          _mi_abandoned_await_readers(); // ensure no pending reads
-          _mi_arena_free(start, MI_REGION_SIZE, MI_SEGMENT_ALIGN, 0, arena_memid, (~commit == 0), tld->stats);
-        }
-      }
-    }
-  }
-}
-
-
-/* ----------------------------------------------------------------------------
-  Other
------------------------------------------------------------------------------*/
-
-bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld) {
-  if (mi_option_is_enabled(mi_option_reset_decommits)) {
-    return _mi_os_decommit(p, size, tld->stats);
-  }
-  else {
-    return _mi_os_reset(p, size, tld->stats);
-  }
-}
-
-bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) {
-  if (mi_option_is_enabled(mi_option_reset_decommits)) {
-    return _mi_os_commit(p, size, is_zero, tld->stats);
-  }
-  else {
-    return _mi_os_unreset(p, size, is_zero, tld->stats);
-  }
-}
-
-bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) {
-  return _mi_os_commit(p, size, is_zero, tld->stats);
-}
-
-bool _mi_mem_decommit(void* p, size_t size, mi_os_tld_t* tld) {
-  return _mi_os_decommit(p, size, tld->stats);
-}
-
-bool _mi_mem_protect(void* p, size_t size) {
-  return _mi_os_protect(p, size);
-}
-
-bool _mi_mem_unprotect(void* p, size_t size) {
-  return _mi_os_unprotect(p, size);
-}
diff --git a/Objects/mimalloc/segment-cache.c b/Objects/mimalloc/segment-cache.c
deleted file mode 100644
index d93fd644140ff2..00000000000000
--- a/Objects/mimalloc/segment-cache.c
+++ /dev/null
@@ -1,409 +0,0 @@
-/* ----------------------------------------------------------------------------
-Copyright (c) 2020, Microsoft Research, Daan Leijen
-This is free software; you can redistribute it and/or modify it under the
-terms of the MIT license. A copy of the license can be found in the file
-"LICENSE" at the root of this distribution.
------------------------------------------------------------------------------*/
-
-/* ----------------------------------------------------------------------------
-  Implements a cache of segments to avoid expensive OS calls and to reuse
-  the commit_mask to optimize the commit/decommit calls.
-  The full memory map of all segments is also implemented here.
------------------------------------------------------------------------------*/
-#include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
-
-#include "bitmap.h"  // atomic bitmap
-
-//#define MI_CACHE_DISABLE 1    // define to completely disable the segment cache
-
-#define MI_CACHE_FIELDS     (16)
-#define MI_CACHE_MAX        (MI_BITMAP_FIELD_BITS*MI_CACHE_FIELDS)       // 1024 on 64-bit
-
-#define BITS_SET()          MI_ATOMIC_VAR_INIT(UINTPTR_MAX)
-#define MI_CACHE_BITS_SET   MI_INIT16(BITS_SET)                          // note: update if MI_CACHE_FIELDS changes
-
-typedef struct mi_cache_slot_s {
-  void*               p;
-  size_t              memid;
-  bool                is_pinned;
-  mi_commit_mask_t    commit_mask;
-  mi_commit_mask_t    decommit_mask;
-  _Atomic(mi_msecs_t) expire;
-} mi_cache_slot_t;
-
-static mi_decl_cache_align mi_cache_slot_t cache[MI_CACHE_MAX];    // = 0
-
-static mi_decl_cache_align mi_bitmap_field_t cache_available[MI_CACHE_FIELDS] = { MI_CACHE_BITS_SET };        // zero bit = available!
-static mi_decl_cache_align mi_bitmap_field_t cache_available_large[MI_CACHE_FIELDS] = { MI_CACHE_BITS_SET };
-static mi_decl_cache_align mi_bitmap_field_t cache_inuse[MI_CACHE_FIELDS];   // zero bit = free
-
-static bool mi_cdecl mi_segment_cache_is_suitable(mi_bitmap_index_t bitidx, void* arg) {
-  mi_arena_id_t req_arena_id = *((mi_arena_id_t*)arg);
-  mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
-  return _mi_arena_memid_is_suitable(slot->memid, req_arena_id);
-}
-
-mi_decl_noinline static void* mi_segment_cache_pop_ex(
-                              bool all_suitable,
-                              size_t size, mi_commit_mask_t* commit_mask, 
-                              mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, 
-                              mi_arena_id_t _req_arena_id, size_t* memid, mi_os_tld_t* tld)
-{
-#ifdef MI_CACHE_DISABLE
-  return NULL;
-#else
-
-  // only segment blocks
-  if (size != MI_SEGMENT_SIZE) return NULL;
-
-  // numa node determines start field
-  const int numa_node = _mi_os_numa_node(tld);
-  size_t start_field = 0;
-  if (numa_node > 0) {
-    start_field = (MI_CACHE_FIELDS / _mi_os_numa_node_count())*numa_node;
-    if (start_field >= MI_CACHE_FIELDS) start_field = 0;
-  }
-
-  // find an available slot
-  mi_bitmap_index_t bitidx = 0;
-  bool claimed = false;
-  mi_arena_id_t req_arena_id = _req_arena_id;
-  mi_bitmap_pred_fun_t pred_fun = (all_suitable ? NULL : &mi_segment_cache_is_suitable);  // cannot pass NULL as the arena may be exclusive itself; todo: do not put exclusive arenas in the cache?
-
-  if (*large) {  // large allowed?
-    claimed = _mi_bitmap_try_find_from_claim_pred(cache_available_large, MI_CACHE_FIELDS, start_field, 1, pred_fun, &req_arena_id, &bitidx);
-    if (claimed) *large = true;
-  }
-  if (!claimed) {
-    claimed = _mi_bitmap_try_find_from_claim_pred (cache_available, MI_CACHE_FIELDS, start_field, 1, pred_fun, &req_arena_id, &bitidx);
-    if (claimed) *large = false;
-  }
-
-  if (!claimed) return NULL;
-
-  // found a slot
-  mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
-  void* p = slot->p;
-  *memid = slot->memid;
-  *is_pinned = slot->is_pinned;
-  *is_zero = false;
-  *commit_mask = slot->commit_mask;     
-  *decommit_mask = slot->decommit_mask;
-  slot->p = NULL;
-  mi_atomic_storei64_release(&slot->expire,(mi_msecs_t)0);
-  
-  // mark the slot as free again
-  mi_assert_internal(_mi_bitmap_is_claimed(cache_inuse, MI_CACHE_FIELDS, 1, bitidx));
-  _mi_bitmap_unclaim(cache_inuse, MI_CACHE_FIELDS, 1, bitidx);
-  return p;
-#endif
-}
-
-
-mi_decl_noinline void* _mi_segment_cache_pop(size_t size, mi_commit_mask_t* commit_mask, mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t _req_arena_id, size_t* memid, mi_os_tld_t* tld)
-{
-  return mi_segment_cache_pop_ex(false, size, commit_mask, decommit_mask, large, is_pinned, is_zero, _req_arena_id, memid, tld);
-}
-
-static mi_decl_noinline void mi_commit_mask_decommit(mi_commit_mask_t* cmask, void* p, size_t total, mi_stats_t* stats)
-{
-  if (mi_commit_mask_is_empty(cmask)) {
-    // nothing
-  }
-  else if (mi_commit_mask_is_full(cmask)) {
-    _mi_os_decommit(p, total, stats);
-  }
-  else {
-    // todo: one call to decommit the whole at once?
-    mi_assert_internal((total%MI_COMMIT_MASK_BITS)==0);
-    size_t part = total/MI_COMMIT_MASK_BITS;
-    size_t idx;
-    size_t count;    
-    mi_commit_mask_foreach(cmask, idx, count) {
-      void*  start = (uint8_t*)p + (idx*part);
-      size_t size = count*part;
-      _mi_os_decommit(start, size, stats);
-    }
-    mi_commit_mask_foreach_end()
-  }
-  mi_commit_mask_create_empty(cmask);
-}
-
-#define MI_MAX_PURGE_PER_PUSH  (4)
-
-static mi_decl_noinline void mi_segment_cache_purge(bool visit_all, bool force, mi_os_tld_t* tld)
-{
-  MI_UNUSED(tld);
-  if (!mi_option_is_enabled(mi_option_allow_decommit)) return;
-  mi_msecs_t now = _mi_clock_now();
-  size_t purged = 0;
-  const size_t max_visits = (visit_all ? MI_CACHE_MAX /* visit all */ : MI_CACHE_FIELDS /* probe at most N (=16) slots */);
-  size_t idx              = (visit_all ? 0 : _mi_random_shuffle((uintptr_t)now) % MI_CACHE_MAX /* random start */ );
-  for (size_t visited = 0; visited < max_visits; visited++,idx++) {  // visit N slots
-    if (idx >= MI_CACHE_MAX) idx = 0; // wrap
-    mi_cache_slot_t* slot = &cache[idx];
-    mi_msecs_t expire = mi_atomic_loadi64_relaxed(&slot->expire);
-    if (expire != 0 && (force || now >= expire)) {  // racy read
-      // seems expired, first claim it from available
-      purged++;
-      mi_bitmap_index_t bitidx = mi_bitmap_index_create_from_bit(idx);
-      if (_mi_bitmap_claim(cache_available, MI_CACHE_FIELDS, 1, bitidx, NULL)) {
-        // was available, we claimed it
-        expire = mi_atomic_loadi64_acquire(&slot->expire);
-        if (expire != 0 && (force || now >= expire)) {  // safe read
-          // still expired, decommit it
-          mi_atomic_storei64_relaxed(&slot->expire,(mi_msecs_t)0);
-          mi_assert_internal(!mi_commit_mask_is_empty(&slot->commit_mask) && _mi_bitmap_is_claimed(cache_available_large, MI_CACHE_FIELDS, 1, bitidx));
-          _mi_abandoned_await_readers();  // wait until safe to decommit
-          // decommit committed parts
-          // TODO: instead of decommit, we could also free to the OS?
-          mi_commit_mask_decommit(&slot->commit_mask, slot->p, MI_SEGMENT_SIZE, tld->stats);
-          mi_commit_mask_create_empty(&slot->decommit_mask);
-        }
-        _mi_bitmap_unclaim(cache_available, MI_CACHE_FIELDS, 1, bitidx); // make it available again for a pop
-      }
-      if (!visit_all && purged > MI_MAX_PURGE_PER_PUSH) break;  // bound to no more than N purge tries per push
-    }
-  }
-}
-
-void _mi_segment_cache_collect(bool force, mi_os_tld_t* tld) {
-  if (force) {
-    // called on `mi_collect(true)` but not on thread termination    
-    _mi_segment_cache_free_all(tld);
-  }
-  else {
-    mi_segment_cache_purge(true /* visit all */, false /* don't force unexpired */, tld);
-  }
-}
-
-void _mi_segment_cache_free_all(mi_os_tld_t* tld) {
-  mi_commit_mask_t commit_mask;
-  mi_commit_mask_t decommit_mask;
-  bool is_pinned;
-  bool is_zero;
-  size_t memid;
-  const size_t size = MI_SEGMENT_SIZE;
-  // iterate twice: first large pages, then regular memory 
-  for (int i = 0; i < 2; i++) {
-    void* p;
-    do {
-      // keep popping and freeing the memory
-      bool large = (i == 0);  
-      p = mi_segment_cache_pop_ex(true /* all */, size, &commit_mask, &decommit_mask,
-                                  &large, &is_pinned, &is_zero, _mi_arena_id_none(), &memid, tld);
-      if (p != NULL) {
-        size_t csize = _mi_commit_mask_committed_size(&commit_mask, size);
-        if (csize > 0 && !is_pinned) _mi_stat_decrease(&_mi_stats_main.committed, csize);
-        _mi_arena_free(p, size, MI_SEGMENT_ALIGN, 0, memid, is_pinned /* pretend not committed to not double count decommits */, tld->stats);
-      }
-    } while (p != NULL);
-  }
-}
-
-mi_decl_noinline bool _mi_segment_cache_push(void* start, size_t size, size_t memid, const mi_commit_mask_t* commit_mask, const mi_commit_mask_t* decommit_mask, bool is_large, bool is_pinned, mi_os_tld_t* tld)
-{
-#ifdef MI_CACHE_DISABLE
-  return false;
-#else
-
-  // only for normal segment blocks
-  if (size != MI_SEGMENT_SIZE || ((uintptr_t)start % MI_SEGMENT_ALIGN) != 0) return false;
-
-  // numa node determines start field
-  int numa_node = _mi_os_numa_node(NULL);
-  size_t start_field = 0;
-  if (numa_node > 0) {
-    start_field = (MI_CACHE_FIELDS / _mi_os_numa_node_count())*numa_node;
-    if (start_field >= MI_CACHE_FIELDS) start_field = 0;
-  }
-
-  // purge expired entries
-  mi_segment_cache_purge(false /* limit purges to a constant N */, false /* don't force unexpired */, tld);
-
-  // find an available slot
-  mi_bitmap_index_t bitidx;
-  bool claimed = _mi_bitmap_try_find_from_claim(cache_inuse, MI_CACHE_FIELDS, start_field, 1, &bitidx);
-  if (!claimed) return false;
-
-  mi_assert_internal(_mi_bitmap_is_claimed(cache_available, MI_CACHE_FIELDS, 1, bitidx));
-  mi_assert_internal(_mi_bitmap_is_claimed(cache_available_large, MI_CACHE_FIELDS, 1, bitidx));
-#if MI_DEBUG>1
-  if (is_pinned || is_large) {
-    mi_assert_internal(mi_commit_mask_is_full(commit_mask));
-  }
-#endif
-
-  // set the slot
-  mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
-  slot->p = start;
-  slot->memid = memid;
-  slot->is_pinned = is_pinned;
-  mi_atomic_storei64_relaxed(&slot->expire,(mi_msecs_t)0);
-  slot->commit_mask = *commit_mask;
-  slot->decommit_mask = *decommit_mask;
-  if (!mi_commit_mask_is_empty(commit_mask) && !is_large && !is_pinned && mi_option_is_enabled(mi_option_allow_decommit)) {
-    long delay = mi_option_get(mi_option_segment_decommit_delay);
-    if (delay == 0) {
-      _mi_abandoned_await_readers(); // wait until safe to decommit
-      mi_commit_mask_decommit(&slot->commit_mask, start, MI_SEGMENT_SIZE, tld->stats);
-      mi_commit_mask_create_empty(&slot->decommit_mask);
-    }
-    else {
-      mi_atomic_storei64_release(&slot->expire, _mi_clock_now() + delay);
-    }
-  }
-
-  // make it available
-  _mi_bitmap_unclaim((is_large ? cache_available_large : cache_available), MI_CACHE_FIELDS, 1, bitidx);
-  return true;
-#endif
-}
-
-
-/* -----------------------------------------------------------
-  The following functions are to reliably find the segment or
-  block that encompasses any pointer p (or NULL if it is not
-  in any of our segments).
-  We maintain a bitmap of all memory with 1 bit per MI_SEGMENT_SIZE (64MiB)
-  set to 1 if it contains the segment meta data.
------------------------------------------------------------ */
-
-
-#if (MI_INTPTR_SIZE==8)
-#define MI_MAX_ADDRESS    ((size_t)20 << 40)  // 20TB
-#else
-#define MI_MAX_ADDRESS    ((size_t)2 << 30)   // 2Gb
-#endif
-
-#define MI_SEGMENT_MAP_BITS  (MI_MAX_ADDRESS / MI_SEGMENT_SIZE)
-#define MI_SEGMENT_MAP_SIZE  (MI_SEGMENT_MAP_BITS / 8)
-#define MI_SEGMENT_MAP_WSIZE (MI_SEGMENT_MAP_SIZE / MI_INTPTR_SIZE)
-
-static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1];  // 2KiB per TB with 64MiB segments
-
-static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) {
-  mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE?
-  if ((uintptr_t)segment >= MI_MAX_ADDRESS) {
-    *bitidx = 0;
-    return MI_SEGMENT_MAP_WSIZE;
-  }
-  else {
-    const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_SIZE;
-    *bitidx = segindex % MI_INTPTR_BITS;
-    const size_t mapindex = segindex / MI_INTPTR_BITS;
-    mi_assert_internal(mapindex < MI_SEGMENT_MAP_WSIZE);
-    return mapindex;
-  }
-}
-
-void _mi_segment_map_allocated_at(const mi_segment_t* segment) {
-  size_t bitidx;
-  size_t index = mi_segment_map_index_of(segment, &bitidx);
-  mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE);
-  if (index==MI_SEGMENT_MAP_WSIZE) return;
-  uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
-  uintptr_t newmask;
-  do {
-    newmask = (mask | ((uintptr_t)1 << bitidx));
-  } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask));
-}
-
-void _mi_segment_map_freed_at(const mi_segment_t* segment) {
-  size_t bitidx;
-  size_t index = mi_segment_map_index_of(segment, &bitidx);
-  mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE);
-  if (index == MI_SEGMENT_MAP_WSIZE) return;
-  uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
-  uintptr_t newmask;
-  do {
-    newmask = (mask & ~((uintptr_t)1 << bitidx));
-  } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask));
-}
-
-// Determine the segment belonging to a pointer or NULL if it is not in a valid segment.
-static mi_segment_t* _mi_segment_of(const void* p) {
-  if (p == NULL) return NULL;
-  mi_segment_t* segment = _mi_ptr_segment(p);
-  mi_assert_internal(segment != NULL);
-  size_t bitidx;
-  size_t index = mi_segment_map_index_of(segment, &bitidx);
-  // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge
-  const uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
-  if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) {
-    return segment; // yes, allocated by us
-  }
-  if (index==MI_SEGMENT_MAP_WSIZE) return NULL;
-
-  // TODO: maintain max/min allocated range for efficiency for more efficient rejection of invalid pointers?
-
-  // search downwards for the first segment in case it is an interior pointer
-  // could be slow but searches in MI_INTPTR_SIZE * MI_SEGMENT_SIZE (512MiB) steps trough
-  // valid huge objects
-  // note: we could maintain a lowest index to speed up the path for invalid pointers?
-  size_t lobitidx;
-  size_t loindex;
-  uintptr_t lobits = mask & (((uintptr_t)1 << bitidx) - 1);
-  if (lobits != 0) {
-    loindex = index;
-    lobitidx = mi_bsr(lobits);    // lobits != 0
-  }
-  else if (index == 0) {
-    return NULL;
-  }
-  else {
-    mi_assert_internal(index > 0);
-    uintptr_t lomask = mask;
-    loindex = index;
-    do {
-      loindex--;  
-      lomask = mi_atomic_load_relaxed(&mi_segment_map[loindex]);      
-    } while (lomask != 0 && loindex > 0);
-    if (lomask == 0) return NULL;
-    lobitidx = mi_bsr(lomask);    // lomask != 0
-  }
-  mi_assert_internal(loindex < MI_SEGMENT_MAP_WSIZE);
-  // take difference as the addresses could be larger than the MAX_ADDRESS space.
-  size_t diff = (((index - loindex) * (8*MI_INTPTR_SIZE)) + bitidx - lobitidx) * MI_SEGMENT_SIZE;
-  segment = (mi_segment_t*)((uint8_t*)segment - diff);
-
-  if (segment == NULL) return NULL;
-  mi_assert_internal((void*)segment < p);
-  bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie);
-  mi_assert_internal(cookie_ok);
-  if mi_unlikely(!cookie_ok) return NULL;
-  if (((uint8_t*)segment + mi_segment_size(segment)) <= (uint8_t*)p) return NULL; // outside the range
-  mi_assert_internal(p >= (void*)segment && (uint8_t*)p < (uint8_t*)segment + mi_segment_size(segment));
-  return segment;
-}
-
-// Is this a valid pointer in our heap?
-static bool  mi_is_valid_pointer(const void* p) {
-  return (_mi_segment_of(p) != NULL);
-}
-
-mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
-  return mi_is_valid_pointer(p);
-}
-
-/*
-// Return the full segment range belonging to a pointer
-static void* mi_segment_range_of(const void* p, size_t* size) {
-  mi_segment_t* segment = _mi_segment_of(p);
-  if (segment == NULL) {
-    if (size != NULL) *size = 0;
-    return NULL;
-  }
-  else {
-    if (size != NULL) *size = segment->segment_size;
-    return segment;
-  }
-  mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld));
-  mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 0 : _mi_os_page_size())) >= block_size);
-  mi_reset_delayed(tld);
-  mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld));
-  return page;
-}
-*/
diff --git a/Objects/mimalloc/segment-map.c b/Objects/mimalloc/segment-map.c
new file mode 100644
index 00000000000000..4c2104bd8154d4
--- /dev/null
+++ b/Objects/mimalloc/segment-map.c
@@ -0,0 +1,153 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2023, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* -----------------------------------------------------------
+  The following functions are to reliably find the segment or
+  block that encompasses any pointer p (or NULL if it is not
+  in any of our segments).
+  We maintain a bitmap of all memory with 1 bit per MI_SEGMENT_SIZE (64MiB)
+  set to 1 if it contains the segment meta data.
+----------------------------------------------------------- */
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+
+#if (MI_INTPTR_SIZE==8)
+#define MI_MAX_ADDRESS    ((size_t)40 << 40)  // 40TB (to include huge page areas)
+#else
+#define MI_MAX_ADDRESS    ((size_t)2 << 30)   // 2Gb
+#endif
+
+#define MI_SEGMENT_MAP_BITS  (MI_MAX_ADDRESS / MI_SEGMENT_SIZE)
+#define MI_SEGMENT_MAP_SIZE  (MI_SEGMENT_MAP_BITS / 8)
+#define MI_SEGMENT_MAP_WSIZE (MI_SEGMENT_MAP_SIZE / MI_INTPTR_SIZE)
+
+static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1];  // 2KiB per TB with 64MiB segments
+
+static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) {
+  mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE?
+  if ((uintptr_t)segment >= MI_MAX_ADDRESS) {
+    *bitidx = 0;
+    return MI_SEGMENT_MAP_WSIZE;
+  }
+  else {
+    const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_SIZE;
+    *bitidx = segindex % MI_INTPTR_BITS;
+    const size_t mapindex = segindex / MI_INTPTR_BITS;
+    mi_assert_internal(mapindex < MI_SEGMENT_MAP_WSIZE);
+    return mapindex;
+  }
+}
+
+void _mi_segment_map_allocated_at(const mi_segment_t* segment) {
+  size_t bitidx;
+  size_t index = mi_segment_map_index_of(segment, &bitidx);
+  mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE);
+  if (index==MI_SEGMENT_MAP_WSIZE) return;
+  uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
+  uintptr_t newmask;
+  do {
+    newmask = (mask | ((uintptr_t)1 << bitidx));
+  } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask));
+}
+
+void _mi_segment_map_freed_at(const mi_segment_t* segment) {
+  size_t bitidx;
+  size_t index = mi_segment_map_index_of(segment, &bitidx);
+  mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE);
+  if (index == MI_SEGMENT_MAP_WSIZE) return;
+  uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
+  uintptr_t newmask;
+  do {
+    newmask = (mask & ~((uintptr_t)1 << bitidx));
+  } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask));
+}
+
+// Determine the segment belonging to a pointer or NULL if it is not in a valid segment.
+static mi_segment_t* _mi_segment_of(const void* p) {
+  if (p == NULL) return NULL;
+  mi_segment_t* segment = _mi_ptr_segment(p);
+  mi_assert_internal(segment != NULL);
+  size_t bitidx;
+  size_t index = mi_segment_map_index_of(segment, &bitidx);
+  // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge
+  const uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
+  if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) {
+    return segment; // yes, allocated by us
+  }
+  if (index==MI_SEGMENT_MAP_WSIZE) return NULL;
+
+  // TODO: maintain max/min allocated range for efficiency for more efficient rejection of invalid pointers?
+
+  // search downwards for the first segment in case it is an interior pointer
+  // could be slow but searches in MI_INTPTR_SIZE * MI_SEGMENT_SIZE (512MiB) steps trough
+  // valid huge objects
+  // note: we could maintain a lowest index to speed up the path for invalid pointers?
+  size_t lobitidx;
+  size_t loindex;
+  uintptr_t lobits = mask & (((uintptr_t)1 << bitidx) - 1);
+  if (lobits != 0) {
+    loindex = index;
+    lobitidx = mi_bsr(lobits);    // lobits != 0
+  }
+  else if (index == 0) {
+    return NULL;
+  }
+  else {
+    mi_assert_internal(index > 0);
+    uintptr_t lomask = mask;
+    loindex = index;
+    do {
+      loindex--;  
+      lomask = mi_atomic_load_relaxed(&mi_segment_map[loindex]);      
+    } while (lomask != 0 && loindex > 0);
+    if (lomask == 0) return NULL;
+    lobitidx = mi_bsr(lomask);    // lomask != 0
+  }
+  mi_assert_internal(loindex < MI_SEGMENT_MAP_WSIZE);
+  // take difference as the addresses could be larger than the MAX_ADDRESS space.
+  size_t diff = (((index - loindex) * (8*MI_INTPTR_SIZE)) + bitidx - lobitidx) * MI_SEGMENT_SIZE;
+  segment = (mi_segment_t*)((uint8_t*)segment - diff);
+
+  if (segment == NULL) return NULL;
+  mi_assert_internal((void*)segment < p);
+  bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie);
+  mi_assert_internal(cookie_ok);
+  if mi_unlikely(!cookie_ok) return NULL;
+  if (((uint8_t*)segment + mi_segment_size(segment)) <= (uint8_t*)p) return NULL; // outside the range
+  mi_assert_internal(p >= (void*)segment && (uint8_t*)p < (uint8_t*)segment + mi_segment_size(segment));
+  return segment;
+}
+
+// Is this a valid pointer in our heap?
+static bool  mi_is_valid_pointer(const void* p) {
+  return ((_mi_segment_of(p) != NULL) || (_mi_arena_contains(p)));
+}
+
+mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
+  return mi_is_valid_pointer(p);
+}
+
+/*
+// Return the full segment range belonging to a pointer
+static void* mi_segment_range_of(const void* p, size_t* size) {
+  mi_segment_t* segment = _mi_segment_of(p);
+  if (segment == NULL) {
+    if (size != NULL) *size = 0;
+    return NULL;
+  }
+  else {
+    if (size != NULL) *size = segment->segment_size;
+    return segment;
+  }
+  mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld));
+  mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 0 : _mi_os_page_size())) >= block_size);
+  mi_reset_delayed(tld);
+  mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld));
+  return page;
+}
+*/
diff --git a/Objects/mimalloc/segment.c b/Objects/mimalloc/segment.c
index dc98e3e7b945c4..28685f21c10060 100644
--- a/Objects/mimalloc/segment.c
+++ b/Objects/mimalloc/segment.c
@@ -5,15 +5,15 @@ terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
 
 #include <string.h>  // memset
 #include <stdio.h>
 
-#define MI_PAGE_HUGE_ALIGN  (256*1024)
+#define MI_PAGE_HUGE_ALIGN   (256*1024)
 
-static void mi_segment_delayed_decommit(mi_segment_t* segment, bool force, mi_stats_t* stats);
+static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t* stats);
 
 
 // -------------------------------------------------------------------
@@ -257,7 +257,7 @@ static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) {
   mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
   mi_assert_internal(segment->abandoned <= segment->used);
   mi_assert_internal(segment->thread_id == 0 || segment->thread_id == _mi_thread_id());
-  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask)); // can only decommit committed blocks
+  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); // can only decommit committed blocks
   //mi_assert_internal(segment->segment_info_size % MI_SEGMENT_SLICE_SIZE == 0);
   mi_slice_t* slice = &segment->slices[0];
   const mi_slice_t* end = mi_segment_slices_end(segment);
@@ -316,7 +316,13 @@ static uint8_t* _mi_segment_page_start_from_slice(const mi_segment_t* segment, c
   ptrdiff_t idx = slice - segment->slices;
   size_t psize = (size_t)slice->slice_count * MI_SEGMENT_SLICE_SIZE;
   // make the start not OS page aligned for smaller blocks to avoid page/cache effects
-  size_t start_offset = (xblock_size >= MI_INTPTR_SIZE && xblock_size <= 1024 ? 3*MI_MAX_ALIGN_GUARANTEE : 0); 
+  // note: the offset must always be an xblock_size multiple since we assume small allocations
+  // are aligned (see `mi_heap_malloc_aligned`).
+  size_t start_offset = 0;
+  if (xblock_size >= MI_INTPTR_SIZE) {
+    if (xblock_size <= 64) { start_offset = 3*xblock_size; }
+    else if (xblock_size <= 512) { start_offset = xblock_size; }
+  }
   if (page_size != NULL) { *page_size = psize - start_offset; }
   return (uint8_t*)segment + ((idx*MI_SEGMENT_SLICE_SIZE) + start_offset);
 }
@@ -383,19 +389,14 @@ static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) {
     _mi_os_unprotect(end, os_pagesize);
   }
 
-  // purge delayed decommits now? (no, leave it to the cache)
-  // mi_segment_delayed_decommit(segment,true,tld->stats);
+  // purge delayed decommits now? (no, leave it to the arena)
+  // mi_segment_try_purge(segment,true,tld->stats);
   
-  // _mi_os_free(segment, mi_segment_size(segment), /*segment->memid,*/ tld->stats);
   const size_t size = mi_segment_size(segment);
-  if (size != MI_SEGMENT_SIZE || segment->mem_align_offset != 0 || segment->kind == MI_SEGMENT_HUGE || // only push regular segments on the cache
-       !_mi_segment_cache_push(segment, size, segment->memid, &segment->commit_mask, &segment->decommit_mask, segment->mem_is_large, segment->mem_is_pinned, tld->os)) 
-  {
-    const size_t csize = _mi_commit_mask_committed_size(&segment->commit_mask, size);
-    if (csize > 0 && !segment->mem_is_pinned) _mi_stat_decrease(&_mi_stats_main.committed, csize);
-    _mi_abandoned_await_readers();  // wait until safe to free
-    _mi_arena_free(segment, mi_segment_size(segment), segment->mem_alignment, segment->mem_align_offset, segment->memid, segment->mem_is_pinned /* pretend not committed to not double count decommits */, tld->stats);
-  }
+  const size_t csize = _mi_commit_mask_committed_size(&segment->commit_mask, size);
+
+  _mi_abandoned_await_readers();  // wait until safe to free
+  _mi_arena_free(segment, mi_segment_size(segment), csize, segment->memid, tld->stats);
 }
 
 // called by threads that are terminating 
@@ -459,60 +460,81 @@ static void mi_segment_commit_mask(mi_segment_t* segment, bool conservative, uin
   mi_commit_mask_create(bitidx, bitcount, cm);
 }
 
+static bool mi_segment_commit(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {
+  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask));
 
-static bool mi_segment_commitx(mi_segment_t* segment, bool commit, uint8_t* p, size_t size, mi_stats_t* stats) {    
-  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
-
-  // commit liberal, but decommit conservative
+  // commit liberal
   uint8_t* start = NULL;
   size_t   full_size = 0;
   mi_commit_mask_t mask;
-  mi_segment_commit_mask(segment, !commit/*conservative*/, p, size, &start, &full_size, &mask);
-  if (mi_commit_mask_is_empty(&mask) || full_size==0) return true;
+  mi_segment_commit_mask(segment, false /* conservative? */, p, size, &start, &full_size, &mask);
+  if (mi_commit_mask_is_empty(&mask) || full_size == 0) return true;
 
-  if (commit && !mi_commit_mask_all_set(&segment->commit_mask, &mask)) {
+  if (!mi_commit_mask_all_set(&segment->commit_mask, &mask)) {
+    // committing
     bool is_zero = false;
     mi_commit_mask_t cmask;
     mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);
     _mi_stat_decrease(&_mi_stats_main.committed, _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for overlap
-    if (!_mi_os_commit(start,full_size,&is_zero,stats)) return false;    
-    mi_commit_mask_set(&segment->commit_mask, &mask);     
+    if (!_mi_os_commit(start, full_size, &is_zero, stats)) return false;
+    mi_commit_mask_set(&segment->commit_mask, &mask);
+  }
+  
+  // increase purge expiration when using part of delayed purges -- we assume more allocations are coming soon.
+  if (mi_commit_mask_any_set(&segment->purge_mask, &mask)) {
+    segment->purge_expire = _mi_clock_now() + mi_option_get(mi_option_purge_delay);
   }
-  else if (!commit && mi_commit_mask_any_set(&segment->commit_mask, &mask)) {
-    mi_assert_internal((void*)start != (void*)segment);
-    //mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &mask));
 
-    mi_commit_mask_t cmask;
-    mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);
-    _mi_stat_increase(&_mi_stats_main.committed, full_size - _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for overlap
-    if (segment->allow_decommit) { 
-      _mi_os_decommit(start, full_size, stats); // ok if this fails
-    } 
-    mi_commit_mask_clear(&segment->commit_mask, &mask);
-  }
-  // increase expiration of reusing part of the delayed decommit
-  if (commit && mi_commit_mask_any_set(&segment->decommit_mask, &mask)) {
-    segment->decommit_expire = _mi_clock_now() + mi_option_get(mi_option_decommit_delay);
-  }
-  // always undo delayed decommits
-  mi_commit_mask_clear(&segment->decommit_mask, &mask);
+  // always clear any delayed purges in our range (as they are either committed now)
+  mi_commit_mask_clear(&segment->purge_mask, &mask);
   return true;
 }
 
 static bool mi_segment_ensure_committed(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {
-  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
+  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask));
   // note: assumes commit_mask is always full for huge segments as otherwise the commit mask bits can overflow
-  if (mi_commit_mask_is_full(&segment->commit_mask) && mi_commit_mask_is_empty(&segment->decommit_mask)) return true; // fully committed
-  return mi_segment_commitx(segment,true,p,size,stats);
+  if (mi_commit_mask_is_full(&segment->commit_mask) && mi_commit_mask_is_empty(&segment->purge_mask)) return true; // fully committed
+  mi_assert_internal(segment->kind != MI_SEGMENT_HUGE);
+  return mi_segment_commit(segment, p, size, stats);
+}
+
+static bool mi_segment_purge(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {    
+  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask));
+  if (!segment->allow_purge) return true;
+
+  // purge conservative
+  uint8_t* start = NULL;
+  size_t   full_size = 0;
+  mi_commit_mask_t mask;
+  mi_segment_commit_mask(segment, true /* conservative? */, p, size, &start, &full_size, &mask);
+  if (mi_commit_mask_is_empty(&mask) || full_size==0) return true;
+
+  if (mi_commit_mask_any_set(&segment->commit_mask, &mask)) {
+    // purging
+    mi_assert_internal((void*)start != (void*)segment);
+    mi_assert_internal(segment->allow_decommit);
+    const bool decommitted = _mi_os_purge(start, full_size, stats);  // reset or decommit
+    if (decommitted) {
+      mi_commit_mask_t cmask;
+      mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);
+      _mi_stat_increase(&_mi_stats_main.committed, full_size - _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for double counting 
+      mi_commit_mask_clear(&segment->commit_mask, &mask);
+    }        
+  }
+  
+  // always clear any scheduled purges in our range
+  mi_commit_mask_clear(&segment->purge_mask, &mask);
+  return true;
 }
 
-static void mi_segment_perhaps_decommit(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {
-  if (!segment->allow_decommit) return;
-  if (mi_option_get(mi_option_decommit_delay) == 0) {
-    mi_segment_commitx(segment, false, p, size, stats);
+static void mi_segment_schedule_purge(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {
+  if (!segment->allow_purge) return;
+
+  if (mi_option_get(mi_option_purge_delay) == 0) {
+    mi_segment_purge(segment, p, size, stats);
   }
   else {
-    // register for future decommit in the decommit mask
+    // register for future purge in the purge mask
     uint8_t* start = NULL;
     size_t   full_size = 0;
     mi_commit_mask_t mask; 
@@ -520,39 +542,39 @@ static void mi_segment_perhaps_decommit(mi_segment_t* segment, uint8_t* p, size_
     if (mi_commit_mask_is_empty(&mask) || full_size==0) return;
     
     // update delayed commit
-    mi_assert_internal(segment->decommit_expire > 0 || mi_commit_mask_is_empty(&segment->decommit_mask));      
+    mi_assert_internal(segment->purge_expire > 0 || mi_commit_mask_is_empty(&segment->purge_mask));      
     mi_commit_mask_t cmask;
-    mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);  // only decommit what is committed; span_free may try to decommit more
-    mi_commit_mask_set(&segment->decommit_mask, &cmask);
+    mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);  // only purge what is committed; span_free may try to decommit more
+    mi_commit_mask_set(&segment->purge_mask, &cmask);
     mi_msecs_t now = _mi_clock_now();    
-    if (segment->decommit_expire == 0) {
-      // no previous decommits, initialize now
-      segment->decommit_expire = now + mi_option_get(mi_option_decommit_delay);
+    if (segment->purge_expire == 0) {
+      // no previous purgess, initialize now
+      segment->purge_expire = now + mi_option_get(mi_option_purge_delay);
     }
-    else if (segment->decommit_expire <= now) {
-      // previous decommit mask already expired
-      if (segment->decommit_expire + mi_option_get(mi_option_decommit_extend_delay) <= now) {
-        mi_segment_delayed_decommit(segment, true, stats);
+    else if (segment->purge_expire <= now) {
+      // previous purge mask already expired
+      if (segment->purge_expire + mi_option_get(mi_option_purge_extend_delay) <= now) {
+        mi_segment_try_purge(segment, true, stats);
       }
       else {
-        segment->decommit_expire = now + mi_option_get(mi_option_decommit_extend_delay); // (mi_option_get(mi_option_decommit_delay) / 8); // wait a tiny bit longer in case there is a series of free's
+        segment->purge_expire = now + mi_option_get(mi_option_purge_extend_delay); // (mi_option_get(mi_option_purge_delay) / 8); // wait a tiny bit longer in case there is a series of free's
       }
     }
     else {
-      // previous decommit mask is not yet expired, increase the expiration by a bit.
-      segment->decommit_expire += mi_option_get(mi_option_decommit_extend_delay);
+      // previous purge mask is not yet expired, increase the expiration by a bit.
+      segment->purge_expire += mi_option_get(mi_option_purge_extend_delay);
     }
   }  
 }
 
-static void mi_segment_delayed_decommit(mi_segment_t* segment, bool force, mi_stats_t* stats) {
-  if (!segment->allow_decommit || mi_commit_mask_is_empty(&segment->decommit_mask)) return;
+static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t* stats) {
+  if (!segment->allow_purge || mi_commit_mask_is_empty(&segment->purge_mask)) return;
   mi_msecs_t now = _mi_clock_now();
-  if (!force && now < segment->decommit_expire) return;
+  if (!force && now < segment->purge_expire) return;
 
-  mi_commit_mask_t mask = segment->decommit_mask;
-  segment->decommit_expire = 0;
-  mi_commit_mask_create_empty(&segment->decommit_mask);
+  mi_commit_mask_t mask = segment->purge_mask;
+  segment->purge_expire = 0;
+  mi_commit_mask_create_empty(&segment->purge_mask);
 
   size_t idx;
   size_t count;
@@ -561,11 +583,11 @@ static void mi_segment_delayed_decommit(mi_segment_t* segment, bool force, mi_st
     if (count > 0) {
       uint8_t* p = (uint8_t*)segment + (idx*MI_COMMIT_SIZE);
       size_t size = count * MI_COMMIT_SIZE;
-      mi_segment_commitx(segment, false, p, size, stats);
+      mi_segment_purge(segment, p, size, stats);
     }
   }
   mi_commit_mask_foreach_end()
-  mi_assert_internal(mi_commit_mask_is_empty(&segment->decommit_mask));
+  mi_assert_internal(mi_commit_mask_is_empty(&segment->purge_mask));
 }
 
 
@@ -578,7 +600,7 @@ static bool mi_segment_is_abandoned(mi_segment_t* segment) {
 }
 
 // note: can be called on abandoned segments
-static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size_t slice_count, bool allow_decommit, mi_segments_tld_t* tld) {
+static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size_t slice_count, bool allow_purge, mi_segments_tld_t* tld) {
   mi_assert_internal(slice_index < segment->slice_entries);
   mi_span_queue_t* sq = (segment->kind == MI_SEGMENT_HUGE || mi_segment_is_abandoned(segment) 
                           ? NULL : mi_span_queue_for(slice_count,tld));
@@ -598,8 +620,8 @@ static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size
   }
 
   // perhaps decommit
-  if (allow_decommit) {
-    mi_segment_perhaps_decommit(segment, mi_slice_start(slice), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats);
+  if (allow_purge) {
+    mi_segment_schedule_purge(segment, mi_slice_start(slice), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats);
   }
   
   // and push it on the free page queue (if it was not a huge page)
@@ -632,7 +654,8 @@ static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_
 
   // for huge pages, just mark as free but don't add to the queues
   if (segment->kind == MI_SEGMENT_HUGE) {
-    mi_assert_internal(segment->used == 1);  // decreased right after this call in `mi_segment_page_clear`
+    // issue #691: segment->used can be 0 if the huge page block was freed while abandoned (reclaim will get here in that case)
+    mi_assert_internal((segment->used==0 && slice->xblock_size==0) || segment->used == 1);  // decreased right after this call in `mi_segment_page_clear`
     slice->xblock_size = 0;  // mark as free anyways
     // we should mark the last slice `xblock_size=0` now to maintain invariants but we skip it to 
     // avoid a possible cache miss (and the segment is about to be freed)
@@ -716,7 +739,6 @@ static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_i
   }
   
   // and initialize the page
-  page->is_reset = false;
   page->is_committed = true;
   segment->used++;
   return page;
@@ -730,7 +752,7 @@ static void mi_segment_slice_split(mi_segment_t* segment, mi_slice_t* slice, siz
   mi_assert_internal(segment->kind != MI_SEGMENT_HUGE);
   size_t next_index = mi_slice_index(slice) + slice_count;
   size_t next_count = slice->slice_count - slice_count;
-  mi_segment_span_free(segment, next_index, next_count, false /* don't decommit left-over part */, tld);
+  mi_segment_span_free(segment, next_index, next_count, false /* don't purge left-over part */, tld);
   slice->slice_count = (uint32_t)slice_count;
 }
 
@@ -773,16 +795,13 @@ static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_aren
    Segment allocation
 ----------------------------------------------------------- */
 
-static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment, bool eager_delay, mi_arena_id_t req_arena_id,
+static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment, bool eager_delayed, mi_arena_id_t req_arena_id,
                                           size_t* psegment_slices, size_t* ppre_size, size_t* pinfo_slices, 
-                                          mi_commit_mask_t* pcommit_mask, mi_commit_mask_t* pdecommit_mask,
-                                          bool* is_zero, bool* pcommit, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
+                                          bool commit, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
 
 {
-  // Allocate the segment from the OS
-  bool mem_large = (!eager_delay && (MI_SECURE==0)); // only allow large OS pages once we are no longer lazy    
-  bool is_pinned = false;
-  size_t memid = 0;
+  mi_memid_t memid;
+  bool   allow_large = (!eager_delayed && (MI_SECURE == 0)); // only allow large OS pages once we are no longer lazy
   size_t align_offset = 0;
   size_t alignment = MI_SEGMENT_ALIGN;
   
@@ -795,48 +814,41 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment
     const size_t extra = align_offset - info_size;
     // recalculate due to potential guard pages
     *psegment_slices = mi_segment_calculate_slices(required + extra, ppre_size, pinfo_slices);
-    //segment_size += _mi_align_up(align_offset - info_size, MI_SEGMENT_SLICE_SIZE);
-    //segment_slices = segment_size / MI_SEGMENT_SLICE_SIZE;
   }
-  const size_t segment_size = (*psegment_slices) * MI_SEGMENT_SLICE_SIZE;
-  mi_segment_t* segment = NULL;
 
-  // get from cache?
-  if (page_alignment == 0) {
-    segment = (mi_segment_t*)_mi_segment_cache_pop(segment_size, pcommit_mask, pdecommit_mask, &mem_large, &is_pinned, is_zero, req_arena_id, &memid, os_tld);
+  const size_t segment_size = (*psegment_slices) * MI_SEGMENT_SLICE_SIZE;
+  mi_segment_t* segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, alignment, align_offset, commit, allow_large, req_arena_id, &memid, os_tld);
+  if (segment == NULL) {
+    return NULL;  // failed to allocate
   }
-  
-  // get from OS
-  if (segment==NULL) {
-    segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, alignment, align_offset, pcommit, &mem_large, &is_pinned, is_zero, req_arena_id, &memid, os_tld);
-    if (segment == NULL) return NULL;  // failed to allocate
-    if (*pcommit) {
-      mi_commit_mask_create_full(pcommit_mask);
-    }
-    else {
-      mi_commit_mask_create_empty(pcommit_mask);
-    }
-  }    
-  mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0);
 
-  const size_t commit_needed = _mi_divide_up((*pinfo_slices)*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE);
-  mi_assert_internal(commit_needed>0);
-  mi_commit_mask_t commit_needed_mask;
-  mi_commit_mask_create(0, commit_needed, &commit_needed_mask);
-  if (!mi_commit_mask_all_set(pcommit_mask, &commit_needed_mask)) {
+  // ensure metadata part of the segment is committed  
+  mi_commit_mask_t commit_mask; 
+  if (memid.initially_committed) { 
+    mi_commit_mask_create_full(&commit_mask);  
+  }
+  else { 
     // at least commit the info slices
+    const size_t commit_needed = _mi_divide_up((*pinfo_slices)*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE);
+    mi_assert_internal(commit_needed>0);
+    mi_commit_mask_create(0, commit_needed, &commit_mask);    
     mi_assert_internal(commit_needed*MI_COMMIT_SIZE >= (*pinfo_slices)*MI_SEGMENT_SLICE_SIZE);
-    bool ok = _mi_os_commit(segment, commit_needed*MI_COMMIT_SIZE, is_zero, tld->stats);
-    if (!ok) return NULL; // failed to commit 
-    mi_commit_mask_set(pcommit_mask, &commit_needed_mask); 
+    if (!_mi_os_commit(segment, commit_needed*MI_COMMIT_SIZE, NULL, tld->stats)) {
+      _mi_arena_free(segment,segment_size,0,memid,tld->stats);
+      return NULL;
+    }    
   }
-  mi_track_mem_undefined(segment,commit_needed);
+  mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0);
+
   segment->memid = memid;
-  segment->mem_is_pinned = is_pinned;
-  segment->mem_is_large = mem_large;
-  segment->mem_is_committed = mi_commit_mask_is_full(pcommit_mask);
-  segment->mem_alignment = alignment;
-  segment->mem_align_offset = align_offset;
+  segment->allow_decommit = !memid.is_pinned;
+  segment->allow_purge = segment->allow_decommit && (mi_option_get(mi_option_purge_delay) >= 0);
+  segment->segment_size = segment_size;
+  segment->commit_mask = commit_mask;
+  segment->purge_expire = 0;
+  mi_commit_mask_create_empty(&segment->purge_mask);
+  mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);  // tsan
+  
   mi_segments_track_size((long)(segment_size), tld);
   _mi_segment_map_allocated_at(segment);
   return segment;
@@ -859,42 +871,21 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi
                             tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay));
   const bool eager = !eager_delay && mi_option_is_enabled(mi_option_eager_commit);
   bool commit = eager || (required > 0);   
-  bool is_zero = false;  
-
-  mi_commit_mask_t commit_mask;
-  mi_commit_mask_t decommit_mask;
-  mi_commit_mask_create_empty(&commit_mask);
-  mi_commit_mask_create_empty(&decommit_mask);
-
+  
   // Allocate the segment from the OS  
   mi_segment_t* segment = mi_segment_os_alloc(required, page_alignment, eager_delay, req_arena_id, 
-                                              &segment_slices, &pre_size, &info_slices, &commit_mask, &decommit_mask, 
-                                              &is_zero, &commit, tld, os_tld);
+                                              &segment_slices, &pre_size, &info_slices, commit, tld, os_tld);
   if (segment == NULL) return NULL;
   
-  // zero the segment info? -- not always needed as it may be zero initialized from the OS 
-  mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);  // tsan
-  if (!is_zero) {
-    ptrdiff_t ofs = offsetof(mi_segment_t, next);
+  // zero the segment info? -- not always needed as it may be zero initialized from the OS   
+  if (!segment->memid.initially_zero) {
+    ptrdiff_t ofs    = offsetof(mi_segment_t, next);
     size_t    prefix = offsetof(mi_segment_t, slices) - ofs;
-    memset((uint8_t*)segment+ofs, 0, prefix + sizeof(mi_slice_t)*(segment_slices+1));  // one more
+    size_t    zsize  = prefix + (sizeof(mi_slice_t) * (segment_slices + 1)); // one more  
+    _mi_memzero((uint8_t*)segment + ofs, zsize);
   }
   
-  segment->commit_mask = commit_mask; // on lazy commit, the initial part is always committed
-  segment->allow_decommit = (mi_option_is_enabled(mi_option_allow_decommit) && !segment->mem_is_pinned && !segment->mem_is_large);    
-  if (segment->allow_decommit) {
-    segment->decommit_expire = 0; // don't decommit just committed memory // _mi_clock_now() + mi_option_get(mi_option_decommit_delay);
-    segment->decommit_mask = decommit_mask;
-    mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
-    #if MI_DEBUG>2
-    const size_t commit_needed = _mi_divide_up(info_slices*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE);
-    mi_commit_mask_t commit_needed_mask;
-    mi_commit_mask_create(0, commit_needed, &commit_needed_mask);
-    mi_assert_internal(!mi_commit_mask_any_set(&segment->decommit_mask, &commit_needed_mask));
-    #endif
-  }    
-  
-  // initialize segment info
+  // initialize the rest of the segment info
   const size_t slice_entries = (segment_slices > MI_SLICES_PER_SEGMENT ? MI_SLICES_PER_SEGMENT : segment_slices);
   segment->segment_slices = segment_slices;
   segment->segment_info_slices = info_slices;
@@ -903,7 +894,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi
   segment->slice_entries = slice_entries;
   segment->kind = (required == 0 ? MI_SEGMENT_NORMAL : MI_SEGMENT_HUGE);
 
-  // memset(segment->slices, 0, sizeof(mi_slice_t)*(info_slices+1));
+  // _mi_memzero(segment->slices, sizeof(mi_slice_t)*(info_slices+1));
   _mi_stat_increase(&tld->stats->page_committed, mi_segment_info_size(segment));
 
   // set up guard pages
@@ -930,11 +921,11 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi
   // initialize initial free pages
   if (segment->kind == MI_SEGMENT_NORMAL) { // not a huge page
     mi_assert_internal(huge_page==NULL);
-    mi_segment_span_free(segment, info_slices, segment->slice_entries - info_slices, false /* don't decommit */, tld);
+    mi_segment_span_free(segment, info_slices, segment->slice_entries - info_slices, false /* don't purge */, tld);
   }
   else {
     mi_assert_internal(huge_page!=NULL);
-    mi_assert_internal(mi_commit_mask_is_empty(&segment->decommit_mask));
+    mi_assert_internal(mi_commit_mask_is_empty(&segment->purge_mask));
     mi_assert_internal(mi_commit_mask_is_full(&segment->commit_mask));
     *huge_page = mi_segment_span_allocate(segment, info_slices, segment_slices - info_slices - guard_slices, tld);
     mi_assert_internal(*huge_page != NULL); // cannot fail as we commit in advance 
@@ -954,7 +945,9 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t
   // Remove the free pages
   mi_slice_t* slice = &segment->slices[0];
   const mi_slice_t* end = mi_segment_slices_end(segment);
+  #if MI_DEBUG>1
   size_t page_count = 0;
+  #endif
   while (slice < end) {
     mi_assert_internal(slice->slice_count > 0);
     mi_assert_internal(slice->slice_offset == 0);
@@ -962,7 +955,9 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t
     if (slice->xblock_size == 0 && segment->kind != MI_SEGMENT_HUGE) {
       mi_segment_span_remove_from_queue(slice, tld);
     }
+    #if MI_DEBUG>1
     page_count++;
+    #endif
     slice = slice + slice->slice_count;
   }
   mi_assert_internal(page_count == 2); // first page is allocated by the segment itself
@@ -993,17 +988,16 @@ static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld
   _mi_stat_decrease(&tld->stats->pages, 1);
 
   // reset the page memory to reduce memory pressure?
-  if (!segment->mem_is_pinned && !page->is_reset && mi_option_is_enabled(mi_option_page_reset)) {
+  if (segment->allow_decommit && mi_option_is_enabled(mi_option_deprecated_page_reset)) {
     size_t psize;
-    uint8_t* start = _mi_page_start(segment, page, &psize);
-    page->is_reset = true;
+    uint8_t* start = _mi_page_start(segment, page, &psize);    
     _mi_os_reset(start, psize, tld->stats);
   }
 
   // zero the page data, but not the segment fields
   page->is_zero_init = false;
   ptrdiff_t ofs = offsetof(mi_page_t, capacity);
-  memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs);
+  _mi_memzero((uint8_t*)page + ofs, sizeof(*page) - ofs);
   page->xblock_size = 1;
 
   // and free it
@@ -1048,7 +1042,7 @@ We maintain a global list of abandoned segments that are
 reclaimed on demand. Since this is shared among threads
 the implementation needs to avoid the A-B-A problem on
 popping abandoned segments: <https://en.wikipedia.org/wiki/ABA_problem>
-We use tagged pointers to avoid accidentially identifying
+We use tagged pointers to avoid accidentally identifying
 reused segments, much like stamped references in Java.
 Secondly, we maintain a reader counter to avoid resetting
 or decommitting segments that have a pending read operation.
@@ -1234,8 +1228,8 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
     slice = slice + slice->slice_count;
   }
 
-  // perform delayed decommits
-  mi_segment_delayed_decommit(segment, mi_option_is_enabled(mi_option_abandoned_page_decommit) /* force? */, tld->stats);    
+  // perform delayed decommits (forcing is much slower on mstress)
+  mi_segment_try_purge(segment, mi_option_is_enabled(mi_option_abandoned_page_purge) /* force? */, tld->stats);    
   
   // all pages in the segment are abandoned; add it to the abandoned list
   _mi_stat_increase(&tld->stats->segments_abandoned, 1);
@@ -1343,7 +1337,6 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap,
     if (mi_slice_is_used(slice)) {
       // in use: reclaim the page in our heap
       mi_page_t* page = mi_slice_to_page(slice);
-      mi_assert_internal(!page->is_reset);
       mi_assert_internal(page->is_committed);
       mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
       mi_assert_internal(mi_page_heap(page) == NULL);
@@ -1424,7 +1417,7 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice
     }
     else {
       // otherwise, push on the visited list so it gets not looked at too quickly again
-      mi_segment_delayed_decommit(segment, true /* force? */, tld->stats); // forced decommit if needed as we may not visit soon again
+      mi_segment_try_purge(segment, true /* force? */, tld->stats); // force purge if needed as we may not visit soon again
       mi_abandoned_visited_push(segment);
     }
   }
@@ -1448,9 +1441,9 @@ void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld)
       mi_segment_reclaim(segment, heap, 0, NULL, tld);
     }
     else {
-      // otherwise, decommit if needed and push on the visited list 
-      // note: forced decommit can be expensive if many threads are destroyed/created as in mstress.
-      mi_segment_delayed_decommit(segment, force, tld->stats);
+      // otherwise, purge if needed and push on the visited list 
+      // note: forced purge can be expensive if many threads are destroyed/created as in mstress.
+      mi_segment_try_purge(segment, force, tld->stats);
       mi_abandoned_visited_push(segment);
     }
   }
@@ -1508,7 +1501,7 @@ static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_ki
   }
   mi_assert_internal(page != NULL && page->slice_count*MI_SEGMENT_SLICE_SIZE == page_size);
   mi_assert_internal(_mi_ptr_segment(page)->thread_id == _mi_thread_id());
-  mi_segment_delayed_decommit(_mi_ptr_segment(page), false, tld->stats);
+  mi_segment_try_purge(_mi_ptr_segment(page), false, tld->stats);
   return page;
 }
 
@@ -1542,7 +1535,7 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment,
     mi_assert_internal(psize - (aligned_p - start) >= size);      
     uint8_t* decommit_start = start + sizeof(mi_block_t);              // for the free list
     ptrdiff_t decommit_size = aligned_p - decommit_start;
-    _mi_os_decommit(decommit_start, decommit_size, &_mi_stats_main);   // note: cannot use segment_decommit on huge segments    
+    _mi_os_reset(decommit_start, decommit_size, &_mi_stats_main);   // note: cannot use segment_decommit on huge segments    
   }
   
   return page;
@@ -1585,9 +1578,12 @@ void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_bloc
   mi_assert_internal(page->used == 1); // this is called just before the free
   mi_assert_internal(page->free == NULL);
   if (segment->allow_decommit) {
-    const size_t csize = mi_usable_size(block) - sizeof(mi_block_t);
-    uint8_t* p = (uint8_t*)block + sizeof(mi_block_t);
-    _mi_os_decommit(p, csize, &_mi_stats_main);  // note: cannot use segment_decommit on huge segments
+    size_t csize = mi_usable_size(block);
+    if (csize > sizeof(mi_block_t)) {
+      csize = csize - sizeof(mi_block_t);
+      uint8_t* p = (uint8_t*)block + sizeof(mi_block_t);
+      _mi_os_reset(p, csize, &_mi_stats_main);  // note: cannot use segment_decommit on huge segments
+    }
   }
 }
 #endif
diff --git a/Objects/mimalloc/stats.c b/Objects/mimalloc/stats.c
index 2a8b9404fbe264..300956ce1302e9 100644
--- a/Objects/mimalloc/stats.c
+++ b/Objects/mimalloc/stats.c
@@ -5,10 +5,11 @@ terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
 -----------------------------------------------------------------------------*/
 #include "mimalloc.h"
-#include "mimalloc-internal.h"
-#include "mimalloc-atomic.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"
 
-#include <stdio.h>  // fputs, stderr
+#include <stdio.h>  // snprintf
 #include <string.h> // memset
 
 #if defined(_MSC_VER) && (_MSC_VER < 1920)
@@ -95,6 +96,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
   mi_stat_add(&stats->reserved, &src->reserved, 1);
   mi_stat_add(&stats->committed, &src->committed, 1);
   mi_stat_add(&stats->reset, &src->reset, 1);
+  mi_stat_add(&stats->purged, &src->purged, 1);
   mi_stat_add(&stats->page_committed, &src->page_committed, 1);
 
   mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1);
@@ -110,6 +112,8 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
   mi_stat_counter_add(&stats->pages_extended, &src->pages_extended, 1);
   mi_stat_counter_add(&stats->mmap_calls, &src->mmap_calls, 1);
   mi_stat_counter_add(&stats->commit_calls, &src->commit_calls, 1);
+  mi_stat_counter_add(&stats->reset_calls, &src->reset_calls, 1);
+  mi_stat_counter_add(&stats->purge_calls, &src->purge_calls, 1);
 
   mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1);
   mi_stat_counter_add(&stats->searches, &src->searches, 1);
@@ -142,7 +146,7 @@ static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void*
   const int64_t pos = (n < 0 ? -n : n);
   if (pos < base) {
     if (n!=1 || suffix[0] != 'B') {  // skip printing 1 B for the unit column
-      snprintf(buf, len, "%d %-3s", (int)n, (n==0 ? "" : suffix));
+      snprintf(buf, len, "%d   %-3s", (int)n, (n==0 ? "" : suffix));
     }
   }
   else {
@@ -157,7 +161,7 @@ static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void*
     snprintf(unitdesc, 8, "%s%s%s", magnitude, (base==1024 ? "i" : ""), suffix);
     snprintf(buf, len, "%ld.%ld %-3s", whole, (frac1 < 0 ? -frac1 : frac1), unitdesc);
   }
-  _mi_fprintf(out, arg, (fmt==NULL ? "%11s" : fmt), buf);
+  _mi_fprintf(out, arg, (fmt==NULL ? "%12s" : fmt), buf);
 }
 
 
@@ -166,7 +170,7 @@ static void mi_print_amount(int64_t n, int64_t unit, mi_output_fun* out, void* a
 }
 
 static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out, void* arg) {
-  if (unit==1) _mi_fprintf(out, arg, "%11s"," ");
+  if (unit==1) _mi_fprintf(out, arg, "%12s"," ");
           else mi_print_amount(n,0,out,arg);
 }
 
@@ -181,7 +185,7 @@ static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64
     mi_print_count(stat->allocated, unit, out, arg);
     if (stat->allocated > stat->freed) {
       _mi_fprintf(out, arg, "  ");
-      _mi_fprintf(out, arg, (notok == NULL ? "not all freed!" : notok));
+      _mi_fprintf(out, arg, (notok == NULL ? "not all freed" : notok));
       _mi_fprintf(out, arg, "\n");
     }
     else {
@@ -194,7 +198,7 @@ static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64
     mi_print_amount(stat->freed, -1, out, arg);
     mi_print_amount(stat->current, -1, out, arg);
     if (unit==-1) {
-      _mi_fprintf(out, arg, "%22s", "");
+      _mi_fprintf(out, arg, "%24s", "");
     }
     else {
       mi_print_amount(-unit, 1, out, arg);
@@ -218,12 +222,19 @@ static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t
   mi_stat_print_ex(stat, msg, unit, out, arg, NULL);
 }
 
+static void mi_stat_peak_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) {
+  _mi_fprintf(out, arg, "%10s:", msg);
+  mi_print_amount(stat->peak, unit, out, arg);
+  _mi_fprintf(out, arg, "\n");
+}
+
 static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg ) {
   _mi_fprintf(out, arg, "%10s:", msg);
   mi_print_amount(stat->total, -1, out, arg);
   _mi_fprintf(out, arg, "\n");
 }
 
+
 static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg) {
   const int64_t avg_tens = (stat->count == 0 ? 0 : (stat->total*10 / stat->count));
   const long avg_whole = (long)(avg_tens/10);
@@ -233,7 +244,7 @@ static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char*
 
 
 static void mi_print_header(mi_output_fun* out, void* arg ) {
-  _mi_fprintf(out, arg, "%10s: %10s %10s %10s %10s %10s %10s\n", "heap stats", "peak   ", "total   ", "freed   ", "current   ", "unit   ", "count   ");
+  _mi_fprintf(out, arg, "%10s: %11s %11s %11s %11s %11s %11s\n", "heap stats", "peak   ", "total   ", "freed   ", "current   ", "unit   ", "count   ");
 }
 
 #if MI_STAT>1
@@ -291,8 +302,6 @@ static void mi_cdecl mi_buffered_out(const char* msg, void* arg) {
 // Print statistics
 //------------------------------------------------------------
 
-static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults);
-
 static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept {
   // wrap the output function to be line buffered
   char buf[256];
@@ -322,7 +331,8 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
   #endif
   mi_stat_print_ex(&stats->reserved, "reserved", 1, out, arg, "");
   mi_stat_print_ex(&stats->committed, "committed", 1, out, arg, "");
-  mi_stat_print(&stats->reset, "reset", 1, out, arg);
+  mi_stat_peak_print(&stats->reset, "reset", 1, out, arg );
+  mi_stat_peak_print(&stats->purged, "purged", 1, out, arg );
   mi_stat_print(&stats->page_committed, "touched", 1, out, arg);
   mi_stat_print(&stats->segments, "segments", -1, out, arg);
   mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg);
@@ -333,20 +343,22 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
   mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg);
   mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg);
   mi_stat_counter_print(&stats->commit_calls, "commits", out, arg);
+  mi_stat_counter_print(&stats->reset_calls, "resets", out, arg);
+  mi_stat_counter_print(&stats->purge_calls, "purges", out, arg);
   mi_stat_print(&stats->threads, "threads", -1, out, arg);
   mi_stat_counter_print_avg(&stats->searches, "searches", out, arg);
-  _mi_fprintf(out, arg, "%10s: %7zu\n", "numa nodes", _mi_os_numa_node_count());
+  _mi_fprintf(out, arg, "%10s: %5zu\n", "numa nodes", _mi_os_numa_node_count());
 
-  mi_msecs_t elapsed;
-  mi_msecs_t user_time;
-  mi_msecs_t sys_time;
+  size_t elapsed;
+  size_t user_time;
+  size_t sys_time;
   size_t current_rss;
   size_t peak_rss;
   size_t current_commit;
   size_t peak_commit;
   size_t page_faults;
-  mi_stat_process_info(&elapsed, &user_time, &sys_time, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
-  _mi_fprintf(out, arg, "%10s: %7ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000);
+  mi_process_info(&elapsed, &user_time, &sys_time, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
+  _mi_fprintf(out, arg, "%10s: %5ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000);
   _mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, rss: ", "process",
               user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults );
   mi_printf_amount((int64_t)peak_rss, 1, out, arg, "%s");
@@ -404,46 +416,12 @@ void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
 // ----------------------------------------------------------------
 // Basic timer for convenience; use milli-seconds to avoid doubles
 // ----------------------------------------------------------------
-#ifdef _WIN32
-#include <windows.h>
-static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) {
-  static LARGE_INTEGER mfreq; // = 0
-  if (mfreq.QuadPart == 0LL) {
-    LARGE_INTEGER f;
-    QueryPerformanceFrequency(&f);
-    mfreq.QuadPart = f.QuadPart/1000LL;
-    if (mfreq.QuadPart == 0) mfreq.QuadPart = 1;
-  }
-  return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart);
-}
+
+static mi_msecs_t mi_clock_diff;
 
 mi_msecs_t _mi_clock_now(void) {
-  LARGE_INTEGER t;
-  QueryPerformanceCounter(&t);
-  return mi_to_msecs(t);
-}
-#else
-#include <time.h>
-#if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC)
-mi_msecs_t _mi_clock_now(void) {
-  struct timespec t;
-  #ifdef CLOCK_MONOTONIC
-  clock_gettime(CLOCK_MONOTONIC, &t);
-  #else
-  clock_gettime(CLOCK_REALTIME, &t);
-  #endif
-  return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000);
-}
-#else
-// low resolution timer
-mi_msecs_t _mi_clock_now(void) {
-  return ((mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000));
+  return _mi_prim_clock_now();
 }
-#endif
-#endif
-
-
-static mi_msecs_t mi_clock_diff;
 
 mi_msecs_t _mi_clock_start(void) {
   if (mi_clock_diff == 0.0) {
@@ -463,156 +441,27 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) {
 // Basic process statistics
 // --------------------------------------------------------
 
-#if defined(_WIN32)
-#include <windows.h>
-
-static mi_msecs_t filetime_msecs(const FILETIME* ftime) {
-  ULARGE_INTEGER i;
-  i.LowPart = ftime->dwLowDateTime;
-  i.HighPart = ftime->dwHighDateTime;
-  mi_msecs_t msecs = (i.QuadPart / 10000); // FILETIME is in 100 nano seconds
-  return msecs;
-}
-
-typedef struct _PROCESS_MEMORY_COUNTERS {
-  DWORD cb;
-  DWORD PageFaultCount;
-  SIZE_T PeakWorkingSetSize;
-  SIZE_T WorkingSetSize;
-  SIZE_T QuotaPeakPagedPoolUsage;
-  SIZE_T QuotaPagedPoolUsage;
-  SIZE_T QuotaPeakNonPagedPoolUsage;
-  SIZE_T QuotaNonPagedPoolUsage;
-  SIZE_T PagefileUsage;
-  SIZE_T PeakPagefileUsage;
-} PROCESS_MEMORY_COUNTERS;
-typedef PROCESS_MEMORY_COUNTERS* PPROCESS_MEMORY_COUNTERS;
-typedef BOOL (WINAPI *PGetProcessMemoryInfo)(HANDLE, PPROCESS_MEMORY_COUNTERS, DWORD);
-static PGetProcessMemoryInfo pGetProcessMemoryInfo = NULL;
-
-static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
-{
-  *elapsed = _mi_clock_end(mi_process_start);
-  FILETIME ct;
-  FILETIME ut;
-  FILETIME st;
-  FILETIME et;
-  GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut);
-  *utime = filetime_msecs(&ut);
-  *stime = filetime_msecs(&st);
-  
-  // load psapi on demand
-  if (pGetProcessMemoryInfo == NULL) {
-    HINSTANCE hDll = LoadLibrary(TEXT("psapi.dll"));
-    if (hDll != NULL) {
-      pGetProcessMemoryInfo = (PGetProcessMemoryInfo)(void (*)(void))GetProcAddress(hDll, "GetProcessMemoryInfo");
-    }
-  }
-
-  // get process info
-  PROCESS_MEMORY_COUNTERS info;
-  memset(&info, 0, sizeof(info));
-  if (pGetProcessMemoryInfo != NULL) {
-    pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
-  } 
-  *current_rss    = (size_t)info.WorkingSetSize;
-  *peak_rss       = (size_t)info.PeakWorkingSetSize;
-  *current_commit = (size_t)info.PagefileUsage;
-  *peak_commit    = (size_t)info.PeakPagefileUsage;
-  *page_faults    = (size_t)info.PageFaultCount;
-}
-
-#elif !defined(__wasi__) && (defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__))
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/resource.h>
-
-#if defined(__APPLE__)
-#include <mach/mach.h>
-#endif
-
-#if defined(__HAIKU__)
-#include <kernel/OS.h>
-#endif
-
-static mi_msecs_t timeval_secs(const struct timeval* tv) {
-  return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L);
-}
-
-static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
-{
-  *elapsed = _mi_clock_end(mi_process_start);
-  struct rusage rusage;
-  getrusage(RUSAGE_SELF, &rusage);
-  *utime = timeval_secs(&rusage.ru_utime);
-  *stime = timeval_secs(&rusage.ru_stime);
-#if !defined(__HAIKU__)
-  *page_faults = rusage.ru_majflt;
-#endif
-  // estimate commit using our stats
-  *peak_commit    = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
-  *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
-  *current_rss    = *current_commit;  // estimate
-#if defined(__HAIKU__)
-  // Haiku does not have (yet?) a way to
-  // get these stats per process
-  thread_info tid;
-  area_info mem;
-  ssize_t c;
-  get_thread_info(find_thread(0), &tid);
-  while (get_next_area_info(tid.team, &c, &mem) == B_OK) {
-    *peak_rss += mem.ram_size;
-  }
-  *page_faults = 0;
-#elif defined(__APPLE__)
-  *peak_rss = rusage.ru_maxrss;         // BSD reports in bytes
-  struct mach_task_basic_info info;
-  mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
-  if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) {
-    *current_rss = (size_t)info.resident_size;
-  }
-#else
-  *peak_rss = rusage.ru_maxrss * 1024;  // Linux reports in KiB
-#endif
-}
-
-#else
-#ifndef __wasi__
-// WebAssembly instances are not processes
-#pragma message("define a way to get process info")
-#endif
-
-static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
-{
-  *elapsed = _mi_clock_end(mi_process_start);
-  *peak_commit    = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
-  *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
-  *peak_rss    = *peak_commit;
-  *current_rss = *current_commit;
-  *page_faults = 0;
-  *utime = 0;
-  *stime = 0;
-}
-#endif
-
-
 mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept
 {
-  mi_msecs_t elapsed = 0;
-  mi_msecs_t utime = 0;
-  mi_msecs_t stime = 0;
-  size_t current_rss0 = 0;
-  size_t peak_rss0 = 0;
-  size_t current_commit0 = 0;
-  size_t peak_commit0 = 0;
-  size_t page_faults0 = 0;
-  mi_stat_process_info(&elapsed,&utime, &stime, &current_rss0, &peak_rss0, &current_commit0, &peak_commit0, &page_faults0);
-  if (elapsed_msecs!=NULL)  *elapsed_msecs = (elapsed < 0 ? 0 : (elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)elapsed : PTRDIFF_MAX));
-  if (user_msecs!=NULL)     *user_msecs     = (utime < 0 ? 0 : (utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)utime : PTRDIFF_MAX));
-  if (system_msecs!=NULL)   *system_msecs   = (stime < 0 ? 0 : (stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)stime : PTRDIFF_MAX));
-  if (current_rss!=NULL)    *current_rss    = current_rss0;
-  if (peak_rss!=NULL)       *peak_rss       = peak_rss0;
-  if (current_commit!=NULL) *current_commit = current_commit0;
-  if (peak_commit!=NULL)    *peak_commit    = peak_commit0;
-  if (page_faults!=NULL)    *page_faults    = page_faults0;
+  mi_process_info_t pinfo;
+  _mi_memzero_var(pinfo);
+  pinfo.elapsed        = _mi_clock_end(mi_process_start);
+  pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
+  pinfo.peak_commit    = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
+  pinfo.current_rss    = pinfo.current_commit;
+  pinfo.peak_rss       = pinfo.peak_commit;
+  pinfo.utime          = 0;
+  pinfo.stime          = 0;
+  pinfo.page_faults    = 0;
+
+  _mi_prim_process_info(&pinfo);
+  
+  if (elapsed_msecs!=NULL)  *elapsed_msecs  = (pinfo.elapsed < 0 ? 0 : (pinfo.elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.elapsed : PTRDIFF_MAX));
+  if (user_msecs!=NULL)     *user_msecs     = (pinfo.utime < 0 ? 0 : (pinfo.utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.utime : PTRDIFF_MAX));
+  if (system_msecs!=NULL)   *system_msecs   = (pinfo.stime < 0 ? 0 : (pinfo.stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.stime : PTRDIFF_MAX));
+  if (current_rss!=NULL)    *current_rss    = pinfo.current_rss;
+  if (peak_rss!=NULL)       *peak_rss       = pinfo.peak_rss;
+  if (current_commit!=NULL) *current_commit = pinfo.current_commit;
+  if (peak_commit!=NULL)    *peak_commit    = pinfo.peak_commit;
+  if (page_faults!=NULL)    *page_faults    = pinfo.page_faults;
 }
diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj
index 58bbbd8a508e18..5ca10588f09710 100644
--- a/PCbuild/_freeze_module.vcxproj
+++ b/PCbuild/_freeze_module.vcxproj
@@ -156,9 +156,10 @@
     <ClCompile Include="..\Objects\mimalloc\options.c" />
     <ClCompile Include="..\Objects\mimalloc\os.c" />
     <ClCompile Include="..\Objects\mimalloc\page.c" />
+    <ClCompile Include="..\Objects\mimalloc\prim\prim.c" />
     <ClCompile Include="..\Objects\mimalloc\random.c" />
     <ClCompile Include="..\Objects\mimalloc\segment.c" />
-    <ClCompile Include="..\Objects\mimalloc\segment-cache.c" />
+    <ClCompile Include="..\Objects\mimalloc\segment-map.c" />
     <ClCompile Include="..\Objects\mimalloc\stats.c" />
     <ClCompile Include="..\Objects\moduleobject.c" />
     <ClCompile Include="..\Objects\namespaceobject.c" />
diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters
index acdf68dc32a0f9..f0afa7a6f53e58 100644
--- a/PCbuild/_freeze_module.vcxproj.filters
+++ b/PCbuild/_freeze_module.vcxproj.filters
@@ -277,13 +277,16 @@
     <ClCompile Include="..\Objects\mimalloc\page.c">
       <Filter>Source Files</Filter>
     </CLCompile>
+    <ClCompile Include="..\Objects\mimalloc\prim\prim.c">
+      <Filter>Source Files</Filter>
+    </CLCompile>
     <ClCompile Include="..\Objects\mimalloc\random.c">
       <Filter>Source Files</Filter>
     </CLCompile>
     <ClCompile Include="..\Objects\mimalloc\segment.c">
       <Filter>Source Files</Filter>
     </CLCompile>
-    <ClCompile Include="..\Objects\mimalloc\segment-cache.c">
+    <ClCompile Include="..\Objects\mimalloc\segment-map.c">
       <Filter>Source Files</Filter>
     </CLCompile>
     <ClCompile Include="..\Objects\mimalloc\stats.c">
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index a5adc31112e01b..e19c9d0396abc0 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -302,10 +302,11 @@
     <ClInclude Include="..\Include\marshal.h" />
     <ClInclude Include="..\Include\memoryobject.h" />
     <ClInclude Include="..\Include\methodobject.h" />
-    <ClInclude Include="..\Include\mimalloc\mimalloc-atomic.h" />
-    <ClInclude Include="..\Include\mimalloc\mimalloc-internal.h" />
-    <ClInclude Include="..\Include\mimalloc\mimalloc-track.h" />
-    <ClInclude Include="..\Include\mimalloc\mimalloc-types.h" />
+    <ClInclude Include="..\Include\mimalloc\mimalloc\atomic.h" />
+    <ClInclude Include="..\Include\mimalloc\mimalloc\internal.h" />
+    <ClInclude Include="..\Include\mimalloc\mimalloc\prim.h" />
+    <ClInclude Include="..\Include\mimalloc\mimalloc\track.h" />
+    <ClInclude Include="..\Include\mimalloc\mimalloc\types.h" />
     <ClInclude Include="..\Include\mimalloc\mimalloc.h" />
     <ClInclude Include="..\Include\modsupport.h" />
     <ClInclude Include="..\Include\moduleobject.h" />
@@ -507,9 +508,10 @@
     <ClCompile Include="..\Objects\mimalloc\options.c" />
     <ClCompile Include="..\Objects\mimalloc\os.c" />
     <ClCompile Include="..\Objects\mimalloc\page.c" />
+    <ClCompile Include="..\Objects\mimalloc\prim\prim.c" />
     <ClCompile Include="..\Objects\mimalloc\random.c" />
     <ClCompile Include="..\Objects\mimalloc\segment.c" />
-    <ClCompile Include="..\Objects\mimalloc\segment-cache.c" />
+    <ClCompile Include="..\Objects\mimalloc\segment-map.c" />
     <ClCompile Include="..\Objects\mimalloc\stats.c" />
     <ClCompile Include="..\Objects\moduleobject.c" />
     <ClCompile Include="..\Objects\namespaceobject.c" />
diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index 52a906e34bddbf..ce16799029653a 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters
@@ -798,16 +798,19 @@
     <ClInclude Include="..\Include\mimalloc\mimalloc.h">
       <Filter>Include\mimalloc</Filter>
     </ClInclude>
-    <ClInclude Include="..\Include\mimalloc\mimalloc-atomic.h">
+    <ClInclude Include="..\Include\mimalloc\mimalloc\atomic.h">
       <Filter>Include\mimalloc</Filter>
     </ClInclude>
-    <ClInclude Include="..\Include\mimalloc\mimalloc-internal.h">
+    <ClInclude Include="..\Include\mimalloc\mimalloc\internal.h">
       <Filter>Include\mimalloc</Filter>
     </ClInclude>
-    <ClInclude Include="..\Include\mimalloc\mimalloc-track.h">
+    <ClInclude Include="..\Include\mimalloc\mimalloc\prim.h">
       <Filter>Include\mimalloc</Filter>
     </ClInclude>
-    <ClInclude Include="..\Include\mimalloc\mimalloc-types.h">
+    <ClInclude Include="..\Include\mimalloc\mimalloc\track.h">
+      <Filter>Include\mimalloc</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Include\mimalloc\mimalloc\types.h">
       <Filter>Include\mimalloc</Filter>
     </ClInclude>
     <ClInclude Include="$(zlibDir)\crc32.h">
@@ -1157,13 +1160,16 @@
     <ClCompile Include="..\Objects\mimalloc\page.c">
       <Filter>Objects\mimalloc</Filter>
     </ClCompile>
+    <ClCompile Include="..\Objects\mimalloc\prim\prim.c">
+      <Filter>Objects\mimalloc</Filter>
+    </ClCompile>
     <ClCompile Include="..\Objects\mimalloc\random.c">
       <Filter>Objects\mimalloc</Filter>
     </ClCompile>
     <ClCompile Include="..\Objects\mimalloc\segment.c">
       <Filter>Objects\mimalloc</Filter>
     </ClCompile>
-    <ClCompile Include="..\Objects\mimalloc\segment-cache.c">
+    <ClCompile Include="..\Objects\mimalloc\segment-map.c">
       <Filter>Objects\mimalloc</Filter>
     </ClCompile>
     <ClCompile Include="..\Objects\mimalloc\stats.c">
diff --git a/configure b/configure
index 17fd7744a8d9cb..490aa2aa33f40f 100755
--- a/configure
+++ b/configure
@@ -860,8 +860,8 @@ DTRACE_OBJS
 DTRACE_HEADERS
 DFLAGS
 DTRACE
-MIMALLOC_OBJS
 WITH_MIMALLOC
+MIMALLOC_OBJS
 MIMALLOC_HEADERS
 GDBM_LIBS
 GDBM_CFLAGS
@@ -16887,7 +16887,7 @@ printf "%s\n" "#define WITH_MIMALLOC 1" >>confdefs.h
 
   MIMALLOC_HEADERS='$(MIMALLOC_HEADERS)'
 
-  MIMALLOC_HEADERS='$(MIMALLOC_OBJS)'
+  MIMALLOC_OBJS='$(MIMALLOC_OBJS)'
 
 fi
 
@@ -26809,6 +26809,7 @@ SRCDIRS="\
   Modules/expat \
   Objects \
   Objects/mimalloc \
+  Objects/mimalloc/prim \
   Parser \
   Parser/tokenizer \
   Parser/lexer \
diff --git a/configure.ac b/configure.ac
index 305d31b373b4ac..63f8e9f247bfad 100644
--- a/configure.ac
+++ b/configure.ac
@@ -6593,6 +6593,7 @@ SRCDIRS="\
   Modules/expat \
   Objects \
   Objects/mimalloc \
+  Objects/mimalloc/prim \
   Parser \
   Parser/tokenizer \
   Parser/lexer \

From 612c35e87a34af0a1ad8bb65ba0df3e0103d2517 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Tue, 26 Sep 2023 17:41:58 -0700
Subject: [PATCH 03/24] Fix mimalloc formatting

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
 Include/mimalloc/mimalloc.h          |  14 +--
 Include/mimalloc/mimalloc/atomic.h   |   2 +-
 Include/mimalloc/mimalloc/internal.h |  10 +-
 Include/mimalloc/mimalloc/prim.h     |  14 +--
 Include/mimalloc/mimalloc/track.h    |   4 +-
 Include/mimalloc/mimalloc/types.h    |  22 ++--
 Objects/mimalloc/alloc-aligned.c     |   6 +-
 Objects/mimalloc/alloc.c             |  14 +--
 Objects/mimalloc/arena.c             |  95 +++++++++--------
 Objects/mimalloc/bitmap.c            |  16 +--
 Objects/mimalloc/bitmap.h            |   4 +-
 Objects/mimalloc/heap.c              |   8 +-
 Objects/mimalloc/init.c              |  16 +--
 Objects/mimalloc/os.c                |  44 ++++----
 Objects/mimalloc/page.c              |  28 ++---
 Objects/mimalloc/prim/unix/prim.c    |  46 ++++-----
 Objects/mimalloc/prim/wasi/prim.c    |   8 +-
 Objects/mimalloc/prim/windows/prim.c |  14 +--
 Objects/mimalloc/random.c            |   2 +-
 Objects/mimalloc/segment-map.c       |   4 +-
 Objects/mimalloc/segment.c           | 146 +++++++++++++--------------
 Objects/mimalloc/stats.c             |   2 +-
 22 files changed, 258 insertions(+), 261 deletions(-)

diff --git a/Include/mimalloc/mimalloc.h b/Include/mimalloc/mimalloc.h
index f77c2ea17008ec..821129e7690b1b 100644
--- a/Include/mimalloc/mimalloc.h
+++ b/Include/mimalloc/mimalloc.h
@@ -332,18 +332,18 @@ typedef enum mi_option_e {
   mi_option_deprecated_segment_cache,
   mi_option_deprecated_page_reset,
   mi_option_abandoned_page_purge,     // immediately purge delayed purges on thread termination
-  mi_option_deprecated_segment_reset, 
-  mi_option_eager_commit_delay,       
+  mi_option_deprecated_segment_reset,
+  mi_option_eager_commit_delay,
   mi_option_purge_delay,              // memory purging is delayed by N milli seconds; use 0 for immediate purging or -1 for no purging at all.
   mi_option_use_numa_nodes,           // 0 = use all available numa nodes, otherwise use at most N nodes.
   mi_option_limit_os_alloc,           // 1 = do not use OS memory for allocation (but only programmatically reserved arenas)
   mi_option_os_tag,                   // tag used for OS logging (macOS only for now)
   mi_option_max_errors,               // issue at most N error messages
   mi_option_max_warnings,             // issue at most N warning messages
-  mi_option_max_segment_reclaim,      
+  mi_option_max_segment_reclaim,
   mi_option_destroy_on_exit,          // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe.
   mi_option_arena_reserve,            // initial memory size in KiB for arena reservation (1GiB on 64-bit)
-  mi_option_arena_purge_mult,         
+  mi_option_arena_purge_mult,
   mi_option_purge_extend_delay,
   _mi_option_last,
   // legacy option names
@@ -513,7 +513,7 @@ template<class T, bool _mi_destroy> struct _mi_heap_stl_allocator_common : publi
 protected:
   std::shared_ptr<mi_heap_t> heap;
   template<class U, bool D> friend struct _mi_heap_stl_allocator_common;
-  
+
   _mi_heap_stl_allocator_common() {
     mi_heap_t* hp = mi_heap_new();
     this->heap.reset(hp, (_mi_destroy ? &heap_destroy : &heap_delete));  /* calls heap_delete/destroy when the refcount drops to zero */
@@ -530,7 +530,7 @@ template<class T, bool _mi_destroy> struct _mi_heap_stl_allocator_common : publi
 template<class T> struct mi_heap_stl_allocator : public _mi_heap_stl_allocator_common<T, false> {
   using typename _mi_heap_stl_allocator_common<T, false>::size_type;
   mi_heap_stl_allocator() : _mi_heap_stl_allocator_common<T, false>() { } // creates fresh heap that is deleted when the destructor is called
-  mi_heap_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common<T, false>(hp) { }  // no delete nor destroy on the passed in heap 
+  mi_heap_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common<T, false>(hp) { }  // no delete nor destroy on the passed in heap
   template<class U> mi_heap_stl_allocator(const mi_heap_stl_allocator<U>& x) mi_attr_noexcept : _mi_heap_stl_allocator_common<T, false>(x) { }
 
   mi_heap_stl_allocator select_on_container_copy_construction() const { return *this; }
@@ -547,7 +547,7 @@ template<class T1, class T2> bool operator!=(const mi_heap_stl_allocator<T1>& x,
 template<class T> struct mi_heap_destroy_stl_allocator : public _mi_heap_stl_allocator_common<T, true> {
   using typename _mi_heap_stl_allocator_common<T, true>::size_type;
   mi_heap_destroy_stl_allocator() : _mi_heap_stl_allocator_common<T, true>() { } // creates fresh heap that is destroyed when the destructor is called
-  mi_heap_destroy_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common<T, true>(hp) { }  // no delete nor destroy on the passed in heap 
+  mi_heap_destroy_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common<T, true>(hp) { }  // no delete nor destroy on the passed in heap
   template<class U> mi_heap_destroy_stl_allocator(const mi_heap_destroy_stl_allocator<U>& x) mi_attr_noexcept : _mi_heap_stl_allocator_common<T, true>(x) { }
 
   mi_heap_destroy_stl_allocator select_on_container_copy_construction() const { return *this; }
diff --git a/Include/mimalloc/mimalloc/atomic.h b/Include/mimalloc/mimalloc/atomic.h
index fe418fab314a40..c6b8146ffdb049 100644
--- a/Include/mimalloc/mimalloc/atomic.h
+++ b/Include/mimalloc/mimalloc/atomic.h
@@ -300,7 +300,7 @@ typedef _Atomic(uintptr_t) mi_atomic_once_t;
 
 // Returns true only on the first invocation
 static inline bool mi_atomic_once( mi_atomic_once_t* once ) {
-  if (mi_atomic_load_relaxed(once) != 0) return false;     // quick test 
+  if (mi_atomic_load_relaxed(once) != 0) return false;     // quick test
   uintptr_t expected = 0;
   return mi_atomic_cas_strong_acq_rel(once, &expected, (uintptr_t)1); // try to set to 1
 }
diff --git a/Include/mimalloc/mimalloc/internal.h b/Include/mimalloc/mimalloc/internal.h
index 00d2626095b195..f076bc6a40f977 100644
--- a/Include/mimalloc/mimalloc/internal.h
+++ b/Include/mimalloc/mimalloc/internal.h
@@ -88,7 +88,7 @@ void       _mi_thread_data_collect(void);
 
 // os.c
 void       _mi_os_init(void);                                            // called from process init
-void*      _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats);  
+void*      _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats);
 void       _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats);
 void       _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats);
 
@@ -424,7 +424,7 @@ static inline mi_slice_t* mi_page_to_slice(mi_page_t* p) {
 
 // Segment belonging to a page
 static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
-  mi_segment_t* segment = _mi_ptr_segment(page); 
+  mi_segment_t* segment = _mi_ptr_segment(page);
   mi_assert_internal(segment == NULL || ((mi_slice_t*)page >= segment->slices && (mi_slice_t*)page < segment->slices + segment->slice_entries));
   return segment;
 }
@@ -726,12 +726,12 @@ size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx);
 
 #define mi_commit_mask_foreach(cm,idx,count) \
   idx = 0; \
-  while ((count = _mi_commit_mask_next_run(cm,&idx)) > 0) { 
-        
+  while ((count = _mi_commit_mask_next_run(cm,&idx)) > 0) {
+
 #define mi_commit_mask_foreach_end() \
     idx += count; \
   }
-      
+
 
 
 /* -----------------------------------------------------------
diff --git a/Include/mimalloc/mimalloc/prim.h b/Include/mimalloc/mimalloc/prim.h
index 9e560696fa4783..4b9e4dc4194d77 100644
--- a/Include/mimalloc/mimalloc/prim.h
+++ b/Include/mimalloc/mimalloc/prim.h
@@ -35,10 +35,10 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config );
 
 // Free OS memory
 int _mi_prim_free(void* addr, size_t size );
-  
+
 // Allocate OS memory. Return NULL on error.
 // The `try_alignment` is just a hint and the returned pointer does not have to be aligned.
-// If `commit` is false, the virtual memory range only needs to be reserved (with no access) 
+// If `commit` is false, the virtual memory range only needs to be reserved (with no access)
 // which will later be committed explicitly using `_mi_prim_commit`.
 // `is_zero` is set to true if the memory was zero initialized (as on most OS's)
 // pre: !commit => !allow_large
@@ -82,11 +82,11 @@ mi_msecs_t _mi_prim_clock_now(void);
 typedef struct mi_process_info_s {
   mi_msecs_t  elapsed;
   mi_msecs_t  utime;
-  mi_msecs_t  stime; 
-  size_t      current_rss; 
-  size_t      peak_rss;  
+  mi_msecs_t  stime;
+  size_t      current_rss;
+  size_t      peak_rss;
   size_t      current_commit;
-  size_t      peak_commit; 
+  size_t      peak_commit;
   size_t      page_faults;
 } mi_process_info_t;
 
@@ -117,7 +117,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
 
 //-------------------------------------------------------------------
 // Thread id: `_mi_prim_thread_id()`
-// 
+//
 // Getting the thread id should be performant as it is called in the
 // fast path of `_mi_free` and we specialize for various platforms as
 // inlined definitions. Regular code should call `init.c:_mi_thread_id()`.
diff --git a/Include/mimalloc/mimalloc/track.h b/Include/mimalloc/mimalloc/track.h
index 9545f7507ac99f..fa1a048d846a9c 100644
--- a/Include/mimalloc/mimalloc/track.h
+++ b/Include/mimalloc/mimalloc/track.h
@@ -34,7 +34,7 @@ The corresponding `mi_track_free` still uses the block start pointer and origina
 The `mi_track_resize` is currently unused but could be called on reallocations within a block.
 `mi_track_init` is called at program start.
 
-The following macros are for tools like asan and valgrind to track whether memory is 
+The following macros are for tools like asan and valgrind to track whether memory is
 defined, undefined, or not accessible at all:
 
   #define mi_track_mem_defined(p,size)
@@ -94,7 +94,7 @@ defined, undefined, or not accessible at all:
 // no tracking
 
 #define MI_TRACK_ENABLED      0
-#define MI_TRACK_HEAP_DESTROY 0 
+#define MI_TRACK_HEAP_DESTROY 0
 #define MI_TRACK_TOOL         "none"
 
 #define mi_track_malloc_size(p,reqsize,size,zero)
diff --git a/Include/mimalloc/mimalloc/types.h b/Include/mimalloc/mimalloc/types.h
index 2005238a64e9ac..7616f37e4b978f 100644
--- a/Include/mimalloc/mimalloc/types.h
+++ b/Include/mimalloc/mimalloc/types.h
@@ -181,7 +181,7 @@ typedef int32_t  mi_ssize_t;
 
 #define MI_SMALL_OBJ_SIZE_MAX             (MI_SMALL_PAGE_SIZE/4)   // 8KiB on 64-bit
 #define MI_MEDIUM_OBJ_SIZE_MAX            (MI_MEDIUM_PAGE_SIZE/4)  // 128KiB on 64-bit
-#define MI_MEDIUM_OBJ_WSIZE_MAX           (MI_MEDIUM_OBJ_SIZE_MAX/MI_INTPTR_SIZE)   
+#define MI_MEDIUM_OBJ_WSIZE_MAX           (MI_MEDIUM_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
 #define MI_LARGE_OBJ_SIZE_MAX             (MI_SEGMENT_SIZE/2)      // 32MiB on 64-bit
 #define MI_LARGE_OBJ_WSIZE_MAX            (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
 
@@ -199,10 +199,10 @@ typedef int32_t  mi_ssize_t;
 #define MI_HUGE_BLOCK_SIZE                ((uint32_t)(2*MI_GiB))
 
 // blocks up to this size are always allocated aligned
-#define MI_MAX_ALIGN_GUARANTEE            (8*MI_MAX_ALIGN_SIZE)  
+#define MI_MAX_ALIGN_GUARANTEE            (8*MI_MAX_ALIGN_SIZE)
 
-// Alignments over MI_ALIGNMENT_MAX are allocated in dedicated huge page segments 
-#define MI_ALIGNMENT_MAX                  (MI_SEGMENT_SIZE >> 1)  
+// Alignments over MI_ALIGNMENT_MAX are allocated in dedicated huge page segments
+#define MI_ALIGNMENT_MAX                  (MI_SEGMENT_SIZE >> 1)
 
 
 // ------------------------------------------------------
@@ -291,7 +291,7 @@ typedef uintptr_t mi_thread_free_t;
 typedef struct mi_page_s {
   // "owned" by the segment
   uint32_t              slice_count;       // slices in this page (0 if not a page)
-  uint32_t              slice_offset;      // distance from the actual page data slice (0 if a page)  
+  uint32_t              slice_offset;      // distance from the actual page data slice (0 if a page)
   uint8_t               is_committed : 1;  // `true` if the page virtual memory is committed
   uint8_t               is_zero_init : 1;  // `true` if the page was initially zero initialized
 
@@ -345,7 +345,7 @@ typedef enum mi_segment_kind_e {
 // A segment holds a commit mask where a bit is set if
 // the corresponding MI_COMMIT_SIZE area is committed.
 // The MI_COMMIT_SIZE must be a multiple of the slice
-// size. If it is equal we have the most fine grained 
+// size. If it is equal we have the most fine grained
 // decommit (but setting it higher can be more efficient).
 // The MI_MINIMAL_COMMIT_SIZE is the minimal amount that will
 // be committed in one go which can be set higher than
@@ -353,9 +353,9 @@ typedef enum mi_segment_kind_e {
 // is still tracked in fine-grained MI_COMMIT_SIZE chunks)
 // ------------------------------------------------------
 
-#define MI_MINIMAL_COMMIT_SIZE      (1*MI_SEGMENT_SLICE_SIZE)            
+#define MI_MINIMAL_COMMIT_SIZE      (1*MI_SEGMENT_SLICE_SIZE)
 #define MI_COMMIT_SIZE              (MI_SEGMENT_SLICE_SIZE)              // 64KiB
-#define MI_COMMIT_MASK_BITS         (MI_SEGMENT_SIZE / MI_COMMIT_SIZE)  
+#define MI_COMMIT_MASK_BITS         (MI_SEGMENT_SIZE / MI_COMMIT_SIZE)
 #define MI_COMMIT_MASK_FIELD_BITS    MI_SIZE_BITS
 #define MI_COMMIT_MASK_FIELD_COUNT  (MI_COMMIT_MASK_BITS / MI_COMMIT_MASK_FIELD_BITS)
 
@@ -428,11 +428,11 @@ typedef struct mi_segment_s {
 
   // from here is zero initialized
   struct mi_segment_s* next;            // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`)
-  
+
   size_t            abandoned;          // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
   size_t            abandoned_visits;   // count how often this segment is visited in the abandoned list (to force reclaim it it is too long)
   size_t            used;               // count of pages in use
-  uintptr_t         cookie;             // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie`  
+  uintptr_t         cookie;             // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie`
 
   size_t            segment_slices;      // for huge segments this may be different from `MI_SLICES_PER_SEGMENT`
   size_t            segment_info_slices; // initial slices we are using segment info and possible guard pages.
@@ -503,7 +503,7 @@ struct mi_heap_s {
   mi_page_queue_t       pages[MI_BIN_FULL + 1];              // queue of pages for each size class (or "bin")
   _Atomic(mi_block_t*)  thread_delayed_free;
   mi_threadid_t         thread_id;                           // thread this heap belongs too
-  mi_arena_id_t         arena_id;                            // arena id if the heap belongs to a specific arena (or 0)  
+  mi_arena_id_t         arena_id;                            // arena id if the heap belongs to a specific arena (or 0)
   uintptr_t             cookie;                              // random cookie to verify pointers (see `_mi_ptr_cookie`)
   uintptr_t             keys[2];                             // two random keys used to encode the `thread_delayed_free` list
   mi_random_ctx_t       random;                              // random number context used for secure allocation
diff --git a/Objects/mimalloc/alloc-aligned.c b/Objects/mimalloc/alloc-aligned.c
index 1cd809f15d613c..4c15f4043ec117 100644
--- a/Objects/mimalloc/alloc-aligned.c
+++ b/Objects/mimalloc/alloc-aligned.c
@@ -47,7 +47,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t*
     oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size);
     p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block
     // zero afterwards as only the area from the aligned_p may be committed!
-    if (p == NULL) return NULL;    
+    if (p == NULL) return NULL;
   }
   else {
     // otherwise over-allocate
@@ -73,7 +73,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t*
   mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0);
   mi_assert_internal(mi_usable_size(aligned_p)>=size);
   mi_assert_internal(mi_usable_size(p) == mi_usable_size(aligned_p)+adjust);
-    
+
   // now zero the block if needed
   if (alignment > MI_ALIGNMENT_MAX) {
     // for the tracker, on huge aligned allocations only from the start of the large block is defined
@@ -85,7 +85,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t*
 
   if (p != aligned_p) {
     mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p));
-  }  
+  }
   return aligned_p;
 }
 
diff --git a/Objects/mimalloc/alloc.c b/Objects/mimalloc/alloc.c
index ffc1747d527d11..fb56f952994990 100644
--- a/Objects/mimalloc/alloc.c
+++ b/Objects/mimalloc/alloc.c
@@ -58,7 +58,7 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz
     }
     else {
       _mi_memzero_aligned(block, page->xblock_size - MI_PADDING_SIZE);
-    }    
+    }
   }
 
 #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN
@@ -113,7 +113,7 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap,
   if (size == 0) { size = sizeof(void*); }
   #endif
   mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE);
-  void* const p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE, zero);  
+  void* const p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE, zero);
   mi_track_malloc(p,size,zero);
   #if MI_STAT>1
   if (p != NULL) {
@@ -346,7 +346,7 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
 // only maintain stats for smaller objects if requested
 #if (MI_STAT>0)
 static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
-  #if (MI_STAT < 2)  
+  #if (MI_STAT < 2)
   MI_UNUSED(block);
   #endif
   mi_heap_t* const heap = mi_heap_get_default();
@@ -354,7 +354,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
   #if (MI_STAT>1)
   const size_t usize = mi_page_usable_size_of(page, block);
   mi_heap_stat_decrease(heap, malloc, usize);
-  #endif  
+  #endif
   if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) {
     mi_heap_stat_decrease(heap, normal, bsize);
     #if (MI_STAT > 1)
@@ -366,7 +366,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
   }
   else {
     mi_heap_stat_decrease(heap, huge, bsize);
-  }  
+  }
 }
 #else
 static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
@@ -405,7 +405,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
   // that is safe as these are constant and the page won't be freed (as the block is not freed yet).
   mi_check_padding(page, block);
   _mi_padding_shrink(page, block, sizeof(mi_block_t));       // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection
-  
+
   // huge page segments are always abandoned and can be freed immediately
   mi_segment_t* segment = _mi_page_segment(page);
   if (segment->kind == MI_SEGMENT_HUGE) {
@@ -421,7 +421,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
     _mi_segment_huge_page_reset(segment, page, block);
     #endif
   }
-  
+
   #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN        // note: when tracking, cannot use mi_usable_size with multi-threading
   if (segment->kind != MI_SEGMENT_HUGE) {                  // not for huge segments as we just reset the content
     memset(block, MI_DEBUG_FREED, mi_usable_size(block));
diff --git a/Objects/mimalloc/arena.c b/Objects/mimalloc/arena.c
index a04a04c8fd3111..f8883603860dce 100644
--- a/Objects/mimalloc/arena.c
+++ b/Objects/mimalloc/arena.c
@@ -13,7 +13,7 @@ threads and need to be accessed using atomic operations.
 
 Arenas are used to for huge OS page (1GiB) reservations or for reserving
 OS memory upfront which can be improve performance or is sometimes needed
-on embedded devices. We can also employ this with WASI or `sbrk` systems 
+on embedded devices. We can also employ this with WASI or `sbrk` systems
 to reserve large arenas upfront and be able to reuse the memory more effectively.
 
 The arena allocation needs to be thread safe and we use an atomic bitmap to allocate.
@@ -48,13 +48,13 @@ typedef struct mi_arena_s {
   size_t   meta_size;                     // size of the arena structure itself (including its bitmaps)
   mi_memid_t meta_memid;                  // memid of the arena structure itself (OS or static allocation)
   int      numa_node;                     // associated NUMA node
-  bool     exclusive;                     // only allow allocations if specifically for this arena  
+  bool     exclusive;                     // only allow allocations if specifically for this arena
   bool     is_large;                      // memory area consists of large- or huge OS pages (always committed)
   _Atomic(size_t) search_idx;             // optimization to start the search for free blocks
-  _Atomic(mi_msecs_t) purge_expire;       // expiration time when blocks should be decommitted from `blocks_decommit`.  
+  _Atomic(mi_msecs_t) purge_expire;       // expiration time when blocks should be decommitted from `blocks_decommit`.
   mi_bitmap_field_t* blocks_dirty;        // are the blocks potentially non-zero?
   mi_bitmap_field_t* blocks_committed;    // are the blocks committed? (can be NULL for memory that cannot be decommitted)
-  mi_bitmap_field_t* blocks_purge;        // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted)  
+  mi_bitmap_field_t* blocks_purge;        // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted)
   mi_bitmap_field_t  blocks_inuse[1];     // in-place bitmap of in-use blocks (of size `field_count`)
 } mi_arena_t;
 
@@ -103,7 +103,7 @@ bool _mi_arena_memid_is_os_allocated(mi_memid_t memid) {
 }
 
 /* -----------------------------------------------------------
-  Arena allocations get a (currently) 16-bit memory id where the 
+  Arena allocations get a (currently) 16-bit memory id where the
   lower 8 bits are the arena id, and the upper bits the block index.
 ----------------------------------------------------------- */
 
@@ -205,7 +205,7 @@ static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index
 {
   size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx);  // start from last search; ok to be relaxed as the exact start does not matter
   if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx)) {
-    mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx));  // start search from found location next time around    
+    mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx));  // start search from found location next time around
     return true;
   };
   return false;
@@ -225,7 +225,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
   mi_bitmap_index_t bitmap_index;
   if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index)) return NULL;
 
-  // claimed it! 
+  // claimed it!
   void* p = mi_arena_block_start(arena, bitmap_index);
   *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index);
   memid->is_pinned = arena->memid.is_pinned;
@@ -265,21 +265,21 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
     // no need to commit, but check if already fully committed
     memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
   }
-  
+
   return p;
 }
 
 // allocate in a speficic arena
-static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, 
-                                       bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) 
+static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment,
+                                       bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld )
 {
   MI_UNUSED_RELEASE(alignment);
   mi_assert_internal(alignment <= MI_SEGMENT_ALIGN);
-  const size_t bcount = mi_block_count_of_size(size);  
+  const size_t bcount = mi_block_count_of_size(size);
   const size_t arena_index = mi_arena_id_index(arena_id);
   mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count));
   mi_assert_internal(size <= mi_arena_block_size(bcount));
-  
+
   // Check arena suitability
   mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]);
   if (arena == NULL) return NULL;
@@ -299,7 +299,7 @@ static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_no
 
 
 // allocate from an arena with fallback to the OS
-static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, 
+static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment,
                                                   bool commit, bool allow_large,
                                                   mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld )
 {
@@ -307,9 +307,9 @@ static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, siz
   mi_assert_internal(alignment <= MI_SEGMENT_ALIGN);
   const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
   if mi_likely(max_arena == 0) return NULL;
-  
+
   if (req_arena_id != _mi_arena_id_none()) {
-    // try a specific arena if requested 
+    // try a specific arena if requested
     if (mi_arena_id_index(req_arena_id) < max_arena) {
       void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld);
       if (p != NULL) return p;
@@ -317,7 +317,7 @@ static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, siz
   }
   else {
     // try numa affine allocation
-    for (size_t i = 0; i < max_arena; i++) {    
+    for (size_t i = 0; i < max_arena; i++) {
       void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld);
       if (p != NULL) return p;
     }
@@ -345,22 +345,22 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re
   size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve);
   if (arena_reserve == 0) return false;
 
-  if (!_mi_os_has_virtual_reserve()) { 
+  if (!_mi_os_has_virtual_reserve()) {
     arena_reserve = arena_reserve/4;  // be conservative if virtual reserve is not supported (for some embedded systems for example)
   }
   arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE);
   if (arena_count >= 8 && arena_count <= 128) {
     arena_reserve = ((size_t)1<<(arena_count/8)) * arena_reserve;  // scale up the arena sizes exponentially
-  }    
+  }
   if (arena_reserve < req_size) return false;  // should be able to at least handle the current allocation size
-      
+
   // commit eagerly?
   bool arena_commit = false;
   if (mi_option_get(mi_option_arena_eager_commit) == 2)      { arena_commit = _mi_os_has_overcommit(); }
   else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; }
 
   return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive */, arena_id) == 0);
-}    
+}
 
 
 void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large,
@@ -375,9 +375,9 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset
   // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data)
   if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) {
     void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld);
-    if (p != NULL) return p;    
+    if (p != NULL) return p;
 
-    // otherwise, try to first eagerly reserve a new arena 
+    // otherwise, try to first eagerly reserve a new arena
     if (req_arena_id == _mi_arena_id_none()) {
       mi_arena_id_t arena_id = 0;
       if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) {
@@ -394,14 +394,14 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset
     errno = ENOMEM;
     return NULL;
   }
-   
+
   // finally, fall back to the OS
   if (align_offset > 0) {
     return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats);
   }
   else {
     return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats);
-  }  
+  }
 }
 
 void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld)
@@ -437,22 +437,22 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks,
   mi_assert_internal(arena->blocks_purge != NULL);
   mi_assert_internal(!arena->memid.is_pinned);
   const size_t size = mi_arena_block_size(blocks);
-  void* const p = mi_arena_block_start(arena, bitmap_idx); 
+  void* const p = mi_arena_block_start(arena, bitmap_idx);
   bool needs_recommit;
   if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) {
     // all blocks are committed, we can purge freely
     needs_recommit = _mi_os_purge(p, size, stats);
   }
   else {
-    // some blocks are not committed -- this can happen when a partially committed block is freed 
+    // some blocks are not committed -- this can happen when a partially committed block is freed
     // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge
-    // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), 
+    // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory),
     // and also undo the decommit stats (as it was already adjusted)
     mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits));
     needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats);
     _mi_stat_increase(&stats->committed, size);
   }
-  
+
   // clear the purged blocks
   _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx);
   // update committed bitmap
@@ -470,7 +470,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t
 
   if (_mi_preloading() || delay == 0) {
     // decommit directly
-    mi_arena_purge(arena, bitmap_idx, blocks, stats);    
+    mi_arena_purge(arena, bitmap_idx, blocks, stats);
   }
   else {
     // schedule decommit
@@ -512,7 +512,7 @@ static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx,
 }
 
 // returns true if anything was purged
-static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) 
+static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats)
 {
   if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false;
   mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire);
@@ -521,10 +521,10 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi
 
   // reset expire (if not already set concurrently)
   mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, 0);
-  
+
   // potential purges scheduled, walk through the bitmap
   bool any_purged = false;
-  bool full_purge = true;  
+  bool full_purge = true;
   for (size_t i = 0; i < arena->field_count; i++) {
     size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]);
     if (purge != 0) {
@@ -575,7 +575,7 @@ static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats )
 
   // allow only one thread to purge at a time
   static mi_atomic_guard_t purge_guard;
-  mi_atomic_guard(&purge_guard) 
+  mi_atomic_guard(&purge_guard)
   {
     mi_msecs_t now = _mi_clock_now();
     size_t max_purge_count = (visit_all ? max_arena : 1);
@@ -588,7 +588,7 @@ static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats )
         }
       }
     }
-  }  
+  }
 }
 
 
@@ -602,7 +602,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
   if (p==NULL) return;
   if (size==0) return;
   const bool all_committed = (committed_size == size);
-  
+
   if (mi_memkind_is_os(memid.memkind)) {
     // was a direct OS allocation, pass through
     if (!all_committed && committed_size > 0) {
@@ -620,7 +620,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
     mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]);
     mi_assert_internal(arena != NULL);
     const size_t blocks = mi_block_count_of_size(size);
-    
+
     // checks
     if (arena == NULL) {
       _mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
@@ -642,7 +642,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
     else {
       mi_assert_internal(arena->blocks_committed != NULL);
       mi_assert_internal(arena->blocks_purge != NULL);
-      
+
       if (!all_committed) {
         // mark the entire range as no longer committed (so we recommit the full range when re-using)
         _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
@@ -657,9 +657,9 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
         // works (as we should never reset decommitted parts).
       }
       // (delay) purge the entire range
-      mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats);      
+      mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats);
     }
-    
+
     // and make it available to others again
     bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx);
     if (!all_inuse) {
@@ -684,9 +684,9 @@ static void mi_arenas_unsafe_destroy(void) {
   for (size_t i = 0; i < max_arena; i++) {
     mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
     if (arena != NULL) {
-      if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) {      
+      if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) {
         mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL);
-        _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); 
+        _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main);
       }
       else {
         new_max_arena = i;
@@ -709,7 +709,7 @@ void _mi_arena_collect(bool force_purge, mi_stats_t* stats) {
 // for dynamic libraries that are unloaded and need to release all their allocated memory.
 void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) {
   mi_arenas_unsafe_destroy();
-  _mi_arena_collect(true /* force purge */, stats);  // purge non-owned arenas  
+  _mi_arena_collect(true /* force purge */, stats);  // purge non-owned arenas
 }
 
 // Is a pointer inside any of our arenas?
@@ -717,8 +717,8 @@ bool _mi_arena_contains(const void* p) {
   const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
   for (size_t i = 0; i < max_arena; i++) {
     mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
-    if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { 
-      return true;      
+    if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) {
+      return true;
     }
   }
   return false;
@@ -762,7 +762,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int
   mi_memid_t meta_memid;
   mi_arena_t* arena   = (mi_arena_t*)mi_arena_meta_zalloc(asize, &meta_memid, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
   if (arena == NULL) return false;
-  
+
   // already zero'd due to os_alloc
   // _mi_memzero(arena, asize);
   arena->id = _mi_arena_id_none();
@@ -779,12 +779,12 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int
   arena->search_idx   = 0;
   arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap
   arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap
-  arena->blocks_purge  = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after committed bitmap  
+  arena->blocks_purge  = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after committed bitmap
   // initialize committed bitmap?
   if (arena->blocks_committed != NULL && arena->memid.initially_committed) {
     memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning
   }
-  
+
   // and claim leftover blocks if needed (so we never allocate there)
   ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
   mi_assert_internal(post >= 0);
@@ -933,4 +933,3 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv
   if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
   return err;
 }
-
diff --git a/Objects/mimalloc/bitmap.c b/Objects/mimalloc/bitmap.c
index a13dbe15bf33e8..ec3c755822dac1 100644
--- a/Objects/mimalloc/bitmap.c
+++ b/Objects/mimalloc/bitmap.c
@@ -109,15 +109,15 @@ bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fiel
 }
 
 // Like _mi_bitmap_try_find_from_claim but with an extra predicate that must be fullfilled
-bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap_fields, 
-            const size_t start_field_idx, const size_t count, 
-            mi_bitmap_pred_fun_t pred_fun, void* pred_arg,            
+bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap_fields,
+            const size_t start_field_idx, const size_t count,
+            mi_bitmap_pred_fun_t pred_fun, void* pred_arg,
             mi_bitmap_index_t* bitmap_idx) {
   size_t idx = start_field_idx;
   for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
     if (idx >= bitmap_fields) idx = 0; // wrap
     if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
-      if (pred_fun == NULL || pred_fun(*bitmap_idx, pred_arg)) { 
+      if (pred_fun == NULL || pred_fun(*bitmap_idx, pred_arg)) {
         return true;
       }
       // predicate returned false, unclaim and look further
@@ -164,7 +164,7 @@ static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size
   return ((field & mask) == mask);
 }
 
-// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. 
+// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically.
 // Returns `true` if successful when all previous `count` bits were 0.
 bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
   const size_t idx = mi_bitmap_index_field(bitmap_idx);
@@ -172,9 +172,9 @@ bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count
   const size_t mask = mi_bitmap_mask_(count, bitidx);
   mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
   size_t expected = mi_atomic_load_relaxed(&bitmap[idx]);
-  do  {    
+  do  {
     if ((expected & mask) != 0) return false;
-  } 
+  }
   while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask));
   mi_assert_internal((expected & mask) == 0);
   return true;
@@ -212,7 +212,7 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit
   if (initial == 0)     return false;
   if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx);    // no need to cross fields (this case won't happen for us)
   if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries
-  
+
   // scan ahead
   size_t found = initial;
   size_t mask = 0;     // mask bits for the final field
diff --git a/Objects/mimalloc/bitmap.h b/Objects/mimalloc/bitmap.h
index 0a765c714bae09..9ba15d5d6f09ea 100644
--- a/Objects/mimalloc/bitmap.h
+++ b/Objects/mimalloc/bitmap.h
@@ -41,7 +41,7 @@ static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx
 }
 
 // Create a bit index.
-static inline mi_bitmap_index_t mi_bitmap_index_create_from_bit(size_t full_bitidx) {  
+static inline mi_bitmap_index_t mi_bitmap_index_create_from_bit(size_t full_bitidx) {
   return mi_bitmap_index_create(full_bitidx / MI_BITMAP_FIELD_BITS, full_bitidx % MI_BITMAP_FIELD_BITS);
 }
 
@@ -80,7 +80,7 @@ bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap
 // Returns `true` if all `count` bits were 1 previously.
 bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
 
-// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. 
+// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically.
 // Returns `true` if successful when all previous `count` bits were 0.
 bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
 
diff --git a/Objects/mimalloc/heap.c b/Objects/mimalloc/heap.c
index 58520ddf631ede..4eb622ed4bad76 100644
--- a/Objects/mimalloc/heap.c
+++ b/Objects/mimalloc/heap.c
@@ -32,7 +32,7 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void
   #if MI_DEBUG>1
   size_t total = heap->page_count;
   size_t count = 0;
-  #endif  
+  #endif
 
   for (size_t i = 0; i <= MI_BIN_FULL; i++) {
     mi_page_queue_t* pq = &heap->pages[i];
@@ -120,11 +120,11 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
 {
   if (heap==NULL || !mi_heap_is_initialized(heap)) return;
 
-  const bool force = collect >= MI_FORCE;  
+  const bool force = collect >= MI_FORCE;
   _mi_deferred_free(heap, force);
 
-  // note: never reclaim on collect but leave it to threads that need storage to reclaim 
-  const bool force_main = 
+  // note: never reclaim on collect but leave it to threads that need storage to reclaim
+  const bool force_main =
     #ifdef NDEBUG
       collect == MI_FORCE
     #else
diff --git a/Objects/mimalloc/init.c b/Objects/mimalloc/init.c
index 24e8333f42d3e5..7dfa7657737117 100644
--- a/Objects/mimalloc/init.c
+++ b/Objects/mimalloc/init.c
@@ -247,7 +247,7 @@ static mi_thread_data_t* mi_thread_data_zalloc(void) {
       is_zero = memid.initially_zero;
     }
   }
-  
+
   if (td != NULL && !is_zero) {
     _mi_memzero_aligned(td, sizeof(*td));
   }
@@ -426,23 +426,23 @@ void mi_thread_done(void) mi_attr_noexcept {
   _mi_thread_done(NULL);
 }
 
-void _mi_thread_done(mi_heap_t* heap) 
+void _mi_thread_done(mi_heap_t* heap)
 {
   // calling with NULL implies using the default heap
-  if (heap == NULL) { 
-    heap = mi_prim_get_default_heap(); 
+  if (heap == NULL) {
+    heap = mi_prim_get_default_heap();
     if (heap == NULL) return;
   }
 
   // prevent re-entrancy through heap_done/heap_set_default_direct (issue #699)
   if (!mi_heap_is_initialized(heap)) {
-    return; 
+    return;
   }
 
   // adjust stats
   mi_atomic_decrement_relaxed(&thread_count);
   _mi_stat_decrease(&_mi_stats_main.threads, 1);
-  
+
   // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps...
   if (heap->thread_id != _mi_thread_id()) return;
 
@@ -464,7 +464,7 @@ void _mi_heap_set_default_direct(mi_heap_t* heap)  {
 
   // ensure the default heap is passed to `_mi_thread_done`
   // setting to a non-NULL value also ensures `mi_thread_done` is called.
-  _mi_prim_thread_associate_default_heap(heap);    
+  _mi_prim_thread_associate_default_heap(heap);
 }
 
 
@@ -624,7 +624,7 @@ static void mi_cdecl mi_process_done(void) {
 
   // release any thread specific resources and ensure _mi_thread_done is called on all but the main thread
   _mi_prim_thread_done_auto_done();
-  
+
   #ifndef MI_SKIP_COLLECT_ON_EXIT
     #if (MI_DEBUG || !defined(MI_SHARED_LIB))
     // free all memory if possible on process exit. This is not needed for a stand-alone process
diff --git a/Objects/mimalloc/os.c b/Objects/mimalloc/os.c
index 4928186b980f26..239790f77c574b 100644
--- a/Objects/mimalloc/os.c
+++ b/Objects/mimalloc/os.c
@@ -29,7 +29,7 @@ bool _mi_os_has_overcommit(void) {
   return mi_os_mem_config.has_overcommit;
 }
 
-bool _mi_os_has_virtual_reserve(void) { 
+bool _mi_os_has_virtual_reserve(void) {
   return mi_os_mem_config.has_virtual_reserve;
 }
 
@@ -173,7 +173,7 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me
     }
   }
   else {
-    // nothing to do 
+    // nothing to do
     mi_assert(memid.memkind < MI_MEM_OS);
   }
 }
@@ -197,7 +197,7 @@ static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bo
   if (try_alignment == 0) { try_alignment = 1; } // avoid 0 to ensure there will be no divide by zero when aligning
 
   *is_zero = false;
-  void* p = NULL; 
+  void* p = NULL;
   int err = _mi_prim_alloc(size, try_alignment, commit, allow_large, is_large, is_zero, &p);
   if (err != 0) {
     _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, size, try_alignment, commit, allow_large);
@@ -205,14 +205,14 @@ static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bo
   mi_stat_counter_increase(stats->mmap_calls, 1);
   if (p != NULL) {
     _mi_stat_increase(&stats->reserved, size);
-    if (commit) { 
-      _mi_stat_increase(&stats->committed, size); 
+    if (commit) {
+      _mi_stat_increase(&stats->committed, size);
       // seems needed for asan (or `mimalloc-test-api` fails)
       #ifdef MI_TRACK_ASAN
       if (*is_zero) { mi_track_mem_defined(p,size); }
                else { mi_track_mem_undefined(p,size); }
       #endif
-    }    
+    }
   }
   return p;
 }
@@ -249,7 +249,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
       // over-allocate uncommitted (virtual) memory
       p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero, stats);
       if (p == NULL) return NULL;
-      
+
       // set p to the aligned part in the full region
       // note: this is dangerous on Windows as VirtualFree needs the actual base pointer
       // this is handled though by having the `base` field in the memid's
@@ -265,7 +265,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
       // overallocate...
       p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero, stats);
       if (p == NULL) return NULL;
-      
+
       // and selectively unmap parts around the over-allocated area. (noop on sbrk)
       void* aligned_p = mi_align_up_ptr(p, alignment);
       size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
@@ -276,7 +276,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
       if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); }
       // we can return the aligned pointer on `mmap` (and sbrk) systems
       p = aligned_p;
-      *base = aligned_p; // since we freed the pre part, `*base == p`.      
+      *base = aligned_p; // since we freed the pre part, `*base == p`.
     }
   }
 
@@ -300,7 +300,7 @@ void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* tld_stats) {
   void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero, stats);
   if (p != NULL) {
     *memid = _mi_memid_create_os(true, os_is_zero, os_is_large);
-  }  
+  }
   return p;
 }
 
@@ -312,7 +312,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
   if (size == 0) return NULL;
   size = _mi_os_good_alloc_size(size);
   alignment = _mi_align_up(alignment, _mi_os_page_size());
-  
+
   bool os_is_large = false;
   bool os_is_zero  = false;
   void* os_base = NULL;
@@ -390,7 +390,7 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t*
 
 bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
   MI_UNUSED(tld_stats);
-  mi_stats_t* stats = &_mi_stats_main;  
+  mi_stats_t* stats = &_mi_stats_main;
   if (is_zero != NULL) { *is_zero = false; }
   _mi_stat_increase(&stats->committed, size);  // use size for precise commit vs. decommit
   _mi_stat_counter_increase(&stats->commit_calls, 1);
@@ -400,21 +400,21 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats
   void* start = mi_os_page_align_areax(false /* conservative? */, addr, size, &csize);
   if (csize == 0) return true;
 
-  // commit  
+  // commit
   bool os_is_zero = false;
-  int err = _mi_prim_commit(start, csize, &os_is_zero); 
+  int err = _mi_prim_commit(start, csize, &os_is_zero);
   if (err != 0) {
     _mi_warning_message("cannot commit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize);
     return false;
   }
-  if (os_is_zero && is_zero != NULL) { 
+  if (os_is_zero && is_zero != NULL) {
     *is_zero = true;
     mi_assert_expensive(mi_mem_is_zero(start, csize));
   }
   // note: the following seems required for asan (otherwise `mimalloc-test-stress` fails)
   #ifdef MI_TRACK_ASAN
   if (os_is_zero) { mi_track_mem_defined(start,csize); }
-             else { mi_track_mem_undefined(start,csize); } 
+             else { mi_track_mem_undefined(start,csize); }
   #endif
   return true;
 }
@@ -428,11 +428,11 @@ static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_
   // page align
   size_t csize;
   void* start = mi_os_page_align_area_conservative(addr, size, &csize);
-  if (csize == 0) return true; 
+  if (csize == 0) return true;
 
   // decommit
   *needs_recommit = true;
-  int err = _mi_prim_decommit(start,csize,needs_recommit);  
+  int err = _mi_prim_decommit(start,csize,needs_recommit);
   if (err != 0) {
     _mi_warning_message("cannot decommit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize);
   }
@@ -450,7 +450,7 @@ bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) {
 // but may be used later again. This will release physical memory
 // pages and reduce swapping while keeping the memory committed.
 // We page align to a conservative area inside the range to reset.
-bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { 
+bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) {
   // page align conservatively within the range
   size_t csize;
   void* start = mi_os_page_align_area_conservative(addr, size, &csize);
@@ -470,7 +470,7 @@ bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) {
 }
 
 
-// either resets or decommits memory, returns true if the memory needs 
+// either resets or decommits memory, returns true if the memory needs
 // to be recommitted if it is to be re-used later on.
 bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats)
 {
@@ -483,7 +483,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats)
   {
     bool needs_recommit = true;
     mi_os_decommit_ex(p, size, &needs_recommit, stats);
-    return needs_recommit;   
+    return needs_recommit;
   }
   else {
     if (allow_reset) {  // this can sometimes be not allowed if the range is not fully committed
@@ -493,7 +493,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats)
   }
 }
 
-// either resets or decommits memory, returns true if the memory needs 
+// either resets or decommits memory, returns true if the memory needs
 // to be recommitted if it is to be re-used later on.
 bool _mi_os_purge(void* p, size_t size, mi_stats_t * stats) {
   return _mi_os_purge_ex(p, size, true, stats);
diff --git a/Objects/mimalloc/page.c b/Objects/mimalloc/page.c
index 8ac0a715e76e5a..4610cf27afff75 100644
--- a/Objects/mimalloc/page.c
+++ b/Objects/mimalloc/page.c
@@ -125,9 +125,9 @@ bool _mi_page_is_valid(mi_page_t* page) {
 
     mi_assert_internal(!_mi_process_is_initialized || segment->thread_id==0 || segment->thread_id == mi_page_heap(page)->thread_id);
     #if MI_HUGE_PAGE_ABANDON
-    if (segment->kind != MI_SEGMENT_HUGE) 
+    if (segment->kind != MI_SEGMENT_HUGE)
     #endif
-    {    
+    {
       mi_page_queue_t* pq = mi_page_queue_of(page);
       mi_assert_internal(mi_page_queue_contains(pq, page));
       mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_in_full(page));
@@ -263,7 +263,7 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
   #if MI_HUGE_PAGE_ABANDON
   mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
   #endif
-  
+
   // TODO: push on full queue immediately if it is full?
   mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page));
   mi_page_queue_push(heap, pq, page);
@@ -441,7 +441,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
   mi_assert_internal(page != NULL);
   mi_assert_expensive(_mi_page_is_valid(page));
   mi_assert_internal(mi_page_all_free(page));
-  
+
   mi_page_set_has_aligned(page, false);
 
   // don't retire too often..
@@ -454,7 +454,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
   if mi_likely(page->xblock_size <= MI_MAX_RETIRE_SIZE && !mi_page_queue_is_special(pq)) {  // not too large && not full or huge queue?
     if (pq->last==page && pq->first==page) { // the only page in the queue?
       mi_stat_counter_increase(_mi_stats_main.page_no_retire,1);
-      page->retire_expire = 1 + (page->xblock_size <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);      
+      page->retire_expire = 1 + (page->xblock_size <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);
       mi_heap_t* heap = mi_page_heap(page);
       mi_assert_internal(pq >= heap->pages);
       const size_t index = pq - heap->pages;
@@ -608,7 +608,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co
 // allocations but this did not speed up any benchmark (due to an
 // extra test in malloc? or cache effects?)
 static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) {
-  MI_UNUSED(tld); 
+  MI_UNUSED(tld);
   mi_assert_expensive(mi_page_is_valid_init(page));
   #if (MI_SECURE<=2)
   mi_assert(page->free == NULL);
@@ -681,7 +681,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
     mi_assert_expensive(mi_mem_is_zero(page_start, page_size));
   }
   #endif
-  
+
   mi_assert_internal(page->is_committed);
   mi_assert_internal(page->capacity == 0);
   mi_assert_internal(page->free == NULL);
@@ -718,7 +718,7 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
   while (page != NULL)
   {
     mi_page_t* next = page->next; // remember next
-    #if MI_STAT    
+    #if MI_STAT
     count++;
     #endif
 
@@ -838,19 +838,19 @@ static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t
   mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment);
   if (page != NULL) {
     mi_assert_internal(mi_page_immediate_available(page));
-    
+
     if (is_huge) {
       mi_assert_internal(_mi_page_segment(page)->kind == MI_SEGMENT_HUGE);
       mi_assert_internal(_mi_page_segment(page)->used==1);
       #if MI_HUGE_PAGE_ABANDON
       mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue
       mi_page_set_heap(page, NULL);
-      #endif      
+      #endif
     }
     else {
       mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
     }
-    
+
     const size_t bsize = mi_page_usable_block_size(page);  // note: not `mi_page_block_size` to account for padding
     if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
       mi_heap_stat_increase(heap, large, bsize);
@@ -869,7 +869,7 @@ static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t
 // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
 static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept {
   // huge allocation?
-  const size_t req_size = size - MI_PADDING_SIZE;  // correct for padding_size in case of an overflow on `size`  
+  const size_t req_size = size - MI_PADDING_SIZE;  // correct for padding_size in case of an overflow on `size`
   if mi_unlikely(req_size > (MI_MEDIUM_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) {
     if mi_unlikely(req_size > PTRDIFF_MAX) {  // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
       _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size);
@@ -882,7 +882,7 @@ static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignme
   else {
     // otherwise find a page with free blocks in our size segregated queues
     #if MI_PADDING
-    mi_assert_internal(size >= MI_PADDING_SIZE); 
+    mi_assert_internal(size >= MI_PADDING_SIZE);
     #endif
     return mi_find_free_page(heap, size);
   }
@@ -898,7 +898,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al
 
   // initialize if necessary
   if mi_unlikely(!mi_heap_is_initialized(heap)) {
-    heap = mi_heap_get_default(); // calls mi_thread_init 
+    heap = mi_heap_get_default(); // calls mi_thread_init
     if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; }
   }
   mi_assert_internal(mi_heap_is_initialized(heap));
diff --git a/Objects/mimalloc/prim/unix/prim.c b/Objects/mimalloc/prim/unix/prim.c
index 6ca8d12c40f833..a9555eb09e11f6 100644
--- a/Objects/mimalloc/prim/unix/prim.c
+++ b/Objects/mimalloc/prim/unix/prim.c
@@ -57,7 +57,7 @@ terms of the MIT license. A copy of the license can be found in the file
 
 //------------------------------------------------------------------------------------
 // Use syscalls for some primitives to allow for libraries that override open/read/close etc.
-// and do allocation themselves; using syscalls prevents recursion when mimalloc is 
+// and do allocation themselves; using syscalls prevents recursion when mimalloc is
 // still initializing (issue #713)
 //------------------------------------------------------------------------------------
 
@@ -120,7 +120,7 @@ static bool unix_detect_overcommit(void) {
     os_overcommit = (val != 0);
   }
 #else
-  // default: overcommit is true  
+  // default: overcommit is true
 #endif
   return os_overcommit;
 }
@@ -168,12 +168,12 @@ static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int p
     size_t n = mi_bsr(try_alignment);
     if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) {  // alignment is a power of 2 and 4096 <= alignment <= 1GiB
       p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0);
-      if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { 
+      if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) {
         int err = errno;
         _mi_warning_message("unable to directly request aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, addr);
       }
       if (p!=MAP_FAILED) return p;
-      // fall back to regular mmap      
+      // fall back to regular mmap
     }
   }
   #elif defined(MAP_ALIGN)  // Solaris
@@ -189,7 +189,7 @@ static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int p
     void* hint = _mi_os_get_aligned_hint(try_alignment, size);
     if (hint != NULL) {
       p = mmap(hint, size, protect_flags, flags, fd, 0);
-      if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { 
+      if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) {
         #if MI_TRACK_ENABLED  // asan sometimes does not instrument errno correctly?
         int err = 0;
         #else
@@ -198,7 +198,7 @@ static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int p
         _mi_warning_message("unable to directly request hinted aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, hint);
       }
       if (p!=MAP_FAILED) return p;
-      // fall back to regular mmap      
+      // fall back to regular mmap
     }
   }
   #endif
@@ -327,9 +327,9 @@ int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_la
   mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
   mi_assert_internal(commit || !allow_large);
   mi_assert_internal(try_alignment > 0);
-  
+
   *is_zero = true;
-  int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);  
+  int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
   *addr = unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large);
   return (*addr != NULL ? 0 : errno);
 }
@@ -357,19 +357,19 @@ int _mi_prim_commit(void* start, size_t size, bool* is_zero) {
   // was either from mmap PROT_NONE, or from decommit MADV_DONTNEED, but
   // we sometimes call commit on a range with still partially committed
   // memory and `mprotect` does not zero the range.
-  *is_zero = false;  
+  *is_zero = false;
   int err = mprotect(start, size, (PROT_READ | PROT_WRITE));
-  if (err != 0) { 
-    err = errno; 
+  if (err != 0) {
+    err = errno;
     unix_mprotect_hint(err);
   }
   return err;
 }
 
 int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
-  int err = 0;  
+  int err = 0;
   // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
-  err = unix_madvise(start, size, MADV_DONTNEED);    
+  err = unix_madvise(start, size, MADV_DONTNEED);
   #if !MI_DEBUG && !MI_SECURE
     *needs_recommit = false;
   #else
@@ -381,15 +381,15 @@ int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
   *needs_recommit = true;
   const int fd = unix_mmap_fd();
   void* p = mmap(start, size, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0);
-  if (p != start) { err = errno; }    
+  if (p != start) { err = errno; }
   */
   return err;
 }
 
 int _mi_prim_reset(void* start, size_t size) {
-  // We try to use `MADV_FREE` as that is the fastest. A drawback though is that it 
+  // We try to use `MADV_FREE` as that is the fastest. A drawback though is that it
   // will not reduce the `rss` stats in tools like `top` even though the memory is available
-  // to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by 
+  // to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by
   // default `MADV_DONTNEED` is used though.
   #if defined(MADV_FREE)
   static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE);
@@ -409,7 +409,7 @@ int _mi_prim_reset(void* start, size_t size) {
 
 int _mi_prim_protect(void* start, size_t size, bool protect) {
   int err = mprotect(start, size, protect ? PROT_NONE : (PROT_READ | PROT_WRITE));
-  if (err != 0) { err = errno; }  
+  if (err != 0) { err = errno; }
   unix_mprotect_hint(err);
   return err;
 }
@@ -450,7 +450,7 @@ int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bo
     if (err != 0) {
       err = errno;
       _mi_warning_message("failed to bind huge (1GiB) pages to numa node %d (error: %d (0x%x))\n", numa_node, err, err);
-    }    
+    }
   }
   return (*addr != NULL ? 0 : errno);
 }
@@ -567,9 +567,9 @@ mi_msecs_t _mi_prim_clock_now(void) {
 // low resolution timer
 mi_msecs_t _mi_prim_clock_now(void) {
   #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0)
-  return (mi_msecs_t)clock();  
+  return (mi_msecs_t)clock();
   #elif (CLOCKS_PER_SEC < 1000)
-  return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC);  
+  return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC);
   #else
   return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000);
   #endif
@@ -609,7 +609,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo)
   pinfo->stime = timeval_secs(&rusage.ru_stime);
 #if !defined(__HAIKU__)
   pinfo->page_faults = rusage.ru_majflt;
-#endif  
+#endif
 #if defined(__HAIKU__)
   // Haiku does not have (yet?) a way to
   // get these stats per process
@@ -750,7 +750,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) {
 
 #elif defined(__ANDROID__) || defined(__DragonFly__) || \
       defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
-      defined(__sun) 
+      defined(__sun)
 
 #include <stdlib.h>
 bool _mi_prim_random_buf(void* buf, size_t buf_len) {
@@ -842,7 +842,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
   }
 }
 
-#else 
+#else
 
 void _mi_prim_thread_init_auto_done(void) {
   // nothing
diff --git a/Objects/mimalloc/prim/wasi/prim.c b/Objects/mimalloc/prim/wasi/prim.c
index 50511f0b5bf0b4..25e64fa3c40bf5 100644
--- a/Objects/mimalloc/prim/wasi/prim.c
+++ b/Objects/mimalloc/prim/wasi/prim.c
@@ -19,7 +19,7 @@ terms of the MIT license. A copy of the license can be found in the file
 void _mi_prim_mem_init( mi_os_mem_config_t* config ) {
   config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB
   config->alloc_granularity = 16;
-  config->has_overcommit = false;  
+  config->has_overcommit = false;
   config->must_free_whole = true;
   config->has_virtual_reserve = false;
 }
@@ -129,7 +129,7 @@ int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_la
 //---------------------------------------------
 
 int _mi_prim_commit(void* addr, size_t size, bool* is_zero) {
-  MI_UNUSED(addr); MI_UNUSED(size); 
+  MI_UNUSED(addr); MI_UNUSED(size);
   *is_zero = false;
   return 0;
 }
@@ -194,9 +194,9 @@ mi_msecs_t _mi_prim_clock_now(void) {
 // low resolution timer
 mi_msecs_t _mi_prim_clock_now(void) {
   #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0)
-  return (mi_msecs_t)clock();  
+  return (mi_msecs_t)clock();
   #elif (CLOCKS_PER_SEC < 1000)
-  return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC);  
+  return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC);
   #else
   return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000);
   #endif
diff --git a/Objects/mimalloc/prim/windows/prim.c b/Objects/mimalloc/prim/windows/prim.c
index e6b61079299cbd..a038277ad21cb0 100644
--- a/Objects/mimalloc/prim/windows/prim.c
+++ b/Objects/mimalloc/prim/windows/prim.c
@@ -276,7 +276,7 @@ int _mi_prim_commit(void* addr, size_t size, bool* is_zero) {
   return 0;
 }
 
-int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) {  
+int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) {
   BOOL ok = VirtualFree(addr, size, MEM_DECOMMIT);
   *needs_recommit = true;  // for safety, assume always decommitted even in the case of an error.
   return (ok ? 0 : (int)GetLastError());
@@ -451,7 +451,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo)
   GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut);
   pinfo->utime = filetime_msecs(&ut);
   pinfo->stime = filetime_msecs(&st);
-  
+
   // load psapi on demand
   if (pGetProcessMemoryInfo == NULL) {
     HINSTANCE hDll = LoadLibrary(TEXT("psapi.dll"));
@@ -465,7 +465,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo)
   memset(&info, 0, sizeof(info));
   if (pGetProcessMemoryInfo != NULL) {
     pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
-  } 
+  }
   pinfo->current_rss    = (size_t)info.WorkingSetSize;
   pinfo->peak_rss       = (size_t)info.PeakWorkingSetSize;
   pinfo->current_commit = (size_t)info.PagefileUsage;
@@ -477,7 +477,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo)
 // Output
 //----------------------------------------------------------------
 
-void _mi_prim_out_stderr( const char* msg ) 
+void _mi_prim_out_stderr( const char* msg )
 {
   // on windows with redirection, the C runtime cannot handle locale dependent output
   // after the main thread closes so we use direct console output.
@@ -560,7 +560,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) {
     }
     if (pBCryptGenRandom == NULL) return false;
   }
-  return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0);  
+  return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0);
 }
 
 #endif  // MI_USE_RTLGENRANDOM
@@ -595,9 +595,9 @@ void _mi_prim_thread_init_auto_done(void) {
 }
 
 void _mi_prim_thread_done_auto_done(void) {
-  // call thread-done on all threads (except the main thread) to prevent 
+  // call thread-done on all threads (except the main thread) to prevent
   // dangling callback pointer if statically linked with a DLL; Issue #208
-  FlsFree(mi_fls_key);  
+  FlsFree(mi_fls_key);
 }
 
 void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
diff --git a/Objects/mimalloc/random.c b/Objects/mimalloc/random.c
index 4fc8b2f8fb0bc3..2a18b5aa992dad 100644
--- a/Objects/mimalloc/random.c
+++ b/Objects/mimalloc/random.c
@@ -160,7 +160,7 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim
 
 uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
   uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random
-  x ^= _mi_prim_clock_now();  
+  x ^= _mi_prim_clock_now();
   // and do a few randomization steps
   uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
   for (uintptr_t i = 0; i < max; i++) {
diff --git a/Objects/mimalloc/segment-map.c b/Objects/mimalloc/segment-map.c
index 4c2104bd8154d4..3cd2127e56c1a7 100644
--- a/Objects/mimalloc/segment-map.c
+++ b/Objects/mimalloc/segment-map.c
@@ -102,8 +102,8 @@ static mi_segment_t* _mi_segment_of(const void* p) {
     uintptr_t lomask = mask;
     loindex = index;
     do {
-      loindex--;  
-      lomask = mi_atomic_load_relaxed(&mi_segment_map[loindex]);      
+      loindex--;
+      lomask = mi_atomic_load_relaxed(&mi_segment_map[loindex]);
     } while (lomask != 0 && loindex > 0);
     if (lomask == 0) return NULL;
     lobitidx = mi_bsr(lomask);    // lomask != 0
diff --git a/Objects/mimalloc/segment.c b/Objects/mimalloc/segment.c
index 28685f21c10060..033e0f97c36c14 100644
--- a/Objects/mimalloc/segment.c
+++ b/Objects/mimalloc/segment.c
@@ -17,7 +17,7 @@ static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t*
 
 
 // -------------------------------------------------------------------
-// commit mask 
+// commit mask
 // -------------------------------------------------------------------
 
 static bool mi_commit_mask_all_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) {
@@ -331,7 +331,7 @@ static uint8_t* _mi_segment_page_start_from_slice(const mi_segment_t* segment, c
 uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size)
 {
   const mi_slice_t* slice = mi_page_to_slice((mi_page_t*)page);
-  uint8_t* p = _mi_segment_page_start_from_slice(segment, slice, page->xblock_size, page_size);  
+  uint8_t* p = _mi_segment_page_start_from_slice(segment, slice, page->xblock_size, page_size);
   mi_assert_internal(page->xblock_size > 0 || _mi_ptr_page(p) == page);
   mi_assert_internal(_mi_ptr_segment(p) == segment);
   return p;
@@ -342,7 +342,7 @@ static size_t mi_segment_calculate_slices(size_t required, size_t* pre_size, siz
   size_t page_size = _mi_os_page_size();
   size_t isize     = _mi_align_up(sizeof(mi_segment_t), page_size);
   size_t guardsize = 0;
-  
+
   if (MI_SECURE>0) {
     // in secure mode, we set up a protected page in between the segment info
     // and the page data (and one at the end of the segment)
@@ -355,7 +355,7 @@ static size_t mi_segment_calculate_slices(size_t required, size_t* pre_size, siz
   if (pre_size != NULL) *pre_size = isize;
   isize = _mi_align_up(isize + guardsize, MI_SEGMENT_SLICE_SIZE);
   if (info_slices != NULL) *info_slices = isize / MI_SEGMENT_SLICE_SIZE;
-  size_t segment_size = (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + guardsize, MI_SEGMENT_SLICE_SIZE) );  
+  size_t segment_size = (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + guardsize, MI_SEGMENT_SLICE_SIZE) );
   mi_assert_internal(segment_size % MI_SEGMENT_SLICE_SIZE == 0);
   return (segment_size / MI_SEGMENT_SLICE_SIZE);
 }
@@ -391,7 +391,7 @@ static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) {
 
   // purge delayed decommits now? (no, leave it to the arena)
   // mi_segment_try_purge(segment,true,tld->stats);
-  
+
   const size_t size = mi_segment_size(segment);
   const size_t csize = _mi_commit_mask_committed_size(&segment->commit_mask, size);
 
@@ -399,7 +399,7 @@ static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) {
   _mi_arena_free(segment, mi_segment_size(segment), csize, segment->memid, tld->stats);
 }
 
-// called by threads that are terminating 
+// called by threads that are terminating
 void _mi_segment_thread_collect(mi_segments_tld_t* tld) {
   MI_UNUSED(tld);
   // nothing to do
@@ -451,7 +451,7 @@ static void mi_segment_commit_mask(mi_segment_t* segment, bool conservative, uin
 
   size_t bitidx = start / MI_COMMIT_SIZE;
   mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS);
-  
+
   size_t bitcount = *full_size / MI_COMMIT_SIZE; // can be 0
   if (bitidx + bitcount > MI_COMMIT_MASK_BITS) {
     _mi_warning_message("commit mask overflow: idx=%zu count=%zu start=%zx end=%zx p=0x%p size=%zu fullsize=%zu\n", bitidx, bitcount, start, end, p, size, *full_size);
@@ -479,7 +479,7 @@ static bool mi_segment_commit(mi_segment_t* segment, uint8_t* p, size_t size, mi
     if (!_mi_os_commit(start, full_size, &is_zero, stats)) return false;
     mi_commit_mask_set(&segment->commit_mask, &mask);
   }
-  
+
   // increase purge expiration when using part of delayed purges -- we assume more allocations are coming soon.
   if (mi_commit_mask_any_set(&segment->purge_mask, &mask)) {
     segment->purge_expire = _mi_clock_now() + mi_option_get(mi_option_purge_delay);
@@ -498,7 +498,7 @@ static bool mi_segment_ensure_committed(mi_segment_t* segment, uint8_t* p, size_
   return mi_segment_commit(segment, p, size, stats);
 }
 
-static bool mi_segment_purge(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {    
+static bool mi_segment_purge(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {
   mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask));
   if (!segment->allow_purge) return true;
 
@@ -517,11 +517,11 @@ static bool mi_segment_purge(mi_segment_t* segment, uint8_t* p, size_t size, mi_
     if (decommitted) {
       mi_commit_mask_t cmask;
       mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);
-      _mi_stat_increase(&_mi_stats_main.committed, full_size - _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for double counting 
+      _mi_stat_increase(&_mi_stats_main.committed, full_size - _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for double counting
       mi_commit_mask_clear(&segment->commit_mask, &mask);
-    }        
+    }
   }
-  
+
   // always clear any scheduled purges in our range
   mi_commit_mask_clear(&segment->purge_mask, &mask);
   return true;
@@ -537,16 +537,16 @@ static void mi_segment_schedule_purge(mi_segment_t* segment, uint8_t* p, size_t
     // register for future purge in the purge mask
     uint8_t* start = NULL;
     size_t   full_size = 0;
-    mi_commit_mask_t mask; 
+    mi_commit_mask_t mask;
     mi_segment_commit_mask(segment, true /*conservative*/, p, size, &start, &full_size, &mask);
     if (mi_commit_mask_is_empty(&mask) || full_size==0) return;
-    
+
     // update delayed commit
-    mi_assert_internal(segment->purge_expire > 0 || mi_commit_mask_is_empty(&segment->purge_mask));      
+    mi_assert_internal(segment->purge_expire > 0 || mi_commit_mask_is_empty(&segment->purge_mask));
     mi_commit_mask_t cmask;
     mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);  // only purge what is committed; span_free may try to decommit more
     mi_commit_mask_set(&segment->purge_mask, &cmask);
-    mi_msecs_t now = _mi_clock_now();    
+    mi_msecs_t now = _mi_clock_now();
     if (segment->purge_expire == 0) {
       // no previous purgess, initialize now
       segment->purge_expire = now + mi_option_get(mi_option_purge_delay);
@@ -564,7 +564,7 @@ static void mi_segment_schedule_purge(mi_segment_t* segment, uint8_t* p, size_t
       // previous purge mask is not yet expired, increase the expiration by a bit.
       segment->purge_expire += mi_option_get(mi_option_purge_extend_delay);
     }
-  }  
+  }
 }
 
 static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t* stats) {
@@ -602,7 +602,7 @@ static bool mi_segment_is_abandoned(mi_segment_t* segment) {
 // note: can be called on abandoned segments
 static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size_t slice_count, bool allow_purge, mi_segments_tld_t* tld) {
   mi_assert_internal(slice_index < segment->slice_entries);
-  mi_span_queue_t* sq = (segment->kind == MI_SEGMENT_HUGE || mi_segment_is_abandoned(segment) 
+  mi_span_queue_t* sq = (segment->kind == MI_SEGMENT_HUGE || mi_segment_is_abandoned(segment)
                           ? NULL : mi_span_queue_for(slice_count,tld));
   if (slice_count==0) slice_count = 1;
   mi_assert_internal(slice_index + slice_count - 1 < segment->slice_entries);
@@ -623,7 +623,7 @@ static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size
   if (allow_purge) {
     mi_segment_schedule_purge(segment, mi_slice_start(slice), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats);
   }
-  
+
   // and push it on the free page queue (if it was not a huge page)
   if (sq != NULL) mi_span_queue_push( sq, slice );
              else slice->xblock_size = 0; // mark huge page as free anyways
@@ -657,7 +657,7 @@ static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_
     // issue #691: segment->used can be 0 if the huge page block was freed while abandoned (reclaim will get here in that case)
     mi_assert_internal((segment->used==0 && slice->xblock_size==0) || segment->used == 1);  // decreased right after this call in `mi_segment_page_clear`
     slice->xblock_size = 0;  // mark as free anyways
-    // we should mark the last slice `xblock_size=0` now to maintain invariants but we skip it to 
+    // we should mark the last slice `xblock_size=0` now to maintain invariants but we skip it to
     // avoid a possible cache miss (and the segment is about to be freed)
     return slice;
   }
@@ -719,7 +719,7 @@ static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_i
   size_t extra = slice_count-1;
   if (extra > MI_MAX_SLICE_OFFSET) extra = MI_MAX_SLICE_OFFSET;
   if (slice_index + extra >= segment->slice_entries) extra = segment->slice_entries - slice_index - 1;  // huge objects may have more slices than avaiable entries in the segment->slices
-  
+
   mi_slice_t* slice_next = slice + 1;
   for (size_t i = 1; i <= extra; i++, slice_next++) {
     slice_next->slice_offset = (uint32_t)(sizeof(mi_slice_t)*i);
@@ -737,7 +737,7 @@ static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_i
     last->slice_count = 0;
     last->xblock_size = 1;
   }
-  
+
   // and initialize the page
   page->is_committed = true;
   segment->used++;
@@ -796,7 +796,7 @@ static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_aren
 ----------------------------------------------------------- */
 
 static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment, bool eager_delayed, mi_arena_id_t req_arena_id,
-                                          size_t* psegment_slices, size_t* ppre_size, size_t* pinfo_slices, 
+                                          size_t* psegment_slices, size_t* ppre_size, size_t* pinfo_slices,
                                           bool commit, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
 
 {
@@ -804,7 +804,7 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment
   bool   allow_large = (!eager_delayed && (MI_SECURE == 0)); // only allow large OS pages once we are no longer lazy
   size_t align_offset = 0;
   size_t alignment = MI_SEGMENT_ALIGN;
-  
+
   if (page_alignment > 0) {
     // mi_assert_internal(huge_page != NULL);
     mi_assert_internal(page_alignment >= MI_SEGMENT_ALIGN);
@@ -822,21 +822,21 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment
     return NULL;  // failed to allocate
   }
 
-  // ensure metadata part of the segment is committed  
-  mi_commit_mask_t commit_mask; 
-  if (memid.initially_committed) { 
-    mi_commit_mask_create_full(&commit_mask);  
+  // ensure metadata part of the segment is committed
+  mi_commit_mask_t commit_mask;
+  if (memid.initially_committed) {
+    mi_commit_mask_create_full(&commit_mask);
   }
-  else { 
+  else {
     // at least commit the info slices
     const size_t commit_needed = _mi_divide_up((*pinfo_slices)*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE);
     mi_assert_internal(commit_needed>0);
-    mi_commit_mask_create(0, commit_needed, &commit_mask);    
+    mi_commit_mask_create(0, commit_needed, &commit_mask);
     mi_assert_internal(commit_needed*MI_COMMIT_SIZE >= (*pinfo_slices)*MI_SEGMENT_SLICE_SIZE);
     if (!_mi_os_commit(segment, commit_needed*MI_COMMIT_SIZE, NULL, tld->stats)) {
       _mi_arena_free(segment,segment_size,0,memid,tld->stats);
       return NULL;
-    }    
+    }
   }
   mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0);
 
@@ -848,7 +848,7 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment
   segment->purge_expire = 0;
   mi_commit_mask_create_empty(&segment->purge_mask);
   mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);  // tsan
-  
+
   mi_segments_track_size((long)(segment_size), tld);
   _mi_segment_map_allocated_at(segment);
   return segment;
@@ -859,32 +859,32 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment
 static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld, mi_page_t** huge_page)
 {
   mi_assert_internal((required==0 && huge_page==NULL) || (required>0 && huge_page != NULL));
-  
+
   // calculate needed sizes first
   size_t info_slices;
   size_t pre_size;
   size_t segment_slices = mi_segment_calculate_slices(required, &pre_size, &info_slices);
-  
+
   // Commit eagerly only if not the first N lazy segments (to reduce impact of many threads that allocate just a little)
   const bool eager_delay = (// !_mi_os_has_overcommit() &&             // never delay on overcommit systems
                             _mi_current_thread_count() > 1 &&       // do not delay for the first N threads
                             tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay));
   const bool eager = !eager_delay && mi_option_is_enabled(mi_option_eager_commit);
-  bool commit = eager || (required > 0);   
-  
-  // Allocate the segment from the OS  
-  mi_segment_t* segment = mi_segment_os_alloc(required, page_alignment, eager_delay, req_arena_id, 
+  bool commit = eager || (required > 0);
+
+  // Allocate the segment from the OS
+  mi_segment_t* segment = mi_segment_os_alloc(required, page_alignment, eager_delay, req_arena_id,
                                               &segment_slices, &pre_size, &info_slices, commit, tld, os_tld);
   if (segment == NULL) return NULL;
-  
-  // zero the segment info? -- not always needed as it may be zero initialized from the OS   
+
+  // zero the segment info? -- not always needed as it may be zero initialized from the OS
   if (!segment->memid.initially_zero) {
     ptrdiff_t ofs    = offsetof(mi_segment_t, next);
     size_t    prefix = offsetof(mi_segment_t, slices) - ofs;
-    size_t    zsize  = prefix + (sizeof(mi_slice_t) * (segment_slices + 1)); // one more  
+    size_t    zsize  = prefix + (sizeof(mi_slice_t) * (segment_slices + 1)); // one more
     _mi_memzero((uint8_t*)segment + ofs, zsize);
   }
-  
+
   // initialize the rest of the segment info
   const size_t slice_entries = (segment_slices > MI_SLICES_PER_SEGMENT ? MI_SLICES_PER_SEGMENT : segment_slices);
   segment->segment_slices = segment_slices;
@@ -902,7 +902,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi
   if (MI_SECURE>0) {
     // in secure mode, we set up a protected page in between the segment info
     // and the page data, and at the end of the segment.
-    size_t os_pagesize = _mi_os_page_size();    
+    size_t os_pagesize = _mi_os_page_size();
     mi_assert_internal(mi_segment_info_size(segment) - os_pagesize >= pre_size);
     _mi_os_protect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize);
     uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize;
@@ -914,10 +914,10 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi
 
   // reserve first slices for segment info
   mi_page_t* page0 = mi_segment_span_allocate(segment, 0, info_slices, tld);
-  mi_assert_internal(page0!=NULL); if (page0==NULL) return NULL; // cannot fail as we always commit in advance  
+  mi_assert_internal(page0!=NULL); if (page0==NULL) return NULL; // cannot fail as we always commit in advance
   mi_assert_internal(segment->used == 1);
   segment->used = 0; // don't count our internal slices towards usage
-  
+
   // initialize initial free pages
   if (segment->kind == MI_SEGMENT_NORMAL) { // not a huge page
     mi_assert_internal(huge_page==NULL);
@@ -928,7 +928,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi
     mi_assert_internal(mi_commit_mask_is_empty(&segment->purge_mask));
     mi_assert_internal(mi_commit_mask_is_full(&segment->commit_mask));
     *huge_page = mi_segment_span_allocate(segment, info_slices, segment_slices - info_slices - guard_slices, tld);
-    mi_assert_internal(*huge_page != NULL); // cannot fail as we commit in advance 
+    mi_assert_internal(*huge_page != NULL); // cannot fail as we commit in advance
   }
 
   mi_assert_expensive(mi_segment_is_valid(segment,tld));
@@ -982,7 +982,7 @@ static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld
   mi_assert_internal(mi_page_all_free(page));
   mi_segment_t* segment = _mi_ptr_segment(page);
   mi_assert_internal(segment->used > 0);
-  
+
   size_t inuse = page->capacity * mi_page_block_size(page);
   _mi_stat_decrease(&tld->stats->page_committed, inuse);
   _mi_stat_decrease(&tld->stats->pages, 1);
@@ -990,7 +990,7 @@ static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld
   // reset the page memory to reduce memory pressure?
   if (segment->allow_decommit && mi_option_is_enabled(mi_option_deprecated_page_reset)) {
     size_t psize;
-    uint8_t* start = _mi_page_start(segment, page, &psize);    
+    uint8_t* start = _mi_page_start(segment, page, &psize);
     _mi_os_reset(start, psize, tld->stats);
   }
 
@@ -1001,7 +1001,7 @@ static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld
   page->xblock_size = 1;
 
   // and free it
-  mi_slice_t* slice = mi_segment_span_free_coalesce(mi_page_to_slice(page), tld);  
+  mi_slice_t* slice = mi_segment_span_free_coalesce(mi_page_to_slice(page), tld);
   segment->used--;
   // cannot assert segment valid as it is called during reclaim
   // mi_assert_expensive(mi_segment_is_valid(segment, tld));
@@ -1214,7 +1214,7 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
   mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
   mi_assert_internal(segment->abandoned_visits == 0);
   mi_assert_expensive(mi_segment_is_valid(segment,tld));
-  
+
   // remove the free pages from the free page queues
   mi_slice_t* slice = &segment->slices[0];
   const mi_slice_t* end = mi_segment_slices_end(segment);
@@ -1229,8 +1229,8 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
   }
 
   // perform delayed decommits (forcing is much slower on mstress)
-  mi_segment_try_purge(segment, mi_option_is_enabled(mi_option_abandoned_page_purge) /* force? */, tld->stats);    
-  
+  mi_segment_try_purge(segment, mi_option_is_enabled(mi_option_abandoned_page_purge) /* force? */, tld->stats);
+
   // all pages in the segment are abandoned; add it to the abandoned list
   _mi_stat_increase(&tld->stats->segments_abandoned, 1);
   mi_segments_track_size(-((long)mi_segment_size(segment)), tld);
@@ -1247,7 +1247,7 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) {
   mi_segment_t* segment = _mi_page_segment(page);
 
   mi_assert_expensive(mi_segment_is_valid(segment,tld));
-  segment->abandoned++;  
+  segment->abandoned++;
 
   _mi_stat_increase(&tld->stats->pages_abandoned, 1);
   mi_assert_internal(segment->abandoned <= segment->used);
@@ -1270,12 +1270,12 @@ static mi_slice_t* mi_slices_start_iterate(mi_segment_t* segment, const mi_slice
 }
 
 // Possibly free pages and check if free space is available
-static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, size_t block_size, mi_segments_tld_t* tld) 
+static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, size_t block_size, mi_segments_tld_t* tld)
 {
   mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
   mi_assert_internal(mi_segment_is_abandoned(segment));
   bool has_page = false;
-  
+
   // for all slices
   const mi_slice_t* end;
   mi_slice_t* slice = mi_slices_start_iterate(segment, &end);
@@ -1287,7 +1287,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, s
       mi_page_t* const page = mi_slice_to_page(slice);
       _mi_page_free_collect(page, false);
       if (mi_page_all_free(page)) {
-        // if this page is all free now, free it without adding to any queues (yet) 
+        // if this page is all free now, free it without adding to any queues (yet)
         mi_assert_internal(page->next == NULL && page->prev==NULL);
         _mi_stat_decrease(&tld->stats->pages_abandoned, 1);
         segment->abandoned--;
@@ -1302,7 +1302,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, s
           // a page has available free blocks of the right size
           has_page = true;
         }
-      }      
+      }
     }
     else {
       // empty span
@@ -1327,7 +1327,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap,
   mi_segments_track_size((long)mi_segment_size(segment), tld);
   mi_assert_internal(segment->next == NULL);
   _mi_stat_decrease(&tld->stats->segments_abandoned, 1);
-  
+
   // for all slices
   const mi_slice_t* end;
   mi_slice_t* slice = mi_slices_start_iterate(segment, &end);
@@ -1406,12 +1406,12 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice
       mi_segment_reclaim(segment, heap, 0, NULL, tld);
     }
     else if (has_page && is_suitable) {
-      // found a large enough free span, or a page of the right block_size with free space 
+      // found a large enough free span, or a page of the right block_size with free space
       // we return the result of reclaim (which is usually `segment`) as it might free
       // the segment due to concurrent frees (in which case `NULL` is returned).
       return mi_segment_reclaim(segment, heap, block_size, reclaimed, tld);
     }
-    else if (segment->abandoned_visits > 3 && is_suitable) {  
+    else if (segment->abandoned_visits > 3 && is_suitable) {
       // always reclaim on 3rd visit to limit the abandoned queue length.
       mi_segment_reclaim(segment, heap, 0, NULL, tld);
     }
@@ -1430,7 +1430,7 @@ void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld)
   mi_segment_t* segment;
   int max_tries = (force ? 16*1024 : 1024); // limit latency
   if (force) {
-    mi_abandoned_visited_revisit(); 
+    mi_abandoned_visited_revisit();
   }
   while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) {
     mi_segment_check_free(segment,0,0,tld); // try to free up pages (due to concurrent frees)
@@ -1441,7 +1441,7 @@ void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld)
       mi_segment_reclaim(segment, heap, 0, NULL, tld);
     }
     else {
-      // otherwise, purge if needed and push on the visited list 
+      // otherwise, purge if needed and push on the visited list
       // note: forced purge can be expensive if many threads are destroyed/created as in mstress.
       mi_segment_try_purge(segment, force, tld->stats);
       mi_abandoned_visited_push(segment);
@@ -1457,7 +1457,7 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_
 {
   mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
   mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX);
-  
+
   // 1. try to reclaim an abandoned segment
   bool reclaimed;
   mi_segment_t* segment = mi_segment_try_reclaim(heap, needed_slices, block_size, &reclaimed, tld);
@@ -1471,7 +1471,7 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_
     return segment;
   }
   // 2. otherwise allocate a fresh segment
-  return mi_segment_alloc(0, 0, heap->arena_id, tld, os_tld, NULL);  
+  return mi_segment_alloc(0, 0, heap->arena_id, tld, os_tld, NULL);
 }
 
 
@@ -1492,7 +1492,7 @@ static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_ki
     // no free page, allocate a new segment and try again
     if (mi_segment_reclaim_or_alloc(heap, slices_needed, block_size, tld, os_tld) == NULL) {
       // OOM or reclaimed a good page in the heap
-      return NULL;  
+      return NULL;
     }
     else {
       // otherwise try again
@@ -1517,27 +1517,27 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment,
   mi_segment_t* segment = mi_segment_alloc(size,page_alignment,req_arena_id,tld,os_tld,&page);
   if (segment == NULL || page==NULL) return NULL;
   mi_assert_internal(segment->used==1);
-  mi_assert_internal(mi_page_block_size(page) >= size);  
+  mi_assert_internal(mi_page_block_size(page) >= size);
   #if MI_HUGE_PAGE_ABANDON
   segment->thread_id = 0; // huge segments are immediately abandoned
-  #endif  
+  #endif
 
   // for huge pages we initialize the xblock_size as we may
   // overallocate to accommodate large alignments.
   size_t psize;
   uint8_t* start = _mi_segment_page_start(segment, page, &psize);
   page->xblock_size = (psize > MI_HUGE_BLOCK_SIZE ? MI_HUGE_BLOCK_SIZE : (uint32_t)psize);
-  
+
   // decommit the part of the prefix of a page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE)
   if (page_alignment > 0 && segment->allow_decommit) {
     uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment);
     mi_assert_internal(_mi_is_aligned(aligned_p, page_alignment));
-    mi_assert_internal(psize - (aligned_p - start) >= size);      
+    mi_assert_internal(psize - (aligned_p - start) >= size);
     uint8_t* decommit_start = start + sizeof(mi_block_t);              // for the free list
     ptrdiff_t decommit_size = aligned_p - decommit_start;
-    _mi_os_reset(decommit_start, decommit_size, &_mi_stats_main);   // note: cannot use segment_decommit on huge segments    
+    _mi_os_reset(decommit_start, decommit_size, &_mi_stats_main);   // note: cannot use segment_decommit on huge segments
   }
-  
+
   return page;
 }
 
@@ -1609,11 +1609,9 @@ mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t pag
     page = mi_segments_page_alloc(heap,MI_PAGE_LARGE,block_size,block_size,tld, os_tld);
   }
   else {
-    page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld,os_tld);    
+    page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld,os_tld);
   }
   mi_assert_internal(page == NULL || _mi_heap_memid_is_suitable(heap, _mi_page_segment(page)->memid));
   mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld));
   return page;
 }
-
-
diff --git a/Objects/mimalloc/stats.c b/Objects/mimalloc/stats.c
index 300956ce1302e9..e7ea397ec4c384 100644
--- a/Objects/mimalloc/stats.c
+++ b/Objects/mimalloc/stats.c
@@ -455,7 +455,7 @@ mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, s
   pinfo.page_faults    = 0;
 
   _mi_prim_process_info(&pinfo);
-  
+
   if (elapsed_msecs!=NULL)  *elapsed_msecs  = (pinfo.elapsed < 0 ? 0 : (pinfo.elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.elapsed : PTRDIFF_MAX));
   if (user_msecs!=NULL)     *user_msecs     = (pinfo.utime < 0 ? 0 : (pinfo.utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.utime : PTRDIFF_MAX));
   if (system_msecs!=NULL)   *system_msecs   = (pinfo.stime < 0 ? 0 : (pinfo.stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.stime : PTRDIFF_MAX));

From 359f90aab823110437c4855ab3a296c89f33ec5b Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Tue, 26 Sep 2023 15:13:10 -0700
Subject: [PATCH 04/24] Don't export symbol

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
 Objects/mimalloc/alloc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Objects/mimalloc/alloc.c b/Objects/mimalloc/alloc.c
index fb56f952994990..acc1d6c06dcf0e 100644
--- a/Objects/mimalloc/alloc.c
+++ b/Objects/mimalloc/alloc.c
@@ -16,6 +16,8 @@ terms of the MIT license. A copy of the license can be found in the file
 #include <string.h>      // memset, strlen (for mi_strdup)
 #include <stdlib.h>      // malloc, abort
 
+#define _ZSt15get_new_handlerv _Py__ZSt15get_new_handlerv
+
 #define MI_IN_ALLOC_C
 #include "alloc-override.c"
 #undef MI_IN_ALLOC_C

From 7117b06af207e56ca630c6ce3b87934a5afda8c2 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Tue, 26 Sep 2023 12:37:14 -0700
Subject: [PATCH 05/24] Blurb

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
 .../Core and Builtins/2023-09-06-12-36-11.bpo-46657.xea1T_.rst   | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-09-06-12-36-11.bpo-46657.xea1T_.rst

diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-09-06-12-36-11.bpo-46657.xea1T_.rst b/Misc/NEWS.d/next/Core and Builtins/2023-09-06-12-36-11.bpo-46657.xea1T_.rst
new file mode 100644
index 00000000000000..2da4fc18bed763
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-09-06-12-36-11.bpo-46657.xea1T_.rst	
@@ -0,0 +1 @@
+[WIP] Add mimalloc memory allocator support.

From 6bd9b0e63b4fe66561b7eec41f01a0f449821360 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Mon, 25 Sep 2023 16:51:26 -0700
Subject: [PATCH 06/24] mimalloc: enable as allocator option

---
 Include/cpython/pymem.h              |   4 +
 Include/internal/pycore_pymem_init.h |  18 ++-
 Lib/test/support/__init__.py         |   5 +
 Lib/test/test_cmd_line.py            |  15 +-
 Modules/_testcapi/mem.c              |   9 ++
 Objects/obmalloc.c                   | 215 ++++++++++++++++++++++++++-
 6 files changed, 256 insertions(+), 10 deletions(-)

diff --git a/Include/cpython/pymem.h b/Include/cpython/pymem.h
index b75f1c4d2425dd..9f19ffc3db1d03 100644
--- a/Include/cpython/pymem.h
+++ b/Include/cpython/pymem.h
@@ -29,6 +29,10 @@ typedef enum {
     PYMEM_ALLOCATOR_PYMALLOC = 5,
     PYMEM_ALLOCATOR_PYMALLOC_DEBUG = 6,
 #endif
+#ifdef WITH_MIMALLOC
+    PYMEM_ALLOCATOR_MIMALLOC = 7,
+    PYMEM_ALLOCATOR_MIMALLOC_DEBUG = 8,
+#endif
 } PyMemAllocatorName;
 
 
diff --git a/Include/internal/pycore_pymem_init.h b/Include/internal/pycore_pymem_init.h
index 119fa16fb911ae..a19ba38dcac991 100644
--- a/Include/internal/pycore_pymem_init.h
+++ b/Include/internal/pycore_pymem_init.h
@@ -18,17 +18,31 @@ extern void * _PyMem_RawRealloc(void *, void *, size_t);
 extern void _PyMem_RawFree(void *, void *);
 #define PYRAW_ALLOC {NULL, _PyMem_RawMalloc, _PyMem_RawCalloc, _PyMem_RawRealloc, _PyMem_RawFree}
 
-#ifdef WITH_PYMALLOC
+#ifdef WITH_MIMALLOC
+extern void *_PyMem_MiMalloc(void *ctx, size_t size);
+extern void * _PyMem_MiCalloc(void *ctx, size_t nelem, size_t elsize);
+extern void *_PyMem_MiRealloc(void *ctx, void *ptr, size_t size);
+extern void _PyMem_MiFree(void *ctx, void *ptr);
+extern void * _PyObject_MiMalloc(void *ctx, size_t nbytes);
+extern void* _PyObject_MiCalloc(void *ctx, size_t nelem, size_t elsize);
+extern void* _PyObject_MiRealloc(void *ctx, void *ptr, size_t nbytes);
+extern void _PyObject_MiFree(void *ctx, void *ptr);
+
+#  define PYMEM_ALLOC {NULL, _PyMem_MiMalloc, _PyMem_MiCalloc, _PyMem_MiRealloc, _PyMem_MiFree}
+#  define PYOBJ_ALLOC {NULL, _PyObject_MiMalloc, _PyObject_MiCalloc, _PyObject_MiRealloc, _PyObject_MiFree}
+
+#elif defined(WITH_PYMALLOC)
 extern void* _PyObject_Malloc(void *, size_t);
 extern void* _PyObject_Calloc(void *, size_t, size_t);
 extern void _PyObject_Free(void *, void *);
 extern void* _PyObject_Realloc(void *, void *, size_t);
 #  define PYOBJ_ALLOC {NULL, _PyObject_Malloc, _PyObject_Calloc, _PyObject_Realloc, _PyObject_Free}
+#  define PYMEM_ALLOC PYOBJ_ALLOC
 #else
 # define PYOBJ_ALLOC PYRAW_ALLOC
+# define PYMEM_ALLOC PYOBJ_ALLOC
 #endif  // WITH_PYMALLOC
 
-#define PYMEM_ALLOC PYOBJ_ALLOC
 
 extern void* _PyMem_DebugRawMalloc(void *, size_t);
 extern void* _PyMem_DebugRawCalloc(void *, size_t, size_t);
diff --git a/Lib/test/support/__init__.py b/Lib/test/support/__init__.py
index 21e8770ab31180..da0ec0402a7a3d 100644
--- a/Lib/test/support/__init__.py
+++ b/Lib/test/support/__init__.py
@@ -2036,6 +2036,11 @@ def with_pymalloc():
     return _testcapi.WITH_PYMALLOC
 
 
+def with_mimalloc():
+    import _testcapi
+    return _testcapi.WITH_MIMALLOC
+
+
 class _ALWAYS_EQ:
     """
     Object that is equal to anything.
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
index eaf19aa160e860..49f81a956a581f 100644
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -724,7 +724,9 @@ def test_xdev(self):
             code = "import _testinternalcapi; print(_testinternalcapi.pymem_getallocatorsname())"
             with support.SuppressCrashReport():
                 out = self.run_xdev("-c", code, check_exitcode=False)
-            if support.with_pymalloc():
+            if support.with_mimalloc():
+                alloc_name = "mimalloc_debug"
+            elif support.with_pymalloc():
                 alloc_name = "pymalloc_debug"
             else:
                 alloc_name = "malloc_debug"
@@ -803,7 +805,11 @@ def check_pythonmalloc(self, env_var, name):
     def test_pythonmalloc(self):
         # Test the PYTHONMALLOC environment variable
         pymalloc = support.with_pymalloc()
-        if pymalloc:
+        mimalloc = support.with_mimalloc()
+        if mimalloc:
+            default_name = 'mimalloc_debug' if support.Py_DEBUG else 'mimalloc'
+            default_name_debug = 'mimalloc_debug'
+        elif pymalloc:
             default_name = 'pymalloc_debug' if support.Py_DEBUG else 'pymalloc'
             default_name_debug = 'pymalloc_debug'
         else:
@@ -821,6 +827,11 @@ def test_pythonmalloc(self):
                 ('pymalloc', 'pymalloc'),
                 ('pymalloc_debug', 'pymalloc_debug'),
             ))
+        if mimalloc:
+            tests.extend((
+                ('mimalloc', 'mimalloc'),
+                ('mimalloc_debug', 'mimalloc_debug'),
+            ))
 
         for env_var, name in tests:
             with self.subTest(env_var=env_var, name=name):
diff --git a/Modules/_testcapi/mem.c b/Modules/_testcapi/mem.c
index ef9234d7f8955f..ab4ad934644c38 100644
--- a/Modules/_testcapi/mem.c
+++ b/Modules/_testcapi/mem.c
@@ -613,5 +613,14 @@ _PyTestCapi_Init_Mem(PyObject *mod)
         return -1;
     }
 
+#ifdef WITH_MIMALLOC
+    v = Py_True;
+#else
+    v = Py_False;
+#endif
+    if (PyModule_AddObjectRef(mod, "WITH_MIMALLOC", v) < 0) {
+        return -1;
+    }
+
     return 0;
 }
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 7d552ff57c8f1e..a5bcd8cd187ff5 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -10,6 +10,10 @@
 
 #include <stdlib.h>               // malloc()
 #include <stdbool.h>
+#ifdef WITH_MIMALLOC
+#include "mimalloc.h"
+#include "mimalloc/internal.h"    // for stats
+#endif
 
 #undef  uint
 #define uint pymem_uint
@@ -74,25 +78,94 @@ _PyMem_RawFree(void *Py_UNUSED(ctx), void *ptr)
     free(ptr);
 }
 
+#ifdef WITH_MIMALLOC
+
+void *
+_PyMem_MiMalloc(void *ctx, size_t size)
+{
+    return mi_malloc(size);
+}
+
+void *
+_PyMem_MiCalloc(void *ctx, size_t nelem, size_t elsize)
+{
+    return mi_calloc(nelem, elsize);
+}
+
+void *
+_PyMem_MiRealloc(void *ctx, void *ptr, size_t size)
+{
+    return mi_realloc(ptr, size);
+}
+
+void
+_PyMem_MiFree(void *ctx, void *ptr)
+{
+    mi_free(ptr);
+}
+
+void *
+_PyObject_MiMalloc(void *ctx, size_t nbytes)
+{
+    return mi_malloc(nbytes);
+}
+
+void *
+_PyObject_MiCalloc(void *ctx, size_t nelem, size_t elsize)
+{
+    return mi_calloc(nelem, elsize);
+}
+
+
+void *
+_PyObject_MiRealloc(void *ctx, void *ptr, size_t nbytes)
+{
+    return mi_realloc(ptr, nbytes);
+}
+
+void
+_PyObject_MiFree(void *ctx, void *ptr)
+{
+    mi_free(ptr);
+}
+
+#endif // WITH_MIMALLOC
+
+
 #define MALLOC_ALLOC {NULL, _PyMem_RawMalloc, _PyMem_RawCalloc, _PyMem_RawRealloc, _PyMem_RawFree}
-#define PYRAW_ALLOC MALLOC_ALLOC
 
-/* the default object allocator */
+
+#ifdef WITH_MIMALLOC
+#  define MIMALLOC_ALLOC {NULL, _PyMem_MiMalloc, _PyMem_MiCalloc, _PyMem_MiRealloc, _PyMem_MiFree}
+#  define MIMALLOC_OBJALLOC {NULL, _PyObject_MiMalloc, _PyObject_MiCalloc, _PyObject_MiRealloc, _PyObject_MiFree}
+#endif
+
+/* the pymalloc allocator */
 
 // The actual implementation is further down.
 
-#ifdef WITH_PYMALLOC
+#if defined(WITH_PYMALLOC)
 void* _PyObject_Malloc(void *ctx, size_t size);
 void* _PyObject_Calloc(void *ctx, size_t nelem, size_t elsize);
 void _PyObject_Free(void *ctx, void *p);
 void* _PyObject_Realloc(void *ctx, void *ptr, size_t size);
 #  define PYMALLOC_ALLOC {NULL, _PyObject_Malloc, _PyObject_Calloc, _PyObject_Realloc, _PyObject_Free}
+#endif  // WITH_PYMALLOC
+
+#ifdef WITH_MIMALLOC
+#  define PYRAW_ALLOC MALLOC_ALLOC
+#  define PYMEM_ALLOC MIMALLOC_ALLOC
+#  define PYOBJ_ALLOC MIMALLOC_OBJALLOC
+#elif defined(WITH_PYMALLOC)
+#  define PYRAW_ALLOC MALLOC_ALLOC
+#  define PYMEM_ALLOC PYMALLOC_ALLOC
 #  define PYOBJ_ALLOC PYMALLOC_ALLOC
 #else
+#  define PYRAW_ALLOC MALLOC_ALLOC
+#  define PYMEM_ALLOC MALLOC_ALLOC
 #  define PYOBJ_ALLOC MALLOC_ALLOC
-#endif  // WITH_PYMALLOC
+#endif
 
-#define PYMEM_ALLOC PYOBJ_ALLOC
 
 /* the default debug allocators */
 
@@ -291,6 +364,14 @@ _PyMem_GetAllocatorName(const char *name, PyMemAllocatorName *allocator)
     else if (strcmp(name, "pymalloc_debug") == 0) {
         *allocator = PYMEM_ALLOCATOR_PYMALLOC_DEBUG;
     }
+#endif
+#ifdef WITH_MIMALLOC
+    else if (strcmp(name, "mimalloc") == 0) {
+        *allocator = PYMEM_ALLOCATOR_MIMALLOC;
+    }
+    else if (strcmp(name, "mimalloc_debug") == 0) {
+        *allocator = PYMEM_ALLOCATOR_MIMALLOC_DEBUG;
+    }
 #endif
     else if (strcmp(name, "malloc") == 0) {
         *allocator = PYMEM_ALLOCATOR_MALLOC;
@@ -343,6 +424,26 @@ set_up_allocators_unlocked(PyMemAllocatorName allocator)
         break;
     }
 #endif
+#ifdef WITH_MIMALLOC
+    case PYMEM_ALLOCATOR_MIMALLOC:
+    case PYMEM_ALLOCATOR_MIMALLOC_DEBUG:
+    {
+        PyMemAllocatorEx malloc_alloc = MALLOC_ALLOC;
+        set_allocator_unlocked(PYMEM_DOMAIN_RAW, &malloc_alloc);
+
+        PyMemAllocatorEx pymalloc = MIMALLOC_ALLOC;
+        set_allocator_unlocked(PYMEM_DOMAIN_MEM, &pymalloc);
+
+        PyMemAllocatorEx objmalloc = MIMALLOC_OBJALLOC;
+        set_allocator_unlocked(PYMEM_DOMAIN_OBJ, &objmalloc);
+
+        if (allocator == PYMEM_ALLOCATOR_MIMALLOC_DEBUG) {
+            set_up_debug_hooks_unlocked();
+        }
+
+        break;
+    }
+#endif
 
     case PYMEM_ALLOCATOR_MALLOC:
     case PYMEM_ALLOCATOR_MALLOC_DEBUG:
@@ -390,6 +491,10 @@ get_current_allocator_name_unlocked(void)
 #ifdef WITH_PYMALLOC
     PyMemAllocatorEx pymalloc = PYMALLOC_ALLOC;
 #endif
+#ifdef WITH_MIMALLOC
+    PyMemAllocatorEx mimalloc = MIMALLOC_ALLOC;
+    PyMemAllocatorEx mimalloc_obj = MIMALLOC_OBJALLOC;
+#endif
 
     if (pymemallocator_eq(&_PyMem_Raw, &malloc_alloc) &&
         pymemallocator_eq(&_PyMem, &malloc_alloc) &&
@@ -405,6 +510,14 @@ get_current_allocator_name_unlocked(void)
         return "pymalloc";
     }
 #endif
+#ifdef WITH_MIMALLOC
+    if (pymemallocator_eq(&_PyMem_Raw, &malloc_alloc) &&
+        pymemallocator_eq(&_PyMem, &mimalloc) &&
+        pymemallocator_eq(&_PyObject, &mimalloc_obj))
+    {
+        return "mimalloc";
+    }
+#endif
 
     PyMemAllocatorEx dbg_raw = PYDBGRAW_ALLOC;
     PyMemAllocatorEx dbg_mem = PYDBGMEM_ALLOC;
@@ -428,6 +541,14 @@ get_current_allocator_name_unlocked(void)
         {
             return "pymalloc_debug";
         }
+#endif
+#ifdef WITH_MIMALLOC
+        if (pymemallocator_eq(&_PyMem_Debug.raw.alloc, &malloc_alloc) &&
+            pymemallocator_eq(&_PyMem_Debug.mem.alloc, &mimalloc) &&
+            pymemallocator_eq(&_PyMem_Debug.obj.alloc, &mimalloc_obj))
+        {
+            return "mimalloc_debug";
+        }
 #endif
     }
     return NULL;
@@ -443,13 +564,14 @@ _PyMem_GetCurrentAllocatorName(void)
 }
 
 
-#ifdef WITH_PYMALLOC
+#if defined(WITH_PYMALLOC) || defined(WITH_MIMALLOC)
 static int
 _PyMem_DebugEnabled(void)
 {
     return (_PyObject.malloc == _PyMem_DebugMalloc);
 }
 
+#ifdef WITH_PYMALLOC
 static int
 _PyMem_PymallocEnabled(void)
 {
@@ -461,6 +583,19 @@ _PyMem_PymallocEnabled(void)
     }
 }
 #endif
+#ifdef WITH_MIMALLOC
+static int
+_PyMem_MimallocEnabled(void)
+{
+    if (_PyMem_DebugEnabled()) {
+        return (_PyMem_Debug.obj.alloc.malloc == _PyObject_MiMalloc);
+    }
+    else {
+        return (_PyObject.malloc == _PyObject_MiMalloc);
+    }
+}
+#endif
+#endif // defined(WITH_PYMALLOC) || defined(WITH_MIMALLOC)
 
 
 static void
@@ -883,9 +1018,31 @@ get_state(void)
 #define narenas_highwater (state->mgmt.narenas_highwater)
 #define raw_allocated_blocks (state->mgmt.raw_allocated_blocks)
 
+#ifdef WITH_MIMALLOC
+static bool count_blocks(
+    const mi_heap_t* heap, const mi_heap_area_t* area,
+    void* block, size_t block_size, void* allocated_blocks)
+{
+    *(size_t *)allocated_blocks += area->used;
+    return 1;
+}
+#endif
+
 Py_ssize_t
 _PyInterpreterState_GetAllocatedBlocks(PyInterpreterState *interp)
 {
+#ifdef WITH_MIMALLOC
+    // TODO(sgross): this only counts the current thread's blocks.
+    if (_PyMem_MimallocEnabled()) {
+        size_t allocated_blocks = 0;
+
+        mi_heap_t *heap = mi_heap_get_default();
+        mi_heap_visit_blocks(heap, false, &count_blocks, &allocated_blocks);
+
+        return allocated_blocks;
+    }
+#endif
+
 #ifdef Py_DEBUG
     assert(has_own_state(interp));
 #else
@@ -2536,6 +2693,29 @@ pool_is_in_list(const poolp target, poolp list)
 }
 #endif
 
+#ifdef WITH_MIMALLOC
+struct _alloc_stats {
+    size_t allocated_blocks;
+    size_t allocated_bytes;
+    size_t allocated_with_overhead;
+    size_t bytes_reserved;
+    size_t bytes_committed;
+};
+
+static bool _collect_alloc_stats(
+    const mi_heap_t* heap, const mi_heap_area_t* area,
+    void* block, size_t block_size, void* arg)
+{
+    struct _alloc_stats *stats = (struct _alloc_stats *)arg;
+    stats->allocated_blocks += area->used;
+    stats->allocated_bytes += area->used * area->block_size;
+    stats->allocated_with_overhead += area->used * area->full_block_size;
+    stats->bytes_reserved += area->reserved;
+    stats->bytes_committed += area->committed;
+    return 1;
+}
+#endif
+
 /* Print summary info to "out" about the state of pymalloc's structures.
  * In Py_DEBUG mode, also perform some expensive internal consistency
  * checks.
@@ -2546,6 +2726,29 @@ pool_is_in_list(const poolp target, poolp list)
 int
 _PyObject_DebugMallocStats(FILE *out)
 {
+#ifdef WITH_MIMALLOC
+    if (_PyMem_MimallocEnabled()) {
+        fprintf(out, "Small block threshold = %ld, in %u size classes.\n",
+            MI_SMALL_OBJ_SIZE_MAX, MI_BIN_HUGE);
+        fprintf(out, "Medium block threshold = %ld\n",
+                MI_MEDIUM_OBJ_SIZE_MAX);
+        fprintf(out, "Large object max size = %ld\n",
+                MI_LARGE_OBJ_SIZE_MAX);
+
+        mi_heap_t *heap = mi_heap_get_default();
+        struct _alloc_stats stats = {};
+        mi_heap_visit_blocks(heap, false, &_collect_alloc_stats, &stats);
+
+        fprintf(out, "    Allocated Blocks: %ld\n", stats.allocated_blocks);
+        fprintf(out, "    Allocated Bytes: %ld\n", stats.allocated_bytes);
+        fprintf(out, "    Allocated Bytes w/ Overhead: %ld\n", stats.allocated_with_overhead);
+        fprintf(out, "    Bytes Reserved: %ld\n", stats.bytes_reserved);
+        fprintf(out, "    Bytes Committed: %ld\n", stats.bytes_committed);
+
+        return 1;
+    }
+#endif
+
     if (!_PyMem_PymallocEnabled()) {
         return 0;
     }

From 2165c5f57ead8204fc3ec4f4e61d4a9eefa508d6 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Tue, 26 Sep 2023 18:39:53 -0700
Subject: [PATCH 07/24] Add includes to analyzer

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
 Tools/c-analyzer/cpython/_files.py  | 1 +
 Tools/c-analyzer/cpython/_parser.py | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/Tools/c-analyzer/cpython/_files.py b/Tools/c-analyzer/cpython/_files.py
index ee9e46f7e5e95f..4be8f19bdac207 100644
--- a/Tools/c-analyzer/cpython/_files.py
+++ b/Tools/c-analyzer/cpython/_files.py
@@ -9,6 +9,7 @@
     # Technically, this is covered by "Include/*.h":
     #'Include/cpython/*.h',
     'Include/internal/*.h',
+    'Include/mimalloc/**/*.h',
     'Modules/**/*.h',
     'Modules/**/*.c',
     'Objects/**/*.h',
diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py
index 4523b2ed5b9fdf..c8a790e5defe2e 100644
--- a/Tools/c-analyzer/cpython/_parser.py
+++ b/Tools/c-analyzer/cpython/_parser.py
@@ -89,6 +89,11 @@ def clean_lines(text):
 # not actually source
 Python/bytecodes.c
 
+# mimalloc
+Objects/mimalloc/*.c
+Include/mimalloc/*.h
+Include/mimalloc/mimalloc/*.h
+
 # @end=conf@
 ''')
 
@@ -109,6 +114,7 @@ def clean_lines(text):
 *	.
 *	./Include
 *	./Include/internal
+*   ./Include/mimalloc
 
 Modules/_decimal/**/*.c	Modules/_decimal/libmpdec
 Modules/_elementtree.c	Modules/expat

From b6bddbf0e29454b99296f79013c800985cd79985 Mon Sep 17 00:00:00 2001
From: Sam Gross <colesbury@gmail.com>
Date: Tue, 11 Feb 2020 14:58:03 -0500
Subject: [PATCH 08/24] mimalloc: minimal changes for use in Python

 - remove debug spam for freeing large allocations
 - use same bytes (0xDD) for freed allocations in CPython and mimalloc
   This is important for the test_capi debug memory tests
---
 Include/mimalloc/mimalloc/types.h | 2 +-
 Objects/mimalloc/alloc.c          | 2 +-
 Objects/mimalloc/os.c             | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Include/mimalloc/mimalloc/types.h b/Include/mimalloc/mimalloc/types.h
index 7616f37e4b978f..e43755ee729369 100644
--- a/Include/mimalloc/mimalloc/types.h
+++ b/Include/mimalloc/mimalloc/types.h
@@ -524,7 +524,7 @@ struct mi_heap_s {
 #define MI_DEBUG_UNINIT     (0xD0)
 #endif
 #if !defined(MI_DEBUG_FREED)
-#define MI_DEBUG_FREED      (0xDF)
+#define MI_DEBUG_FREED      (0xDD)
 #endif
 #if !defined(MI_DEBUG_PADDING)
 #define MI_DEBUG_PADDING    (0xDE)
diff --git a/Objects/mimalloc/alloc.c b/Objects/mimalloc/alloc.c
index acc1d6c06dcf0e..f96c6f0b37f873 100644
--- a/Objects/mimalloc/alloc.c
+++ b/Objects/mimalloc/alloc.c
@@ -533,7 +533,7 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms
   mi_segment_t* const segment = _mi_ptr_segment(p);
   mi_assert_internal(segment != NULL);
 
-#if (MI_DEBUG>0)
+#if 0 && (MI_DEBUG>0)
   if mi_unlikely(!mi_is_in_heap_region(p)) {
   #if (MI_INTPTR_SIZE == 8 && defined(__linux__))
     if (((uintptr_t)p >> 40) != 0x7F) { // linux tends to align large blocks above 0x7F000000000 (issue #640)
diff --git a/Objects/mimalloc/os.c b/Objects/mimalloc/os.c
index 239790f77c574b..f3bc7184c41c5b 100644
--- a/Objects/mimalloc/os.c
+++ b/Objects/mimalloc/os.c
@@ -240,7 +240,8 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
   }
   else {
     // if not aligned, free it, overallocate, and unmap around it
-    _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit);
+    // NOTE(sgross): this warning causes issues in Python tests
+    // _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit);
     mi_os_prim_free(p, size, commit, stats);
     if (size >= (SIZE_MAX - alignment)) return NULL; // overflow
     const size_t over_size = size + alignment;

From 7f7f4fefab6b7276f6e6b6501044913fc10fa411 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Fri, 29 Sep 2023 15:38:59 -0700
Subject: [PATCH 09/24] Add mimalloc MIT license

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
 Doc/license.rst | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/Doc/license.rst b/Doc/license.rst
index 82039fd9aaf9d0..9b3a9751ae402a 100644
--- a/Doc/license.rst
+++ b/Doc/license.rst
@@ -1040,3 +1040,30 @@ https://www.w3.org/TR/xml-c14n2-testcases/ and is distributed under the
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+mimalloc
+--------
+
+MIT License
+
+Copyright (c) 2018-2021 Microsoft Corporation, Daan Leijen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+

From 73680108888d6a81d8d669f27441444e69b9fead Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Mon, 2 Oct 2023 09:50:29 -0700
Subject: [PATCH 10/24] Log mimalloc in Lib/test/pythoninfo.py Document new
 mimalloc support Cleanup some formatting

---
 Doc/c-api/init_config.rst                      | 10 ++++++++++
 Doc/c-api/memory.rst                           | 18 ++++++++++++++++--
 Doc/using/cmdline.rst                          |  4 ++++
 Doc/using/configure.rst                        |  7 +++++++
 Include/internal/pycore_pymem_init.h           |  8 ++++----
 Lib/test/pythoninfo.py                         |  3 ++-
 .../2023-09-06-12-36-11.bpo-46657.xea1T_.rst   |  2 +-
 7 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/Doc/c-api/init_config.rst b/Doc/c-api/init_config.rst
index 0240e25b6f1607..457bd674966446 100644
--- a/Doc/c-api/init_config.rst
+++ b/Doc/c-api/init_config.rst
@@ -253,11 +253,21 @@ PyPreConfig
       * ``PYMEM_ALLOCATOR_PYMALLOC_DEBUG`` (``6``): :ref:`Python pymalloc
         memory allocator <pymalloc>` with :ref:`debug hooks
         <pymem-debug-hooks>`.
+      * ``PYMEM_ALLOCATOR_MIMALLOC`` (``6``): use ``mimalloc``, a fast
+        malloc replacement.
+      * ``PYMEM_ALLOCATOR_MIMALLOC_DEBUG`` (``7``): use ``mimalloc``, a fast
+        malloc replacement with :ref:`debug hooks <pymem-debug-hooks>`.
+
 
       ``PYMEM_ALLOCATOR_PYMALLOC`` and ``PYMEM_ALLOCATOR_PYMALLOC_DEBUG`` are
       not supported if Python is :option:`configured using --without-pymalloc
       <--without-pymalloc>`.
 
+      ``PYMEM_ALLOCATOR_MIMALLOC`` and ``PYMEM_ALLOCATOR_MIMALLOC_DEBUG`` are
+      not supported if Python is :option:`configured using --without-mimalloc
+      <--without-mimalloc>` or if the underlying atomic support isn't 
+      available.
+
       See :ref:`Memory Management <memory>`.
 
       Default: ``PYMEM_ALLOCATOR_NOT_SET``.
diff --git a/Doc/c-api/memory.rst b/Doc/c-api/memory.rst
index e98c178ac27a37..6777943eb6f2a1 100644
--- a/Doc/c-api/memory.rst
+++ b/Doc/c-api/memory.rst
@@ -379,8 +379,8 @@ Default memory allocators:
 ===============================  ====================  ==================  =====================  ====================
 Configuration                    Name                  PyMem_RawMalloc     PyMem_Malloc           PyObject_Malloc
 ===============================  ====================  ==================  =====================  ====================
-Release build                    ``"pymalloc"``        ``malloc``          ``pymalloc``           ``pymalloc``
-Debug build                      ``"pymalloc_debug"``  ``malloc`` + debug  ``pymalloc`` + debug   ``pymalloc`` + debug
+Release build                    ``"mimalloc"``        ``malloc``          ``mimalloc``           ``mimalloc``
+Debug build                      ``"mimalloc_debug"``  ``malloc`` + debug  ``mimalloc`` + debug   ``mimalloc`` + debug
 Release build, without pymalloc  ``"malloc"``          ``malloc``          ``malloc``             ``malloc``
 Debug build, without pymalloc    ``"malloc_debug"``    ``malloc`` + debug  ``malloc`` + debug     ``malloc`` + debug
 ===============================  ====================  ==================  =====================  ====================
@@ -391,6 +391,8 @@ Legend:
 * ``malloc``: system allocators from the standard C library, C functions:
   :c:func:`malloc`, :c:func:`calloc`, :c:func:`realloc` and :c:func:`free`.
 * ``pymalloc``: :ref:`pymalloc memory allocator <pymalloc>`.
+* ``mimalloc``: :ref:`mimalloc memory allocator <mimalloc>`.  The pymalloc
+  allocator will be used if mimalloc support isn't available.
 * "+ debug": with :ref:`debug hooks on the Python memory allocators
   <pymem-debug-hooks>`.
 * "Debug build": :ref:`Python build in debug mode <debug-build>`.
@@ -671,6 +673,18 @@ Customize pymalloc Arena Allocator
 
    Set the arena allocator.
 
+.. _mimalloc:
+
+The mimalloc allocator
+======================
+
+.. versionadded:: 3.13
+
+Python supports the mimalloc allocator when the underlying platform support is available.
+mimalloc "is a general purpose allocator with excellent performance characteristics. 
+Initially developed by Daan Leijen for the runtime systems of the Koka and Lean languages."
+
+When mimalloc support is available it is the default allocator.
 
 tracemalloc C API
 =================
diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst
index 2767b0cb15451c..9adff35ab5377b 100644
--- a/Doc/using/cmdline.rst
+++ b/Doc/using/cmdline.rst
@@ -911,6 +911,9 @@ conflict.
    * ``pymalloc``: use the :ref:`pymalloc allocator <pymalloc>` for
      :c:macro:`PYMEM_DOMAIN_MEM` and :c:macro:`PYMEM_DOMAIN_OBJ` domains and use
      the :c:func:`malloc` function for the :c:macro:`PYMEM_DOMAIN_RAW` domain.
+   * ``mimalloc``: use the :ref:`mimalloc allocator <mimalloc>` for
+     :c:macro:`PYMEM_DOMAIN_MEM` and :c:macro:`PYMEM_DOMAIN_OBJ` domains and use
+     the :c:func:`malloc` function for the :c:macro:`PYMEM_DOMAIN_RAW` domain.
 
    Install :ref:`debug hooks <pymem-debug-hooks>`:
 
@@ -918,6 +921,7 @@ conflict.
      allocators <default-memory-allocators>`.
    * ``malloc_debug``: same as ``malloc`` but also install debug hooks.
    * ``pymalloc_debug``: same as ``pymalloc`` but also install debug hooks.
+   * ``mimalloc_debug``: same as ``mimalloc`` but also install debug hooks.
 
    .. versionchanged:: 3.7
       Added the ``"default"`` allocator.
diff --git a/Doc/using/configure.rst b/Doc/using/configure.rst
index 5f9e695d10ad44..69f691dbe1202c 100644
--- a/Doc/using/configure.rst
+++ b/Doc/using/configure.rst
@@ -579,6 +579,13 @@ also be used to improve performance.
    Enable computed gotos in evaluation loop (enabled by default on supported
    compilers).
 
+.. option:: --without-mimalloc
+
+   Disable the fast mimalloc allocator :ref:`mimalloc <mimalloc>`
+   (enabled by default).
+
+   See also :envvar:`PYTHONMALLOC` environment variable.
+
 .. option:: --without-pymalloc
 
    Disable the specialized Python memory allocator :ref:`pymalloc <pymalloc>`
diff --git a/Include/internal/pycore_pymem_init.h b/Include/internal/pycore_pymem_init.h
index a19ba38dcac991..8776fb1d38099c 100644
--- a/Include/internal/pycore_pymem_init.h
+++ b/Include/internal/pycore_pymem_init.h
@@ -19,11 +19,11 @@ extern void _PyMem_RawFree(void *, void *);
 #define PYRAW_ALLOC {NULL, _PyMem_RawMalloc, _PyMem_RawCalloc, _PyMem_RawRealloc, _PyMem_RawFree}
 
 #ifdef WITH_MIMALLOC
-extern void *_PyMem_MiMalloc(void *ctx, size_t size);
-extern void * _PyMem_MiCalloc(void *ctx, size_t nelem, size_t elsize);
-extern void *_PyMem_MiRealloc(void *ctx, void *ptr, size_t size);
+extern void* _PyMem_MiMalloc(void *ctx, size_t size);
+extern void* _PyMem_MiCalloc(void *ctx, size_t nelem, size_t elsize);
+extern void* _PyMem_MiRealloc(void *ctx, void *ptr, size_t size);
 extern void _PyMem_MiFree(void *ctx, void *ptr);
-extern void * _PyObject_MiMalloc(void *ctx, size_t nbytes);
+extern void* _PyObject_MiMalloc(void *ctx, size_t nbytes);
 extern void* _PyObject_MiCalloc(void *ctx, size_t nelem, size_t elsize);
 extern void* _PyObject_MiRealloc(void *ctx, void *ptr, size_t nbytes);
 extern void _PyObject_MiFree(void *ctx, void *ptr);
diff --git a/Lib/test/pythoninfo.py b/Lib/test/pythoninfo.py
index 4f3ebb12ed957d..b0493366bc27f5 100644
--- a/Lib/test/pythoninfo.py
+++ b/Lib/test/pythoninfo.py
@@ -543,7 +543,8 @@ def collect_sysconfig(info_add):
     for name in (
         'WITH_DOC_STRINGS',
         'WITH_DTRACE',
-        'WITH_FREELISTS',
+        'WITH_FREELISTS'
+        'WITH_MIMALLOC',
         'WITH_PYMALLOC',
         'WITH_VALGRIND',
     ):
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-09-06-12-36-11.bpo-46657.xea1T_.rst b/Misc/NEWS.d/next/Core and Builtins/2023-09-06-12-36-11.bpo-46657.xea1T_.rst
index 2da4fc18bed763..193b02a3b911f5 100644
--- a/Misc/NEWS.d/next/Core and Builtins/2023-09-06-12-36-11.bpo-46657.xea1T_.rst	
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-09-06-12-36-11.bpo-46657.xea1T_.rst	
@@ -1 +1 @@
-[WIP] Add mimalloc memory allocator support.
+Add mimalloc memory allocator support.

From 8975bd182feeee62ee8fa68c4999269676ea7fe6 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Mon, 2 Oct 2023 13:04:01 -0700
Subject: [PATCH 11/24] Use macro defs for exports as done in
 https://github.com/python/cpython/pull/31164/

---
 Include/internal/pycore_mimalloc.h | 301 +++++++++++++++++++++++++++++
 Include/mimalloc/mimalloc/types.h  |   2 +-
 Makefile.pre.in                    |  19 +-
 Objects/mimalloc/static.c          |  40 ++++
 Objects/obmalloc.c                 |   3 +-
 Tools/build/smelly.py              |   2 +-
 configure                          |   4 -
 configure.ac                       |   2 -
 8 files changed, 346 insertions(+), 27 deletions(-)
 create mode 100644 Include/internal/pycore_mimalloc.h
 create mode 100644 Objects/mimalloc/static.c

diff --git a/Include/internal/pycore_mimalloc.h b/Include/internal/pycore_mimalloc.h
new file mode 100644
index 00000000000000..7ba6ee0cf731fe
--- /dev/null
+++ b/Include/internal/pycore_mimalloc.h
@@ -0,0 +1,301 @@
+#ifndef Py_INTERNAL_MIMALLOC_H
+#define Py_INTERNAL_MIMALLOC_H
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#if defined(MIMALLOC_H) || defined(MIMALLOC_TYPES_H)
+#  error "pycore_mimalloc.h must be included before mimalloc.h"
+#endif
+
+#include "pycore_pymem.h"
+#define MI_DEBUG_UNINIT     PYMEM_CLEANBYTE
+#define MI_DEBUG_FREED      PYMEM_DEADBYTE
+#define MI_DEBUG_PADDING    PYMEM_FORBIDDENBYTE
+
+
+/* Prefix all non-static symbols with "_Py_"
+ * nm Objects/obmalloc.o | grep -E "[CRT] _?mi" | awk '{print "#define " $3 " PyUnstable" $3}' | sort
+ */
+
+#define _mi_abandoned_await_readers PyUnstable__mi_abandoned_await_readers
+#define _mi_abandoned_collect PyUnstable__mi_abandoned_collect
+#define _mi_abandoned_reclaim_all PyUnstable__mi_abandoned_reclaim_all
+#define mi_aligned_alloc PyUnstable_mi_aligned_alloc
+#define mi_aligned_offset_recalloc PyUnstable_mi_aligned_offset_recalloc
+#define mi_aligned_recalloc PyUnstable_mi_aligned_recalloc
+#define _mi_arena_alloc_aligned PyUnstable__mi_arena_alloc_aligned
+#define _mi_arena_alloc PyUnstable__mi_arena_alloc
+#define mi_arena_area PyUnstable_mi_arena_area
+#define _mi_arena_collect PyUnstable__mi_arena_collect
+#define _mi_arena_contains PyUnstable__mi_arena_contains
+#define _mi_arena_free PyUnstable__mi_arena_free
+#define _mi_arena_id_none PyUnstable__mi_arena_id_none
+#define _mi_arena_memid_is_os_allocated PyUnstable__mi_arena_memid_is_os_allocated
+#define _mi_arena_memid_is_suitable PyUnstable__mi_arena_memid_is_suitable
+#define _mi_arena_unsafe_destroy_all PyUnstable__mi_arena_unsafe_destroy_all
+#define _mi_assert_fail PyUnstable__mi_assert_fail
+#define _mi_bin PyUnstable__mi_bin
+#define _mi_bin_size PyUnstable__mi_bin_size
+#define _mi_bitmap_claim_across PyUnstable__mi_bitmap_claim_across
+#define _mi_bitmap_claim PyUnstable__mi_bitmap_claim
+#define _mi_bitmap_is_any_claimed_across PyUnstable__mi_bitmap_is_any_claimed_across
+#define _mi_bitmap_is_any_claimed PyUnstable__mi_bitmap_is_any_claimed
+#define _mi_bitmap_is_claimed_across PyUnstable__mi_bitmap_is_claimed_across
+#define _mi_bitmap_is_claimed PyUnstable__mi_bitmap_is_claimed
+#define _mi_bitmap_try_claim PyUnstable__mi_bitmap_try_claim
+#define _mi_bitmap_try_find_claim_field PyUnstable__mi_bitmap_try_find_claim_field
+#define _mi_bitmap_try_find_from_claim_across PyUnstable__mi_bitmap_try_find_from_claim_across
+#define _mi_bitmap_try_find_from_claim_pred PyUnstable__mi_bitmap_try_find_from_claim_pred
+#define _mi_bitmap_try_find_from_claim PyUnstable__mi_bitmap_try_find_from_claim
+#define _mi_bitmap_unclaim_across PyUnstable__mi_bitmap_unclaim_across
+#define _mi_bitmap_unclaim PyUnstable__mi_bitmap_unclaim
+#define mi_calloc_aligned_at PyUnstable_mi_calloc_aligned_at
+#define mi_calloc_aligned PyUnstable_mi_calloc_aligned
+#define mi_calloc PyUnstable_mi_calloc
+#define mi_cfree PyUnstable_mi_cfree
+#define mi_check_owned PyUnstable_mi_check_owned
+#define _mi_clock_end PyUnstable__mi_clock_end
+#define _mi_clock_now PyUnstable__mi_clock_now
+#define _mi_clock_start PyUnstable__mi_clock_start
+#define mi_collect PyUnstable_mi_collect
+#define _mi_commit_mask_committed_size PyUnstable__mi_commit_mask_committed_size
+#define _mi_commit_mask_next_run PyUnstable__mi_commit_mask_next_run
+#define _mi_current_thread_count PyUnstable__mi_current_thread_count
+#define mi_debug_show_arenas PyUnstable_mi_debug_show_arenas
+#define _mi_deferred_free PyUnstable__mi_deferred_free
+#define mi_dupenv_s PyUnstable_mi_dupenv_s
+#define _mi_error_message PyUnstable__mi_error_message
+#define mi__expand PyUnstable_mi__expand
+#define mi_expand PyUnstable_mi_expand
+#define _mi_fprintf PyUnstable__mi_fprintf
+#define _mi_fputs PyUnstable__mi_fputs
+#define mi_free_aligned PyUnstable_mi_free_aligned
+#define _mi_free_delayed_block PyUnstable__mi_free_delayed_block
+#define _mi_free_generic PyUnstable__mi_free_generic
+#define mi_free PyUnstable_mi_free
+#define mi_free_size_aligned PyUnstable_mi_free_size_aligned
+#define mi_free_size PyUnstable_mi_free_size
+#define mi_good_size PyUnstable_mi_good_size
+#define mi_heap_alloc_new_n PyUnstable_mi_heap_alloc_new_n
+#define mi_heap_alloc_new PyUnstable_mi_heap_alloc_new
+#define mi_heap_calloc_aligned_at PyUnstable_mi_heap_calloc_aligned_at
+#define mi_heap_calloc_aligned PyUnstable_mi_heap_calloc_aligned
+#define mi_heap_calloc PyUnstable_mi_heap_calloc
+#define mi_heap_check_owned PyUnstable_mi_heap_check_owned
+#define _mi_heap_collect_abandon PyUnstable__mi_heap_collect_abandon
+#define mi_heap_collect PyUnstable_mi_heap_collect
+#define _mi_heap_collect_retired PyUnstable__mi_heap_collect_retired
+#define mi_heap_contains_block PyUnstable_mi_heap_contains_block
+#define _mi_heap_delayed_free_all PyUnstable__mi_heap_delayed_free_all
+#define _mi_heap_delayed_free_partial PyUnstable__mi_heap_delayed_free_partial
+#define mi_heap_delete PyUnstable_mi_heap_delete
+#define _mi_heap_destroy_pages PyUnstable__mi_heap_destroy_pages
+#define mi_heap_destroy PyUnstable_mi_heap_destroy
+#define _mi_heap_empty PyUnstable__mi_heap_empty
+#define mi_heap_get_backing PyUnstable_mi_heap_get_backing
+#define mi_heap_get_default PyUnstable_mi_heap_get_default
+#define _mi_heap_main_get PyUnstable__mi_heap_main_get
+#define mi_heap_malloc_aligned_at PyUnstable_mi_heap_malloc_aligned_at
+#define mi_heap_malloc_aligned PyUnstable_mi_heap_malloc_aligned
+#define mi_heap_mallocn PyUnstable_mi_heap_mallocn
+#define mi_heap_malloc PyUnstable_mi_heap_malloc
+#define mi_heap_malloc_small PyUnstable_mi_heap_malloc_small
+#define _mi_heap_malloc_zero_ex PyUnstable__mi_heap_malloc_zero_ex
+#define _mi_heap_malloc_zero PyUnstable__mi_heap_malloc_zero
+#define _mi_heap_memid_is_suitable PyUnstable__mi_heap_memid_is_suitable
+#define mi_heap_new_in_arena PyUnstable_mi_heap_new_in_arena
+#define mi_heap_new PyUnstable_mi_heap_new
+#define _mi_heap_random_next PyUnstable__mi_heap_random_next
+#define mi_heap_realloc_aligned_at PyUnstable_mi_heap_realloc_aligned_at
+#define mi_heap_realloc_aligned PyUnstable_mi_heap_realloc_aligned
+#define mi_heap_reallocf PyUnstable_mi_heap_reallocf
+#define mi_heap_reallocn PyUnstable_mi_heap_reallocn
+#define mi_heap_realloc PyUnstable_mi_heap_realloc
+#define _mi_heap_realloc_zero PyUnstable__mi_heap_realloc_zero
+#define mi_heap_realpath PyUnstable_mi_heap_realpath
+#define mi_heap_recalloc_aligned_at PyUnstable_mi_heap_recalloc_aligned_at
+#define mi_heap_recalloc_aligned PyUnstable_mi_heap_recalloc_aligned
+#define mi_heap_recalloc PyUnstable_mi_heap_recalloc
+#define mi_heap_rezalloc_aligned_at PyUnstable_mi_heap_rezalloc_aligned_at
+#define mi_heap_rezalloc_aligned PyUnstable_mi_heap_rezalloc_aligned
+#define mi_heap_rezalloc PyUnstable_mi_heap_rezalloc
+#define _mi_heap_set_default_direct PyUnstable__mi_heap_set_default_direct
+#define mi_heap_set_default PyUnstable_mi_heap_set_default
+#define mi_heap_strdup PyUnstable_mi_heap_strdup
+#define mi_heap_strndup PyUnstable_mi_heap_strndup
+#define mi_heap_try_new PyUnstable_mi_heap_try_new
+#define _mi_heap_unsafe_destroy_all PyUnstable__mi_heap_unsafe_destroy_all
+#define mi_heap_visit_blocks PyUnstable_mi_heap_visit_blocks
+#define mi_heap_zalloc_aligned_at PyUnstable_mi_heap_zalloc_aligned_at
+#define mi_heap_zalloc_aligned PyUnstable_mi_heap_zalloc_aligned
+#define mi_heap_zalloc PyUnstable_mi_heap_zalloc
+#define mi_is_in_heap_region PyUnstable_mi_is_in_heap_region
+#define _mi_is_main_thread PyUnstable__mi_is_main_thread
+#define mi_is_redirected PyUnstable_mi_is_redirected
+#define mi_malloc_aligned_at PyUnstable_mi_malloc_aligned_at
+#define mi_malloc_aligned PyUnstable_mi_malloc_aligned
+#define _mi_malloc_generic PyUnstable__mi_malloc_generic
+#define mi_malloc_good_size PyUnstable_mi_malloc_good_size
+#define mi_mallocn PyUnstable_mi_mallocn
+#define mi_malloc PyUnstable_mi_malloc
+#define mi_malloc_size PyUnstable_mi_malloc_size
+#define mi_malloc_small PyUnstable_mi_malloc_small
+#define mi_malloc_usable_size PyUnstable_mi_malloc_usable_size
+#define mi_manage_os_memory_ex PyUnstable_mi_manage_os_memory_ex
+#define mi_manage_os_memory PyUnstable_mi_manage_os_memory
+#define mi_mbsdup PyUnstable_mi_mbsdup
+#define mi_memalign PyUnstable_mi_memalign
+#define mi_new_aligned_nothrow PyUnstable_mi_new_aligned_nothrow
+#define mi_new_aligned PyUnstable_mi_new_aligned
+#define mi_new_nothrow PyUnstable_mi_new_nothrow
+#define mi_new_n PyUnstable_mi_new_n
+#define mi_new PyUnstable_mi_new
+#define mi_new_reallocn PyUnstable_mi_new_reallocn
+#define mi_new_realloc PyUnstable_mi_new_realloc
+#define mi_option_disable PyUnstable_mi_option_disable
+#define mi_option_enable PyUnstable_mi_option_enable
+#define mi_option_get_clamp PyUnstable_mi_option_get_clamp
+#define mi_option_get PyUnstable_mi_option_get
+#define mi_option_get_size PyUnstable_mi_option_get_size
+#define mi_option_is_enabled PyUnstable_mi_option_is_enabled
+#define mi_option_set_default PyUnstable_mi_option_set_default
+#define mi_option_set_enabled_default PyUnstable_mi_option_set_enabled_default
+#define mi_option_set_enabled PyUnstable_mi_option_set_enabled
+#define mi_option_set PyUnstable_mi_option_set
+#define _mi_options_init PyUnstable__mi_options_init
+#define _mi_os_alloc_aligned_at_offset PyUnstable__mi_os_alloc_aligned_at_offset
+#define _mi_os_alloc_aligned PyUnstable__mi_os_alloc_aligned
+#define _mi_os_alloc_huge_os_pages PyUnstable__mi_os_alloc_huge_os_pages
+#define _mi_os_alloc PyUnstable__mi_os_alloc
+#define _mi_os_commit PyUnstable__mi_os_commit
+#define _mi_os_decommit PyUnstable__mi_os_decommit
+#define _mi_os_free_ex PyUnstable__mi_os_free_ex
+#define _mi_os_free PyUnstable__mi_os_free
+#define _mi_os_get_aligned_hint PyUnstable__mi_os_get_aligned_hint
+#define _mi_os_good_alloc_size PyUnstable__mi_os_good_alloc_size
+#define _mi_os_has_overcommit PyUnstable__mi_os_has_overcommit
+#define _mi_os_has_virtual_reserve PyUnstable__mi_os_has_virtual_reserve
+#define _mi_os_init PyUnstable__mi_os_init
+#define _mi_os_large_page_size PyUnstable__mi_os_large_page_size
+#define _mi_os_numa_node_count_get PyUnstable__mi_os_numa_node_count_get
+#define _mi_os_numa_node_get PyUnstable__mi_os_numa_node_get
+#define _mi_os_page_size PyUnstable__mi_os_page_size
+#define _mi_os_protect PyUnstable__mi_os_protect
+#define _mi_os_purge_ex PyUnstable__mi_os_purge_ex
+#define _mi_os_purge PyUnstable__mi_os_purge
+#define _mi_os_random_weak PyUnstable__mi_os_random_weak
+#define _mi_os_reset PyUnstable__mi_os_reset
+#define _mi_os_unprotect PyUnstable__mi_os_unprotect
+#define _mi_os_use_large_page PyUnstable__mi_os_use_large_page
+#define _mi_padding_shrink PyUnstable__mi_padding_shrink
+#define _mi_page_abandon PyUnstable__mi_page_abandon
+#define _mi_page_empty PyUnstable__mi_page_empty
+#define _mi_page_free_collect PyUnstable__mi_page_free_collect
+#define _mi_page_free PyUnstable__mi_page_free
+#define _mi_page_malloc PyUnstable__mi_page_malloc
+#define _mi_page_ptr_unalign PyUnstable__mi_page_ptr_unalign
+#define _mi_page_queue_append PyUnstable__mi_page_queue_append
+#define _mi_page_reclaim PyUnstable__mi_page_reclaim
+#define _mi_page_retire PyUnstable__mi_page_retire
+#define _mi_page_try_use_delayed_free PyUnstable__mi_page_try_use_delayed_free
+#define _mi_page_unfull PyUnstable__mi_page_unfull
+#define _mi_page_use_delayed_free PyUnstable__mi_page_use_delayed_free
+#define mi_posix_memalign PyUnstable_mi_posix_memalign
+#define _mi_preloading PyUnstable__mi_preloading
+#define _mi_prim_alloc_huge_os_pages PyUnstable__mi_prim_alloc_huge_os_pages
+#define _mi_prim_alloc PyUnstable__mi_prim_alloc
+#define _mi_prim_clock_now PyUnstable__mi_prim_clock_now
+#define _mi_prim_commit PyUnstable__mi_prim_commit
+#define _mi_prim_decommit PyUnstable__mi_prim_decommit
+#define _mi_prim_free PyUnstable__mi_prim_free
+#define _mi_prim_getenv PyUnstable__mi_prim_getenv
+#define _mi_prim_mem_init PyUnstable__mi_prim_mem_init
+#define _mi_prim_numa_node_count PyUnstable__mi_prim_numa_node_count
+#define _mi_prim_numa_node PyUnstable__mi_prim_numa_node
+#define _mi_prim_out_stderr PyUnstable__mi_prim_out_stderr
+#define _mi_prim_process_info PyUnstable__mi_prim_process_info
+#define _mi_prim_protect PyUnstable__mi_prim_protect
+#define _mi_prim_random_buf PyUnstable__mi_prim_random_buf
+#define _mi_prim_reset PyUnstable__mi_prim_reset
+#define _mi_prim_thread_associate_default_heap PyUnstable__mi_prim_thread_associate_default_heap
+#define _mi_prim_thread_done_auto_done PyUnstable__mi_prim_thread_done_auto_done
+#define _mi_prim_thread_init_auto_done PyUnstable__mi_prim_thread_init_auto_done
+#define mi_process_info PyUnstable_mi_process_info
+#define mi_process_init PyUnstable_mi_process_init
+#define mi_pvalloc PyUnstable_mi_pvalloc
+#define _mi_random_init PyUnstable__mi_random_init
+#define _mi_random_init_weak PyUnstable__mi_random_init_weak
+#define _mi_random_next PyUnstable__mi_random_next
+#define _mi_random_reinit_if_weak PyUnstable__mi_random_reinit_if_weak
+#define _mi_random_split PyUnstable__mi_random_split
+#define mi_realloc_aligned_at PyUnstable_mi_realloc_aligned_at
+#define mi_realloc_aligned PyUnstable_mi_realloc_aligned
+#define mi_reallocarray PyUnstable_mi_reallocarray
+#define mi_reallocarr PyUnstable_mi_reallocarr
+#define mi_reallocf PyUnstable_mi_reallocf
+#define mi_reallocn PyUnstable_mi_reallocn
+#define mi_realloc PyUnstable_mi_realloc
+#define mi_realpath PyUnstable_mi_realpath
+#define mi_recalloc_aligned_at PyUnstable_mi_recalloc_aligned_at
+#define mi_recalloc_aligned PyUnstable_mi_recalloc_aligned
+#define mi_recalloc PyUnstable_mi_recalloc
+#define mi_register_deferred_free PyUnstable_mi_register_deferred_free
+#define mi_register_error PyUnstable_mi_register_error
+#define mi_register_output PyUnstable_mi_register_output
+#define mi_reserve_huge_os_pages_at_ex PyUnstable_mi_reserve_huge_os_pages_at_ex
+#define mi_reserve_huge_os_pages_at PyUnstable_mi_reserve_huge_os_pages_at
+#define mi_reserve_huge_os_pages_interleave PyUnstable_mi_reserve_huge_os_pages_interleave
+#define mi_reserve_huge_os_pages PyUnstable_mi_reserve_huge_os_pages
+#define mi_reserve_os_memory_ex PyUnstable_mi_reserve_os_memory_ex
+#define mi_reserve_os_memory PyUnstable_mi_reserve_os_memory
+#define mi_rezalloc_aligned_at PyUnstable_mi_rezalloc_aligned_at
+#define mi_rezalloc_aligned PyUnstable_mi_rezalloc_aligned
+#define mi_rezalloc PyUnstable_mi_rezalloc
+#define _mi_segment_huge_page_reset PyUnstable__mi_segment_huge_page_reset
+#define _mi_segment_map_allocated_at PyUnstable__mi_segment_map_allocated_at
+#define _mi_segment_map_freed_at PyUnstable__mi_segment_map_freed_at
+#define _mi_segment_page_abandon PyUnstable__mi_segment_page_abandon
+#define _mi_segment_page_alloc PyUnstable__mi_segment_page_alloc
+#define _mi_segment_page_free PyUnstable__mi_segment_page_free
+#define _mi_segment_page_start PyUnstable__mi_segment_page_start
+#define _mi_segment_thread_collect PyUnstable__mi_segment_thread_collect
+#define _mi_stat_counter_increase PyUnstable__mi_stat_counter_increase
+#define _mi_stat_decrease PyUnstable__mi_stat_decrease
+#define _mi_stat_increase PyUnstable__mi_stat_increase
+#define _mi_stats_done PyUnstable__mi_stats_done
+#define mi_stats_merge PyUnstable_mi_stats_merge
+#define mi_stats_print_out PyUnstable_mi_stats_print_out
+#define mi_stats_print PyUnstable_mi_stats_print
+#define mi_stats_reset PyUnstable_mi_stats_reset
+#define mi_strdup PyUnstable_mi_strdup
+#define _mi_strlcat PyUnstable__mi_strlcat
+#define _mi_strlcpy PyUnstable__mi_strlcpy
+#define _mi_strlen PyUnstable__mi_strlen
+#define mi_strndup PyUnstable_mi_strndup
+#define _mi_strnicmp PyUnstable__mi_strnicmp
+#define _mi_strnlen PyUnstable__mi_strnlen
+#define _mi_thread_data_collect PyUnstable__mi_thread_data_collect
+#define _mi_thread_done PyUnstable__mi_thread_done
+#define mi_thread_done PyUnstable_mi_thread_done
+#define _mi_thread_id PyUnstable__mi_thread_id
+#define mi_thread_init PyUnstable_mi_thread_init
+#define mi_thread_stats_print_out PyUnstable_mi_thread_stats_print_out
+#define _mi_toupper PyUnstable__mi_toupper
+#define _mi_trace_message PyUnstable__mi_trace_message
+#define mi_usable_size PyUnstable_mi_usable_size
+#define mi_valloc PyUnstable_mi_valloc
+#define _mi_verbose_message PyUnstable__mi_verbose_message
+#define mi_version PyUnstable_mi_version
+#define _mi_warning_message PyUnstable__mi_warning_message
+#define mi_wcsdup PyUnstable_mi_wcsdup
+#define mi_wdupenv_s PyUnstable_mi_wdupenv_s
+#define mi_zalloc_aligned_at PyUnstable_mi_zalloc_aligned_at
+#define mi_zalloc_aligned PyUnstable_mi_zalloc_aligned
+#define mi_zalloc PyUnstable_mi_zalloc
+#define mi_zalloc_small PyUnstable_mi_zalloc_small
+
+#include "mimalloc.h"
+
+#endif // Py_INTERNAL_MIMALLOC_H
diff --git a/Include/mimalloc/mimalloc/types.h b/Include/mimalloc/mimalloc/types.h
index e43755ee729369..7616f37e4b978f 100644
--- a/Include/mimalloc/mimalloc/types.h
+++ b/Include/mimalloc/mimalloc/types.h
@@ -524,7 +524,7 @@ struct mi_heap_s {
 #define MI_DEBUG_UNINIT     (0xD0)
 #endif
 #if !defined(MI_DEBUG_FREED)
-#define MI_DEBUG_FREED      (0xDD)
+#define MI_DEBUG_FREED      (0xDF)
 #endif
 #if !defined(MI_DEBUG_PADDING)
 #define MI_DEBUG_PADDING    (0xDE)
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 6a059121d8c1bf..c7d735d699cce6 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -341,6 +341,7 @@ IO_OBJS=	\
 # mimalloc
 
 MIMALLOC_HEADERS= \
+                $(srcdir)/Include/internal/pycore_mimalloc.h \
 		$(srcdir)/Include/mimalloc/mimalloc.h \
 		$(srcdir)/Include/mimalloc/mimalloc/atomic.h \
 		$(srcdir)/Include/mimalloc/mimalloc/internal.h \
@@ -348,23 +349,6 @@ MIMALLOC_HEADERS= \
 		$(srcdir)/Include/mimalloc/mimalloc/track.h \
 		$(srcdir)/Include/mimalloc/mimalloc/types.h
 
-MIMALLOC_OBJS= \
-		Objects/mimalloc/alloc-aligned.o \
-		Objects/mimalloc/alloc-posix.o \
-		Objects/mimalloc/alloc.o \
-		Objects/mimalloc/arena.o \
-		Objects/mimalloc/bitmap.o \
-		Objects/mimalloc/heap.o \
-		Objects/mimalloc/init.o \
-		Objects/mimalloc/options.o \
-		Objects/mimalloc/os.o \
-		Objects/mimalloc/page.o \
-		Objects/mimalloc/random.o \
-		Objects/mimalloc/segment.o \
-		Objects/mimalloc/segment-map.o \
-		Objects/mimalloc/stats.o \
-		Objects/mimalloc/prim/prim.o
-
 
 ##########################################################################
 # Parser
@@ -536,7 +520,6 @@ OBJECT_OBJS=	\
 		Objects/unicodectype.o \
 		Objects/unionobject.o \
 		Objects/weakrefobject.o \
-		@MIMALLOC_OBJS@ \
 		@PERF_TRAMPOLINE_OBJ@
 
 ##########################################################################
diff --git a/Objects/mimalloc/static.c b/Objects/mimalloc/static.c
new file mode 100644
index 00000000000000..bc05dd72f5b54d
--- /dev/null
+++ b/Objects/mimalloc/static.c
@@ -0,0 +1,40 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#ifndef _DEFAULT_SOURCE
+#define _DEFAULT_SOURCE
+#endif
+#if defined(__sun)
+// same remarks as os.c for the static's context.
+#undef _XOPEN_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+
+// For a static override we create a single object file
+// containing the whole library. If it is linked first
+// it will override all the standard library allocation
+// functions (on Unix's).
+#include "alloc.c"          // includes alloc-override.c
+#include "alloc-aligned.c"
+#include "alloc-posix.c"
+#include "arena.c"
+#include "bitmap.c"
+#include "heap.c"
+#include "init.c"
+#include "options.c"
+#include "os.c"
+#include "page.c"           // includes page-queue.c
+#include "random.c" 
+#include "segment.c"
+#include "segment-map.c"
+#include "stats.c"
+#include "prim/prim.c"
+#if MI_OSX_ZONE
+#include "prim/osx/alloc-override-zone.c"
+#endif
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index a5bcd8cd187ff5..4303e3c5874fb1 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -11,7 +11,8 @@
 #include <stdlib.h>               // malloc()
 #include <stdbool.h>
 #ifdef WITH_MIMALLOC
-#include "mimalloc.h"
+#include "pycore_mimalloc.h"
+#include "mimalloc/static.c"
 #include "mimalloc/internal.h"    // for stats
 #endif
 
diff --git a/Tools/build/smelly.py b/Tools/build/smelly.py
index 654d7e26571885..ab345307ff9b64 100755
--- a/Tools/build/smelly.py
+++ b/Tools/build/smelly.py
@@ -7,7 +7,7 @@
 import sysconfig
 
 
-ALLOWED_PREFIXES = ('Py', '_Py', 'mi_', '_mi_')
+ALLOWED_PREFIXES = ('Py', '_Py')
 if sys.platform == 'darwin':
     ALLOWED_PREFIXES += ('__Py',)
 
diff --git a/configure b/configure
index 490aa2aa33f40f..97646e06b838b4 100755
--- a/configure
+++ b/configure
@@ -861,7 +861,6 @@ DTRACE_HEADERS
 DFLAGS
 DTRACE
 WITH_MIMALLOC
-MIMALLOC_OBJS
 MIMALLOC_HEADERS
 GDBM_LIBS
 GDBM_CFLAGS
@@ -16887,8 +16886,6 @@ printf "%s\n" "#define WITH_MIMALLOC 1" >>confdefs.h
 
   MIMALLOC_HEADERS='$(MIMALLOC_HEADERS)'
 
-  MIMALLOC_OBJS='$(MIMALLOC_OBJS)'
-
 fi
 
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $with_mimalloc" >&5
@@ -16896,7 +16893,6 @@ printf "%s\n" "$with_mimalloc" >&6; }
 
 
 
-
 # Check for Python-specific malloc support
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for --with-pymalloc" >&5
 printf %s "checking for --with-pymalloc... " >&6; }
diff --git a/configure.ac b/configure.ac
index 63f8e9f247bfad..f7d5c6bd3982de 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4556,13 +4556,11 @@ if test "$with_mimalloc" != no; then
   with_mimalloc=yes
   AC_DEFINE([WITH_MIMALLOC], [1], [Define if you want to compile in mimalloc memory allocator.])
   AC_SUBST([MIMALLOC_HEADERS], ['$(MIMALLOC_HEADERS)'])
-  AC_SUBST([MIMALLOC_OBJS], ['$(MIMALLOC_OBJS)'])
 fi
 
 AC_MSG_RESULT([$with_mimalloc])
 AC_SUBST([WITH_MIMALLOC])
 AC_SUBST([MIMALLOC_HEADERS])
-AC_SUBST([MIMALLOC_OBJS])
 
 # Check for Python-specific malloc support
 AC_MSG_CHECKING([for --with-pymalloc])

From 8a7189275d4fa4ca0b4d79040b743dc9c8dbe813 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Mon, 2 Oct 2023 16:16:00 -0700
Subject: [PATCH 12/24] Fix formatting

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
 Doc/c-api/init_config.rst | 2 +-
 Doc/c-api/memory.rst      | 2 +-
 Objects/mimalloc/static.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Doc/c-api/init_config.rst b/Doc/c-api/init_config.rst
index 457bd674966446..f7b21b0e5db9be 100644
--- a/Doc/c-api/init_config.rst
+++ b/Doc/c-api/init_config.rst
@@ -265,7 +265,7 @@ PyPreConfig
 
       ``PYMEM_ALLOCATOR_MIMALLOC`` and ``PYMEM_ALLOCATOR_MIMALLOC_DEBUG`` are
       not supported if Python is :option:`configured using --without-mimalloc
-      <--without-mimalloc>` or if the underlying atomic support isn't 
+      <--without-mimalloc>` or if the underlying atomic support isn't
       available.
 
       See :ref:`Memory Management <memory>`.
diff --git a/Doc/c-api/memory.rst b/Doc/c-api/memory.rst
index 6777943eb6f2a1..1ea6d23f5e0c98 100644
--- a/Doc/c-api/memory.rst
+++ b/Doc/c-api/memory.rst
@@ -681,7 +681,7 @@ The mimalloc allocator
 .. versionadded:: 3.13
 
 Python supports the mimalloc allocator when the underlying platform support is available.
-mimalloc "is a general purpose allocator with excellent performance characteristics. 
+mimalloc "is a general purpose allocator with excellent performance characteristics.
 Initially developed by Daan Leijen for the runtime systems of the Koka and Lean languages."
 
 When mimalloc support is available it is the default allocator.
diff --git a/Objects/mimalloc/static.c b/Objects/mimalloc/static.c
index bc05dd72f5b54d..98c4aa18e450cd 100644
--- a/Objects/mimalloc/static.c
+++ b/Objects/mimalloc/static.c
@@ -30,7 +30,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "options.c"
 #include "os.c"
 #include "page.c"           // includes page-queue.c
-#include "random.c" 
+#include "random.c"
 #include "segment.c"
 #include "segment-map.c"
 #include "stats.c"

From d725922252f92fca4272aebdcd0e56c7a5b87a10 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Fri, 6 Oct 2023 11:40:23 -0700
Subject: [PATCH 13/24] Fix ./python -m test test_sys -R 3:3

mimalloc is thread safe and shares a single heap across all runtimes,
therefore finalization and getting global allocated blocks across
all runtimes is different.
---
 Objects/obmalloc.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 4303e3c5874fb1..e483ecb6c0d82b 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -1077,6 +1077,11 @@ _PyInterpreterState_GetAllocatedBlocks(PyInterpreterState *interp)
 void
 _PyInterpreterState_FinalizeAllocatedBlocks(PyInterpreterState *interp)
 {
+#ifdef WITH_MIAMLLOC
+    if (_PyMem_MimallocEnabled()) {
+        return;
+    }
+#endif
     if (has_own_state(interp)) {
         Py_ssize_t leaked = _PyInterpreterState_GetAllocatedBlocks(interp);
         assert(has_own_state(interp) || leaked == 0);
@@ -1102,6 +1107,16 @@ _Py_FinalizeAllocatedBlocks(_PyRuntimeState *runtime)
 static Py_ssize_t
 get_num_global_allocated_blocks(_PyRuntimeState *runtime)
 {
+#ifdef WITH_MIMALLOC
+    if (_PyMem_MimallocEnabled()) {
+        size_t allocated_blocks = 0;
+
+        mi_heap_t *heap = mi_heap_get_default();
+        mi_heap_visit_blocks(heap, false, &count_blocks, &allocated_blocks);
+
+        return allocated_blocks;
+    }
+#endif
     Py_ssize_t total = 0;
     if (_PyRuntimeState_GetFinalizing(runtime) != NULL) {
         PyInterpreterState *interp = _PyInterpreterState_Main();

From 2cb4aaf1d069dd07d527827a95d845f3825e852b Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Tue, 10 Oct 2023 15:28:13 -0700
Subject: [PATCH 14/24] Move mimalloc headers to internal, remove re-defs,
 update smelly to allow mi malloc symbols in static lib, but not shared lib

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
 Include/{ => internal}/mimalloc/mimalloc.h    |   0
 .../{ => internal}/mimalloc/mimalloc/atomic.h |   0
 .../mimalloc/mimalloc/internal.h              |   0
 .../{ => internal}/mimalloc/mimalloc/prim.h   |   0
 .../{ => internal}/mimalloc/mimalloc/track.h  |   0
 .../{ => internal}/mimalloc/mimalloc/types.h  |   0
 Include/internal/pycore_mimalloc.h            | 282 ------------------
 Makefile.pre.in                               |  30 +-
 Tools/build/smelly.py                         |  13 +-
 9 files changed, 33 insertions(+), 292 deletions(-)
 rename Include/{ => internal}/mimalloc/mimalloc.h (100%)
 rename Include/{ => internal}/mimalloc/mimalloc/atomic.h (100%)
 rename Include/{ => internal}/mimalloc/mimalloc/internal.h (100%)
 rename Include/{ => internal}/mimalloc/mimalloc/prim.h (100%)
 rename Include/{ => internal}/mimalloc/mimalloc/track.h (100%)
 rename Include/{ => internal}/mimalloc/mimalloc/types.h (100%)

diff --git a/Include/mimalloc/mimalloc.h b/Include/internal/mimalloc/mimalloc.h
similarity index 100%
rename from Include/mimalloc/mimalloc.h
rename to Include/internal/mimalloc/mimalloc.h
diff --git a/Include/mimalloc/mimalloc/atomic.h b/Include/internal/mimalloc/mimalloc/atomic.h
similarity index 100%
rename from Include/mimalloc/mimalloc/atomic.h
rename to Include/internal/mimalloc/mimalloc/atomic.h
diff --git a/Include/mimalloc/mimalloc/internal.h b/Include/internal/mimalloc/mimalloc/internal.h
similarity index 100%
rename from Include/mimalloc/mimalloc/internal.h
rename to Include/internal/mimalloc/mimalloc/internal.h
diff --git a/Include/mimalloc/mimalloc/prim.h b/Include/internal/mimalloc/mimalloc/prim.h
similarity index 100%
rename from Include/mimalloc/mimalloc/prim.h
rename to Include/internal/mimalloc/mimalloc/prim.h
diff --git a/Include/mimalloc/mimalloc/track.h b/Include/internal/mimalloc/mimalloc/track.h
similarity index 100%
rename from Include/mimalloc/mimalloc/track.h
rename to Include/internal/mimalloc/mimalloc/track.h
diff --git a/Include/mimalloc/mimalloc/types.h b/Include/internal/mimalloc/mimalloc/types.h
similarity index 100%
rename from Include/mimalloc/mimalloc/types.h
rename to Include/internal/mimalloc/mimalloc/types.h
diff --git a/Include/internal/pycore_mimalloc.h b/Include/internal/pycore_mimalloc.h
index 7ba6ee0cf731fe..c29dc82a42762a 100644
--- a/Include/internal/pycore_mimalloc.h
+++ b/Include/internal/pycore_mimalloc.h
@@ -14,288 +14,6 @@
 #define MI_DEBUG_FREED      PYMEM_DEADBYTE
 #define MI_DEBUG_PADDING    PYMEM_FORBIDDENBYTE
 
-
-/* Prefix all non-static symbols with "_Py_"
- * nm Objects/obmalloc.o | grep -E "[CRT] _?mi" | awk '{print "#define " $3 " PyUnstable" $3}' | sort
- */
-
-#define _mi_abandoned_await_readers PyUnstable__mi_abandoned_await_readers
-#define _mi_abandoned_collect PyUnstable__mi_abandoned_collect
-#define _mi_abandoned_reclaim_all PyUnstable__mi_abandoned_reclaim_all
-#define mi_aligned_alloc PyUnstable_mi_aligned_alloc
-#define mi_aligned_offset_recalloc PyUnstable_mi_aligned_offset_recalloc
-#define mi_aligned_recalloc PyUnstable_mi_aligned_recalloc
-#define _mi_arena_alloc_aligned PyUnstable__mi_arena_alloc_aligned
-#define _mi_arena_alloc PyUnstable__mi_arena_alloc
-#define mi_arena_area PyUnstable_mi_arena_area
-#define _mi_arena_collect PyUnstable__mi_arena_collect
-#define _mi_arena_contains PyUnstable__mi_arena_contains
-#define _mi_arena_free PyUnstable__mi_arena_free
-#define _mi_arena_id_none PyUnstable__mi_arena_id_none
-#define _mi_arena_memid_is_os_allocated PyUnstable__mi_arena_memid_is_os_allocated
-#define _mi_arena_memid_is_suitable PyUnstable__mi_arena_memid_is_suitable
-#define _mi_arena_unsafe_destroy_all PyUnstable__mi_arena_unsafe_destroy_all
-#define _mi_assert_fail PyUnstable__mi_assert_fail
-#define _mi_bin PyUnstable__mi_bin
-#define _mi_bin_size PyUnstable__mi_bin_size
-#define _mi_bitmap_claim_across PyUnstable__mi_bitmap_claim_across
-#define _mi_bitmap_claim PyUnstable__mi_bitmap_claim
-#define _mi_bitmap_is_any_claimed_across PyUnstable__mi_bitmap_is_any_claimed_across
-#define _mi_bitmap_is_any_claimed PyUnstable__mi_bitmap_is_any_claimed
-#define _mi_bitmap_is_claimed_across PyUnstable__mi_bitmap_is_claimed_across
-#define _mi_bitmap_is_claimed PyUnstable__mi_bitmap_is_claimed
-#define _mi_bitmap_try_claim PyUnstable__mi_bitmap_try_claim
-#define _mi_bitmap_try_find_claim_field PyUnstable__mi_bitmap_try_find_claim_field
-#define _mi_bitmap_try_find_from_claim_across PyUnstable__mi_bitmap_try_find_from_claim_across
-#define _mi_bitmap_try_find_from_claim_pred PyUnstable__mi_bitmap_try_find_from_claim_pred
-#define _mi_bitmap_try_find_from_claim PyUnstable__mi_bitmap_try_find_from_claim
-#define _mi_bitmap_unclaim_across PyUnstable__mi_bitmap_unclaim_across
-#define _mi_bitmap_unclaim PyUnstable__mi_bitmap_unclaim
-#define mi_calloc_aligned_at PyUnstable_mi_calloc_aligned_at
-#define mi_calloc_aligned PyUnstable_mi_calloc_aligned
-#define mi_calloc PyUnstable_mi_calloc
-#define mi_cfree PyUnstable_mi_cfree
-#define mi_check_owned PyUnstable_mi_check_owned
-#define _mi_clock_end PyUnstable__mi_clock_end
-#define _mi_clock_now PyUnstable__mi_clock_now
-#define _mi_clock_start PyUnstable__mi_clock_start
-#define mi_collect PyUnstable_mi_collect
-#define _mi_commit_mask_committed_size PyUnstable__mi_commit_mask_committed_size
-#define _mi_commit_mask_next_run PyUnstable__mi_commit_mask_next_run
-#define _mi_current_thread_count PyUnstable__mi_current_thread_count
-#define mi_debug_show_arenas PyUnstable_mi_debug_show_arenas
-#define _mi_deferred_free PyUnstable__mi_deferred_free
-#define mi_dupenv_s PyUnstable_mi_dupenv_s
-#define _mi_error_message PyUnstable__mi_error_message
-#define mi__expand PyUnstable_mi__expand
-#define mi_expand PyUnstable_mi_expand
-#define _mi_fprintf PyUnstable__mi_fprintf
-#define _mi_fputs PyUnstable__mi_fputs
-#define mi_free_aligned PyUnstable_mi_free_aligned
-#define _mi_free_delayed_block PyUnstable__mi_free_delayed_block
-#define _mi_free_generic PyUnstable__mi_free_generic
-#define mi_free PyUnstable_mi_free
-#define mi_free_size_aligned PyUnstable_mi_free_size_aligned
-#define mi_free_size PyUnstable_mi_free_size
-#define mi_good_size PyUnstable_mi_good_size
-#define mi_heap_alloc_new_n PyUnstable_mi_heap_alloc_new_n
-#define mi_heap_alloc_new PyUnstable_mi_heap_alloc_new
-#define mi_heap_calloc_aligned_at PyUnstable_mi_heap_calloc_aligned_at
-#define mi_heap_calloc_aligned PyUnstable_mi_heap_calloc_aligned
-#define mi_heap_calloc PyUnstable_mi_heap_calloc
-#define mi_heap_check_owned PyUnstable_mi_heap_check_owned
-#define _mi_heap_collect_abandon PyUnstable__mi_heap_collect_abandon
-#define mi_heap_collect PyUnstable_mi_heap_collect
-#define _mi_heap_collect_retired PyUnstable__mi_heap_collect_retired
-#define mi_heap_contains_block PyUnstable_mi_heap_contains_block
-#define _mi_heap_delayed_free_all PyUnstable__mi_heap_delayed_free_all
-#define _mi_heap_delayed_free_partial PyUnstable__mi_heap_delayed_free_partial
-#define mi_heap_delete PyUnstable_mi_heap_delete
-#define _mi_heap_destroy_pages PyUnstable__mi_heap_destroy_pages
-#define mi_heap_destroy PyUnstable_mi_heap_destroy
-#define _mi_heap_empty PyUnstable__mi_heap_empty
-#define mi_heap_get_backing PyUnstable_mi_heap_get_backing
-#define mi_heap_get_default PyUnstable_mi_heap_get_default
-#define _mi_heap_main_get PyUnstable__mi_heap_main_get
-#define mi_heap_malloc_aligned_at PyUnstable_mi_heap_malloc_aligned_at
-#define mi_heap_malloc_aligned PyUnstable_mi_heap_malloc_aligned
-#define mi_heap_mallocn PyUnstable_mi_heap_mallocn
-#define mi_heap_malloc PyUnstable_mi_heap_malloc
-#define mi_heap_malloc_small PyUnstable_mi_heap_malloc_small
-#define _mi_heap_malloc_zero_ex PyUnstable__mi_heap_malloc_zero_ex
-#define _mi_heap_malloc_zero PyUnstable__mi_heap_malloc_zero
-#define _mi_heap_memid_is_suitable PyUnstable__mi_heap_memid_is_suitable
-#define mi_heap_new_in_arena PyUnstable_mi_heap_new_in_arena
-#define mi_heap_new PyUnstable_mi_heap_new
-#define _mi_heap_random_next PyUnstable__mi_heap_random_next
-#define mi_heap_realloc_aligned_at PyUnstable_mi_heap_realloc_aligned_at
-#define mi_heap_realloc_aligned PyUnstable_mi_heap_realloc_aligned
-#define mi_heap_reallocf PyUnstable_mi_heap_reallocf
-#define mi_heap_reallocn PyUnstable_mi_heap_reallocn
-#define mi_heap_realloc PyUnstable_mi_heap_realloc
-#define _mi_heap_realloc_zero PyUnstable__mi_heap_realloc_zero
-#define mi_heap_realpath PyUnstable_mi_heap_realpath
-#define mi_heap_recalloc_aligned_at PyUnstable_mi_heap_recalloc_aligned_at
-#define mi_heap_recalloc_aligned PyUnstable_mi_heap_recalloc_aligned
-#define mi_heap_recalloc PyUnstable_mi_heap_recalloc
-#define mi_heap_rezalloc_aligned_at PyUnstable_mi_heap_rezalloc_aligned_at
-#define mi_heap_rezalloc_aligned PyUnstable_mi_heap_rezalloc_aligned
-#define mi_heap_rezalloc PyUnstable_mi_heap_rezalloc
-#define _mi_heap_set_default_direct PyUnstable__mi_heap_set_default_direct
-#define mi_heap_set_default PyUnstable_mi_heap_set_default
-#define mi_heap_strdup PyUnstable_mi_heap_strdup
-#define mi_heap_strndup PyUnstable_mi_heap_strndup
-#define mi_heap_try_new PyUnstable_mi_heap_try_new
-#define _mi_heap_unsafe_destroy_all PyUnstable__mi_heap_unsafe_destroy_all
-#define mi_heap_visit_blocks PyUnstable_mi_heap_visit_blocks
-#define mi_heap_zalloc_aligned_at PyUnstable_mi_heap_zalloc_aligned_at
-#define mi_heap_zalloc_aligned PyUnstable_mi_heap_zalloc_aligned
-#define mi_heap_zalloc PyUnstable_mi_heap_zalloc
-#define mi_is_in_heap_region PyUnstable_mi_is_in_heap_region
-#define _mi_is_main_thread PyUnstable__mi_is_main_thread
-#define mi_is_redirected PyUnstable_mi_is_redirected
-#define mi_malloc_aligned_at PyUnstable_mi_malloc_aligned_at
-#define mi_malloc_aligned PyUnstable_mi_malloc_aligned
-#define _mi_malloc_generic PyUnstable__mi_malloc_generic
-#define mi_malloc_good_size PyUnstable_mi_malloc_good_size
-#define mi_mallocn PyUnstable_mi_mallocn
-#define mi_malloc PyUnstable_mi_malloc
-#define mi_malloc_size PyUnstable_mi_malloc_size
-#define mi_malloc_small PyUnstable_mi_malloc_small
-#define mi_malloc_usable_size PyUnstable_mi_malloc_usable_size
-#define mi_manage_os_memory_ex PyUnstable_mi_manage_os_memory_ex
-#define mi_manage_os_memory PyUnstable_mi_manage_os_memory
-#define mi_mbsdup PyUnstable_mi_mbsdup
-#define mi_memalign PyUnstable_mi_memalign
-#define mi_new_aligned_nothrow PyUnstable_mi_new_aligned_nothrow
-#define mi_new_aligned PyUnstable_mi_new_aligned
-#define mi_new_nothrow PyUnstable_mi_new_nothrow
-#define mi_new_n PyUnstable_mi_new_n
-#define mi_new PyUnstable_mi_new
-#define mi_new_reallocn PyUnstable_mi_new_reallocn
-#define mi_new_realloc PyUnstable_mi_new_realloc
-#define mi_option_disable PyUnstable_mi_option_disable
-#define mi_option_enable PyUnstable_mi_option_enable
-#define mi_option_get_clamp PyUnstable_mi_option_get_clamp
-#define mi_option_get PyUnstable_mi_option_get
-#define mi_option_get_size PyUnstable_mi_option_get_size
-#define mi_option_is_enabled PyUnstable_mi_option_is_enabled
-#define mi_option_set_default PyUnstable_mi_option_set_default
-#define mi_option_set_enabled_default PyUnstable_mi_option_set_enabled_default
-#define mi_option_set_enabled PyUnstable_mi_option_set_enabled
-#define mi_option_set PyUnstable_mi_option_set
-#define _mi_options_init PyUnstable__mi_options_init
-#define _mi_os_alloc_aligned_at_offset PyUnstable__mi_os_alloc_aligned_at_offset
-#define _mi_os_alloc_aligned PyUnstable__mi_os_alloc_aligned
-#define _mi_os_alloc_huge_os_pages PyUnstable__mi_os_alloc_huge_os_pages
-#define _mi_os_alloc PyUnstable__mi_os_alloc
-#define _mi_os_commit PyUnstable__mi_os_commit
-#define _mi_os_decommit PyUnstable__mi_os_decommit
-#define _mi_os_free_ex PyUnstable__mi_os_free_ex
-#define _mi_os_free PyUnstable__mi_os_free
-#define _mi_os_get_aligned_hint PyUnstable__mi_os_get_aligned_hint
-#define _mi_os_good_alloc_size PyUnstable__mi_os_good_alloc_size
-#define _mi_os_has_overcommit PyUnstable__mi_os_has_overcommit
-#define _mi_os_has_virtual_reserve PyUnstable__mi_os_has_virtual_reserve
-#define _mi_os_init PyUnstable__mi_os_init
-#define _mi_os_large_page_size PyUnstable__mi_os_large_page_size
-#define _mi_os_numa_node_count_get PyUnstable__mi_os_numa_node_count_get
-#define _mi_os_numa_node_get PyUnstable__mi_os_numa_node_get
-#define _mi_os_page_size PyUnstable__mi_os_page_size
-#define _mi_os_protect PyUnstable__mi_os_protect
-#define _mi_os_purge_ex PyUnstable__mi_os_purge_ex
-#define _mi_os_purge PyUnstable__mi_os_purge
-#define _mi_os_random_weak PyUnstable__mi_os_random_weak
-#define _mi_os_reset PyUnstable__mi_os_reset
-#define _mi_os_unprotect PyUnstable__mi_os_unprotect
-#define _mi_os_use_large_page PyUnstable__mi_os_use_large_page
-#define _mi_padding_shrink PyUnstable__mi_padding_shrink
-#define _mi_page_abandon PyUnstable__mi_page_abandon
-#define _mi_page_empty PyUnstable__mi_page_empty
-#define _mi_page_free_collect PyUnstable__mi_page_free_collect
-#define _mi_page_free PyUnstable__mi_page_free
-#define _mi_page_malloc PyUnstable__mi_page_malloc
-#define _mi_page_ptr_unalign PyUnstable__mi_page_ptr_unalign
-#define _mi_page_queue_append PyUnstable__mi_page_queue_append
-#define _mi_page_reclaim PyUnstable__mi_page_reclaim
-#define _mi_page_retire PyUnstable__mi_page_retire
-#define _mi_page_try_use_delayed_free PyUnstable__mi_page_try_use_delayed_free
-#define _mi_page_unfull PyUnstable__mi_page_unfull
-#define _mi_page_use_delayed_free PyUnstable__mi_page_use_delayed_free
-#define mi_posix_memalign PyUnstable_mi_posix_memalign
-#define _mi_preloading PyUnstable__mi_preloading
-#define _mi_prim_alloc_huge_os_pages PyUnstable__mi_prim_alloc_huge_os_pages
-#define _mi_prim_alloc PyUnstable__mi_prim_alloc
-#define _mi_prim_clock_now PyUnstable__mi_prim_clock_now
-#define _mi_prim_commit PyUnstable__mi_prim_commit
-#define _mi_prim_decommit PyUnstable__mi_prim_decommit
-#define _mi_prim_free PyUnstable__mi_prim_free
-#define _mi_prim_getenv PyUnstable__mi_prim_getenv
-#define _mi_prim_mem_init PyUnstable__mi_prim_mem_init
-#define _mi_prim_numa_node_count PyUnstable__mi_prim_numa_node_count
-#define _mi_prim_numa_node PyUnstable__mi_prim_numa_node
-#define _mi_prim_out_stderr PyUnstable__mi_prim_out_stderr
-#define _mi_prim_process_info PyUnstable__mi_prim_process_info
-#define _mi_prim_protect PyUnstable__mi_prim_protect
-#define _mi_prim_random_buf PyUnstable__mi_prim_random_buf
-#define _mi_prim_reset PyUnstable__mi_prim_reset
-#define _mi_prim_thread_associate_default_heap PyUnstable__mi_prim_thread_associate_default_heap
-#define _mi_prim_thread_done_auto_done PyUnstable__mi_prim_thread_done_auto_done
-#define _mi_prim_thread_init_auto_done PyUnstable__mi_prim_thread_init_auto_done
-#define mi_process_info PyUnstable_mi_process_info
-#define mi_process_init PyUnstable_mi_process_init
-#define mi_pvalloc PyUnstable_mi_pvalloc
-#define _mi_random_init PyUnstable__mi_random_init
-#define _mi_random_init_weak PyUnstable__mi_random_init_weak
-#define _mi_random_next PyUnstable__mi_random_next
-#define _mi_random_reinit_if_weak PyUnstable__mi_random_reinit_if_weak
-#define _mi_random_split PyUnstable__mi_random_split
-#define mi_realloc_aligned_at PyUnstable_mi_realloc_aligned_at
-#define mi_realloc_aligned PyUnstable_mi_realloc_aligned
-#define mi_reallocarray PyUnstable_mi_reallocarray
-#define mi_reallocarr PyUnstable_mi_reallocarr
-#define mi_reallocf PyUnstable_mi_reallocf
-#define mi_reallocn PyUnstable_mi_reallocn
-#define mi_realloc PyUnstable_mi_realloc
-#define mi_realpath PyUnstable_mi_realpath
-#define mi_recalloc_aligned_at PyUnstable_mi_recalloc_aligned_at
-#define mi_recalloc_aligned PyUnstable_mi_recalloc_aligned
-#define mi_recalloc PyUnstable_mi_recalloc
-#define mi_register_deferred_free PyUnstable_mi_register_deferred_free
-#define mi_register_error PyUnstable_mi_register_error
-#define mi_register_output PyUnstable_mi_register_output
-#define mi_reserve_huge_os_pages_at_ex PyUnstable_mi_reserve_huge_os_pages_at_ex
-#define mi_reserve_huge_os_pages_at PyUnstable_mi_reserve_huge_os_pages_at
-#define mi_reserve_huge_os_pages_interleave PyUnstable_mi_reserve_huge_os_pages_interleave
-#define mi_reserve_huge_os_pages PyUnstable_mi_reserve_huge_os_pages
-#define mi_reserve_os_memory_ex PyUnstable_mi_reserve_os_memory_ex
-#define mi_reserve_os_memory PyUnstable_mi_reserve_os_memory
-#define mi_rezalloc_aligned_at PyUnstable_mi_rezalloc_aligned_at
-#define mi_rezalloc_aligned PyUnstable_mi_rezalloc_aligned
-#define mi_rezalloc PyUnstable_mi_rezalloc
-#define _mi_segment_huge_page_reset PyUnstable__mi_segment_huge_page_reset
-#define _mi_segment_map_allocated_at PyUnstable__mi_segment_map_allocated_at
-#define _mi_segment_map_freed_at PyUnstable__mi_segment_map_freed_at
-#define _mi_segment_page_abandon PyUnstable__mi_segment_page_abandon
-#define _mi_segment_page_alloc PyUnstable__mi_segment_page_alloc
-#define _mi_segment_page_free PyUnstable__mi_segment_page_free
-#define _mi_segment_page_start PyUnstable__mi_segment_page_start
-#define _mi_segment_thread_collect PyUnstable__mi_segment_thread_collect
-#define _mi_stat_counter_increase PyUnstable__mi_stat_counter_increase
-#define _mi_stat_decrease PyUnstable__mi_stat_decrease
-#define _mi_stat_increase PyUnstable__mi_stat_increase
-#define _mi_stats_done PyUnstable__mi_stats_done
-#define mi_stats_merge PyUnstable_mi_stats_merge
-#define mi_stats_print_out PyUnstable_mi_stats_print_out
-#define mi_stats_print PyUnstable_mi_stats_print
-#define mi_stats_reset PyUnstable_mi_stats_reset
-#define mi_strdup PyUnstable_mi_strdup
-#define _mi_strlcat PyUnstable__mi_strlcat
-#define _mi_strlcpy PyUnstable__mi_strlcpy
-#define _mi_strlen PyUnstable__mi_strlen
-#define mi_strndup PyUnstable_mi_strndup
-#define _mi_strnicmp PyUnstable__mi_strnicmp
-#define _mi_strnlen PyUnstable__mi_strnlen
-#define _mi_thread_data_collect PyUnstable__mi_thread_data_collect
-#define _mi_thread_done PyUnstable__mi_thread_done
-#define mi_thread_done PyUnstable_mi_thread_done
-#define _mi_thread_id PyUnstable__mi_thread_id
-#define mi_thread_init PyUnstable_mi_thread_init
-#define mi_thread_stats_print_out PyUnstable_mi_thread_stats_print_out
-#define _mi_toupper PyUnstable__mi_toupper
-#define _mi_trace_message PyUnstable__mi_trace_message
-#define mi_usable_size PyUnstable_mi_usable_size
-#define mi_valloc PyUnstable_mi_valloc
-#define _mi_verbose_message PyUnstable__mi_verbose_message
-#define mi_version PyUnstable_mi_version
-#define _mi_warning_message PyUnstable__mi_warning_message
-#define mi_wcsdup PyUnstable_mi_wcsdup
-#define mi_wdupenv_s PyUnstable_mi_wdupenv_s
-#define mi_zalloc_aligned_at PyUnstable_mi_zalloc_aligned_at
-#define mi_zalloc_aligned PyUnstable_mi_zalloc_aligned
-#define mi_zalloc PyUnstable_mi_zalloc
-#define mi_zalloc_small PyUnstable_mi_zalloc_small
-
 #include "mimalloc.h"
 
 #endif // Py_INTERNAL_MIMALLOC_H
diff --git a/Makefile.pre.in b/Makefile.pre.in
index c7d735d699cce6..7cc15b01a0a3bb 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -100,7 +100,7 @@ CONFIGURE_LDFLAGS=	@LDFLAGS@
 # command line to append to these values without stomping the pre-set
 # values.
 PY_CFLAGS=	$(BASECFLAGS) $(OPT) $(CONFIGURE_CFLAGS) $(CFLAGS) $(EXTRA_CFLAGS)
-PY_CFLAGS_NODIST=$(CONFIGURE_CFLAGS_NODIST) $(CFLAGS_NODIST) -I$(srcdir)/Include/internal -I$(srcdir)/Include/mimalloc
+PY_CFLAGS_NODIST=$(CONFIGURE_CFLAGS_NODIST) $(CFLAGS_NODIST) -I$(srcdir)/Include/internal -I$(srcdir)/Include/internal/mimalloc
 # Both CPPFLAGS and LDFLAGS need to contain the shell's value for setup.py to
 # be able to build extension modules using the directories specified in the
 # environment variables
@@ -342,12 +342,12 @@ IO_OBJS=	\
 
 MIMALLOC_HEADERS= \
                 $(srcdir)/Include/internal/pycore_mimalloc.h \
-		$(srcdir)/Include/mimalloc/mimalloc.h \
-		$(srcdir)/Include/mimalloc/mimalloc/atomic.h \
-		$(srcdir)/Include/mimalloc/mimalloc/internal.h \
-		$(srcdir)/Include/mimalloc/mimalloc/prim.h \
-		$(srcdir)/Include/mimalloc/mimalloc/track.h \
-		$(srcdir)/Include/mimalloc/mimalloc/types.h
+		$(srcdir)/Include/internal/mimalloc/mimalloc.h \
+		$(srcdir)/Include/internal/mimalloc/mimalloc/atomic.h \
+		$(srcdir)/Include/internal/mimalloc/mimalloc/internal.h \
+		$(srcdir)/Include/internal/mimalloc/mimalloc/prim.h \
+		$(srcdir)/Include/internal/mimalloc/mimalloc/track.h \
+		$(srcdir)/Include/internal/mimalloc/mimalloc/types.h
 
 
 ##########################################################################
@@ -1552,6 +1552,22 @@ Objects/unicodeobject.o: $(srcdir)/Objects/unicodeobject.c $(UNICODE_DEPS)
 Objects/dictobject.o: $(srcdir)/Objects/stringlib/eq.h
 Objects/setobject.o: $(srcdir)/Objects/stringlib/eq.h
 
+Objects/obmalloc.o: $(srcdir)/Objects/mimalloc/alloc.c \
+				    $(srcdir)/Objects/mimalloc/alloc-aligned.c \
+					$(srcdir)/Objects/mimalloc/alloc-posix.c \
+					$(srcdir)/Objects/mimalloc/arena.c \
+					$(srcdir)/Objects/mimalloc/bitmap.c \
+					$(srcdir)/Objects/mimalloc/heap.c \
+					$(srcdir)/Objects/mimalloc/init.c \
+					$(srcdir)/Objects/mimalloc/options.c \
+					$(srcdir)/Objects/mimalloc/os.c \
+					$(srcdir)/Objects/mimalloc/page.c \
+					$(srcdir)/Objects/mimalloc/random.c \
+					$(srcdir)/Objects/mimalloc/segment.c \
+					$(srcdir)/Objects/mimalloc/segment-map.c \
+					$(srcdir)/Objects/mimalloc/stats.c \
+					$(srcdir)/Objects/mimalloc/prim/prim.c
+
 Objects/mimalloc/page.o: $(srcdir)/Objects/mimalloc/page-queue.c
 
 .PHONY: regen-cases
diff --git a/Tools/build/smelly.py b/Tools/build/smelly.py
index ab345307ff9b64..e33f222218ac92 100755
--- a/Tools/build/smelly.py
+++ b/Tools/build/smelly.py
@@ -11,6 +11,11 @@
 if sys.platform == 'darwin':
     ALLOWED_PREFIXES += ('__Py',)
 
+# mimalloc doesn't use static, but it's symbols are not exported
+# from the shared library.  They do show up in the static library
+# before its linked into an executable.
+ALLOWED_STATIC_PREFIXES = ('mi_', '_mi_')
+
 # "Legacy": some old symbols are prefixed by "PY_".
 EXCEPTIONS = frozenset({
     'PY_TIMEOUT_MAX',
@@ -59,7 +64,7 @@ def get_exported_symbols(library, dynamic=False):
     return stdout
 
 
-def get_smelly_symbols(stdout):
+def get_smelly_symbols(stdout, dynamic=False):
     smelly_symbols = []
     python_symbols = []
     local_symbols = []
@@ -77,7 +82,9 @@ def get_smelly_symbols(stdout):
         symbol = parts[-1]
         result = '%s (type: %s)' % (symbol, symtype)
 
-        if symbol.startswith(ALLOWED_PREFIXES) or symbol in EXCEPTIONS:
+        if (symbol.startswith(ALLOWED_PREFIXES) or 
+            symbol in EXCEPTIONS or 
+            (not dynamic and symbol.startswith(ALLOWED_STATIC_PREFIXES))):
             python_symbols.append(result)
             continue
 
@@ -95,7 +102,7 @@ def get_smelly_symbols(stdout):
 
 def check_library(library, dynamic=False):
     nm_output = get_exported_symbols(library, dynamic)
-    smelly_symbols, python_symbols = get_smelly_symbols(nm_output)
+    smelly_symbols, python_symbols = get_smelly_symbols(nm_output, dynamic)
 
     if not smelly_symbols:
         print(f"OK: no smelly symbol found ({len(python_symbols)} Python symbols)")

From 82305dc2709561e9f85396e65f5e1889877df840 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Tue, 10 Oct 2023 17:28:30 -0700
Subject: [PATCH 15/24] Fix header locations on Windows Fix header locations in
 c-analyzer Fix smelly trailing white space

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
 PCbuild/pythoncore.vcxproj          | 12 ++++++------
 PCbuild/pythoncore.vcxproj.filters  | 14 +++++++-------
 Tools/build/smelly.py               |  4 ++--
 Tools/c-analyzer/cpython/_files.py  |  2 +-
 Tools/c-analyzer/cpython/_parser.py |  6 +++---
 5 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index e19c9d0396abc0..4b3d5bf5784154 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -302,12 +302,12 @@
     <ClInclude Include="..\Include\marshal.h" />
     <ClInclude Include="..\Include\memoryobject.h" />
     <ClInclude Include="..\Include\methodobject.h" />
-    <ClInclude Include="..\Include\mimalloc\mimalloc\atomic.h" />
-    <ClInclude Include="..\Include\mimalloc\mimalloc\internal.h" />
-    <ClInclude Include="..\Include\mimalloc\mimalloc\prim.h" />
-    <ClInclude Include="..\Include\mimalloc\mimalloc\track.h" />
-    <ClInclude Include="..\Include\mimalloc\mimalloc\types.h" />
-    <ClInclude Include="..\Include\mimalloc\mimalloc.h" />
+    <ClInclude Include="..\Include\internal\mimalloc\mimalloc\atomic.h" />
+    <ClInclude Include="..\Include\internal\mimalloc\mimalloc\internal.h" />
+    <ClInclude Include="..\Include\internal\mimalloc\mimalloc\prim.h" />
+    <ClInclude Include="..\Include\internal\mimalloc\mimalloc\track.h" />
+    <ClInclude Include="..\Include\internal\mimalloc\mimalloc\types.h" />
+    <ClInclude Include="..\Include\internal\mimalloc\mimalloc.h" />
     <ClInclude Include="..\Include\modsupport.h" />
     <ClInclude Include="..\Include\moduleobject.h" />
     <ClInclude Include="..\Include\object.h" />
diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index ce16799029653a..2eba3a8b427792 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters
@@ -4,7 +4,7 @@
     <Filter Include="Include">
       <UniqueIdentifier>{086b0afb-270c-4603-a02a-63d46f0b2b92}</UniqueIdentifier>
     </Filter>
-    <Filter Include="Include\mimalloc">
+    <Filter Include="Include\internal\mimalloc">
       <UniqueIdentifier>{1dc8d8bd-d493-478e-9b7a-7eb108bb8bd5}</UniqueIdentifier>
     </Filter>
     <Filter Include="Modules">
@@ -795,22 +795,22 @@
     <ClInclude Include="..\Include\internal\pycore_uops.h">
       <Filter>Include\internal</Filter>
     </ClInclude>
-    <ClInclude Include="..\Include\mimalloc\mimalloc.h">
+    <ClInclude Include="..\Include\internal\mimalloc\mimalloc.h">
       <Filter>Include\mimalloc</Filter>
     </ClInclude>
-    <ClInclude Include="..\Include\mimalloc\mimalloc\atomic.h">
+    <ClInclude Include="..\Include\internal\mimalloc\mimalloc\atomic.h">
       <Filter>Include\mimalloc</Filter>
     </ClInclude>
-    <ClInclude Include="..\Include\mimalloc\mimalloc\internal.h">
+    <ClInclude Include="..\Include\internal\mimalloc\mimalloc\internal.h">
       <Filter>Include\mimalloc</Filter>
     </ClInclude>
-    <ClInclude Include="..\Include\mimalloc\mimalloc\prim.h">
+    <ClInclude Include="..\Include\internal\mimalloc\mimalloc\prim.h">
       <Filter>Include\mimalloc</Filter>
     </ClInclude>
-    <ClInclude Include="..\Include\mimalloc\mimalloc\track.h">
+    <ClInclude Include="..\Include\internal\mimalloc\mimalloc\track.h">
       <Filter>Include\mimalloc</Filter>
     </ClInclude>
-    <ClInclude Include="..\Include\mimalloc\mimalloc\types.h">
+    <ClInclude Include="..\Include\internal\mimalloc\mimalloc\types.h">
       <Filter>Include\mimalloc</Filter>
     </ClInclude>
     <ClInclude Include="$(zlibDir)\crc32.h">
diff --git a/Tools/build/smelly.py b/Tools/build/smelly.py
index e33f222218ac92..7c534269c57a09 100755
--- a/Tools/build/smelly.py
+++ b/Tools/build/smelly.py
@@ -82,8 +82,8 @@ def get_smelly_symbols(stdout, dynamic=False):
         symbol = parts[-1]
         result = '%s (type: %s)' % (symbol, symtype)
 
-        if (symbol.startswith(ALLOWED_PREFIXES) or 
-            symbol in EXCEPTIONS or 
+        if (symbol.startswith(ALLOWED_PREFIXES) or
+            symbol in EXCEPTIONS or
             (not dynamic and symbol.startswith(ALLOWED_STATIC_PREFIXES))):
             python_symbols.append(result)
             continue
diff --git a/Tools/c-analyzer/cpython/_files.py b/Tools/c-analyzer/cpython/_files.py
index 4be8f19bdac207..a9bf0a1961222a 100644
--- a/Tools/c-analyzer/cpython/_files.py
+++ b/Tools/c-analyzer/cpython/_files.py
@@ -9,7 +9,7 @@
     # Technically, this is covered by "Include/*.h":
     #'Include/cpython/*.h',
     'Include/internal/*.h',
-    'Include/mimalloc/**/*.h',
+    'Include/internal/mimalloc/**/*.h',
     'Modules/**/*.h',
     'Modules/**/*.c',
     'Objects/**/*.h',
diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py
index c8a790e5defe2e..04388fb54caa6c 100644
--- a/Tools/c-analyzer/cpython/_parser.py
+++ b/Tools/c-analyzer/cpython/_parser.py
@@ -91,8 +91,8 @@ def clean_lines(text):
 
 # mimalloc
 Objects/mimalloc/*.c
-Include/mimalloc/*.h
-Include/mimalloc/mimalloc/*.h
+Include/internal/mimalloc/*.h
+Include/internal/mimalloc/mimalloc/*.h
 
 # @end=conf@
 ''')
@@ -114,7 +114,7 @@ def clean_lines(text):
 *	.
 *	./Include
 *	./Include/internal
-*   ./Include/mimalloc
+*   ./Include/internal/mimalloc
 
 Modules/_decimal/**/*.c	Modules/_decimal/libmpdec
 Modules/_elementtree.c	Modules/expat

From 2d0158a7ccdc4b1a09a40877fb4458815d618840 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Thu, 12 Oct 2023 16:51:44 -0700
Subject: [PATCH 16/24] Try again to fix windows headers

---
 PCbuild/pyproject.props            |  2 +-
 PCbuild/pythoncore.vcxproj.filters | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/PCbuild/pyproject.props b/PCbuild/pyproject.props
index 437ffa80f1665c..fb12f2b0766d9b 100644
--- a/PCbuild/pyproject.props
+++ b/PCbuild/pyproject.props
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <ItemDefinitionGroup>
     <ClCompile>
-      <AdditionalIncludeDirectories>$(PySourcePath)Include;$(PySourcePath)Include\internal;$(PySourcePath)Include\mimalloc;$(PySourcePath)PC;$(IntDir);%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>$(PySourcePath)Include;$(PySourcePath)Include\internal;$(PySourcePath)Include\internal\mimalloc;$(PySourcePath)PC;$(IntDir);%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;$(_Py3NamePreprocessorDefinition);$(_PlatformPreprocessorDefinition)$(_DebugPreprocessorDefinition)$(_PydPreprocessorDefinition)%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <PreprocessorDefinitions Condition="'$(DisableGil)' == 'true'">Py_NOGIL=1;%(PreprocessorDefinitions)</PreprocessorDefinitions>
 
diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index 2eba3a8b427792..b17149ae728988 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters
@@ -796,22 +796,22 @@
       <Filter>Include\internal</Filter>
     </ClInclude>
     <ClInclude Include="..\Include\internal\mimalloc\mimalloc.h">
-      <Filter>Include\mimalloc</Filter>
+      <Filter>Include\internal\mimalloc</Filter>
     </ClInclude>
     <ClInclude Include="..\Include\internal\mimalloc\mimalloc\atomic.h">
-      <Filter>Include\mimalloc</Filter>
+      <Filter>Include\internal\mimalloc</Filter>
     </ClInclude>
     <ClInclude Include="..\Include\internal\mimalloc\mimalloc\internal.h">
-      <Filter>Include\mimalloc</Filter>
+      <Filter>Include\internal\mimalloc</Filter>
     </ClInclude>
     <ClInclude Include="..\Include\internal\mimalloc\mimalloc\prim.h">
-      <Filter>Include\mimalloc</Filter>
+      <Filter>Include\internal\mimalloc</Filter>
     </ClInclude>
     <ClInclude Include="..\Include\internal\mimalloc\mimalloc\track.h">
-      <Filter>Include\mimalloc</Filter>
+      <Filter>Include\internal\mimalloc</Filter>
     </ClInclude>
     <ClInclude Include="..\Include\internal\mimalloc\mimalloc\types.h">
-      <Filter>Include\mimalloc</Filter>
+      <Filter>Include\internal\mimalloc</Filter>
     </ClInclude>
     <ClInclude Include="$(zlibDir)\crc32.h">
       <Filter>Modules\zlib</Filter>

From 708cc6682b2c11b547d6f9705cbdd98708690344 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Mon, 16 Oct 2023 11:02:27 -0700
Subject: [PATCH 17/24] Make memory sanitizer happy

---
 Objects/mimalloc/prim/unix/prim.c | 2 +-
 Objects/mimalloc/random.c         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Objects/mimalloc/prim/unix/prim.c b/Objects/mimalloc/prim/unix/prim.c
index a9555eb09e11f6..060523c581ae24 100644
--- a/Objects/mimalloc/prim/unix/prim.c
+++ b/Objects/mimalloc/prim/unix/prim.c
@@ -104,7 +104,7 @@ static bool unix_detect_overcommit(void) {
 #if defined(__linux__)
   int fd = mi_prim_open("/proc/sys/vm/overcommit_memory", O_RDONLY);
         if (fd >= 0) {
-    char buf[32];
+    char buf[32] = {0};
     ssize_t nread = mi_prim_read(fd, &buf, sizeof(buf));
     mi_prim_close(fd);
     // <https://www.kernel.org/doc/Documentation/vm/overcommit-accounting>
diff --git a/Objects/mimalloc/random.c b/Objects/mimalloc/random.c
index 2a18b5aa992dad..35a0633a800d6c 100644
--- a/Objects/mimalloc/random.c
+++ b/Objects/mimalloc/random.c
@@ -171,7 +171,7 @@ uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
 }
 
 static void mi_random_init_ex(mi_random_ctx_t* ctx, bool use_weak) {
-  uint8_t key[32];
+  uint8_t key[32] = {0};
   if (use_weak || !_mi_prim_random_buf(key, sizeof(key))) {
     // if we fail to get random data from the OS, we fall back to a
     // weak random source based on the current time

From 526a08d960283b9d5d9681e6689f0d0e6ae85c9b Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Mon, 23 Oct 2023 14:27:19 -0700
Subject: [PATCH 18/24] Keep pymalloc the default

---
 Include/internal/pycore_pymem_init.h | 15 +--------------
 Lib/test/test_cmd_line.py            |  9 ++-------
 Objects/obmalloc.c                   |  6 +-----
 3 files changed, 4 insertions(+), 26 deletions(-)

diff --git a/Include/internal/pycore_pymem_init.h b/Include/internal/pycore_pymem_init.h
index 8776fb1d38099c..11fbe16fa6f1d5 100644
--- a/Include/internal/pycore_pymem_init.h
+++ b/Include/internal/pycore_pymem_init.h
@@ -18,20 +18,7 @@ extern void * _PyMem_RawRealloc(void *, void *, size_t);
 extern void _PyMem_RawFree(void *, void *);
 #define PYRAW_ALLOC {NULL, _PyMem_RawMalloc, _PyMem_RawCalloc, _PyMem_RawRealloc, _PyMem_RawFree}
 
-#ifdef WITH_MIMALLOC
-extern void* _PyMem_MiMalloc(void *ctx, size_t size);
-extern void* _PyMem_MiCalloc(void *ctx, size_t nelem, size_t elsize);
-extern void* _PyMem_MiRealloc(void *ctx, void *ptr, size_t size);
-extern void _PyMem_MiFree(void *ctx, void *ptr);
-extern void* _PyObject_MiMalloc(void *ctx, size_t nbytes);
-extern void* _PyObject_MiCalloc(void *ctx, size_t nelem, size_t elsize);
-extern void* _PyObject_MiRealloc(void *ctx, void *ptr, size_t nbytes);
-extern void _PyObject_MiFree(void *ctx, void *ptr);
-
-#  define PYMEM_ALLOC {NULL, _PyMem_MiMalloc, _PyMem_MiCalloc, _PyMem_MiRealloc, _PyMem_MiFree}
-#  define PYOBJ_ALLOC {NULL, _PyObject_MiMalloc, _PyObject_MiCalloc, _PyObject_MiRealloc, _PyObject_MiFree}
-
-#elif defined(WITH_PYMALLOC)
+#if defined(WITH_PYMALLOC)
 extern void* _PyObject_Malloc(void *, size_t);
 extern void* _PyObject_Calloc(void *, size_t, size_t);
 extern void _PyObject_Free(void *, void *);
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
index 49f81a956a581f..866c7d11000303 100644
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -724,9 +724,7 @@ def test_xdev(self):
             code = "import _testinternalcapi; print(_testinternalcapi.pymem_getallocatorsname())"
             with support.SuppressCrashReport():
                 out = self.run_xdev("-c", code, check_exitcode=False)
-            if support.with_mimalloc():
-                alloc_name = "mimalloc_debug"
-            elif support.with_pymalloc():
+            if support.with_pymalloc():
                 alloc_name = "pymalloc_debug"
             else:
                 alloc_name = "malloc_debug"
@@ -806,10 +804,7 @@ def test_pythonmalloc(self):
         # Test the PYTHONMALLOC environment variable
         pymalloc = support.with_pymalloc()
         mimalloc = support.with_mimalloc()
-        if mimalloc:
-            default_name = 'mimalloc_debug' if support.Py_DEBUG else 'mimalloc'
-            default_name_debug = 'mimalloc_debug'
-        elif pymalloc:
+        if pymalloc:
             default_name = 'pymalloc_debug' if support.Py_DEBUG else 'pymalloc'
             default_name_debug = 'pymalloc_debug'
         else:
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index e483ecb6c0d82b..30870d2f67ab27 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -153,11 +153,7 @@ void* _PyObject_Realloc(void *ctx, void *ptr, size_t size);
 #  define PYMALLOC_ALLOC {NULL, _PyObject_Malloc, _PyObject_Calloc, _PyObject_Realloc, _PyObject_Free}
 #endif  // WITH_PYMALLOC
 
-#ifdef WITH_MIMALLOC
-#  define PYRAW_ALLOC MALLOC_ALLOC
-#  define PYMEM_ALLOC MIMALLOC_ALLOC
-#  define PYOBJ_ALLOC MIMALLOC_OBJALLOC
-#elif defined(WITH_PYMALLOC)
+#if defined(WITH_PYMALLOC)
 #  define PYRAW_ALLOC MALLOC_ALLOC
 #  define PYMEM_ALLOC PYMALLOC_ALLOC
 #  define PYOBJ_ALLOC PYMALLOC_ALLOC

From 3d43b3a29f57cd4eb1273105b7904ad31be78c88 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Mon, 30 Oct 2023 15:40:41 +0100
Subject: [PATCH 19/24] Update Lib/test/pythoninfo.py

---
 Lib/test/pythoninfo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Lib/test/pythoninfo.py b/Lib/test/pythoninfo.py
index b0493366bc27f5..3a91a7dbc65021 100644
--- a/Lib/test/pythoninfo.py
+++ b/Lib/test/pythoninfo.py
@@ -543,7 +543,7 @@ def collect_sysconfig(info_add):
     for name in (
         'WITH_DOC_STRINGS',
         'WITH_DTRACE',
-        'WITH_FREELISTS'
+        'WITH_FREELISTS',
         'WITH_MIMALLOC',
         'WITH_PYMALLOC',
         'WITH_VALGRIND',

From 3261bcbb550f8f0dce242f5d94dcf7ece15c9446 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Mon, 30 Oct 2023 15:45:52 +0100
Subject: [PATCH 20/24] license.rst: remove empty line at the end

---
 Doc/license.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Doc/license.rst b/Doc/license.rst
index 9b3a9751ae402a..8aad93062a5a88 100644
--- a/Doc/license.rst
+++ b/Doc/license.rst
@@ -1066,4 +1066,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
-

From 99c4a24b1449595b77be89ec6bc64bd32392c98a Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Mon, 30 Oct 2023 15:50:42 +0100
Subject: [PATCH 21/24] Reduce Makefile.pre.in indentation

---
 Makefile.pre.in | 44 +++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/Makefile.pre.in b/Makefile.pre.in
index 7cc15b01a0a3bb..46cd053f36ae3d 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -341,13 +341,13 @@ IO_OBJS=	\
 # mimalloc
 
 MIMALLOC_HEADERS= \
-                $(srcdir)/Include/internal/pycore_mimalloc.h \
-		$(srcdir)/Include/internal/mimalloc/mimalloc.h \
-		$(srcdir)/Include/internal/mimalloc/mimalloc/atomic.h \
-		$(srcdir)/Include/internal/mimalloc/mimalloc/internal.h \
-		$(srcdir)/Include/internal/mimalloc/mimalloc/prim.h \
-		$(srcdir)/Include/internal/mimalloc/mimalloc/track.h \
-		$(srcdir)/Include/internal/mimalloc/mimalloc/types.h
+	$(srcdir)/Include/internal/pycore_mimalloc.h \
+	$(srcdir)/Include/internal/mimalloc/mimalloc.h \
+	$(srcdir)/Include/internal/mimalloc/mimalloc/atomic.h \
+	$(srcdir)/Include/internal/mimalloc/mimalloc/internal.h \
+	$(srcdir)/Include/internal/mimalloc/mimalloc/prim.h \
+	$(srcdir)/Include/internal/mimalloc/mimalloc/track.h \
+	$(srcdir)/Include/internal/mimalloc/mimalloc/types.h
 
 
 ##########################################################################
@@ -1553,20 +1553,20 @@ Objects/dictobject.o: $(srcdir)/Objects/stringlib/eq.h
 Objects/setobject.o: $(srcdir)/Objects/stringlib/eq.h
 
 Objects/obmalloc.o: $(srcdir)/Objects/mimalloc/alloc.c \
-				    $(srcdir)/Objects/mimalloc/alloc-aligned.c \
-					$(srcdir)/Objects/mimalloc/alloc-posix.c \
-					$(srcdir)/Objects/mimalloc/arena.c \
-					$(srcdir)/Objects/mimalloc/bitmap.c \
-					$(srcdir)/Objects/mimalloc/heap.c \
-					$(srcdir)/Objects/mimalloc/init.c \
-					$(srcdir)/Objects/mimalloc/options.c \
-					$(srcdir)/Objects/mimalloc/os.c \
-					$(srcdir)/Objects/mimalloc/page.c \
-					$(srcdir)/Objects/mimalloc/random.c \
-					$(srcdir)/Objects/mimalloc/segment.c \
-					$(srcdir)/Objects/mimalloc/segment-map.c \
-					$(srcdir)/Objects/mimalloc/stats.c \
-					$(srcdir)/Objects/mimalloc/prim/prim.c
+	$(srcdir)/Objects/mimalloc/alloc-aligned.c \
+		$(srcdir)/Objects/mimalloc/alloc-posix.c \
+		$(srcdir)/Objects/mimalloc/arena.c \
+		$(srcdir)/Objects/mimalloc/bitmap.c \
+		$(srcdir)/Objects/mimalloc/heap.c \
+		$(srcdir)/Objects/mimalloc/init.c \
+		$(srcdir)/Objects/mimalloc/options.c \
+		$(srcdir)/Objects/mimalloc/os.c \
+		$(srcdir)/Objects/mimalloc/page.c \
+		$(srcdir)/Objects/mimalloc/random.c \
+		$(srcdir)/Objects/mimalloc/segment.c \
+		$(srcdir)/Objects/mimalloc/segment-map.c \
+		$(srcdir)/Objects/mimalloc/stats.c \
+		$(srcdir)/Objects/mimalloc/prim/prim.c
 
 Objects/mimalloc/page.o: $(srcdir)/Objects/mimalloc/page-queue.c
 
@@ -1780,7 +1780,9 @@ PYTHON_HEADERS= \
 		$(srcdir)/Include/cpython/unicodeobject.h \
 		$(srcdir)/Include/cpython/warnings.h \
 		$(srcdir)/Include/cpython/weakrefobject.h \
+		\
 		@MIMALLOC_HEADERS@ \
+		\
 		$(srcdir)/Include/internal/pycore_abstract.h \
 		$(srcdir)/Include/internal/pycore_asdl.h \
 		$(srcdir)/Include/internal/pycore_ast.h \

From e2e36dc363dd004d93249a83f235fcf700747b3e Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Mon, 30 Oct 2023 15:55:47 +0100
Subject: [PATCH 22/24] obmalloc.c: indent

---
 Objects/obmalloc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 30870d2f67ab27..a3a65c469cd6f4 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -11,9 +11,9 @@
 #include <stdlib.h>               // malloc()
 #include <stdbool.h>
 #ifdef WITH_MIMALLOC
-#include "pycore_mimalloc.h"
-#include "mimalloc/static.c"
-#include "mimalloc/internal.h"    // for stats
+#  include "pycore_mimalloc.h"
+#  include "mimalloc/static.c"
+#  include "mimalloc/internal.h"  // for stats
 #endif
 
 #undef  uint

From 62b7b8300295b31ae194116d16e0ad2fa29b4597 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Mon, 30 Oct 2023 16:04:52 +0100
Subject: [PATCH 23/24] Split _PyObject_DebugMallocStats() in sub-functions

---
 Objects/obmalloc.c | 80 +++++++++++++++++++++++++++-------------------
 1 file changed, 47 insertions(+), 33 deletions(-)

diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index a3a65c469cd6f4..5927d1656f5e30 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -2726,44 +2726,33 @@ static bool _collect_alloc_stats(
     stats->bytes_committed += area->committed;
     return 1;
 }
-#endif
 
-/* Print summary info to "out" about the state of pymalloc's structures.
- * In Py_DEBUG mode, also perform some expensive internal consistency
- * checks.
- *
- * Return 0 if the memory debug hooks are not installed or no statistics was
- * written into out, return 1 otherwise.
- */
-int
-_PyObject_DebugMallocStats(FILE *out)
+static void
+py_mimalloc_print_stats(FILE *out)
 {
-#ifdef WITH_MIMALLOC
-    if (_PyMem_MimallocEnabled()) {
-        fprintf(out, "Small block threshold = %ld, in %u size classes.\n",
-            MI_SMALL_OBJ_SIZE_MAX, MI_BIN_HUGE);
-        fprintf(out, "Medium block threshold = %ld\n",
-                MI_MEDIUM_OBJ_SIZE_MAX);
-        fprintf(out, "Large object max size = %ld\n",
-                MI_LARGE_OBJ_SIZE_MAX);
-
-        mi_heap_t *heap = mi_heap_get_default();
-        struct _alloc_stats stats = {};
-        mi_heap_visit_blocks(heap, false, &_collect_alloc_stats, &stats);
+    fprintf(out, "Small block threshold = %ld, in %u size classes.\n",
+        MI_SMALL_OBJ_SIZE_MAX, MI_BIN_HUGE);
+    fprintf(out, "Medium block threshold = %ld\n",
+            MI_MEDIUM_OBJ_SIZE_MAX);
+    fprintf(out, "Large object max size = %ld\n",
+            MI_LARGE_OBJ_SIZE_MAX);
 
-        fprintf(out, "    Allocated Blocks: %ld\n", stats.allocated_blocks);
-        fprintf(out, "    Allocated Bytes: %ld\n", stats.allocated_bytes);
-        fprintf(out, "    Allocated Bytes w/ Overhead: %ld\n", stats.allocated_with_overhead);
-        fprintf(out, "    Bytes Reserved: %ld\n", stats.bytes_reserved);
-        fprintf(out, "    Bytes Committed: %ld\n", stats.bytes_committed);
+    mi_heap_t *heap = mi_heap_get_default();
+    struct _alloc_stats stats = {};
+    mi_heap_visit_blocks(heap, false, &_collect_alloc_stats, &stats);
 
-        return 1;
-    }
+    fprintf(out, "    Allocated Blocks: %ld\n", stats.allocated_blocks);
+    fprintf(out, "    Allocated Bytes: %ld\n", stats.allocated_bytes);
+    fprintf(out, "    Allocated Bytes w/ Overhead: %ld\n", stats.allocated_with_overhead);
+    fprintf(out, "    Bytes Reserved: %ld\n", stats.bytes_reserved);
+    fprintf(out, "    Bytes Committed: %ld\n", stats.bytes_committed);
+}
 #endif
 
-    if (!_PyMem_PymallocEnabled()) {
-        return 0;
-    }
+
+static void
+pymalloc_print_stats(FILE *out)
+{
     OMState *state = get_state();
 
     uint i;
@@ -2916,7 +2905,32 @@ _PyObject_DebugMallocStats(FILE *out)
 #endif
 #endif
 
-    return 1;
+}
+
+/* Print summary info to "out" about the state of pymalloc's structures.
+ * In Py_DEBUG mode, also perform some expensive internal consistency
+ * checks.
+ *
+ * Return 0 if the memory debug hooks are not installed or no statistics was
+ * written into out, return 1 otherwise.
+ */
+int
+_PyObject_DebugMallocStats(FILE *out)
+{
+#ifdef WITH_MIMALLOC
+    if (_PyMem_MimallocEnabled()) {
+        py_mimalloc_print_stats(out);
+        return 1;
+    }
+    else
+#endif
+    if (_PyMem_PymallocEnabled()) {
+        pymalloc_print_stats(out);
+        return 1;
+    }
+    else {
+        return 0;
+    }
 }
 
 #endif /* #ifdef WITH_PYMALLOC */

From ab829b3dcebae28160d72fb6859040bea1bcaceb Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Mon, 30 Oct 2023 15:44:26 +0100
Subject: [PATCH 24/24] Fix memory doc

mimalloc is not the default
---
 Doc/c-api/memory.rst | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/Doc/c-api/memory.rst b/Doc/c-api/memory.rst
index 1ea6d23f5e0c98..e6a6346a8cf605 100644
--- a/Doc/c-api/memory.rst
+++ b/Doc/c-api/memory.rst
@@ -379,8 +379,8 @@ Default memory allocators:
 ===============================  ====================  ==================  =====================  ====================
 Configuration                    Name                  PyMem_RawMalloc     PyMem_Malloc           PyObject_Malloc
 ===============================  ====================  ==================  =====================  ====================
-Release build                    ``"mimalloc"``        ``malloc``          ``mimalloc``           ``mimalloc``
-Debug build                      ``"mimalloc_debug"``  ``malloc`` + debug  ``mimalloc`` + debug   ``mimalloc`` + debug
+Release build                    ``"pymalloc"``        ``malloc``          ``pymalloc``           ``pymalloc``
+Debug build                      ``"pymalloc_debug"``  ``malloc`` + debug  ``pymalloc`` + debug   ``pymalloc`` + debug
 Release build, without pymalloc  ``"malloc"``          ``malloc``          ``malloc``             ``malloc``
 Debug build, without pymalloc    ``"malloc_debug"``    ``malloc`` + debug  ``malloc`` + debug     ``malloc`` + debug
 ===============================  ====================  ==================  =====================  ====================
@@ -684,8 +684,6 @@ Python supports the mimalloc allocator when the underlying platform support is a
 mimalloc "is a general purpose allocator with excellent performance characteristics.
 Initially developed by Daan Leijen for the runtime systems of the Koka and Lean languages."
 
-When mimalloc support is available it is the default allocator.
-
 tracemalloc C API
 =================