Skip to content

Commit eebc67a

Browse files
aristeu
authored and gregkh committed
hugetlb: force allocating surplus hugepages on mempolicy allowed nodes
commit 003af99 upstream. When trying to allocate a hugepage with no reserved ones free, it may be allowed in case a number of overcommit hugepages was configured (using /proc/sys/vm/nr_overcommit_hugepages) and that number wasn't reached. This allows for a behavior of having extra hugepages allocated dynamically, if there're resources for it. Some sysadmins even prefer not reserving any hugepages and setting a big number of overcommit hugepages. But while attempting to allocate overcommit hugepages in a multi node system (either NUMA or mempolicy/cpuset) said allocations might randomly fail even when there're resources available for the allocation. This happens due to allowed_mems_nr() only accounting for the number of free hugepages in the nodes the current process belongs to and the surplus hugepage allocation is done so it can be allocated in any node. In case one or more of the requested surplus hugepages are allocated in a different node, the whole allocation will fail due allowed_mems_nr() returning a lower value. So allocate surplus hugepages in one of the nodes the current process belongs to. Easy way to reproduce this issue is to use a 2+ NUMA nodes system: # echo 0 >/proc/sys/vm/nr_hugepages # echo 1 >/proc/sys/vm/nr_overcommit_hugepages # numactl -m0 ./tools/testing/selftests/mm/map_hugetlb 2 Repeating the execution of map_hugetlb test application will eventually fail when the hugepage ends up allocated in a different node. [[email protected]: v2] Link: https://lkml.kernel.org/r/[email protected] Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Aristeu Rozanski <[email protected]> Cc: Muchun Song <[email protected]> Cc: Aristeu Rozanski <[email protected]> Cc: David Hildenbrand <[email protected]> Cc: Vishal Moola <[email protected]> Cc: <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Greg Kroah-Hartman <[email protected]>
1 parent e60f62f commit eebc67a

File tree

1 file changed

+28
-19
lines changed

1 file changed

+28
-19
lines changed

mm/hugetlb.c

+28-19
Original file line numberDiff line numberDiff line change
@@ -2586,6 +2586,23 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
25862586
return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
25872587
}
25882588

2589+
static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
2590+
{
2591+
#ifdef CONFIG_NUMA
2592+
struct mempolicy *mpol = get_task_policy(current);
2593+
2594+
/*
2595+
* Only enforce MPOL_BIND policy which overlaps with cpuset policy
2596+
* (from policy_nodemask) specifically for hugetlb case
2597+
*/
2598+
if (mpol->mode == MPOL_BIND &&
2599+
(apply_policy_zone(mpol, gfp_zone(gfp)) &&
2600+
cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
2601+
return &mpol->nodes;
2602+
#endif
2603+
return NULL;
2604+
}
2605+
25892606
/*
25902607
* Increase the hugetlb pool such that it can accommodate a reservation
25912608
* of size 'delta'.
@@ -2599,6 +2616,8 @@ static int gather_surplus_pages(struct hstate *h, long delta)
25992616
long i;
26002617
long needed, allocated;
26012618
bool alloc_ok = true;
2619+
int node;
2620+
nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
26022621

26032622
lockdep_assert_held(&hugetlb_lock);
26042623
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
@@ -2613,8 +2632,15 @@ static int gather_surplus_pages(struct hstate *h, long delta)
26132632
retry:
26142633
spin_unlock_irq(&hugetlb_lock);
26152634
for (i = 0; i < needed; i++) {
2616-
folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
2617-
NUMA_NO_NODE, NULL);
2635+
folio = NULL;
2636+
for_each_node_mask(node, cpuset_current_mems_allowed) {
2637+
if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) {
2638+
folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
2639+
node, NULL);
2640+
if (folio)
2641+
break;
2642+
}
2643+
}
26182644
if (!folio) {
26192645
alloc_ok = false;
26202646
break;
@@ -4840,23 +4866,6 @@ static int __init default_hugepagesz_setup(char *s)
48404866
}
48414867
__setup("default_hugepagesz=", default_hugepagesz_setup);
48424868

4843-
static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
4844-
{
4845-
#ifdef CONFIG_NUMA
4846-
struct mempolicy *mpol = get_task_policy(current);
4847-
4848-
/*
4849-
* Only enforce MPOL_BIND policy which overlaps with cpuset policy
4850-
* (from policy_nodemask) specifically for hugetlb case
4851-
*/
4852-
if (mpol->mode == MPOL_BIND &&
4853-
(apply_policy_zone(mpol, gfp_zone(gfp)) &&
4854-
cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
4855-
return &mpol->nodes;
4856-
#endif
4857-
return NULL;
4858-
}
4859-
48604869
static unsigned int allowed_mems_nr(struct hstate *h)
48614870
{
48624871
int node;

0 commit comments

Comments
 (0)