
Commit 29ef680

Michal Hocko authored and torvalds committed
memcg, oom: move out_of_memory back to the charge path
Commit 3812c8c ("mm: memcg: do not trap chargers with full callstack on OOM") has changed the ENOMEM semantic of memcg charges. Rather than invoking the oom killer from the charging context it delays the oom killer to the page fault path (pagefault_out_of_memory). This in turn means that many users (e.g. slab or g-u-p) will get ENOMEM when the corresponding memcg hits the hard limit and the memcg is is OOM. This is behavior is inconsistent with !memcg case where the oom killer is invoked from the allocation context and the allocator keeps retrying until it succeeds. The difference in the behavior is user visible. mmap(MAP_POPULATE) might result in not fully populated ranges while the mmap return code doesn't tell that to the userspace. Random syscalls might fail with ENOMEM etc. The primary motivation of the different memcg oom semantic was the deadlock avoidance. Things have changed since then, though. We have an async oom teardown by the oom reaper now and so we do not have to rely on the victim to tear down its memory anymore. Therefore we can return to the original semantic as long as the memcg oom killer is not handed over to the users space. There is still one thing to be careful about here though. If the oom killer is not able to make any forward progress - e.g. because there is no eligible task to kill - then we have to bail out of the charge path to prevent from same class of deadlocks. We have basically two options here. Either we fail the charge with ENOMEM or force the charge and allow overcharge. The first option has been considered more harmful than useful because rare inconsistencies in the ENOMEM behavior is hard to test for and error prone. Basically the same reason why the page allocator doesn't fail allocations under such conditions. The later might allow runaways but those should be really unlikely unless somebody misconfigures the system. E.g. allowing to migrate tasks away from the memcg to a different unlimited memcg with move_charge_at_immigrate disabled. Link: http://lkml.kernel.org/r/[email protected] Signed-off-by: Michal Hocko <[email protected]> Acked-by: Greg Thelen <[email protected]> Cc: Johannes Weiner <[email protected]> Cc: Shakeel Butt <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent d39f8fb commit 29ef680
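
As a hedged illustration of the user-visible symptom described above (not part of the commit): under the old semantics, a process in a memcg at its hard limit could get a successful mmap(MAP_POPULATE) over an only partially populated range. A minimal user-space probe, assuming the memcg hard limit was set up externally and is smaller than the mapping size:

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 64UL << 20;	/* pick something above the memcg hard limit */
	long psize = sysconf(_SC_PAGESIZE);
	size_t pages = len / psize;
	unsigned char *vec = malloc(pages);
	size_t i, resident = 0;

	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* mincore() reports which pages of the range are resident */
	if (!vec || mincore(p, len, vec)) {
		perror("mincore");
		return 1;
	}
	for (i = 0; i < pages; i++)
		resident += vec[i] & 1;

	printf("populated %zu of %zu pages\n", resident, pages);
	return 0;
}

With this patch applied, the charge path either makes forward progress via the oom killer or forces the charge, so the populated count should match the mapping size (unless the task itself is picked as the oom victim).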

4 files changed, 71 insertions(+), 26 deletions(-)

include/linux/memcontrol.h

Lines changed: 8 additions & 8 deletions
@@ -507,16 +507,16 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 				struct task_struct *p);
 
-static inline void mem_cgroup_oom_enable(void)
+static inline void mem_cgroup_enter_user_fault(void)
 {
-	WARN_ON(current->memcg_may_oom);
-	current->memcg_may_oom = 1;
+	WARN_ON(current->in_user_fault);
+	current->in_user_fault = 1;
 }
 
-static inline void mem_cgroup_oom_disable(void)
+static inline void mem_cgroup_exit_user_fault(void)
 {
-	WARN_ON(!current->memcg_may_oom);
-	current->memcg_may_oom = 0;
+	WARN_ON(!current->in_user_fault);
+	current->in_user_fault = 0;
 }
 
 static inline bool task_in_memcg_oom(struct task_struct *p)
@@ -961,11 +961,11 @@ static inline void mem_cgroup_handle_over_high(void)
 {
 }
 
-static inline void mem_cgroup_oom_enable(void)
+static inline void mem_cgroup_enter_user_fault(void)
 {
 }
 
-static inline void mem_cgroup_oom_disable(void)
+static inline void mem_cgroup_exit_user_fault(void)
 {
 }
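
The rename makes it explicit that the flag tracks "task is in a user fault" rather than "oom is allowed". A condensed view of the call pattern, mirroring the mm/memory.c hunk below (kernel context, shown for orientation rather than as standalone-runnable code):

	if (flags & FAULT_FLAG_USER)
		mem_cgroup_enter_user_fault();	/* charges may now wait for userspace OOM handling */

	ret = __handle_mm_fault(vma, address, flags);

	if (flags & FAULT_FLAG_USER)
		mem_cgroup_exit_user_fault();

The WARN_ON()s in the helpers catch unbalanced nesting of these calls.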

include/linux/sched.h

Lines changed: 1 addition & 1 deletion
@@ -722,7 +722,7 @@ struct task_struct {
 	unsigned			restore_sigmask:1;
 #endif
 #ifdef CONFIG_MEMCG
-	unsigned			memcg_may_oom:1;
+	unsigned			in_user_fault:1;
 #ifndef CONFIG_SLOB
 	unsigned			memcg_kmem_skip_account:1;
 #endif

mm/memcontrol.c

Lines changed: 60 additions & 15 deletions
@@ -1534,28 +1534,53 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
 
-static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+enum oom_status {
+	OOM_SUCCESS,
+	OOM_FAILED,
+	OOM_ASYNC,
+	OOM_SKIPPED
+};
+
+static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER)
-		return;
+	if (order > PAGE_ALLOC_COSTLY_ORDER)
+		return OOM_SKIPPED;
+
 	/*
 	 * We are in the middle of the charge context here, so we
 	 * don't want to block when potentially sitting on a callstack
 	 * that holds all kinds of filesystem and mm locks.
 	 *
-	 * Also, the caller may handle a failed allocation gracefully
-	 * (like optional page cache readahead) and so an OOM killer
-	 * invocation might not even be necessary.
+	 * cgroup1 allows disabling the OOM killer and waiting for outside
+	 * handling until the charge can succeed; remember the context and put
+	 * the task to sleep at the end of the page fault when all locks are
+	 * released.
+	 *
+	 * On the other hand, in-kernel OOM killer allows for an async victim
+	 * memory reclaim (oom_reaper) and that means that we are not solely
+	 * relying on the oom victim to make a forward progress and we can
+	 * invoke the oom killer here.
 	 *
-	 * That's why we don't do anything here except remember the
-	 * OOM context and then deal with it at the end of the page
-	 * fault when the stack is unwound, the locks are released,
-	 * and when we know whether the fault was overall successful.
+	 * Please note that mem_cgroup_out_of_memory might fail to find a
+	 * victim and then we have to bail out from the charge path.
 	 */
-	css_get(&memcg->css);
-	current->memcg_in_oom = memcg;
-	current->memcg_oom_gfp_mask = mask;
-	current->memcg_oom_order = order;
+	if (memcg->oom_kill_disable) {
+		if (!current->in_user_fault)
+			return OOM_SKIPPED;
+		css_get(&memcg->css);
+		current->memcg_in_oom = memcg;
+		current->memcg_oom_gfp_mask = mask;
+		current->memcg_oom_order = order;
+
+		return OOM_ASYNC;
+	}
+
+	if (mem_cgroup_out_of_memory(memcg, mask, order))
+		return OOM_SUCCESS;
+
+	WARN(1, "Memory cgroup charge failed because of no reclaimable memory! "
+		"This looks like a misconfiguration or a kernel bug.");
+	return OOM_FAILED;
 }
 
 /**
@@ -1950,6 +1975,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	unsigned long nr_reclaimed;
 	bool may_swap = true;
 	bool drained = false;
+	bool oomed = false;
+	enum oom_status oom_status;
 
 	if (mem_cgroup_is_root(memcg))
 		return 0;
@@ -2037,6 +2064,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (nr_retries--)
 		goto retry;
 
+	if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
+		goto nomem;
+
 	if (gfp_mask & __GFP_NOFAIL)
 		goto force;
 
@@ -2045,8 +2075,23 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	memcg_memory_event(mem_over_limit, MEMCG_OOM);
 
-	mem_cgroup_oom(mem_over_limit, gfp_mask,
+	/*
+	 * keep retrying as long as the memcg oom killer is able to make
+	 * a forward progress or bypass the charge if the oom killer
+	 * couldn't make any progress.
+	 */
+	oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
 		       get_order(nr_pages * PAGE_SIZE));
+	switch (oom_status) {
+	case OOM_SUCCESS:
+		nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+		oomed = true;
+		goto retry;
+	case OOM_FAILED:
+		goto force;
+	default:
+		goto nomem;
+	}
 nomem:
 	if (!(gfp_mask & __GFP_NOFAIL))
 		return -ENOMEM;
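
The memcg->oom_kill_disable branch above is the cgroup-v1 userspace OOM handling case (the OOM_ASYNC path). As a hedged sketch of how a supervisor opts into it - the cgroup-v1 mount point and group name here are assumptions:

#include <stdio.h>

int main(void)
{
	/*
	 * Writing 1 to memory.oom_control sets oom_kill_disable for the
	 * memcg; charges that hit the limit then wait for userspace to
	 * resolve the situation instead of invoking the kernel oom killer.
	 */
	FILE *f = fopen("/sys/fs/cgroup/memory/mygroup/memory.oom_control", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fputs("1\n", f) == EOF || fclose(f) == EOF) {
		perror("write oom_control");
		return 1;
	}
	return 0;
}

Note that in this configuration mem_cgroup_oom() only remembers the OOM context when the task is in a user fault (current->in_user_fault); otherwise it returns OOM_SKIPPED and the charge fails.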

mm/memory.c

Lines changed: 2 additions & 2 deletions
@@ -4153,15 +4153,15 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	 * space.  Kernel faults are handled more gracefully.
 	 */
 	if (flags & FAULT_FLAG_USER)
-		mem_cgroup_oom_enable();
+		mem_cgroup_enter_user_fault();
 
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
 	else
 		ret = __handle_mm_fault(vma, address, flags);
 
 	if (flags & FAULT_FLAG_USER) {
-		mem_cgroup_oom_disable();
+		mem_cgroup_exit_user_fault();
 		/*
 		 * The task may have entered a memcg OOM situation but
 		 * if the allocation error was handled gracefully (no
