Commit aac4536

Michal Hocko authored and Linus Torvalds committed
mm, oom: introduce oom reaper
This patch (of 5): This is based on an idea from Mel Gorman discussed during LSFMM 2015 and independently brought up by Oleg Nesterov.

The OOM killer currently allows killing only a single task in the hope that the task will terminate in a reasonable time and free up its memory. Such a task (the OOM victim) gets access to memory reserves via mark_oom_victim to allow forward progress should there be a need for additional memory during the exit path.

It has been shown (e.g. by Tetsuo Handa) that it is not that hard to construct workloads which break this core assumption: the OOM victim might take an unbounded amount of time to exit because it might be blocked in the uninterruptible state waiting for an event (e.g. a lock) held by another task looping in the page allocator.

This patch reduces the probability of such a lockup by introducing a specialized kernel thread (oom_reaper) which tries to reclaim additional memory by preemptively reaping the anonymous or swapped out memory owned by the OOM victim, under the assumption that such memory won't be needed when its owner is killed and kicked out of userspace anyway. There is one notable exception, though: if the OOM victim was in the process of coredumping, the result would be incomplete. This is considered a reasonable constraint because the overall system health is more important than the debuggability of a particular application.

A kernel thread has been chosen because we need a reliable way of invocation: workqueue context is not appropriate because all the workers might be busy (e.g. allocating memory), and kswapd, which sounds like another good fit, is not appropriate either because it might get blocked on locks during reclaim as well.

oom_reaper has to take mmap_sem on the target task for reading, so the solution is not 100% reliable because the semaphore might be held or blocked for write, but the probability is reduced considerably compared to basically any lock blocking forward progress as described above. In order to prevent blocking on the lock without any forward progress, we use only a trylock and retry 10 times with a short sleep in between. Users of mmap_sem which need it for write should be carefully reviewed to use _killable waiting as much as possible, and allocation requests done with the lock held should be reduced to the absolute minimum, to lower the risk even further.

The API between the OOM killer and the OOM reaper is quite trivial: wake_oom_reaper updates mm_to_reap with cmpxchg to guarantee only the NULL->mm transition, and oom_reaper clears it atomically once it is done with the work. This means that only a single mm_struct can be reaped at a time. As the operation is potentially disruptive, we try to limit it to the necessary minimum, and the reaper blocks any updates while it operates on an mm. The mm_struct is pinned via mm_count to allow a parallel exit_mmap, and a race with it is detected by atomic_inc_not_zero(mm_users).

Signed-off-by: Michal Hocko <[email protected]>
Suggested-by: Oleg Nesterov <[email protected]>
Suggested-by: Mel Gorman <[email protected]>
Acked-by: Mel Gorman <[email protected]>
Acked-by: David Rientjes <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Tetsuo Handa <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Andrea Argangeli <[email protected]>
Cc: Rik van Riel <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
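To make the mm_to_reap handoff concrete, here is a minimal userspace sketch of the same single-slot scheme, using C11 atomics in place of the kernel's cmpxchg. This is an illustration only, not the kernel code in the diff below; the names try_queue and consume are hypothetical stand-ins for wake_oom_reaper and one iteration of the oom_reaper loop.

/*
 * Single-slot handoff sketch: producers may only install an item via a
 * NULL -> item transition, so at most one item is ever queued. The
 * (single) consumer clears the slot once it is done, which is what
 * makes the next NULL -> item transition possible.
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic(void *) slot;	/* plays the role of mm_to_reap */

/* analogous to wake_oom_reaper(): succeeds only if the slot is free */
static int try_queue(void *item)
{
	void *expected = NULL;

	/* cmpxchg: only the NULL -> item transition can succeed */
	return atomic_compare_exchange_strong(&slot, &expected, item);
}

/* analogous to one oom_reaper() iteration: process, then clear the slot */
static void *consume(void)
{
	void *item = atomic_load(&slot);

	if (item)
		atomic_store(&slot, NULL);	/* slot is free again */
	return item;
}

int main(void)
{
	int a, b;

	printf("queue a: %d\n", try_queue(&a));	/* 1: slot was NULL */
	printf("queue b: %d\n", try_queue(&b));	/* 0: slot occupied */
	consume();				/* reap and clear */
	printf("queue b: %d\n", try_queue(&b));	/* 1: slot free again */
	return 0;
}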
Parent: 69b27ba

4 files changed, 162 insertions(+), 13 deletions(-)

include/linux/mm.h

Lines changed: 2 additions & 0 deletions
@@ -1132,6 +1132,8 @@ struct zap_details {
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t last_index;			/* Highest page->index to unmap */
+	bool ignore_dirty;			/* Ignore dirty pages */
+	bool check_swap_entries;		/* Check also swap entries */
 };

 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,

mm/internal.h

Lines changed: 5 additions & 0 deletions
@@ -38,6 +38,11 @@
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);

+void unmap_page_range(struct mmu_gather *tlb,
+			struct vm_area_struct *vma,
+			unsigned long addr, unsigned long end,
+			struct zap_details *details);
+
 extern int __do_page_cache_readahead(struct address_space *mapping,
 		struct file *filp, pgoff_t offset, unsigned long nr_to_read,
 		unsigned long lookahead_size);

mm/memory.c

Lines changed: 10 additions & 7 deletions
@@ -1102,6 +1102,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,

 			if (!PageAnon(page)) {
 				if (pte_dirty(ptent)) {
+					/*
+					 * oom_reaper cannot tear down dirty
+					 * pages
+					 */
+					if (unlikely(details && details->ignore_dirty))
+						continue;
 					force_flush = 1;
 					set_page_dirty(page);
 				}
@@ -1120,8 +1126,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			}
 			continue;
 		}
-		/* If details->check_mapping, we leave swap entries. */
-		if (unlikely(details))
+		/* only check swap_entries if explicitly asked for in details */
+		if (unlikely(details && !details->check_swap_entries))
 			continue;

 		entry = pte_to_swp_entry(ptent);
@@ -1226,17 +1232,14 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
 	return addr;
 }

-static void unmap_page_range(struct mmu_gather *tlb,
+void unmap_page_range(struct mmu_gather *tlb,
 			     struct vm_area_struct *vma,
 			     unsigned long addr, unsigned long end,
 			     struct zap_details *details)
 {
 	pgd_t *pgd;
 	unsigned long next;

-	if (details && !details->check_mapping)
-		details = NULL;
-
 	BUG_ON(addr >= end);
 	tlb_start_vma(tlb, vma);
 	pgd = pgd_offset(vma->vm_mm, addr);
@@ -2432,7 +2435,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
 void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows)
 {
-	struct zap_details details;
+	struct zap_details details = { };
 	pgoff_t hba = holebegin >> PAGE_SHIFT;
 	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
mm/oom_kill.c

Lines changed: 145 additions & 6 deletions
@@ -35,6 +35,11 @@
 #include <linux/freezer.h>
 #include <linux/ftrace.h>
 #include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+
+#include <asm/tlb.h>
+#include "internal.h"

 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
@@ -405,6 +410,133 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);

 bool oom_killer_disabled __read_mostly;

+#ifdef CONFIG_MMU
+/*
+ * OOM Reaper kernel thread which tries to reap the memory used by the OOM
+ * victim (if that is possible) to help the OOM killer to move on.
+ */
+static struct task_struct *oom_reaper_th;
+static struct mm_struct *mm_to_reap;
+static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
+
+static bool __oom_reap_vmas(struct mm_struct *mm)
+{
+	struct mmu_gather tlb;
+	struct vm_area_struct *vma;
+	struct zap_details details = {.check_swap_entries = true,
+				      .ignore_dirty = true};
+	bool ret = true;
+
+	/* We might have raced with exit path */
+	if (!atomic_inc_not_zero(&mm->mm_users))
+		return true;
+
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		ret = false;
+		goto out;
+	}
+
+	tlb_gather_mmu(&tlb, mm, 0, -1);
+	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+		if (is_vm_hugetlb_page(vma))
+			continue;
+
+		/*
+		 * mlocked VMAs require explicit munlocking before unmap.
+		 * Let's keep it simple here and skip such VMAs.
+		 */
+		if (vma->vm_flags & VM_LOCKED)
+			continue;
+
+		/*
+		 * Only anonymous pages have a good chance to be dropped
+		 * without additional steps which we cannot afford as we
+		 * are OOM already.
+		 *
+		 * We do not even care about fs backed pages because all
+		 * which are reclaimable have already been reclaimed and
+		 * we do not want to block exit_mmap by keeping mm ref
+		 * count elevated without a good reason.
+		 */
+		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
+			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
+					 &details);
+	}
+	tlb_finish_mmu(&tlb, 0, -1);
+	up_read(&mm->mmap_sem);
+out:
+	mmput(mm);
+	return ret;
+}
+
+static void oom_reap_vmas(struct mm_struct *mm)
+{
+	int attempts = 0;
+
+	/* Retry the down_read_trylock(mmap_sem) a few times */
+	while (attempts++ < 10 && !__oom_reap_vmas(mm))
+		schedule_timeout_idle(HZ/10);
+
+	/* Drop a reference taken by wake_oom_reaper */
+	mmdrop(mm);
+}
+
+static int oom_reaper(void *unused)
+{
+	while (true) {
+		struct mm_struct *mm;
+
+		wait_event_freezable(oom_reaper_wait,
+				     (mm = READ_ONCE(mm_to_reap)));
+		oom_reap_vmas(mm);
+		WRITE_ONCE(mm_to_reap, NULL);
+	}
+
+	return 0;
+}
+
+static void wake_oom_reaper(struct mm_struct *mm)
+{
+	struct mm_struct *old_mm;
+
+	if (!oom_reaper_th)
+		return;
+
+	/*
+	 * Pin the given mm. Use mm_count instead of mm_users because
+	 * we do not want to delay the address space tear down.
+	 */
+	atomic_inc(&mm->mm_count);
+
+	/*
+	 * Make sure that only a single mm is ever queued for the reaper
+	 * because multiple are not necessary and the operation might be
+	 * disruptive so better reduce it to the bare minimum.
+	 */
+	old_mm = cmpxchg(&mm_to_reap, NULL, mm);
+	if (!old_mm)
+		wake_up(&oom_reaper_wait);
+	else
+		mmdrop(mm);
+}
+
+static int __init oom_init(void)
+{
+	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+	if (IS_ERR(oom_reaper_th)) {
+		pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
+				PTR_ERR(oom_reaper_th));
+		oom_reaper_th = NULL;
+	}
+	return 0;
+}
+subsys_initcall(oom_init)
+#else
+static void wake_oom_reaper(struct mm_struct *mm)
+{
+}
+#endif
+
 /**
  * mark_oom_victim - mark the given task as OOM victim
  * @tsk: task to mark
@@ -510,6 +642,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 	unsigned int victim_points = 0;
 	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 					      DEFAULT_RATELIMIT_BURST);
+	bool can_oom_reap = true;

 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -600,17 +733,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 			continue;
 		if (same_thread_group(p, victim))
 			continue;
-		if (unlikely(p->flags & PF_KTHREAD))
-			continue;
-		if (is_global_init(p))
-			continue;
-		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
+		    p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+			/*
+			 * We cannot use oom_reaper for the mm shared by this
+			 * process because it wouldn't get killed and so the
+			 * memory might be still used.
+			 */
+			can_oom_reap = false;
 			continue;
-
+		}
 		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
 	}
 	rcu_read_unlock();

+	if (can_oom_reap)
+		wake_oom_reaper(mm);
+
 	mmdrop(mm);
 	put_task_struct(victim);
 }
