@@ -35,6 +35,11 @@
 #include <linux/freezer.h>
 #include <linux/ftrace.h>
 #include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+
+#include <asm/tlb.h>
+#include "internal.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
@@ -405,6 +410,133 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
 
 bool oom_killer_disabled __read_mostly;
 
+#ifdef CONFIG_MMU
+/*
+ * OOM Reaper kernel thread which tries to reap the memory used by the OOM
+ * victim (if that is possible) to help the OOM killer to move on.
+ */
+static struct task_struct *oom_reaper_th;
+static struct mm_struct *mm_to_reap;
+static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
+
+static bool __oom_reap_vmas(struct mm_struct *mm)
+{
+	struct mmu_gather tlb;
+	struct vm_area_struct *vma;
+	struct zap_details details = {.check_swap_entries = true,
+				      .ignore_dirty = true};
+	bool ret = true;
+
+	/* We might have raced with exit path */
+	if (!atomic_inc_not_zero(&mm->mm_users))
+		return true;
+
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		ret = false;
+		goto out;
+	}
+
+	tlb_gather_mmu(&tlb, mm, 0, -1);
+	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+		if (is_vm_hugetlb_page(vma))
+			continue;
+
+		/*
+		 * mlocked VMAs require explicit munlocking before unmap.
+		 * Let's keep it simple here and skip such VMAs.
+		 */
+		if (vma->vm_flags & VM_LOCKED)
+			continue;
+
+		/*
+		 * Only anonymous pages have a good chance to be dropped
+		 * without additional steps which we cannot afford as we
+		 * are OOM already.
+		 *
+		 * We do not even care about fs backed pages because all
+		 * which are reclaimable have already been reclaimed and
+		 * we do not want to block exit_mmap by keeping mm ref
+		 * count elevated without a good reason.
+		 */
+		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
+			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
+					 &details);
+	}
+	tlb_finish_mmu(&tlb, 0, -1);
+	up_read(&mm->mmap_sem);
+out:
+	mmput(mm);
+	return ret;
+}
+
+static void oom_reap_vmas(struct mm_struct *mm)
+{
+	int attempts = 0;
+
+	/* Retry the down_read_trylock(mmap_sem) a few times */
+	while (attempts++ < 10 && !__oom_reap_vmas(mm))
+		schedule_timeout_idle(HZ/10);
+
+	/* Drop a reference taken by wake_oom_reaper */
+	mmdrop(mm);
+}
+
+static int oom_reaper(void *unused)
+{
+	while (true) {
+		struct mm_struct *mm;
+
+		wait_event_freezable(oom_reaper_wait,
+				     (mm = READ_ONCE(mm_to_reap)));
+		oom_reap_vmas(mm);
+		WRITE_ONCE(mm_to_reap, NULL);
+	}
+
+	return 0;
+}
+
+static void wake_oom_reaper(struct mm_struct *mm)
+{
+	struct mm_struct *old_mm;
+
+	if (!oom_reaper_th)
+		return;
+
+	/*
+	 * Pin the given mm. Use mm_count instead of mm_users because
+	 * we do not want to delay the address space tear down.
+	 */
+	atomic_inc(&mm->mm_count);
+
+	/*
+	 * Make sure that only a single mm is ever queued for the reaper
+	 * because multiple are not necessary and the operation might be
+	 * disruptive so better reduce it to the bare minimum.
+	 */
+	old_mm = cmpxchg(&mm_to_reap, NULL, mm);
+	if (!old_mm)
+		wake_up(&oom_reaper_wait);
+	else
+		mmdrop(mm);
+}
+
+static int __init oom_init(void)
+{
+	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+	if (IS_ERR(oom_reaper_th)) {
+		pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
+		       PTR_ERR(oom_reaper_th));
+		oom_reaper_th = NULL;
+	}
+	return 0;
+}
+subsys_initcall(oom_init)
+#else
+static void wake_oom_reaper(struct mm_struct *mm)
+{
+}
+#endif
+
 /**
  * mark_oom_victim - mark the given task as OOM victim
  * @tsk: task to mark
@@ -510,6 +642,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 	unsigned int victim_points = 0;
 	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
+	bool can_oom_reap = true;
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -600,17 +733,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 			continue;
 		if (same_thread_group(p, victim))
 			continue;
-		if (unlikely(p->flags & PF_KTHREAD))
-			continue;
-		if (is_global_init(p))
-			continue;
-		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
+		    p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+			/*
+			 * We cannot use oom_reaper for the mm shared by this
+			 * process because it wouldn't get killed and so the
+			 * memory might be still used.
+			 */
+			can_oom_reap = false;
 			continue;
-
+		}
 		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
 	}
 	rcu_read_unlock();
 
+	if (can_oom_reap)
+		wake_oom_reaper(mm);
+
 	mmdrop(mm);
 	put_task_struct(victim);
 }
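
Two patterns in this patch are worth pulling out. First, wake_oom_reaper() and oom_reaper() hand work off through a single shared slot (mm_to_reap): producers race with cmpxchg(), only the winner wakes the kthread, and the loser drops its pinning reference. Below is a minimal user-space sketch of that handoff, assuming C11 atomics and a pthread condition variable stand in for the kernel's cmpxchg() and waitqueue; all names here (slot, queue_item, consumer) are illustrative, not kernel API.

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <unistd.h>

	static _Atomic(void *) slot;	/* plays the role of mm_to_reap */
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;

	/* Producer side, analogous to wake_oom_reaper(). */
	static void queue_item(void *item)
	{
		void *expected = NULL;

		/* Only one item may be in flight; the cmpxchg loser backs off. */
		if (atomic_compare_exchange_strong(&slot, &expected, item)) {
			pthread_mutex_lock(&lock);
			pthread_cond_signal(&wake);	/* wake_up(&oom_reaper_wait) */
			pthread_mutex_unlock(&lock);
		}
		/* else: the kernel drops its mm_count pin here (mmdrop) */
	}

	/* Consumer loop, analogous to oom_reaper(). */
	static void *consumer(void *unused)
	{
		(void)unused;
		for (;;) {
			void *item;

			pthread_mutex_lock(&lock);
			while (!(item = atomic_load(&slot)))
				pthread_cond_wait(&wake, &lock);	/* ~wait_event_freezable */
			pthread_mutex_unlock(&lock);

			printf("reaping %p\n", item);	/* stands in for oom_reap_vmas() */
			atomic_store(&slot, NULL);	/* WRITE_ONCE(mm_to_reap, NULL) */
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t th;
		static int victim;	/* stands in for an mm_struct */

		pthread_create(&th, NULL, consumer, NULL);
		queue_item(&victim);
		queue_item(&victim);	/* may lose the cmpxchg if the slot is still full */
		sleep(1);		/* let the consumer drain the slot, then exit */
		return 0;
	}

The single slot is a deliberate throttle: reaping is disruptive, so at most one mm is ever in flight, and a losing producer simply releases its extra mm_count reference.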
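
Second, oom_reap_vmas() uses down_read_trylock() in a bounded retry loop rather than sleeping on mmap_sem, because the OOM victim may hold the lock and never release it. A hedged sketch of the same back-off pattern, assuming a pthreads rwlock in place of mmap_sem (try_reap and reap_with_retries are hypothetical names):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <time.h>

	/* Try once; never block on a lock the stuck victim may hold. */
	static bool try_reap(pthread_rwlock_t *sem)
	{
		if (pthread_rwlock_tryrdlock(sem) != 0)
			return false;	/* still write-held; back off and retry */
		/* ... walk and unmap the address space here ... */
		pthread_rwlock_unlock(sem);
		return true;
	}

	static void reap_with_retries(pthread_rwlock_t *sem)
	{
		/* 100 ms, the analogue of schedule_timeout_idle(HZ/10) */
		struct timespec backoff = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
		int attempts = 0;

		while (attempts++ < 10 && !try_reap(sem))
			nanosleep(&backoff, NULL);
	}

	int main(void)
	{
		pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

		reap_with_retries(&mmap_sem);	/* uncontended: succeeds on the first try */
		puts("reaped");
		return 0;
	}

Giving up after ten attempts mirrors the patch's assumption: if mmap_sem is still held a second later, the holder is probably blocked for good and the reaper should stop pinning the mm.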