
Commit 1a4e58c

minchank authored and torvalds committed
mm: introduce MADV_PAGEOUT
When a process expects no accesses to a certain memory range for a long
time, it can hint the kernel that the pages in that range can be
reclaimed instantly while their data is preserved for future use. This
reduces working-set eviction and thus ends up improving performance.

This patch introduces the new MADV_PAGEOUT hint to the madvise(2)
syscall. MADV_PAGEOUT can be used by a process to mark a memory range
as not expected to be used for a long time, so that the kernel reclaims
*any LRU* pages in the range instantly. The hint helps the kernel
decide which pages to evict proactively.

A note: this intentionally does not apply the SWAP_CLUSTER_MAX limit on
LRU page isolation, because the batch is already bounded by the PMD
size. If the PMD size (e.g., 256 pages) causes trouble, we could fix it
later by limiting the batch to SWAP_CLUSTER_MAX [1].

- man-page material

MADV_PAGEOUT (since Linux x.x)

    Do not expect access in the near future, so that pages in the
    specified regions can be reclaimed instantly regardless of memory
    pressure. Consequently, an access in the range after a successful
    operation may incur a major page fault, but, unlike MADV_DONTNEED,
    the up-to-date contents are never lost. Pages belonging to a shared
    mapping are only processed if write access is allowed for the
    calling process. MADV_PAGEOUT cannot be applied to locked pages,
    Huge TLB pages, or VM_PFNMAP pages.

[1] https://lore.kernel.org/lkml/[email protected]/

[[email protected]: clear PG_active on MADV_PAGEOUT]
  Link: http://lkml.kernel.org/r/[email protected]
[[email protected]: resolve conflicts with hmm.git]
  Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Minchan Kim <[email protected]>
Reported-by: kbuild test robot <[email protected]>
Acked-by: Michal Hocko <[email protected]>
Cc: James E.J. Bottomley <[email protected]>
Cc: Richard Henderson <[email protected]>
Cc: Ralf Baechle <[email protected]>
Cc: Chris Zankel <[email protected]>
Cc: Daniel Colascione <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Hillf Danton <[email protected]>
Cc: Joel Fernandes (Google) <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Cc: Oleksandr Natalenko <[email protected]>
Cc: Shakeel Butt <[email protected]>
Cc: Sonny Rao <[email protected]>
Cc: Suren Baghdasaryan <[email protected]>
Cc: Tim Murray <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent: 8940b34 · commit 1a4e58c
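As a quick illustration of the intended use described in the man-page material above (not part of this commit): a minimal userspace sketch that populates a buffer, hints it out with MADV_PAGEOUT, then reads it back intact at the cost of a possible major fault. The fallback #define is an assumption for builds against older uapi headers that lack the constant; the value 21 comes from the headers added by this commit.

/* Hypothetical usage sketch for MADV_PAGEOUT; not part of this commit. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21	/* value from the uapi headers in this commit */
#endif

int main(void)
{
	size_t len = 64 * 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0xaa, len);	/* populate: pages now sit on the LRU */

	/*
	 * Hint: no access expected soon; reclaim these pages now.
	 * Unlike MADV_DONTNEED, the contents are preserved (swapped out
	 * or written back), so the read below may major-fault but still
	 * sees 0xaa. Kernels without this commit return EINVAL for the
	 * unknown advice value.
	 */
	if (madvise(buf, len, MADV_PAGEOUT))
		perror("madvise(MADV_PAGEOUT)");

	printf("first byte after pageout: 0x%x\n", (unsigned char)buf[0]);
	munmap(buf, len);
	return 0;
}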

8 files changed: +251 −0 lines changed


arch/alpha/include/uapi/asm/mman.h

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@
 #define MADV_KEEPONFORK 19	/* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE	0

arch/mips/include/uapi/asm/mman.h

Lines changed: 1 addition & 0 deletions
@@ -96,6 +96,7 @@
 #define MADV_KEEPONFORK 19	/* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE	0

arch/parisc/include/uapi/asm/mman.h

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@
 #define MADV_DOFORK	11	/* do inherit across fork */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 #define MADV_MERGEABLE	65	/* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 66	/* KSM may not merge identical pages */

arch/xtensa/include/uapi/asm/mman.h

Lines changed: 1 addition & 0 deletions
@@ -104,6 +104,7 @@
 #define MADV_KEEPONFORK 19	/* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE	0

include/linux/swap.h

Lines changed: 1 addition & 0 deletions
@@ -365,6 +365,7 @@ extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
+extern unsigned long reclaim_pages(struct list_head *page_list);
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;

include/uapi/asm-generic/mman-common.h

Lines changed: 1 addition & 0 deletions
@@ -68,6 +68,7 @@
 #define MADV_KEEPONFORK 19	/* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE	0

mm/madvise.c

Lines changed: 189 additions & 0 deletions
@@ -44,6 +44,7 @@ static int madvise_need_mmap_write(int behavior)
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_COLD:
+	case MADV_PAGEOUT:
 	case MADV_FREE:
 		return 0;
 	default:
@@ -461,6 +462,191 @@ static long madvise_cold(struct vm_area_struct *vma,
 	return 0;
 }
 
+static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, struct mm_walk *walk)
+{
+	struct mmu_gather *tlb = walk->private;
+	struct mm_struct *mm = tlb->mm;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *orig_pte, *pte, ptent;
+	spinlock_t *ptl;
+	LIST_HEAD(page_list);
+	struct page *page;
+
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (pmd_trans_huge(*pmd)) {
+		pmd_t orig_pmd;
+		unsigned long next = pmd_addr_end(addr, end);
+
+		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+		ptl = pmd_trans_huge_lock(pmd, vma);
+		if (!ptl)
+			return 0;
+
+		orig_pmd = *pmd;
+		if (is_huge_zero_pmd(orig_pmd))
+			goto huge_unlock;
+
+		if (unlikely(!pmd_present(orig_pmd))) {
+			VM_BUG_ON(thp_migration_supported() &&
+					!is_pmd_migration_entry(orig_pmd));
+			goto huge_unlock;
+		}
+
+		page = pmd_page(orig_pmd);
+		if (next - addr != HPAGE_PMD_SIZE) {
+			int err;
+
+			if (page_mapcount(page) != 1)
+				goto huge_unlock;
+			get_page(page);
+			spin_unlock(ptl);
+			lock_page(page);
+			err = split_huge_page(page);
+			unlock_page(page);
+			put_page(page);
+			if (!err)
+				goto regular_page;
+			return 0;
+		}
+
+		if (pmd_young(orig_pmd)) {
+			pmdp_invalidate(vma, addr, pmd);
+			orig_pmd = pmd_mkold(orig_pmd);
+
+			set_pmd_at(mm, addr, pmd, orig_pmd);
+			tlb_remove_tlb_entry(tlb, pmd, addr);
+		}
+
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+
+		if (!isolate_lru_page(page))
+			list_add(&page->lru, &page_list);
+huge_unlock:
+		spin_unlock(ptl);
+		reclaim_pages(&page_list);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+regular_page:
+#endif
+	tlb_change_page_size(tlb, PAGE_SIZE);
+	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	flush_tlb_batched_pending(mm);
+	arch_enter_lazy_mmu_mode();
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		/*
+		 * Creating a THP page is expensive so split it only if we
+		 * are sure it's worth. Split it if we are only owner.
+		 */
+		if (PageTransCompound(page)) {
+			if (page_mapcount(page) != 1)
+				break;
+			get_page(page);
+			if (!trylock_page(page)) {
+				put_page(page);
+				break;
+			}
+			pte_unmap_unlock(orig_pte, ptl);
+			if (split_huge_page(page)) {
+				unlock_page(page);
+				put_page(page);
+				pte_offset_map_lock(mm, pmd, addr, &ptl);
+				break;
+			}
+			unlock_page(page);
+			put_page(page);
+			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+			pte--;
+			addr -= PAGE_SIZE;
+			continue;
+		}
+
+		VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+		if (pte_young(ptent)) {
+			ptent = ptep_get_and_clear_full(mm, addr, pte,
+							tlb->fullmm);
+			ptent = pte_mkold(ptent);
+			set_pte_at(mm, addr, pte, ptent);
+			tlb_remove_tlb_entry(tlb, pte, addr);
+		}
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+
+		if (!isolate_lru_page(page))
+			list_add(&page->lru, &page_list);
+	}
+
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(orig_pte, ptl);
+	reclaim_pages(&page_list);
+	cond_resched();
+
+	return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+			     struct vm_area_struct *vma,
+			     unsigned long addr, unsigned long end)
+{
+	tlb_start_vma(tlb, vma);
+	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, NULL);
+	tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_pageout(struct vm_area_struct *vma)
+{
+	if (vma_is_anonymous(vma))
+		return true;
+	if (!vma->vm_file)
+		return false;
+	/*
+	 * paging out pagecache only for non-anonymous mappings that correspond
+	 * to the files the calling process could (if tried) open for writing;
+	 * otherwise we'd be including shared non-exclusive mappings, which
+	 * opens a side channel.
+	 */
+	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
+
+	*prev = vma;
+	if (!can_madv_lru_vma(vma))
+		return -EINVAL;
+
+	if (!can_do_pageout(vma))
+		return 0;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+	tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+	return 0;
+}
+
 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
@@ -843,6 +1029,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_COLD:
 		return madvise_cold(vma, prev, start, end);
+	case MADV_PAGEOUT:
+		return madvise_pageout(vma, prev, start, end);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -865,6 +1053,7 @@ madvise_behavior_valid(int behavior)
 	case MADV_DONTNEED:
 	case MADV_FREE:
 	case MADV_COLD:
+	case MADV_PAGEOUT:
 #ifdef CONFIG_KSM
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:

mm/vmscan.c

Lines changed: 56 additions & 0 deletions
@@ -2145,6 +2145,62 @@ static void shrink_active_list(unsigned long nr_to_scan,
 			nr_deactivate, nr_rotated, sc->priority, file);
 }
 
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+	int nid = -1;
+	unsigned long nr_reclaimed = 0;
+	LIST_HEAD(node_page_list);
+	struct reclaim_stat dummy_stat;
+	struct page *page;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+	};
+
+	while (!list_empty(page_list)) {
+		page = lru_to_page(page_list);
+		if (nid == -1) {
+			nid = page_to_nid(page);
+			INIT_LIST_HEAD(&node_page_list);
+		}
+
+		if (nid == page_to_nid(page)) {
+			ClearPageActive(page);
+			list_move(&page->lru, &node_page_list);
+			continue;
+		}
+
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			page = lru_to_page(&node_page_list);
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+
+		nid = -1;
+	}
+
+	if (!list_empty(&node_page_list)) {
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			page = lru_to_page(&node_page_list);
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+	}
+
+	return nr_reclaimed;
+}
+
 /*
  * The inactive anon list should be small enough that the VM never has
  * to do too much work.
