
Commit f8d9377

davidhildenbrand authored and akpm00 committed
mm/memory: optimize fork() with PTE-mapped THP
Let's implement PTE batching when consecutive (present) PTEs map consecutive pages of the same large folio, and all other PTE bits besides the PFNs are equal. We will optimize folio_pte_batch() separately, to ignore selected PTE bits. This patch is based on work by Ryan Roberts.

Use __always_inline for __copy_present_ptes() and keep the handling for single PTEs completely separate from the multi-PTE case: we really want the compiler to optimize for the single-PTE case with small folios, to not degrade performance.

Note that PTE batching will never exceed a single page table and will always stay within VMA boundaries.

Further, processing PTE-mapped THP that may be pinned and have PageAnonExclusive set on at least one subpage should work as expected, but there is room for improvement: we will repeatedly (1) detect a PTE batch, (2) detect that we have to copy a page, and (3) fall back and allocate a single page to copy a single page. For now we won't care, as pinned pages are a corner case, and we should rather look into maintaining only a single PageAnonExclusive bit for large folios.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: David Hildenbrand <[email protected]>
Reviewed-by: Ryan Roberts <[email protected]>
Reviewed-by: Mike Rapoport (IBM) <[email protected]>
Cc: Albert Ou <[email protected]>
Cc: Alexander Gordeev <[email protected]>
Cc: Alexandre Ghiti <[email protected]>
Cc: Aneesh Kumar K.V <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: Christian Borntraeger <[email protected]>
Cc: Christophe Leroy <[email protected]>
Cc: David S. Miller <[email protected]>
Cc: Dinh Nguyen <[email protected]>
Cc: Gerald Schaefer <[email protected]>
Cc: Heiko Carstens <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Michael Ellerman <[email protected]>
Cc: Naveen N. Rao <[email protected]>
Cc: Nicholas Piggin <[email protected]>
Cc: Palmer Dabbelt <[email protected]>
Cc: Paul Walmsley <[email protected]>
Cc: Russell King (Oracle) <[email protected]>
Cc: Sven Schnelle <[email protected]>
Cc: Vasily Gorbik <[email protected]>
Cc: Will Deacon <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
1 parent 5372329 commit f8d9377
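
To make the batching criterion concrete (consecutive present PTEs, consecutive PFNs within one folio, identical non-PFN bits), here is a small standalone userspace model of the check. Everything in it, the packed "pte" encoding and the model_* helpers, is a simplified stand-in for illustration only and is not the kernel's folio_pte_batch() shown in the diff below.

/*
 * Standalone userspace model of the PTE-batch check (illustration only;
 * the encoding and helpers are simplified stand-ins, not kernel code).
 * A "pte" here is a PFN in the high bits plus flag bits in the low bits.
 */
#include <stdio.h>
#include <stdint.h>

#define MODEL_PFN_SHIFT		12
#define MODEL_FLAGS_MASK	((1ull << MODEL_PFN_SHIFT) - 1)

static uint64_t model_pte(uint64_t pfn, uint64_t flags)
{
	return (pfn << MODEL_PFN_SHIFT) | (flags & MODEL_FLAGS_MASK);
}

/* Like pte_next_pfn(): bump the PFN by one, keep all other bits. */
static uint64_t model_pte_next_pfn(uint64_t pte)
{
	return pte + (1ull << MODEL_PFN_SHIFT);
}

/*
 * Count how many consecutive entries, starting at ptep[0], form a batch:
 * each next entry must equal the expected "next PFN, same flags" value,
 * and the batch must not run past folio_end_pfn (first PFN after the folio).
 */
static int model_pte_batch(const uint64_t *ptep, int max_nr,
			   uint64_t folio_end_pfn)
{
	uint64_t expected = model_pte_next_pfn(ptep[0]);
	int nr = 1;

	while (nr < max_nr) {
		if (ptep[nr] != expected)
			break;
		if ((ptep[nr] >> MODEL_PFN_SHIFT) == folio_end_pfn)
			break;
		expected = model_pte_next_pfn(expected);
		nr++;
	}
	return nr;
}

int main(void)
{
	/* Four consecutive pages of one folio, then an entry whose flags differ. */
	uint64_t ptes[5] = {
		model_pte(100, 0x3), model_pte(101, 0x3),
		model_pte(102, 0x3), model_pte(103, 0x3),
		model_pte(104, 0x1),	/* non-PFN bits differ: batch stops here */
	};

	/* Folio covers PFNs 100..107, so folio_end_pfn is 108. */
	printf("batch length: %d\n", model_pte_batch(ptes, 5, 108));	/* prints 4 */
	return 0;
}

The kernel version additionally checks pte_present() and, as the commit message notes, is later extended to ignore selected PTE bits.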

File tree

2 files changed (+124, -19 lines)


include/linux/pgtable.h

Lines changed: 31 additions & 0 deletions
@@ -650,6 +650,37 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 }
 #endif
 
+#ifndef wrprotect_ptes
+/**
+ * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
+ *		    folio.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to write-protect.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_set_wrprotect().
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
+		pte_t *ptep, unsigned int nr)
+{
+	for (;;) {
+		ptep_set_wrprotect(mm, addr, ptep);
+		if (--nr == 0)
+			break;
+		ptep++;
+		addr += PAGE_SIZE;
+	}
+}
+#endif
+
 /*
  * On some architectures hardware does not set page access bit when accessing
  * memory page, it is responsibility of software setting this bit. It brings
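
The #ifndef guard makes the loop above a generic fallback only: an architecture that can write-protect a contiguous run of PTEs more cheaply can provide its own wrprotect_ptes() and define the macro so the fallback is not compiled. A hypothetical sketch of such an override follows; my_arch_wrprotect_range() is an invented placeholder, only the override pattern itself mirrors the guard above.

/* Hypothetical arch header sketch: supply an optimized range write-protect
 * and define wrprotect_ptes so the generic fallback is skipped. */
static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
		pte_t *ptep, unsigned int nr)
{
	my_arch_wrprotect_range(mm, addr, ptep, nr);	/* invented helper */
}
#define wrprotect_ptes wrprotect_ptes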

mm/memory.c

Lines changed: 93 additions & 19 deletions
@@ -930,15 +930,15 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
 	return 0;
 }
 
-static inline void __copy_present_pte(struct vm_area_struct *dst_vma,
+static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
 		struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte,
-		pte_t pte, unsigned long addr)
+		pte_t pte, unsigned long addr, int nr)
 {
 	struct mm_struct *src_mm = src_vma->vm_mm;
 
 	/* If it's a COW mapping, write protect it both processes. */
 	if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) {
-		ptep_set_wrprotect(src_mm, addr, src_pte);
+		wrprotect_ptes(src_mm, addr, src_pte, nr);
 		pte = pte_wrprotect(pte);
 	}
 
@@ -950,26 +950,93 @@ static inline void __copy_present_pte(struct vm_area_struct *dst_vma,
 	if (!userfaultfd_wp(dst_vma))
 		pte = pte_clear_uffd_wp(pte);
 
-	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
+	set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
+}
+
+/*
+ * Detect a PTE batch: consecutive (present) PTEs that map consecutive
+ * pages of the same folio.
+ *
+ * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN.
+ */
+static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
+		pte_t *start_ptep, pte_t pte, int max_nr)
+{
+	unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
+	const pte_t *end_ptep = start_ptep + max_nr;
+	pte_t expected_pte = pte_next_pfn(pte);
+	pte_t *ptep = start_ptep + 1;
+
+	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
+
+	while (ptep != end_ptep) {
+		pte = ptep_get(ptep);
+
+		if (!pte_same(pte, expected_pte))
+			break;
+
+		/*
+		 * Stop immediately once we reached the end of the folio. In
+		 * corner cases the next PFN might fall into a different
+		 * folio.
+		 */
+		if (pte_pfn(pte) == folio_end_pfn)
+			break;
+
+		expected_pte = pte_next_pfn(expected_pte);
+		ptep++;
+	}
+
+	return ptep - start_ptep;
 }
 
 /*
- * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
- * is required to copy this pte.
+ * Copy one present PTE, trying to batch-process subsequent PTEs that map
+ * consecutive pages of the same folio by copying them as well.
+ *
+ * Returns -EAGAIN if one preallocated page is required to copy the next PTE.
+ * Otherwise, returns the number of copied PTEs (at least 1).
  */
 static inline int
-copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		 pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
-		 int *rss, struct folio **prealloc)
+		 int max_nr, int *rss, struct folio **prealloc)
 {
 	struct page *page;
 	struct folio *folio;
+	int err, nr;
 
 	page = vm_normal_page(src_vma, addr, pte);
 	if (unlikely(!page))
 		goto copy_pte;
 
 	folio = page_folio(page);
+
+	/*
+	 * If we likely have to copy, just don't bother with batching. Make
+	 * sure that the common "small folio" case is as fast as possible
+	 * by keeping the batching logic separate.
+	 */
+	if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
+		nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr);
+		folio_ref_add(folio, nr);
+		if (folio_test_anon(folio)) {
+			if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
+								  nr, src_vma))) {
+				folio_ref_sub(folio, nr);
+				return -EAGAIN;
+			}
+			rss[MM_ANONPAGES] += nr;
+			VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
+		} else {
+			folio_dup_file_rmap_ptes(folio, page, nr);
+			rss[mm_counter_file(folio)] += nr;
+		}
+		__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
+				    addr, nr);
+		return nr;
+	}
+
 	folio_get(folio);
 	if (folio_test_anon(folio)) {
 		/*
@@ -981,8 +1048,9 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
 			/* Page may be pinned, we have to copy. */
 			folio_put(folio);
-			return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
-						 addr, rss, prealloc, page);
+			err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+						addr, rss, prealloc, page);
+			return err ? err : 1;
 		}
 		rss[MM_ANONPAGES]++;
 		VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
@@ -992,8 +1060,8 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	}
 
 copy_pte:
-	__copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, pte, addr);
-	return 0;
+	__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1);
+	return 1;
 }
 
 static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
@@ -1030,10 +1098,11 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	pte_t *src_pte, *dst_pte;
 	pte_t ptent;
 	spinlock_t *src_ptl, *dst_ptl;
-	int progress, ret = 0;
+	int progress, max_nr, ret = 0;
 	int rss[NR_MM_COUNTERS];
 	swp_entry_t entry = (swp_entry_t){0};
 	struct folio *prealloc = NULL;
+	int nr;
 
 again:
 	progress = 0;
@@ -1064,6 +1133,8 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	arch_enter_lazy_mmu_mode();
 
 	do {
+		nr = 1;
+
 		/*
 		 * We are holding two locks at this point - either of them
 		 * could generate latencies in another task on another CPU.
@@ -1102,9 +1173,10 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 			 */
 			WARN_ON_ONCE(ret != -ENOENT);
 		}
-		/* copy_present_pte() will clear `*prealloc' if consumed */
-		ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
-				       ptent, addr, rss, &prealloc);
+		/* copy_present_ptes() will clear `*prealloc' if consumed */
+		max_nr = (end - addr) / PAGE_SIZE;
+		ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
+					ptent, addr, max_nr, rss, &prealloc);
 		/*
 		 * If we need a pre-allocated page for this pte, drop the
 		 * locks, allocate, and try again.
@@ -1121,8 +1193,10 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 			folio_put(prealloc);
 			prealloc = NULL;
 		}
-		progress += 8;
-	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+		nr = ret;
+		progress += 8 * nr;
+	} while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
+		 addr != end);
 
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(orig_src_pte, src_ptl);
@@ -1143,7 +1217,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		prealloc = folio_prealloc(src_mm, src_vma, addr, false);
 		if (!prealloc)
 			return -ENOMEM;
-	} else if (ret) {
+	} else if (ret < 0) {
 		VM_WARN_ON_ONCE(1);
 	}

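As a worked example of the new loop stride in copy_pte_range() (a sketch, assuming a fully batchable PTE-mapped 2 MiB THP on 4 KiB base pages, i.e. 512 PTEs inside one page table and one VMA), the lines changed at the end of the copy loop do the following:

	/* copy_present_ptes() handled the whole THP and returned 512 */
	nr = ret;			/* nr = 512				*/
	progress += 8 * nr;		/* 8 * 512 instead of 8 per entry	*/
	dst_pte += nr;			/* both page table cursors skip 512 entries */
	src_pte += nr;
	addr += PAGE_SIZE * nr;		/* advance by 512 * 4 KiB = 2 MiB	*/

One iteration of the copy loop therefore covers what previously took 512 per-PTE iterations.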