
Commit 8d6373f

davidhildenbrand authored and gregkh committed
x86/mm/pat: Fix VM_PAT handling when fork() fails in copy_page_range()
[ Upstream commit dc84bc2 ]

If track_pfn_copy() fails, we already added the dst VMA to the maple
tree. As fork() fails, we'll cleanup the maple tree, and stumble over
the dst VMA for which we neither performed any reservation nor copied
any page tables.

Consequently untrack_pfn() will see VM_PAT and try obtaining the
PAT information from the page table -- which fails because the page
table was not copied.

The easiest fix would be to simply clear the VM_PAT flag of the dst VMA
if track_pfn_copy() fails. However, the whole thing about "simply"
clearing the VM_PAT flag is shaky as well: if we passed track_pfn_copy()
and performed a reservation, but copying the page tables fails, we'll
simply clear the VM_PAT flag, not properly undoing the reservation ...
which is also wrong.

So let's fix it properly: set the VM_PAT flag only if the reservation
succeeded (leaving it clear initially), and undo the reservation if
anything goes wrong while copying the page tables: clearing the VM_PAT
flag after undoing the reservation.

Note that any copied page table entries will get zapped when the VMA will
get removed later, after copy_page_range() succeeded; as VM_PAT is not set
then, we won't try cleaning VM_PAT up once more and untrack_pfn() will be
happy. Note that leaving these page tables in place without a reservation
is not a problem, as we are aborting fork(); this process will never run.

A reproducer can trigger this usually at the first try:

  https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/reproducers/pat_fork.c

  WARNING: CPU: 26 PID: 11650 at arch/x86/mm/pat/memtype.c:983 get_pat_info+0xf6/0x110
  Modules linked in: ...
  CPU: 26 UID: 0 PID: 11650 Comm: repro3 Not tainted 6.12.0-rc5+ #92
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014
  RIP: 0010:get_pat_info+0xf6/0x110
  ...
  Call Trace:
   <TASK>
   ...
   untrack_pfn+0x52/0x110
   unmap_single_vma+0xa6/0xe0
   unmap_vmas+0x105/0x1f0
   exit_mmap+0xf6/0x460
   __mmput+0x4b/0x120
   copy_process+0x1bf6/0x2aa0
   kernel_clone+0xab/0x440
   __do_sys_clone+0x66/0x90
   do_syscall_64+0x95/0x180

Likely this case was missed in:

  d155df5 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed")

... and instead of undoing the reservation we simply cleared the VM_PAT flag.

Keep the documentation of these functions in include/linux/pgtable.h;
one place is more than sufficient -- we should clean that up for the
other functions like track_pfn_remap/untrack_pfn separately.

Fixes: d155df5 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed")
Fixes: 2ab6403 ("x86: PAT: hooks in generic vm code to help archs to track pfnmap regions - v3")
Reported-by: xingwei lee <[email protected]>
Reported-by: yuxin wang <[email protected]>
Reported-by: Marius Fleischer <[email protected]>
Signed-off-by: David Hildenbrand <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: [email protected]
Link: https://lore.kernel.org/r/[email protected]
Closes: https://lore.kernel.org/lkml/CABOYnLx_dnqzpCW99G81DmOr+2UzdmZMk=T3uxwNxwz+R1RAwg@mail.gmail.com/
Closes: https://lore.kernel.org/lkml/CAJg=8jwijTP5fre8woS4JVJQ8iUA6v+iNcsOgtj9Zfpc3obDOQ@mail.gmail.com/
Signed-off-by: Sasha Levin <[email protected]>
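[ Editor's note: to make the ordering argument above concrete, here is a minimal userspace model of the discipline the patch describes -- not kernel code, and not the linked reproducer; all toy_* names are invented for illustration. The point is only the ordering: the flag is set only after the reservation succeeded, and a later copy failure undoes the reservation and clears the flag, so teardown never sees the flag without a matching reservation. ]

  /* Toy model of the fixed VM_PAT/reservation ordering; names are illustrative. */
  #include <stdbool.h>
  #include <stdio.h>

  struct toy_vma {
          bool vm_pat;            /* models VM_PAT */
          bool has_reservation;   /* models the PAT reservation */
  };

  static int toy_reserve(struct toy_vma *dst, bool fail)
  {
          if (fail)
                  return -1;
          dst->has_reservation = true;
          dst->vm_pat = true;     /* flag set only after the reservation succeeded */
          return 0;
  }

  static void toy_unreserve(struct toy_vma *dst)
  {
          dst->has_reservation = false;
          dst->vm_pat = false;    /* models untrack_pfn_copy() undoing both */
  }

  /* Models copy_page_range(): reserve, copy, undo the reservation on failure. */
  static int toy_copy_range(struct toy_vma *dst, const struct toy_vma *src,
                            bool reserve_fails, bool copy_fails)
  {
          int ret = 0;

          if (src->vm_pat && toy_reserve(dst, reserve_fails))
                  return -1;      /* dst->vm_pat stays clear: later cleanup is a no-op */

          if (copy_fails)
                  ret = -1;       /* stand-in for copy_p4d_range() failing */

          if (ret && src->vm_pat)
                  toy_unreserve(dst);     /* teardown never sees a stale flag */
          return ret;
  }

  int main(void)
  {
          struct toy_vma src = { .vm_pat = true, .has_reservation = true };
          struct toy_vma dst = { 0 };

          toy_copy_range(&dst, &src, false, true);
          printf("after failed copy: vm_pat=%d reservation=%d\n",
                 dst.vm_pat, dst.has_reservation);       /* prints 0 0 */
          return 0;
  }

With a simulated copy failure the program ends with both the flag and the reservation cleared, which is exactly why untrack_pfn() no longer trips over VM_PAT during the aborted fork().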
1 parent 9192062 · commit 8d6373f

4 files changed: +58, -37 lines

arch/x86/mm/pat/memtype.c

Lines changed: 28 additions & 24 deletions

@@ -984,29 +984,42 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
         return -EINVAL;
 }
 
-/*
- * track_pfn_copy is called when vma that is covering the pfnmap gets
- * copied through copy_page_range().
- *
- * If the vma has a linear pfn mapping for the entire range, we get the prot
- * from pte and reserve the entire vma range with single reserve_pfn_range call.
- */
-int track_pfn_copy(struct vm_area_struct *vma)
+int track_pfn_copy(struct vm_area_struct *dst_vma,
+                struct vm_area_struct *src_vma, unsigned long *pfn)
 {
+        const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
         resource_size_t paddr;
-        unsigned long vma_size = vma->vm_end - vma->vm_start;
         pgprot_t pgprot;
+        int rc;
 
-        if (vma->vm_flags & VM_PAT) {
-                if (get_pat_info(vma, &paddr, &pgprot))
-                        return -EINVAL;
-                /* reserve the whole chunk covered by vma. */
-                return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
-        }
+        if (!(src_vma->vm_flags & VM_PAT))
+                return 0;
+
+        /*
+         * Duplicate the PAT information for the dst VMA based on the src
+         * VMA.
+         */
+        if (get_pat_info(src_vma, &paddr, &pgprot))
+                return -EINVAL;
+        rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+        if (rc)
+                return rc;
 
+        /* Reservation for the destination VMA succeeded. */
+        vm_flags_set(dst_vma, VM_PAT);
+        *pfn = PHYS_PFN(paddr);
         return 0;
 }
 
+void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
+{
+        untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true);
+        /*
+         * Reservation was freed, any copied page tables will get cleaned
+         * up later, but without getting PAT involved again.
+         */
+}
+
 /*
  * prot is passed in as a parameter for the new mapping. If the vma has
  * a linear pfn mapping for the entire range, or no vma is provided,

@@ -1095,15 +1108,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
         }
 }
 
-/*
- * untrack_pfn_clear is called if the following situation fits:
- *
- * 1) while mremapping a pfnmap for a new region, with the old vma after
- * its pfnmap page table has been removed. The new vma has a new pfnmap
- * to the same pfn & cache type with VM_PAT set.
- * 2) while duplicating vm area, the new vma fails to copy the pgtable from
- * old vma.
- */
 void untrack_pfn_clear(struct vm_area_struct *vma)
 {
         vm_flags_clear(vma, VM_PAT);

include/linux/pgtable.h

Lines changed: 22 additions & 6 deletions

@@ -1518,14 +1518,25 @@ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
 }
 
 /*
- * track_pfn_copy is called when vma that is covering the pfnmap gets
- * copied through copy_page_range().
+ * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page
+ * tables copied during copy_page_range(). On success, stores the pfn to be
+ * passed to untrack_pfn_copy().
  */
-static inline int track_pfn_copy(struct vm_area_struct *vma)
+static inline int track_pfn_copy(struct vm_area_struct *dst_vma,
+                struct vm_area_struct *src_vma, unsigned long *pfn)
 {
         return 0;
 }
 
+/*
+ * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during
+ * copy_page_range(), but after track_pfn_copy() was already called.
+ */
+static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma,
+                unsigned long pfn)
+{
+}
+
 /*
  * untrack_pfn is called while unmapping a pfnmap for a region.
  * untrack can be called for a specific region indicated by pfn and size or

@@ -1538,8 +1549,10 @@ static inline void untrack_pfn(struct vm_area_struct *vma,
 }
 
 /*
- * untrack_pfn_clear is called while mremapping a pfnmap for a new region
- * or fails to copy pgtable during duplicate vm area.
+ * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA:
+ *
+ * 1) During mremap() on the src VMA after the page tables were moved.
+ * 2) During fork() on the dst VMA, immediately after duplicating the src VMA.
  */
 static inline void untrack_pfn_clear(struct vm_area_struct *vma)
 {

@@ -1550,7 +1563,10 @@ extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                         unsigned long size);
 extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
                              pfn_t pfn);
-extern int track_pfn_copy(struct vm_area_struct *vma);
+extern int track_pfn_copy(struct vm_area_struct *dst_vma,
+                struct vm_area_struct *src_vma, unsigned long *pfn);
+extern void untrack_pfn_copy(struct vm_area_struct *dst_vma,
+                unsigned long pfn);
 extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
                         unsigned long size, bool mm_wr_locked);
 extern void untrack_pfn_clear(struct vm_area_struct *vma);

kernel/fork.c

Lines changed: 4 additions & 0 deletions

@@ -505,6 +505,10 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
         vma_numab_state_init(new);
         dup_anon_vma_name(orig, new);
 
+        /* track_pfn_copy() will later take care of copying internal state. */
+        if (unlikely(new->vm_flags & VM_PFNMAP))
+                untrack_pfn_clear(new);
+
         return new;
 }
 
mm/memory.c

Lines changed: 4 additions & 7 deletions

@@ -1356,12 +1356,12 @@ int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 {
         pgd_t *src_pgd, *dst_pgd;
-        unsigned long next;
         unsigned long addr = src_vma->vm_start;
         unsigned long end = src_vma->vm_end;
         struct mm_struct *dst_mm = dst_vma->vm_mm;
         struct mm_struct *src_mm = src_vma->vm_mm;
         struct mmu_notifier_range range;
+        unsigned long next, pfn;
         bool is_cow;
         int ret;
 

@@ -1372,11 +1372,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
                 return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
 
         if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
-                /*
-                 * We do not free on error cases below as remove_vma
-                 * gets called on error from higher level routine
-                 */
-                ret = track_pfn_copy(src_vma);
+                ret = track_pfn_copy(dst_vma, src_vma, &pfn);
                 if (ret)
                         return ret;
         }

@@ -1413,7 +1409,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
                         continue;
                 if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
                                             addr, next))) {
-                        untrack_pfn_clear(dst_vma);
                         ret = -ENOMEM;
                         break;
                 }

@@ -1423,6 +1418,8 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
                 raw_write_seqcount_end(&src_mm->write_protect_seq);
                 mmu_notifier_invalidate_range_end(&range);
         }
+        if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
+                untrack_pfn_copy(dst_vma, pfn);
         return ret;
 }
 
