
Commit df2cc96

rppt authored and torvalds committed
userfaultfd: prevent non-cooperative events vs mcopy_atomic races
If a process monitored with userfaultfd changes its memory mappings or forks() at the same time as the uffd monitor fills the process memory with UFFDIO_COPY, the actual creation of page table entries and copying of the data in mcopy_atomic may happen either before or after the memory mapping modifications, and there is no way for the uffd monitor to maintain a consistent view of the process memory layout.

For instance, let's consider fork() running in parallel with userfaultfd_copy():

	process                         |  uffd monitor
	--------------------------------+------------------------------
	fork()                          | userfaultfd_copy()
	...                             | ...
	    dup_mmap()                  |     down_read(mmap_sem)
	    down_write(mmap_sem)        |     /* create PTEs, copy data */
	        dup_uffd()              |     up_read(mmap_sem)
	        copy_page_range()       |
	        up_write(mmap_sem)      |
	        dup_uffd_complete()     |
	            /* notify monitor */|

If userfaultfd_copy() takes the mmap_sem first, the new page(s) will be present by the time copy_page_range() is called, and they will appear in the child's memory mappings. However, if fork() is the first to take the mmap_sem, the new pages won't be mapped in the child's address space.

If the pages are not present and the child tries to access them, the monitor will get a page fault notification and everything is fine. However, if the pages *are present*, the child can access them without uffd noticing, and if we copy them into the child it'll see the wrong data. Since we are talking about a background copy, we'd need to decide whether the pages should be copied or not regardless of #PF notifications.

Since the userfaultfd monitor has no way to determine what the order was, let's disallow userfaultfd_copy in parallel with the non-cooperative events. In such a case we return -EAGAIN and the uffd monitor can understand that userfaultfd_copy() clashed with a non-cooperative event and take an appropriate action.

Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Mike Rapoport <[email protected]>
Acked-by: Pavel Emelyanov <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Mike Kravetz <[email protected]>
Cc: Andrei Vagin <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent be09102 commit df2cc96
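To make the resulting userspace contract concrete, here is a minimal monitor-side sketch, not code from this commit: uffd_copy_retry() and drain_uffd_events() are hypothetical helpers, and a real monitor (CRIU, say) would fold this into its existing event loop and also handle a partial copy by advancing dst/src by uffdio_copy.copy. The idea the sketch illustrates is exactly what the message above describes: when UFFDIO_COPY fails with EAGAIN, consume the queued non-cooperative events (reading them is what lets the kernel clear mmap_changing), resynchronize the view of the tracee's mappings, then retry.

#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/*
 * Hypothetical helper: pull queued non-cooperative events off the
 * userfaultfd so the kernel can clear mmap_changing.  Assumes the uffd
 * was opened with O_NONBLOCK so read() fails with EAGAIN once the
 * queue is empty.
 */
static int drain_uffd_events(int uffd)
{
	struct uffd_msg msg;

	while (read(uffd, &msg, sizeof(msg)) == sizeof(msg)) {
		switch (msg.event) {
		case UFFD_EVENT_FORK:   /* msg.arg.fork.ufd: child's uffd */
		case UFFD_EVENT_REMAP:  /* msg.arg.remap.{from,to,len} */
		case UFFD_EVENT_REMOVE: /* msg.arg.remove.{start,end} */
		case UFFD_EVENT_UNMAP:
			/* update the monitor's view of the tracee here */
			break;
		default:
			break;  /* e.g. UFFD_EVENT_PAGEFAULT */
		}
	}
	return errno == EAGAIN ? 0 : -1;
}

/* Retry UFFDIO_COPY until it no longer clashes with a non-cooperative event. */
static int uffd_copy_retry(int uffd, unsigned long dst, unsigned long src,
			   unsigned long len)
{
	struct uffdio_copy copy;

	for (;;) {
		memset(&copy, 0, sizeof(copy));
		copy.dst = dst;
		copy.src = src;
		copy.len = len;
		copy.mode = 0;

		if (ioctl(uffd, UFFDIO_COPY, &copy) == 0)
			return 0;
		/*
		 * EAGAIN means the copy raced with fork/mremap/etc.; a
		 * real monitor would also check copy.copy for a partial
		 * copy and shrink the range before retrying.
		 */
		if (errno != EAGAIN && copy.copy != -EAGAIN)
			return -1;
		if (drain_uffd_events(uffd))
			return -1;
	}
}

The point is that after this change EAGAIN from UFFDIO_COPY means "the layout changed under you", not a hard failure.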

3 files changed: +41 −9 lines changed

fs/userfaultfd.c

Lines changed: 20 additions & 2 deletions
@@ -62,6 +62,8 @@ struct userfaultfd_ctx {
 	enum userfaultfd_state state;
 	/* released */
 	bool released;
+	/* memory mappings are changing because of non-cooperative event */
+	bool mmap_changing;
 	/* mm with one ore more vmas attached to this userfaultfd_ctx */
 	struct mm_struct *mm;
 };
@@ -641,6 +643,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 	 * already released.
 	 */
 out:
+	WRITE_ONCE(ctx->mmap_changing, false);
 	userfaultfd_ctx_put(ctx);
 }

@@ -686,10 +689,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 	ctx->state = UFFD_STATE_RUNNING;
 	ctx->features = octx->features;
 	ctx->released = false;
+	ctx->mmap_changing = false;
 	ctx->mm = vma->vm_mm;
 	mmgrab(ctx->mm);

 	userfaultfd_ctx_get(octx);
+	WRITE_ONCE(octx->mmap_changing, true);
 	fctx->orig = octx;
 	fctx->new = ctx;
 	list_add_tail(&fctx->list, fcs);
@@ -732,6 +737,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 	if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
 		vm_ctx->ctx = ctx;
 		userfaultfd_ctx_get(ctx);
+		WRITE_ONCE(ctx->mmap_changing, true);
 	}
 }

@@ -772,6 +778,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
 		return true;

 	userfaultfd_ctx_get(ctx);
+	WRITE_ONCE(ctx->mmap_changing, true);
 	up_read(&mm->mmap_sem);

 	msg_init(&ewq.msg);
@@ -815,6 +822,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma,
 			return -ENOMEM;

 		userfaultfd_ctx_get(ctx);
+		WRITE_ONCE(ctx->mmap_changing, true);
 		unmap_ctx->ctx = ctx;
 		unmap_ctx->start = start;
 		unmap_ctx->end = end;
@@ -1653,6 +1661,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,

 	user_uffdio_copy = (struct uffdio_copy __user *) arg;

+	ret = -EAGAIN;
+	if (READ_ONCE(ctx->mmap_changing))
+		goto out;
+
 	ret = -EFAULT;
 	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
 			   /* don't copy "copy" last field */
@@ -1674,7 +1686,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 		goto out;
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
-				   uffdio_copy.len);
+				   uffdio_copy.len, &ctx->mmap_changing);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1705,6 +1717,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,

 	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;

+	ret = -EAGAIN;
+	if (READ_ONCE(ctx->mmap_changing))
+		goto out;
+
 	ret = -EFAULT;
 	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
 			   /* don't copy "zeropage" last field */
@@ -1721,7 +1737,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,

 	if (mmget_not_zero(ctx->mm)) {
 		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
-				     uffdio_zeropage.range.len);
+				     uffdio_zeropage.range.len,
+				     &ctx->mmap_changing);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1900,6 +1917,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
 	ctx->features = 0;
 	ctx->state = UFFD_STATE_WAIT_API;
 	ctx->released = false;
+	ctx->mmap_changing = false;
 	ctx->mm = current->mm;
 	/* prevent the mm struct to be freed */
 	mmgrab(ctx->mm);

include/linux/userfaultfd_k.h

Lines changed: 4 additions & 2 deletions
@@ -31,10 +31,12 @@
 extern int handle_userfault(struct vm_fault *vmf, unsigned long reason);

 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
-			    unsigned long src_start, unsigned long len);
+			    unsigned long src_start, unsigned long len,
+			    bool *mmap_changing);
 extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
 			      unsigned long dst_start,
-			      unsigned long len);
+			      unsigned long len,
+			      bool *mmap_changing);

 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,

mm/userfaultfd.c

Lines changed: 17 additions & 5 deletions
@@ -404,7 +404,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      bool zeropage)
+					      bool zeropage,
+					      bool *mmap_changing)
 {
 	struct vm_area_struct *dst_vma;
 	ssize_t err;
@@ -430,6 +431,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 retry:
 	down_read(&dst_mm->mmap_sem);

+	/*
+	 * If memory mappings are changing because of non-cooperative
+	 * operation (e.g. mremap) running in parallel, bail out and
+	 * request the user to retry later
+	 */
+	err = -EAGAIN;
+	if (mmap_changing && READ_ONCE(*mmap_changing))
+		goto out_unlock;
+
 	/*
 	 * Make sure the vma is not shared, that the dst range is
 	 * both valid and fully within a single existing vma.
@@ -563,13 +573,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 }

 ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
-		     unsigned long src_start, unsigned long len)
+		     unsigned long src_start, unsigned long len,
+		     bool *mmap_changing)
 {
-	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
+	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
+			      mmap_changing);
 }

 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
-		       unsigned long len)
+		       unsigned long len, bool *mmap_changing)
 {
-	return __mcopy_atomic(dst_mm, start, 0, len, true);
+	return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
 }
