
Commit 4a42d84

David Stevens authored and bonzini (Paolo Bonzini) committed
KVM: x86/mmu: Consider the hva in mmu_notifier retry
Track the range being invalidated by mmu_notifier and skip page fault retries if the fault address is not affected by the in-progress invalidation. Handle concurrent invalidations by finding the minimal range which includes all ranges being invalidated. Although the combined range may include unrelated addresses and cannot be shrunk as individual invalidation operations complete, it is unlikely the marginal gains of proper range tracking are worth the additional complexity.

The primary benefit of this change is the reduction in the likelihood of extreme latency when handling a page fault due to another thread having been preempted while modifying host virtual addresses.

Signed-off-by: David Stevens <[email protected]>
Message-Id: <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
1 parent 5f8a7cf commit 4a42d84
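The behavior described above can be illustrated outside the kernel. The following is a minimal standalone userspace sketch, not the kernel implementation: struct toy_kvm and its helpers are hypothetical names that only mirror the fields and helpers added by this patch (locking, memslots, and the real mmu_notifier plumbing are omitted).

/*
 * Standalone illustration (userspace, hypothetical) of the range tracking
 * added by this commit: concurrent invalidations are merged into a single
 * covering range, and a fault retries only if its hva may be affected.
 */
#include <stdio.h>
#include <stdbool.h>

struct toy_kvm {
	unsigned long mmu_notifier_seq;
	long mmu_notifier_count;
	unsigned long mmu_notifier_range_start;
	unsigned long mmu_notifier_range_end;
};

/* Mirrors the invalidate_range_start() change: track a covering range. */
static void invalidate_range_start(struct toy_kvm *kvm,
				   unsigned long start, unsigned long end)
{
	kvm->mmu_notifier_count++;
	if (kvm->mmu_notifier_count == 1) {
		kvm->mmu_notifier_range_start = start;
		kvm->mmu_notifier_range_end = end;
	} else {
		/* Concurrent invalidations: grow to the minimal covering range. */
		if (start < kvm->mmu_notifier_range_start)
			kvm->mmu_notifier_range_start = start;
		if (end > kvm->mmu_notifier_range_end)
			kvm->mmu_notifier_range_end = end;
	}
}

static void invalidate_range_end(struct toy_kvm *kvm)
{
	kvm->mmu_notifier_seq++;
	kvm->mmu_notifier_count--;
}

/* Mirrors mmu_notifier_retry_hva(): retry only if the fault hva may be affected. */
static bool retry_hva(struct toy_kvm *kvm, unsigned long mmu_seq, unsigned long hva)
{
	if (kvm->mmu_notifier_count &&
	    hva >= kvm->mmu_notifier_range_start &&
	    hva < kvm->mmu_notifier_range_end)
		return true;
	return kvm->mmu_notifier_seq != mmu_seq;
}

int main(void)
{
	struct toy_kvm kvm = { 0 };
	unsigned long mmu_seq = kvm.mmu_notifier_seq;

	invalidate_range_start(&kvm, 0x1000, 0x3000);
	invalidate_range_start(&kvm, 0x8000, 0x9000);

	/* 0x5000 lies between the two real ranges but inside the merged one. */
	printf("hva 0x2000 retry: %d\n", retry_hva(&kvm, mmu_seq, 0x2000)); /* 1 */
	printf("hva 0x5000 retry: %d\n", retry_hva(&kvm, mmu_seq, 0x5000)); /* 1, false positive */
	printf("hva 0xa000 retry: %d\n", retry_hva(&kvm, mmu_seq, 0xa000)); /* 0, unaffected */

	invalidate_range_end(&kvm);
	invalidate_range_end(&kvm);
	/* With no invalidation in flight, the sequence count alone forces a retry. */
	printf("hva 0x2000 retry after end: %d\n", retry_hva(&kvm, mmu_seq, 0x2000)); /* 1 */
	return 0;
}

As the 0x5000 case shows, the merged range can only grow while invalidations overlap, so unrelated addresses may see spurious retries; the commit accepts this as cheaper than tracking each range individually.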

6 files changed, +79 -16 lines changed


arch/powerpc/kvm/book3s_64_mmu_hv.c (+1 -1)

@@ -590,7 +590,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
 	} else {
 		/* Call KVM generic code to do the slow-path check */
 		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-					   writing, &write_ok);
+					   writing, &write_ok, NULL);
 		if (is_error_noslot_pfn(pfn))
 			return -EFAULT;
 		page = NULL;

arch/powerpc/kvm/book3s_64_mmu_radix.c (+1 -1)

@@ -822,7 +822,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 
 		/* Call KVM generic code to do the slow-path check */
 		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-					   writing, upgrade_p);
+					   writing, upgrade_p, NULL);
 		if (is_error_noslot_pfn(pfn))
 			return -EFAULT;
 		page = NULL;

arch/x86/kvm/mmu/mmu.c (+17 -6)

@@ -2734,6 +2734,13 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 	if (sp->role.level > PG_LEVEL_4K)
 		return;
 
+	/*
+	 * If addresses are being invalidated, skip prefetching to avoid
+	 * accidentally prefetching those addresses.
+	 */
+	if (unlikely(vcpu->kvm->mmu_notifier_count))
+		return;
+
 	__direct_pte_prefetch(vcpu, sp, sptep);
 }
 
@@ -3640,8 +3647,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-			 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
-			 bool *writable)
+			 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
+			 bool write, bool *writable)
 {
 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 	bool async;
@@ -3654,7 +3661,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 	}
 
 	async = false;
-	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
+	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
+				    write, writable, hva);
 	if (!async)
 		return false; /* *pfn has correct page already */
 
@@ -3668,7 +3676,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			return true;
 	}
 
-	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
+	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
+				    write, writable, hva);
 	return false;
 }
 
@@ -3681,6 +3690,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
 	kvm_pfn_t pfn;
+	hva_t hva;
 	int r;
 
 	if (page_fault_handle_page_track(vcpu, error_code, gfn))
@@ -3699,7 +3709,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
+			 write, &map_writable))
 		return RET_PF_RETRY;
 
 	if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
@@ -3712,7 +3723,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	else
 		write_lock(&vcpu->kvm->mmu_lock);
 
-	if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq))
+	if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
 		goto out_unlock;
 	r = make_mmu_pages_available(vcpu);
 	if (r)

arch/x86/kvm/mmu/paging_tmpl.h (+11 -3)

@@ -601,6 +601,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 	if (sp->role.level > PG_LEVEL_4K)
 		return;
 
+	/*
+	 * If addresses are being invalidated, skip prefetching to avoid
+	 * accidentally prefetching those addresses.
+	 */
+	if (unlikely(vcpu->kvm->mmu_notifier_count))
+		return;
+
 	if (sp->role.direct)
 		return __direct_pte_prefetch(vcpu, sp, sptep);
 
@@ -790,6 +797,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 	struct guest_walker walker;
 	int r;
 	kvm_pfn_t pfn;
+	hva_t hva;
 	unsigned long mmu_seq;
 	bool map_writable, is_self_change_mapping;
 	int max_level;
@@ -840,8 +848,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
-			 &map_writable))
+	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
+			 write_fault, &map_writable))
 		return RET_PF_RETRY;
 
 	if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
@@ -869,7 +877,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 
 	r = RET_PF_RETRY;
 	write_lock(&vcpu->kvm->mmu_lock);
-	if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq))
+	if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
 		goto out_unlock;
 
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);

include/linux/kvm_host.h (+24 -1)

@@ -11,6 +11,7 @@
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/bug.h>
+#include <linux/minmax.h>
 #include <linux/mm.h>
 #include <linux/mmu_notifier.h>
 #include <linux/preempt.h>
@@ -506,6 +507,8 @@ struct kvm {
 	struct mmu_notifier mmu_notifier;
 	unsigned long mmu_notifier_seq;
 	long mmu_notifier_count;
+	unsigned long mmu_notifier_range_start;
+	unsigned long mmu_notifier_range_end;
 #endif
 	long tlbs_dirty;
 	struct list_head devices;
@@ -733,7 +736,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 			       bool atomic, bool *async, bool write_fault,
-			       bool *writable);
+			       bool *writable, hva_t *hva);
 
 void kvm_release_pfn_clean(kvm_pfn_t pfn);
 void kvm_release_pfn_dirty(kvm_pfn_t pfn);
@@ -1207,6 +1210,26 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
 		return 1;
 	return 0;
 }
+
+static inline int mmu_notifier_retry_hva(struct kvm *kvm,
+					 unsigned long mmu_seq,
+					 unsigned long hva)
+{
+	lockdep_assert_held(&kvm->mmu_lock);
+	/*
+	 * If mmu_notifier_count is non-zero, then the range maintained by
+	 * kvm_mmu_notifier_invalidate_range_start contains all addresses that
+	 * might be being invalidated. Note that it may include some false
+	 * positives, due to shortcuts when handing concurrent invalidations.
+	 */
+	if (unlikely(kvm->mmu_notifier_count) &&
+	    hva >= kvm->mmu_notifier_range_start &&
+	    hva < kvm->mmu_notifier_range_end)
+		return 1;
+	if (kvm->mmu_notifier_seq != mmu_seq)
+		return 1;
+	return 0;
+}
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING

virt/kvm/kvm_main.c (+25 -4)

@@ -486,6 +486,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 * count is also read inside the mmu_lock critical section.
 	 */
 	kvm->mmu_notifier_count++;
+	if (likely(kvm->mmu_notifier_count == 1)) {
+		kvm->mmu_notifier_range_start = range->start;
+		kvm->mmu_notifier_range_end = range->end;
+	} else {
+		/*
+		 * Fully tracking multiple concurrent ranges has dimishing
+		 * returns. Keep things simple and just find the minimal range
+		 * which includes the current and new ranges. As there won't be
+		 * enough information to subtract a range after its invalidate
+		 * completes, any ranges invalidated concurrently will
+		 * accumulate and persist until all outstanding invalidates
+		 * complete.
+		 */
+		kvm->mmu_notifier_range_start =
+			min(kvm->mmu_notifier_range_start, range->start);
+		kvm->mmu_notifier_range_end =
+			max(kvm->mmu_notifier_range_end, range->end);
+	}
 	need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
 					     range->flags);
 	/* we've to flush the tlb before the pages can be freed */
@@ -2023,10 +2041,13 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 			       bool atomic, bool *async, bool write_fault,
-			       bool *writable)
+			       bool *writable, hva_t *hva)
 {
 	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
 
+	if (hva)
+		*hva = addr;
+
 	if (addr == KVM_HVA_ERR_RO_BAD) {
 		if (writable)
 			*writable = false;
@@ -2054,19 +2075,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable)
 {
 	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
-				    write_fault, writable);
+				    write_fault, writable, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
 
 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
 