author     Gleb Natapov <gleb@redhat.com>    2010-10-14 05:22:46 -0400
committer  Avi Kivity <avi@redhat.com>       2011-01-12 04:21:39 -0500
commit     af585b921e5d1e919947c4b1164b59507fe7cd7b
tree       d0d4cc753d4d58934c5986733d7340fe69e523de
parent     010c520e20413dfd567d568aba2b7238acd37e33
KVM: Halt vcpu if page it tries to access is swapped out
If a guest accesses swapped out memory, do not swap it in from the vcpu thread context. Instead, schedule work to do the swapping and put the vcpu into a halted state. Interrupts will still be delivered to the guest, and if an interrupt causes a reschedule, the guest will continue to run another task.

[avi: remove call to get_user_pages_noio(), nacked by Linus; this makes everything synchronous again]

Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
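For orientation, the fault-side decision this patch adds boils down to the sketch below (condensed from the mmu.c hunk that follows; the tracepoints, the double-fault check against the per-vcpu gfn hash and the put_page() of the placeholder page are omitted, so treat it as illustrative rather than the literal code):

	static bool try_async_pf(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
				 pfn_t *pfn)
	{
		bool async;

		/* Try to resolve the gfn without scheduling async work. */
		*pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async);
		if (!async)
			return false;	/* page resolved, *pfn is usable */

		/* Queue async_pf_execute() and halt the vcpu until it completes. */
		if (can_do_async_pf(vcpu) && kvm_arch_setup_async_pf(vcpu, gva, gfn))
			return true;

		/* Otherwise fall back to the old synchronous behaviour. */
		*pfn = gfn_to_pfn(vcpu->kvm, gfn);
		return false;
	}

The completion side mirrors this: async_pf_execute() faults the page in with get_user_pages() under the guest task's mm, queues the work item on vcpu->async_pf.done and wakes the vcpu; the pending completion makes kvm_arch_vcpu_runnable() return true, __vcpu_run() clears the synthetic halt and calls kvm_check_async_pf_completion(), and the guest re-executes the faulting instruction. As the bracketed note above says, with get_user_pages_noio() dropped the page lookup itself is still synchronous in this patch; what it establishes is the halt/workqueue plumbing.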
-rw-r--r--  arch/x86/include/asm/kvm_host.h    18
-rw-r--r--  arch/x86/kvm/Kconfig                1
-rw-r--r--  arch/x86/kvm/Makefile               1
-rw-r--r--  arch/x86/kvm/mmu.c                 52
-rw-r--r--  arch/x86/kvm/paging_tmpl.h          4
-rw-r--r--  arch/x86/kvm/x86.c                112
-rw-r--r--  include/linux/kvm_host.h           31
-rw-r--r--  include/trace/events/kvm.h         90
-rw-r--r--  virt/kvm/Kconfig                    3
-rw-r--r--  virt/kvm/async_pf.c               190
-rw-r--r--  virt/kvm/async_pf.h                36
-rw-r--r--  virt/kvm/kvm_main.c                48
12 files changed, 570 insertions, 16 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f702f82aa1eb..b5f4c1a36d65 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -83,11 +83,14 @@
83#define KVM_NR_FIXED_MTRR_REGION 88 83#define KVM_NR_FIXED_MTRR_REGION 88
84#define KVM_NR_VAR_MTRR 8 84#define KVM_NR_VAR_MTRR 8
85 85
86#define ASYNC_PF_PER_VCPU 64
87
86extern spinlock_t kvm_lock; 88extern spinlock_t kvm_lock;
87extern struct list_head vm_list; 89extern struct list_head vm_list;
88 90
89struct kvm_vcpu; 91struct kvm_vcpu;
90struct kvm; 92struct kvm;
93struct kvm_async_pf;
91 94
92enum kvm_reg { 95enum kvm_reg {
93 VCPU_REGS_RAX = 0, 96 VCPU_REGS_RAX = 0,
@@ -412,6 +415,11 @@ struct kvm_vcpu_arch {
412 u64 hv_vapic; 415 u64 hv_vapic;
413 416
414 cpumask_var_t wbinvd_dirty_mask; 417 cpumask_var_t wbinvd_dirty_mask;
418
419 struct {
420 bool halted;
421 gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
422 } apf;
415}; 423};
416 424
417struct kvm_arch { 425struct kvm_arch {
@@ -585,6 +593,10 @@ struct kvm_x86_ops {
585 const struct trace_print_flags *exit_reasons_str; 593 const struct trace_print_flags *exit_reasons_str;
586}; 594};
587 595
596struct kvm_arch_async_pf {
597 gfn_t gfn;
598};
599
588extern struct kvm_x86_ops *kvm_x86_ops; 600extern struct kvm_x86_ops *kvm_x86_ops;
589 601
590int kvm_mmu_module_init(void); 602int kvm_mmu_module_init(void);
@@ -799,4 +811,10 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
799 811
800bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); 812bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
801 813
814void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
815 struct kvm_async_pf *work);
816void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
817 struct kvm_async_pf *work);
818extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
819
802#endif /* _ASM_X86_KVM_HOST_H */ 820#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ddc131ff438f..50f63648ce1b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
28 select HAVE_KVM_IRQCHIP 28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select KVM_ASYNC_PF
31 select USER_RETURN_NOTIFIER 32 select USER_RETURN_NOTIFIER
32 select KVM_MMIO 33 select KVM_MMIO
33 ---help--- 34 ---help---
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31a7035c4bd9..c53bf19b1da0 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o \ 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o) 10 assigned-dev.o)
11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
12 13
13kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
14 i8254.o timer.o 15 i8254.o timer.o
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fbb04aee8301..4ab04de5a76a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,9 +18,11 @@
18 * 18 *
19 */ 19 */
20 20
21#include "irq.h"
21#include "mmu.h" 22#include "mmu.h"
22#include "x86.h" 23#include "x86.h"
23#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include "x86.h"
24 26
25#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
26#include <linux/types.h> 28#include <linux/types.h>
@@ -2587,6 +2589,50 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2587 error_code & PFERR_WRITE_MASK, gfn); 2589 error_code & PFERR_WRITE_MASK, gfn);
2588} 2590}
2589 2591
2592int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
2593{
2594 struct kvm_arch_async_pf arch;
2595 arch.gfn = gfn;
2596
2597 return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
2598}
2599
2600static bool can_do_async_pf(struct kvm_vcpu *vcpu)
2601{
2602 if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
2603 kvm_event_needs_reinjection(vcpu)))
2604 return false;
2605
2606 return kvm_x86_ops->interrupt_allowed(vcpu);
2607}
2608
2609static bool try_async_pf(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
2610 pfn_t *pfn)
2611{
2612 bool async;
2613
2614 *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async);
2615
2616 if (!async)
2617 return false; /* *pfn has correct page already */
2618
2619 put_page(pfn_to_page(*pfn));
2620
2621 if (can_do_async_pf(vcpu)) {
2622 trace_kvm_try_async_get_page(async, *pfn);
2623 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
2624 trace_kvm_async_pf_doublefault(gva, gfn);
2625 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
2626 return true;
2627 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
2628 return true;
2629 }
2630
2631 *pfn = gfn_to_pfn(vcpu->kvm, gfn);
2632
2633 return false;
2634}
2635
2590static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, 2636static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2591 u32 error_code) 2637 u32 error_code)
2592{ 2638{
@@ -2609,7 +2655,11 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2609 2655
2610 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2656 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2611 smp_rmb(); 2657 smp_rmb();
2612 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2658
2659 if (try_async_pf(vcpu, gfn, gpa, &pfn))
2660 return 0;
2661
2662 /* mmio */
2613 if (is_error_pfn(pfn)) 2663 if (is_error_pfn(pfn))
2614 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2664 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2615 spin_lock(&vcpu->kvm->mmu_lock); 2665 spin_lock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index cd7a833a3b52..c45376dd041a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -568,7 +568,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
568 568
569 mmu_seq = vcpu->kvm->mmu_notifier_seq; 569 mmu_seq = vcpu->kvm->mmu_notifier_seq;
570 smp_rmb(); 570 smp_rmb();
571 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 571
572 if (try_async_pf(vcpu, walker.gfn, addr, &pfn))
573 return 0;
572 574
573 /* mmio */ 575 /* mmio */
574 if (is_error_pfn(pfn)) 576 if (is_error_pfn(pfn))
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c05d47701292..3cd4d091c2f3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -43,6 +43,7 @@
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/perf_event.h> 44#include <linux/perf_event.h>
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/hash.h>
46#include <trace/events/kvm.h> 47#include <trace/events/kvm.h>
47 48
48#define CREATE_TRACE_POINTS 49#define CREATE_TRACE_POINTS
@@ -155,6 +156,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
155 156
156u64 __read_mostly host_xcr0; 157u64 __read_mostly host_xcr0;
157 158
159static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
160{
161 int i;
162 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
163 vcpu->arch.apf.gfns[i] = ~0;
164}
165
158static void kvm_on_user_return(struct user_return_notifier *urn) 166static void kvm_on_user_return(struct user_return_notifier *urn)
159{ 167{
160 unsigned slot; 168 unsigned slot;
@@ -5115,6 +5123,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5115 vcpu->fpu_active = 0; 5123 vcpu->fpu_active = 0;
5116 kvm_x86_ops->fpu_deactivate(vcpu); 5124 kvm_x86_ops->fpu_deactivate(vcpu);
5117 } 5125 }
5126 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
5127 /* Page is swapped out. Do synthetic halt */
5128 vcpu->arch.apf.halted = true;
5129 r = 1;
5130 goto out;
5131 }
5118 } 5132 }
5119 5133
5120 r = kvm_mmu_reload(vcpu); 5134 r = kvm_mmu_reload(vcpu);
@@ -5243,7 +5257,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5243 5257
5244 r = 1; 5258 r = 1;
5245 while (r > 0) { 5259 while (r > 0) {
5246 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 5260 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
5261 !vcpu->arch.apf.halted)
5247 r = vcpu_enter_guest(vcpu); 5262 r = vcpu_enter_guest(vcpu);
5248 else { 5263 else {
5249 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5264 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -5256,6 +5271,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5256 vcpu->arch.mp_state = 5271 vcpu->arch.mp_state =
5257 KVM_MP_STATE_RUNNABLE; 5272 KVM_MP_STATE_RUNNABLE;
5258 case KVM_MP_STATE_RUNNABLE: 5273 case KVM_MP_STATE_RUNNABLE:
5274 vcpu->arch.apf.halted = false;
5259 break; 5275 break;
5260 case KVM_MP_STATE_SIPI_RECEIVED: 5276 case KVM_MP_STATE_SIPI_RECEIVED:
5261 default: 5277 default:
@@ -5277,6 +5293,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5277 vcpu->run->exit_reason = KVM_EXIT_INTR; 5293 vcpu->run->exit_reason = KVM_EXIT_INTR;
5278 ++vcpu->stat.request_irq_exits; 5294 ++vcpu->stat.request_irq_exits;
5279 } 5295 }
5296
5297 kvm_check_async_pf_completion(vcpu);
5298
5280 if (signal_pending(current)) { 5299 if (signal_pending(current)) {
5281 r = -EINTR; 5300 r = -EINTR;
5282 vcpu->run->exit_reason = KVM_EXIT_INTR; 5301 vcpu->run->exit_reason = KVM_EXIT_INTR;
@@ -5792,6 +5811,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5792 5811
5793 kvm_make_request(KVM_REQ_EVENT, vcpu); 5812 kvm_make_request(KVM_REQ_EVENT, vcpu);
5794 5813
5814 kvm_clear_async_pf_completion_queue(vcpu);
5815 kvm_async_pf_hash_reset(vcpu);
5816 vcpu->arch.apf.halted = false;
5817
5795 return kvm_x86_ops->vcpu_reset(vcpu); 5818 return kvm_x86_ops->vcpu_reset(vcpu);
5796} 5819}
5797 5820
@@ -5880,6 +5903,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5880 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 5903 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
5881 goto fail_free_mce_banks; 5904 goto fail_free_mce_banks;
5882 5905
5906 kvm_async_pf_hash_reset(vcpu);
5907
5883 return 0; 5908 return 0;
5884fail_free_mce_banks: 5909fail_free_mce_banks:
5885 kfree(vcpu->arch.mce_banks); 5910 kfree(vcpu->arch.mce_banks);
@@ -5938,8 +5963,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
5938 /* 5963 /*
5939 * Unpin any mmu pages first. 5964 * Unpin any mmu pages first.
5940 */ 5965 */
5941 kvm_for_each_vcpu(i, vcpu, kvm) 5966 kvm_for_each_vcpu(i, vcpu, kvm) {
5967 kvm_clear_async_pf_completion_queue(vcpu);
5942 kvm_unload_vcpu_mmu(vcpu); 5968 kvm_unload_vcpu_mmu(vcpu);
5969 }
5943 kvm_for_each_vcpu(i, vcpu, kvm) 5970 kvm_for_each_vcpu(i, vcpu, kvm)
5944 kvm_arch_vcpu_free(vcpu); 5971 kvm_arch_vcpu_free(vcpu);
5945 5972
@@ -6050,7 +6077,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
6050 6077
6051int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 6078int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6052{ 6079{
6053 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 6080 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6081 !vcpu->arch.apf.halted)
6082 || !list_empty_careful(&vcpu->async_pf.done)
6054 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 6083 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
6055 || vcpu->arch.nmi_pending || 6084 || vcpu->arch.nmi_pending ||
6056 (kvm_arch_interrupt_allowed(vcpu) && 6085 (kvm_arch_interrupt_allowed(vcpu) &&
@@ -6109,6 +6138,83 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
6109} 6138}
6110EXPORT_SYMBOL_GPL(kvm_set_rflags); 6139EXPORT_SYMBOL_GPL(kvm_set_rflags);
6111 6140
6141static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
6142{
6143 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
6144}
6145
6146static inline u32 kvm_async_pf_next_probe(u32 key)
6147{
6148 return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
6149}
6150
6151static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6152{
6153 u32 key = kvm_async_pf_hash_fn(gfn);
6154
6155 while (vcpu->arch.apf.gfns[key] != ~0)
6156 key = kvm_async_pf_next_probe(key);
6157
6158 vcpu->arch.apf.gfns[key] = gfn;
6159}
6160
6161static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
6162{
6163 int i;
6164 u32 key = kvm_async_pf_hash_fn(gfn);
6165
6166 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
6167 (vcpu->arch.apf.gfns[key] != gfn ||
6168 vcpu->arch.apf.gfns[key] == ~0); i++)
6169 key = kvm_async_pf_next_probe(key);
6170
6171 return key;
6172}
6173
6174bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6175{
6176 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
6177}
6178
6179static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6180{
6181 u32 i, j, k;
6182
6183 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
6184 while (true) {
6185 vcpu->arch.apf.gfns[i] = ~0;
6186 do {
6187 j = kvm_async_pf_next_probe(j);
6188 if (vcpu->arch.apf.gfns[j] == ~0)
6189 return;
6190 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
6191 /*
6192 * k lies cyclically in ]i,j]
6193 * | i.k.j |
6194 * |....j i.k.| or |.k..j i...|
6195 */
6196 } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
6197 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
6198 i = j;
6199 }
6200}
6201
6202void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
6203 struct kvm_async_pf *work)
6204{
6205 trace_kvm_async_pf_not_present(work->gva);
6206
6207 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
6208 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
6209}
6210
6211void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
6212 struct kvm_async_pf *work)
6213{
6214 trace_kvm_async_pf_ready(work->gva);
6215 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
6216}
6217
6112EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 6218EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
6113EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 6219EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
6114EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 6220EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
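The kvm_del_async_pf_gfn() loop in the hunk above is the standard open-addressing deletion: after emptying slot i it scans forward and moves back any later entry whose probe chain the new hole would break, i.e. any entry whose home slot k does not lie cyclically in ]i, j]. A small worked example, assuming a hypothetical 8-slot table with hash(A) = hash(B) = 2 and hash(C) = 3:

	insert:   A lands in slot 2, B probes on to slot 3, C probes on to slot 4
	delete A: slot 2 is set to ~0; j = 3, hash(B) = 2 is outside ]2,3], so B moves to slot 2 and i becomes 3;
	          j = 4, hash(C) = 3 is outside ]3,4], so C moves to slot 3 and i becomes 4;
	          j = 5 is empty, so the scan stops, and B and C are again reachable from their home slots.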
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a0557422715e..e56acc7857e2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -40,6 +40,7 @@
40#define KVM_REQ_KICK 9 40#define KVM_REQ_KICK 9
41#define KVM_REQ_DEACTIVATE_FPU 10 41#define KVM_REQ_DEACTIVATE_FPU 10
42#define KVM_REQ_EVENT 11 42#define KVM_REQ_EVENT 11
43#define KVM_REQ_APF_HALT 12
43 44
44#define KVM_USERSPACE_IRQ_SOURCE_ID 0 45#define KVM_USERSPACE_IRQ_SOURCE_ID 0
45 46
@@ -74,6 +75,26 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
74int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 75int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
75 struct kvm_io_device *dev); 76 struct kvm_io_device *dev);
76 77
78#ifdef CONFIG_KVM_ASYNC_PF
79struct kvm_async_pf {
80 struct work_struct work;
81 struct list_head link;
82 struct list_head queue;
83 struct kvm_vcpu *vcpu;
84 struct mm_struct *mm;
85 gva_t gva;
86 unsigned long addr;
87 struct kvm_arch_async_pf arch;
88 struct page *page;
89 bool done;
90};
91
92void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
93void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
94int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
95 struct kvm_arch_async_pf *arch);
96#endif
97
77struct kvm_vcpu { 98struct kvm_vcpu {
78 struct kvm *kvm; 99 struct kvm *kvm;
79#ifdef CONFIG_PREEMPT_NOTIFIERS 100#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -104,6 +125,15 @@ struct kvm_vcpu {
104 gpa_t mmio_phys_addr; 125 gpa_t mmio_phys_addr;
105#endif 126#endif
106 127
128#ifdef CONFIG_KVM_ASYNC_PF
129 struct {
130 u32 queued;
131 struct list_head queue;
132 struct list_head done;
133 spinlock_t lock;
134 } async_pf;
135#endif
136
107 struct kvm_vcpu_arch arch; 137 struct kvm_vcpu_arch arch;
108}; 138};
109 139
@@ -302,6 +332,7 @@ void kvm_set_page_accessed(struct page *page);
302 332
303pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); 333pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
304pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); 334pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
335pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async);
305pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); 336pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
306pfn_t gfn_to_pfn_memslot(struct kvm *kvm, 337pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
307 struct kvm_memory_slot *slot, gfn_t gfn); 338 struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 6dd3a51ab1cb..a78a5e574632 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -185,6 +185,96 @@ TRACE_EVENT(kvm_age_page,
185 __entry->referenced ? "YOUNG" : "OLD") 185 __entry->referenced ? "YOUNG" : "OLD")
186); 186);
187 187
188#ifdef CONFIG_KVM_ASYNC_PF
189TRACE_EVENT(
190 kvm_try_async_get_page,
191 TP_PROTO(bool async, u64 pfn),
192 TP_ARGS(async, pfn),
193
194 TP_STRUCT__entry(
195 __field(__u64, pfn)
196 ),
197
198 TP_fast_assign(
199 __entry->pfn = (!async) ? pfn : (u64)-1;
200 ),
201
202 TP_printk("pfn %#llx", __entry->pfn)
203);
204
205TRACE_EVENT(
206 kvm_async_pf_not_present,
207 TP_PROTO(u64 gva),
208 TP_ARGS(gva),
209
210 TP_STRUCT__entry(
211 __field(__u64, gva)
212 ),
213
214 TP_fast_assign(
215 __entry->gva = gva;
216 ),
217
218 TP_printk("gva %#llx not present", __entry->gva)
219);
220
221TRACE_EVENT(
222 kvm_async_pf_ready,
223 TP_PROTO(u64 gva),
224 TP_ARGS(gva),
225
226 TP_STRUCT__entry(
227 __field(__u64, gva)
228 ),
229
230 TP_fast_assign(
231 __entry->gva = gva;
232 ),
233
234 TP_printk("gva %#llx ready", __entry->gva)
235);
236
237TRACE_EVENT(
238 kvm_async_pf_completed,
239 TP_PROTO(unsigned long address, struct page *page, u64 gva),
240 TP_ARGS(address, page, gva),
241
242 TP_STRUCT__entry(
243 __field(unsigned long, address)
244 __field(pfn_t, pfn)
245 __field(u64, gva)
246 ),
247
248 TP_fast_assign(
249 __entry->address = address;
250 __entry->pfn = page ? page_to_pfn(page) : 0;
251 __entry->gva = gva;
252 ),
253
254 TP_printk("gva %#llx address %#lx pfn %#llx", __entry->gva,
255 __entry->address, __entry->pfn)
256);
257
258TRACE_EVENT(
259 kvm_async_pf_doublefault,
260 TP_PROTO(u64 gva, u64 gfn),
261 TP_ARGS(gva, gfn),
262
263 TP_STRUCT__entry(
264 __field(u64, gva)
265 __field(u64, gfn)
266 ),
267
268 TP_fast_assign(
269 __entry->gva = gva;
270 __entry->gfn = gfn;
271 ),
272
273 TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn)
274);
275
276#endif
277
188#endif /* _TRACE_KVM_MAIN_H */ 278#endif /* _TRACE_KVM_MAIN_H */
189 279
190/* This part must be outside protection */ 280/* This part must be outside protection */
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 7f1178f6b839..f63ccb0a5982 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -15,3 +15,6 @@ config KVM_APIC_ARCHITECTURE
15 15
16config KVM_MMIO 16config KVM_MMIO
17 bool 17 bool
18
19config KVM_ASYNC_PF
20 bool
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
new file mode 100644
index 000000000000..857d63431cb7
--- /dev/null
+++ b/virt/kvm/async_pf.c
@@ -0,0 +1,190 @@
1/*
2 * kvm asynchronous fault support
3 *
4 * Copyright 2010 Red Hat, Inc.
5 *
6 * Author:
7 * Gleb Natapov <gleb@redhat.com>
8 *
9 * This file is free software; you can redistribute it and/or modify
10 * it under the terms of version 2 of the GNU General Public License
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software Foundation,
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
21 */
22
23#include <linux/kvm_host.h>
24#include <linux/slab.h>
25#include <linux/module.h>
26#include <linux/mmu_context.h>
27
28#include "async_pf.h"
29#include <trace/events/kvm.h>
30
31static struct kmem_cache *async_pf_cache;
32
33int kvm_async_pf_init(void)
34{
35 async_pf_cache = KMEM_CACHE(kvm_async_pf, 0);
36
37 if (!async_pf_cache)
38 return -ENOMEM;
39
40 return 0;
41}
42
43void kvm_async_pf_deinit(void)
44{
45 if (async_pf_cache)
46 kmem_cache_destroy(async_pf_cache);
47 async_pf_cache = NULL;
48}
49
50void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu)
51{
52 INIT_LIST_HEAD(&vcpu->async_pf.done);
53 INIT_LIST_HEAD(&vcpu->async_pf.queue);
54 spin_lock_init(&vcpu->async_pf.lock);
55}
56
57static void async_pf_execute(struct work_struct *work)
58{
59 struct page *page = NULL;
60 struct kvm_async_pf *apf =
61 container_of(work, struct kvm_async_pf, work);
62 struct mm_struct *mm = apf->mm;
63 struct kvm_vcpu *vcpu = apf->vcpu;
64 unsigned long addr = apf->addr;
65 gva_t gva = apf->gva;
66
67 might_sleep();
68
69 use_mm(mm);
70 down_read(&mm->mmap_sem);
71 get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL);
72 up_read(&mm->mmap_sem);
73 unuse_mm(mm);
74
75 spin_lock(&vcpu->async_pf.lock);
76 list_add_tail(&apf->link, &vcpu->async_pf.done);
77 apf->page = page;
78 apf->done = true;
79 spin_unlock(&vcpu->async_pf.lock);
80
81 /*
82 * apf may be freed by kvm_check_async_pf_completion() after
83 * this point
84 */
85
86 trace_kvm_async_pf_completed(addr, page, gva);
87
88 if (waitqueue_active(&vcpu->wq))
89 wake_up_interruptible(&vcpu->wq);
90
91 mmdrop(mm);
92 kvm_put_kvm(vcpu->kvm);
93}
94
95void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
96{
97 /* cancel outstanding work queue item */
98 while (!list_empty(&vcpu->async_pf.queue)) {
99 struct kvm_async_pf *work =
100 list_entry(vcpu->async_pf.queue.next,
101 typeof(*work), queue);
102 cancel_work_sync(&work->work);
103 list_del(&work->queue);
104 if (!work->done) /* work was canceled */
105 kmem_cache_free(async_pf_cache, work);
106 }
107
108 spin_lock(&vcpu->async_pf.lock);
109 while (!list_empty(&vcpu->async_pf.done)) {
110 struct kvm_async_pf *work =
111 list_entry(vcpu->async_pf.done.next,
112 typeof(*work), link);
113 list_del(&work->link);
114 if (work->page)
115 put_page(work->page);
116 kmem_cache_free(async_pf_cache, work);
117 }
118 spin_unlock(&vcpu->async_pf.lock);
119
120 vcpu->async_pf.queued = 0;
121}
122
123void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
124{
125 struct kvm_async_pf *work;
126
127 if (list_empty_careful(&vcpu->async_pf.done))
128 return;
129
130 spin_lock(&vcpu->async_pf.lock);
131 work = list_first_entry(&vcpu->async_pf.done, typeof(*work), link);
132 list_del(&work->link);
133 spin_unlock(&vcpu->async_pf.lock);
134
135 kvm_arch_async_page_present(vcpu, work);
136
137 list_del(&work->queue);
138 vcpu->async_pf.queued--;
139 if (work->page)
140 put_page(work->page);
141 kmem_cache_free(async_pf_cache, work);
142}
143
144int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
145 struct kvm_arch_async_pf *arch)
146{
147 struct kvm_async_pf *work;
148
149 if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU)
150 return 0;
151
152 /* setup delayed work */
153
154 /*
155 * do alloc nowait since if we are going to sleep anyway we
156 * may as well sleep faulting in page
157 */
158 work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT);
159 if (!work)
160 return 0;
161
162 work->page = NULL;
163 work->done = false;
164 work->vcpu = vcpu;
165 work->gva = gva;
166 work->addr = gfn_to_hva(vcpu->kvm, gfn);
167 work->arch = *arch;
168 work->mm = current->mm;
169 atomic_inc(&work->mm->mm_count);
170 kvm_get_kvm(work->vcpu->kvm);
171
172 /* this can't really happen otherwise gfn_to_pfn_async
173 would succeed */
174 if (unlikely(kvm_is_error_hva(work->addr)))
175 goto retry_sync;
176
177 INIT_WORK(&work->work, async_pf_execute);
178 if (!schedule_work(&work->work))
179 goto retry_sync;
180
181 list_add_tail(&work->queue, &vcpu->async_pf.queue);
182 vcpu->async_pf.queued++;
183 kvm_arch_async_page_not_present(vcpu, work);
184 return 1;
185retry_sync:
186 kvm_put_kvm(work->vcpu->kvm);
187 mmdrop(work->mm);
188 kmem_cache_free(async_pf_cache, work);
189 return 0;
190}
diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h
new file mode 100644
index 000000000000..e7ef6447cb82
--- /dev/null
+++ b/virt/kvm/async_pf.h
@@ -0,0 +1,36 @@
1/*
2 * kvm asynchronous fault support
3 *
4 * Copyright 2010 Red Hat, Inc.
5 *
6 * Author:
7 * Gleb Natapov <gleb@redhat.com>
8 *
9 * This file is free software; you can redistribute it and/or modify
10 * it under the terms of version 2 of the GNU General Public License
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software Foundation,
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
21 */
22
23#ifndef __KVM_ASYNC_PF_H__
24#define __KVM_ASYNC_PF_H__
25
26#ifdef CONFIG_KVM_ASYNC_PF
27int kvm_async_pf_init(void);
28void kvm_async_pf_deinit(void);
29void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu);
30#else
31#define kvm_async_pf_init() (0)
32#define kvm_async_pf_deinit() do{}while(0)
33#define kvm_async_pf_vcpu_init(C) do{}while(0)
34#endif
35
36#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5225052aebc1..75fd590c0214 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -55,6 +55,7 @@
55#include <asm-generic/bitops/le.h> 55#include <asm-generic/bitops/le.h>
56 56
57#include "coalesced_mmio.h" 57#include "coalesced_mmio.h"
58#include "async_pf.h"
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/kvm.h> 61#include <trace/events/kvm.h>
@@ -186,6 +187,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
186 vcpu->kvm = kvm; 187 vcpu->kvm = kvm;
187 vcpu->vcpu_id = id; 188 vcpu->vcpu_id = id;
188 init_waitqueue_head(&vcpu->wq); 189 init_waitqueue_head(&vcpu->wq);
190 kvm_async_pf_vcpu_init(vcpu);
189 191
190 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 192 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
191 if (!page) { 193 if (!page) {
@@ -946,15 +948,20 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
946} 948}
947EXPORT_SYMBOL_GPL(gfn_to_hva); 949EXPORT_SYMBOL_GPL(gfn_to_hva);
948 950
949static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) 951static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
952 bool *async)
950{ 953{
951 struct page *page[1]; 954 struct page *page[1];
952 int npages; 955 int npages = 0;
953 pfn_t pfn; 956 pfn_t pfn;
954 957
955 if (atomic) 958 /* we can do it either atomically or asynchronously, not both */
959 BUG_ON(atomic && async);
960
961 if (atomic || async)
956 npages = __get_user_pages_fast(addr, 1, 1, page); 962 npages = __get_user_pages_fast(addr, 1, 1, page);
957 else { 963
964 if (unlikely(npages != 1) && !atomic) {
958 might_sleep(); 965 might_sleep();
959 npages = get_user_pages_fast(addr, 1, 1, page); 966 npages = get_user_pages_fast(addr, 1, 1, page);
960 } 967 }
@@ -976,6 +983,9 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
976 983
977 if (vma == NULL || addr < vma->vm_start || 984 if (vma == NULL || addr < vma->vm_start ||
978 !(vma->vm_flags & VM_PFNMAP)) { 985 !(vma->vm_flags & VM_PFNMAP)) {
986 if (async && !(vma->vm_flags & VM_PFNMAP) &&
987 (vma->vm_flags & VM_WRITE))
988 *async = true;
979 up_read(&current->mm->mmap_sem); 989 up_read(&current->mm->mmap_sem);
980return_fault_page: 990return_fault_page:
981 get_page(fault_page); 991 get_page(fault_page);
@@ -993,32 +1003,41 @@ return_fault_page:
993 1003
994pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) 1004pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
995{ 1005{
996 return hva_to_pfn(kvm, addr, true); 1006 return hva_to_pfn(kvm, addr, true, NULL);
997} 1007}
998EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); 1008EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
999 1009
1000static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic) 1010static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async)
1001{ 1011{
1002 unsigned long addr; 1012 unsigned long addr;
1003 1013
1014 if (async)
1015 *async = false;
1016
1004 addr = gfn_to_hva(kvm, gfn); 1017 addr = gfn_to_hva(kvm, gfn);
1005 if (kvm_is_error_hva(addr)) { 1018 if (kvm_is_error_hva(addr)) {
1006 get_page(bad_page); 1019 get_page(bad_page);
1007 return page_to_pfn(bad_page); 1020 return page_to_pfn(bad_page);
1008 } 1021 }
1009 1022
1010 return hva_to_pfn(kvm, addr, atomic); 1023 return hva_to_pfn(kvm, addr, atomic, async);
1011} 1024}
1012 1025
1013pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1026pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1014{ 1027{
1015 return __gfn_to_pfn(kvm, gfn, true); 1028 return __gfn_to_pfn(kvm, gfn, true, NULL);
1016} 1029}
1017EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1030EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1018 1031
1032pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async)
1033{
1034 return __gfn_to_pfn(kvm, gfn, false, async);
1035}
1036EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
1037
1019pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1038pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1020{ 1039{
1021 return __gfn_to_pfn(kvm, gfn, false); 1040 return __gfn_to_pfn(kvm, gfn, false, NULL);
1022} 1041}
1023EXPORT_SYMBOL_GPL(gfn_to_pfn); 1042EXPORT_SYMBOL_GPL(gfn_to_pfn);
1024 1043
@@ -1026,7 +1045,7 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
1026 struct kvm_memory_slot *slot, gfn_t gfn) 1045 struct kvm_memory_slot *slot, gfn_t gfn)
1027{ 1046{
1028 unsigned long addr = gfn_to_hva_memslot(slot, gfn); 1047 unsigned long addr = gfn_to_hva_memslot(slot, gfn);
1029 return hva_to_pfn(kvm, addr, false); 1048 return hva_to_pfn(kvm, addr, false, NULL);
1030} 1049}
1031 1050
1032int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 1051int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
@@ -2336,6 +2355,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2336 goto out_free_5; 2355 goto out_free_5;
2337 } 2356 }
2338 2357
2358 r = kvm_async_pf_init();
2359 if (r)
2360 goto out_free;
2361
2339 kvm_chardev_ops.owner = module; 2362 kvm_chardev_ops.owner = module;
2340 kvm_vm_fops.owner = module; 2363 kvm_vm_fops.owner = module;
2341 kvm_vcpu_fops.owner = module; 2364 kvm_vcpu_fops.owner = module;
@@ -2343,7 +2366,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2343 r = misc_register(&kvm_dev); 2366 r = misc_register(&kvm_dev);
2344 if (r) { 2367 if (r) {
2345 printk(KERN_ERR "kvm: misc device register failed\n"); 2368 printk(KERN_ERR "kvm: misc device register failed\n");
2346 goto out_free; 2369 goto out_unreg;
2347 } 2370 }
2348 2371
2349 kvm_preempt_ops.sched_in = kvm_sched_in; 2372 kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -2353,6 +2376,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2353 2376
2354 return 0; 2377 return 0;
2355 2378
2379out_unreg:
2380 kvm_async_pf_deinit();
2356out_free: 2381out_free:
2357 kmem_cache_destroy(kvm_vcpu_cache); 2382 kmem_cache_destroy(kvm_vcpu_cache);
2358out_free_5: 2383out_free_5:
@@ -2385,6 +2410,7 @@ void kvm_exit(void)
2385 kvm_exit_debug(); 2410 kvm_exit_debug();
2386 misc_deregister(&kvm_dev); 2411 misc_deregister(&kvm_dev);
2387 kmem_cache_destroy(kvm_vcpu_cache); 2412 kmem_cache_destroy(kvm_vcpu_cache);
2413 kvm_async_pf_deinit();
2388 sysdev_unregister(&kvm_sysdev); 2414 sysdev_unregister(&kvm_sysdev);
2389 sysdev_class_unregister(&kvm_sysdev_class); 2415 sysdev_class_unregister(&kvm_sysdev_class);
2390 unregister_reboot_notifier(&kvm_reboot_notifier); 2416 unregister_reboot_notifier(&kvm_reboot_notifier);