author	Gleb Natapov <gleb@redhat.com>	2010-10-14 05:22:52 -0400
committer	Avi Kivity <avi@redhat.com>	2011-01-12 04:23:16 -0500
commit	631bc4878220932fe67fc46fc7cf7cccdb1ec597 (patch)
tree	ac588182d02308a004d45a9c3ae6834d096e263d
parent	fd10cde9294f73eeccbc16f3fec1ae6cde7b800c (diff)
KVM: Handle async PF in a guest.
When the async PF capability is detected, hook up a special page fault
handler that handles async page fault events and passes all other page
faults through to the regular page fault handler. Also add async PF
handling to nested SVM emulation: an async PF always generates an exit
to L1, where the vcpu thread will be scheduled out until the page is
available.

Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
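In outline, the mechanism the patch wires up is: the host records an event
reason in a per-cpu data area shared with the guest and injects a #PF whose
CR2 carries a token; the guest's special handler reads and resets that
reason, then either puts the faulting task to sleep on the token
(PAGE_NOT_PRESENT) or wakes the task sleeping on it (PAGE_READY), and falls
through to the ordinary fault path when no reason is set. Below is a
minimal, compilable userspace sketch of that dispatch; everything in it
(the globals modeling CR2 and the per-cpu reason word, and the stubbed
wait/wake/fault helpers) is a hypothetical stand-in for what the patch
itself defines, not kernel code.

	#include <stdint.h>
	#include <stdio.h>

	#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
	#define KVM_PV_REASON_PAGE_READY       2

	/* Hypothetical stand-ins for per-cpu state the patch adds. */
	static uint32_t apf_reason;	/* models the shared reason word */
	static unsigned long cr2;	/* models the token passed via CR2 */

	static uint32_t read_and_reset_pf_reason(void)
	{
		uint32_t r = apf_reason;
		apf_reason = 0;		/* reset so a nested fault sees 0 */
		return r;
	}

	/* Stubs standing in for the real wait/wake/fault paths. */
	static void task_wait(uint32_t t)  { printf("sleep on token %u\n", t); }
	static void task_wake(uint32_t t)  { printf("wake token %u\n", t); }
	static void regular_fault(unsigned long ec) { printf("normal #PF, ec=%lx\n", ec); }

	static void model_async_page_fault(unsigned long error_code)
	{
		switch (read_and_reset_pf_reason()) {
		case KVM_PV_REASON_PAGE_NOT_PRESENT:	/* host is paging it in */
			task_wait((uint32_t)cr2);
			break;
		case KVM_PV_REASON_PAGE_READY:		/* page arrived: wake sleeper */
			task_wake((uint32_t)cr2);
			break;
		default:				/* no reason set: ordinary fault */
			regular_fault(error_code);
		}
	}

	int main(void)
	{
		cr2 = 0x1234;
		apf_reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
		model_async_page_fault(0);
		apf_reason = KVM_PV_REASON_PAGE_READY;
		model_async_page_fault(0);
		model_async_page_fault(2);	/* reason is 0: regular path */
		return 0;
	}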
-rw-r--r--	arch/x86/include/asm/kvm_para.h	12
-rw-r--r--	arch/x86/include/asm/traps.h	1
-rw-r--r--	arch/x86/kernel/entry_32.S	10
-rw-r--r--	arch/x86/kernel/entry_64.S	3
-rw-r--r--	arch/x86/kernel/kvm.c	181
-rw-r--r--	arch/x86/kvm/svm.c	45
6 files changed, 243 insertions(+), 9 deletions(-)
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 2315398230d1..fbfd3679bc18 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -65,6 +65,9 @@ struct kvm_mmu_op_release_pt {
 	__u64 pt_phys;
 };
 
+#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
+#define KVM_PV_REASON_PAGE_READY 2
+
 struct kvm_vcpu_pv_apf_data {
 	__u32 reason;
 	__u8 pad[60];
@@ -171,8 +174,17 @@ static inline unsigned int kvm_arch_para_features(void)
 
 #ifdef CONFIG_KVM_GUEST
 void __init kvm_guest_init(void);
+void kvm_async_pf_task_wait(u32 token);
+void kvm_async_pf_task_wake(u32 token);
+u32 kvm_read_and_reset_pf_reason(void);
 #else
 #define kvm_guest_init() do { } while (0)
+#define kvm_async_pf_task_wait(T) do {} while(0)
+#define kvm_async_pf_task_wake(T) do {} while(0)
+static inline u32 kvm_read_and_reset_pf_reason(void)
+{
+	return 0;
+}
 #endif
 
 #endif /* __KERNEL__ */
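A side note on the #else branch above: the no-op macros and the static
inline stub mean code elsewhere in the kernel can call these helpers
unconditionally, with no #ifdef at the call site. A hypothetical caller,
assuming this header is included:

	/* Hypothetical example, not part of the patch: compiles whether or
	 * not CONFIG_KVM_GUEST is set, because the header always provides
	 * the three names in one form or another. */
	static void handle_pv_wake_event(u32 token)
	{
		if (kvm_read_and_reset_pf_reason() == KVM_PV_REASON_PAGE_READY)
			kvm_async_pf_task_wake(token);	/* no-op stub when !CONFIG_KVM_GUEST */
	}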
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index f66cda56781d..0310da67307f 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
 asmlinkage void stack_segment(void);
 asmlinkage void general_protection(void);
 asmlinkage void page_fault(void);
+asmlinkage void async_page_fault(void);
 asmlinkage void spurious_interrupt_bug(void);
 asmlinkage void coprocessor_error(void);
 asmlinkage void alignment_check(void);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 591e60104278..c8b4efad7ebb 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1406,6 +1406,16 @@ ENTRY(general_protection)
 	CFI_ENDPROC
 END(general_protection)
 
+#ifdef CONFIG_KVM_GUEST
+ENTRY(async_page_fault)
+	RING0_EC_FRAME
+	pushl $do_async_page_fault
+	CFI_ADJUST_CFA_OFFSET 4
+	jmp error_code
+	CFI_ENDPROC
+END(async_page_fault)
+#endif
+
 /*
  * End of kprobes section
  */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e3ba417e8697..bb3f6e9bfa68 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1319,6 +1319,9 @@ errorentry xen_stack_segment do_stack_segment
 #endif
 errorentry general_protection do_general_protection
 errorentry page_fault do_page_fault
+#ifdef CONFIG_KVM_GUEST
+errorentry async_page_fault do_async_page_fault
+#endif
 #ifdef CONFIG_X86_MCE
 paranoidzeroentry machine_check *machine_check_vector(%rip)
 #endif
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 032d03b6b54a..d5640634fef6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -29,8 +29,14 @@
 #include <linux/hardirq.h>
 #include <linux/notifier.h>
 #include <linux/reboot.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kprobes.h>
 #include <asm/timer.h>
 #include <asm/cpu.h>
+#include <asm/traps.h>
+#include <asm/desc.h>
 
 #define MMU_QUEUE_SIZE 1024
 
@@ -64,6 +70,168 @@ static void kvm_io_delay(void)
 {
 }
 
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+	struct hlist_node link;
+	wait_queue_head_t wq;
+	u32 token;
+	int cpu;
+};
+
+static struct kvm_task_sleep_head {
+	spinlock_t lock;
+	struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+						  u32 token)
+{
+	struct hlist_node *p;
+
+	hlist_for_each(p, &b->list) {
+		struct kvm_task_sleep_node *n =
+			hlist_entry(p, typeof(*n), link);
+		if (n->token == token)
+			return n;
+	}
+
+	return NULL;
+}
+
+void kvm_async_pf_task_wait(u32 token)
+{
+	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node n, *e;
+	DEFINE_WAIT(wait);
+
+	spin_lock(&b->lock);
+	e = _find_apf_task(b, token);
+	if (e) {
+		/* dummy entry exists -> wake up was delivered ahead of PF */
+		hlist_del(&e->link);
+		kfree(e);
+		spin_unlock(&b->lock);
+		return;
+	}
+
+	n.token = token;
+	n.cpu = smp_processor_id();
+	init_waitqueue_head(&n.wq);
+	hlist_add_head(&n.link, &b->list);
+	spin_unlock(&b->lock);
+
+	for (;;) {
+		prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (hlist_unhashed(&n.link))
+			break;
+		local_irq_enable();
+		schedule();
+		local_irq_disable();
+	}
+	finish_wait(&n.wq, &wait);
+
+	return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
+
+static void apf_task_wake_one(struct kvm_task_sleep_node *n)
+{
+	hlist_del_init(&n->link);
+	if (waitqueue_active(&n->wq))
+		wake_up(&n->wq);
+}
+
+static void apf_task_wake_all(void)
+{
+	int i;
+
+	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
+		struct hlist_node *p, *next;
+		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
+		spin_lock(&b->lock);
+		hlist_for_each_safe(p, next, &b->list) {
+			struct kvm_task_sleep_node *n =
+				hlist_entry(p, typeof(*n), link);
+			if (n->cpu == smp_processor_id())
+				apf_task_wake_one(n);
+		}
+		spin_unlock(&b->lock);
+	}
+}
+
+void kvm_async_pf_task_wake(u32 token)
+{
+	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node *n;
+
+	if (token == ~0) {
+		apf_task_wake_all();
+		return;
+	}
+
+again:
+	spin_lock(&b->lock);
+	n = _find_apf_task(b, token);
+	if (!n) {
+		/*
+		 * async PF was not yet handled.
+		 * Add dummy entry for the token.
+		 */
+		n = kmalloc(sizeof(*n), GFP_ATOMIC);
+		if (!n) {
+			/*
+			 * Allocation failed! Busy wait while other cpu
+			 * handles async PF.
+			 */
+			spin_unlock(&b->lock);
+			cpu_relax();
+			goto again;
+		}
+		n->token = token;
+		n->cpu = smp_processor_id();
+		init_waitqueue_head(&n->wq);
+		hlist_add_head(&n->link, &b->list);
+	} else
+		apf_task_wake_one(n);
+	spin_unlock(&b->lock);
+	return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
+
+u32 kvm_read_and_reset_pf_reason(void)
+{
+	u32 reason = 0;
+
+	if (__get_cpu_var(apf_reason).enabled) {
+		reason = __get_cpu_var(apf_reason).reason;
+		__get_cpu_var(apf_reason).reason = 0;
+	}
+
+	return reason;
+}
+EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
+
+dotraplinkage void __kprobes
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+	switch (kvm_read_and_reset_pf_reason()) {
+	default:
+		do_page_fault(regs, error_code);
+		break;
+	case KVM_PV_REASON_PAGE_NOT_PRESENT:
+		/* page is swapped out by the host. */
+		kvm_async_pf_task_wait((u32)read_cr2());
+		break;
+	case KVM_PV_REASON_PAGE_READY:
+		kvm_async_pf_task_wake((u32)read_cr2());
+		break;
+	}
+}
+
 static void kvm_mmu_op(void *buffer, unsigned len)
 {
 	int r;
@@ -300,6 +468,7 @@ static void kvm_guest_cpu_online(void *dummy)
 static void kvm_guest_cpu_offline(void *dummy)
 {
 	kvm_pv_disable_apf(NULL);
+	apf_task_wake_all();
 }
 
 static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
@@ -327,13 +496,25 @@ static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
 };
 #endif
 
+static void __init kvm_apf_trap_init(void)
+{
+	set_intr_gate(14, &async_page_fault);
+}
+
 void __init kvm_guest_init(void)
 {
+	int i;
+
 	if (!kvm_para_available())
 		return;
 
 	paravirt_ops_setup();
 	register_reboot_notifier(&kvm_pv_reboot_nb);
+	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+		spin_lock_init(&async_pf_sleepers[i].lock);
+	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
+		x86_init.irqs.trap_init = kvm_apf_trap_init;
+
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 	register_cpu_notifier(&kvm_cpu_notifier);
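The delicate part of the kvm.c code above is ordering: a PAGE_READY wake
can arrive before the task owning the token has faulted and gone to sleep,
so kvm_async_pf_task_wake() installs a dummy node that a later
kvm_async_pf_task_wait() consumes instead of sleeping, while a real waiter
publishes its on-stack node before scheduling away. The following is a
standalone userspace model of that rendezvous, assuming nothing beyond
POSIX threads; a mutex plus condition variable stands in for the kernel's
spinlock and waitqueue. It illustrates the idea and is not the kernel code.

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* One hash bucket's worth of state; the kernel keeps 256 buckets. */
	struct sleeper {
		unsigned token;
		bool dummy;		/* wake arrived before the waiter did */
		bool done;
		struct sleeper *next;
		pthread_cond_t cv;
	};

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static struct sleeper *head;

	static struct sleeper *find(unsigned token)
	{
		struct sleeper *n;

		for (n = head; n; n = n->next)
			if (n->token == token)
				return n;
		return NULL;
	}

	static void unlink_node(struct sleeper *n)
	{
		struct sleeper **pp;

		for (pp = &head; *pp != n; pp = &(*pp)->next)
			;
		*pp = n->next;
	}

	/* Models kvm_async_pf_task_wait(): called by the task that faulted. */
	static void model_wait(unsigned token)
	{
		pthread_mutex_lock(&lock);
		struct sleeper *e = find(token);
		if (e && e->dummy) {	/* wake was delivered ahead of the fault */
			unlink_node(e);
			free(e);
			pthread_mutex_unlock(&lock);
			return;		/* page already in: no need to sleep */
		}

		struct sleeper n = { .token = token, .cv = PTHREAD_COND_INITIALIZER };
		n.next = head;		/* publish before sleeping, under the lock */
		head = &n;
		while (!n.done)
			pthread_cond_wait(&n.cv, &lock);
		pthread_mutex_unlock(&lock);
	}

	/* Models kvm_async_pf_task_wake(): called when the page is ready. */
	static void model_wake(unsigned token)
	{
		pthread_mutex_lock(&lock);
		struct sleeper *n = find(token);
		if (!n) {	/* waiter not there yet: leave a dummy entry */
			/* the kernel busy-waits on allocation failure;
			 * this sketch simply assumes calloc succeeds */
			n = calloc(1, sizeof(*n));
			n->token = token;
			n->dummy = true;
			n->next = head;
			head = n;
		} else {	/* waiter is asleep: unlink and signal it */
			unlink_node(n);
			n->done = true;
			pthread_cond_signal(&n->cv);
		}
		pthread_mutex_unlock(&lock);
	}

	int main(void)
	{
		model_wake(42);		/* PAGE_READY delivered first... */
		model_wait(42);		/* ...so the waiter returns immediately */
		printf("wake-before-wait handled via dummy entry\n");
		return 0;
	}

Built with `cc model.c -pthread`, main() exercises the wake-before-wait
case: the wake leaves a dummy entry, and the subsequent wait finds it and
returns without blocking, which is exactly the race the kernel comment in
kvm_async_pf_task_wait() calls out.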
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b81a9b7c2ca4..93e8120b8021 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -31,6 +31,7 @@
 
 #include <asm/tlbflush.h>
 #include <asm/desc.h>
+#include <asm/kvm_para.h>
 
 #include <asm/virtext.h>
 #include "trace.h"
@@ -133,6 +134,7 @@ struct vcpu_svm {
 
 	unsigned int3_injected;
 	unsigned long int3_rip;
+	u32 apf_reason;
 };
 
 #define MSR_INVALID 0xffffffffU
@@ -1383,16 +1385,33 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1383 1385
1384static int pf_interception(struct vcpu_svm *svm) 1386static int pf_interception(struct vcpu_svm *svm)
1385{ 1387{
1386 u64 fault_address; 1388 u64 fault_address = svm->vmcb->control.exit_info_2;
1387 u32 error_code; 1389 u32 error_code;
1390 int r = 1;
1388 1391
1389 fault_address = svm->vmcb->control.exit_info_2; 1392 switch (svm->apf_reason) {
1390 error_code = svm->vmcb->control.exit_info_1; 1393 default:
1394 error_code = svm->vmcb->control.exit_info_1;
1391 1395
1392 trace_kvm_page_fault(fault_address, error_code); 1396 trace_kvm_page_fault(fault_address, error_code);
1393 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) 1397 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1394 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1398 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1395 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1399 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1400 break;
1401 case KVM_PV_REASON_PAGE_NOT_PRESENT:
1402 svm->apf_reason = 0;
1403 local_irq_disable();
1404 kvm_async_pf_task_wait(fault_address);
1405 local_irq_enable();
1406 break;
1407 case KVM_PV_REASON_PAGE_READY:
1408 svm->apf_reason = 0;
1409 local_irq_disable();
1410 kvm_async_pf_task_wake(fault_address);
1411 local_irq_enable();
1412 break;
1413 }
1414 return r;
1396} 1415}
1397 1416
1398static int db_interception(struct vcpu_svm *svm) 1417static int db_interception(struct vcpu_svm *svm)
@@ -1836,8 +1855,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
 		return NESTED_EXIT_HOST;
 		break;
 	case SVM_EXIT_EXCP_BASE + PF_VECTOR:
-		/* When we're shadowing, trap PFs */
-		if (!npt_enabled)
+		/* When we're shadowing, trap PFs, but not async PF */
+		if (!npt_enabled && svm->apf_reason == 0)
 			return NESTED_EXIT_HOST;
 		break;
 	case SVM_EXIT_EXCP_BASE + NM_VECTOR:
@@ -1893,6 +1912,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
 		if (svm->nested.intercept_exceptions & excp_bits)
 			vmexit = NESTED_EXIT_DONE;
+		/* an async page fault always causes a vmexit */
+		else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
+			 svm->apf_reason != 0)
+			vmexit = NESTED_EXIT_DONE;
 		break;
 	}
 	case SVM_EXIT_ERR: {
@@ -3414,6 +3437,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	svm->next_rip = 0;
 
+	/* if exit due to PF check for async PF */
+	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
+		svm->apf_reason = kvm_read_and_reset_pf_reason();
+
 	if (npt_enabled) {
 		vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
 		vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
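Taken together, the svm.c changes amount to a simple routing rule for the
shadow-paging (!npt_enabled) case, which the hypothetical sketch below
condenses; the real decision is split across nested_svm_exit_special()
and nested_svm_intercept() above, and this is a paraphrase, not the code
itself.

	enum pf_target { HANDLED_IN_L0, REFLECTED_TO_L1 };

	/* Condensed model of nested #PF routing after this patch
	 * (shadow-paging case only). */
	static enum pf_target nested_pf_route(unsigned apf_reason)
	{
		/* A plain shadow #PF stays in L0, as before the patch... */
		if (apf_reason == 0)
			return HANDLED_IN_L0;
		/* ...but an async PF always exits to L1, which is where
		 * the waiting vcpu thread gets scheduled out. */
		return REFLECTED_TO_L1;
	}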