diff options
author | Michael S. Tsirkin <mst@redhat.com> | 2012-06-24 12:24:34 -0400 |
---|---|---|
committer | Avi Kivity <avi@redhat.com> | 2012-06-25 05:38:06 -0400 |
commit | ab9cf4996bb989983e73da894b8dd0239aa2c3c2 (patch) | |
tree | 45d97f795e03f8c18f6dc110887f34391cb7a8e5 /arch | |
parent | 8680b94b0e6046af2644c17313287ec0cb5843dc (diff) |
KVM guest: guest side for eoi avoidance
The idea is simple: there's a bit, per APIC, in guest memory,
that tells the guest that it does not need EOI.
Guest tests it using a single test and clear operation - this is
necessary so that host can detect interrupt nesting - and if set, it can
skip the EOI MSR.
I ran a simple microbenchmark to show exit reduction
(note: for testing, need to apply follow-up patch
'kvm: host side for eoi optimization' + a qemu patch
I posted separately, on host):
Before:
Performance counter stats for 'sleep 1s':
47,357 kvm:kvm_entry [99.98%]
0 kvm:kvm_hypercall [99.98%]
0 kvm:kvm_hv_hypercall [99.98%]
5,001 kvm:kvm_pio [99.98%]
0 kvm:kvm_cpuid [99.98%]
22,124 kvm:kvm_apic [99.98%]
49,849 kvm:kvm_exit [99.98%]
21,115 kvm:kvm_inj_virq [99.98%]
0 kvm:kvm_inj_exception [99.98%]
0 kvm:kvm_page_fault [99.98%]
22,937 kvm:kvm_msr [99.98%]
0 kvm:kvm_cr [99.98%]
0 kvm:kvm_pic_set_irq [99.98%]
0 kvm:kvm_apic_ipi [99.98%]
22,207 kvm:kvm_apic_accept_irq [99.98%]
22,421 kvm:kvm_eoi [99.98%]
0 kvm:kvm_pv_eoi [99.99%]
0 kvm:kvm_nested_vmrun [99.99%]
0 kvm:kvm_nested_intercepts [99.99%]
0 kvm:kvm_nested_vmexit [99.99%]
0 kvm:kvm_nested_vmexit_inject [99.99%]
0 kvm:kvm_nested_intr_vmexit [99.99%]
0 kvm:kvm_invlpga [99.99%]
0 kvm:kvm_skinit [99.99%]
57 kvm:kvm_emulate_insn [99.99%]
0 kvm:vcpu_match_mmio [99.99%]
0 kvm:kvm_userspace_exit [99.99%]
2 kvm:kvm_set_irq [99.99%]
2 kvm:kvm_ioapic_set_irq [99.99%]
23,609 kvm:kvm_msi_set_irq [99.99%]
1 kvm:kvm_ack_irq [99.99%]
131 kvm:kvm_mmio [99.99%]
226 kvm:kvm_fpu [100.00%]
0 kvm:kvm_age_page [100.00%]
0 kvm:kvm_try_async_get_page [100.00%]
0 kvm:kvm_async_pf_doublefault [100.00%]
0 kvm:kvm_async_pf_not_present [100.00%]
0 kvm:kvm_async_pf_ready [100.00%]
0 kvm:kvm_async_pf_completed
1.002100578 seconds time elapsed
After:
Performance counter stats for 'sleep 1s':
28,354 kvm:kvm_entry [99.98%]
0 kvm:kvm_hypercall [99.98%]
0 kvm:kvm_hv_hypercall [99.98%]
1,347 kvm:kvm_pio [99.98%]
0 kvm:kvm_cpuid [99.98%]
1,931 kvm:kvm_apic [99.98%]
29,595 kvm:kvm_exit [99.98%]
24,884 kvm:kvm_inj_virq [99.98%]
0 kvm:kvm_inj_exception [99.98%]
0 kvm:kvm_page_fault [99.98%]
1,986 kvm:kvm_msr [99.98%]
0 kvm:kvm_cr [99.98%]
0 kvm:kvm_pic_set_irq [99.98%]
0 kvm:kvm_apic_ipi [99.99%]
25,953 kvm:kvm_apic_accept_irq [99.99%]
26,132 kvm:kvm_eoi [99.99%]
26,593 kvm:kvm_pv_eoi [99.99%]
0 kvm:kvm_nested_vmrun [99.99%]
0 kvm:kvm_nested_intercepts [99.99%]
0 kvm:kvm_nested_vmexit [99.99%]
0 kvm:kvm_nested_vmexit_inject [99.99%]
0 kvm:kvm_nested_intr_vmexit [99.99%]
0 kvm:kvm_invlpga [99.99%]
0 kvm:kvm_skinit [99.99%]
284 kvm:kvm_emulate_insn [99.99%]
68 kvm:vcpu_match_mmio [99.99%]
68 kvm:kvm_userspace_exit [99.99%]
2 kvm:kvm_set_irq [99.99%]
2 kvm:kvm_ioapic_set_irq [99.99%]
28,288 kvm:kvm_msi_set_irq [99.99%]
1 kvm:kvm_ack_irq [99.99%]
131 kvm:kvm_mmio [100.00%]
588 kvm:kvm_fpu [100.00%]
0 kvm:kvm_age_page [100.00%]
0 kvm:kvm_try_async_get_page [100.00%]
0 kvm:kvm_async_pf_doublefault [100.00%]
0 kvm:kvm_async_pf_not_present [100.00%]
0 kvm:kvm_async_pf_ready [100.00%]
0 kvm:kvm_async_pf_completed
1.002039622 seconds time elapsed
We see that the number of exits is almost halved.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/x86/include/asm/kvm_para.h | 7 | ||||
-rw-r--r-- | arch/x86/kernel/kvm.c | 57 |
2 files changed, 61 insertions, 3 deletions
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 63ab1661d00e..2f7712e08b1e 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h | |||
@@ -22,6 +22,7 @@ | |||
22 | #define KVM_FEATURE_CLOCKSOURCE2 3 | 22 | #define KVM_FEATURE_CLOCKSOURCE2 3 |
23 | #define KVM_FEATURE_ASYNC_PF 4 | 23 | #define KVM_FEATURE_ASYNC_PF 4 |
24 | #define KVM_FEATURE_STEAL_TIME 5 | 24 | #define KVM_FEATURE_STEAL_TIME 5 |
25 | #define KVM_FEATURE_PV_EOI 6 | ||
25 | 26 | ||
26 | /* The last 8 bits are used to indicate how to interpret the flags field | 27 | /* The last 8 bits are used to indicate how to interpret the flags field |
27 | * in pvclock structure. If no bits are set, all flags are ignored. | 28 | * in pvclock structure. If no bits are set, all flags are ignored. |
@@ -37,6 +38,7 @@ | |||
37 | #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 | 38 | #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 |
38 | #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 | 39 | #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 |
39 | #define MSR_KVM_STEAL_TIME 0x4b564d03 | 40 | #define MSR_KVM_STEAL_TIME 0x4b564d03 |
41 | #define MSR_KVM_PV_EOI_EN 0x4b564d04 | ||
40 | 42 | ||
41 | struct kvm_steal_time { | 43 | struct kvm_steal_time { |
42 | __u64 steal; | 44 | __u64 steal; |
@@ -89,6 +91,11 @@ struct kvm_vcpu_pv_apf_data { | |||
89 | __u32 enabled; | 91 | __u32 enabled; |
90 | }; | 92 | }; |
91 | 93 | ||
94 | #define KVM_PV_EOI_BIT 0 | ||
95 | #define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) | ||
96 | #define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK | ||
97 | #define KVM_PV_EOI_DISABLED 0x0 | ||
98 | |||
92 | #ifdef __KERNEL__ | 99 | #ifdef __KERNEL__ |
93 | #include <asm/processor.h> | 100 | #include <asm/processor.h> |
94 | 101 | ||
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e554e5ad2fe8..75ab94c75c7a 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -39,6 +39,8 @@ | |||
39 | #include <asm/desc.h> | 39 | #include <asm/desc.h> |
40 | #include <asm/tlbflush.h> | 40 | #include <asm/tlbflush.h> |
41 | #include <asm/idle.h> | 41 | #include <asm/idle.h> |
42 | #include <asm/apic.h> | ||
43 | #include <asm/apicdef.h> | ||
42 | 44 | ||
43 | static int kvmapf = 1; | 45 | static int kvmapf = 1; |
44 | 46 | ||
@@ -283,6 +285,22 @@ static void kvm_register_steal_time(void) | |||
283 | cpu, __pa(st)); | 285 | cpu, __pa(st)); |
284 | } | 286 | } |
285 | 287 | ||
288 | static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; | ||
289 | |||
290 | static void kvm_guest_apic_eoi_write(u32 reg, u32 val) | ||
291 | { | ||
292 | /** | ||
293 | * This relies on __test_and_clear_bit to modify the memory | ||
294 | * in a way that is atomic with respect to the local CPU. | ||
295 | * The hypervisor only accesses this memory from the local CPU so | ||
296 | * there's no need for lock or memory barriers. | ||
297 | * An optimization barrier is implied in apic write. | ||
298 | */ | ||
299 | if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) | ||
300 | return; | ||
301 | apic->write(APIC_EOI, APIC_EOI_ACK); | ||
302 | } | ||
303 | |||
286 | void __cpuinit kvm_guest_cpu_init(void) | 304 | void __cpuinit kvm_guest_cpu_init(void) |
287 | { | 305 | { |
288 | if (!kvm_para_available()) | 306 | if (!kvm_para_available()) |
@@ -300,11 +318,20 @@ void __cpuinit kvm_guest_cpu_init(void) | |||
300 | smp_processor_id()); | 318 | smp_processor_id()); |
301 | } | 319 | } |
302 | 320 | ||
321 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { | ||
322 | unsigned long pa; | ||
323 | /* Size alignment is implied but just to make it explicit. */ | ||
324 | BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); | ||
325 | __get_cpu_var(kvm_apic_eoi) = 0; | ||
326 | pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; | ||
327 | wrmsrl(MSR_KVM_PV_EOI_EN, pa); | ||
328 | } | ||
329 | |||
303 | if (has_steal_clock) | 330 | if (has_steal_clock) |
304 | kvm_register_steal_time(); | 331 | kvm_register_steal_time(); |
305 | } | 332 | } |
306 | 333 | ||
307 | static void kvm_pv_disable_apf(void *unused) | 334 | static void kvm_pv_disable_apf(void) |
308 | { | 335 | { |
309 | if (!__get_cpu_var(apf_reason).enabled) | 336 | if (!__get_cpu_var(apf_reason).enabled) |
310 | return; | 337 | return; |
@@ -316,11 +343,23 @@ static void kvm_pv_disable_apf(void *unused) | |||
316 | smp_processor_id()); | 343 | smp_processor_id()); |
317 | } | 344 | } |
318 | 345 | ||
346 | static void kvm_pv_guest_cpu_reboot(void *unused) | ||
347 | { | ||
348 | /* | ||
349 | * We disable PV EOI before we load a new kernel by kexec, | ||
350 | * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. | ||
351 | * New kernel can re-enable when it boots. | ||
352 | */ | ||
353 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) | ||
354 | wrmsrl(MSR_KVM_PV_EOI_EN, 0); | ||
355 | kvm_pv_disable_apf(); | ||
356 | } | ||
357 | |||
319 | static int kvm_pv_reboot_notify(struct notifier_block *nb, | 358 | static int kvm_pv_reboot_notify(struct notifier_block *nb, |
320 | unsigned long code, void *unused) | 359 | unsigned long code, void *unused) |
321 | { | 360 | { |
322 | if (code == SYS_RESTART) | 361 | if (code == SYS_RESTART) |
323 | on_each_cpu(kvm_pv_disable_apf, NULL, 1); | 362 | on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); |
324 | return NOTIFY_DONE; | 363 | return NOTIFY_DONE; |
325 | } | 364 | } |
326 | 365 | ||
@@ -371,7 +410,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy) | |||
371 | static void kvm_guest_cpu_offline(void *dummy) | 410 | static void kvm_guest_cpu_offline(void *dummy) |
372 | { | 411 | { |
373 | kvm_disable_steal_time(); | 412 | kvm_disable_steal_time(); |
374 | kvm_pv_disable_apf(NULL); | 413 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) |
414 | wrmsrl(MSR_KVM_PV_EOI_EN, 0); | ||
415 | kvm_pv_disable_apf(); | ||
375 | apf_task_wake_all(); | 416 | apf_task_wake_all(); |
376 | } | 417 | } |
377 | 418 | ||
@@ -424,6 +465,16 @@ void __init kvm_guest_init(void) | |||
424 | pv_time_ops.steal_clock = kvm_steal_clock; | 465 | pv_time_ops.steal_clock = kvm_steal_clock; |
425 | } | 466 | } |
426 | 467 | ||
468 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { | ||
469 | struct apic **drv; | ||
470 | |||
471 | for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { | ||
472 | /* Should happen once for each apic */ | ||
473 | WARN_ON((*drv)->eoi_write == kvm_guest_apic_eoi_write); | ||
474 | (*drv)->eoi_write = kvm_guest_apic_eoi_write; | ||
475 | } | ||
476 | } | ||
477 | |||
427 | #ifdef CONFIG_SMP | 478 | #ifdef CONFIG_SMP |
428 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; | 479 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; |
429 | register_cpu_notifier(&kvm_cpu_notifier); | 480 | register_cpu_notifier(&kvm_cpu_notifier); |