author     Jan Kiszka <jan.kiszka@siemens.com>        2014-03-07 14:03:13 -0500
committer  Paolo Bonzini <pbonzini@redhat.com>        2014-03-11 03:41:45 -0400
commit     f4124500c2c13eb1208c6143b3f6d469709dea10 (patch)
tree       bb1a937a6a240cf3782c63223ca9584de1a7b63f
parent     b6b8a1451fc40412c57d10c94b62e22acab28f94 (diff)
KVM: nVMX: Fully emulate preemption timer
We cannot rely on the hardware-provided preemption timer support because
we are holding L2 in HLT outside non-root mode. Furthermore, emulating
the preemption timer will resolve tick rate errata on older Intel CPUs.

The emulation is based on an hrtimer which is started on L2 entry,
stopped on L2 exit, and evaluated via the new check_nested_events hook.
As we no longer rely on hardware features, we can enable both the
preemption timer support and value saving unconditionally.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
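For a feel for the units involved: the VMX preemption timer counts down once
every 2^rate TSC cycles, and this patch advertises an emulated rate of 5,
i.e. one tick per 32 TSC cycles. The standalone sketch below mirrors the
conversions performed in vmx_start_preemption_timer() and
vmx_get_preemption_timer_value(); here tsc_khz stands in for
vcpu->arch.virtual_tsc_khz, and the helper names are illustrative only, not
part of the patch.

#include <stdint.h>
#include <stdio.h>

#define EMULATED_PREEMPTION_TIMER_RATE 5  /* one tick per 32 TSC cycles */

/* L1's timer value -> hrtimer timeout in nanoseconds (L2 entry path). */
static uint64_t timer_value_to_ns(uint64_t value, uint64_t tsc_khz)
{
	uint64_t cycles = value << EMULATED_PREEMPTION_TIMER_RATE;

	/* tsc_khz TSC cycles elapse per millisecond; scale up to ns. */
	return cycles * 1000000 / tsc_khz;
}

/* Remaining nanoseconds -> timer value saved back to vmcs12 (exit path). */
static uint64_t ns_to_timer_value(uint64_t ns, uint64_t tsc_khz)
{
	return (ns * tsc_khz / 1000000) >> EMULATED_PREEMPTION_TIMER_RATE;
}

int main(void)
{
	/* A value of 1000 at a 2.6 GHz TSC is 32000 cycles, about 12.3 us. */
	uint64_t ns = timer_value_to_ns(1000, 2600000);

	printf("%llu ns -> value %llu\n", (unsigned long long)ns,
	       (unsigned long long)ns_to_timer_value(ns, 2600000));
	return 0;
}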
-rw-r--r--  arch/x86/kvm/vmx.c  151
1 file changed, 96 insertions(+), 55 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 11718b44a62d..e559675e113f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -31,6 +31,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 #include <linux/tboot.h>
+#include <linux/hrtimer.h>
 #include "kvm_cache_regs.h"
 #include "x86.h"
 
@@ -110,6 +111,8 @@ module_param(nested, bool, S_IRUGO);
 
 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
 
+#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
+
 /*
  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
  * ple_gap: upper bound on the amount of time between two successive
@@ -374,6 +377,9 @@ struct nested_vmx {
 	 */
 	struct page *apic_access_page;
 	u64 msr_ia32_feature_control;
+
+	struct hrtimer preemption_timer;
+	bool preemption_timer_expired;
 };
 
 #define POSTED_INTR_ON 0
@@ -1048,6 +1054,12 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
+{
+	return vmcs12->pin_based_vm_exec_control &
+		PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
 {
 	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
@@ -2253,9 +2265,9 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	 */
 	nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 	nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
-		PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS |
+		PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
+	nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 		PIN_BASED_VMX_PREEMPTION_TIMER;
-	nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 
 	/*
 	 * Exit controls
@@ -2270,15 +2282,10 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #ifdef CONFIG_X86_64
 		VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+	nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
 		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-	if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) ||
-	    !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
-		nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-		nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
-	}
-	nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
-		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
 
 	/* entry controls */
 	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2347,9 +2354,9 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 
 	/* miscellaneous data */
 	rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
-	nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
-		VMX_MISC_SAVE_EFER_LMA;
-	nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT;
+	nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
+	nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
+		VMX_MISC_ACTIVITY_HLT;
 	nested_vmx_misc_high = 0;
 }
 
@@ -5713,6 +5720,18 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
 	 */
 }
 
+static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
+{
+	struct vcpu_vmx *vmx =
+		container_of(timer, struct vcpu_vmx, nested.preemption_timer);
+
+	vmx->nested.preemption_timer_expired = true;
+	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+	kvm_vcpu_kick(&vmx->vcpu);
+
+	return HRTIMER_NORESTART;
+}
+
 /*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
@@ -5777,6 +5796,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
 	vmx->nested.vmcs02_num = 0;
 
+	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_REL);
+	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+
 	vmx->nested.vmxon = true;
 
 	skip_emulated_instruction(vcpu);
@@ -6753,9 +6776,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		 * table is L0's fault.
 		 */
 		return 0;
-	case EXIT_REASON_PREEMPTION_TIMER:
-		return vmcs12->pin_based_vm_exec_control &
-			PIN_BASED_VMX_PREEMPTION_TIMER;
 	case EXIT_REASON_WBINVD:
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
 	case EXIT_REASON_XSETBV:
@@ -6771,27 +6791,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 
-static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
-{
-	u64 delta_tsc_l1;
-	u32 preempt_val_l1, preempt_val_l2, preempt_scale;
-
-	if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control &
-			PIN_BASED_VMX_PREEMPTION_TIMER))
-		return;
-	preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
-			MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
-	preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
-	delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc())
-		- vcpu->arch.last_guest_tsc;
-	preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
-	if (preempt_val_l2 <= preempt_val_l1)
-		preempt_val_l2 = 0;
-	else
-		preempt_val_l2 -= preempt_val_l1;
-	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
-}
-
 /*
  * The guest has exited. See if we can fix it or if we need userspace
  * assistance.
@@ -7210,8 +7209,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	atomic_switch_perf_msrs(vmx);
 	debugctlmsr = get_debugctlmsr();
 
-	if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending)
-		nested_adjust_preemption_timer(vcpu);
 	vmx->__launched = vmx->loaded_vmcs->launched;
 	asm(
 		/* Store host registers */
@@ -7608,6 +7605,28 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 	kvm_inject_page_fault(vcpu, fault);
 }
 
+static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
+{
+	u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (vcpu->arch.virtual_tsc_khz == 0)
+		return;
+
+	/* Make sure short timeouts reliably trigger an immediate vmexit.
+	 * hrtimer_start does not guarantee this. */
+	if (preemption_timeout <= 1) {
+		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
+		return;
+	}
+
+	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+	preemption_timeout *= 1000000;
+	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
+	hrtimer_start(&vmx->nested.preemption_timer,
+		      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
+}
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7621,7 +7640,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exec_control;
-	u32 exit_control;
 
 	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
 	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -7679,13 +7697,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
-	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
-		     (vmcs_config.pin_based_exec_ctrl |
-		      vmcs12->pin_based_vm_exec_control));
+	exec_control = vmcs12->pin_based_vm_exec_control;
+	exec_control |= vmcs_config.pin_based_exec_ctrl;
+	exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
 
-	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
-		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
-			     vmcs12->vmx_preemption_timer_value);
+	vmx->nested.preemption_timer_expired = false;
+	if (nested_cpu_has_preemption_timer(vmcs12))
+		vmx_start_preemption_timer(vcpu);
 
 	/*
 	 * Whether page-faults are trapped is determined by a combination of
@@ -7713,7 +7732,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 			enable_ept ? vmcs12->page_fault_error_code_match : 0);
 
 	if (cpu_has_secondary_exec_ctrls()) {
-		u32 exec_control = vmx_secondary_exec_control(vmx);
+		exec_control = vmx_secondary_exec_control(vmx);
 		if (!vmx->rdtscp_enabled)
 			exec_control &= ~SECONDARY_EXEC_RDTSCP;
 		/* Take the following fields only from vmcs12 */
@@ -7800,10 +7819,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
 	 * bits are further modified by vmx_set_efer() below.
 	 */
-	exit_control = vmcs_config.vmexit_ctrl;
-	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
-		exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-	vm_exit_controls_init(vmx, exit_control);
+	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
 
 	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
 	 * emulated by vmx_set_efer(), below.
@@ -8151,6 +8167,14 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
+	    vmx->nested.preemption_timer_expired) {
+		if (vmx->nested.nested_run_pending)
+			return -EBUSY;
+		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
+		return 0;
+	}
+
 	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
 		if (vmx->nested.nested_run_pending)
 			return -EBUSY;
@@ -8176,6 +8200,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
 		return 0;
 }
 
+static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
+{
+	ktime_t remaining =
+		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
+	u64 value;
+
+	if (ktime_to_ns(remaining) <= 0)
+		return 0;
+
+	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
+	do_div(value, 1000000);
+	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+}
+
 /*
  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
@@ -8246,10 +8284,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	else
 		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
 
-	if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
-	    (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
-		vmcs12->vmx_preemption_timer_value =
-			vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+	if (nested_cpu_has_preemption_timer(vmcs12)) {
+		if (vmcs12->vm_exit_controls &
+		    VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+			vmcs12->vmx_preemption_timer_value =
+				vmx_get_preemption_timer_value(vcpu);
+		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+	}
 
 	/*
 	 * In some cases (usually, nested EPT), L2 is allowed to change its