Diffstat (limited to 'arch/x86/kvm/vmx/nested.c')
-rw-r--r--	arch/x86/kvm/vmx/nested.c	129
1 file changed, 79 insertions, 50 deletions
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d737a51a53ca..f24a2c225070 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -211,7 +211,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
 		return;
 
-	hrtimer_cancel(&vmx->nested.preemption_timer);
 	vmx->nested.vmxon = false;
 	vmx->nested.smm.vmxon = false;
 	free_vpid(vmx->nested.vpid02);
@@ -274,6 +273,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	vcpu_load(vcpu);
+	vmx_leave_nested(vcpu);
 	vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
 	free_nested(vcpu);
 	vcpu_put(vcpu);
@@ -1980,17 +1980,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 	prepare_vmcs02_early_full(vmx, vmcs12);
 
 	/*
-	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
-	 * entry, but only if the current (host) sp changed from the value
-	 * we wrote last (vmx->host_rsp). This cache is no longer relevant
-	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
-	 * here we just force the write to happen on entry. host_rsp will
-	 * also be written unconditionally by nested_vmx_check_vmentry_hw()
-	 * if we are doing early consistency checks via hardware.
-	 */
-	vmx->host_rsp = 0;
-
-	/*
 	 * PIN CONTROLS
 	 */
 	exec_control = vmcs12->pin_based_vm_exec_control;
@@ -2289,10 +2278,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	}
 	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
 
-	vmx->nested.preemption_timer_expired = false;
-	if (nested_cpu_has_preemption_timer(vmcs12))
-		vmx_start_preemption_timer(vcpu);
-
 	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
 	 * bitwise-or of what L1 wants to trap for L2, and what we want to
 	 * trap. Note that CR0.TS also needs updating - we do this later.
@@ -2722,6 +2707,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long cr3, cr4;
+	bool vm_fail;
 
 	if (!nested_early_check)
 		return 0;
@@ -2755,29 +2741,34 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
 		vmx->loaded_vmcs->host_state.cr4 = cr4;
 	}
 
-	vmx->__launched = vmx->loaded_vmcs->launched;
-
 	asm(
-		/* Set HOST_RSP */
 		"sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
-		__ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
-		"mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t"
+		"cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
+		"je 1f \n\t"
+		__ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
+		"mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
+		"1: \n\t"
 		"add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
 
 		/* Check if vmlaunch or vmresume is needed */
-		"cmpl $0, %c[launched](%% " _ASM_CX")\n\t"
+		"cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
 
+		/*
+		 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
+		 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
+		 * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
+		 * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
+		 */
 		"call vmx_vmenter\n\t"
 
-		/* Set vmx->fail accordingly */
-		"setbe %c[fail](%% " _ASM_CX")\n\t"
-	      : ASM_CALL_CONSTRAINT
-	      : "c"(vmx), "d"((unsigned long)HOST_RSP),
-		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
-		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
-		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
+		CC_SET(be)
+	      : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
+	      : [HOST_RSP]"r"((unsigned long)HOST_RSP),
+		[loaded_vmcs]"r"(vmx->loaded_vmcs),
+		[launched]"i"(offsetof(struct loaded_vmcs, launched)),
+		[host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
 		[wordsize]"i"(sizeof(ulong))
-	      : "rax", "cc", "memory"
+	      : "cc", "memory"
 	);
 
 	preempt_enable();
@@ -2787,10 +2778,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
 	if (vmx->msr_autoload.guest.nr)
 		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
 
-	if (vmx->fail) {
+	if (vm_fail) {
 		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
 			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		vmx->fail = 0;
 		return 1;
 	}
 
@@ -2813,8 +2803,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
 
 	return 0;
 }
-STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
-
 
 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 						 struct vmcs12 *vmcs12);
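
[Illustrative aside, not part of the diff.] The rewritten exit path above relies on the kernel's CC_SET()/CC_OUT() macros: on compilers with x86 flag-output constraints (GCC 6+ and recent clang, detected via __GCC_ASM_FLAG_OUTPUTS__), an asm statement can hand a condition flag straight back to the compiler, which is why the explicit "setbe" into vmx->fail can be dropped. The standalone userspace sketch below is not kernel code; cmp_is_below_or_equal() is an invented helper that captures the same "below or equal" (CF or ZF) condition used to signal VM-Fail. Build with e.g. gcc -O2 ccdemo.c.

#include <stdbool.h>
#include <stdio.h>

/*
 * Read a condition straight out of RFLAGS after an asm block using a
 * flag-output constraint ("=@ccbe" means CF or ZF set), instead of
 * materializing it with an explicit SETcc instruction.
 */
static bool cmp_is_below_or_equal(unsigned long a, unsigned long b)
{
	bool be;

	/* AT&T syntax: computes a - b and sets CF/ZF accordingly. */
	asm("cmp %[rhs], %[lhs]"
	    : "=@ccbe" (be)
	    : [lhs] "r" (a), [rhs] "r" (b));
	return be;
}

int main(void)
{
	printf("%d %d %d\n",
	       cmp_is_below_or_equal(1, 2),  /* 1: CF set (borrow)  */
	       cmp_is_below_or_equal(2, 2),  /* 1: ZF set (equal)   */
	       cmp_is_below_or_equal(3, 2)); /* 0: neither flag set */
	return 0;
}
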
@@ -3031,6 +3019,15 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 	/*
+	 * Do not start the preemption timer hrtimer until after we know
+	 * we are successful, so that only nested_vmx_vmexit needs to cancel
+	 * the timer.
+	 */
+	vmx->nested.preemption_timer_expired = false;
+	if (nested_cpu_has_preemption_timer(vmcs12))
+		vmx_start_preemption_timer(vcpu);
+
+	/*
 	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
 	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
 	 * returned as far as L1 is concerned. It will only return (and set
@@ -3450,13 +3447,10 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	else
 		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
 
-	if (nested_cpu_has_preemption_timer(vmcs12)) {
-		if (vmcs12->vm_exit_controls &
-		    VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
-			vmcs12->vmx_preemption_timer_value =
-				vmx_get_preemption_timer_value(vcpu);
-		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
-	}
+	if (nested_cpu_has_preemption_timer(vmcs12) &&
+	    vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+		vmcs12->vmx_preemption_timer_value =
+			vmx_get_preemption_timer_value(vcpu);
 
 	/*
 	 * In some cases (usually, nested EPT), L2 is allowed to change its
@@ -3864,6 +3858,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
 	leave_guest_mode(vcpu);
 
+	if (nested_cpu_has_preemption_timer(vmcs12))
+		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
 		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
 
@@ -3915,9 +3912,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 		vmx_flush_tlb(vcpu, true);
 	}
 
-	/* This is needed for same reason as it was needed in prepare_vmcs02 */
-	vmx->host_rsp = 0;
-
 	/* Unpin physical memory we referred to in vmcs02 */
 	if (vmx->nested.apic_access_page) {
 		kvm_release_page_dirty(vmx->nested.apic_access_page);
@@ -4035,25 +4029,50 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
 	/* Addr = segment_base + offset */
 	/* offset = base + [index * scale] + displacement */
 	off = exit_qualification; /* holds the displacement */
+	if (addr_size == 1)
+		off = (gva_t)sign_extend64(off, 31);
+	else if (addr_size == 0)
+		off = (gva_t)sign_extend64(off, 15);
 	if (base_is_valid)
 		off += kvm_register_read(vcpu, base_reg);
 	if (index_is_valid)
 		off += kvm_register_read(vcpu, index_reg)<<scaling;
 	vmx_get_segment(vcpu, &s, seg_reg);
-	*ret = s.base + off;
 
+	/*
+	 * The effective address, i.e. @off, of a memory operand is truncated
+	 * based on the address size of the instruction. Note that this is
+	 * the *effective address*, i.e. the address prior to accounting for
+	 * the segment's base.
+	 */
 	if (addr_size == 1) /* 32 bit */
-		*ret &= 0xffffffff;
+		off &= 0xffffffff;
+	else if (addr_size == 0) /* 16 bit */
+		off &= 0xffff;
 
 	/* Checks for #GP/#SS exceptions. */
 	exn = false;
 	if (is_long_mode(vcpu)) {
+		/*
+		 * The virtual/linear address is never truncated in 64-bit
+		 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
+		 * address when using FS/GS with a non-zero base.
+		 */
+		*ret = s.base + off;
+
 		/* Long mode: #GP(0)/#SS(0) if the memory address is in a
 		 * non-canonical form. This is the only check on the memory
 		 * destination for long mode!
 		 */
 		exn = is_noncanonical_address(*ret, vcpu);
-	} else if (is_protmode(vcpu)) {
+	} else {
+		/*
+		 * When not in long mode, the virtual/linear address is
+		 * unconditionally truncated to 32 bits regardless of the
+		 * address size.
+		 */
+		*ret = (s.base + off) & 0xffffffff;
+
 		/* Protected mode: apply checks for segment validity in the
 		 * following order:
 		 * - segment type check (#GP(0) may be thrown)
@@ -4077,10 +4096,16 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
 		/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
 		 */
 		exn = (s.unusable != 0);
-		/* Protected mode: #GP(0)/#SS(0) if the memory
-		 * operand is outside the segment limit.
+
+		/*
+		 * Protected mode: #GP(0)/#SS(0) if the memory operand is
+		 * outside the segment limit. All CPUs that support VMX ignore
+		 * limit checks for flat segments, i.e. segments with base==0,
+		 * limit==0xffffffff and of type expand-up data or code.
 		 */
-		exn = exn || (off + sizeof(u64) > s.limit);
+		if (!(s.base == 0 && s.limit == 0xffffffff &&
+		     ((s.type & 8) || !(s.type & 4))))
+			exn = exn || (off + sizeof(u64) > s.limit);
 	}
 	if (exn) {
 		kvm_queue_exception_e(vcpu,
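
[Illustrative aside, not part of the diff.] To make the address arithmetic in the two hunks above concrete: the displacement is sign-extended to the instruction's address size, the effective address (base + index*scale + displacement) is truncated to that size, and the resulting linear address is additionally truncated to 32 bits only when the vCPU is not in 64-bit mode. The sketch below is a hypothetical userspace model, not the real get_vmx_mem_address(); vmx_operand_addr() is an invented name, sign_extend64() is re-implemented locally to mirror the kernel helper, and all segmentation checks are omitted.

#include <stdint.h>
#include <stdio.h>

/* Local stand-in for the kernel's sign_extend64() from <linux/bitops.h>. */
static uint64_t sign_extend64(uint64_t value, int index)
{
	int shift = 63 - index;

	return (uint64_t)((int64_t)(value << shift) >> shift);
}

/*
 * addr_size follows the VMX instruction-information encoding:
 * 0 = 16-bit, 1 = 32-bit, 2 = 64-bit.
 */
static uint64_t vmx_operand_addr(uint64_t seg_base, uint64_t base_reg,
				 uint64_t index_reg, int scaling,
				 uint64_t disp, int addr_size, int long_mode)
{
	uint64_t off = disp;

	/* The displacement is signed, sized by the instruction's address size. */
	if (addr_size == 1)
		off = sign_extend64(off, 31);
	else if (addr_size == 0)
		off = sign_extend64(off, 15);

	off += base_reg + (index_reg << scaling);

	/* The *effective* address wraps at the address size... */
	if (addr_size == 1)
		off &= 0xffffffff;
	else if (addr_size == 0)
		off &= 0xffff;

	/* ...but the *linear* address is only truncated outside 64-bit mode. */
	return long_mode ? seg_base + off : (seg_base + off) & 0xffffffff;
}

int main(void)
{
	/* 32-bit address size, displacement -8 wrapping below a small base. */
	printf("%#llx\n", (unsigned long long)
	       vmx_operand_addr(0, 0x4, 0, 0, 0xfffffff8, 1, 0));
	return 0;
}
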
@@ -4145,11 +4170,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 	if (r < 0)
 		goto out_vmcs02;
 
-	vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
+	vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
 	if (!vmx->nested.cached_vmcs12)
 		goto out_cached_vmcs12;
 
-	vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
+	vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
 	if (!vmx->nested.cached_shadow_vmcs12)
 		goto out_cached_shadow_vmcs12;
 
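
[Illustrative aside, not part of the diff.] The GFP_KERNEL to GFP_KERNEL_ACCOUNT change above makes the cached vmcs12 buffers count against the allocating task's memory cgroup, the usual policy for allocations whose number scales with the VMs and vCPUs userspace creates; the comment added in the next hunk spells out why the global vmx_bitmap pages deliberately stay unaccounted. Below is a minimal, hypothetical kernel-module sketch of the accounted-allocation pattern (VMCS12_SIZE_DEMO is an invented stand-in, and the snippet needs the usual kbuild module Makefile to build).

#include <linux/module.h>
#include <linux/slab.h>

#define VMCS12_SIZE_DEMO 4096	/* invented stand-in for the real VMCS12_SIZE */

static void *cached_vmcs12_demo;

static int __init accounted_alloc_demo_init(void)
{
	/* Charged to the current task's memory cgroup, unlike GFP_KERNEL. */
	cached_vmcs12_demo = kzalloc(VMCS12_SIZE_DEMO, GFP_KERNEL_ACCOUNT);
	if (!cached_vmcs12_demo)
		return -ENOMEM;

	return 0;
}

static void __exit accounted_alloc_demo_exit(void)
{
	kfree(cached_vmcs12_demo);
}

module_init(accounted_alloc_demo_init);
module_exit(accounted_alloc_demo_exit);
MODULE_LICENSE("GPL");
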
@@ -5696,6 +5721,10 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 		enable_shadow_vmcs = 0;
 	if (enable_shadow_vmcs) {
 		for (i = 0; i < VMX_BITMAP_NR; i++) {
+			/*
+			 * The vmx_bitmap is not tied to a VM and so should
+			 * not be charged to a memcg.
+			 */
 			vmx_bitmap[i] = (unsigned long *)
 				__get_free_page(GFP_KERNEL);
 			if (!vmx_bitmap[i]) {