Diffstat (limited to 'arch/x86/kvm/vmx/nested.c')
-rw-r--r--	arch/x86/kvm/vmx/nested.c	129
1 file changed, 79 insertions, 50 deletions
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d737a51a53ca..f24a2c225070 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -211,7 +211,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
 		return;
 
-	hrtimer_cancel(&vmx->nested.preemption_timer);
 	vmx->nested.vmxon = false;
 	vmx->nested.smm.vmxon = false;
 	free_vpid(vmx->nested.vpid02);
@@ -274,6 +273,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	vcpu_load(vcpu);
+	vmx_leave_nested(vcpu);
 	vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
 	free_nested(vcpu);
 	vcpu_put(vcpu);
@@ -1980,17 +1980,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 	prepare_vmcs02_early_full(vmx, vmcs12);
 
 	/*
-	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
-	 * entry, but only if the current (host) sp changed from the value
-	 * we wrote last (vmx->host_rsp). This cache is no longer relevant
-	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
-	 * here we just force the write to happen on entry. host_rsp will
-	 * also be written unconditionally by nested_vmx_check_vmentry_hw()
-	 * if we are doing early consistency checks via hardware.
-	 */
-	vmx->host_rsp = 0;
-
-	/*
 	 * PIN CONTROLS
 	 */
 	exec_control = vmcs12->pin_based_vm_exec_control;
@@ -2289,10 +2278,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	}
 	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
 
-	vmx->nested.preemption_timer_expired = false;
-	if (nested_cpu_has_preemption_timer(vmcs12))
-		vmx_start_preemption_timer(vcpu);
-
 	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
 	 * bitwise-or of what L1 wants to trap for L2, and what we want to
 	 * trap. Note that CR0.TS also needs updating - we do this later.
@@ -2722,6 +2707,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long cr3, cr4;
+	bool vm_fail;
 
 	if (!nested_early_check)
 		return 0;
@@ -2755,29 +2741,34 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
 		vmx->loaded_vmcs->host_state.cr4 = cr4;
 	}
 
-	vmx->__launched = vmx->loaded_vmcs->launched;
-
 	asm(
-		/* Set HOST_RSP */
 		"sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
-		__ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
-		"mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t"
+		"cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
+		"je 1f \n\t"
+		__ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
+		"mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
+		"1: \n\t"
 		"add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
 
 		/* Check if vmlaunch or vmresume is needed */
-		"cmpl $0, %c[launched](%% " _ASM_CX")\n\t"
+		"cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
 
+		/*
+		 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
+		 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
+		 * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
+		 * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
+		 */
 		"call vmx_vmenter\n\t"
 
-		/* Set vmx->fail accordingly */
-		"setbe %c[fail](%% " _ASM_CX")\n\t"
-	      : ASM_CALL_CONSTRAINT
-	      : "c"(vmx), "d"((unsigned long)HOST_RSP),
-		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
-		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
-		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
+		CC_SET(be)
+	      : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
+	      : [HOST_RSP]"r"((unsigned long)HOST_RSP),
+		[loaded_vmcs]"r"(vmx->loaded_vmcs),
+		[launched]"i"(offsetof(struct loaded_vmcs, launched)),
+		[host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
 		[wordsize]"i"(sizeof(ulong))
-	      : "rax", "cc", "memory"
+	      : "cc", "memory"
 	);
 
 	preempt_enable();
@@ -2787,10 +2778,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
 	if (vmx->msr_autoload.guest.nr)
 		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
 
-	if (vmx->fail) {
+	if (vm_fail) {
 		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
 			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		vmx->fail = 0;
 		return 1;
 	}
 
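The CC_SET(be)/CC_OUT(be) pairing above is the kernel's wrapper around GCC's flag-output constraints: the "below or equal" condition (CF or ZF set) that VMLAUNCH/VMRESUME leave in RFLAGS on VM-Fail is written straight into the local vm_fail, replacing the old setbe into vmx->fail. A self-contained userspace sketch of the same mechanism (hypothetical helper, not taken from the kernel):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: capture CF||ZF ("below or equal") from CMP, the
 * same flag combination VMLAUNCH/VMRESUME use to signal VM-Fail. */
static bool unsigned_le(unsigned long a, unsigned long b)
{
	bool be;

	asm("cmp %[b], %[a]"	/* computes a - b, sets CF/ZF */
	    : "=@ccbe" (be)	/* be: CF=1 or ZF=1 */
	    : [a] "r" (a), [b] "r" (b));
	return be;
}

int main(void)
{
	printf("%d %d\n", unsigned_le(1, 2), unsigned_le(3, 2));	/* 1 0 */
	return 0;
}

With GCC 6 or later the flag is materialized directly into the output operand; on older compilers the CC_SET/CC_OUT macros fall back to an explicit setcc sequence.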
@@ -2813,8 +2803,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
 
 	return 0;
 }
-STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
-
 
 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 						 struct vmcs12 *vmcs12);
@@ -3031,6 +3019,15 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 	/*
+	 * Do not start the preemption timer hrtimer until after we know
+	 * we are successful, so that only nested_vmx_vmexit needs to cancel
+	 * the timer.
+	 */
+	vmx->nested.preemption_timer_expired = false;
+	if (nested_cpu_has_preemption_timer(vmcs12))
+		vmx_start_preemption_timer(vcpu);
+
+	/*
 	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
 	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
 	 * returned as far as L1 is concerned. It will only return (and set
@@ -3450,13 +3447,10 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	else
 		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
 
-	if (nested_cpu_has_preemption_timer(vmcs12)) {
-		if (vmcs12->vm_exit_controls &
-		    VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+	if (nested_cpu_has_preemption_timer(vmcs12) &&
+	    vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
 		vmcs12->vmx_preemption_timer_value =
 			vmx_get_preemption_timer_value(vcpu);
-		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
-	}
 
 	/*
 	 * In some cases (usually, nested EPT), L2 is allowed to change its
@@ -3864,6 +3858,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
 	leave_guest_mode(vcpu);
 
+	if (nested_cpu_has_preemption_timer(vmcs12))
+		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
 		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
 
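Together with the hunk in nested_vmx_enter_non_root_mode() above, this gives the preemption timer a single lifecycle: it is armed only once VM-Entry can no longer fail, and cancelled only here on the VM-Exit path, so free_nested() and sync_vmcs12() no longer need their own hrtimer_cancel() calls. A minimal, hypothetical module sketch of that arm-late/cancel-once pattern with the hrtimer API; none of it is taken from the patch:

// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	pr_info("demo timer fired\n");
	return HRTIMER_NORESTART;
}

static int __init demo_init(void)
{
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_timer_fn;
	/* Arm only once setup can no longer fail. */
	hrtimer_start(&demo_timer, ms_to_ktime(100), HRTIMER_MODE_REL);
	return 0;
}

static void __exit demo_exit(void)
{
	/* The one teardown path cancels the timer. */
	hrtimer_cancel(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");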
@@ -3915,9 +3912,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 		vmx_flush_tlb(vcpu, true);
 	}
 
-	/* This is needed for same reason as it was needed in prepare_vmcs02 */
-	vmx->host_rsp = 0;
-
 	/* Unpin physical memory we referred to in vmcs02 */
 	if (vmx->nested.apic_access_page) {
 		kvm_release_page_dirty(vmx->nested.apic_access_page);
@@ -4035,25 +4029,50 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
 	/* Addr = segment_base + offset */
 	/* offset = base + [index * scale] + displacement */
 	off = exit_qualification; /* holds the displacement */
+	if (addr_size == 1)
+		off = (gva_t)sign_extend64(off, 31);
+	else if (addr_size == 0)
+		off = (gva_t)sign_extend64(off, 15);
 	if (base_is_valid)
 		off += kvm_register_read(vcpu, base_reg);
 	if (index_is_valid)
 		off += kvm_register_read(vcpu, index_reg)<<scaling;
 	vmx_get_segment(vcpu, &s, seg_reg);
-	*ret = s.base + off;
 
+	/*
+	 * The effective address, i.e. @off, of a memory operand is truncated
+	 * based on the address size of the instruction. Note that this is
+	 * the *effective address*, i.e. the address prior to accounting for
+	 * the segment's base.
+	 */
 	if (addr_size == 1) /* 32 bit */
-		*ret &= 0xffffffff;
+		off &= 0xffffffff;
+	else if (addr_size == 0) /* 16 bit */
+		off &= 0xffff;
 
 	/* Checks for #GP/#SS exceptions. */
 	exn = false;
 	if (is_long_mode(vcpu)) {
+		/*
+		 * The virtual/linear address is never truncated in 64-bit
+		 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
+		 * address when using FS/GS with a non-zero base.
+		 */
+		*ret = s.base + off;
+
 		/* Long mode: #GP(0)/#SS(0) if the memory address is in a
 		 * non-canonical form. This is the only check on the memory
 		 * destination for long mode!
 		 */
		exn = is_noncanonical_address(*ret, vcpu);
-	} else if (is_protmode(vcpu)) {
+	} else {
+		/*
+		 * When not in long mode, the virtual/linear address is
+		 * unconditionally truncated to 32 bits regardless of the
+		 * address size.
+		 */
+		*ret = (s.base + off) & 0xffffffff;
+
 		/* Protected mode: apply checks for segment validity in the
 		 * following order:
 		 * - segment type check (#GP(0) may be thrown)
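The comments added here distinguish the effective address (displacement plus base plus scaled index, sign-extended and then truncated to the instruction's address size) from the linear address (segment base plus effective address, truncated to 32 bits only outside long mode). A standalone toy model of that computation; the addr_size encoding (0 = 16-bit, 1 = 32-bit) follows the VMX instruction-information field, everything else is illustrative:

#include <stdint.h>
#include <stdio.h>

/* Local reimplementation of the kernel's sign_extend64(): @index is the
 * bit position of the sign bit. */
static uint64_t sign_extend64(uint64_t value, int index)
{
	int shift = 63 - index;

	return (uint64_t)((int64_t)(value << shift) >> shift);
}

static uint64_t linear_address(uint64_t seg_base, uint64_t disp,
			       uint64_t base, uint64_t index, int scale,
			       int addr_size, int long_mode)
{
	uint64_t off = disp;

	/* The displacement is sign-extended per the address size... */
	if (addr_size == 1)
		off = sign_extend64(off, 31);
	else if (addr_size == 0)
		off = sign_extend64(off, 15);
	off += base + (index << scale);

	/* ...and the resulting *effective* address is truncated to it. */
	if (addr_size == 1)
		off &= 0xffffffff;
	else if (addr_size == 0)
		off &= 0xffff;

	/* The *linear* address is only truncated outside long mode. */
	return long_mode ? seg_base + off : (seg_base + off) & 0xffffffff;
}

int main(void)
{
	/* 32-bit address size in long mode with a non-zero FS-style base:
	 * the effective address wraps at 4 GiB, the linear one does not. */
	printf("%#llx\n", (unsigned long long)
	       linear_address(0x100000000ull, 0xfffffff0u, 0x20, 0, 0, 1, 1));
	return 0;
}

Running it prints 0x100000010: the 32-bit effective address wraps to 0x10, while the linear address still lands above 4 GiB, which is exactly the case the long-mode comment calls out.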
@@ -4077,10 +4096,16 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
 		/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
 		 */
 		exn = (s.unusable != 0);
-		/* Protected mode: #GP(0)/#SS(0) if the memory
-		 * operand is outside the segment limit.
+
+		/*
+		 * Protected mode: #GP(0)/#SS(0) if the memory operand is
+		 * outside the segment limit. All CPUs that support VMX ignore
+		 * limit checks for flat segments, i.e. segments with base==0,
+		 * limit==0xffffffff and of type expand-up data or code.
 		 */
-		exn = exn || (off + sizeof(u64) > s.limit);
+		if (!(s.base == 0 && s.limit == 0xffffffff &&
+		      ((s.type & 8) || !(s.type & 4))))
+			exn = exn || (off + sizeof(u64) > s.limit);
 	}
 	if (exn) {
 		kvm_queue_exception_e(vcpu,
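The new guard relies on the x86 segment-type encoding: bit 3 set means a code segment, and for data segments bit 2 is the expand-down flag, so (s.type & 8) || !(s.type & 4) matches code and expand-up data. A small, hypothetical illustration of the check; the field names mirror the patch, but the struct is not the kernel's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the segment fields used by the check. */
struct seg {
	uint64_t base;
	uint32_t limit;
	uint8_t type;	/* 4-bit segment type from the access rights */
};

/* Mirrors the patch's condition: a flat code or expand-up data segment,
 * for which CPUs supporting VMX skip the limit check. */
static bool is_flat_expand_up_or_code(const struct seg *s)
{
	return s->base == 0 && s->limit == 0xffffffff &&
	       ((s->type & 8) ||	/* bit 3: code segment */
		!(s->type & 4));	/* bit 2 clear: expand-up data */
}

int main(void)
{
	struct seg flat_data = { 0, 0xffffffff, 0x3 };	/* expand-up, writable */
	struct seg expand_down = { 0, 0xffffffff, 0x7 };	/* expand-down, writable */

	printf("%d %d\n", is_flat_expand_up_or_code(&flat_data),
	       is_flat_expand_up_or_code(&expand_down));	/* 1 0 */
	return 0;
}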
@@ -4145,11 +4170,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 	if (r < 0)
 		goto out_vmcs02;
 
-	vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
+	vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
 	if (!vmx->nested.cached_vmcs12)
 		goto out_cached_vmcs12;
 
-	vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
+	vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
 	if (!vmx->nested.cached_shadow_vmcs12)
 		goto out_cached_shadow_vmcs12;
 
@@ -5696,6 +5721,10 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 		enable_shadow_vmcs = 0;
 	if (enable_shadow_vmcs) {
 		for (i = 0; i < VMX_BITMAP_NR; i++) {
+			/*
+			 * The vmx_bitmap is not tied to a VM and so should
+			 * not be charged to a memcg.
+			 */
 			vmx_bitmap[i] = (unsigned long *)
 				__get_free_page(GFP_KERNEL);
 			if (!vmx_bitmap[i]) {