author		Nadav Har'El <nyh@il.ibm.com>	2011-05-25 16:11:34 -0400
committer	Avi Kivity <avi@redhat.com>	2011-07-12 04:45:16 -0400
commit		4704d0befb0721274bda863192c4782febb6b94c (patch)
tree		f460a211f2307a9ba2db1f641b6907d9572f6cf6 /arch
parent		99e65e805dea4df061aa4038211112aa96416412 (diff)
KVM: nVMX: Exiting from L2 to L1
This patch implements nested_vmx_vmexit(), called when the nested L2 guest
exits and we want to run its L1 parent and let it handle this exit.

Note that this will not necessarily be called on every L2 exit. L0 may decide
to handle a particular exit on its own, without L1's involvement; in that
case, L0 will handle the exit, and resume running L2, without running L1 and
without calling nested_vmx_vmexit(). The logic for deciding whether to handle
a particular exit in L1 or in L0, i.e., whether to call nested_vmx_vmexit(),
will appear in a separate patch.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
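[Editorial note: for orientation, here is a minimal sketch, not part of this patch, of where that routing decision would sit. The predicate name nested_vmx_exit_handled() and the helper handle_exit_in_l0() are assumptions here, since the actual logic arrives in a later patch of the series.]

	/* Hypothetical sketch only - names and call-site shape are assumed. */
	static int vmx_handle_exit_sketch(struct kvm_vcpu *vcpu)
	{
		if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
			/* L1 asked to intercept this event: reflect the exit. */
			nested_vmx_vmexit(vcpu);
			return 1;
		}
		/* Otherwise L0 handles the exit itself and resumes L2. */
		return handle_exit_in_l0(vcpu);
	}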
Diffstat (limited to 'arch')
-rw-r--r--	arch/x86/include/asm/vmx.h	4
-rw-r--r--	arch/x86/kvm/vmx.c	262
2 files changed, 266 insertions, 0 deletions
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 37690bd580c8..b747773cf83b 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -132,6 +132,8 @@ enum vmcs_field {
 	GUEST_IA32_PAT_HIGH             = 0x00002805,
 	GUEST_IA32_EFER                 = 0x00002806,
 	GUEST_IA32_EFER_HIGH            = 0x00002807,
+	GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
+	GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
 	GUEST_PDPTR0                    = 0x0000280a,
 	GUEST_PDPTR0_HIGH               = 0x0000280b,
 	GUEST_PDPTR1                    = 0x0000280c,
@@ -144,6 +146,8 @@ enum vmcs_field {
 	HOST_IA32_PAT_HIGH              = 0x00002c01,
 	HOST_IA32_EFER                  = 0x00002c02,
 	HOST_IA32_EFER_HIGH             = 0x00002c03,
+	HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
+	HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
 	PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
 	CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
 	EXCEPTION_BITMAP                = 0x00004004,
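[Editorial note: the new fields follow the VMCS encoding convention for 64-bit fields, where the upper 32 bits are reached through a companion encoding with bit 0 set, hence the _HIGH entries at encoding + 1. A tiny illustration; the macro name is made up:]

	/* Per the Intel SDM, the high dword of a 64-bit VMCS field is
	 * addressed by setting bit 0 of the (even) field encoding. */
	#define VMCS_FIELD_HIGH(enc)	((enc) + 1)
	/* VMCS_FIELD_HIGH(0x00002808) == 0x00002809, i.e.
	 * GUEST_IA32_PERF_GLOBAL_CTRL -> GUEST_IA32_PERF_GLOBAL_CTRL_HIGH */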
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f8dccb9df34b..7d778e4976ea 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6144,6 +6144,268 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	return 1;
 }
 
+/*
+ * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
+ * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
+ * This function returns the new value we should put in vmcs12.guest_cr0.
+ * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
+ *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
+ *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
+ *     didn't trap the bit, because if L1 did, so would L0).
+ *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
+ *     been modified by L2, and L1 knows it. So just leave the old value of
+ *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
+ *     isn't relevant, because if L0 traps this bit it can set it to anything.
+ *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
+ *     changed these bits, and therefore they need to be updated, but L0
+ *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
+ *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
+ */
+static inline unsigned long
+vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	return
+	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
+	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
+	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
+			vcpu->arch.cr0_guest_owned_bits));
+}
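[Editorial note: to make the three-way merge concrete, here is a freestanding sketch, not kernel code; the mask and register values are invented for illustration.]

	#include <stdio.h>

	/* Hypothetical masks, for illustration only. */
	#define L0_GUEST_OWNED	0x08UL	/* bits neither L0 nor L1 trap (case 1) */
	#define L1_MASK		0x01UL	/* cr0_guest_host_mask: bits L1 traps (case 2) */

	int main(void)
	{
		unsigned long vmcs02_guest_cr0 = 0x0b; /* what L2 last ran with */
		unsigned long vmcs12_guest_cr0 = 0x01; /* L1's possibly stale copy */
		unsigned long cr0_read_shadow  = 0x02; /* bits L0 trapped itself (case 3) */

		unsigned long merged =
			(vmcs02_guest_cr0 & L0_GUEST_OWNED) |            /* case 1 */
			(vmcs12_guest_cr0 & L1_MASK) |                   /* case 2 */
			(cr0_read_shadow & ~(L1_MASK | L0_GUEST_OWNED)); /* case 3 */

		printf("vmcs12.guest_cr0 = %#lx\n", merged); /* 0x08|0x01|0x02 = 0xb */
		return 0;
	}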
+
+static inline unsigned long
+vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	return
+	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
+	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
+	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
+			vcpu->arch.cr4_guest_owned_bits));
+}
+
+/*
+ * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
+ * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
+ * and this function updates it to reflect the changes to the guest state while
+ * L2 was running (and perhaps made some exits which were handled directly by L0
+ * without going back to L1), and to reflect the exit reason.
+ * Note that we do not have to copy here all VMCS fields, just those that
+ * could have been changed by the L2 guest or the exit - i.e., the guest-state
+ * and exit-information fields only. Other fields are modified by L1 with
+ * VMWRITE, which already writes to vmcs12 directly.
+ */
+void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	/* update guest state fields: */
+	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
+	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+
+	kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+	vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+	vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
+	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+
+	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+
+	vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+	vmcs12->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	vmcs12->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+
+	/* TODO: These cannot have changed unless we have MSR bitmaps and
+	 * the relevant bit asks not to trap the change */
+	vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
+		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+
+	/* update exit information fields: */
+
+	vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
+	vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+	vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	vmcs12->idt_vectoring_info_field =
+		vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	vmcs12->idt_vectoring_error_code =
+		vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
+	/* clear vm-entry fields which are to be cleared on exit */
+	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
+}
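[Editorial note: the final check keys off VMX_EXIT_REASONS_FAILED_VMENTRY, bit 31 of the basic exit reason, which hardware sets when the VM entry itself failed; the valid bit of the VM-entry interruption-information field is therefore cleared only on a genuine exit. A minimal restatement of the test, using the architectural constant value:]

	#define VMX_EXIT_REASONS_FAILED_VMENTRY	0x80000000
	int entry_failed = !!(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY);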
+
+/*
+ * A part of what we need to do when the nested L2 guest exits and we want to
+ * run its L1 parent, is to reset L1's guest state to the host state specified
+ * in vmcs12.
+ * This function is to be called not only on a normal nested exit, but also on
+ * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
+ * Failures During or After Loading Guest State").
+ * This function should be called when the active VMCS is L1's (vmcs01).
+ */
+void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
+		vcpu->arch.efer = vmcs12->host_ia32_efer;
+	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+	else
+		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+	vmx_set_efer(vcpu, vcpu->arch.efer);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
+	/*
+	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
+	 * actually changed, because it depends on the current state of
+	 * fpu_active (which may have changed).
+	 * Note that vmx_set_cr0 refers to efer set above.
+	 */
+	kvm_set_cr0(vcpu, vmcs12->host_cr0);
+	/*
+	 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
+	 * to apply the same changes to L1's vmcs. We just set cr0 correctly,
+	 * but we also need to update cr0_guest_host_mask and exception_bitmap.
+	 */
+	update_exception_bitmap(vcpu);
+	vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
+	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+	/*
+	 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
+	 * (KVM doesn't change it) - no reason to call set_cr4_guest_host_mask().
+	 */
+	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+	kvm_set_cr4(vcpu, vmcs12->host_cr4);
+
+	/* Reload L1's cr3 - works with either EPT or shadow page tables */
+	kvm_set_cr3(vcpu, vmcs12->host_cr3);
+	kvm_mmu_reset_context(vcpu);
+
+	if (enable_vpid) {
+		/*
+		 * Trivially support vpid by letting L2s share their parent
+		 * L1's vpid. TODO: move to a more elaborate solution, giving
+		 * each L2 its own vpid and exposing the vpid feature to L1.
+		 */
+		vmx_flush_tlb(vcpu);
+	}
+
+	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
+	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
+	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
+	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
+	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+	vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
+	vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
+	vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
+	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
+	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
+	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
+	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
+	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
+	vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
+	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
+
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
+			vmcs12->host_ia32_perf_global_ctrl);
+}
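[Editorial note: the EFER fixup at the top of this function follows the architectural rule that, after a VM exit, EFER.LMA and EFER.LME track the "host address-space size" exit control rather than any saved EFER image. A freestanding restatement of that rule; the constants shown are the architectural bit positions:]

	#define EFER_LME	(1UL << 8)	/* long mode enable */
	#define EFER_LMA	(1UL << 10)	/* long mode active */
	#define VM_EXIT_HOST_ADDR_SPACE_SIZE	(1U << 9) /* exit-control bit 9 */

	static unsigned long host_efer_after_exit(unsigned long efer,
						  unsigned int exit_controls)
	{
		if (exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
			return efer | EFER_LMA | EFER_LME;	/* 64-bit host */
		return efer & ~(EFER_LMA | EFER_LME);		/* 32-bit host */
	}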
+
+/*
+ * Emulate an exit from the nested guest (L2) to L1, i.e., prepare to run L1
+ * and modify vmcs12 to make it see what it would expect to see there if
+ * L2 was its real guest. Must only be called when in L2 (is_guest_mode()).
+ */
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int cpu;
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	leave_guest_mode(vcpu);
+	prepare_vmcs12(vcpu, vmcs12);
+
+	cpu = get_cpu();
+	vmx->loaded_vmcs = &vmx->vmcs01;
+	vmx_vcpu_put(vcpu);
+	vmx_vcpu_load(vcpu, cpu);
+	vcpu->cpu = cpu;
+	put_cpu();
+
+	/* if no vmcs02 cache requested, remove the one we used */
+	if (VMCS02_POOL_SIZE == 0)
+		nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
+
+	load_vmcs12_host_state(vcpu, vmcs12);
+
+	/* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
+	vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+
+	/* This is needed for the same reason as it was in prepare_vmcs02 */
+	vmx->host_rsp = 0;
+
+	/* Unpin physical memory we referred to in vmcs02 */
+	if (vmx->nested.apic_access_page) {
+		nested_release_page(vmx->nested.apic_access_page);
+		vmx->nested.apic_access_page = NULL;
+	}
+
+	/*
+	 * Exiting from L2 to L1, we're now back to L1 which thinks it just
+	 * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
+	 * success or failure flag accordingly.
+	 */
+	if (unlikely(vmx->fail)) {
+		vmx->fail = 0;
+		nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
+	} else
+		nested_vmx_succeed(vcpu);
+}
+
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage)