From 5f4cb662a0a2533b45656607471571460310a5ca Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 14 Jul 2008 20:36:36 +0200 Subject: KVM: SVM: allow enabling/disabling NPT by reloading only the architecture module If NPT is enabled after loading both KVM modules on AMD and it should be disabled, both KVM modules must be reloaded. If only the architecture module is reloaded the behavior is undefined. With this patch it is possible to disable NPT only by reloading the kvm_amd module. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 6 ++++++ arch/x86/kvm/svm.c | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b0e4ddca6c18..d087d9c4f2d9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1870,6 +1870,12 @@ void kvm_enable_tdp(void) } EXPORT_SYMBOL_GPL(kvm_enable_tdp); +void kvm_disable_tdp(void) +{ + tdp_enabled = false; +} +EXPORT_SYMBOL_GPL(kvm_disable_tdp); + static void free_mmu_pages(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b756e876dce3..951b789cc913 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -453,7 +453,8 @@ static __init int svm_hardware_setup(void) if (npt_enabled) { printk(KERN_INFO "kvm: Nested Paging enabled\n"); kvm_enable_tdp(); - } + } else + kvm_disable_tdp(); return 0; -- cgit v1.2.2 From 98899aa0e0bf5de05850082be0eb837058c09ea5 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 16 Jul 2008 19:07:10 -0300 Subject: KVM: task switch: segment base is linear address The segment base is always a linear address, so translate before accessing guest memory. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f1cdb011cff..cd687395e4e7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3223,6 +3223,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct desc_struct *seg_desc) { + gpa_t gpa; struct descriptor_table dtable; u16 index = selector >> 3; @@ -3232,13 +3233,16 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); return 1; } - return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); + gpa += index * 8; + return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8); } /* allowed just for 8 bytes segments */ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct desc_struct *seg_desc) { + gpa_t gpa; struct descriptor_table dtable; u16 index = selector >> 3; @@ -3246,7 +3250,9 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit < index * 8 + 7) return 1; - return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); + gpa += index * 8; + return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8); } static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, @@ -3258,7 +3264,7 @@ static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, base_addr |= (seg_desc->base1 << 16); base_addr |= (seg_desc->base2 << 24); - return base_addr; + return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); } static int load_tss_segment32(struct kvm_vcpu *vcpu, -- cgit v1.2.2 From 34198bf8426276a2ce1e97056a0f02d43637e5ae Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 16 Jul 2008 19:07:11 -0300 Subject: KVM: task switch: use seg regs provided by subarch instead of reading from GDT There is no guarantee that the old TSS descriptor in the GDT contains the proper base address. This is the case for Windows installation's reboot-via-triplefault. Use guest registers instead. Also translate the address properly. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 93 ++++++++++++++++++------------------------------------ 1 file changed, 30 insertions(+), 63 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cd687395e4e7..27c6ece91da6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3267,54 +3267,6 @@ static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); } -static int load_tss_segment32(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc, - struct tss_segment_32 *tss) -{ - u32 base_addr; - - base_addr = get_tss_base_addr(vcpu, seg_desc); - - return kvm_read_guest(vcpu->kvm, base_addr, tss, - sizeof(struct tss_segment_32)); -} - -static int save_tss_segment32(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc, - struct tss_segment_32 *tss) -{ - u32 base_addr; - - base_addr = get_tss_base_addr(vcpu, seg_desc); - - return kvm_write_guest(vcpu->kvm, base_addr, tss, - sizeof(struct tss_segment_32)); -} - -static int load_tss_segment16(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc, - struct tss_segment_16 *tss) -{ - u32 base_addr; - - base_addr = get_tss_base_addr(vcpu, seg_desc); - - return kvm_read_guest(vcpu->kvm, base_addr, tss, - sizeof(struct tss_segment_16)); -} - -static int save_tss_segment16(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc, - struct tss_segment_16 *tss) -{ - u32 base_addr; - - base_addr = get_tss_base_addr(vcpu, seg_desc); - - return kvm_write_guest(vcpu->kvm, base_addr, tss, - sizeof(struct tss_segment_16)); -} - static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) { struct kvm_segment kvm_seg; @@ -3472,20 +3424,26 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, } static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, - struct desc_struct *cseg_desc, + u32 old_tss_base, struct desc_struct *nseg_desc) { struct tss_segment_16 tss_segment_16; int ret = 0; - if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16)) + if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, + sizeof tss_segment_16)) goto out; save_state_to_tss16(vcpu, &tss_segment_16); - save_tss_segment16(vcpu, cseg_desc, &tss_segment_16); - if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16)) + if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, + sizeof tss_segment_16)) goto out; + + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + &tss_segment_16, sizeof tss_segment_16)) + goto out; + if (load_state_from_tss16(vcpu, &tss_segment_16)) goto out; @@ -3495,20 +3453,26 @@ out: } static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, - struct desc_struct *cseg_desc, + u32 old_tss_base, struct desc_struct *nseg_desc) { struct tss_segment_32 tss_segment_32; int ret = 0; - if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32)) + if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, + sizeof tss_segment_32)) goto out; save_state_to_tss32(vcpu, &tss_segment_32); - save_tss_segment32(vcpu, cseg_desc, &tss_segment_32); - if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32)) + if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, + sizeof tss_segment_32)) + goto out; + + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + &tss_segment_32, sizeof tss_segment_32)) goto out; + if (load_state_from_tss32(vcpu, &tss_segment_32)) goto out; @@ -3523,16 +3487,20 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) struct desc_struct cseg_desc; struct desc_struct nseg_desc; int ret = 0; + u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); + u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); - kvm_get_segment(vcpu, &tr_seg, VCPU_SREG_TR); + old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); + /* FIXME: Handle errors. Failure to read either TSS or their + * descriptors should generate a pagefault. + */ if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) goto out; - if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc)) + if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) goto out; - if (reason != TASK_SWITCH_IRET) { int cpl; @@ -3550,8 +3518,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { cseg_desc.type &= ~(1 << 1); //clear the B flag - save_guest_segment_descriptor(vcpu, tr_seg.selector, - &cseg_desc); + save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); } if (reason == TASK_SWITCH_IRET) { @@ -3563,10 +3530,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) kvm_x86_ops->cache_regs(vcpu); if (nseg_desc.type & 8) - ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc, + ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, &nseg_desc); else - ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc, + ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, &nseg_desc); if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { -- cgit v1.2.2 From 577bdc496614ced56d999bbb425e85adf2386490 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 19 Jul 2008 08:57:05 +0300 Subject: KVM: Avoid instruction emulation when event delivery is pending When an event (such as an interrupt) is injected, and the stack is shadowed (and therefore write protected), the guest will exit. The current code will see that the stack is shadowed and emulate a few instructions, each time postponing the injection. Eventually the injection may succeed, but at that time the guest may be unwilling to accept the interrupt (for example, the TPR may have changed). This occurs every once in a while during a Windows 2008 boot. Fix by unshadowing the fault address if the fault was due to an event injection. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/svm.c | 7 ++++++- arch/x86/kvm/vmx.c | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d087d9c4f2d9..2fa231923cf7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1814,6 +1814,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) spin_unlock(&vcpu->kvm->mmu_lock); return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 951b789cc913..e2ee264740c7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1008,10 +1008,13 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) struct kvm *kvm = svm->vcpu.kvm; u64 fault_address; u32 error_code; + bool event_injection = false; if (!irqchip_in_kernel(kvm) && - is_external_interrupt(exit_int_info)) + is_external_interrupt(exit_int_info)) { + event_injection = true; push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); + } fault_address = svm->vmcb->control.exit_info_2; error_code = svm->vmcb->control.exit_info_1; @@ -1025,6 +1028,8 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) (u32)fault_address, (u32)(fault_address >> 32), handler); + if (event_injection) + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0cac63701719..b918fc83435c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2298,6 +2298,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) cr2 = vmcs_readl(EXIT_QUALIFICATION); KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, (u32)((u64)cr2 >> 32), handler); + if (vect_info & VECTORING_INFO_VALID_MASK) + kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); } -- cgit v1.2.2 From c93cd3a58845012df2d658fecd0ac99f7008d753 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Sat, 19 Jul 2008 19:08:07 -0300 Subject: KVM: task switch: translate guest segment limit to virt-extension byte granular field If 'g' is one then limit is 4kb granular. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 27c6ece91da6..5916191420c7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3184,6 +3184,10 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, kvm_desct->base |= seg_desc->base2 << 24; kvm_desct->limit = seg_desc->limit0; kvm_desct->limit |= seg_desc->limit << 16; + if (seg_desc->g) { + kvm_desct->limit <<= 12; + kvm_desct->limit |= 0xfff; + } kvm_desct->selector = selector; kvm_desct->type = seg_desc->type; kvm_desct->present = seg_desc->p; -- cgit v1.2.2 From 5ec5726a16245138f5d5305b00a752acb5730076 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 16 Jul 2008 09:21:22 +0800 Subject: KVM: VMX: Fix bypass_guest_pf enabling when disable EPT in module parameter Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b918fc83435c..f71151d999e4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3305,7 +3305,7 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); - if (cpu_has_vmx_ept()) + if (vm_need_ept()) bypass_guest_pf = 0; if (bypass_guest_pf) -- cgit v1.2.2 From 5fdbcb9dd16f1e89ead127d3ee1a38e3a00cf1ea Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 16 Jul 2008 09:25:40 +0800 Subject: KVM: VMX: Fix undefined beaviour of EPT after reload kvm-intel.ko As well as move set base/mask ptes to vmx_init(). Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f71151d999e4..2a69773e3b26 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3118,15 +3118,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return ERR_PTR(-ENOMEM); allocate_vpid(vmx); - if (id == 0 && vm_need_ept()) { - kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | - VMX_EPT_WRITABLE_MASK | - VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); - kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK, - VMX_EPT_FAKE_DIRTY_MASK, 0ull, - VMX_EPT_EXECUTABLE_MASK); - kvm_enable_tdp(); - } err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) @@ -3305,8 +3296,17 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); - if (vm_need_ept()) + if (vm_need_ept()) { bypass_guest_pf = 0; + kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | + VMX_EPT_WRITABLE_MASK | + VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); + kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK, + VMX_EPT_FAKE_DIRTY_MASK, 0ull, + VMX_EPT_EXECUTABLE_MASK); + kvm_enable_tdp(); + } else + kvm_disable_tdp(); if (bypass_guest_pf) kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); -- cgit v1.2.2 From 583323b9d2f624884a8c9563fb5a4d795f39ab07 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 27 Jul 2008 21:43:11 +0200 Subject: x86: fix cpu hotplug on 32bit commit 3e9704739daf46a8ba6593d749c67b5f7cd633d2 ("x86: boot secondary cpus through initial_code") causes the kernel to crash when a CPU is brought online after the read only sections have been write protected. The write to initial_code in do_boot_cpu() fails. Move inital_code to .cpuinit.data section. Signed-off-by: Thomas Gleixner Acked-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f67e93441caf..a7010c3a377a 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -456,9 +456,6 @@ is386: movl $2,%ecx # set MP 1: #endif /* CONFIG_SMP */ jmp *(initial_code) -.align 4 -ENTRY(initial_code) - .long i386_start_kernel /* * We depend on ET to be correct. This checks for 287/387. @@ -601,6 +598,11 @@ ignore_int: #endif iret +.section .cpuinit.data,"wa" +.align 4 +ENTRY(initial_code) + .long i386_start_kernel + .section .text /* * Real beginning of normal "text" segment -- cgit v1.2.2