From 3fd28fce765632d0fe46b31f63c0e7a7ec6c6b79 Mon Sep 17 00:00:00 2001 From: Eddie Dong Date: Thu, 19 Nov 2009 17:54:07 +0200 Subject: KVM: x86: make double/triple fault promotion generic to all exceptions Move Double-Fault generation logic out of page fault exception generating function to cover more generic case. Signed-off-by: Eddie Dong Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 89 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 28 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1e1bc9d412d..8d860e0301a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -257,12 +257,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); +#define EXCPT_BENIGN 0 +#define EXCPT_CONTRIBUTORY 1 +#define EXCPT_PF 2 + +static int exception_class(int vector) +{ + switch (vector) { + case PF_VECTOR: + return EXCPT_PF; + case DE_VECTOR: + case TS_VECTOR: + case NP_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + return EXCPT_CONTRIBUTORY; + default: + break; + } + return EXCPT_BENIGN; +} + +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, bool has_error, u32 error_code) +{ + u32 prev_nr; + int class1, class2; + + if (!vcpu->arch.exception.pending) { + queue: + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = has_error; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; + return; + } + + /* to check exception */ + prev_nr = vcpu->arch.exception.nr; + if (prev_nr == DF_VECTOR) { + /* triple fault -> shutdown */ + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return; + } + class1 = exception_class(prev_nr); + class2 = exception_class(nr); + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) + || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + /* generate double fault per SDM Table 5-5 */ + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = true; + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + goto queue; +} + void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.nr = nr; + kvm_multiple_exception(vcpu, nr, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception); @@ -270,25 +326,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 error_code) { ++vcpu->stat.pf_guest; - - if (vcpu->arch.exception.pending) { - switch(vcpu->arch.exception.nr) { - case DF_VECTOR: - /* triple fault -> shutdown */ - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); - return; - case PF_VECTOR: - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - return; - default: - /* replace previous exception with a new one in a hope - that instruction re-execution will regenerate lost - exception */ - vcpu->arch.exception.pending = false; - break; - } - } vcpu->arch.cr2 = addr; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } @@ -301,11 +338,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = nr; - vcpu->arch.exception.error_code = error_code; + kvm_multiple_exception(vcpu, nr, true, error_code); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); -- cgit v1.2.2 From cb84b55f6cde26c7c17beaf87da08645ae6ccbf9 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 11 Nov 2009 17:29:49 -0200 Subject: KVM: x86: raise TSS exception for NULL CS and SS segments Windows 2003 uses task switch to triple fault and reboot (the other exception being reserved pdptrs bits). Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8d860e0301a0..279318677911 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4410,6 +4410,15 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); } +static void kvm_check_segment_descriptor(struct kvm_vcpu *vcpu, int seg, + u16 selector) +{ + /* NULL selector is not valid for CS and SS */ + if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) + if (!selector) + kvm_queue_exception_e(vcpu, TS_VECTOR, selector >> 3); +} + int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int type_bits, int seg) { @@ -4419,6 +4428,8 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, return kvm_load_realmode_segment(vcpu, selector, seg); if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) return 1; + + kvm_check_segment_descriptor(vcpu, seg, selector); kvm_seg.type |= type_bits; if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && -- cgit v1.2.2 From 186a3e526ac1b4063a723f90ae4893beedb24fc6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 Dec 2009 15:17:00 +0200 Subject: KVM: MMU: Report spte not found in rmap before BUG() In the past we've had errors of single-bit in the other two cases; the printk() may confirm it for the third case (many->many). Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 89a49fb46a27..4f499d7f7106 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -662,6 +662,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) prev_desc = desc; desc = desc->more; } + pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); BUG(); } } -- cgit v1.2.2 From 59708670b639bff00f92e519df1ae14da240e919 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 15 Dec 2009 13:29:54 +0800 Subject: KVM: VMX: Trap and invalid MWAIT/MONITOR instruction We don't support these instructions, but guest can execute them even if the feature('monitor') haven't been exposed in CPUID. So we would trap and inject a #UD if guest try this way. Cc: stable@kernel.org Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4918d6fc924..8a8e13965076 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1224,6 +1224,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING; opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | @@ -3416,6 +3418,12 @@ static int handle_pause(struct kvm_vcpu *vcpu) return 1; } +static int handle_invalid_op(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -3453,6 +3461,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, }; static const int kvm_vmx_max_exit_handlers = -- cgit v1.2.2 From cdc0e24456bf5678f63497569c3676c9019f82c1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Dec 2009 17:21:14 +0200 Subject: KVM: VMX: Move some cr[04] related constants to vmx.c They have no place in common code. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8a8e13965076..efbb614ccd36 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -61,6 +61,19 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK \ + (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_GUEST_CR4_MASK \ + (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive -- cgit v1.2.2 From fc78f51938e1ea866daa2045851b2e5681371668 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Dec 2009 12:16:48 +0200 Subject: KVM: Add accessor for reading cr4 (or some bits of cr4) Some bits of cr4 can be owned by the guest on vmx, so when we read them, we copy them to the vcpu structure. In preparation for making the set of guest-owned bits dynamic, use helpers to access these bits so we don't need to know where the bit resides. No changes to svm since all bits are host-owned there. Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_cache_regs.h | 12 ++++++++++++ arch/x86/kvm/mmu.h | 5 +++-- arch/x86/kvm/vmx.c | 13 ++++++++----- arch/x86/kvm/x86.c | 16 ++++++---------- 4 files changed, 29 insertions(+), 17 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 7bcc5b6a4403..35acc36e1782 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -38,4 +38,16 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.pdptrs[index]; } +static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + if (mask & vcpu->arch.cr4_guest_owned_bits) + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + return vcpu->arch.cr4 & mask; +} + +static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, ~0UL); +} + #endif diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 61a1b3884b49..4567d8042b22 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -2,6 +2,7 @@ #define __KVM_X86_MMU_H #include +#include "kvm_cache_regs.h" #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -64,12 +65,12 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) static inline int is_pae(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr4 & X86_CR4_PAE; + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); } static inline int is_pse(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr4 & X86_CR4_PSE; + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); } static inline int is_paging(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index efbb614ccd36..284e905c59d3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1615,8 +1615,10 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~cr4_guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; } static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -1661,7 +1663,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1669,7 +1671,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } if (!(cr0 & X86_CR0_WP)) @@ -2420,6 +2422,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + vmx->vcpu.arch.cr4_guest_owned_bits = ~KVM_GUEST_CR4_MASK; tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; rdtscll(tsc_this); @@ -3050,7 +3053,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) vcpu->arch.eff_db[dr] = val; break; case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) kvm_queue_exception(vcpu, UD_VECTOR); break; case 6: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 279318677911..84dd33e717fd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -482,7 +482,7 @@ EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long old_cr4 = vcpu->arch.cr4; + unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; if (cr4 & CR4_RESERVED_BITS) { @@ -1899,7 +1899,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, return 0; if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || - !(vcpu->arch.cr4 & X86_CR4_MCE)) { + !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { printk(KERN_DEBUG "kvm: set_mce: " "injects mce exception while " "previous one is in progress!\n"); @@ -3616,7 +3616,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) { unsigned long value; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); switch (cr) { case 0: value = vcpu->arch.cr0; @@ -3628,7 +3627,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) value = vcpu->arch.cr3; break; case 4: - value = vcpu->arch.cr4; + value = kvm_read_cr4(vcpu); break; case 8: value = kvm_get_cr8(vcpu); @@ -3656,7 +3655,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, kvm_set_cr3(vcpu, val); break; case 4: - kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); + kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: kvm_set_cr8(vcpu, val & 0xfUL); @@ -4237,11 +4236,10 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->gdt.limit = dt.limit; sregs->gdt.base = dt.base; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); sregs->cr0 = vcpu->arch.cr0; sregs->cr2 = vcpu->arch.cr2; sregs->cr3 = vcpu->arch.cr3; - sregs->cr4 = vcpu->arch.cr4; + sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.shadow_efer; sregs->apic_base = kvm_get_apic_base(vcpu); @@ -4737,13 +4735,11 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_x86_ops->set_efer(vcpu, sregs->efer); kvm_set_apic_base(vcpu, sregs->apic_base); - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; - mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; + mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) { load_pdptrs(vcpu, vcpu->arch.cr3); -- cgit v1.2.2 From 4c38609ac569483152f9cb7e5a66f17355e563b2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Dec 2009 12:26:18 +0200 Subject: KVM: VMX: Make guest cr4 mask more conservative Instead of specifying the bits which we want to trap on, specify the bits which we allow the guest to change transparently. This is safer wrt future changes to cr4. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 284e905c59d3..755811a564b3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -69,8 +69,10 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_GUEST_CR4_MASK \ - (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) +#define KVM_CR4_GUEST_OWNED_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT) + #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) @@ -2421,8 +2423,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); - vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); - vmx->vcpu.arch.cr4_guest_owned_bits = ~KVM_GUEST_CR4_MASK; + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; rdtscll(tsc_this); -- cgit v1.2.2 From ce03e4f21a33b97c5d20ce597f64c361bb247904 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Dec 2009 12:29:14 +0200 Subject: KVM: VMX: When using ept, allow the guest to own cr4.pge We make no use of cr4.pge if ept is enabled, but the guest does (to flush global mappings, as with vmap()), so give the guest ownership of this bit. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 755811a564b3..a7ebaa5f2b65 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2424,6 +2424,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + if (enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; -- cgit v1.2.2 From bc23008b610dda1b0e69cd473b31c1391d6873f2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 8 Dec 2009 12:14:42 +0200 Subject: KVM: VMX: Fold ept_update_paging_mode_cr4() into its caller ept_update_paging_mode_cr4() accesses vcpu->arch.cr4 directly, which usually needs to be accessed via kvm_read_cr4(). In this case, we can't, since cr4 is in the process of being updated. Instead of adding inane comments, fold the function into its caller (vmx_set_cr4), so it can use the not-yet-committed cr4 directly. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a7ebaa5f2b65..694baed9bc8f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1680,16 +1680,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, *hw_cr0 &= ~X86_CR0_WP; } -static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, - struct kvm_vcpu *vcpu) -{ - if (!is_paging(vcpu)) { - *hw_cr4 &= ~X86_CR4_PAE; - *hw_cr4 |= X86_CR4_PSE; - } else if (!(vcpu->arch.cr4 & X86_CR4_PAE)) - *hw_cr4 &= ~X86_CR4_PAE; -} - static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1767,8 +1757,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; - if (enable_ept) - ept_update_paging_mode_cr4(&hw_cr4, vcpu); + if (enable_ept) { + if (!is_paging(vcpu)) { + hw_cr4 &= ~X86_CR4_PAE; + hw_cr4 |= X86_CR4_PSE; + } else if (!(cr4 & X86_CR4_PAE)) { + hw_cr4 &= ~X86_CR4_PAE; + } + } vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); -- cgit v1.2.2 From 8a7e3f01e692cd202fb7c042cf2be9ff8c599a1e Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 18 Dec 2009 16:48:42 +0800 Subject: KVM: VMX: Remove redundant variable It's no longer necessary. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 694baed9bc8f..f8f2fdc26894 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2399,14 +2399,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) for (i = 0; i < NR_VMX_MSR; ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; - u64 data; int j = vmx->nmsrs; if (rdmsr_safe(index, &data_low, &data_high) < 0) continue; if (wrmsr_safe(index, data_low, data_high) < 0) continue; - data = data_low | ((u64)data_high << 32); vmx->guest_msrs[j].index = i; vmx->guest_msrs[j].data = 0; vmx->guest_msrs[j].mask = -1ull; -- cgit v1.2.2 From 2bf78fa7b9b0d2917fd6587eadb3c0f6bbaf1718 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 18 Dec 2009 16:48:44 +0800 Subject: KVM: Extended shared_msr_global to per CPU shared_msr_global saved host value of relevant MSRs, but it have an assumption that all MSRs it tracked shared the value across the different CPUs. It's not true with some MSRs, e.g. MSR_TSC_AUX. Extend it to per CPU to provide the support of MSR_TSC_AUX, and more alike MSRs. Notice now the shared_msr_global still have one assumption: it can only deal with the MSRs that won't change in host after KVM module loaded. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 55 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 22 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84dd33e717fd..4e7bbc49b7e4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -93,16 +93,16 @@ module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); struct kvm_shared_msrs_global { int nr; - struct kvm_shared_msr { - u32 msr; - u64 value; - } msrs[KVM_NR_SHARED_MSRS]; + u32 msrs[KVM_NR_SHARED_MSRS]; }; struct kvm_shared_msrs { struct user_return_notifier urn; bool registered; - u64 current_value[KVM_NR_SHARED_MSRS]; + struct kvm_shared_msr_values { + u64 host; + u64 curr; + } values[KVM_NR_SHARED_MSRS]; }; static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; @@ -147,53 +147,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; - struct kvm_shared_msr *global; struct kvm_shared_msrs *locals = container_of(urn, struct kvm_shared_msrs, urn); + struct kvm_shared_msr_values *values; for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - global = &shared_msrs_global.msrs[slot]; - if (global->value != locals->current_value[slot]) { - wrmsrl(global->msr, global->value); - locals->current_value[slot] = global->value; + values = &locals->values[slot]; + if (values->host != values->curr) { + wrmsrl(shared_msrs_global.msrs[slot], values->host); + values->curr = values->host; } } locals->registered = false; user_return_notifier_unregister(urn); } -void kvm_define_shared_msr(unsigned slot, u32 msr) +static void shared_msr_update(unsigned slot, u32 msr) { - int cpu; + struct kvm_shared_msrs *smsr; u64 value; + smsr = &__get_cpu_var(shared_msrs); + /* only read, and nobody should modify it at this time, + * so don't need lock */ + if (slot >= shared_msrs_global.nr) { + printk(KERN_ERR "kvm: invalid MSR slot!"); + return; + } + rdmsrl_safe(msr, &value); + smsr->values[slot].host = value; + smsr->values[slot].curr = value; +} + +void kvm_define_shared_msr(unsigned slot, u32 msr) +{ if (slot >= shared_msrs_global.nr) shared_msrs_global.nr = slot + 1; - shared_msrs_global.msrs[slot].msr = msr; - rdmsrl_safe(msr, &value); - shared_msrs_global.msrs[slot].value = value; - for_each_online_cpu(cpu) - per_cpu(shared_msrs, cpu).current_value[slot] = value; + shared_msrs_global.msrs[slot] = msr; + /* we need ensured the shared_msr_global have been updated */ + smp_wmb(); } EXPORT_SYMBOL_GPL(kvm_define_shared_msr); static void kvm_shared_msr_cpu_online(void) { unsigned i; - struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs); for (i = 0; i < shared_msrs_global.nr; ++i) - locals->current_value[i] = shared_msrs_global.msrs[i].value; + shared_msr_update(i, shared_msrs_global.msrs[i]); } void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); - if (((value ^ smsr->current_value[slot]) & mask) == 0) + if (((value ^ smsr->values[slot].curr) & mask) == 0) return; - smsr->current_value[slot] = value; - wrmsrl(shared_msrs_global.msrs[slot].msr, value); + smsr->values[slot].curr = value; + wrmsrl(shared_msrs_global.msrs[slot], value); if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); -- cgit v1.2.2 From 0e85188049afacdfce9c026144142264981bbabb Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 18 Dec 2009 16:48:46 +0800 Subject: KVM: Add cpuid_update() callback to kvm_x86_ops Sometime, we need to adjust some state in order to reflect guest CPUID setting, e.g. if we don't expose rdtscp to guest, we won't want to enable it on hardware. cpuid_update() is introduced for this purpose. Also export kvm_find_cpuid_entry() for later use. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 3 +++ 3 files changed, 15 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1d9b33843c80..41777e6d9761 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2852,6 +2852,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } +static void svm_cpuid_update(struct kvm_vcpu *vcpu) +{ +} + static const struct trace_print_flags svm_exit_reasons_str[] = { { SVM_EXIT_READ_CR0, "read_cr0" }, { SVM_EXIT_READ_CR3, "read_cr3" }, @@ -2976,6 +2980,8 @@ static struct kvm_x86_ops svm_x86_ops = { .exit_reasons_str = svm_exit_reasons_str, .gb_page_enable = svm_gb_page_enable, + + .cpuid_update = svm_cpuid_update, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f8f2fdc26894..75e8931e96c7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3988,6 +3988,10 @@ static bool vmx_gb_page_enable(void) return false; } +static void vmx_cpuid_update(struct kvm_vcpu *vcpu) +{ +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -4052,6 +4056,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .exit_reasons_str = vmx_exit_reasons_str, .gb_page_enable = vmx_gb_page_enable, + + .cpuid_update = vmx_cpuid_update, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4e7bbc49b7e4..e5ac21f992f0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1574,6 +1574,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, cpuid_fix_nx_cap(vcpu); r = 0; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); out_free: vfree(cpuid_entries); @@ -1596,6 +1597,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, goto out; vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); return 0; out: @@ -3733,6 +3735,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, } return best; } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { -- cgit v1.2.2 From 4e47c7a6d714cf352b719db92a924b6ec487acc5 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 18 Dec 2009 16:48:47 +0800 Subject: KVM: VMX: Add instruction rdtscp support for guest Before enabling, execution of "rdtscp" in guest would result in #UD. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 7 +++++++ arch/x86/kvm/vmx.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++--- arch/x86/kvm/x86.c | 3 ++- 3 files changed, 66 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 41777e6d9761..7f4e225feebf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2914,6 +2914,11 @@ static bool svm_gb_page_enable(void) return true; } +static bool svm_rdtscp_supported(void) +{ + return false; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -2982,6 +2987,8 @@ static struct kvm_x86_ops svm_x86_ops = { .gb_page_enable = svm_gb_page_enable, .cpuid_update = svm_cpuid_update, + + .rdtscp_supported = svm_rdtscp_supported, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 75e8931e96c7..74a66f0c00b4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -151,6 +151,8 @@ struct vcpu_vmx { ktime_t entry_time; s64 vnmi_blocked_time; u32 exit_reason; + + bool rdtscp_enabled; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -225,7 +227,7 @@ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif - MSR_EFER, MSR_K6_STAR, + MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) @@ -362,6 +364,12 @@ static inline int cpu_has_vmx_vpid(void) SECONDARY_EXEC_ENABLE_VPID; } +static inline int cpu_has_vmx_rdtscp(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDTSCP; +} + static inline int cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; @@ -893,6 +901,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); } +static bool vmx_rdtscp_supported(void) +{ + return cpu_has_vmx_rdtscp(); +} + /* * Swap MSR entry in host/guest MSR entry array. */ @@ -928,6 +941,9 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_TSC_AUX); + if (index >= 0 && vmx->rdtscp_enabled) + move_msr_up(vmx, index, save_nmsrs++); /* * MSR_K6_STAR is only needed on long mode guests, and only * if efer.sce is enabled. @@ -1017,6 +1033,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_TSC_AUX: + if (!to_vmx(vcpu)->rdtscp_enabled) + return 1; + /* Otherwise falls through */ default: vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); @@ -1080,7 +1100,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vcpu->arch.pat = data; break; } - /* Otherwise falls through to kvm_set_msr_common */ + ret = kvm_set_msr_common(vcpu, msr_index, data); + break; + case MSR_TSC_AUX: + if (!vmx->rdtscp_enabled) + return 1; + /* Check reserved bit, higher 32 bits should be zero */ + if ((data >> 32) != 0) + return 1; + /* Otherwise falls through */ default: msr = find_msr_entry(vmx, msr_index); if (msr) { @@ -1260,7 +1288,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_PAUSE_LOOP_EXITING; + SECONDARY_EXEC_PAUSE_LOOP_EXITING | + SECONDARY_EXEC_RDTSCP; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -3988,8 +4017,31 @@ static bool vmx_gb_page_enable(void) return false; } +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { + struct kvm_cpuid_entry2 *best; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exec_control; + + vmx->rdtscp_enabled = false; + if (vmx_rdtscp_supported()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + if (exec_control & SECONDARY_EXEC_RDTSCP) { + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) + vmx->rdtscp_enabled = true; + else { + exec_control &= ~SECONDARY_EXEC_RDTSCP; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } + } + } } static struct kvm_x86_ops vmx_x86_ops = { @@ -4058,6 +4110,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .gb_page_enable = vmx_gb_page_enable, .cpuid_update = vmx_cpuid_update, + + .rdtscp_supported = vmx_rdtscp_supported, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5ac21f992f0..8798504ace11 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1646,6 +1646,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #else unsigned f_lm = 0; #endif + unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -1665,7 +1666,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | + F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = -- cgit v1.2.2 From 953899b659adce62cbe83d6a7527550ab8797c48 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 14 Dec 2009 12:22:20 +0100 Subject: KVM: SVM: Adjust tsc_offset only if tsc_unstable The tsc_offset adjustment in svm_vcpu_load is executed unconditionally even if Linux considers the host tsc as stable. This causes a Linux guest detecting an unstable tsc in any case. This patch removes the tsc_offset adjustment if the host tsc is stable. The guest will now get the benefit of a stable tsc too. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7f4e225feebf..b373ae6fb974 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -765,14 +765,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(cpu != vcpu->cpu)) { u64 delta; - /* - * Make sure that the guest sees a monotonically - * increasing TSC. - */ - delta = vcpu->arch.host_tsc - native_read_tsc(); - svm->vmcb->control.tsc_offset += delta; - if (is_nested(svm)) - svm->nested.hsave->control.tsc_offset += delta; + if (check_tsc_unstable()) { + /* + * Make sure that the guest sees a monotonically + * increasing TSC. + */ + delta = vcpu->arch.host_tsc - native_read_tsc(); + svm->vmcb->control.tsc_offset += delta; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += delta; + } vcpu->cpu = cpu; kvm_migrate_timers(vcpu); svm->asid_generation = 0; -- cgit v1.2.2 From 50eb2a3cd0f50d912b26d0b79b7f443344608390 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 20 Dec 2009 15:00:10 +0200 Subject: KVM: Add KVM_MMIO kconfig item s390 doesn't have mmio, this will simplify ifdefing it out. Signed-off-by: Avi Kivity --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 4cd498332466..06871111bf54 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -29,6 +29,7 @@ config KVM select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE select USER_RETURN_NOTIFIER + select KVM_MMIO ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent -- cgit v1.2.2 From 46a26bf55714c1e2f17e34683292a389acb8e601 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:16 -0200 Subject: KVM: modify memslots layout in struct kvm Have a pointer to an allocated region inside struct kvm. [alex: fix ppc book 3s] Signed-off-by: Alexander Graf Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 11 ++++++----- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4f499d7f7106..81f84d326a84 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -807,13 +807,14 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, { int i, j; int retval = 0; + struct kvm_memslots *slots = kvm->memslots; /* * If mmap_sem isn't taken, we can look the memslots with only * the mmu_lock by skipping over the slots with userspace_addr == 0. */ - for (i = 0; i < kvm->nmemslots; i++) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + for (i = 0; i < slots->nmemslots; i++) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; unsigned long start = memslot->userspace_addr; unsigned long end; @@ -3021,8 +3022,8 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) unsigned int nr_mmu_pages; unsigned int nr_pages = 0; - for (i = 0; i < kvm->nmemslots; i++) - nr_pages += kvm->memslots[i].npages; + for (i = 0; i < kvm->memslots->nmemslots; i++) + nr_pages += kvm->memslots->memslots[i].npages; nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, @@ -3295,7 +3296,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) int i, j, k; for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_memory_slot *m = &vcpu->kvm->memslots->memslots[i]; struct kvm_rmap_desc *d; for (j = 0; j < m->npages; ++j) { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 74a66f0c00b4..18698799e365 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1503,8 +1503,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) static gva_t rmode_tss_base(struct kvm *kvm) { if (!kvm->arch.tss_addr) { - gfn_t base_gfn = kvm->memslots[0].base_gfn + - kvm->memslots[0].npages - 3; + gfn_t base_gfn = kvm->memslots->memslots[0].base_gfn + + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } return kvm->arch.tss_addr; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8798504ace11..3b81cb9da8b8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2427,7 +2427,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; memset(memslot->dirty_bitmap, 0, n); } @@ -5223,7 +5223,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, int user_alloc) { int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + struct kvm_memory_slot *memslot = &kvm->memslots->memslots[mem->slot]; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. -- cgit v1.2.2 From fef9cce0eb28a67e688a411cc30b73625e49002b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:17 -0200 Subject: KVM: modify alias layout in x86s struct kvm_arch Have a pointer to an allocated region inside x86's kvm_arch. Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b81cb9da8b8..1ce833191430 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2227,9 +2227,10 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { int i; struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases = kvm->arch.aliases; - for (i = 0; i < kvm->arch.naliases; ++i) { - alias = &kvm->arch.aliases[i]; + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; if (gfn >= alias->base_gfn && gfn < alias->base_gfn + alias->npages) return alias->target_gfn + gfn - alias->base_gfn; @@ -2247,6 +2248,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, { int r, n; struct kvm_mem_alias *p; + struct kvm_mem_aliases *aliases; r = -EINVAL; /* General sanity checks */ @@ -2266,15 +2268,17 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, down_write(&kvm->slots_lock); spin_lock(&kvm->mmu_lock); - p = &kvm->arch.aliases[alias->slot]; + aliases = kvm->arch.aliases; + + p = &aliases->aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; p->npages = alias->memory_size >> PAGE_SHIFT; p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (kvm->arch.aliases[n - 1].npages) + if (aliases->aliases[n - 1].npages) break; - kvm->arch.naliases = n; + aliases->naliases = n; spin_unlock(&kvm->mmu_lock); kvm_mmu_zap_all(kvm); @@ -5158,6 +5162,12 @@ struct kvm *kvm_arch_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); + kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!kvm->arch.aliases) { + kfree(kvm); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5214,6 +5224,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); + kfree(kvm->arch.aliases); kfree(kvm); } -- cgit v1.2.2 From f7784b8ec9b6a041fa828cfbe9012fe51933f5ac Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:18 -0200 Subject: KVM: split kvm_arch_set_memory_region into prepare and commit Required for SRCU convertion later. Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 51 +++++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 22 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1ce833191430..43da65feed49 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5228,13 +5228,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm); } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, int user_alloc) { - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots->memslots[mem->slot]; + int npages = memslot->npages; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -5254,26 +5254,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm, if (IS_ERR((void *)userspace_addr)) return PTR_ERR((void *)userspace_addr); - /* set userspace_addr atomically for kvm_hva_to_rmapp */ - spin_lock(&kvm->mmu_lock); memslot->userspace_addr = userspace_addr; - spin_unlock(&kvm->mmu_lock); - } else { - if (!old.user_alloc && old.rmap) { - int ret; - - down_write(¤t->mm->mmap_sem); - ret = do_munmap(current->mm, old.userspace_addr, - old.npages * PAGE_SIZE); - up_write(¤t->mm->mmap_sem); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } } } + + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + + int npages = mem->memory_size >> PAGE_SHIFT; + + if (!user_alloc && !old.user_alloc && old.rmap && !npages) { + int ret; + + down_write(¤t->mm->mmap_sem); + ret = do_munmap(current->mm, old.userspace_addr, + old.npages * PAGE_SIZE); + up_write(¤t->mm->mmap_sem); + if (ret < 0) + printk(KERN_WARNING + "kvm_vm_ioctl_set_memory_region: " + "failed to munmap memory\n"); + } + spin_lock(&kvm->mmu_lock); if (!kvm->arch.n_requested_mmu_pages) { unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); @@ -5282,8 +5291,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm, kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); - - return 0; } void kvm_arch_flush_shadow(struct kvm *kvm) -- cgit v1.2.2 From bc6678a33d9b952981a8e44a4f876c3ad64ca4d8 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:21 -0200 Subject: KVM: introduce kvm->srcu and convert kvm_set_memory_region to SRCU update Use two steps for memslot deletion: mark the slot invalid (which stops instantiation of new shadow pages for that slot, but allows destruction), then instantiate the new empty slot. Also simplifies kvm_handle_hva locking. Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 28 ++++++++++++++-------------- arch/x86/kvm/vmx.c | 6 +++++- 2 files changed, 19 insertions(+), 15 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 81f84d326a84..f8bf42a25995 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -807,21 +808,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, { int i, j; int retval = 0; - struct kvm_memslots *slots = kvm->memslots; + struct kvm_memslots *slots; + + slots = rcu_dereference(kvm->memslots); - /* - * If mmap_sem isn't taken, we can look the memslots with only - * the mmu_lock by skipping over the slots with userspace_addr == 0. - */ for (i = 0; i < slots->nmemslots; i++) { struct kvm_memory_slot *memslot = &slots->memslots[i]; unsigned long start = memslot->userspace_addr; unsigned long end; - /* mmu_lock protects userspace_addr */ - if (!start) - continue; - end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; @@ -1617,7 +1612,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) { - int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); + int slot = memslot_id(kvm, gfn); struct kvm_mmu_page *sp = page_header(__pa(pte)); __set_bit(slot, sp->slot_bitmap); @@ -3021,9 +3016,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) int i; unsigned int nr_mmu_pages; unsigned int nr_pages = 0; + struct kvm_memslots *slots; - for (i = 0; i < kvm->memslots->nmemslots; i++) - nr_pages += kvm->memslots->memslots[i].npages; + slots = rcu_dereference(kvm->memslots); + for (i = 0; i < slots->nmemslots; i++) + nr_pages += slots->memslots[i].npages; nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, @@ -3293,10 +3290,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu) static int count_rmaps(struct kvm_vcpu *vcpu) { int nmaps = 0; - int i, j, k; + int i, j, k, idx; + idx = srcu_read_lock(&kvm->srcu); + slots = rcu_dereference(kvm->memslots); for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &vcpu->kvm->memslots->memslots[i]; + struct kvm_memory_slot *m = &slots->memslots[i]; struct kvm_rmap_desc *d; for (j = 0; j < m->npages; ++j) { @@ -3319,6 +3318,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) } } } + srcu_read_unlock(&kvm->srcu, idx); return nmaps; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 18698799e365..f1cae7d6113d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1503,7 +1503,11 @@ static void enter_pmode(struct kvm_vcpu *vcpu) static gva_t rmode_tss_base(struct kvm *kvm) { if (!kvm->arch.tss_addr) { - gfn_t base_gfn = kvm->memslots->memslots[0].base_gfn + + struct kvm_memslots *slots; + gfn_t base_gfn; + + slots = rcu_dereference(kvm->memslots); + base_gfn = kvm->memslots->memslots[0].base_gfn + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } -- cgit v1.2.2 From b050b015abbef8225826eecb6f6b4d4a6dea7b79 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:22 -0200 Subject: KVM: use SRCU for dirty log Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 49 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 43da65feed49..e7488350ca16 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2415,27 +2415,60 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r; - int n; + int r, n, i; struct kvm_memory_slot *memslot; - int is_dirty = 0; + unsigned long is_dirty = 0; + unsigned long *dirty_bitmap = NULL; down_write(&kvm->slots_lock); - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) goto out; + memset(dirty_bitmap, 0, n); + + for (i = 0; !is_dirty && i < n/sizeof(long); i++) + is_dirty = memslot->dirty_bitmap[i]; /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { + struct kvm_memslots *slots, *old_slots; + spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - memslot = &kvm->memslots->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - memset(memslot->dirty_bitmap, 0, n); + + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + + old_slots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; + kfree(old_slots); } + r = 0; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) + r = -EFAULT; +out_free: + vfree(dirty_bitmap); out: up_write(&kvm->slots_lock); return r; -- cgit v1.2.2 From a983fb238728e1123177e8058d4f644b949a7d05 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:23 -0200 Subject: KVM: x86: switch kvm_set_memory_alias to SRCU update Using a similar two-step procedure as for memslots. Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 9 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e7488350ca16..28127c936c3b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS @@ -2223,11 +2224,32 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) return kvm->arch.n_alloc_mmu_pages; } +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; + + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (alias->flags & KVM_ALIAS_INVALID) + continue; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { int i; struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases = kvm->arch.aliases; + struct kvm_mem_aliases *aliases; + + aliases = rcu_dereference(kvm->arch.aliases); for (i = 0; i < aliases->naliases; ++i) { alias = &aliases->aliases[i]; @@ -2248,7 +2270,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, { int r, n; struct kvm_mem_alias *p; - struct kvm_mem_aliases *aliases; + struct kvm_mem_aliases *aliases, *old_aliases; r = -EINVAL; /* General sanity checks */ @@ -2265,28 +2287,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, < alias->target_phys_addr) goto out; + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out; + down_write(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); - aliases = kvm->arch.aliases; + /* invalidate any gfn reference in case of deletion/shrinking */ + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kvm_mmu_zap_all(kvm); + kfree(old_aliases); + + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out_unlock; + + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); p = &aliases->aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; p->npages = alias->memory_size >> PAGE_SHIFT; p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + p->flags &= ~(KVM_ALIAS_INVALID); for (n = KVM_ALIAS_SLOTS; n > 0; --n) if (aliases->aliases[n - 1].npages) break; aliases->naliases = n; - spin_unlock(&kvm->mmu_lock); - kvm_mmu_zap_all(kvm); + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kfree(old_aliases); + r = 0; +out_unlock: up_write(&kvm->slots_lock); - - return 0; - out: return r; } -- cgit v1.2.2 From e93f8a0f821e290ac5149830110a5f704db7a1fc Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:24 -0200 Subject: KVM: convert io_bus to SRCU Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 6 +++--- arch/x86/kvm/i8259.c | 4 +++- arch/x86/kvm/x86.c | 13 +++++++------ 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 15578f180e59..4b433de02e5b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -645,13 +645,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); if (ret < 0) goto fail; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); if (ret < 0) goto fail_unregister; @@ -660,7 +660,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) return pit; fail_unregister: - __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail: if (pit->irq_source_id >= 0) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index d057c0cbd245..b7d145b20953 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -533,7 +533,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) * Initialize PIO device */ kvm_iodevice_init(&s->dev, &picdev_ops); - ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); + down_write(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); + up_write(&kvm->slots_lock); if (ret < 0) { kfree(s); return NULL; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 28127c936c3b..9b42673df4af 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2851,7 +2851,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) @@ -2860,7 +2860,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, @@ -3345,11 +3345,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) int r; if (vcpu->arch.pio.in) - r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); else - r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); return r; } @@ -3360,7 +3361,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu) int i, r = 0; for (i = 0; i < io->cur_count; i++) { - if (kvm_io_bus_write(&vcpu->kvm->pio_bus, + if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, io->port, io->size, pd)) { r = -EOPNOTSUPP; break; -- cgit v1.2.2 From f656ce0185cabbbb0cf96877306879661297c7ad Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:25 -0200 Subject: KVM: switch vcpu context to use SRCU Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 7 +++---- arch/x86/kvm/vmx.c | 6 +++--- arch/x86/kvm/x86.c | 43 ++++++++++++++++++++++++------------------- 3 files changed, 30 insertions(+), 26 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f8bf42a25995..25aabd00aa01 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2933,10 +2933,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int npages; + int npages, idx; - if (!down_read_trylock(&kvm->slots_lock)) - continue; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); npages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; @@ -2949,7 +2948,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) nr_to_scan--; spin_unlock(&kvm->mmu_lock); - up_read(&kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, idx); } if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f1cae7d6113d..22ab7137d1d0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2478,10 +2478,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 msr; - int ret; + int ret, idx; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; @@ -2589,7 +2589,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->emulation_required = 0; out: - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9b42673df4af..53bc06a68105 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1306,15 +1306,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { - int i; + int i, idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); vcpu_put(vcpu); @@ -3900,14 +3900,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu) static void vapic_exit(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + int idx; if (!apic || !apic->vapic_addr) return; - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_release_page_dirty(apic->vapic_page); mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -4036,7 +4037,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_sync_to_vapic(vcpu); } - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); @@ -4078,7 +4079,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_enable(); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); /* * Profile KVM exit RIPs: @@ -4100,6 +4101,7 @@ out: static int __vcpu_run(struct kvm_vcpu *vcpu) { int r; + struct kvm *kvm = vcpu->kvm; if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { pr_debug("vcpu %d received sipi with vector # %x\n", @@ -4111,7 +4113,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); vapic_enter(vcpu); r = 1; @@ -4119,9 +4121,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) r = vcpu_enter_guest(vcpu); else { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) { switch(vcpu->arch.mp_state) { @@ -4156,13 +4158,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) ++vcpu->stat.signal_exits; } if (need_resched()) { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); post_kvm_run_save(vcpu); vapic_exit(vcpu); @@ -4201,10 +4203,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->mmio_read_completed = 1; vcpu->mmio_needed = 0; - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, EMULTYPE_NO_DECODE); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r == EMULATE_DO_MMIO) { /* * Read-modify-write. Back to userspace. @@ -4967,11 +4969,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, { unsigned long vaddr = tr->linear_address; gpa_t gpa; + int idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; @@ -5223,11 +5226,13 @@ fail: void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { + int idx; + kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); } -- cgit v1.2.2 From 79fac95ecfa3969aab8119d37ccd7226165f933a Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 23 Dec 2009 14:35:26 -0200 Subject: KVM: convert slots_lock to a mutex Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 2 +- arch/x86/kvm/i8259.c | 4 ++-- arch/x86/kvm/vmx.c | 8 ++++---- arch/x86/kvm/x86.c | 16 ++++++++-------- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4b433de02e5b..6a74246f80c6 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -605,7 +605,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = { .write = speaker_ioport_write, }; -/* Caller must have writers lock on slots_lock */ +/* Caller must hold slots_lock */ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index b7d145b20953..d5753a75d58c 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -533,9 +533,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) * Initialize PIO device */ kvm_iodevice_init(&s->dev, &picdev_ops); - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); if (ret < 0) { kfree(s); return NULL; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 22ab7137d1d0..f04e2ff21383 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2223,7 +2223,7 @@ static int alloc_apic_access_page(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; @@ -2236,7 +2236,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2245,7 +2245,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.ept_identity_pagetable) goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; @@ -2260,7 +2260,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 53bc06a68105..aff3479867a8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2208,14 +2208,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); spin_lock(&kvm->mmu_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; spin_unlock(&kvm->mmu_lock); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return 0; } @@ -2292,7 +2292,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, if (!aliases) goto out; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); /* invalidate any gfn reference in case of deletion/shrinking */ memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); @@ -2328,7 +2328,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, r = 0; out_unlock: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); out: return r; } @@ -2462,7 +2462,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, unsigned long is_dirty = 0; unsigned long *dirty_bitmap = NULL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = -EINVAL; if (log->slot >= KVM_MEMORY_SLOTS) @@ -2512,7 +2512,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, out_free: vfree(dirty_bitmap); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2625,7 +2625,7 @@ long kvm_arch_vm_ioctl(struct file *filp, sizeof(struct kvm_pit_config))) goto out; create_pit: - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; @@ -2634,7 +2634,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpit) r = 0; create_pit_unlock: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); break; case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { -- cgit v1.2.2 From f4c9e87c83a9f5bc1800db27dbb39e5cd1254c0a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 28 Dec 2009 16:06:35 +0200 Subject: KVM: Fill out ftrace exit reason strings Some exit reasons missed their strings; fill out the table. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 58 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 19 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f04e2ff21383..f4486f460278 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3993,29 +3993,49 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return ret; } +#define _ER(x) { EXIT_REASON_##x, #x } + static const struct trace_print_flags vmx_exit_reasons_str[] = { - { EXIT_REASON_EXCEPTION_NMI, "exception" }, - { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, - { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, - { EXIT_REASON_NMI_WINDOW, "nmi_window" }, - { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, - { EXIT_REASON_CR_ACCESS, "cr_access" }, - { EXIT_REASON_DR_ACCESS, "dr_access" }, - { EXIT_REASON_CPUID, "cpuid" }, - { EXIT_REASON_MSR_READ, "rdmsr" }, - { EXIT_REASON_MSR_WRITE, "wrmsr" }, - { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, - { EXIT_REASON_HLT, "halt" }, - { EXIT_REASON_INVLPG, "invlpg" }, - { EXIT_REASON_VMCALL, "hypercall" }, - { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, - { EXIT_REASON_APIC_ACCESS, "apic_access" }, - { EXIT_REASON_WBINVD, "wbinvd" }, - { EXIT_REASON_TASK_SWITCH, "task_switch" }, - { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, + _ER(EXCEPTION_NMI), + _ER(EXTERNAL_INTERRUPT), + _ER(TRIPLE_FAULT), + _ER(PENDING_INTERRUPT), + _ER(NMI_WINDOW), + _ER(TASK_SWITCH), + _ER(CPUID), + _ER(HLT), + _ER(INVLPG), + _ER(RDPMC), + _ER(RDTSC), + _ER(VMCALL), + _ER(VMCLEAR), + _ER(VMLAUNCH), + _ER(VMPTRLD), + _ER(VMPTRST), + _ER(VMREAD), + _ER(VMRESUME), + _ER(VMWRITE), + _ER(VMOFF), + _ER(VMON), + _ER(CR_ACCESS), + _ER(DR_ACCESS), + _ER(IO_INSTRUCTION), + _ER(MSR_READ), + _ER(MSR_WRITE), + _ER(MWAIT_INSTRUCTION), + _ER(MONITOR_INSTRUCTION), + _ER(PAUSE_INSTRUCTION), + _ER(MCE_DURING_VMENTRY), + _ER(TPR_BELOW_THRESHOLD), + _ER(APIC_ACCESS), + _ER(EPT_VIOLATION), + _ER(EPT_MISCONFIG), + _ER(WBINVD), { -1, NULL } }; +#undef _ER + static bool vmx_gb_page_enable(void) { return false; -- cgit v1.2.2 From c9c5417455b0c2e3c164883354b5480e5aee3b36 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 5 Jan 2010 19:02:26 +0800 Subject: KVM: x86: Moving PT_*_LEVEL to mmu.h We can use them in x86.c and vmx.c now... Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 4 ---- arch/x86/kvm/mmu.h | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 25aabd00aa01..12ccf14f8539 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -143,10 +143,6 @@ module_param(oos_shadow, bool, 0644); #define PFERR_RSVD_MASK (1U << 3) #define PFERR_FETCH_MASK (1U << 4) -#define PT_PDPE_LEVEL 3 -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 - #define RMAP_EXT 4 #define ACC_EXEC_MASK 1 diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 4567d8042b22..ff583423968d 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -38,6 +38,10 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +#define PT_PDPE_LEVEL 3 +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From 17cc393596823f4bbab81e68a9e23e7beadbcfca Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 5 Jan 2010 19:02:27 +0800 Subject: KVM: x86: Rename gb_page_enable() to get_lpage_level() in kvm_x86_ops Then the callback can provide the maximum supported large page level, which is more flexible. Also move the gb page support into x86_64 specific. Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 6 +++--- arch/x86/kvm/vmx.c | 6 +++--- arch/x86/kvm/x86.c | 4 +++- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b373ae6fb974..cf64fc026e3e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2911,9 +2911,9 @@ static const struct trace_print_flags svm_exit_reasons_str[] = { { -1, NULL } }; -static bool svm_gb_page_enable(void) +static int svm_get_lpage_level(void) { - return true; + return PT_PDPE_LEVEL; } static bool svm_rdtscp_supported(void) @@ -2986,7 +2986,7 @@ static struct kvm_x86_ops svm_x86_ops = { .get_mt_mask = svm_get_mt_mask, .exit_reasons_str = svm_exit_reasons_str, - .gb_page_enable = svm_gb_page_enable, + .get_lpage_level = svm_get_lpage_level, .cpuid_update = svm_cpuid_update, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f4486f460278..0fd0892553ec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4036,9 +4036,9 @@ static const struct trace_print_flags vmx_exit_reasons_str[] = { #undef _ER -static bool vmx_gb_page_enable(void) +static int vmx_get_lpage_level(void) { - return false; + return PT_DIRECTORY_LEVEL; } static inline u32 bit(int bitno) @@ -4131,7 +4131,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_mt_mask = vmx_get_mt_mask, .exit_reasons_str = vmx_exit_reasons_str, - .gb_page_enable = vmx_gb_page_enable, + .get_lpage_level = vmx_get_lpage_level, .cpuid_update = vmx_cpuid_update, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aff3479867a8..c990424d86d0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1641,10 +1641,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { unsigned f_nx = is_efer_nx() ? F(NX) : 0; - unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; #ifdef CONFIG_X86_64 + unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) + ? F(GBPAGES) : 0; unsigned f_lm = F(LM); #else + unsigned f_gbpages = 0; unsigned f_lm = 0; #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; -- cgit v1.2.2 From 878403b788bff1af9c7f1a61e104f0c77115af29 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 5 Jan 2010 19:02:29 +0800 Subject: KVM: VMX: Enable EPT 1GB page support Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 8 +++++--- arch/x86/kvm/vmx.c | 11 ++++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 12ccf14f8539..4f5508c35100 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -500,8 +500,7 @@ out: static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - int host_level; - int level = PT_PAGE_TABLE_LEVEL; + int host_level, level, max_level; slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) @@ -512,7 +511,10 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) + max_level = kvm_x86_ops->get_lpage_level() < host_level ? + kvm_x86_ops->get_lpage_level() : host_level; + + for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) break; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0fd0892553ec..9b197b25b66d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -318,6 +318,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void) return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); } +static inline bool cpu_has_vmx_ept_1g_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); +} + static inline int cpu_has_vmx_invept_individual_addr(void) { return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); @@ -4038,7 +4043,11 @@ static const struct trace_print_flags vmx_exit_reasons_str[] = { static int vmx_get_lpage_level(void) { - return PT_DIRECTORY_LEVEL; + if (enable_ept && !cpu_has_vmx_ept_1g_page()) + return PT_DIRECTORY_LEVEL; + else + /* For shadow and EPT supported 1GB page */ + return PT_PDPE_LEVEL; } static inline u32 bit(int bitno) -- cgit v1.2.2 From a1f83a74feaa9718a5c61587256ea6cc1b993d16 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 29 Dec 2009 17:33:58 +0200 Subject: KVM: VMX: trace clts and lmsw instructions as cr accesses clts writes cr0.ts; lmsw writes cr0[0:15] - record that in ftrace. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9b197b25b66d..7c7b2eeea5d0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2997,6 +2997,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) vmx_fpu_deactivate(vcpu); vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); + trace_kvm_cr_write(0, vcpu->arch.cr0); vmx_fpu_activate(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -3016,7 +3017,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 3: /* lmsw */ - kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; + trace_kvm_cr_write(0, (vcpu->arch.cr0 & ~0xful) | val); + kvm_lmsw(vcpu, val); skip_emulated_instruction(vcpu); return 1; -- cgit v1.2.2 From 4d4ec0874583b127caac1d0f84033c8971b2fd2a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 29 Dec 2009 18:07:30 +0200 Subject: KVM: Replace read accesses of vcpu->arch.cr0 by an accessor Since we'd like to allow the guest to own a few bits of cr0 at times, we need to know when we access those bits. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 +++--- arch/x86/kvm/kvm_cache_regs.h | 10 ++++++++++ arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/mmu.h | 2 +- arch/x86/kvm/svm.c | 9 +++++---- arch/x86/kvm/vmx.c | 16 ++++++++-------- arch/x86/kvm/x86.c | 20 ++++++++++---------- 7 files changed, 38 insertions(+), 27 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7e8faea4651e..0f89e320bc96 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1515,7 +1515,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) /* syscall is not available in real mode */ if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) + || !kvm_read_cr0_bits(ctxt->vcpu, X86_CR0_PE)) return -1; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1569,7 +1569,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) /* inject #GP if in real mode or paging is disabled */ if (ctxt->mode == X86EMUL_MODE_REAL || - !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + !kvm_read_cr0_bits(ctxt->vcpu, X86_CR0_PE)) { kvm_inject_gp(ctxt->vcpu, 0); return -1; } @@ -1635,7 +1635,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) /* inject #GP if in real mode or paging is disabled */ if (ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + || !kvm_read_cr0_bits(ctxt->vcpu, X86_CR0_PE)) { kvm_inject_gp(ctxt->vcpu, 0); return -1; } diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 35acc36e1782..f46859751b30 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -38,6 +38,16 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.pdptrs[index]; } +static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + return vcpu->arch.cr0 & mask; +} + +static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, ~0UL); +} + static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) { if (mask & vcpu->arch.cr4_guest_owned_bits) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4f5508c35100..276bf7497c36 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -226,7 +226,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); static int is_write_protection(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr0 & X86_CR0_WP; + return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } static int is_cpuid_PSE36(void) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index ff583423968d..599159f728b9 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -79,7 +79,7 @@ static inline int is_pse(struct kvm_vcpu *vcpu) static inline int is_paging(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr0 & X86_CR0_PG; + return kvm_read_cr0_bits(vcpu, X86_CR0_PG); } static inline int is_present_gpte(unsigned long pte) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cf64fc026e3e..d3246ce70ae8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -980,7 +980,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (npt_enabled) goto set; - if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { + if (kvm_read_cr0_bits(vcpu, X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); vcpu->fpu_active = 1; } @@ -1244,7 +1244,7 @@ static int ud_interception(struct vcpu_svm *svm) static int nm_interception(struct vcpu_svm *svm) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) + if (!kvm_read_cr0_bits(&svm->vcpu, X86_CR0_TS)) svm->vmcb->save.cr0 &= ~X86_CR0_TS; svm->vcpu.fpu_active = 1; @@ -1743,7 +1743,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.gdtr = vmcb->save.gdtr; hsave->save.idtr = vmcb->save.idtr; hsave->save.efer = svm->vcpu.arch.shadow_efer; - hsave->save.cr0 = svm->vcpu.arch.cr0; + hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; hsave->save.rip = svm->next_rip; @@ -2387,7 +2387,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (npt_enabled) { int mmu_reload = 0; - if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { + if ((kvm_read_cr0_bits(vcpu, X86_CR0_PG) ^ svm->vmcb->save.cr0) + & X86_CR0_PG) { svm_set_cr0(vcpu, svm->vmcb->save.cr0); mmu_reload = 1; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7c7b2eeea5d0..4c7177c489ac 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -799,7 +799,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) return; vcpu->fpu_active = 1; vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); - if (vcpu->arch.cr0 & X86_CR0_TS) + if (kvm_read_cr0_bits(vcpu, X86_CR0_TS)) vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); } @@ -1785,7 +1785,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vmx_flush_tlb(vcpu); vmcs_writel(GUEST_CR3, guest_cr3); - if (vcpu->arch.cr0 & X86_CR0_PE) + if (kvm_read_cr0_bits(vcpu, X86_CR0_PE)) vmx_fpu_deactivate(vcpu); } @@ -1840,7 +1840,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ + if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) /* if real mode */ return 0; if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ @@ -2095,7 +2095,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) static bool guest_state_valid(struct kvm_vcpu *vcpu) { /* real mode guest state checks */ - if (!(vcpu->arch.cr0 & X86_CR0_PE)) { + if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) @@ -2580,7 +2580,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ + vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); vmx_set_efer(&vmx->vcpu, 0); vmx_fpu_activate(&vmx->vcpu); @@ -2996,8 +2996,8 @@ static int handle_cr(struct kvm_vcpu *vcpu) case 2: /* clts */ vmx_fpu_deactivate(vcpu); vcpu->arch.cr0 &= ~X86_CR0_TS; - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); - trace_kvm_cr_write(0, vcpu->arch.cr0); + vmcs_writel(CR0_READ_SHADOW, kvm_read_cr0(vcpu)); + trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); vmx_fpu_activate(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -3018,7 +3018,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) break; case 3: /* lmsw */ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; - trace_kvm_cr_write(0, (vcpu->arch.cr0 & ~0xful) | val); + trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); kvm_lmsw(vcpu, val); skip_emulated_instruction(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c990424d86d0..748b15d8e46d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -430,7 +430,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { if (cr0 & CR0_RESERVED_BITS) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, vcpu->arch.cr0); + cr0, kvm_read_cr0(vcpu)); kvm_inject_gp(vcpu, 0); return; } @@ -488,7 +488,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); + kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); @@ -3095,7 +3095,7 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); return X86EMUL_CONTINUE; } @@ -3714,7 +3714,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) switch (cr) { case 0: - value = vcpu->arch.cr0; + value = kvm_read_cr0(vcpu); break; case 2: value = vcpu->arch.cr2; @@ -3741,7 +3741,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, { switch (cr) { case 0: - kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); + kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); *rflags = kvm_get_rflags(vcpu); break; case 2: @@ -4335,7 +4335,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->gdt.limit = dt.limit; sregs->gdt.base = dt.base; - sregs->cr0 = vcpu->arch.cr0; + sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; sregs->cr3 = vcpu->arch.cr3; sregs->cr4 = kvm_read_cr4(vcpu); @@ -4521,7 +4521,7 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, { struct kvm_segment kvm_seg; - if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) + if (is_vm86_segment(vcpu, seg) || !(kvm_read_cr0_bits(vcpu, X86_CR0_PE))) return kvm_load_realmode_segment(vcpu, selector, seg); if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) return 1; @@ -4799,7 +4799,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) &nseg_desc); } - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS); seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); tr_seg.type = 11; kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); @@ -4834,7 +4834,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_x86_ops->set_efer(vcpu, sregs->efer); kvm_set_apic_base(vcpu, sregs->apic_base); - mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; + mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; @@ -4873,7 +4873,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Older userspace won't unhalt the vcpu on reset. */ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && - !(vcpu->arch.cr0 & X86_CR0_PE)) + !(kvm_read_cr0_bits(vcpu, X86_CR0_PE))) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; vcpu_put(vcpu); -- cgit v1.2.2 From e8467fda83cdc9de53972fee0cd2e6916cf66f41 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 29 Dec 2009 18:43:06 +0200 Subject: KVM: VMX: Allow the guest to own some cr0 bits We will use this later to give the guest ownership of cr0.ts. Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_cache_regs.h | 2 ++ arch/x86/kvm/svm.c | 5 +++++ arch/x86/kvm/vmx.c | 9 +++++++++ 3 files changed, 16 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index f46859751b30..6b419a36cbd9 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -40,6 +40,8 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { + if (mask & vcpu->arch.cr0_guest_owned_bits) + kvm_x86_ops->decache_cr0_guest_bits(vcpu); return vcpu->arch.cr0 & mask; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d3246ce70ae8..3899c2d19830 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -956,6 +956,10 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) svm->vmcb->save.gdtr.base = dt->base ; } +static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ +} + static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } @@ -2948,6 +2952,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, + .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4c7177c489ac..dbcdb55094f7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1653,6 +1653,14 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ + ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + + vcpu->arch.cr0 &= ~cr0_guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; +} + static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; @@ -4106,6 +4114,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, -- cgit v1.2.2 From 02daab21d94dc4cf01b2fd09863d59a436900322 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 30 Dec 2009 12:40:26 +0200 Subject: KVM: Lazify fpu activation and deactivation Defer fpu deactivation as much as possible - if the guest fpu is loaded, keep it loaded until the next heavyweight exit (where we are forced to unload it). This reduces unnecessary exits. We also defer fpu activation on clts; while clts signals the intent to use the fpu, we can't be sure the guest will actually use it. Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 35 +++++++++++++++++++++-------------- arch/x86/kvm/vmx.c | 25 +++++++++---------------- arch/x86/kvm/x86.c | 7 ++++++- 3 files changed, 36 insertions(+), 31 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3899c2d19830..5b336a80f31e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -984,17 +984,11 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (npt_enabled) goto set; - if (kvm_read_cr0_bits(vcpu, X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { - svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - vcpu->fpu_active = 1; - } - vcpu->arch.cr0 = cr0; cr0 |= X86_CR0_PG | X86_CR0_WP; - if (!vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); + + if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; - } set: /* * re-enable caching here because the QEMU bios @@ -1250,6 +1244,8 @@ static int nm_interception(struct vcpu_svm *svm) svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); if (!kvm_read_cr0_bits(&svm->vcpu, X86_CR0_TS)) svm->vmcb->save.cr0 &= ~X86_CR0_TS; + else + svm->vmcb->save.cr0 |= X86_CR0_TS; svm->vcpu.fpu_active = 1; return 1; @@ -2586,6 +2582,8 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu) static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { + if (npt_enabled) + vcpu->fpu_active = 1; } static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) @@ -2805,12 +2803,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = root; force_new_asid(vcpu); - - if (vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); - svm->vmcb->save.cr0 |= X86_CR0_TS; - vcpu->fpu_active = 0; - } } static int is_disabled(void) @@ -2926,6 +2918,20 @@ static bool svm_rdtscp_supported(void) return false; } +static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (npt_enabled) { + /* hack: npt requires active fpu at this time */ + vcpu->fpu_active = 1; + return; + } + + svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; + svm->vmcb->save.cr0 |= X86_CR0_TS; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -2967,6 +2973,7 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index dbcdb55094f7..d11be3fb7c80 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -66,7 +66,7 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); #define KVM_GUEST_CR0_MASK \ (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) + (X86_CR0_WP | X86_CR0_NE | X86_CR0_MP) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_CR4_GUEST_OWNED_BITS \ @@ -579,9 +579,8 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); - if (!vcpu->fpu_active) - eb |= 1u << NM_VECTOR; + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) + | (1u << NM_VECTOR); /* * Unconditionally intercept #DB so we can maintain dr6 without * reading it every exit. @@ -595,6 +594,8 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ + if (vcpu->fpu_active) + eb &= ~(1u << NM_VECTOR); vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -806,9 +807,6 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active) - return; - vcpu->fpu_active = 0; vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); } @@ -1737,8 +1735,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) else hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; - vmx_fpu_deactivate(vcpu); - if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -1757,12 +1753,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + if (!vcpu->fpu_active) + hw_cr0 |= X86_CR0_TS; + vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; - - if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) - vmx_fpu_activate(vcpu); } static u64 construct_eptp(unsigned long root_hpa) @@ -1793,8 +1789,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vmx_flush_tlb(vcpu); vmcs_writel(GUEST_CR3, guest_cr3); - if (kvm_read_cr0_bits(vcpu, X86_CR0_PE)) - vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -3002,11 +2996,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) }; break; case 2: /* clts */ - vmx_fpu_deactivate(vcpu); vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, kvm_read_cr0(vcpu)); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - vmx_fpu_activate(vcpu); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ @@ -4127,6 +4119,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 748b15d8e46d..1de2ad7a004d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1509,8 +1509,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); + kvm_x86_ops->vcpu_put(vcpu); } static int is_efer_nx(void) @@ -4006,6 +4006,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + vcpu->fpu_active = 0; + kvm_x86_ops->fpu_deactivate(vcpu); + } } preempt_disable(); @@ -5075,6 +5079,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) kvm_fx_save(&vcpu->arch.guest_fx_image); kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; + set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); } EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); -- cgit v1.2.2 From edcafe3c5a06f46407c3f60145a36f269e56ff7f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 30 Dec 2009 18:07:40 +0200 Subject: KVM: VMX: Give the guest ownership of cr0.ts when the fpu is active If the guest fpu is loaded, there is nothing interesing about cr0.ts; let the guest play with it as it will. This makes context switches between fpu intensive guest processes faster, as we won't trap the clts and cr0 write instructions. [marcelo: fix cr0 read shadow update on fpu deactivation; kills F8 install] Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d11be3fb7c80..fc1964d5e97c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -803,12 +803,20 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) if (kvm_read_cr0_bits(vcpu, X86_CR0_TS)) vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); + static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { + vmx_decache_cr0_guest_bits(vcpu); vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = 0; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); } static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -2996,8 +3004,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) }; break; case 2: /* clts */ - vcpu->arch.cr0 &= ~X86_CR0_TS; - vmcs_writel(CR0_READ_SHADOW, kvm_read_cr0(vcpu)); + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); skip_emulated_instruction(vcpu); return 1; -- cgit v1.2.2 From f9a48e6a18c210c4bf34769fa05ede250575c6a1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 6 Jan 2010 19:10:22 +0200 Subject: KVM: Set cr0.et when the guest writes cr0 Follow the hardware. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1de2ad7a004d..1ad34d185da9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -428,6 +428,8 @@ out: void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + cr0 |= X86_CR0_ET; + if (cr0 & CR0_RESERVED_BITS) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, kvm_read_cr0(vcpu)); -- cgit v1.2.2 From bff7827479ed004c0394e6e7b35ae601bb1a97ad Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 7 Jan 2010 13:16:08 +0200 Subject: KVM: SVM: Initialize fpu_active in init_vmcb() init_vmcb() sets up the intercepts as if the fpu is active, so initialize it there. This avoids an INIT from setting up intercepts inconsistent with fpu_active. Acked-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5b336a80f31e..949990e4a22e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -540,6 +540,8 @@ static void init_vmcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; + svm->vcpu.fpu_active = 1; + control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | INTERCEPT_CR4_MASK; @@ -730,7 +732,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) init_vmcb(svm); fx_init(&svm->vcpu); - svm->vcpu.fpu_active = 1; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; -- cgit v1.2.2 From 888f9f3e0cfa32baf05b3840f0248f5502292a0f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 10 Jan 2010 12:14:04 +0200 Subject: KVM: SVM: Restore unconditional cr0 intercept under npt Currently we don't intercept cr0 at all when npt is enabled. This improves performance but requires us to activate the fpu at all times. Remove this behaviour in preparation for adding selective cr0 intercepts. Acked-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 949990e4a22e..27273ed24c41 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -643,10 +643,8 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | (1ULL << INTERCEPT_INVLPG)); control->intercept_exceptions &= ~(1 << PF_VECTOR); - control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); - control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); + control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; + control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; save->g_pat = 0x0007040600070406ULL; save->cr3 = 0; save->cr4 = 0; @@ -982,15 +980,13 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } } #endif - if (npt_enabled) - goto set; - vcpu->arch.cr0 = cr0; - cr0 |= X86_CR0_PG | X86_CR0_WP; + + if (!npt_enabled) + cr0 |= X86_CR0_PG | X86_CR0_WP; if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; -set: /* * re-enable caching here because the QEMU bios * does not do it - this results in some delay at @@ -2386,21 +2382,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); - if (npt_enabled) { - int mmu_reload = 0; - if ((kvm_read_cr0_bits(vcpu, X86_CR0_PG) ^ svm->vmcb->save.cr0) - & X86_CR0_PG) { - svm_set_cr0(vcpu, svm->vmcb->save.cr0); - mmu_reload = 1; - } + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) vcpu->arch.cr0 = svm->vmcb->save.cr0; + if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (mmu_reload) { - kvm_mmu_reset_context(vcpu); - kvm_mmu_load(vcpu); - } - } - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; -- cgit v1.2.2 From d225157bc6a442b1214882635fbf287d7d0e8133 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 6 Jan 2010 10:55:27 +0200 Subject: KVM: SVM: Selective cr0 intercept If two conditions apply: - no bits outside TS and EM differ between the host and guest cr0 - the fpu is active then we can activate the selective cr0 write intercept and drop the unconditional cr0 read and write intercept, and allow the guest to run with the host fpu state. This reduces cr0 exits due to guest fpu management while the guest fpu is loaded. Acked-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 27273ed24c41..83c7ab1bdad8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -571,6 +571,7 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept = (1ULL << INTERCEPT_INTR) | (1ULL << INTERCEPT_NMI) | (1ULL << INTERCEPT_SMI) | + (1ULL << INTERCEPT_SELECTIVE_CR0) | (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_HLT) | @@ -963,6 +964,27 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } +static void update_cr0_intercept(struct vcpu_svm *svm) +{ + ulong gcr0 = svm->vcpu.arch.cr0; + u64 *hcr0 = &svm->vmcb->save.cr0; + + if (!svm->vcpu.fpu_active) + *hcr0 |= SVM_CR0_SELECTIVE_MASK; + else + *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) + | (gcr0 & SVM_CR0_SELECTIVE_MASK); + + + if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { + svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; + } else { + svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; + } +} + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); @@ -994,6 +1016,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) */ cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; + update_cr0_intercept(svm); } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1239,11 +1262,8 @@ static int ud_interception(struct vcpu_svm *svm) static int nm_interception(struct vcpu_svm *svm) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!kvm_read_cr0_bits(&svm->vcpu, X86_CR0_TS)) - svm->vmcb->save.cr0 &= ~X86_CR0_TS; - else - svm->vmcb->save.cr0 |= X86_CR0_TS; svm->vcpu.fpu_active = 1; + update_cr0_intercept(svm); return 1; } @@ -2296,7 +2316,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR3] = emulate_on_interception, [SVM_EXIT_READ_CR4] = emulate_on_interception, [SVM_EXIT_READ_CR8] = emulate_on_interception, - /* for now: */ + [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, [SVM_EXIT_WRITE_CR0] = emulate_on_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, @@ -2914,8 +2934,8 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) return; } + update_cr0_intercept(svm); svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; - svm->vmcb->save.cr0 |= X86_CR0_TS; } static struct kvm_x86_ops svm_x86_ops = { -- cgit v1.2.2 From 4610c83cdc8bff04f2f22883749f716b1ccc502f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 10 Jan 2010 12:19:20 +0200 Subject: KVM: SVM: Lazy fpu with npt Now that we can allow the guest to play with cr0 when the fpu is loaded, we can enable lazy fpu when npt is in use. Acked-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 83c7ab1bdad8..8d7cb62ebef6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2588,8 +2588,6 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu) static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { - if (npt_enabled) - vcpu->fpu_active = 1; } static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) @@ -2928,12 +2926,6 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (npt_enabled) { - /* hack: npt requires active fpu at this time */ - vcpu->fpu_active = 1; - return; - } - update_cr0_intercept(svm); svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; } -- cgit v1.2.2 From 55cd8e5a4edb8e235163ffe8264b9aaa8d7c050f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 17 Jan 2010 15:51:22 +0200 Subject: KVM: Implement bare minimum of HYPER-V MSRs Minimum HYPER-V implementation should have GUEST_OS_ID, HYPERCALL and VP_INDEX MSRs. [avi: fix build on i386] Signed-off-by: Gleb Natapov Signed-off-by: Vadim Rozenfeld Signed-off-by: Avi Kivity --- arch/x86/kvm/trace.h | 32 +++++++++ arch/x86/kvm/x86.c | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 224 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 816e0449db0b..1cb3d0e990f3 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -55,6 +55,38 @@ TRACE_EVENT(kvm_hypercall, __entry->a3) ); +/* + * Tracepoint for hypercall. + */ +TRACE_EVENT(kvm_hv_hypercall, + TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx, + __u64 ingpa, __u64 outgpa), + TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), + + TP_STRUCT__entry( + __field( __u16, code ) + __field( bool, fast ) + __field( __u16, rep_cnt ) + __field( __u16, rep_idx ) + __field( __u64, ingpa ) + __field( __u64, outgpa ) + ), + + TP_fast_assign( + __entry->code = code; + __entry->fast = fast; + __entry->rep_cnt = rep_cnt; + __entry->rep_idx = rep_idx; + __entry->ingpa = ingpa; + __entry->outgpa = outgpa; + ), + + TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", + __entry->code, __entry->fast ? "fast" : "slow", + __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, + __entry->outgpa) +); + /* * Tracepoint for PIO. */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1ad34d185da9..480137db4770 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -622,9 +622,10 @@ static inline u32 bit(int bitno) * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 2 +#define KVM_SAVE_MSRS_BEGIN 4 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_K6_STAR, #ifdef CONFIG_X86_64 @@ -1004,6 +1005,74 @@ out: return r; } +static bool kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +static bool kvm_hv_msr_partition_wide(u32 msr) +{ + bool r = false; + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + r = true; + break; + } + + return r; +} + +static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + kvm->arch.hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!kvm->arch.hv_guest_os_id) + kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + u64 gfn; + unsigned long addr; + u8 instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!kvm->arch.hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + kvm->arch.hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (copy_to_user((void __user *)addr, instructions, 4)) + return 1; + kvm->arch.hv_hypercall = data; + break; + } + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + return 0; +} + +static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x data 0x%llx\n", + msr, data); + + return 1; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1118,6 +1187,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = set_msr_hyperv_pw(vcpu, msr, data); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return set_msr_hyperv(vcpu, msr, data); + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1217,6 +1296,48 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } +static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = kvm->arch.hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = kvm->arch.hv_hypercall; + break; + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + kvm_for_each_vcpu(r, v, vcpu->kvm) + if (v == vcpu) + data = r; + break; + } + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; @@ -1283,6 +1404,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = get_msr_hyperv_pw(vcpu, msr, pdata); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return get_msr_hyperv(vcpu, msr, pdata); + break; default: if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); @@ -1398,6 +1529,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XEN_HVM: case KVM_CAP_ADJUST_CLOCK: case KVM_CAP_VCPU_EVENTS: + case KVM_CAP_HYPERV: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3618,11 +3750,70 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, return a0 | ((gpa_t)a1 << 32); } +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + u64 param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + bool fast, longmode; + int cs_db, cs_l; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || + !kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + longmode = is_long_mode(vcpu) && cs_l == 1; + + if (!longmode) { + param = (kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffff); + ingpa = (kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffff); + outgpa = (kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + + res = HV_STATUS_INVALID_HYPERCALL_CODE; + + ret = res | (((u64)rep_done & 0xfff) << 32); + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return 1; +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; int r = 1; + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return kvm_hv_hypercall(vcpu); + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); -- cgit v1.2.2 From 10388a07164c1512b3a3d0273b9adc230f82790e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 17 Jan 2010 15:51:23 +0200 Subject: KVM: Add HYPER-V apic access MSRs Implement HYPER-V apic MSRs. Spec defines three MSRs that speed-up access to EOI/TPR/ICR apic registers for PV guests. Signed-off-by: Gleb Natapov Signed-off-by: Vadim Rozenfeld Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 31 +++++++++++++++++++++++++++++++ arch/x86/kvm/lapic.h | 8 ++++++++ arch/x86/kvm/x86.c | 42 ++++++++++++++++++++++++++++++++++++++---- 3 files changed, 77 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ba8c045da782..4b224f90087b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1246,3 +1246,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) return 0; } + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + /* if this is ICR write vector before command */ + if (reg == APIC_ICR) + apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return apic_reg_write(apic, reg, (u32)data); +} + +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 low, high = 0; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + if (apic_reg_read(apic, reg, 4, &low)) + return 1; + if (reg == APIC_ICR) + apic_reg_read(apic, APIC_ICR2, 4, &high); + + *data = (((u64)high) << 32) | low; + + return 0; +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 40010b09c4aa..f5fe32c5edad 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; +} #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 480137db4770..552be51e4d84 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -622,10 +622,11 @@ static inline u32 bit(int bitno) * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 4 +#define KVM_SAVE_MSRS_BEGIN 5 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_K6_STAR, #ifdef CONFIG_X86_64 @@ -1067,10 +1068,36 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) { - pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x data 0x%llx\n", - msr, data); + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + unsigned long addr; - return 1; + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + vcpu->arch.hv_vapic = data; + break; + } + addr = gfn_to_hva(vcpu->kvm, data >> + HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + if (kvm_is_error_hva(addr)) + return 1; + if (clear_user((void __user *)addr, PAGE_SIZE)) + return 1; + vcpu->arch.hv_vapic = data; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + + return 0; } int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) @@ -1330,6 +1357,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = r; break; } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); default: pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -1530,6 +1563,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_ADJUST_CLOCK: case KVM_CAP_VCPU_EVENTS: case KVM_CAP_HYPERV: + case KVM_CAP_HYPERV_VAPIC: r = 1; break; case KVM_CAP_COALESCED_MMIO: -- cgit v1.2.2 From c25bc1638a1211f57cccbabdd8b732813b852340 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 17 Jan 2010 15:51:24 +0200 Subject: KVM: Implement NotifyLongSpinWait HYPER-V hypercall Windows issues this hypercall after guest was spinning on a spinlock for too many iterations. Signed-off-by: Gleb Natapov Signed-off-by: Vadim Rozenfeld Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 552be51e4d84..9f72a443455b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1564,6 +1564,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_VCPU_EVENTS: case KVM_CAP_HYPERV: case KVM_CAP_HYPERV_VAPIC: + case KVM_CAP_HYPERV_SPIN: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3827,7 +3828,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); - res = HV_STATUS_INVALID_HYPERCALL_CODE; + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + kvm_vcpu_on_spin(vcpu); + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } ret = res | (((u64)rep_done & 0xfff) << 32); if (longmode) { -- cgit v1.2.2 From 8dae444529230301bc85fc86033aa06a734c1a29 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 18 Jan 2010 18:45:10 +0900 Subject: KVM: rename is_writeble_pte() to is_writable_pte() There are two spellings of "writable" in arch/x86/kvm/mmu.c and paging_tmpl.h . This patch renames is_writeble_pte() to is_writable_pte() and makes grepping easy. New name is consistent with the definition of itself: return pte & PT_WRITABLE_MASK; Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 18 +++++++++--------- arch/x86/kvm/paging_tmpl.h | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 276bf7497c36..ff2b2e8d72eb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -250,7 +250,7 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_writeble_pte(unsigned long pte) +static int is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; } @@ -632,7 +632,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) pfn = spte_to_pfn(*spte); if (*spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writeble_pte(*spte)) + if (is_writable_pte(*spte)) kvm_set_pfn_dirty(pfn); rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); if (!*rmapp) { @@ -708,7 +708,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!spte); BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writeble_pte(*spte)) { + if (is_writable_pte(*spte)) { __set_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } @@ -732,7 +732,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writeble_pte(*spte)) { + if (is_writable_pte(*spte)) { rmap_remove(kvm, spte); --kvm->stat.lpages; __set_spte(spte, shadow_trap_nonpresent_pte); @@ -787,7 +787,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - if (is_writeble_pte(*spte)) + if (is_writable_pte(*spte)) kvm_set_pfn_dirty(spte_to_pfn(*spte)); __set_spte(spte, new_spte); spte = rmap_next(kvm, rmapp, spte); @@ -1847,7 +1847,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * is responsibility of mmu_get_page / kvm_sync_page. * Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writeble_pte(*sptep)) + if (!can_unsync && is_writable_pte(*sptep)) goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { @@ -1855,7 +1855,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) + if (is_writable_pte(spte)) spte &= ~PT_WRITABLE_MASK; } } @@ -1876,7 +1876,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, bool reset_host_protection) { int was_rmapped = 0; - int was_writeble = is_writeble_pte(*sptep); + int was_writable = is_writable_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" @@ -1927,7 +1927,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); } else { - if (was_writeble) + if (was_writable) kvm_release_pfn_dirty(pfn); else kvm_release_pfn_clean(pfn); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ede2131a9225..df15a5307d2d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -162,7 +162,7 @@ walk: if (rsvd_fault) goto access_error; - if (write_fault && !is_writeble_pte(pte)) + if (write_fault && !is_writable_pte(pte)) if (user_fault || is_write_protection(vcpu)) goto access_error; -- cgit v1.2.2 From 7062dcaa369cae40c1a59949a5654985d0ae26ea Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 19 Jan 2010 17:43:21 +0800 Subject: KVM: VMX: Remove emulation failure report As Avi noted: >There are two problems with the kernel failure report. First, it >doesn't report enough data - registers, surrounding instructions, etc. >that are needed to explain what is going on. Second, it can flood >dmesg, which is a pretty bad thing to do. So we remove the emulation failure report in handle_invalid_guest_state(), and would inspected the guest using userspace tool in the future. Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fc1964d5e97c..516084f3c6d8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3449,7 +3449,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) } if (err != EMULATE_DONE) { - kvm_report_emulation_failure(vcpu, "emulation failure"); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; -- cgit v1.2.2 From ccd469362e826261ccc261c4c36fb0a346338222 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 19 Jan 2010 15:06:38 +0200 Subject: KVM: fix Hyper-V hypercall warnings and wrong mask value Fix compilation warnings and wrong mask value. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f72a443455b..9b0758b4d550 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3806,12 +3806,12 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) longmode = is_long_mode(vcpu) && cs_l == 1; if (!longmode) { - param = (kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffff); - ingpa = (kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffff); - outgpa = (kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffff); + param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); + ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); + outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); } #ifdef CONFIG_X86_64 else { -- cgit v1.2.2 From 647492047763c3ee8fe51ecf9a04f39040aa495b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 19 Jan 2010 12:45:23 -0200 Subject: KVM: fix cleanup_srcu_struct on vm destruction cleanup_srcu_struct on VM destruction remains broken: BUG: unable to handle kernel paging request at ffffffffffffffff IP: [] srcu_read_lock+0x16/0x21 RIP: 0010:[] [] srcu_read_lock+0x16/0x21 Call Trace: [] kvm_arch_vcpu_uninit+0x1b/0x48 [kvm] [] kvm_vcpu_uninit+0x9/0x15 [kvm] [] vmx_free_vcpu+0x7f/0x8f [kvm_intel] [] kvm_arch_destroy_vm+0x78/0x111 [kvm] [] kvm_put_kvm+0xd4/0xfe [kvm] Move it to kvm_arch_destroy_vm. Signed-off-by: Marcelo Tosatti Reported-by: Jan Kiszka --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9b0758b4d550..322c2c5f9bc4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5547,6 +5547,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); + cleanup_srcu_struct(&kvm->srcu); kfree(kvm->arch.aliases); kfree(kvm); } -- cgit v1.2.2 From b60d513c32e2ddc8b3e9e1465b94913d44d19810 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 20 Jan 2010 16:47:21 +0900 Subject: KVM: x86: Use macros for x86_emulate_ops to avoid future mistakes The return values from x86_emulate_ops are defined in kvm_emulate.h as macros X86EMUL_*. But in emulate.c, we are comparing the return values from these ops with 0 to check if they're X86EMUL_CONTINUE or not: X86EMUL_CONTINUE is defined as 0 now. To avoid possible mistakes in the future, this patch substitutes "X86EMUL_CONTINUE" for "0" that are being compared with the return values from x86_emulate_ops. We think that there are more places we should use these macros, but the meanings of rc values in x86_emulate_insn() were not so clear at a glance. If we use proper macros in this function, we would be able to follow the flow of each emulation more easily and, maybe, more securely. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 65 ++++++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 29 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0f89e320bc96..48c7f9f8a08f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1196,7 +1196,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, rc = ops->read_emulated(register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]), dest, len, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); @@ -1370,7 +1370,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, int rc; rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || @@ -1385,7 +1385,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, (u32) c->regs[VCPU_REGS_RBX]; rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; ctxt->eflags |= EFLG_ZF; } @@ -1451,7 +1451,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &c->dst.val, c->dst.bytes, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; break; case OP_NONE: @@ -1749,7 +1749,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) &c->src.val, c->src.bytes, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val = c->src.val; } @@ -1768,12 +1768,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) c->dst.ptr = (void *)c->dst.ptr + (c->src.val & mask) / 8; } - if (!(c->d & Mov) && - /* optimisation - avoid slow emulated read */ - ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0)) - goto done; + if (!(c->d & Mov)) { + /* optimisation - avoid slow emulated read */ + rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + } } c->dst.orig_val = c->dst.val; @@ -2039,11 +2042,12 @@ special_insn: c->dst.ptr = (unsigned long *)register_address(c, es_base(ctxt), c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), + rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0) + c->dst.bytes, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes @@ -2058,10 +2062,11 @@ special_insn: c->src.ptr = (unsigned long *)register_address(c, seg_override_base(ctxt, c), c->regs[VCPU_REGS_RSI]); - if ((rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; c->dst.type = OP_NONE; /* Disable writeback. */ @@ -2069,10 +2074,11 @@ special_insn: c->dst.ptr = (unsigned long *)register_address(c, es_base(ctxt), c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); @@ -2102,12 +2108,13 @@ special_insn: c->dst.type = OP_REG; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes -- cgit v1.2.2 From f2483415293b180945da707e7dbe74e5daa72651 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 20 Jan 2010 18:20:20 +0100 Subject: KVM: VMX: Fix exceptions of mov to dr Injecting GP without an error code is a bad idea (causes unhandled guest exits). Moreover, we must not skip the instruction if we injected an exception. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 516084f3c6d8..9727773f24b7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3045,6 +3045,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) unsigned long val; int dr, reg; + /* Do not handle if the CPL > 0, will trigger GP on re-entry */ if (!kvm_require_cpl(vcpu, 0)) return 1; dr = vmcs_readl(GUEST_DR7); @@ -3099,20 +3100,22 @@ static int handle_dr(struct kvm_vcpu *vcpu) vcpu->arch.eff_db[dr] = val; break; case 4 ... 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } break; case 6: if (val & 0xffffffff00000000ULL) { - kvm_queue_exception(vcpu, GP_VECTOR); - break; + kvm_inject_gp(vcpu, 0); + return 1; } vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; break; case 7: if (val & 0xffffffff00000000ULL) { - kvm_queue_exception(vcpu, GP_VECTOR); - break; + kvm_inject_gp(vcpu, 0); + return 1; } vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { -- cgit v1.2.2 From 138ac8d88f91e2a6a278aa5cee9120c714c4ce2d Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 20 Jan 2010 18:20:20 +0100 Subject: KVM: VMX: Fix emulation of DR4 and DR5 Make sure DR4 and DR5 are aliased to DR6 and DR7, respectively, if CR4.DE is not set. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9727773f24b7..c7b99e1f8aae 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3039,6 +3039,15 @@ static int handle_cr(struct kvm_vcpu *vcpu) return 0; } +static int check_dr_alias(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return -1; + } + return 0; +} + static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; @@ -3081,14 +3090,20 @@ static int handle_dr(struct kvm_vcpu *vcpu) case 0 ... 3: val = vcpu->arch.db[dr]; break; + case 4: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ case 6: val = vcpu->arch.dr6; break; - case 7: + case 5: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + default: /* 7 */ val = vcpu->arch.dr7; break; - default: - val = 0; } kvm_register_write(vcpu, reg, val); } else { @@ -3099,12 +3114,10 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = val; break; - case 4 ... 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); + case 4: + if (check_dr_alias(vcpu) < 0) return 1; - } - break; + /* fall through */ case 6: if (val & 0xffffffff00000000ULL) { kvm_inject_gp(vcpu, 0); @@ -3112,7 +3125,11 @@ static int handle_dr(struct kvm_vcpu *vcpu) } vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; break; - case 7: + case 5: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + default: /* 7 */ if (val & 0xffffffff00000000ULL) { kvm_inject_gp(vcpu, 0); return 1; -- cgit v1.2.2 From fd7373cce767a8803e79f51bd3fc5f531234657f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 20 Jan 2010 18:20:20 +0100 Subject: KVM: VMX: Clean up DR6 emulation As we trap all debug register accesses, we do not need to switch real DR6 at all. Clean up update_exception_bitmap at this chance, too. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c7b99e1f8aae..b7e812e9c299 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -579,17 +579,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) - | (1u << NM_VECTOR); - /* - * Unconditionally intercept #DB so we can maintain dr6 without - * reading it every exit. - */ - eb |= 1u << DB_VECTOR; - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - eb |= 1u << BP_VECTOR; - } + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | + (1u << NM_VECTOR) | (1u << DB_VECTOR); + if ((vcpu->guest_debug & + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) + eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) @@ -3777,9 +3772,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ vmcs_writel(HOST_CR0, read_cr0()); - if (vcpu->arch.switch_db_regs) - set_debugreg(vcpu->arch.dr6, 6); - asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -3880,9 +3872,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_PDPTR)); vcpu->arch.regs_dirty = 0; - if (vcpu->arch.switch_db_regs) - get_debugreg(vcpu->arch.dr6, 6); - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); -- cgit v1.2.2 From c76de350c8a3ba770becc17eaa744dc3c7642295 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 20 Jan 2010 18:20:20 +0100 Subject: KVM: SVM: Clean up and enhance mov dr emulation Enhance mov dr instruction emulation used by SVM so that it properly handles dr4/5: alias to dr6/7 if cr4.de is cleared. Otherwise return EMULATE_FAIL which will let our only possible caller in that scenario, ud_interception, re-inject UD. We do not need to inject faults, SVM does this for us (exceptions take precedence over instruction interceptions). For the same reason, the value overflow checks can be removed. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 64 +++++++++++++++++++++++++----------------------------- arch/x86/kvm/x86.c | 19 ++-------------- 2 files changed, 31 insertions(+), 52 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8d7cb62ebef6..4295dfcc6031 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1122,76 +1122,70 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->vmcb->control.asid = sd->next_asid++; } -static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) +static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest) { struct vcpu_svm *svm = to_svm(vcpu); - unsigned long val; switch (dr) { case 0 ... 3: - val = vcpu->arch.db[dr]; + *dest = vcpu->arch.db[dr]; break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 6: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr6; + *dest = vcpu->arch.dr6; else - val = svm->vmcb->save.dr6; + *dest = svm->vmcb->save.dr6; break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 7: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr7; + *dest = vcpu->arch.dr7; else - val = svm->vmcb->save.dr7; + *dest = svm->vmcb->save.dr7; break; - default: - val = 0; } - return val; + return EMULATE_DONE; } -static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception) +static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); - *exception = 0; - switch (dr) { case 0 ... 3: vcpu->arch.db[dr] = value; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = value; - return; - case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) - *exception = UD_VECTOR; - return; + break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 6: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; - return; + break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 7: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { svm->vmcb->save.dr7 = vcpu->arch.dr7; vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); } - return; - default: - /* FIXME: Possible case? */ - printk(KERN_DEBUG "%s: unexpected dr %u\n", - __func__, dr); - *exception = UD_VECTOR; - return; + break; } + + return EMULATE_DONE; } static int pf_interception(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 322c2c5f9bc4..fd5101b57fa3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3270,29 +3270,14 @@ int emulate_clts(struct kvm_vcpu *vcpu) int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - struct kvm_vcpu *vcpu = ctxt->vcpu; - - switch (dr) { - case 0 ... 3: - *dest = kvm_x86_ops->get_dr(vcpu, dr); - return X86EMUL_CONTINUE; - default: - pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); - return X86EMUL_UNHANDLEABLE; - } + return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest); } int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - int exception; - kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); - if (exception) { - /* FIXME: better handling */ - return X86EMUL_UNHANDLEABLE; - } - return X86EMUL_CONTINUE; + return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask); } void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) -- cgit v1.2.2 From 727f5a23e2080b19da392f17fa3158a0941f81f5 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 20 Jan 2010 18:20:20 +0100 Subject: KVM: SVM: Trap all debug register accesses To enable proper debug register emulation under all conditions, trap access to all DR0..7. This may be optimized later on. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4295dfcc6031..a281368c3b96 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -554,13 +554,19 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept_dr_read = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | - INTERCEPT_DR3_MASK; + INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | + INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | + INTERCEPT_DR7_MASK; control->intercept_dr_write = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | INTERCEPT_DR7_MASK; control->intercept_exceptions = (1 << PF_VECTOR) | @@ -2319,11 +2325,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_DR1] = emulate_on_interception, [SVM_EXIT_READ_DR2] = emulate_on_interception, [SVM_EXIT_READ_DR3] = emulate_on_interception, + [SVM_EXIT_READ_DR4] = emulate_on_interception, + [SVM_EXIT_READ_DR5] = emulate_on_interception, + [SVM_EXIT_READ_DR6] = emulate_on_interception, + [SVM_EXIT_READ_DR7] = emulate_on_interception, [SVM_EXIT_WRITE_DR0] = emulate_on_interception, [SVM_EXIT_WRITE_DR1] = emulate_on_interception, [SVM_EXIT_WRITE_DR2] = emulate_on_interception, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR4] = emulate_on_interception, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, + [SVM_EXIT_WRITE_DR6] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, -- cgit v1.2.2 From ab344828ebe729e52949d64046adaa196f6b9dbe Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 21 Jan 2010 15:28:46 +0200 Subject: KVM: x86: fix checking of cr0 validity Move to/from Control Registers chapter of Intel SDM says. "Reserved bits in CR0 remain clear after any load of those registers; attempts to set them have no impact". Control Register chapter says "Bits 63:32 of CR0 are reserved and must be written with zeros. Writing a nonzero value to any of the upper 32 bits results in a general-protection exception, #GP(0)." This patch tries to implement this twisted logic. Signed-off-by: Gleb Natapov Reported-by: Lorenzo Martignoni Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fd5101b57fa3..ce267d9f0305 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -430,12 +430,16 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { cr0 |= X86_CR0_ET; - if (cr0 & CR0_RESERVED_BITS) { +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, kvm_read_cr0(vcpu)); kvm_inject_gp(vcpu, 0); return; } +#endif + + cr0 &= ~CR0_RESERVED_BITS; if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); -- cgit v1.2.2 From 2608d7a12fd4badfe2eac534150f0715a7fd3ff3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:45 +0200 Subject: KVM: Allow kvm_load_guest_fpu() even when !vcpu->fpu_active This allows accessing the guest fpu from the instruction emulator, as well as being symmetric with kvm_put_guest_fpu(). Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ce267d9f0305..4cf4eac03bbc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4239,7 +4239,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); - kvm_load_guest_fpu(vcpu); + if (vcpu->fpu_active) + kvm_load_guest_fpu(vcpu); local_irq_disable(); @@ -5285,7 +5286,7 @@ EXPORT_SYMBOL_GPL(fx_init); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) + if (vcpu->guest_fpu_loaded) return; vcpu->guest_fpu_loaded = 1; -- cgit v1.2.2 From e5bb40251a920cdd9d12c569c6aab0bdd0279e4e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:46 +0200 Subject: KVM: Drop kvm_{load,put}_guest_fpu() exports Not used anymore. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4cf4eac03bbc..c61ec9c69267 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5293,7 +5293,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) kvm_fx_save(&vcpu->arch.host_fx_image); kvm_fx_restore(&vcpu->arch.guest_fx_image); } -EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { @@ -5306,7 +5305,6 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) ++vcpu->stat.fpu_reload; set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); } -EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { -- cgit v1.2.2 From 6b52d18605f580bdffaffd48c8da228c3e848deb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:47 +0200 Subject: KVM: Activate fpu on clts Assume that if the guest executes clts, it knows what it's doing, and load the guest fpu to prevent an #NM exception. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 8 +++++++- arch/x86/kvm/vmx.c | 1 + arch/x86/kvm/x86.c | 1 + 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a281368c3b96..800208a60a51 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1259,12 +1259,17 @@ static int ud_interception(struct vcpu_svm *svm) return 1; } -static int nm_interception(struct vcpu_svm *svm) +static void svm_fpu_activate(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); svm->vcpu.fpu_active = 1; update_cr0_intercept(svm); +} +static int nm_interception(struct vcpu_svm *svm) +{ + svm_fpu_activate(&svm->vcpu); return 1; } @@ -2977,6 +2982,7 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_activate = svm_fpu_activate, .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b7e812e9c299..fad871cbed19 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3002,6 +3002,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); skip_emulated_instruction(vcpu); + vmx_fpu_activate(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c61ec9c69267..4db0c8a9082e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3269,6 +3269,7 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + kvm_x86_ops->fpu_activate(vcpu); return X86EMUL_CONTINUE; } -- cgit v1.2.2 From 3eeb3288bcbf64da90afc26389b8844df7c34912 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:48 +0200 Subject: KVM: Add a helper for checking if the guest is in protected mode Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 9 ++++----- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 7 +++---- arch/x86/kvm/x86.h | 6 ++++++ 4 files changed, 15 insertions(+), 11 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 48c7f9f8a08f..6a429eefc533 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -32,6 +32,7 @@ #include #include +#include "x86.h" #include "mmu.h" /* for is_long_mode() */ /* @@ -1515,7 +1516,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) /* syscall is not available in real mode */ if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !kvm_read_cr0_bits(ctxt->vcpu, X86_CR0_PE)) + || !is_protmode(ctxt->vcpu)) return -1; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1568,8 +1569,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) return -1; /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || - !kvm_read_cr0_bits(ctxt->vcpu, X86_CR0_PE)) { + if (ctxt->mode == X86EMUL_MODE_REAL || !is_protmode(ctxt->vcpu)) { kvm_inject_gp(ctxt->vcpu, 0); return -1; } @@ -1634,8 +1634,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) return -1; /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL - || !kvm_read_cr0_bits(ctxt->vcpu, X86_CR0_PE)) { + if (ctxt->mode == X86EMUL_MODE_REAL || !is_protmode(ctxt->vcpu)) { kvm_inject_gp(ctxt->vcpu, 0); return -1; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fad871cbed19..2e894954069f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1845,7 +1845,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) /* if real mode */ + if (!is_protmode(vcpu)) return 0; if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ @@ -2100,7 +2100,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) static bool guest_state_valid(struct kvm_vcpu *vcpu) { /* real mode guest state checks */ - if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { + if (!is_protmode(vcpu)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4db0c8a9082e..a4a7d1892f72 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3786,8 +3786,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) * hypercall generates UD from non zero cpl and real mode * per HYPER-V spec */ - if (kvm_x86_ops->get_cpl(vcpu) != 0 || - !kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 0; } @@ -4751,7 +4750,7 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, { struct kvm_segment kvm_seg; - if (is_vm86_segment(vcpu, seg) || !(kvm_read_cr0_bits(vcpu, X86_CR0_PE))) + if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) return kvm_load_realmode_segment(vcpu, selector, seg); if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) return 1; @@ -5103,7 +5102,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Older userspace won't unhalt the vcpu on reset. */ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && - !(kvm_read_cr0_bits(vcpu, X86_CR0_PE))) + !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; vcpu_put(vcpu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 5eadea585d2a..f783d8fe0d1d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -2,6 +2,7 @@ #define ARCH_X86_KVM_X86_H #include +#include "kvm_cache_regs.h" static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { @@ -35,4 +36,9 @@ static inline bool kvm_exception_is_soft(unsigned int nr) struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); +static inline bool is_protmode(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PE); +} + #endif -- cgit v1.2.2 From 836a1b3c3456042704c86aaa3d837b976de9343b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:49 +0200 Subject: KVM: Move cr0/cr4/efer related helpers to x86.h They have more general scope than the mmu. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 1 - arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/mmu.h | 24 ------------------------ arch/x86/kvm/x86.h | 24 ++++++++++++++++++++++++ 4 files changed, 25 insertions(+), 25 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6a429eefc533..645b245a3c23 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -33,7 +33,6 @@ #include #include "x86.h" -#include "mmu.h" /* for is_long_mode() */ /* * Opcode effective-address decode tables. diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff2b2e8d72eb..6f7158f4fbfd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -18,6 +18,7 @@ */ #include "mmu.h" +#include "x86.h" #include "kvm_cache_regs.h" #include diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 599159f728b9..61ef5a65b7d8 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -58,30 +58,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } -static inline int is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LMA; -#else - return 0; -#endif -} - -static inline int is_pae(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); -} - -static inline int is_pse(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); -} - -static inline int is_paging(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr0_bits(vcpu, X86_CR0_PG); -} - static inline int is_present_gpte(unsigned long pte) { return pte & PT_PRESENT_MASK; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f783d8fe0d1d..2dc24a755b6d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -41,4 +41,28 @@ static inline bool is_protmode(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_PE); } +static inline int is_long_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return vcpu->arch.shadow_efer & EFER_LMA; +#else + return 0; +#endif +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PG); +} + #endif -- cgit v1.2.2 From f6801dff23bd1902473902194667f4ac1eb6ea26 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:50 +0200 Subject: KVM: Rename vcpu->shadow_efer to efer None of the other registers have the shadow_ prefix. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/svm.c | 12 ++++++------ arch/x86/kvm/vmx.c | 14 +++++++------- arch/x86/kvm/x86.c | 14 +++++++------- arch/x86/kvm/x86.h | 2 +- 5 files changed, 22 insertions(+), 22 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6f7158f4fbfd..599c422c390f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -237,7 +237,7 @@ static int is_cpuid_PSE36(void) static int is_nx(struct kvm_vcpu *vcpu) { - return vcpu->arch.shadow_efer & EFER_NX; + return vcpu->arch.efer & EFER_NX; } static int is_shadow_present_pte(u64 pte) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 800208a60a51..9596cc86d6dd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -231,7 +231,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; } static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, @@ -996,14 +996,14 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) struct vcpu_svm *svm = to_svm(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer |= EFER_LMA; + vcpu->arch.efer |= EFER_LMA; svm->vmcb->save.efer |= EFER_LMA | EFER_LME; } if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); } } @@ -1361,7 +1361,7 @@ static int vmmcall_interception(struct vcpu_svm *svm) static int nested_svm_check_permissions(struct vcpu_svm *svm) { - if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) + if (!(svm->vcpu.arch.efer & EFER_SVME) || !is_paging(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; @@ -1764,7 +1764,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.ds = vmcb->save.ds; hsave->save.gdtr = vmcb->save.gdtr; hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.shadow_efer; + hsave->save.efer = svm->vcpu.arch.efer; hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2e894954069f..a680d939546f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -613,7 +613,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) u64 guest_efer; u64 ignore_bits; - guest_efer = vmx->vcpu.arch.shadow_efer; + guest_efer = vmx->vcpu.arch.efer; /* * NX is emulated; LMA and LME handled by hardware; SCE meaninless @@ -955,7 +955,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) * if efer.sce is enabled. */ index = __find_msr_index(vmx, MSR_K6_STAR); - if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) + if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } #endif @@ -1600,7 +1600,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) * of this msr depends on is_long_mode(). */ vmx_load_host_state(to_vmx(vcpu)); - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; if (!msr) return; if (efer & EFER_LMA) { @@ -1632,13 +1632,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu) (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); } - vcpu->arch.shadow_efer |= EFER_LMA; - vmx_set_efer(vcpu, vcpu->arch.shadow_efer); + vcpu->arch.efer |= EFER_LMA; + vmx_set_efer(vcpu, vcpu->arch.efer); } static void exit_lmode(struct kvm_vcpu *vcpu) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) @@ -1745,7 +1745,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) enter_rmode(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a4a7d1892f72..27af6e353b06 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -456,7 +456,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 - if ((vcpu->arch.shadow_efer & EFER_LME)) { + if ((vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; if (!is_pae(vcpu)) { @@ -655,7 +655,7 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) } if (is_paging(vcpu) - && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); kvm_inject_gp(vcpu, 0); return; @@ -686,9 +686,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); efer &= ~EFER_LMA; - efer |= vcpu->arch.shadow_efer & EFER_LMA; + efer |= vcpu->arch.efer & EFER_LMA; - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); @@ -1426,7 +1426,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data |= (((uint64_t)4ULL) << 40); break; case MSR_EFER: - data = vcpu->arch.shadow_efer; + data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: data = vcpu->kvm->arch.wall_clock; @@ -4569,7 +4569,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->cr3 = vcpu->arch.cr3; sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); - sregs->efer = vcpu->arch.shadow_efer; + sregs->efer = vcpu->arch.efer; sregs->apic_base = kvm_get_apic_base(vcpu); memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); @@ -5059,7 +5059,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_cr8(vcpu, sregs->cr8); - mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; + mmu_reset_needed |= vcpu->arch.efer != sregs->efer; kvm_x86_ops->set_efer(vcpu, sregs->efer); kvm_set_apic_base(vcpu, sregs->apic_base); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2dc24a755b6d..2d101639bd8d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -44,7 +44,7 @@ static inline bool is_protmode(struct kvm_vcpu *vcpu) static inline int is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LMA; + return vcpu->arch.efer & EFER_LMA; #else return 0; #endif -- cgit v1.2.2 From 8ae099127668de80e4babba73bb492740ce3a1be Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:51 +0200 Subject: KVM: Optimize kvm_read_cr[04]_bits() 'mask' is always a constant, so we can check whether it includes a bit that might be owned by the guest very cheaply, and avoid the decache call. Saves a few hundred bytes of module text. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/kvm_cache_regs.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 6b419a36cbd9..cff851cf5322 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -1,6 +1,11 @@ #ifndef ASM_KVM_CACHE_REGS_H #define ASM_KVM_CACHE_REGS_H +#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS +#define KVM_POSSIBLE_CR4_GUEST_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) + static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg) { @@ -40,7 +45,8 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { - if (mask & vcpu->arch.cr0_guest_owned_bits) + ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; + if (tmask & vcpu->arch.cr0_guest_owned_bits) kvm_x86_ops->decache_cr0_guest_bits(vcpu); return vcpu->arch.cr0 & mask; } @@ -52,7 +58,8 @@ static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) { - if (mask & vcpu->arch.cr4_guest_owned_bits) + ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; + if (tmask & vcpu->arch.cr4_guest_owned_bits) kvm_x86_ops->decache_cr4_guest_bits(vcpu); return vcpu->arch.cr4 & mask; } -- cgit v1.2.2 From 0c04851c0c093ce98ab4ca69556480d779292418 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jan 2010 15:31:52 +0200 Subject: KVM: trace guest fpu loads and unloads Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 27af6e353b06..3b90298fb980 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5292,6 +5292,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 1; kvm_fx_save(&vcpu->arch.host_fx_image); kvm_fx_restore(&vcpu->arch.guest_fx_image); + trace_kvm_fpu(1); } void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) @@ -5304,6 +5305,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); + trace_kvm_fpu(0); } void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From d7fa6ab217aeed26293e01d7b64f79a1ac57e823 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 22 Jan 2010 16:55:05 +0800 Subject: KVM: MMU: Remove some useless code from alloc_mmu_pages() If we fail to alloc page for vcpu->arch.mmu.pae_root, call to free_mmu_pages() is unnecessary, which just do free the page malloc for vcpu->arch.mmu.pae_root. Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 599c422c390f..dc4d954efacd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2843,16 +2843,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) */ page = alloc_page(GFP_KERNEL | __GFP_DMA32); if (!page) - goto error_1; + return -ENOMEM; + vcpu->arch.mmu.pae_root = page_address(page); for (i = 0; i < 4; ++i) vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; return 0; - -error_1: - free_mmu_pages(vcpu); - return -ENOMEM; } int kvm_mmu_create(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From 81231c698a71af6e1815df72c06685d295e1cc1d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 24 Jan 2010 16:26:40 +0200 Subject: KVM: VMX: Pass cr0.mp through to the guest when the fpu is active When cr0.mp is clear, the guest doesn't expect a #NM in response to a WAIT instruction. Because we always keep cr0.mp set, it will get a #NM, and potentially be confused. Fix by keeping cr0.mp set only when the fpu is inactive, and passing it through when inactive. Reported-by: Lorenzo Martignoni Analyzed-by: Gleb Natapov Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a680d939546f..7a56879a058c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -66,7 +66,7 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); #define KVM_GUEST_CR0_MASK \ (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_MP) + (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_CR4_GUEST_OWNED_BITS \ @@ -791,12 +791,15 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) static void vmx_fpu_activate(struct kvm_vcpu *vcpu) { + ulong cr0; + if (vcpu->fpu_active) return; vcpu->fpu_active = 1; - vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); - if (kvm_read_cr0_bits(vcpu, X86_CR0_TS)) - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + cr0 = vmcs_readl(GUEST_CR0); + cr0 &= ~(X86_CR0_TS | X86_CR0_MP); + cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); + vmcs_writel(GUEST_CR0, cr0); update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); @@ -807,7 +810,7 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { vmx_decache_cr0_guest_bits(vcpu); - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits = 0; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); @@ -1757,7 +1760,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); if (!vcpu->fpu_active) - hw_cr0 |= X86_CR0_TS; + hw_cr0 |= X86_CR0_TS | X86_CR0_MP; vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); -- cgit v1.2.2 From e01c2426149d70dc6dd46ad0453195656b6eeaa4 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 25 Jan 2010 12:01:04 +0200 Subject: KVM: mark segments accessed on HW task switch On HW task switch newly loaded segments should me marked as accessed. Reported-by: Lorenzo Martignoni Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b90298fb980..d47ceda7a928 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4697,18 +4697,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) return kvm_seg.selector; } -static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, - u16 selector, - struct kvm_segment *kvm_seg) -{ - struct desc_struct seg_desc; - - if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) - return 1; - seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); - return 0; -} - static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment segvar = { @@ -4749,11 +4737,14 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int type_bits, int seg) { struct kvm_segment kvm_seg; + struct desc_struct seg_desc; if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) return kvm_load_realmode_segment(vcpu, selector, seg); - if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) + + if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) return 1; + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); kvm_check_segment_descriptor(vcpu, seg, selector); kvm_seg.type |= type_bits; @@ -4764,6 +4755,11 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, kvm_seg.unusable = 1; kvm_set_segment(vcpu, &kvm_seg, seg); + if (selector && !kvm_seg.unusable && kvm_seg.s) { + /* mark segment as accessed */ + seg_desc.type |= 1; + save_guest_segment_descriptor(vcpu, selector, &seg_desc); + } return 0; } -- cgit v1.2.2 From 6e7d15296773f7a479072ec7f3248f51d01e1fad Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 25 Jan 2010 19:36:03 +0200 Subject: KVM: Fix msr trace - data is 64 bits wide, not unsigned long - rw is confusingly named Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/trace.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 1cb3d0e990f3..45903a926372 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -246,23 +246,23 @@ TRACE_EVENT(kvm_page_fault, * Tracepoint for guest MSR access. */ TRACE_EVENT(kvm_msr, - TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), - TP_ARGS(rw, ecx, data), + TP_PROTO(unsigned write, u32 ecx, u64 data), + TP_ARGS(write, ecx, data), TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, ecx ) - __field( unsigned long, data ) + __field( unsigned, write ) + __field( u32, ecx ) + __field( u64, data ) ), TP_fast_assign( - __entry->rw = rw; + __entry->write = write; __entry->ecx = ecx; __entry->data = data; ), - TP_printk("msr_%s %x = 0x%lx", - __entry->rw ? "write" : "read", + TP_printk("msr_%s %x = 0x%llx", + __entry->write ? "write" : "read", __entry->ecx, __entry->data) ); -- cgit v1.2.2 From 59200273c4d07ddf99dd3c8d91b90bb170457fc3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 25 Jan 2010 19:47:02 +0200 Subject: KVM: Trace failed msr reads and writes Record failed msrs reads and writes, and the fact that they failed as well. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 13 ++++++++----- arch/x86/kvm/trace.h | 17 +++++++++++------ arch/x86/kvm/vmx.c | 5 +++-- 3 files changed, 22 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9596cc86d6dd..52f78dd03010 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2177,9 +2177,10 @@ static int rdmsr_interception(struct vcpu_svm *svm) u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data; - if (svm_get_msr(&svm->vcpu, ecx, &data)) + if (svm_get_msr(&svm->vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); - else { + } else { trace_kvm_msr_read(ecx, data); svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; @@ -2271,13 +2272,15 @@ static int wrmsr_interception(struct vcpu_svm *svm) u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) + if (svm_set_msr(&svm->vcpu, ecx, data)) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); - else + } else { + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(&svm->vcpu); + } return 1; } diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 45903a926372..6ad30a29f044 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -246,28 +246,33 @@ TRACE_EVENT(kvm_page_fault, * Tracepoint for guest MSR access. */ TRACE_EVENT(kvm_msr, - TP_PROTO(unsigned write, u32 ecx, u64 data), - TP_ARGS(write, ecx, data), + TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception), + TP_ARGS(write, ecx, data, exception), TP_STRUCT__entry( __field( unsigned, write ) __field( u32, ecx ) __field( u64, data ) + __field( u8, exception ) ), TP_fast_assign( __entry->write = write; __entry->ecx = ecx; __entry->data = data; + __entry->exception = exception; ), - TP_printk("msr_%s %x = 0x%llx", + TP_printk("msr_%s %x = 0x%llx%s", __entry->write ? "write" : "read", - __entry->ecx, __entry->data) + __entry->ecx, __entry->data, + __entry->exception ? " (#GP)" : "") ); -#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) -#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) +#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false) +#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false) +#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true) +#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true) /* * Tracepoint for guest CR access. diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7a56879a058c..334b016267a9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3158,6 +3158,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) u64 data; if (vmx_get_msr(vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; } @@ -3177,13 +3178,13 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); - if (vmx_set_msr(vcpu, ecx, data) != 0) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; } + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(vcpu); return 1; } -- cgit v1.2.2 From 6d3e435e708fa32f99b780caf6c5508a606a513b Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Fri, 29 Jan 2010 15:36:59 +0800 Subject: KVM: VMX: Remove redundant check in vm_need_virtualize_apic_accesses() flexpriority_enabled implies cpu_has_vmx_virtualize_apic_accesses() returning true, so we don't need this check here. Signed-off-by: Gui Jianfeng Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 334b016267a9..f083b64814d8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -358,9 +358,7 @@ static inline int cpu_has_vmx_ple(void) static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { - return flexpriority_enabled && - (cpu_has_vmx_virtualize_apic_accesses()) && - (irqchip_in_kernel(kvm)); + return flexpriority_enabled && irqchip_in_kernel(kvm); } static inline int cpu_has_vmx_vpid(void) -- cgit v1.2.2 From ab9f4ecbb6d39a18e300a0d10a4968c37404aa76 Mon Sep 17 00:00:00 2001 From: "Zhai, Edwin" Date: Fri, 29 Jan 2010 14:38:44 +0800 Subject: KVM: enable PCI multiple-segments for pass-through device Enable optional parameter (default 0) - PCI segment (or domain) besides BDF, when assigning PCI device to guest. Signed-off-by: Zhai Edwin Acked-by: Chris Wright Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d47ceda7a928..0bf3df527afc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1569,6 +1569,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_HYPERV: case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: + case KVM_CAP_PCI_SEGMENT: r = 1; break; case KVM_CAP_COALESCED_MMIO: -- cgit v1.2.2 From c125c607326928001983f51109c3d0c190f0a8bb Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 1 Feb 2010 22:11:04 +0900 Subject: KVM: fix load_guest_segment_descriptor() to return X86EMUL_* This patch fixes load_guest_segment_descriptor() to return X86EMUL_PROPAGATE_FAULT when it tries to access the descriptor table beyond the limit of it: suggested by Marcelo. I have checked current callers of this helper function, - kvm_load_segment_descriptor() - kvm_task_switch() and confirmed that this patch will change nothing in the upper layers if we do not change the handling of this return value from load_guest_segment_descriptor(). Next step: Although fixing the kvm_task_switch() to handle the propagated faults properly seems difficult, and maybe not worth it because TSS is not used commonly these days, we can fix kvm_load_segment_descriptor(). By doing so, the injected #GP becomes possible to be handled by the guest. The only problem for this is how to differentiate this fault from the page faults generated by kvm_read_guest_virt(). We may have to split this function to achive this goal. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0bf3df527afc..01f0b037092e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4663,7 +4663,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit < index * 8 + 7) { kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); - return 1; + return X86EMUL_PROPAGATE_FAULT; } return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); } -- cgit v1.2.2 From 7edcface95c6e593faa40c70e0464500515db573 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 1 Feb 2010 22:11:52 +0900 Subject: KVM: fix kvm_fix_hypercall() to return X86EMUL_* This patch fixes kvm_fix_hypercall() to propagate X86EMUL_* info generated by emulator_write_emulated() to its callers: suggested by Marcelo. The effect of this is x86_emulate_insn() will begin to handle the page faults which occur in emulator_write_emulated(): this should be OK because emulator_write_emulated_onepage() always injects page fault when emulator_write_emulated() returns X86EMUL_PROPAGATE_FAULT. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 01f0b037092e..c91007f81660 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3888,10 +3888,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); int kvm_fix_hypercall(struct kvm_vcpu *vcpu) { char instruction[3]; - int ret = 0; unsigned long rip = kvm_rip_read(vcpu); - /* * Blow out the MMU to ensure that no other VCPU has an active mapping * to ensure that the updated hypercall appears atomically across all @@ -3900,11 +3898,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_mmu_zap_all(vcpu->kvm); kvm_x86_ops->patch_hypercall(vcpu, instruction); - if (emulator_write_emulated(rip, instruction, 3, vcpu) - != X86EMUL_CONTINUE) - ret = -EFAULT; - return ret; + return emulator_write_emulated(rip, instruction, 3, vcpu); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) -- cgit v1.2.2 From ebcbab4c034db2ec25abe702d788936b29a49b24 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 7 Feb 2010 11:56:52 +0200 Subject: KVM: VMX: Wire up .fpu_activate() callback Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f083b64814d8..82fb810afd3f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4137,6 +4137,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, -- cgit v1.2.2 From c45b4fd416f5497b6b38dd70acc0e5b01399e5c9 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 6 Feb 2010 09:43:03 +0100 Subject: KVM: VMX: Remove redundant test in vmx_set_efer() msr was tested above, so the second test is not needed. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @r@ expression *x; expression e; identifier l; @@ if (x == NULL || ...) { ... when forall return ...; } ... when != goto l; when != x = e when != &x *x == NULL // Signed-off-by: Julia Lawall Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 82fb810afd3f..b400be06c8cd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1602,8 +1602,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) */ vmx_load_host_state(to_vmx(vcpu)); vcpu->arch.efer = efer; - if (!msr) - return; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | -- cgit v1.2.2 From 8f0b1ab6fb045a1324d9435ba00c2940783b0041 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 28 Jan 2010 12:37:56 +0100 Subject: KVM: Introduce kvm_host_page_size This patch introduces a generic function to find out the host page size for a given gfn. This function is needed by the kvm iommu code. This patch also simplifies the x86 host_mapping_level function. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dc4d954efacd..913ef4b7939a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -468,24 +468,10 @@ static int has_wrprotected_page(struct kvm *kvm, static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { - unsigned long page_size = PAGE_SIZE; - struct vm_area_struct *vma; - unsigned long addr; + unsigned long page_size; int i, ret = 0; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return PT_PAGE_TABLE_LEVEL; - - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, addr); - if (!vma) - goto out; - - page_size = vma_kernel_pagesize(vma); - -out: - up_read(¤t->mm->mmap_sem); + page_size = kvm_host_page_size(kvm, gfn); for (i = PT_PAGE_TABLE_LEVEL; i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { -- cgit v1.2.2 From 6316e1c8c6af6ccb55ff8564231710660608f46c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 3 Feb 2010 16:11:03 -0500 Subject: KVM: VMX: emulate accessed bit for EPT Currently KVM pretends that pages with EPT mappings never got accessed. This has some side effects in the VM, like swapping out actively used guest pages and needlessly breaking up actively used hugepages. We can avoid those very costly side effects by emulating the accessed bit for EPT PTEs, which should only be slightly costly because pages pass through page_referenced infrequently. TLB flushing is taken care of by kvm_mmu_notifier_clear_flush_young(). This seems to help prevent KVM guests from being swapped out when they should not on my system. Signed-off-by: Rik van Riel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 913ef4b7939a..b8da6715d08b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -838,9 +838,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 *spte; int young = 0; - /* always return old for EPT */ + /* + * Emulate the accessed bit for EPT, by checking if this page has + * an EPT mapping, and clearing it if it does. On the next access, + * a new EPT mapping will be established. + * This has some overhead, but not as much as the cost of swapping + * out actively used pages or breaking up actively used hugepages. + */ if (!shadow_accessed_mask) - return 0; + return kvm_unmap_rmapp(kvm, rmapp, data); spte = rmap_next(kvm, rmapp, NULL); while (spte) { -- cgit v1.2.2 From 1976d2d2c91246a37fcb8246b811de735aa6e9a4 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 5 Feb 2010 17:52:46 +0900 Subject: KVM: Remove redundant reading of rax on OUT instructions kvm_emulate_pio() and complete_pio() both read out the RAX register value and copy it to a place into which the value read out from the port will be copied later. This patch removes this redundancy. /*** snippet from arch/x86/kvm/x86.c ***/ int complete_pio(struct kvm_vcpu *vcpu) { ... if (!io->string) { if (io->in) { val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(&val, vcpu->arch.pio_data, io->size); kvm_register_write(vcpu, VCPU_REGS_RAX, val); } ... Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c91007f81660..bd3161c6daed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3551,8 +3551,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, size, 1); - val = kvm_register_read(vcpu, VCPU_REGS_RAX); - memcpy(vcpu->arch.pio_data, &val, 4); + if (!vcpu->arch.pio.in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); + } if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { complete_pio(vcpu); -- cgit v1.2.2 From 90bb6fc556ab255abd798bcf4ff5769690ab2eea Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 Dec 2009 12:10:16 +0200 Subject: KVM: MMU: Add tracepoint for guest page aging Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b8da6715d08b..739793240d1d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -151,6 +151,9 @@ module_param(oos_shadow, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +#include + +#undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS #include "mmutrace.h" @@ -792,6 +795,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data)) { int i, j; + int ret; int retval = 0; struct kvm_memslots *slots; @@ -806,16 +810,17 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - retval |= handler(kvm, &memslot->rmap[gfn_offset], - data); + ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { int idx = gfn_offset; idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); - retval |= handler(kvm, + ret |= handler(kvm, &memslot->lpage_info[j][idx].rmap_pde, data); } + trace_kvm_age_page(hva, memslot, ret); + retval |= ret; } } -- cgit v1.2.2 From a19a6d1131c77822ae55f84a1bd6d62327e99287 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 9 Feb 2010 16:41:53 +0800 Subject: KVM: VMX: Rename VMX_EPT_IGMT_BIT to VMX_EPT_IPAT_BIT Following the new SDM. Now the bit is named "Ignore PAT memory type". Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b400be06c8cd..f82b0723afa5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4001,7 +4001,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * b. VT-d with snooping control feature: snooping control feature of * VT-d engine can guarantee the cache correctness. Just set it * to WB to keep consistent with host. So the same as item 3. - * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep + * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep * consistent with host MTRR */ if (is_mmio) @@ -4012,7 +4012,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) VMX_EPT_MT_EPTE_SHIFT; else ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) - | VMX_EPT_IGMT_BIT; + | VMX_EPT_IPAT_BIT; return ret; } -- cgit v1.2.2 From d225f53b76f4287365efd381b9e3a732e6bad27e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 8 Feb 2010 17:03:51 +0800 Subject: KVM: PIT: unregister kvm irq notifier if fail to create pit If fail to create pit, we should unregister kvm irq notifier which register in kvm_create_pit(). Signed-off-by: Wei Yongjun Acked-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 6a74246f80c6..c9569f20dfba 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -663,8 +663,9 @@ fail_unregister: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail: - if (pit->irq_source_id >= 0) - kvm_free_irq_source_id(kvm, pit->irq_source_id); + kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); + kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); return NULL; -- cgit v1.2.2 From 72bb2fcd23afe8db53b47e8f9edd736c517ba532 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 9 Feb 2010 10:33:03 +0800 Subject: KVM: cleanup the failure path of KVM_CREATE_IRQCHIP ioctrl If we fail to init ioapic device or the fail to setup the default irq routing, the device register by kvm_create_pic() and kvm_ioapic_init() remain unregister. This patch fixed to do this. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 11 +++++++++++ arch/x86/kvm/irq.h | 1 + arch/x86/kvm/x86.c | 8 ++++---- 3 files changed, 16 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index d5753a75d58c..a3711f9e580f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -543,3 +543,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) return s; } + +void kvm_destroy_pic(struct kvm *kvm) +{ + struct kvm_pic *vpic = kvm->arch.vpic; + + if (vpic) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); + kvm->arch.vpic = NULL; + kfree(vpic); + } +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index be399e207d57..0b71d480ebf1 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -75,6 +75,7 @@ struct kvm_pic { }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); +void kvm_destroy_pic(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); void kvm_pic_clear_isr_ack(struct kvm *kvm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd3161c6daed..b2f91b9af00d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2771,6 +2771,8 @@ long kvm_arch_vm_ioctl(struct file *filp, if (vpic) { r = kvm_ioapic_init(kvm); if (r) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev); kfree(vpic); goto create_irqchip_unlock; } @@ -2782,10 +2784,8 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->irq_lock); - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); - kvm->arch.vpic = NULL; - kvm->arch.vioapic = NULL; + kvm_ioapic_destroy(kvm); + kvm_destroy_pic(kvm); mutex_unlock(&kvm->irq_lock); } create_irqchip_unlock: -- cgit v1.2.2 From 2db2c2eb6226e30f8059b82512a1364db98da8e3 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:29 +0200 Subject: KVM: x86 emulator: Add group8 instruction decoding Use groups mechanism to decode 0F BA instructions. Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 645b245a3c23..435b1e4e8c9b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -88,6 +88,7 @@ enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, + Group8, }; static u32 opcode_table[256] = { @@ -267,7 +268,7 @@ static u32 twobyte_table[256] = { 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, Group | Group8, DstMem | SrcReg | ModRM | BitOp, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xC0 - 0xCF */ @@ -323,6 +324,10 @@ static u32 group_table[] = { 0, 0, ModRM | SrcMem, ModRM | SrcMem, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, + [Group8*8] = + 0, 0, 0, 0, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, }; static u32 group2_table[] = { -- cgit v1.2.2 From 60a29d4ea4e7b6b95d9391ebc8625b0426f3a363 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:30 +0200 Subject: KVM: x86 emulator: Add group9 instruction decoding Use groups mechanism to decode 0F C7 instructions. Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 435b1e4e8c9b..45a4f7c1bb0b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -88,7 +88,7 @@ enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, - Group8, + Group8, Group9, }; static u32 opcode_table[256] = { @@ -272,7 +272,8 @@ static u32 twobyte_table[256] = { 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, + 0, 0, 0, Group | GroupDual | Group9, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -328,6 +329,8 @@ static u32 group_table[] = { 0, 0, 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + [Group9*8] = + 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, }; static u32 group2_table[] = { @@ -335,6 +338,8 @@ static u32 group2_table[] = { SrcNone | ModRM, 0, 0, SrcNone | ModRM, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, 0, + [Group9*8] = + 0, 0, 0, 0, 0, 0, 0, 0, }; /* EFLAGS bit definitions. */ -- cgit v1.2.2 From a0044755679f3e761b8b95995e5f2db2b7efd0f6 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:31 +0200 Subject: KVM: x86 emulator: Add Virtual-8086 mode of emulation For some instructions CPU behaves differently for real-mode and virtual 8086. Let emulator know which mode cpu is in, so it will not poke into vcpu state directly. Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 12 +++++++----- arch/x86/kvm/x86.c | 3 ++- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 45a4f7c1bb0b..e4e2df3b6038 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -899,6 +899,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) switch (mode) { case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: def_op_bytes = def_ad_bytes = 2; break; @@ -1525,7 +1526,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) /* syscall is not available in real mode */ if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !is_protmode(ctxt->vcpu)) + || ctxt->mode == X86EMUL_MODE_VM86) return -1; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1577,8 +1578,8 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) if (c->lock_prefix) return -1; - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || !is_protmode(ctxt->vcpu)) { + /* inject #GP if in real mode */ + if (ctxt->mode == X86EMUL_MODE_REAL) { kvm_inject_gp(ctxt->vcpu, 0); return -1; } @@ -1642,8 +1643,9 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) if (c->lock_prefix) return -1; - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || !is_protmode(ctxt->vcpu)) { + /* inject #GP if in real mode or Virtual 8086 mode */ + if (ctxt->mode == X86EMUL_MODE_REAL || + ctxt->mode == X86EMUL_MODE_VM86) { kvm_inject_gp(ctxt->vcpu, 0); return -1; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b2f91b9af00d..a28379507d30 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3348,8 +3348,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.vcpu = vcpu; vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; -- cgit v1.2.2 From 1871c6020d7308afb99127bba51f04548e7ca84e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:32 +0200 Subject: KVM: x86 emulator: fix memory access during x86 emulation Currently when x86 emulator needs to access memory, page walk is done with broadest permission possible, so if emulated instruction was executed by userspace process it can still access kernel memory. Fix that by providing correct memory access to page walker during emulation. Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 +-- arch/x86/kvm/mmu.c | 17 +++--- arch/x86/kvm/mmu.h | 6 +++ arch/x86/kvm/paging_tmpl.h | 11 ++-- arch/x86/kvm/x86.c | 131 ++++++++++++++++++++++++++++++++++----------- 5 files changed, 124 insertions(+), 47 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e4e2df3b6038..c44b46014842 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -616,7 +616,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, if (linear < fc->start || linear >= fc->end) { size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); + rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); if (rc) return rc; fc->start = linear; @@ -671,11 +671,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, op_bytes = 3; *address = 0; rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); + ctxt->vcpu, NULL); if (rc) return rc; rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); + ctxt->vcpu, NULL); return rc; } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 739793240d1d..741373e8ca77 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -138,12 +138,6 @@ module_param(oos_shadow, bool, 0644); #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) - #define RMAP_EXT 4 #define ACC_EXEC_MASK 1 @@ -1632,7 +1626,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) { struct page *page; - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); if (gpa == UNMAPPED_GVA) return NULL; @@ -2155,8 +2149,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) { + if (error) + *error = 0; return vaddr; } @@ -2740,7 +2737,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) if (tdp_enabled) return 0; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); @@ -3237,7 +3234,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) audit_mappings_page(vcpu, ent, va, level - 1); else { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); gfn_t gfn = gpa >> PAGE_SHIFT; pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 61ef5a65b7d8..be66759321a5 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -42,6 +42,12 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) +#define PFERR_FETCH_MASK (1U << 4) + int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index df15a5307d2d..81eab9a50e6a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -490,18 +490,23 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, + u32 *error) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); + r = FNAME(walk_addr)(&walker, vcpu, vaddr, + !!(access & PFERR_WRITE_MASK), + !!(access & PFERR_USER_MASK), + !!(access & PFERR_FETCH_MASK)); if (r) { gpa = gfn_to_gpa(walker.gfn); gpa |= vaddr & ~PAGE_MASK; - } + } else if (error) + *error = walker.error_code; return gpa; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a28379507d30..ea3a8af8a478 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3039,14 +3039,41 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + + gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_FETCH_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_WRITE_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +/* uses this to access any guest's mapped memory without checking CPL */ +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); +} + +static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 access, + u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -3069,14 +3096,37 @@ out: return r; } +/* used for instruction fetching */ +static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, error); +} + +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, + error); +} + +static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); +} + static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) + struct kvm_vcpu *vcpu, u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -3106,6 +3156,7 @@ static int emulator_read_emulated(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -3115,17 +3166,20 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return X86EMUL_PROPAGATE_FAULT; + } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu) + if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; mmio: /* @@ -3164,11 +3218,12 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, 2); + kvm_inject_page_fault(vcpu, addr, error_code); return X86EMUL_PROPAGATE_FAULT; } @@ -3232,7 +3287,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, char *kaddr; u64 val; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); if (gpa == UNMAPPED_GVA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3297,7 +3352,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); @@ -3305,7 +3360,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static struct x86_emulate_ops emulate_ops = { - .read_std = kvm_read_guest_virt, + .read_std = kvm_read_guest_virt_system, + .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, @@ -3442,12 +3498,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu) gva_t q = vcpu->arch.pio.guest_gva; unsigned bytes; int ret; + u32 error_code; bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; if (vcpu->arch.pio.in) - ret = kvm_write_guest_virt(q, p, bytes, vcpu); + ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); else - ret = kvm_read_guest_virt(q, p, bytes, vcpu); + ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); + + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, q, error_code); + return ret; } @@ -3468,7 +3529,7 @@ int complete_pio(struct kvm_vcpu *vcpu) if (io->in) { r = pio_copy_data(vcpu); if (r) - return r; + goto out; } delta = 1; @@ -3495,7 +3556,7 @@ int complete_pio(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RSI, val); } } - +out: io->count -= io->cur_count; io->cur_count = 0; @@ -3617,10 +3678,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); - if (ret == X86EMUL_PROPAGATE_FAULT) { - kvm_inject_gp(vcpu, 0); + if (ret == X86EMUL_PROPAGATE_FAULT) return 1; - } if (ret == 0 && !pio_string_write(vcpu)) { complete_pio(vcpu); if (vcpu->arch.pio.count == 0) @@ -4663,7 +4722,9 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } - return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + return kvm_read_guest_virt_system(dtable.base + index*8, + seg_desc, sizeof(*seg_desc), + vcpu, NULL); } /* allowed just for 8 bytes segments */ @@ -4677,15 +4738,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit < index * 8 + 7) return 1; - return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL); +} + +static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu, + struct desc_struct *seg_desc) +{ + u32 base_addr = get_desc_base(seg_desc); + + return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL); } -static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, +static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) { u32 base_addr = get_desc_base(seg_desc); - return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); + return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL); } static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) @@ -4894,7 +4963,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, sizeof tss_segment_16)) goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), &tss_segment_16, sizeof tss_segment_16)) goto out; @@ -4902,7 +4971,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, tss_segment_16.prev_task_link = old_tss_sel; if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), + get_tss_base_addr_write(vcpu, nseg_desc), &tss_segment_16.prev_task_link, sizeof tss_segment_16.prev_task_link)) goto out; @@ -4933,7 +5002,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, sizeof tss_segment_32)) goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), &tss_segment_32, sizeof tss_segment_32)) goto out; @@ -4941,7 +5010,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, tss_segment_32.prev_task_link = old_tss_sel; if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), + get_tss_base_addr_write(vcpu, nseg_desc), &tss_segment_32.prev_task_link, sizeof tss_segment_32.prev_task_link)) goto out; @@ -4964,7 +5033,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); - old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); + old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); /* FIXME: Handle errors. Failure to read either TSS or their * descriptors should generate a pagefault. @@ -5199,7 +5268,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, vcpu_load(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; -- cgit v1.2.2 From f850e2e603bf5a05b0aee7901857cf85715aa694 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:33 +0200 Subject: KVM: x86 emulator: Check IOPL level during io instruction emulation Make emulator check that vcpu is allowed to execute IN, INS, OUT, OUTS, CLI, STI. Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++---- arch/x86/kvm/x86.c | 10 +++--- 2 files changed, 86 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c44b46014842..296e8519dc53 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1698,6 +1698,57 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) return 0; } +static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) +{ + int iopl; + if (ctxt->mode == X86EMUL_MODE_REAL) + return false; + if (ctxt->mode == X86EMUL_MODE_VM86) + return true; + iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; +} + +static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + struct kvm_segment tr_seg; + int r; + u16 io_bitmap_ptr; + u8 perm, bit_idx = port & 0x7; + unsigned mask = (1 << len) - 1; + + kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); + if (tr_seg.unusable) + return false; + if (tr_seg.limit < 103) + return false; + r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, + NULL); + if (r != X86EMUL_CONTINUE) + return false; + if (io_bitmap_ptr + port/8 > tr_seg.limit) + return false; + r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, + ctxt->vcpu, NULL); + if (r != X86EMUL_CONTINUE) + return false; + if ((perm >> bit_idx) & mask) + return false; + return true; +} + +static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + if (emulator_bad_iopl(ctxt)) + if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) + return false; + return true; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1889,7 +1940,12 @@ special_insn: break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio_string(ctxt->vcpu, 1, (c->d & ByteOp) ? 1 : c->op_bytes, c->rep_prefix ? @@ -1905,6 +1961,11 @@ special_insn: return 0; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } if (kvm_emulate_pio_string(ctxt->vcpu, 0, (c->d & ByteOp) ? 1 : c->op_bytes, @@ -2202,7 +2263,13 @@ special_insn: case 0xef: /* out (e/r)ax,dx */ port = c->regs[VCPU_REGS_RDX]; io_dir_in = 0; - do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, + do_io: + if (!emulator_io_permited(ctxt, ops, port, + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, (c->d & ByteOp) ? 1 : c->op_bytes, port) != 0) { c->eip = saved_eip; @@ -2227,13 +2294,21 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfa: /* cli */ - ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + ctxt->eflags &= ~X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfb: /* sti */ - toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); - ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); + ctxt->eflags |= X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ea3a8af8a478..86b739f8f173 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3599,6 +3599,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) { unsigned long val; + trace_kvm_pio(!in, port, size, 1); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3610,9 +3612,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) vcpu->arch.pio.down = 0; vcpu->arch.pio.rep = 0; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, 1); - if (!vcpu->arch.pio.in) { val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(vcpu->arch.pio_data, &val, 4); @@ -3633,6 +3632,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, unsigned now, in_page; int ret = 0; + trace_kvm_pio(!in, port, size, count); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3644,9 +3645,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, vcpu->arch.pio.down = down; vcpu->arch.pio.rep = rep; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, count); - if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); return 1; -- cgit v1.2.2 From d4c6a1549c056f1d817e8f6f2f97d8b44933472f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:34 +0200 Subject: KVM: x86 emulator: Fix popf emulation POPF behaves differently depending on current CPU mode. Emulate correct logic to prevent guest from changing flags that it can't change otherwise. Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 296e8519dc53..1782387c069e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -343,11 +343,18 @@ static u32 group2_table[] = { }; /* EFLAGS bit definitions. */ +#define EFLG_ID (1<<21) +#define EFLG_VIP (1<<20) +#define EFLG_VIF (1<<19) +#define EFLG_AC (1<<18) #define EFLG_VM (1<<17) #define EFLG_RF (1<<16) +#define EFLG_IOPL (3<<12) +#define EFLG_NT (1<<14) #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) #define EFLG_IF (1<<9) +#define EFLG_TF (1<<8) #define EFLG_SF (1<<7) #define EFLG_ZF (1<<6) #define EFLG_AF (1<<4) @@ -1214,6 +1221,49 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, return rc; } +static int emulate_popf(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + int rc; + unsigned long val, change_mask; + int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); + + rc = emulate_pop(ctxt, ops, &val, len); + if (rc != X86EMUL_CONTINUE) + return rc; + + change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + + switch(ctxt->mode) { + case X86EMUL_MODE_PROT64: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT16: + if (cpl == 0) + change_mask |= EFLG_IOPL; + if (cpl <= iopl) + change_mask |= EFLG_IF; + break; + case X86EMUL_MODE_VM86: + if (iopl < 3) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } + change_mask |= EFLG_IF; + break; + default: /* real mode */ + change_mask |= (EFLG_IOPL | EFLG_IF); + break; + } + + *(unsigned long *)dest = + (ctxt->eflags & ~change_mask) | (val & change_mask); + + return rc; +} + static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) { struct decode_cache *c = &ctxt->decode; @@ -2099,7 +2149,10 @@ special_insn: c->dst.type = OP_REG; c->dst.ptr = (unsigned long *) &ctxt->eflags; c->dst.bytes = c->op_bytes; - goto pop_instruction; + rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xa0 ... 0xa1: /* mov */ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; c->dst.val = c->src.val; -- cgit v1.2.2 From e92805ac1228626c59c865f2f4e9059b9fb8c97b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:35 +0200 Subject: KVM: x86 emulator: Check CPL level during privilege instruction emulation Add CPL checking in case emulator is tricked into emulating privilege instruction from userspace. Signed-off-by: Gleb Natapov Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1782387c069e..d63211169ac1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -76,6 +76,7 @@ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ /* Misc flags */ +#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Source 2 operand type */ #define Src2None (0<<29) @@ -211,7 +212,7 @@ static u32 opcode_table[256] = { SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, + ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, @@ -219,16 +220,20 @@ static u32 opcode_table[256] = { static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + 0, Group | GroupDual | Group7, 0, 0, + 0, ImplicitOps, ImplicitOps | Priv, 0, + ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, + 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + ModRM | ImplicitOps | Priv, ModRM | Priv, + ModRM | ImplicitOps | Priv, ModRM | Priv, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, + ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, + ImplicitOps, ImplicitOps | Priv, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x47 */ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -322,9 +327,9 @@ static u32 group_table[] = { SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, [Group7*8] = - 0, 0, ModRM | SrcMem, ModRM | SrcMem, + 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, + SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, [Group8*8] = 0, 0, 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, @@ -335,7 +340,7 @@ static u32 group_table[] = { static u32 group2_table[] = { [Group7*8] = - SrcNone | ModRM, 0, 0, SrcNone | ModRM, + SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, 0, [Group9*8] = @@ -1700,12 +1705,6 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) return -1; } - /* sysexit must be called from CPL 0 */ - if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - setup_syscalls_segments(ctxt, &cs, &ss); if ((c->rex_prefix & 0x8) != 0x0) @@ -1820,6 +1819,12 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); saved_eip = c->eip; + /* Privileged instruction can be executed only in CPL=0 */ + if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) memop = c->modrm_ea; -- cgit v1.2.2 From d380a5e4022a5c63a6c5ac631e48c752ba7f1e4f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 10 Feb 2010 14:21:36 +0200 Subject: KVM: x86 emulator: Add LOCK prefix validity checking Instructions which are not allowed to have LOCK prefix should generate #UD if one is used. [avi: fold opcode 82 fix from another patch] Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 97 +++++++++++++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 41 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d63211169ac1..c2de9f03da84 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -76,6 +76,7 @@ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ /* Misc flags */ +#define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Source 2 operand type */ @@ -94,35 +95,35 @@ enum { static u32 opcode_table[256] = { /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, 0, /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x38 - 0x3F */ @@ -158,7 +159,7 @@ static u32 opcode_table[256] = { Group | Group1_80, Group | Group1_81, Group | Group1_82, Group | Group1_83, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -263,17 +264,18 @@ static u32 twobyte_table[256] = { DstMem | SrcReg | Src2CL | ModRM, 0, 0, /* 0xA8 - 0xAF */ ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, DstMem | SrcReg | Src2ImmByte | ModRM, DstMem | SrcReg | Src2CL | ModRM, ModRM, 0, /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, - DstMem | SrcReg | ModRM | BitOp, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xB8 - 0xBF */ - 0, 0, Group | Group8, DstMem | SrcReg | ModRM | BitOp, + 0, 0, + Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xC0 - 0xCF */ @@ -290,25 +292,41 @@ static u32 twobyte_table[256] = { static u32 group_table[] = { [Group1_80*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM, [Group1_81*8] = - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM, [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM, [Group1_83*8] = - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM, [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = @@ -332,10 +350,10 @@ static u32 group_table[] = { SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, [Group8*8] = 0, 0, 0, 0, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, [Group9*8] = - 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, + 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0, }; static u32 group2_table[] = { @@ -1580,8 +1598,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) u64 msr_data; /* syscall is not available in real mode */ - if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || ctxt->mode == X86EMUL_MODE_VM86) + if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) return -1; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1629,10 +1646,6 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) struct kvm_segment cs, ss; u64 msr_data; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) { kvm_inject_gp(ctxt->vcpu, 0); @@ -1694,10 +1707,6 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) u64 msr_data; int usermode; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - /* inject #GP if in real mode or Virtual 8086 mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) { @@ -1819,6 +1828,12 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); saved_eip = c->eip; + /* LOCK prefix is allowed only with some instructions */ + if (c->lock_prefix && !(c->d & Lock)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + /* Privileged instruction can be executed only in CPL=0 */ if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { kvm_inject_gp(ctxt->vcpu, 0); -- cgit v1.2.2 From 1d327eac3c9a8b170d68df44957d82bdbd263c53 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 11 Feb 2010 11:12:07 +0800 Subject: KVM: x86 emulator: code style cleanup Just remove redundant semicolon. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c2de9f03da84..dd1b93522379 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1016,7 +1016,7 @@ done_prefixes: } if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { - kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");; + kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction"); return -1; } -- cgit v1.2.2 From e424e1918345ab17d3beb068b8745ab1706d0e1b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 11 Feb 2010 12:41:10 +0200 Subject: KVM: x86 emulator: disallow opcode 82 in 64-bit mode Instructions with opcode 82 are not valid in 64 bit mode. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index dd1b93522379..c280c2352f7e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -310,14 +310,14 @@ static u32 group_table[] = { DstMem | SrcImm | ModRM | Lock, DstMem | SrcImm | ModRM, [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64, [Group1_83*8] = DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, -- cgit v1.2.2 From fa8273e9549b3cee2947d1aebe5242e7a25eb3de Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 Feb 2010 14:00:41 +0000 Subject: KVM: Convert i8254/i8259 locks to raw_spinlocks The i8254/i8259 locks need to be real spinlocks on preempt-rt. Convert them to raw_spinlock. No change for !RT kernels. Signed-off-by: Thomas Gleixner Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 10 +++++----- arch/x86/kvm/i8254.h | 2 +- arch/x86/kvm/i8259.c | 31 ++++++++++++++++--------------- arch/x86/kvm/irq.h | 2 +- arch/x86/kvm/x86.c | 8 ++++---- 5 files changed, 27 insertions(+), 26 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c9569f20dfba..294698b6daff 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -242,11 +242,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_dec_return(&ps->pit_timer.pending) < 0) atomic_inc(&ps->pit_timer.pending); ps->irq_ack = 1; - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -624,7 +624,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - spin_lock_init(&pit->pit_state.inject_lock); + raw_spin_lock_init(&pit->pit_state.inject_lock); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -724,12 +724,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) /* Try to inject pending interrupts when * last one has been acked. */ - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { ps->irq_ack = 0; inject = 1; } - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); if (inject) __inject_pit_timer_intr(kvm); } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index d4c1c7ffdc09..900d6b0ba7c2 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - spinlock_t inject_lock; + raw_spinlock_t inject_lock; unsigned long irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index a3711f9e580f..07771da85de5 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -44,18 +44,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) * Other interrupt may be delivered to PIC while lock is dropped but * it should be safe since PIC state is already updated at this stage. */ - spin_unlock(&s->pics_state->lock); + raw_spin_unlock(&s->pics_state->lock); kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); - spin_lock(&s->pics_state->lock); + raw_spin_lock(&s->pics_state->lock); } void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + + raw_spin_lock(&s->lock); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); } /* @@ -156,9 +157,9 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { - spin_lock(&s->lock); + raw_spin_lock(&s->lock); pic_update_irq(s); - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); } int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -166,14 +167,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) struct kvm_pic *s = opaque; int ret = -1; - spin_lock(&s->lock); + raw_spin_lock(&s->lock); if (irq >= 0 && irq < PIC_NUM_PINS) { ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); } - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return ret; } @@ -203,7 +204,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + raw_spin_lock(&s->lock); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -228,7 +229,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return intno; } @@ -442,7 +443,7 @@ static int picdev_write(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte write\n"); return 0; } - spin_lock(&s->lock); + raw_spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -455,7 +456,7 @@ static int picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return 0; } @@ -472,7 +473,7 @@ static int picdev_read(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte read\n"); return 0; } - spin_lock(&s->lock); + raw_spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -486,7 +487,7 @@ static int picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return 0; } @@ -520,7 +521,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; - spin_lock_init(&s->lock); + raw_spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 0b71d480ebf1..34b15915754d 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -62,7 +62,7 @@ struct kvm_kpic_state { }; struct kvm_pic { - spinlock_t lock; + raw_spinlock_t lock; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 86b739f8f173..f6ae4875d9d0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2542,18 +2542,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); -- cgit v1.2.2 From 8b9f44140bc4afd2698413cd9960c3912168ee91 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 18 Feb 2010 12:14:59 +0200 Subject: KVM: x86 emulator: Forbid modifying CS segment register by mov instruction Inject #UD if guest attempts to do so. This is in accordance to Intel SDM. Cc: stable@kernel.org (2.6.33, 2.6.32) Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c280c2352f7e..2db760ff887c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2126,6 +2126,12 @@ special_insn: int err; sel = c->src.val; + + if (c->modrm_reg == VCPU_SREG_CS) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + if (c->modrm_reg == VCPU_SREG_SS) toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); -- cgit v1.2.2 From 6f550484a15ea1b468665cdf59f020bf08ccb292 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 18 Feb 2010 12:15:00 +0200 Subject: KVM: Fix load_guest_segment_descriptor() to inject page fault This patch injects page fault when reading descriptor in load_guest_segment_descriptor() fails with FAULT. Effects of this injection: This function is used by kvm_load_segment_descriptor() which is necessary for the following instructions: - mov seg,r/m16 - jmp far - pop ?s This patch makes it possible to emulate the page faults generated by these instructions. But be sure that unless we change the kvm_load_segment_descriptor()'s ret value propagation this patch has no effect. Signed-off-by: Takuya Yoshikawa Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f6ae4875d9d0..203ee7d0ed58 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4713,6 +4713,9 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, { struct descriptor_table dtable; u16 index = selector >> 3; + int ret; + u32 err; + gva_t addr; get_segment_descriptor_dtable(vcpu, selector, &dtable); @@ -4720,9 +4723,13 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } - return kvm_read_guest_virt_system(dtable.base + index*8, - seg_desc, sizeof(*seg_desc), - vcpu, NULL); + addr = dtable.base + index * 8; + ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc), + vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, addr, err); + + return ret; } /* allowed just for 8 bytes segments */ -- cgit v1.2.2 From c697518a861e6c43b92b848895f9926580ee63c3 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 18 Feb 2010 12:15:01 +0200 Subject: KVM: Fix segment descriptor loading Add proper error and permission checking. This patch also change task switching code to load segment selectors before segment descriptors, like SDM requires, otherwise permission checking during segment descriptor loading will be incorrect. Cc: stable@kernel.org (2.6.33, 2.6.32) Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 30 +++------ arch/x86/kvm/x86.c | 177 +++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 150 insertions(+), 57 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2db760ff887c..a1a7b27adf41 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1309,7 +1309,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, if (rc != 0) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg); return rc; } @@ -1491,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); if (rc) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); return rc; } @@ -2122,12 +2122,11 @@ special_insn: break; case 0x8e: { /* mov seg, r/m16 */ uint16_t sel; - int type_bits; - int err; sel = c->src.val; - if (c->modrm_reg == VCPU_SREG_CS) { + if (c->modrm_reg == VCPU_SREG_CS || + c->modrm_reg > VCPU_SREG_GS) { kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; } @@ -2135,18 +2134,7 @@ special_insn: if (c->modrm_reg == VCPU_SREG_SS) toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); - if (c->modrm_reg <= 5) { - type_bits = (c->modrm_reg == 1) ? 9 : 1; - err = kvm_load_segment_descriptor(ctxt->vcpu, sel, - type_bits, c->modrm_reg); - } else { - printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; - } - - if (err < 0) - goto cannot_emulate; + rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); c->dst.type = OP_NONE; /* Disable writeback. */ break; @@ -2320,11 +2308,9 @@ special_insn: case 0xe9: /* jmp rel */ goto jmp; case 0xea: /* jmp far */ - if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, - VCPU_SREG_CS) < 0) { - DPRINTF("jmp far: Failed to load CS descriptor\n"); - goto cannot_emulate; - } + if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, + VCPU_SREG_CS)) + goto done; c->eip = c->src.val; break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 203ee7d0ed58..c3d2acbbb91b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4787,7 +4787,7 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se .unusable = 0, }; kvm_x86_ops->set_segment(vcpu, &segvar, seg); - return 0; + return X86EMUL_CONTINUE; } static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) @@ -4797,43 +4797,112 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); } -static void kvm_check_segment_descriptor(struct kvm_vcpu *vcpu, int seg, - u16 selector) -{ - /* NULL selector is not valid for CS and SS */ - if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) - if (!selector) - kvm_queue_exception_e(vcpu, TS_VECTOR, selector >> 3); -} - -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg) +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment kvm_seg; struct desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) return kvm_load_realmode_segment(vcpu, selector, seg); - if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) - return 1; + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); + if (ret) + return ret; + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); - kvm_check_segment_descriptor(vcpu, seg, selector); - kvm_seg.type |= type_bits; + if (null_selector) { /* for NULL selector skip all following checks */ + kvm_seg.unusable = 1; + goto load; + } - if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && - seg != VCPU_SREG_LDTR) - if (!kvm_seg.s) - kvm_seg.unusable = 1; + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; - kvm_set_segment(vcpu, &kvm_seg, seg); - if (selector && !kvm_seg.unusable && kvm_seg.s) { + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !kvm_seg.s) + goto exception; + + if (!kvm_seg.present) { + err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = kvm_seg.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(kvm_seg.type & 8)) + goto exception; + + if (kvm_seg.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (kvm_seg.s || kvm_seg.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((kvm_seg.type & 0xa) == 0x8 || + (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (!kvm_seg.unusable && kvm_seg.s) { /* mark segment as accessed */ + kvm_seg.type |= 1; seg_desc.type |= 1; save_guest_segment_descriptor(vcpu, selector, &seg_desc); } - return 0; +load: + kvm_set_segment(vcpu, &kvm_seg, seg); + return X86EMUL_CONTINUE; +exception: + kvm_queue_exception_e(vcpu, err_vec, err_code); + return X86EMUL_PROPAGATE_FAULT; } static void save_state_to_tss32(struct kvm_vcpu *vcpu, @@ -4859,6 +4928,14 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); } +static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg) +{ + struct kvm_segment kvm_seg; + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} + static int load_state_from_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { @@ -4876,25 +4953,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); - if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); + kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) + if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) + if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS)) return 1; return 0; } @@ -4934,19 +5027,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); - if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) return 1; return 0; } -- cgit v1.2.2 From e54cfa97a9ca9a544a7257b89b530b505ae1b892 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 18 Feb 2010 12:15:02 +0200 Subject: KVM: Fix emulate_sys[call, enter, exit]()'s fault handling This patch fixes emulate_syscall(), emulate_sysenter() and emulate_sysexit() to handle injected faults properly. Even though original code injects faults in these functions, we cannot handle these unless we use the different return value from the UNHANDLEABLE case. So this patch use X86EMUL_* codes instead of -1 and 0 and makes x86_emulate_insn() to handle these propagated faults. Be sure that, in x86_emulate_insn(), goto cannot_emulate and goto done with rc equals X86EMUL_UNHANDLEABLE have same effect. Signed-off-by: Takuya Yoshikawa Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a1a7b27adf41..4dade6ac0827 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1599,7 +1599,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) /* syscall is not available in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) - return -1; + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1636,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); } - return 0; + return X86EMUL_CONTINUE; } static int @@ -1649,14 +1649,14 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_UNHANDLEABLE; } /* XXX sysenter/sysexit have not been tested in 64bit mode. * Therefore, we inject an #UD. */ if (ctxt->mode == X86EMUL_MODE_PROT64) - return -1; + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1665,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; case X86EMUL_MODE_PROT64: if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; } @@ -1696,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; - return 0; + return X86EMUL_CONTINUE; } static int @@ -1711,7 +1711,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_UNHANDLEABLE; } setup_syscalls_segments(ctxt, &cs, &ss); @@ -1729,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.selector = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = (u16)(msr_data + 24); break; @@ -1737,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.selector = (u16)(msr_data + 32); if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = cs.selector + 8; cs.db = 0; @@ -1753,7 +1753,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; - return 0; + return X86EMUL_CONTINUE; } static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) @@ -2476,8 +2476,9 @@ twobyte_insn: } break; case 0x05: /* syscall */ - if (emulate_syscall(ctxt) == -1) - goto cannot_emulate; + rc = emulate_syscall(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; @@ -2548,14 +2549,16 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ - if (emulate_sysenter(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysenter(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; case 0x35: /* sysexit */ - if (emulate_sysexit(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysexit(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; -- cgit v1.2.2 From c573cd22939e54fc1b8e672054a505048987a7cb Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 23 Feb 2010 17:47:53 +0100 Subject: KVM: VMX: Update instruction length on intercepted BP We intercept #BP while in guest debugging mode. As VM exits due to intercepted exceptions do not necessarily come with valid idt_vectoring, we have to update event_exit_inst_len explicitly in such cases. At least in the absence of migration, this ensures that re-injections of #BP will find and use the correct instruction length. Signed-off-by: Jan Kiszka Cc: stable@kernel.org (2.6.32, 2.6.33) Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f82b0723afa5..14873b9f8430 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2775,6 +2775,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, kvm_queue_exception(vcpu, vec); return 1; case BP_VECTOR: + /* + * Update instruction length as we may reinject the exception + * from user space while in guest debugging mode. + */ + to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return 0; /* fall through */ @@ -2897,6 +2903,13 @@ static int handle_exception(struct kvm_vcpu *vcpu) kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); /* fall through */ case BP_VECTOR: + /* + * Update instruction length as we may reinject #BP from + * user space while in guest debugging mode. Reading it for + * #DB as well causes no harm, it is not used in that case. + */ + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; -- cgit v1.2.2 From d2be1651b736002e0c76d7095d6c0ba77b4a897c Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 23 Feb 2010 17:47:57 +0100 Subject: KVM: x86: Add KVM_CAP_X86_ROBUST_SINGLESTEP This marks the guest single-step API improvement of 94fe45da and 91586a3b with a capability flag to allow reliable detection by user space. Signed-off-by: Jan Kiszka Cc: stable@kernel.org (2.6.33) Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c3d2acbbb91b..e46282a56565 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1570,6 +1570,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: case KVM_CAP_PCI_SEGMENT: + case KVM_CAP_X86_ROBUST_SINGLESTEP: r = 1; break; case KVM_CAP_COALESCED_MMIO: -- cgit v1.2.2