author     Linus Torvalds <torvalds@linux-foundation.org>   2013-05-05 17:47:31 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-05-05 17:47:31 -0400
commit     01227a889ed56ae53aeebb9f93be9d54dd8b2de8 (patch)
tree       d5eba9359a9827e84d4112b84d48c54df5c5acde /arch/x86
parent     9e6879460c8edb0cd3c24c09b83d06541b5af0dc (diff)
parent     db6ae6158186a17165ef990bda2895ae7594b039 (diff)
Merge tag 'kvm-3.10-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Gleb Natapov:
 "Highlights of the updates are:

  general:
   - new emulated device API
   - legacy device assignment is now optional
   - irqfd interface is more generic and can be shared between arches

  x86:
   - VMCS shadow support and other nested VMX improvements
   - APIC virtualization and Posted Interrupt hardware support
   - Optimize mmio spte zapping

  ppc:
   - BookE: in-kernel MPIC emulation with irqfd support
   - Book3S: in-kernel XICS emulation (incomplete)
   - Book3S: HV: migration fixes
   - BookE: more debug support preparation
   - BookE: e6500 support

  ARM:
   - reworking of Hyp idmaps

  s390:
   - ioeventfd for virtio-ccw

  And many other bug fixes, cleanups and improvements"

* tag 'kvm-3.10-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (204 commits)
  kvm: Add compat_ioctl for device control API
  KVM: x86: Account for failing enable_irq_window for NMI window request
  KVM: PPC: Book3S: Add API for in-kernel XICS emulation
  kvm/ppc/mpic: fix missing unlock in set_base_addr()
  kvm/ppc: Hold srcu lock when calling kvm_io_bus_read/write
  kvm/ppc/mpic: remove users
  kvm/ppc/mpic: fix mmio region lists when multiple guests used
  kvm/ppc/mpic: remove default routes from documentation
  kvm: KVM_CAP_IOMMU only available with device assignment
  ARM: KVM: iterate over all CPUs for CPU compatibility check
  KVM: ARM: Fix spelling in error message
  ARM: KVM: define KVM_ARM_MAX_VCPUS unconditionally
  KVM: ARM: Fix API documentation for ONE_REG encoding
  ARM: KVM: promote vfp_host pointer to generic host cpu context
  ARM: KVM: add architecture specific hook for capabilities
  ARM: KVM: perform HYP initilization for hotplugged CPUs
  ARM: KVM: switch to a dual-step HYP init code
  ARM: KVM: rework HYP page table freeing
  ARM: KVM: enforce maximum size for identity mapped code
  ARM: KVM: move to a KVM provided HYP idmap
  ...
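Several of the highlights above surface to userspace as capability changes (for example, the shortlog entry "kvm: KVM_CAP_IOMMU only available with device assignment"). As a rough illustration only — this sketch is not part of the merge, and the particular capabilities queried are an assumption for the example — the long-standing KVM_CHECK_EXTENSION ioctl can be used to see which of them a given kernel exposes:

/*
 * Illustrative userspace sketch (not from this merge): probe a couple of the
 * KVM capabilities touched by this pull request. KVM_CHECK_EXTENSION and the
 * capability constants come from <linux/kvm.h>; which ones to query here is
 * an assumption made for demonstration.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR);

        if (kvm < 0) {
                perror("open /dev/kvm");
                return 1;
        }

        /* irqfd support, made more generic in this cycle */
        printf("KVM_CAP_IRQFD: %d\n",
               ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_IRQFD));
        /* reported only when legacy device assignment is built in */
        printf("KVM_CAP_IOMMU: %d\n",
               ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_IOMMU));

        close(kvm);
        return 0;
}

A return value greater than zero from KVM_CHECK_EXTENSION means the capability is available on the running kernel.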
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/entry_arch.h        4
-rw-r--r--  arch/x86/include/asm/hardirq.h           3
-rw-r--r--  arch/x86/include/asm/hw_irq.h            1
-rw-r--r--  arch/x86/include/asm/irq_vectors.h       5
-rw-r--r--  arch/x86/include/asm/kvm_host.h         26
-rw-r--r--  arch/x86/include/asm/vmx.h              18
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h          1
-rw-r--r--  arch/x86/include/uapi/asm/msr-index.h    2
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h          5
-rw-r--r--  arch/x86/kernel/entry_64.S               5
-rw-r--r--  arch/x86/kernel/irq.c                   22
-rw-r--r--  arch/x86/kernel/irqinit.c                4
-rw-r--r--  arch/x86/kernel/kvmclock.c               9
-rw-r--r--  arch/x86/kvm/Kconfig                    14
-rw-r--r--  arch/x86/kvm/Makefile                    5
-rw-r--r--  arch/x86/kvm/emulate.c                  31
-rw-r--r--  arch/x86/kvm/i8254.c                     4
-rw-r--r--  arch/x86/kvm/lapic.c                   189
-rw-r--r--  arch/x86/kvm/lapic.h                    22
-rw-r--r--  arch/x86/kvm/mmu.c                     108
-rw-r--r--  arch/x86/kvm/mmu.h                      11
-rw-r--r--  arch/x86/kvm/paging_tmpl.h               2
-rw-r--r--  arch/x86/kvm/pmu.c                      14
-rw-r--r--  arch/x86/kvm/svm.c                      40
-rw-r--r--  arch/x86/kvm/vmx.c                    1077
-rw-r--r--  arch/x86/kvm/x86.c                     243
26 files changed, 1344 insertions, 521 deletions
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 40afa0005c69..9bd4ecac72be 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -19,6 +19,10 @@ BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
19 19
20BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) 20BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
21 21
22#ifdef CONFIG_HAVE_KVM
23BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR)
24#endif
25
22/* 26/*
23 * every pentium local APIC has two 'local interrupts', with a 27 * every pentium local APIC has two 'local interrupts', with a
24 * soft-definable vector attached to both interrupts, one of 28 * soft-definable vector attached to both interrupts, one of
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 81f04cee5f74..ab0ae1aa6d0a 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,6 +12,9 @@ typedef struct {
12 unsigned int irq_spurious_count; 12 unsigned int irq_spurious_count;
13 unsigned int icr_read_retry_count; 13 unsigned int icr_read_retry_count;
14#endif 14#endif
15#ifdef CONFIG_HAVE_KVM
16 unsigned int kvm_posted_intr_ipis;
17#endif
15 unsigned int x86_platform_ipis; /* arch dependent */ 18 unsigned int x86_platform_ipis; /* arch dependent */
16 unsigned int apic_perf_irqs; 19 unsigned int apic_perf_irqs;
17 unsigned int apic_irq_work_irqs; 20 unsigned int apic_irq_work_irqs;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 10a78c3d3d5a..1da97efad08a 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -28,6 +28,7 @@
28/* Interrupt handlers registered during init_IRQ */ 28/* Interrupt handlers registered during init_IRQ */
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void x86_platform_ipi(void); 30extern void x86_platform_ipi(void);
31extern void kvm_posted_intr_ipi(void);
31extern void error_interrupt(void); 32extern void error_interrupt(void);
32extern void irq_work_interrupt(void); 33extern void irq_work_interrupt(void);
33 34
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index aac5fa62a86c..5702d7e3111d 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -102,6 +102,11 @@
102 */ 102 */
103#define X86_PLATFORM_IPI_VECTOR 0xf7 103#define X86_PLATFORM_IPI_VECTOR 0xf7
104 104
105/* Vector for KVM to deliver posted interrupt IPI */
106#ifdef CONFIG_HAVE_KVM
107#define POSTED_INTR_VECTOR 0xf2
108#endif
109
105/* 110/*
106 * IRQ work vector: 111 * IRQ work vector:
107 */ 112 */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4979778cc7fb..3741c653767c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -31,7 +31,7 @@
31#include <asm/msr-index.h> 31#include <asm/msr-index.h>
32#include <asm/asm.h> 32#include <asm/asm.h>
33 33
34#define KVM_MAX_VCPUS 254 34#define KVM_MAX_VCPUS 255
35#define KVM_SOFT_MAX_VCPUS 160 35#define KVM_SOFT_MAX_VCPUS 160
36#define KVM_USER_MEM_SLOTS 125 36#define KVM_USER_MEM_SLOTS 125
37/* memory slots that are not exposed to userspace */ 37/* memory slots that are not exposed to userspace */
@@ -43,6 +43,8 @@
43#define KVM_PIO_PAGE_OFFSET 1 43#define KVM_PIO_PAGE_OFFSET 1
44#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 44#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
45 45
46#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
47
46#define CR0_RESERVED_BITS \ 48#define CR0_RESERVED_BITS \
47 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ 49 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
48 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 50 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
@@ -94,9 +96,6 @@
94 96
95#define ASYNC_PF_PER_VCPU 64 97#define ASYNC_PF_PER_VCPU 64
96 98
97extern raw_spinlock_t kvm_lock;
98extern struct list_head vm_list;
99
100struct kvm_vcpu; 99struct kvm_vcpu;
101struct kvm; 100struct kvm;
102struct kvm_async_pf; 101struct kvm_async_pf;
@@ -230,6 +229,7 @@ struct kvm_mmu_page {
230#endif 229#endif
231 230
232 int write_flooding_count; 231 int write_flooding_count;
232 bool mmio_cached;
233}; 233};
234 234
235struct kvm_pio_request { 235struct kvm_pio_request {
@@ -345,7 +345,6 @@ struct kvm_vcpu_arch {
345 unsigned long apic_attention; 345 unsigned long apic_attention;
346 int32_t apic_arb_prio; 346 int32_t apic_arb_prio;
347 int mp_state; 347 int mp_state;
348 int sipi_vector;
349 u64 ia32_misc_enable_msr; 348 u64 ia32_misc_enable_msr;
350 bool tpr_access_reporting; 349 bool tpr_access_reporting;
351 350
@@ -643,7 +642,7 @@ struct kvm_x86_ops {
643 /* Create, but do not attach this VCPU */ 642 /* Create, but do not attach this VCPU */
644 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); 643 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
645 void (*vcpu_free)(struct kvm_vcpu *vcpu); 644 void (*vcpu_free)(struct kvm_vcpu *vcpu);
646 int (*vcpu_reset)(struct kvm_vcpu *vcpu); 645 void (*vcpu_reset)(struct kvm_vcpu *vcpu);
647 646
648 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); 647 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
649 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 648 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
@@ -696,14 +695,16 @@ struct kvm_x86_ops {
696 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 695 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
697 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); 696 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
698 void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); 697 void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
699 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 698 int (*enable_nmi_window)(struct kvm_vcpu *vcpu);
700 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 699 int (*enable_irq_window)(struct kvm_vcpu *vcpu);
701 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 700 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
702 int (*vm_has_apicv)(struct kvm *kvm); 701 int (*vm_has_apicv)(struct kvm *kvm);
703 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); 702 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
704 void (*hwapic_isr_update)(struct kvm *kvm, int isr); 703 void (*hwapic_isr_update)(struct kvm *kvm, int isr);
705 void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); 704 void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
706 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); 705 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
706 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
707 void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
707 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 708 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
708 int (*get_tdp_level)(void); 709 int (*get_tdp_level)(void);
709 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 710 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
@@ -730,6 +731,7 @@ struct kvm_x86_ops {
730 int (*check_intercept)(struct kvm_vcpu *vcpu, 731 int (*check_intercept)(struct kvm_vcpu *vcpu,
731 struct x86_instruction_info *info, 732 struct x86_instruction_info *info,
732 enum x86_intercept_stage stage); 733 enum x86_intercept_stage stage);
734 void (*handle_external_intr)(struct kvm_vcpu *vcpu);
733}; 735};
734 736
735struct kvm_arch_async_pf { 737struct kvm_arch_async_pf {
@@ -767,6 +769,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
767 struct kvm_memory_slot *slot, 769 struct kvm_memory_slot *slot,
768 gfn_t gfn_offset, unsigned long mask); 770 gfn_t gfn_offset, unsigned long mask);
769void kvm_mmu_zap_all(struct kvm *kvm); 771void kvm_mmu_zap_all(struct kvm *kvm);
772void kvm_mmu_zap_mmio_sptes(struct kvm *kvm);
770unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); 773unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
771void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); 774void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
772 775
@@ -797,6 +800,7 @@ enum emulation_result {
797#define EMULTYPE_TRAP_UD (1 << 1) 800#define EMULTYPE_TRAP_UD (1 << 1)
798#define EMULTYPE_SKIP (1 << 2) 801#define EMULTYPE_SKIP (1 << 2)
799#define EMULTYPE_RETRY (1 << 3) 802#define EMULTYPE_RETRY (1 << 3)
803#define EMULTYPE_NO_REEXECUTE (1 << 4)
800int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, 804int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
801 int emulation_type, void *insn, int insn_len); 805 int emulation_type, void *insn, int insn_len);
802 806
@@ -807,6 +811,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
807} 811}
808 812
809void kvm_enable_efer_bits(u64); 813void kvm_enable_efer_bits(u64);
814bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
810int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); 815int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
811int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); 816int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
812 817
@@ -819,6 +824,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
819 824
820void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 825void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
821int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); 826int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
827void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector);
822 828
823int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, 829int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
824 int reason, bool has_error_code, u32 error_code); 830 int reason, bool has_error_code, u32 error_code);
@@ -973,7 +979,6 @@ enum {
973 * Trap the fault and ignore the instruction if that happens. 979 * Trap the fault and ignore the instruction if that happens.
974 */ 980 */
975asmlinkage void kvm_spurious_fault(void); 981asmlinkage void kvm_spurious_fault(void);
976extern bool kvm_rebooting;
977 982
978#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ 983#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \
979 "666: " insn "\n\t" \ 984 "666: " insn "\n\t" \
@@ -1002,6 +1007,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
1002int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 1007int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
1003int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 1008int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
1004int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 1009int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
1010void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
1005 1011
1006void kvm_define_shared_msr(unsigned index, u32 msr); 1012void kvm_define_shared_msr(unsigned index, u32 msr);
1007void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); 1013void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
@@ -1027,7 +1033,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu);
1027void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); 1033void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
1028bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); 1034bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
1029int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); 1035int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
1030int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); 1036int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
1031int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); 1037int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
1032void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); 1038void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
1033void kvm_deliver_pmi(struct kvm_vcpu *vcpu); 1039void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index b6fbf860e398..f3e01a2cbaa1 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -65,11 +65,16 @@
65#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 65#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
66#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 66#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
67#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 67#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
68#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
68 69
69 70
70#define PIN_BASED_EXT_INTR_MASK 0x00000001 71#define PIN_BASED_EXT_INTR_MASK 0x00000001
71#define PIN_BASED_NMI_EXITING 0x00000008 72#define PIN_BASED_NMI_EXITING 0x00000008
72#define PIN_BASED_VIRTUAL_NMIS 0x00000020 73#define PIN_BASED_VIRTUAL_NMIS 0x00000020
74#define PIN_BASED_VMX_PREEMPTION_TIMER 0x00000040
75#define PIN_BASED_POSTED_INTR 0x00000080
76
77#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016
73 78
74#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 79#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
75#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 80#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
@@ -81,6 +86,8 @@
81#define VM_EXIT_LOAD_IA32_EFER 0x00200000 86#define VM_EXIT_LOAD_IA32_EFER 0x00200000
82#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 87#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
83 88
89#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff
90
84#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002 91#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
85#define VM_ENTRY_IA32E_MODE 0x00000200 92#define VM_ENTRY_IA32E_MODE 0x00000200
86#define VM_ENTRY_SMM 0x00000400 93#define VM_ENTRY_SMM 0x00000400
@@ -89,9 +96,15 @@
89#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 96#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
90#define VM_ENTRY_LOAD_IA32_EFER 0x00008000 97#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
91 98
99#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff
100
101#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f
102#define VMX_MISC_SAVE_EFER_LMA 0x00000020
103
92/* VMCS Encodings */ 104/* VMCS Encodings */
93enum vmcs_field { 105enum vmcs_field {
94 VIRTUAL_PROCESSOR_ID = 0x00000000, 106 VIRTUAL_PROCESSOR_ID = 0x00000000,
107 POSTED_INTR_NV = 0x00000002,
95 GUEST_ES_SELECTOR = 0x00000800, 108 GUEST_ES_SELECTOR = 0x00000800,
96 GUEST_CS_SELECTOR = 0x00000802, 109 GUEST_CS_SELECTOR = 0x00000802,
97 GUEST_SS_SELECTOR = 0x00000804, 110 GUEST_SS_SELECTOR = 0x00000804,
@@ -126,6 +139,8 @@ enum vmcs_field {
126 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, 139 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
127 APIC_ACCESS_ADDR = 0x00002014, 140 APIC_ACCESS_ADDR = 0x00002014,
128 APIC_ACCESS_ADDR_HIGH = 0x00002015, 141 APIC_ACCESS_ADDR_HIGH = 0x00002015,
142 POSTED_INTR_DESC_ADDR = 0x00002016,
143 POSTED_INTR_DESC_ADDR_HIGH = 0x00002017,
129 EPT_POINTER = 0x0000201a, 144 EPT_POINTER = 0x0000201a,
130 EPT_POINTER_HIGH = 0x0000201b, 145 EPT_POINTER_HIGH = 0x0000201b,
131 EOI_EXIT_BITMAP0 = 0x0000201c, 146 EOI_EXIT_BITMAP0 = 0x0000201c,
@@ -136,6 +151,8 @@ enum vmcs_field {
136 EOI_EXIT_BITMAP2_HIGH = 0x00002021, 151 EOI_EXIT_BITMAP2_HIGH = 0x00002021,
137 EOI_EXIT_BITMAP3 = 0x00002022, 152 EOI_EXIT_BITMAP3 = 0x00002022,
138 EOI_EXIT_BITMAP3_HIGH = 0x00002023, 153 EOI_EXIT_BITMAP3_HIGH = 0x00002023,
154 VMREAD_BITMAP = 0x00002026,
155 VMWRITE_BITMAP = 0x00002028,
139 GUEST_PHYSICAL_ADDRESS = 0x00002400, 156 GUEST_PHYSICAL_ADDRESS = 0x00002400,
140 GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, 157 GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
141 VMCS_LINK_POINTER = 0x00002800, 158 VMCS_LINK_POINTER = 0x00002800,
@@ -209,6 +226,7 @@ enum vmcs_field {
209 GUEST_INTERRUPTIBILITY_INFO = 0x00004824, 226 GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
210 GUEST_ACTIVITY_STATE = 0X00004826, 227 GUEST_ACTIVITY_STATE = 0X00004826,
211 GUEST_SYSENTER_CS = 0x0000482A, 228 GUEST_SYSENTER_CS = 0x0000482A,
229 VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
212 HOST_IA32_SYSENTER_CS = 0x00004c00, 230 HOST_IA32_SYSENTER_CS = 0x00004c00,
213 CR0_GUEST_HOST_MASK = 0x00006000, 231 CR0_GUEST_HOST_MASK = 0x00006000,
214 CR4_GUEST_HOST_MASK = 0x00006002, 232 CR4_GUEST_HOST_MASK = 0x00006002,
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a65ec29e6ffb..5d9a3033b3d7 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -29,7 +29,6 @@
29#define __KVM_HAVE_PIT 29#define __KVM_HAVE_PIT
30#define __KVM_HAVE_IOAPIC 30#define __KVM_HAVE_IOAPIC
31#define __KVM_HAVE_IRQ_LINE 31#define __KVM_HAVE_IRQ_LINE
32#define __KVM_HAVE_DEVICE_ASSIGNMENT
33#define __KVM_HAVE_MSI 32#define __KVM_HAVE_MSI
34#define __KVM_HAVE_USER_NMI 33#define __KVM_HAVE_USER_NMI
35#define __KVM_HAVE_GUEST_DEBUG 34#define __KVM_HAVE_GUEST_DEBUG
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index b5757885d7a4..b3a4866661c5 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -528,6 +528,8 @@
528#define VMX_BASIC_MEM_TYPE_WB 6LLU 528#define VMX_BASIC_MEM_TYPE_WB 6LLU
529#define VMX_BASIC_INOUT 0x0040000000000000LLU 529#define VMX_BASIC_INOUT 0x0040000000000000LLU
530 530
531/* MSR_IA32_VMX_MISC bits */
532#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
531/* AMD-V MSRs */ 533/* AMD-V MSRs */
532 534
533#define MSR_VM_CR 0xc0010114 535#define MSR_VM_CR 0xc0010114
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 2871fccfee68..d651082c7cf7 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
65#define EXIT_REASON_EOI_INDUCED 45 65#define EXIT_REASON_EOI_INDUCED 45
66#define EXIT_REASON_EPT_VIOLATION 48 66#define EXIT_REASON_EPT_VIOLATION 48
67#define EXIT_REASON_EPT_MISCONFIG 49 67#define EXIT_REASON_EPT_MISCONFIG 49
68#define EXIT_REASON_PREEMPTION_TIMER 52
68#define EXIT_REASON_WBINVD 54 69#define EXIT_REASON_WBINVD 54
69#define EXIT_REASON_XSETBV 55 70#define EXIT_REASON_XSETBV 55
70#define EXIT_REASON_APIC_WRITE 56 71#define EXIT_REASON_APIC_WRITE 56
@@ -110,7 +111,7 @@
110 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ 111 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
111 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ 112 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
112 { EXIT_REASON_INVD, "INVD" }, \ 113 { EXIT_REASON_INVD, "INVD" }, \
113 { EXIT_REASON_INVPCID, "INVPCID" } 114 { EXIT_REASON_INVPCID, "INVPCID" }, \
114 115 { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }
115 116
116#endif /* _UAPIVMX_H */ 117#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c1d01e6ca790..727208941030 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1166,6 +1166,11 @@ apicinterrupt LOCAL_TIMER_VECTOR \
1166apicinterrupt X86_PLATFORM_IPI_VECTOR \ 1166apicinterrupt X86_PLATFORM_IPI_VECTOR \
1167 x86_platform_ipi smp_x86_platform_ipi 1167 x86_platform_ipi smp_x86_platform_ipi
1168 1168
1169#ifdef CONFIG_HAVE_KVM
1170apicinterrupt POSTED_INTR_VECTOR \
1171 kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
1172#endif
1173
1169apicinterrupt THRESHOLD_APIC_VECTOR \ 1174apicinterrupt THRESHOLD_APIC_VECTOR \
1170 threshold_interrupt smp_threshold_interrupt 1175 threshold_interrupt smp_threshold_interrupt
1171apicinterrupt THERMAL_APIC_VECTOR \ 1176apicinterrupt THERMAL_APIC_VECTOR \
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 84b778962c66..ac0631d8996f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -224,6 +224,28 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
224 set_irq_regs(old_regs); 224 set_irq_regs(old_regs);
225} 225}
226 226
227#ifdef CONFIG_HAVE_KVM
228/*
229 * Handler for POSTED_INTERRUPT_VECTOR.
230 */
231void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
232{
233 struct pt_regs *old_regs = set_irq_regs(regs);
234
235 ack_APIC_irq();
236
237 irq_enter();
238
239 exit_idle();
240
241 inc_irq_stat(kvm_posted_intr_ipis);
242
243 irq_exit();
244
245 set_irq_regs(old_regs);
246}
247#endif
248
227EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 249EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
228 250
229#ifdef CONFIG_HOTPLUG_CPU 251#ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 7dc4e459c2b3..a2a1fbc594ff 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -172,6 +172,10 @@ static void __init apic_intr_init(void)
172 172
173 /* IPI for X86 platform specific use */ 173 /* IPI for X86 platform specific use */
174 alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); 174 alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
175#ifdef CONFIG_HAVE_KVM
176 /* IPI for KVM to deliver posted interrupt */
177 alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
178#endif
175 179
176 /* IPI vectors for APIC spurious and error interrupts */ 180 /* IPI vectors for APIC spurious and error interrupts */
177 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 181 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 0732f0089a3d..d2c381280e3c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -160,8 +160,12 @@ int kvm_register_clock(char *txt)
160{ 160{
161 int cpu = smp_processor_id(); 161 int cpu = smp_processor_id();
162 int low, high, ret; 162 int low, high, ret;
163 struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; 163 struct pvclock_vcpu_time_info *src;
164
165 if (!hv_clock)
166 return 0;
164 167
168 src = &hv_clock[cpu].pvti;
165 low = (int)slow_virt_to_phys(src) | 1; 169 low = (int)slow_virt_to_phys(src) | 1;
166 high = ((u64)slow_virt_to_phys(src) >> 32); 170 high = ((u64)slow_virt_to_phys(src) >> 32);
167 ret = native_write_msr_safe(msr_kvm_system_time, low, high); 171 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
@@ -276,6 +280,9 @@ int __init kvm_setup_vsyscall_timeinfo(void)
276 struct pvclock_vcpu_time_info *vcpu_time; 280 struct pvclock_vcpu_time_info *vcpu_time;
277 unsigned int size; 281 unsigned int size;
278 282
283 if (!hv_clock)
284 return 0;
285
279 size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); 286 size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
280 287
281 preempt_disable(); 288 preempt_disable();
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 586f00059805..a47a3e54b964 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,14 +21,13 @@ config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support" 21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM 22 depends on HAVE_KVM
23 depends on HIGH_RES_TIMERS 23 depends on HIGH_RES_TIMERS
24 # for device assignment:
25 depends on PCI
26 # for TASKSTATS/TASK_DELAY_ACCT: 24 # for TASKSTATS/TASK_DELAY_ACCT:
27 depends on NET 25 depends on NET
28 select PREEMPT_NOTIFIERS 26 select PREEMPT_NOTIFIERS
29 select MMU_NOTIFIER 27 select MMU_NOTIFIER
30 select ANON_INODES 28 select ANON_INODES
31 select HAVE_KVM_IRQCHIP 29 select HAVE_KVM_IRQCHIP
30 select HAVE_KVM_IRQ_ROUTING
32 select HAVE_KVM_EVENTFD 31 select HAVE_KVM_EVENTFD
33 select KVM_APIC_ARCHITECTURE 32 select KVM_APIC_ARCHITECTURE
34 select KVM_ASYNC_PF 33 select KVM_ASYNC_PF
@@ -82,6 +81,17 @@ config KVM_MMU_AUDIT
82 This option adds a R/W kVM module parameter 'mmu_audit', which allows 81 This option adds a R/W kVM module parameter 'mmu_audit', which allows
83 audit KVM MMU at runtime. 82 audit KVM MMU at runtime.
84 83
84config KVM_DEVICE_ASSIGNMENT
85 bool "KVM legacy PCI device assignment support"
86 depends on KVM && PCI && IOMMU_API
87 default y
88 ---help---
89 Provide support for legacy PCI device assignment through KVM. The
90 kernel now also supports a full featured userspace device driver
91 framework through VFIO, which supersedes much of this support.
92
93 If unsure, say Y.
94
85# OK, it's a little counter-intuitive to do this, but it puts it neatly under 95# OK, it's a little counter-intuitive to do this, but it puts it neatly under
86# the virtualization menu. 96# the virtualization menu.
87source drivers/vhost/Kconfig 97source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 04d30401c5cb..d609e1d84048 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,8 +7,9 @@ CFLAGS_vmx.o := -I.
7 7
8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o \ 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o) 10 irqchip.o)
11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(addprefix ../../../virt/kvm/, \
12 assigned-dev.o iommu.o)
12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) 13kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
13 14
14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 15kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a335cc6cde72..8e517bba6a7c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -132,8 +132,9 @@
132#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ 132#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
133#define No64 (1<<28) 133#define No64 (1<<28)
134#define PageTable (1 << 29) /* instruction used to write page table */ 134#define PageTable (1 << 29) /* instruction used to write page table */
135#define NotImpl (1 << 30) /* instruction is not implemented */
135/* Source 2 operand type */ 136/* Source 2 operand type */
136#define Src2Shift (30) 137#define Src2Shift (31)
137#define Src2None (OpNone << Src2Shift) 138#define Src2None (OpNone << Src2Shift)
138#define Src2CL (OpCL << Src2Shift) 139#define Src2CL (OpCL << Src2Shift)
139#define Src2ImmByte (OpImmByte << Src2Shift) 140#define Src2ImmByte (OpImmByte << Src2Shift)
@@ -1578,12 +1579,21 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1578 1579
1579 memset(&seg_desc, 0, sizeof seg_desc); 1580 memset(&seg_desc, 0, sizeof seg_desc);
1580 1581
1581 if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) 1582 if (ctxt->mode == X86EMUL_MODE_REAL) {
1582 || ctxt->mode == X86EMUL_MODE_REAL) { 1583 /* set real mode segment descriptor (keep limit etc. for
1583 /* set real mode segment descriptor */ 1584 * unreal mode) */
1584 ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg); 1585 ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg);
1585 set_desc_base(&seg_desc, selector << 4); 1586 set_desc_base(&seg_desc, selector << 4);
1586 goto load; 1587 goto load;
1588 } else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) {
1589 /* VM86 needs a clean new segment descriptor */
1590 set_desc_base(&seg_desc, selector << 4);
1591 set_desc_limit(&seg_desc, 0xffff);
1592 seg_desc.type = 3;
1593 seg_desc.p = 1;
1594 seg_desc.s = 1;
1595 seg_desc.dpl = 3;
1596 goto load;
1587 } 1597 }
1588 1598
1589 rpl = selector & 3; 1599 rpl = selector & 3;
@@ -3615,7 +3625,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3615#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } 3625#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
3616#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ 3626#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
3617 .check_perm = (_p) } 3627 .check_perm = (_p) }
3618#define N D(0) 3628#define N D(NotImpl)
3619#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } 3629#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
3620#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } 3630#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
3621#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } 3631#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
@@ -3713,7 +3723,7 @@ static const struct opcode group5[] = {
3713 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), 3723 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
3714 I(SrcMem | Stack, em_grp45), 3724 I(SrcMem | Stack, em_grp45),
3715 I(SrcMemFAddr | ImplicitOps, em_grp45), 3725 I(SrcMemFAddr | ImplicitOps, em_grp45),
3716 I(SrcMem | Stack, em_grp45), N, 3726 I(SrcMem | Stack, em_grp45), D(Undefined),
3717}; 3727};
3718 3728
3719static const struct opcode group6[] = { 3729static const struct opcode group6[] = {
@@ -4162,6 +4172,10 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
4162 break; 4172 break;
4163 case OpMem8: 4173 case OpMem8:
4164 ctxt->memop.bytes = 1; 4174 ctxt->memop.bytes = 1;
4175 if (ctxt->memop.type == OP_REG) {
4176 ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1);
4177 fetch_register_operand(&ctxt->memop);
4178 }
4165 goto mem_common; 4179 goto mem_common;
4166 case OpMem16: 4180 case OpMem16:
4167 ctxt->memop.bytes = 2; 4181 ctxt->memop.bytes = 2;
@@ -4373,7 +4387,7 @@ done_prefixes:
4373 ctxt->intercept = opcode.intercept; 4387 ctxt->intercept = opcode.intercept;
4374 4388
4375 /* Unrecognised? */ 4389 /* Unrecognised? */
4376 if (ctxt->d == 0 || (ctxt->d & Undefined)) 4390 if (ctxt->d == 0 || (ctxt->d & NotImpl))
4377 return EMULATION_FAILED; 4391 return EMULATION_FAILED;
4378 4392
4379 if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) 4393 if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
@@ -4511,7 +4525,8 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4511 4525
4512 ctxt->mem_read.pos = 0; 4526 ctxt->mem_read.pos = 0;
4513 4527
4514 if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) { 4528 if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
4529 (ctxt->d & Undefined)) {
4515 rc = emulate_ud(ctxt); 4530 rc = emulate_ud(ctxt);
4516 goto done; 4531 goto done;
4517 } 4532 }
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c1d30b2fc9bb..412a5aa0ef94 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -290,8 +290,8 @@ static void pit_do_work(struct kthread_work *work)
290 } 290 }
291 spin_unlock(&ps->inject_lock); 291 spin_unlock(&ps->inject_lock);
292 if (inject) { 292 if (inject) {
293 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 293 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false);
294 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 294 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false);
295 295
296 /* 296 /*
297 * Provides NMI watchdog support via Virtual Wire mode. 297 * Provides NMI watchdog support via Virtual Wire mode.
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f77df1c5de6e..e1adbb4aca75 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -94,6 +94,14 @@ static inline int apic_test_vector(int vec, void *bitmap)
94 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 94 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
95} 95}
96 96
97bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
98{
99 struct kvm_lapic *apic = vcpu->arch.apic;
100
101 return apic_test_vector(vector, apic->regs + APIC_ISR) ||
102 apic_test_vector(vector, apic->regs + APIC_IRR);
103}
104
97static inline void apic_set_vector(int vec, void *bitmap) 105static inline void apic_set_vector(int vec, void *bitmap)
98{ 106{
99 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 107 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -145,53 +153,6 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
145 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; 153 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
146} 154}
147 155
148void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
149 struct kvm_lapic_irq *irq,
150 u64 *eoi_exit_bitmap)
151{
152 struct kvm_lapic **dst;
153 struct kvm_apic_map *map;
154 unsigned long bitmap = 1;
155 int i;
156
157 rcu_read_lock();
158 map = rcu_dereference(vcpu->kvm->arch.apic_map);
159
160 if (unlikely(!map)) {
161 __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap);
162 goto out;
163 }
164
165 if (irq->dest_mode == 0) { /* physical mode */
166 if (irq->delivery_mode == APIC_DM_LOWEST ||
167 irq->dest_id == 0xff) {
168 __set_bit(irq->vector,
169 (unsigned long *)eoi_exit_bitmap);
170 goto out;
171 }
172 dst = &map->phys_map[irq->dest_id & 0xff];
173 } else {
174 u32 mda = irq->dest_id << (32 - map->ldr_bits);
175
176 dst = map->logical_map[apic_cluster_id(map, mda)];
177
178 bitmap = apic_logical_id(map, mda);
179 }
180
181 for_each_set_bit(i, &bitmap, 16) {
182 if (!dst[i])
183 continue;
184 if (dst[i]->vcpu == vcpu) {
185 __set_bit(irq->vector,
186 (unsigned long *)eoi_exit_bitmap);
187 break;
188 }
189 }
190
191out:
192 rcu_read_unlock();
193}
194
195static void recalculate_apic_map(struct kvm *kvm) 156static void recalculate_apic_map(struct kvm *kvm)
196{ 157{
197 struct kvm_apic_map *new, *old = NULL; 158 struct kvm_apic_map *new, *old = NULL;
@@ -256,7 +217,7 @@ out:
256 if (old) 217 if (old)
257 kfree_rcu(old, rcu); 218 kfree_rcu(old, rcu);
258 219
259 kvm_ioapic_make_eoibitmap_request(kvm); 220 kvm_vcpu_request_scan_ioapic(kvm);
260} 221}
261 222
262static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) 223static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
@@ -357,6 +318,19 @@ static u8 count_vectors(void *bitmap)
357 return count; 318 return count;
358} 319}
359 320
321void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
322{
323 u32 i, pir_val;
324 struct kvm_lapic *apic = vcpu->arch.apic;
325
326 for (i = 0; i <= 7; i++) {
327 pir_val = xchg(&pir[i], 0);
328 if (pir_val)
329 *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
330 }
331}
332EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
333
360static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 334static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
361{ 335{
362 apic->irr_pending = true; 336 apic->irr_pending = true;
@@ -379,6 +353,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
379 if (!apic->irr_pending) 353 if (!apic->irr_pending)
380 return -1; 354 return -1;
381 355
356 kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
382 result = apic_search_irr(apic); 357 result = apic_search_irr(apic);
383 ASSERT(result == -1 || result >= 16); 358 ASSERT(result == -1 || result >= 16);
384 359
@@ -431,14 +406,16 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
431} 406}
432 407
433static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 408static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
434 int vector, int level, int trig_mode); 409 int vector, int level, int trig_mode,
410 unsigned long *dest_map);
435 411
436int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) 412int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
413 unsigned long *dest_map)
437{ 414{
438 struct kvm_lapic *apic = vcpu->arch.apic; 415 struct kvm_lapic *apic = vcpu->arch.apic;
439 416
440 return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, 417 return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
441 irq->level, irq->trig_mode); 418 irq->level, irq->trig_mode, dest_map);
442} 419}
443 420
444static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) 421static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
@@ -505,6 +482,15 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
505 return result; 482 return result;
506} 483}
507 484
485void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
486{
487 struct kvm_lapic *apic = vcpu->arch.apic;
488 int i;
489
490 for (i = 0; i < 8; i++)
491 apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]);
492}
493
508static void apic_update_ppr(struct kvm_lapic *apic) 494static void apic_update_ppr(struct kvm_lapic *apic)
509{ 495{
510 u32 tpr, isrv, ppr, old_ppr; 496 u32 tpr, isrv, ppr, old_ppr;
@@ -611,7 +597,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
611} 597}
612 598
613bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, 599bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
614 struct kvm_lapic_irq *irq, int *r) 600 struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map)
615{ 601{
616 struct kvm_apic_map *map; 602 struct kvm_apic_map *map;
617 unsigned long bitmap = 1; 603 unsigned long bitmap = 1;
@@ -622,7 +608,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
622 *r = -1; 608 *r = -1;
623 609
624 if (irq->shorthand == APIC_DEST_SELF) { 610 if (irq->shorthand == APIC_DEST_SELF) {
625 *r = kvm_apic_set_irq(src->vcpu, irq); 611 *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
626 return true; 612 return true;
627 } 613 }
628 614
@@ -667,7 +653,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
667 continue; 653 continue;
668 if (*r < 0) 654 if (*r < 0)
669 *r = 0; 655 *r = 0;
670 *r += kvm_apic_set_irq(dst[i]->vcpu, irq); 656 *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
671 } 657 }
672 658
673 ret = true; 659 ret = true;
@@ -681,7 +667,8 @@ out:
681 * Return 1 if successfully added and 0 if discarded. 667 * Return 1 if successfully added and 0 if discarded.
682 */ 668 */
683static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 669static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
684 int vector, int level, int trig_mode) 670 int vector, int level, int trig_mode,
671 unsigned long *dest_map)
685{ 672{
686 int result = 0; 673 int result = 0;
687 struct kvm_vcpu *vcpu = apic->vcpu; 674 struct kvm_vcpu *vcpu = apic->vcpu;
@@ -694,24 +681,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
694 if (unlikely(!apic_enabled(apic))) 681 if (unlikely(!apic_enabled(apic)))
695 break; 682 break;
696 683
697 if (trig_mode) { 684 if (dest_map)
698 apic_debug("level trig mode for vector %d", vector); 685 __set_bit(vcpu->vcpu_id, dest_map);
699 apic_set_vector(vector, apic->regs + APIC_TMR);
700 } else
701 apic_clear_vector(vector, apic->regs + APIC_TMR);
702 686
703 result = !apic_test_and_set_irr(vector, apic); 687 if (kvm_x86_ops->deliver_posted_interrupt) {
704 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, 688 result = 1;
705 trig_mode, vector, !result); 689 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
706 if (!result) { 690 } else {
707 if (trig_mode) 691 result = !apic_test_and_set_irr(vector, apic);
708 apic_debug("level trig mode repeatedly for "
709 "vector %d", vector);
710 break;
711 }
712 692
713 kvm_make_request(KVM_REQ_EVENT, vcpu); 693 if (!result) {
714 kvm_vcpu_kick(vcpu); 694 if (trig_mode)
695 apic_debug("level trig mode repeatedly "
696 "for vector %d", vector);
697 goto out;
698 }
699
700 kvm_make_request(KVM_REQ_EVENT, vcpu);
701 kvm_vcpu_kick(vcpu);
702 }
703out:
704 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
705 trig_mode, vector, !result);
715 break; 706 break;
716 707
717 case APIC_DM_REMRD: 708 case APIC_DM_REMRD:
@@ -731,7 +722,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
731 case APIC_DM_INIT: 722 case APIC_DM_INIT:
732 if (!trig_mode || level) { 723 if (!trig_mode || level) {
733 result = 1; 724 result = 1;
734 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 725 /* assumes that there are only KVM_APIC_INIT/SIPI */
726 apic->pending_events = (1UL << KVM_APIC_INIT);
727 /* make sure pending_events is visible before sending
728 * the request */
729 smp_wmb();
735 kvm_make_request(KVM_REQ_EVENT, vcpu); 730 kvm_make_request(KVM_REQ_EVENT, vcpu);
736 kvm_vcpu_kick(vcpu); 731 kvm_vcpu_kick(vcpu);
737 } else { 732 } else {
@@ -743,13 +738,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
743 case APIC_DM_STARTUP: 738 case APIC_DM_STARTUP:
744 apic_debug("SIPI to vcpu %d vector 0x%02x\n", 739 apic_debug("SIPI to vcpu %d vector 0x%02x\n",
745 vcpu->vcpu_id, vector); 740 vcpu->vcpu_id, vector);
746 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 741 result = 1;
747 result = 1; 742 apic->sipi_vector = vector;
748 vcpu->arch.sipi_vector = vector; 743 /* make sure sipi_vector is visible for the receiver */
749 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 744 smp_wmb();
750 kvm_make_request(KVM_REQ_EVENT, vcpu); 745 set_bit(KVM_APIC_SIPI, &apic->pending_events);
751 kvm_vcpu_kick(vcpu); 746 kvm_make_request(KVM_REQ_EVENT, vcpu);
752 } 747 kvm_vcpu_kick(vcpu);
753 break; 748 break;
754 749
755 case APIC_DM_EXTINT: 750 case APIC_DM_EXTINT:
@@ -782,7 +777,7 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
782 trigger_mode = IOAPIC_LEVEL_TRIG; 777 trigger_mode = IOAPIC_LEVEL_TRIG;
783 else 778 else
784 trigger_mode = IOAPIC_EDGE_TRIG; 779 trigger_mode = IOAPIC_EDGE_TRIG;
785 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 780 kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
786 } 781 }
787} 782}
788 783
@@ -848,7 +843,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
848 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, 843 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
849 irq.vector); 844 irq.vector);
850 845
851 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); 846 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
852} 847}
853 848
854static u32 apic_get_tmcct(struct kvm_lapic *apic) 849static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -1484,7 +1479,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
1484 vector = reg & APIC_VECTOR_MASK; 1479 vector = reg & APIC_VECTOR_MASK;
1485 mode = reg & APIC_MODE_MASK; 1480 mode = reg & APIC_MODE_MASK;
1486 trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; 1481 trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
1487 return __apic_accept_irq(apic, mode, vector, 1, trig_mode); 1482 return __apic_accept_irq(apic, mode, vector, 1, trig_mode,
1483 NULL);
1488 } 1484 }
1489 return 0; 1485 return 0;
1490} 1486}
@@ -1654,6 +1650,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
1654 apic->highest_isr_cache = -1; 1650 apic->highest_isr_cache = -1;
1655 kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); 1651 kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
1656 kvm_make_request(KVM_REQ_EVENT, vcpu); 1652 kvm_make_request(KVM_REQ_EVENT, vcpu);
1653 kvm_rtc_eoi_tracking_restore_one(vcpu);
1657} 1654}
1658 1655
1659void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1656void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1860,6 +1857,34 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
1860 addr, sizeof(u8)); 1857 addr, sizeof(u8));
1861} 1858}
1862 1859
1860void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
1861{
1862 struct kvm_lapic *apic = vcpu->arch.apic;
1863 unsigned int sipi_vector;
1864
1865 if (!kvm_vcpu_has_lapic(vcpu))
1866 return;
1867
1868 if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
1869 kvm_lapic_reset(vcpu);
1870 kvm_vcpu_reset(vcpu);
1871 if (kvm_vcpu_is_bsp(apic->vcpu))
1872 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
1873 else
1874 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
1875 }
1876 if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) &&
1877 vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
1878 /* evaluate pending_events before reading the vector */
1879 smp_rmb();
1880 sipi_vector = apic->sipi_vector;
1881 pr_debug("vcpu %d received sipi with vector # %x\n",
1882 vcpu->vcpu_id, sipi_vector);
1883 kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
1884 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
1885 }
1886}
1887
1863void kvm_lapic_init(void) 1888void kvm_lapic_init(void)
1864{ 1889{
1865 /* do not patch jump label more than once per second */ 1890 /* do not patch jump label more than once per second */
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1676d34ddb4e..c730ac9fe801 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -5,6 +5,9 @@
5 5
6#include <linux/kvm_host.h> 6#include <linux/kvm_host.h>
7 7
8#define KVM_APIC_INIT 0
9#define KVM_APIC_SIPI 1
10
8struct kvm_timer { 11struct kvm_timer {
9 struct hrtimer timer; 12 struct hrtimer timer;
10 s64 period; /* unit: ns */ 13 s64 period; /* unit: ns */
@@ -32,6 +35,8 @@ struct kvm_lapic {
32 void *regs; 35 void *regs;
33 gpa_t vapic_addr; 36 gpa_t vapic_addr;
34 struct page *vapic_page; 37 struct page *vapic_page;
38 unsigned long pending_events;
39 unsigned int sipi_vector;
35}; 40};
36int kvm_create_lapic(struct kvm_vcpu *vcpu); 41int kvm_create_lapic(struct kvm_vcpu *vcpu);
37void kvm_free_lapic(struct kvm_vcpu *vcpu); 42void kvm_free_lapic(struct kvm_vcpu *vcpu);
@@ -39,6 +44,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
39int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); 44int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
40int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); 45int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
41int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); 46int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
47void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
42void kvm_lapic_reset(struct kvm_vcpu *vcpu); 48void kvm_lapic_reset(struct kvm_vcpu *vcpu);
43u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); 49u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
44void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 50void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
@@ -47,13 +53,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
47u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); 53u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
48void kvm_apic_set_version(struct kvm_vcpu *vcpu); 54void kvm_apic_set_version(struct kvm_vcpu *vcpu);
49 55
56void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
57void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
50int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 58int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
51int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 59int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
52int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); 60int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
61 unsigned long *dest_map);
53int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); 62int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
54 63
55bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, 64bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
56 struct kvm_lapic_irq *irq, int *r); 65 struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map);
57 66
58u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); 67u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
59void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); 68void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
@@ -154,8 +163,11 @@ static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
154 return ldr & map->lid_mask; 163 return ldr & map->lid_mask;
155} 164}
156 165
157void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, 166static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
158 struct kvm_lapic_irq *irq, 167{
159 u64 *eoi_bitmap); 168 return vcpu->arch.apic->pending_events;
169}
170
171bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
160 172
161#endif 173#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 956ca358108a..004cc87b781c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -199,8 +199,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
199 199
200static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) 200static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
201{ 201{
202 struct kvm_mmu_page *sp = page_header(__pa(sptep));
203
202 access &= ACC_WRITE_MASK | ACC_USER_MASK; 204 access &= ACC_WRITE_MASK | ACC_USER_MASK;
203 205
206 sp->mmio_cached = true;
204 trace_mark_mmio_spte(sptep, gfn, access); 207 trace_mark_mmio_spte(sptep, gfn, access);
205 mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); 208 mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
206} 209}
@@ -1502,6 +1505,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1502 u64 *parent_pte, int direct) 1505 u64 *parent_pte, int direct)
1503{ 1506{
1504 struct kvm_mmu_page *sp; 1507 struct kvm_mmu_page *sp;
1508
1505 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 1509 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
1506 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); 1510 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1507 if (!direct) 1511 if (!direct)
@@ -1644,16 +1648,14 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1644static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1648static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1645 struct list_head *invalid_list); 1649 struct list_head *invalid_list);
1646 1650
1647#define for_each_gfn_sp(kvm, sp, gfn) \ 1651#define for_each_gfn_sp(_kvm, _sp, _gfn) \
1648 hlist_for_each_entry(sp, \ 1652 hlist_for_each_entry(_sp, \
1649 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ 1653 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
1650 if ((sp)->gfn != (gfn)) {} else 1654 if ((_sp)->gfn != (_gfn)) {} else
1651 1655
1652#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn) \ 1656#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
1653 hlist_for_each_entry(sp, \ 1657 for_each_gfn_sp(_kvm, _sp, _gfn) \
1654 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ 1658 if ((_sp)->role.direct || (_sp)->role.invalid) {} else
1655 if ((sp)->gfn != (gfn) || (sp)->role.direct || \
1656 (sp)->role.invalid) {} else
1657 1659
1658/* @sp->gfn should be write-protected at the call site */ 1660/* @sp->gfn should be write-protected at the call site */
1659static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1661static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -2089,7 +2091,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2089static void kvm_mmu_commit_zap_page(struct kvm *kvm, 2091static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2090 struct list_head *invalid_list) 2092 struct list_head *invalid_list)
2091{ 2093{
2092 struct kvm_mmu_page *sp; 2094 struct kvm_mmu_page *sp, *nsp;
2093 2095
2094 if (list_empty(invalid_list)) 2096 if (list_empty(invalid_list))
2095 return; 2097 return;
@@ -2106,11 +2108,25 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2106 */ 2108 */
2107 kvm_flush_remote_tlbs(kvm); 2109 kvm_flush_remote_tlbs(kvm);
2108 2110
2109 do { 2111 list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2110 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
2111 WARN_ON(!sp->role.invalid || sp->root_count); 2112 WARN_ON(!sp->role.invalid || sp->root_count);
2112 kvm_mmu_free_page(sp); 2113 kvm_mmu_free_page(sp);
2113 } while (!list_empty(invalid_list)); 2114 }
2115}
2116
2117static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2118 struct list_head *invalid_list)
2119{
2120 struct kvm_mmu_page *sp;
2121
2122 if (list_empty(&kvm->arch.active_mmu_pages))
2123 return false;
2124
2125 sp = list_entry(kvm->arch.active_mmu_pages.prev,
2126 struct kvm_mmu_page, link);
2127 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2128
2129 return true;
2114} 2130}
2115 2131
2116/* 2132/*
@@ -2120,23 +2136,15 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2120void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) 2136void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
2121{ 2137{
2122 LIST_HEAD(invalid_list); 2138 LIST_HEAD(invalid_list);
2123 /*
2124 * If we set the number of mmu pages to be smaller be than the
2125 * number of actived pages , we must to free some mmu pages before we
2126 * change the value
2127 */
2128 2139
2129 spin_lock(&kvm->mmu_lock); 2140 spin_lock(&kvm->mmu_lock);
2130 2141
2131 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { 2142 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2132 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && 2143 /* Need to free some mmu pages to achieve the goal. */
2133 !list_empty(&kvm->arch.active_mmu_pages)) { 2144 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
2134 struct kvm_mmu_page *page; 2145 if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
2146 break;
2135 2147
2136 page = container_of(kvm->arch.active_mmu_pages.prev,
2137 struct kvm_mmu_page, link);
2138 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
2139 }
2140 kvm_mmu_commit_zap_page(kvm, &invalid_list); 2148 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2141 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; 2149 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2142 } 2150 }
@@ -2794,6 +2802,7 @@ exit:
2794 2802
2795static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2803static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2796 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2804 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2805static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
2797 2806
2798static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, 2807static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2799 gfn_t gfn, bool prefault) 2808 gfn_t gfn, bool prefault)
@@ -2835,7 +2844,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2835 spin_lock(&vcpu->kvm->mmu_lock); 2844 spin_lock(&vcpu->kvm->mmu_lock);
2836 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 2845 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
2837 goto out_unlock; 2846 goto out_unlock;
2838 kvm_mmu_free_some_pages(vcpu); 2847 make_mmu_pages_available(vcpu);
2839 if (likely(!force_pt_level)) 2848 if (likely(!force_pt_level))
2840 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 2849 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2841 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, 2850 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
@@ -2913,7 +2922,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2913 2922
2914 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2923 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2915 spin_lock(&vcpu->kvm->mmu_lock); 2924 spin_lock(&vcpu->kvm->mmu_lock);
2916 kvm_mmu_free_some_pages(vcpu); 2925 make_mmu_pages_available(vcpu);
2917 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 2926 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
2918 1, ACC_ALL, NULL); 2927 1, ACC_ALL, NULL);
2919 ++sp->root_count; 2928 ++sp->root_count;
@@ -2925,7 +2934,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2925 2934
2926 ASSERT(!VALID_PAGE(root)); 2935 ASSERT(!VALID_PAGE(root));
2927 spin_lock(&vcpu->kvm->mmu_lock); 2936 spin_lock(&vcpu->kvm->mmu_lock);
2928 kvm_mmu_free_some_pages(vcpu); 2937 make_mmu_pages_available(vcpu);
2929 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), 2938 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
2930 i << 30, 2939 i << 30,
2931 PT32_ROOT_LEVEL, 1, ACC_ALL, 2940 PT32_ROOT_LEVEL, 1, ACC_ALL,
@@ -2964,7 +2973,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2964 ASSERT(!VALID_PAGE(root)); 2973 ASSERT(!VALID_PAGE(root));
2965 2974
2966 spin_lock(&vcpu->kvm->mmu_lock); 2975 spin_lock(&vcpu->kvm->mmu_lock);
2967 kvm_mmu_free_some_pages(vcpu); 2976 make_mmu_pages_available(vcpu);
2968 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, 2977 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
2969 0, ACC_ALL, NULL); 2978 0, ACC_ALL, NULL);
2970 root = __pa(sp->spt); 2979 root = __pa(sp->spt);
@@ -2998,7 +3007,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2998 return 1; 3007 return 1;
2999 } 3008 }
3000 spin_lock(&vcpu->kvm->mmu_lock); 3009 spin_lock(&vcpu->kvm->mmu_lock);
3001 kvm_mmu_free_some_pages(vcpu); 3010 make_mmu_pages_available(vcpu);
3002 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 3011 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
3003 PT32_ROOT_LEVEL, 0, 3012 PT32_ROOT_LEVEL, 0,
3004 ACC_ALL, NULL); 3013 ACC_ALL, NULL);
@@ -3304,7 +3313,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3304 spin_lock(&vcpu->kvm->mmu_lock); 3313 spin_lock(&vcpu->kvm->mmu_lock);
3305 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 3314 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3306 goto out_unlock; 3315 goto out_unlock;
3307 kvm_mmu_free_some_pages(vcpu); 3316 make_mmu_pages_available(vcpu);
3308 if (likely(!force_pt_level)) 3317 if (likely(!force_pt_level))
3309 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 3318 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
3310 r = __direct_map(vcpu, gpa, write, map_writable, 3319 r = __direct_map(vcpu, gpa, write, map_writable,
@@ -4006,17 +4015,17 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
4006} 4015}
4007EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); 4016EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
4008 4017
4009void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 4018static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
4010{ 4019{
4011 LIST_HEAD(invalid_list); 4020 LIST_HEAD(invalid_list);
4012 4021
4013 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && 4022 if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
4014 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 4023 return;
4015 struct kvm_mmu_page *sp; 4024
4025 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
4026 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
4027 break;
4016 4028
4017 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
4018 struct kvm_mmu_page, link);
4019 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
4020 ++vcpu->kvm->stat.mmu_recycled; 4029 ++vcpu->kvm->stat.mmu_recycled;
4021 } 4030 }
4022 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 4031 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
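The rewritten make_mmu_pages_available above is a classic low/high watermark: nothing happens until the free count drops below KVM_MIN_FREE_MMU_PAGES, and reclaim then continues up to KVM_REFILL_PAGES so the check is not re-triggered on every page fault. A hedged, generic sketch of that pattern (types, helpers and constants here are placeholders, not KVM's):

	/* placeholder type and helpers, for illustration only */
	struct pool;
	static unsigned int pool_free(struct pool *p);
	static bool reclaim_oldest(struct pool *p);

	#define MIN_FREE	5	/* start reclaiming below this */
	#define REFILL_TO	25	/* stop once this many are free again */

	static void top_up(struct pool *p)
	{
		if (pool_free(p) >= MIN_FREE)
			return;			/* common fast path */

		while (pool_free(p) < REFILL_TO)
			if (!reclaim_oldest(p))
				break;		/* nothing left to reclaim */
	}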
@@ -4185,17 +4194,22 @@ restart:
4185 spin_unlock(&kvm->mmu_lock); 4194 spin_unlock(&kvm->mmu_lock);
4186} 4195}
4187 4196
4188static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, 4197void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
4189 struct list_head *invalid_list)
4190{ 4198{
4191 struct kvm_mmu_page *page; 4199 struct kvm_mmu_page *sp, *node;
4200 LIST_HEAD(invalid_list);
4192 4201
4193 if (list_empty(&kvm->arch.active_mmu_pages)) 4202 spin_lock(&kvm->mmu_lock);
4194 return; 4203restart:
4204 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
4205 if (!sp->mmio_cached)
4206 continue;
4207 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
4208 goto restart;
4209 }
4195 4210
4196 page = container_of(kvm->arch.active_mmu_pages.prev, 4211 kvm_mmu_commit_zap_page(kvm, &invalid_list);
4197 struct kvm_mmu_page, link); 4212 spin_unlock(&kvm->mmu_lock);
4198 kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
4199} 4213}
4200 4214
4201static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) 4215static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
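kvm_mmu_zap_mmio_sptes above restarts its walk whenever kvm_mmu_prepare_zap_page reports progress, because zapping one shadow page can unlink other pages from active_mmu_pages as well, which even the _safe iterator cannot survive. A minimal sketch of that restart pattern under the same assumption (predicate and helper names are placeholders):

	/* placeholder type and helpers, for illustration only */
	struct page_like { struct list_head link; };
	static bool wants_zap(struct page_like *p);
	static bool zap_one(struct page_like *p);	/* may unlink other entries */

	static void zap_matching(struct list_head *head)
	{
		struct page_like *sp, *node;
	restart:
		list_for_each_entry_safe(sp, node, head, link) {
			if (!wants_zap(sp))
				continue;
			/* the iterator's cached next pointer may now be stale,
			 * so start the walk over from the head */
			if (zap_one(sp))
				goto restart;
		}
	}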
@@ -4232,7 +4246,7 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
4232 idx = srcu_read_lock(&kvm->srcu); 4246 idx = srcu_read_lock(&kvm->srcu);
4233 spin_lock(&kvm->mmu_lock); 4247 spin_lock(&kvm->mmu_lock);
4234 4248
4235 kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); 4249 prepare_zap_oldest_mmu_page(kvm, &invalid_list);
4236 kvm_mmu_commit_zap_page(kvm, &invalid_list); 4250 kvm_mmu_commit_zap_page(kvm, &invalid_list);
4237 4251
4238 spin_unlock(&kvm->mmu_lock); 4252 spin_unlock(&kvm->mmu_lock);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 69871080e866..2adcbc2cac6d 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -57,14 +57,11 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
57 57
58static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 58static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
59{ 59{
60 return kvm->arch.n_max_mmu_pages - 60 if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
61 kvm->arch.n_used_mmu_pages; 61 return kvm->arch.n_max_mmu_pages -
62} 62 kvm->arch.n_used_mmu_pages;
63 63
64static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 64 return 0;
65{
66 if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
67 __kvm_mmu_free_some_pages(vcpu);
68} 65}
69 66
70static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) 67static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
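The mmu.h hunk guards kvm_mmu_available_pages against unsigned underflow: n_used_mmu_pages can temporarily exceed n_max_mmu_pages, and the old unchecked subtraction then returns a huge value and defeats the reclaim logic above. A tiny stand-alone illustration of the failure mode (plain userspace C, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		unsigned int max = 10, used = 12;

		/* unchecked unsigned subtraction wraps instead of going negative */
		printf("%u\n", max - used);			/* 4294967294 */
		printf("%u\n", max > used ? max - used : 0);	/* 0 */
		return 0;
	}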
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 105dd5bd550e..da20860b457a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -627,7 +627,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
627 goto out_unlock; 627 goto out_unlock;
628 628
629 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 629 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
630 kvm_mmu_free_some_pages(vcpu); 630 make_mmu_pages_available(vcpu);
631 if (!force_pt_level) 631 if (!force_pt_level)
632 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); 632 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
633 r = FNAME(fetch)(vcpu, addr, &walker, write_fault, 633 r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index cfc258a6bf97..c53e797e7369 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -360,10 +360,12 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
360 return 1; 360 return 1;
361} 361}
362 362
363int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) 363int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
364{ 364{
365 struct kvm_pmu *pmu = &vcpu->arch.pmu; 365 struct kvm_pmu *pmu = &vcpu->arch.pmu;
366 struct kvm_pmc *pmc; 366 struct kvm_pmc *pmc;
367 u32 index = msr_info->index;
368 u64 data = msr_info->data;
367 369
368 switch (index) { 370 switch (index) {
369 case MSR_CORE_PERF_FIXED_CTR_CTRL: 371 case MSR_CORE_PERF_FIXED_CTR_CTRL:
@@ -375,6 +377,10 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
375 } 377 }
376 break; 378 break;
377 case MSR_CORE_PERF_GLOBAL_STATUS: 379 case MSR_CORE_PERF_GLOBAL_STATUS:
380 if (msr_info->host_initiated) {
381 pmu->global_status = data;
382 return 0;
383 }
378 break; /* RO MSR */ 384 break; /* RO MSR */
379 case MSR_CORE_PERF_GLOBAL_CTRL: 385 case MSR_CORE_PERF_GLOBAL_CTRL:
380 if (pmu->global_ctrl == data) 386 if (pmu->global_ctrl == data)
@@ -386,7 +392,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
386 break; 392 break;
387 case MSR_CORE_PERF_GLOBAL_OVF_CTRL: 393 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
388 if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { 394 if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
389 pmu->global_status &= ~data; 395 if (!msr_info->host_initiated)
396 pmu->global_status &= ~data;
390 pmu->global_ovf_ctrl = data; 397 pmu->global_ovf_ctrl = data;
391 return 0; 398 return 0;
392 } 399 }
@@ -394,7 +401,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
394 default: 401 default:
395 if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || 402 if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
396 (pmc = get_fixed_pmc(pmu, index))) { 403 (pmc = get_fixed_pmc(pmu, index))) {
397 data = (s64)(s32)data; 404 if (!msr_info->host_initiated)
405 data = (s64)(s32)data;
398 pmc->counter += data - read_pmc(pmc); 406 pmc->counter += data - read_pmc(pmc);
399 return 0; 407 return 0;
400 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { 408 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
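The pmu.c hunk follows the host_initiated convention used in MSR handlers: writes coming from userspace (for example during live-migration restore) may set state the guest itself can only read or clear, such as MSR_CORE_PERF_GLOBAL_STATUS, and skip guest-only transformations like the sign extension of counter writes. A hedged sketch of the general shape (the MSR name and fields come from the hunk; the surrounding function is illustrative, not KVM's):

	static int set_pmu_msr(struct kvm_pmu *pmu, struct msr_data *msr)
	{
		switch (msr->index) {
		case MSR_CORE_PERF_GLOBAL_STATUS:
			if (msr->host_initiated) {	/* restore path only */
				pmu->global_status = msr->data;
				return 0;
			}
			return 1;			/* read-only for the guest */
		default:
			return 1;
		}
	}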
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7d39d70647e3..a14a6eaf871d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1131,17 +1131,11 @@ static void init_vmcb(struct vcpu_svm *svm)
1131 init_seg(&save->gs); 1131 init_seg(&save->gs);
1132 1132
1133 save->cs.selector = 0xf000; 1133 save->cs.selector = 0xf000;
1134 save->cs.base = 0xffff0000;
1134 /* Executable/Readable Code Segment */ 1135 /* Executable/Readable Code Segment */
1135 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | 1136 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1136 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; 1137 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1137 save->cs.limit = 0xffff; 1138 save->cs.limit = 0xffff;
1138 /*
1139 * cs.base should really be 0xffff0000, but vmx can't handle that, so
1140 * be consistent with it.
1141 *
1142 * Replace when we have real mode working for vmx.
1143 */
1144 save->cs.base = 0xf0000;
1145 1139
1146 save->gdtr.limit = 0xffff; 1140 save->gdtr.limit = 0xffff;
1147 save->idtr.limit = 0xffff; 1141 save->idtr.limit = 0xffff;
@@ -1191,7 +1185,7 @@ static void init_vmcb(struct vcpu_svm *svm)
1191 enable_gif(svm); 1185 enable_gif(svm);
1192} 1186}
1193 1187
1194static int svm_vcpu_reset(struct kvm_vcpu *vcpu) 1188static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
1195{ 1189{
1196 struct vcpu_svm *svm = to_svm(vcpu); 1190 struct vcpu_svm *svm = to_svm(vcpu);
1197 u32 dummy; 1191 u32 dummy;
@@ -1199,16 +1193,8 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
1199 1193
1200 init_vmcb(svm); 1194 init_vmcb(svm);
1201 1195
1202 if (!kvm_vcpu_is_bsp(vcpu)) {
1203 kvm_rip_write(vcpu, 0);
1204 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
1205 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
1206 }
1207
1208 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); 1196 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1209 kvm_register_write(vcpu, VCPU_REGS_RDX, eax); 1197 kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
1210
1211 return 0;
1212} 1198}
1213 1199
1214static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) 1200static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
@@ -3487,7 +3473,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
3487 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 3473 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
3488 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && 3474 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3489 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) 3475 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
3490 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " 3476 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
3491 "exit_code 0x%x\n", 3477 "exit_code 0x%x\n",
3492 __func__, svm->vmcb->control.exit_int_info, 3478 __func__, svm->vmcb->control.exit_int_info,
3493 exit_code); 3479 exit_code);
@@ -3591,6 +3577,11 @@ static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
3591 return; 3577 return;
3592} 3578}
3593 3579
3580static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
3581{
3582 return;
3583}
3584
3594static int svm_nmi_allowed(struct kvm_vcpu *vcpu) 3585static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
3595{ 3586{
3596 struct vcpu_svm *svm = to_svm(vcpu); 3587 struct vcpu_svm *svm = to_svm(vcpu);
@@ -3641,7 +3632,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
3641 return ret; 3632 return ret;
3642} 3633}
3643 3634
3644static void enable_irq_window(struct kvm_vcpu *vcpu) 3635static int enable_irq_window(struct kvm_vcpu *vcpu)
3645{ 3636{
3646 struct vcpu_svm *svm = to_svm(vcpu); 3637 struct vcpu_svm *svm = to_svm(vcpu);
3647 3638
@@ -3655,15 +3646,16 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
3655 svm_set_vintr(svm); 3646 svm_set_vintr(svm);
3656 svm_inject_irq(svm, 0x0); 3647 svm_inject_irq(svm, 0x0);
3657 } 3648 }
3649 return 0;
3658} 3650}
3659 3651
3660static void enable_nmi_window(struct kvm_vcpu *vcpu) 3652static int enable_nmi_window(struct kvm_vcpu *vcpu)
3661{ 3653{
3662 struct vcpu_svm *svm = to_svm(vcpu); 3654 struct vcpu_svm *svm = to_svm(vcpu);
3663 3655
3664 if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) 3656 if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
3665 == HF_NMI_MASK) 3657 == HF_NMI_MASK)
3666 return; /* IRET will cause a vm exit */ 3658 return 0; /* IRET will cause a vm exit */
3667 3659
3668 /* 3660 /*
3669 * Something prevents NMI from been injected. Single step over possible 3661 * Something prevents NMI from been injected. Single step over possible
@@ -3672,6 +3664,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
3672 svm->nmi_singlestep = true; 3664 svm->nmi_singlestep = true;
3673 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 3665 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3674 update_db_bp_intercept(vcpu); 3666 update_db_bp_intercept(vcpu);
3667 return 0;
3675} 3668}
3676 3669
3677static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 3670static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -4247,6 +4240,11 @@ out:
4247 return ret; 4240 return ret;
4248} 4241}
4249 4242
4243static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
4244{
4245 local_irq_enable();
4246}
4247
4250static struct kvm_x86_ops svm_x86_ops = { 4248static struct kvm_x86_ops svm_x86_ops = {
4251 .cpu_has_kvm_support = has_svm, 4249 .cpu_has_kvm_support = has_svm,
4252 .disabled_by_bios = is_disabled, 4250 .disabled_by_bios = is_disabled,
@@ -4314,6 +4312,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4314 .vm_has_apicv = svm_vm_has_apicv, 4312 .vm_has_apicv = svm_vm_has_apicv,
4315 .load_eoi_exitmap = svm_load_eoi_exitmap, 4313 .load_eoi_exitmap = svm_load_eoi_exitmap,
4316 .hwapic_isr_update = svm_hwapic_isr_update, 4314 .hwapic_isr_update = svm_hwapic_isr_update,
4315 .sync_pir_to_irr = svm_sync_pir_to_irr,
4317 4316
4318 .set_tss_addr = svm_set_tss_addr, 4317 .set_tss_addr = svm_set_tss_addr,
4319 .get_tdp_level = get_npt_level, 4318 .get_tdp_level = get_npt_level,
@@ -4342,6 +4341,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4342 .set_tdp_cr3 = set_tdp_cr3, 4341 .set_tdp_cr3 = set_tdp_cr3,
4343 4342
4344 .check_intercept = svm_check_intercept, 4343 .check_intercept = svm_check_intercept,
4344 .handle_external_intr = svm_handle_external_intr,
4345}; 4345};
4346 4346
4347static int __init svm_init(void) 4347static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 867b81037f96..25a791ed21c8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,8 +84,11 @@ module_param(vmm_exclusive, bool, S_IRUGO);
84static bool __read_mostly fasteoi = 1; 84static bool __read_mostly fasteoi = 1;
85module_param(fasteoi, bool, S_IRUGO); 85module_param(fasteoi, bool, S_IRUGO);
86 86
87static bool __read_mostly enable_apicv_reg_vid; 87static bool __read_mostly enable_apicv = 1;
88module_param(enable_apicv, bool, S_IRUGO);
88 89
90static bool __read_mostly enable_shadow_vmcs = 1;
91module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
89/* 92/*
90 * If nested=1, nested virtualization is supported, i.e., guests may use 93 * If nested=1, nested virtualization is supported, i.e., guests may use
91 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 94 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -298,7 +301,8 @@ struct __packed vmcs12 {
298 u32 guest_activity_state; 301 u32 guest_activity_state;
299 u32 guest_sysenter_cs; 302 u32 guest_sysenter_cs;
300 u32 host_ia32_sysenter_cs; 303 u32 host_ia32_sysenter_cs;
301 u32 padding32[8]; /* room for future expansion */ 304 u32 vmx_preemption_timer_value;
305 u32 padding32[7]; /* room for future expansion */
302 u16 virtual_processor_id; 306 u16 virtual_processor_id;
303 u16 guest_es_selector; 307 u16 guest_es_selector;
304 u16 guest_cs_selector; 308 u16 guest_cs_selector;
@@ -351,6 +355,12 @@ struct nested_vmx {
351 /* The host-usable pointer to the above */ 355 /* The host-usable pointer to the above */
352 struct page *current_vmcs12_page; 356 struct page *current_vmcs12_page;
353 struct vmcs12 *current_vmcs12; 357 struct vmcs12 *current_vmcs12;
358 struct vmcs *current_shadow_vmcs;
359 /*
360 * Indicates whether the shadow vmcs must be updated with the
361 * data held by vmcs12
362 */
363 bool sync_shadow_vmcs;
354 364
355 /* vmcs02_list cache of VMCSs recently used to run L2 guests */ 365 /* vmcs02_list cache of VMCSs recently used to run L2 guests */
356 struct list_head vmcs02_pool; 366 struct list_head vmcs02_pool;
@@ -365,6 +375,31 @@ struct nested_vmx {
365 struct page *apic_access_page; 375 struct page *apic_access_page;
366}; 376};
367 377
378#define POSTED_INTR_ON 0
379/* Posted-Interrupt Descriptor */
380struct pi_desc {
381 u32 pir[8]; /* Posted interrupt requested */
382 u32 control; /* bit 0 of control is outstanding notification bit */
383 u32 rsvd[7];
384} __aligned(64);
385
386static bool pi_test_and_set_on(struct pi_desc *pi_desc)
387{
388 return test_and_set_bit(POSTED_INTR_ON,
389 (unsigned long *)&pi_desc->control);
390}
391
392static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
393{
394 return test_and_clear_bit(POSTED_INTR_ON,
395 (unsigned long *)&pi_desc->control);
396}
397
398static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
399{
400 return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
401}
402
368struct vcpu_vmx { 403struct vcpu_vmx {
369 struct kvm_vcpu vcpu; 404 struct kvm_vcpu vcpu;
370 unsigned long host_rsp; 405 unsigned long host_rsp;
@@ -377,6 +412,7 @@ struct vcpu_vmx {
377 struct shared_msr_entry *guest_msrs; 412 struct shared_msr_entry *guest_msrs;
378 int nmsrs; 413 int nmsrs;
379 int save_nmsrs; 414 int save_nmsrs;
415 unsigned long host_idt_base;
380#ifdef CONFIG_X86_64 416#ifdef CONFIG_X86_64
381 u64 msr_host_kernel_gs_base; 417 u64 msr_host_kernel_gs_base;
382 u64 msr_guest_kernel_gs_base; 418 u64 msr_guest_kernel_gs_base;
@@ -428,6 +464,9 @@ struct vcpu_vmx {
428 464
429 bool rdtscp_enabled; 465 bool rdtscp_enabled;
430 466
467 /* Posted interrupt descriptor */
468 struct pi_desc pi_desc;
469
431 /* Support for a guest hypervisor (nested VMX) */ 470 /* Support for a guest hypervisor (nested VMX) */
432 struct nested_vmx nested; 471 struct nested_vmx nested;
433}; 472};
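The pi_desc layout introduced above is consumed directly by the CPU: pir is a 256-bit bitmap of requested vectors and bit 0 of control is the outstanding-notification (ON) bit. Because the sender needs only one notification IPI per batch of vectors, the ON bit acts as a coalescing flag. A hedged sketch of that coalescing idea, sender side only (the IPI helper is a placeholder; the real delivery path appears later in this diff):

	static void send_notification_ipi(void);	/* placeholder */

	static void post_one_vector(struct pi_desc *pi, int vector)
	{
		/* record the vector; hardware moves PIR into the vIRR */
		set_bit(vector, (unsigned long *)pi->pir);

		/* only the first poster since the last delivery sends an IPI */
		if (!test_and_set_bit(POSTED_INTR_ON,
				      (unsigned long *)&pi->control))
			send_notification_ipi();
	}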
@@ -451,6 +490,64 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
451#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ 490#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
452 [number##_HIGH] = VMCS12_OFFSET(name)+4 491 [number##_HIGH] = VMCS12_OFFSET(name)+4
453 492
493
494static const unsigned long shadow_read_only_fields[] = {
495 /*
496 * We do NOT shadow fields that are modified when L0
497 * traps and emulates any vmx instruction (e.g. VMPTRLD,
498 * VMXON...) executed by L1.
499 * For example, VM_INSTRUCTION_ERROR is read
500 * by L1 if a vmx instruction fails (part of the error path).
501 * Note the code assumes this logic. If for some reason
502 * we start shadowing these fields then we need to
503 * force a shadow sync when L0 emulates vmx instructions
504 * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
505 * by nested_vmx_failValid)
506 */
507 VM_EXIT_REASON,
508 VM_EXIT_INTR_INFO,
509 VM_EXIT_INSTRUCTION_LEN,
510 IDT_VECTORING_INFO_FIELD,
511 IDT_VECTORING_ERROR_CODE,
512 VM_EXIT_INTR_ERROR_CODE,
513 EXIT_QUALIFICATION,
514 GUEST_LINEAR_ADDRESS,
515 GUEST_PHYSICAL_ADDRESS
516};
517static const int max_shadow_read_only_fields =
518 ARRAY_SIZE(shadow_read_only_fields);
519
520static const unsigned long shadow_read_write_fields[] = {
521 GUEST_RIP,
522 GUEST_RSP,
523 GUEST_CR0,
524 GUEST_CR3,
525 GUEST_CR4,
526 GUEST_INTERRUPTIBILITY_INFO,
527 GUEST_RFLAGS,
528 GUEST_CS_SELECTOR,
529 GUEST_CS_AR_BYTES,
530 GUEST_CS_LIMIT,
531 GUEST_CS_BASE,
532 GUEST_ES_BASE,
533 CR0_GUEST_HOST_MASK,
534 CR0_READ_SHADOW,
535 CR4_READ_SHADOW,
536 TSC_OFFSET,
537 EXCEPTION_BITMAP,
538 CPU_BASED_VM_EXEC_CONTROL,
539 VM_ENTRY_EXCEPTION_ERROR_CODE,
540 VM_ENTRY_INTR_INFO_FIELD,
541 VM_ENTRY_INSTRUCTION_LEN,
542 VM_ENTRY_EXCEPTION_ERROR_CODE,
543 HOST_FS_BASE,
544 HOST_GS_BASE,
545 HOST_FS_SELECTOR,
546 HOST_GS_SELECTOR
547};
548static const int max_shadow_read_write_fields =
549 ARRAY_SIZE(shadow_read_write_fields);
550
454static const unsigned short vmcs_field_to_offset_table[] = { 551static const unsigned short vmcs_field_to_offset_table[] = {
455 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), 552 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
456 FIELD(GUEST_ES_SELECTOR, guest_es_selector), 553 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
@@ -537,6 +634,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
537 FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), 634 FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
538 FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), 635 FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
539 FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), 636 FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
637 FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
540 FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), 638 FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
541 FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), 639 FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
542 FIELD(CR0_READ_SHADOW, cr0_read_shadow), 640 FIELD(CR0_READ_SHADOW, cr0_read_shadow),
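The two field tables above drive the copy_shadow_to_vmcs12/copy_vmcs12_to_shadow helpers declared in the following hunk: each one walks a table and copies every listed field between the hardware shadow VMCS and the software vmcs12. A heavily hedged sketch of the read-back direction (the store helper is a placeholder, field widths are ignored for brevity, and the exact flow in the commit may differ):

	static void vmcs12_store(struct vcpu_vmx *vmx, unsigned long field,
				 u64 val);	/* placeholder */

	static void sync_shadow_to_vmcs12(struct vcpu_vmx *vmx)
	{
		int i;

		vmcs_load(vmx->nested.current_shadow_vmcs);
		for (i = 0; i < max_shadow_read_write_fields; i++) {
			unsigned long field = shadow_read_write_fields[i];

			vmcs12_store(vmx, field, vmcs_read64(field));
		}
		vmcs_load(vmx->loaded_vmcs->vmcs);
	}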
@@ -624,6 +722,9 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
624 struct kvm_segment *var, int seg); 722 struct kvm_segment *var, int seg);
625static bool guest_state_valid(struct kvm_vcpu *vcpu); 723static bool guest_state_valid(struct kvm_vcpu *vcpu);
626static u32 vmx_segment_access_rights(struct kvm_segment *var); 724static u32 vmx_segment_access_rights(struct kvm_segment *var);
725static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
726static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
727static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
627 728
628static DEFINE_PER_CPU(struct vmcs *, vmxarea); 729static DEFINE_PER_CPU(struct vmcs *, vmxarea);
629static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 730static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -640,6 +741,8 @@ static unsigned long *vmx_msr_bitmap_legacy;
640static unsigned long *vmx_msr_bitmap_longmode; 741static unsigned long *vmx_msr_bitmap_longmode;
641static unsigned long *vmx_msr_bitmap_legacy_x2apic; 742static unsigned long *vmx_msr_bitmap_legacy_x2apic;
642static unsigned long *vmx_msr_bitmap_longmode_x2apic; 743static unsigned long *vmx_msr_bitmap_longmode_x2apic;
744static unsigned long *vmx_vmread_bitmap;
745static unsigned long *vmx_vmwrite_bitmap;
643 746
644static bool cpu_has_load_ia32_efer; 747static bool cpu_has_load_ia32_efer;
645static bool cpu_has_load_perf_global_ctrl; 748static bool cpu_has_load_perf_global_ctrl;
@@ -782,6 +885,18 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
782 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; 885 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
783} 886}
784 887
888static inline bool cpu_has_vmx_posted_intr(void)
889{
890 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
891}
892
893static inline bool cpu_has_vmx_apicv(void)
894{
895 return cpu_has_vmx_apic_register_virt() &&
896 cpu_has_vmx_virtual_intr_delivery() &&
897 cpu_has_vmx_posted_intr();
898}
899
785static inline bool cpu_has_vmx_flexpriority(void) 900static inline bool cpu_has_vmx_flexpriority(void)
786{ 901{
787 return cpu_has_vmx_tpr_shadow() && 902 return cpu_has_vmx_tpr_shadow() &&
@@ -895,6 +1010,18 @@ static inline bool cpu_has_vmx_wbinvd_exit(void)
895 SECONDARY_EXEC_WBINVD_EXITING; 1010 SECONDARY_EXEC_WBINVD_EXITING;
896} 1011}
897 1012
1013static inline bool cpu_has_vmx_shadow_vmcs(void)
1014{
1015 u64 vmx_msr;
1016 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1017 /* check if the cpu supports writing r/o exit information fields */
1018 if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1019 return false;
1020
1021 return vmcs_config.cpu_based_2nd_exec_ctrl &
1022 SECONDARY_EXEC_SHADOW_VMCS;
1023}
1024
898static inline bool report_flexpriority(void) 1025static inline bool report_flexpriority(void)
899{ 1026{
900 return flexpriority_enabled; 1027 return flexpriority_enabled;
@@ -1790,7 +1917,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1790 u32 intr_info = nr | INTR_INFO_VALID_MASK; 1917 u32 intr_info = nr | INTR_INFO_VALID_MASK;
1791 1918
1792 if (nr == PF_VECTOR && is_guest_mode(vcpu) && 1919 if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
1793 nested_pf_handled(vcpu)) 1920 !vmx->nested.nested_run_pending && nested_pf_handled(vcpu))
1794 return; 1921 return;
1795 1922
1796 if (has_error_code) { 1923 if (has_error_code) {
@@ -2022,6 +2149,7 @@ static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
2022static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; 2149static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
2023static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; 2150static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
2024static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; 2151static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
2152static u32 nested_vmx_misc_low, nested_vmx_misc_high;
2025static __init void nested_vmx_setup_ctls_msrs(void) 2153static __init void nested_vmx_setup_ctls_msrs(void)
2026{ 2154{
2027 /* 2155 /*
@@ -2040,30 +2168,40 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2040 */ 2168 */
2041 2169
2042 /* pin-based controls */ 2170 /* pin-based controls */
2171 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
2172 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
2043 /* 2173 /*
2044 * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is 2174 * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
2045 * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR. 2175 * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
2046 */ 2176 */
2047 nested_vmx_pinbased_ctls_low = 0x16 ; 2177 nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2048 nested_vmx_pinbased_ctls_high = 0x16 | 2178 nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
2049 PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | 2179 PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS |
2050 PIN_BASED_VIRTUAL_NMIS; 2180 PIN_BASED_VMX_PREEMPTION_TIMER;
2181 nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2051 2182
2052 /* exit controls */ 2183 /*
2053 nested_vmx_exit_ctls_low = 0; 2184 * Exit controls
2185 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
2186 * 17 must be 1.
2187 */
2188 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2054 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ 2189 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
2055#ifdef CONFIG_X86_64 2190#ifdef CONFIG_X86_64
2056 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; 2191 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
2057#else 2192#else
2058 nested_vmx_exit_ctls_high = 0; 2193 nested_vmx_exit_ctls_high = 0;
2059#endif 2194#endif
2195 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2060 2196
2061 /* entry controls */ 2197 /* entry controls */
2062 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2198 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2063 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); 2199 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
2064 nested_vmx_entry_ctls_low = 0; 2200 /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
2201 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2065 nested_vmx_entry_ctls_high &= 2202 nested_vmx_entry_ctls_high &=
2066 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; 2203 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
2204 nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2067 2205
2068 /* cpu-based controls */ 2206 /* cpu-based controls */
2069 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 2207 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2080,6 +2218,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2080 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 2218 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
2081 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | 2219 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
2082 CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | 2220 CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
2221 CPU_BASED_PAUSE_EXITING |
2083 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2222 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2084 /* 2223 /*
2085 * We can allow some features even when not supported by the 2224 * We can allow some features even when not supported by the
@@ -2094,7 +2233,14 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2094 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); 2233 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
2095 nested_vmx_secondary_ctls_low = 0; 2234 nested_vmx_secondary_ctls_low = 0;
2096 nested_vmx_secondary_ctls_high &= 2235 nested_vmx_secondary_ctls_high &=
2097 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 2236 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2237 SECONDARY_EXEC_WBINVD_EXITING;
2238
2239 /* miscellaneous data */
2240 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
2241 nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
2242 VMX_MISC_SAVE_EFER_LMA;
2243 nested_vmx_misc_high = 0;
2098} 2244}
2099 2245
2100static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 2246static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
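The *_ctls_low/*_ctls_high pairs built above follow the VMX capability-MSR convention: the low half holds bits the guest must keep set, the high half holds bits it is allowed to set. vmx_control_verify (the context line above) checks a requested control word against such a pair; an illustrative re-statement of that check, not the function's literal body:

	/* every must-be-1 bit is set, and nothing outside the
	 * allowed-1 mask is set */
	static bool control_word_ok(u32 control, u32 low, u32 high)
	{
		return (control & low) == low && (control & ~high) == 0;
	}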
@@ -2165,7 +2311,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2165 nested_vmx_entry_ctls_high); 2311 nested_vmx_entry_ctls_high);
2166 break; 2312 break;
2167 case MSR_IA32_VMX_MISC: 2313 case MSR_IA32_VMX_MISC:
2168 *pdata = 0; 2314 *pdata = vmx_control_msr(nested_vmx_misc_low,
2315 nested_vmx_misc_high);
2169 break; 2316 break;
2170 /* 2317 /*
2171 * These MSRs specify bits which the guest must keep fixed (on or off) 2318 * These MSRs specify bits which the guest must keep fixed (on or off)
@@ -2529,12 +2676,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2529 u32 _vmexit_control = 0; 2676 u32 _vmexit_control = 0;
2530 u32 _vmentry_control = 0; 2677 u32 _vmentry_control = 0;
2531 2678
2532 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2533 opt = PIN_BASED_VIRTUAL_NMIS;
2534 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2535 &_pin_based_exec_control) < 0)
2536 return -EIO;
2537
2538 min = CPU_BASED_HLT_EXITING | 2679 min = CPU_BASED_HLT_EXITING |
2539#ifdef CONFIG_X86_64 2680#ifdef CONFIG_X86_64
2540 CPU_BASED_CR8_LOAD_EXITING | 2681 CPU_BASED_CR8_LOAD_EXITING |
@@ -2573,7 +2714,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2573 SECONDARY_EXEC_RDTSCP | 2714 SECONDARY_EXEC_RDTSCP |
2574 SECONDARY_EXEC_ENABLE_INVPCID | 2715 SECONDARY_EXEC_ENABLE_INVPCID |
2575 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2716 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2576 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; 2717 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2718 SECONDARY_EXEC_SHADOW_VMCS;
2577 if (adjust_vmx_controls(min2, opt2, 2719 if (adjust_vmx_controls(min2, opt2,
2578 MSR_IA32_VMX_PROCBASED_CTLS2, 2720 MSR_IA32_VMX_PROCBASED_CTLS2,
2579 &_cpu_based_2nd_exec_control) < 0) 2721 &_cpu_based_2nd_exec_control) < 0)
@@ -2605,11 +2747,23 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2605#ifdef CONFIG_X86_64 2747#ifdef CONFIG_X86_64
2606 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; 2748 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
2607#endif 2749#endif
2608 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; 2750 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
2751 VM_EXIT_ACK_INTR_ON_EXIT;
2609 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, 2752 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
2610 &_vmexit_control) < 0) 2753 &_vmexit_control) < 0)
2611 return -EIO; 2754 return -EIO;
2612 2755
2756 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2757 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
2758 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2759 &_pin_based_exec_control) < 0)
2760 return -EIO;
2761
2762 if (!(_cpu_based_2nd_exec_control &
2763 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
2764 !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
2765 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2766
2613 min = 0; 2767 min = 0;
2614 opt = VM_ENTRY_LOAD_IA32_PAT; 2768 opt = VM_ENTRY_LOAD_IA32_PAT;
2615 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, 2769 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
@@ -2762,6 +2916,8 @@ static __init int hardware_setup(void)
2762 2916
2763 if (!cpu_has_vmx_vpid()) 2917 if (!cpu_has_vmx_vpid())
2764 enable_vpid = 0; 2918 enable_vpid = 0;
2919 if (!cpu_has_vmx_shadow_vmcs())
2920 enable_shadow_vmcs = 0;
2765 2921
2766 if (!cpu_has_vmx_ept() || 2922 if (!cpu_has_vmx_ept() ||
2767 !cpu_has_vmx_ept_4levels()) { 2923 !cpu_has_vmx_ept_4levels()) {
@@ -2788,14 +2944,16 @@ static __init int hardware_setup(void)
2788 if (!cpu_has_vmx_ple()) 2944 if (!cpu_has_vmx_ple())
2789 ple_gap = 0; 2945 ple_gap = 0;
2790 2946
2791 if (!cpu_has_vmx_apic_register_virt() || 2947 if (!cpu_has_vmx_apicv())
2792 !cpu_has_vmx_virtual_intr_delivery()) 2948 enable_apicv = 0;
2793 enable_apicv_reg_vid = 0;
2794 2949
2795 if (enable_apicv_reg_vid) 2950 if (enable_apicv)
2796 kvm_x86_ops->update_cr8_intercept = NULL; 2951 kvm_x86_ops->update_cr8_intercept = NULL;
2797 else 2952 else {
2798 kvm_x86_ops->hwapic_irr_update = NULL; 2953 kvm_x86_ops->hwapic_irr_update = NULL;
2954 kvm_x86_ops->deliver_posted_interrupt = NULL;
2955 kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
2956 }
2799 2957
2800 if (nested) 2958 if (nested)
2801 nested_vmx_setup_ctls_msrs(); 2959 nested_vmx_setup_ctls_msrs();
@@ -2876,22 +3034,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
2876 vmx->cpl = 0; 3034 vmx->cpl = 0;
2877} 3035}
2878 3036
2879static gva_t rmode_tss_base(struct kvm *kvm)
2880{
2881 if (!kvm->arch.tss_addr) {
2882 struct kvm_memslots *slots;
2883 struct kvm_memory_slot *slot;
2884 gfn_t base_gfn;
2885
2886 slots = kvm_memslots(kvm);
2887 slot = id_to_memslot(slots, 0);
2888 base_gfn = slot->base_gfn + slot->npages - 3;
2889
2890 return base_gfn << PAGE_SHIFT;
2891 }
2892 return kvm->arch.tss_addr;
2893}
2894
2895static void fix_rmode_seg(int seg, struct kvm_segment *save) 3037static void fix_rmode_seg(int seg, struct kvm_segment *save)
2896{ 3038{
2897 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3039 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -2942,19 +3084,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2942 3084
2943 /* 3085 /*
2944 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 3086 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2945 * vcpu. Call it here with phys address pointing 16M below 4G. 3087 * vcpu. Warn the user that an update is overdue.
2946 */ 3088 */
2947 if (!vcpu->kvm->arch.tss_addr) { 3089 if (!vcpu->kvm->arch.tss_addr)
2948 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " 3090 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
2949 "called before entering vcpu\n"); 3091 "called before entering vcpu\n");
2950 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
2951 vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
2952 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
2953 }
2954 3092
2955 vmx_segment_cache_clear(vmx); 3093 vmx_segment_cache_clear(vmx);
2956 3094
2957 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 3095 vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
2958 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3096 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
2959 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3097 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2960 3098
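With rmode_tss_base() removed, enter_rmode now reads kvm->arch.tss_addr directly (the GUEST_TR_BASE write above), so userspace must have issued KVM_SET_TSS_ADDR before running a vcpu; the printk_once above only warns when it has not. A hedged userspace sketch of satisfying that requirement (the address is just an example in otherwise-unused guest physical space):

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* call on the VM fd before creating/running vcpus */
	static int set_tss_addr(int vm_fd)
	{
		if (ioctl(vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000UL) < 0) {
			perror("KVM_SET_TSS_ADDR");
			return -1;
		}
		return 0;
	}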
@@ -3214,7 +3352,9 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3214 */ 3352 */
3215 if (!nested_vmx_allowed(vcpu)) 3353 if (!nested_vmx_allowed(vcpu))
3216 return 1; 3354 return 1;
3217 } else if (to_vmx(vcpu)->nested.vmxon) 3355 }
3356 if (to_vmx(vcpu)->nested.vmxon &&
3357 ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON))
3218 return 1; 3358 return 1;
3219 3359
3220 vcpu->arch.cr4 = cr4; 3360 vcpu->arch.cr4 = cr4;
@@ -3550,7 +3690,7 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
3550 return true; 3690 return true;
3551 3691
3552 /* real mode guest state checks */ 3692 /* real mode guest state checks */
3553 if (!is_protmode(vcpu)) { 3693 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3554 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3694 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3555 return false; 3695 return false;
3556 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3696 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
@@ -3599,7 +3739,7 @@ static int init_rmode_tss(struct kvm *kvm)
3599 int r, idx, ret = 0; 3739 int r, idx, ret = 0;
3600 3740
3601 idx = srcu_read_lock(&kvm->srcu); 3741 idx = srcu_read_lock(&kvm->srcu);
3602 fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 3742 fn = kvm->arch.tss_addr >> PAGE_SHIFT;
3603 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 3743 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
3604 if (r < 0) 3744 if (r < 0)
3605 goto out; 3745 goto out;
@@ -3692,7 +3832,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
3692 kvm_userspace_mem.flags = 0; 3832 kvm_userspace_mem.flags = 0;
3693 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; 3833 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
3694 kvm_userspace_mem.memory_size = PAGE_SIZE; 3834 kvm_userspace_mem.memory_size = PAGE_SIZE;
3695 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); 3835 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
3696 if (r) 3836 if (r)
3697 goto out; 3837 goto out;
3698 3838
@@ -3722,7 +3862,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
3722 kvm_userspace_mem.guest_phys_addr = 3862 kvm_userspace_mem.guest_phys_addr =
3723 kvm->arch.ept_identity_map_addr; 3863 kvm->arch.ept_identity_map_addr;
3724 kvm_userspace_mem.memory_size = PAGE_SIZE; 3864 kvm_userspace_mem.memory_size = PAGE_SIZE;
3725 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); 3865 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
3726 if (r) 3866 if (r)
3727 goto out; 3867 goto out;
3728 3868
@@ -3869,13 +4009,59 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
3869 msr, MSR_TYPE_W); 4009 msr, MSR_TYPE_W);
3870} 4010}
3871 4011
4012static int vmx_vm_has_apicv(struct kvm *kvm)
4013{
4014 return enable_apicv && irqchip_in_kernel(kvm);
4015}
4016
4017/*
4018 * Send an interrupt to a vcpu via the posted-interrupt mechanism.
4019 * 1. If the target vcpu is running (non-root mode), send a posted-interrupt
4020 * notification to it and the hardware will sync PIR to vIRR atomically.
4021 * 2. If the target vcpu isn't running (root mode), kick it to pick up the
4022 * interrupt from PIR on the next vmentry.
4023 */
4024static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4025{
4026 struct vcpu_vmx *vmx = to_vmx(vcpu);
4027 int r;
4028
4029 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4030 return;
4031
4032 r = pi_test_and_set_on(&vmx->pi_desc);
4033 kvm_make_request(KVM_REQ_EVENT, vcpu);
4034#ifdef CONFIG_SMP
4035 if (!r && (vcpu->mode == IN_GUEST_MODE))
4036 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
4037 POSTED_INTR_VECTOR);
4038 else
4039#endif
4040 kvm_vcpu_kick(vcpu);
4041}
4042
4043static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
4044{
4045 struct vcpu_vmx *vmx = to_vmx(vcpu);
4046
4047 if (!pi_test_and_clear_on(&vmx->pi_desc))
4048 return;
4049
4050 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
4051}
4052
4053static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
4054{
4055 return;
4056}
4057
3872/* 4058/*
3873 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4059 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
3874 * will not change in the lifetime of the guest. 4060 * will not change in the lifetime of the guest.
3875 * Note that host-state that does change is set elsewhere. E.g., host-state 4061 * Note that host-state that does change is set elsewhere. E.g., host-state
3876 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 4062 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
3877 */ 4063 */
3878static void vmx_set_constant_host_state(void) 4064static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
3879{ 4065{
3880 u32 low32, high32; 4066 u32 low32, high32;
3881 unsigned long tmpl; 4067 unsigned long tmpl;
@@ -3903,6 +4089,7 @@ static void vmx_set_constant_host_state(void)
3903 4089
3904 native_store_idt(&dt); 4090 native_store_idt(&dt);
3905 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ 4091 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
4092 vmx->host_idt_base = dt.address;
3906 4093
3907 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ 4094 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
3908 4095
@@ -3928,6 +4115,15 @@ static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
3928 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); 4115 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
3929} 4116}
3930 4117
4118static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4119{
4120 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4121
4122 if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
4123 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4124 return pin_based_exec_ctrl;
4125}
4126
3931static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4127static u32 vmx_exec_control(struct vcpu_vmx *vmx)
3932{ 4128{
3933 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4129 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
@@ -3945,11 +4141,6 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
3945 return exec_control; 4141 return exec_control;
3946} 4142}
3947 4143
3948static int vmx_vm_has_apicv(struct kvm *kvm)
3949{
3950 return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
3951}
3952
3953static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4144static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3954{ 4145{
3955 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4146 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
@@ -3971,6 +4162,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3971 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4162 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
3972 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4163 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3973 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4164 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4165 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4166 (handle_vmptrld).
4167 We can NOT enable shadow_vmcs here because we don't yet have
4168 a current VMCS12.
4169 */
4170 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
3974 return exec_control; 4171 return exec_control;
3975} 4172}
3976 4173
@@ -3999,14 +4196,17 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3999 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); 4196 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
4000 vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); 4197 vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
4001 4198
4199 if (enable_shadow_vmcs) {
4200 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
4201 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
4202 }
4002 if (cpu_has_vmx_msr_bitmap()) 4203 if (cpu_has_vmx_msr_bitmap())
4003 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); 4204 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
4004 4205
4005 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 4206 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
4006 4207
4007 /* Control */ 4208 /* Control */
4008 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, 4209 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
4009 vmcs_config.pin_based_exec_ctrl);
4010 4210
4011 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); 4211 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
4012 4212
@@ -4015,13 +4215,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
4015 vmx_secondary_exec_control(vmx)); 4215 vmx_secondary_exec_control(vmx));
4016 } 4216 }
4017 4217
4018 if (enable_apicv_reg_vid) { 4218 if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
4019 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4219 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4020 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4220 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4021 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4221 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4022 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4222 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4023 4223
4024 vmcs_write16(GUEST_INTR_STATUS, 0); 4224 vmcs_write16(GUEST_INTR_STATUS, 0);
4225
4226 vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
4227 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
4025 } 4228 }
4026 4229
4027 if (ple_gap) { 4230 if (ple_gap) {
@@ -4035,7 +4238,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
4035 4238
4036 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4239 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
4037 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4240 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
4038 vmx_set_constant_host_state(); 4241 vmx_set_constant_host_state(vmx);
4039#ifdef CONFIG_X86_64 4242#ifdef CONFIG_X86_64
4040 rdmsrl(MSR_FS_BASE, a); 4243 rdmsrl(MSR_FS_BASE, a);
4041 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ 4244 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
@@ -4089,11 +4292,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
4089 return 0; 4292 return 0;
4090} 4293}
4091 4294
4092static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4295static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4093{ 4296{
4094 struct vcpu_vmx *vmx = to_vmx(vcpu); 4297 struct vcpu_vmx *vmx = to_vmx(vcpu);
4095 u64 msr; 4298 u64 msr;
4096 int ret;
4097 4299
4098 vmx->rmode.vm86_active = 0; 4300 vmx->rmode.vm86_active = 0;
4099 4301
@@ -4109,12 +4311,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4109 vmx_segment_cache_clear(vmx); 4311 vmx_segment_cache_clear(vmx);
4110 4312
4111 seg_setup(VCPU_SREG_CS); 4313 seg_setup(VCPU_SREG_CS);
4112 if (kvm_vcpu_is_bsp(&vmx->vcpu)) 4314 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
4113 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4315 vmcs_write32(GUEST_CS_BASE, 0xffff0000);
4114 else {
4115 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
4116 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
4117 }
4118 4316
4119 seg_setup(VCPU_SREG_DS); 4317 seg_setup(VCPU_SREG_DS);
4120 seg_setup(VCPU_SREG_ES); 4318 seg_setup(VCPU_SREG_ES);
@@ -4137,10 +4335,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4137 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4335 vmcs_writel(GUEST_SYSENTER_EIP, 0);
4138 4336
4139 vmcs_writel(GUEST_RFLAGS, 0x02); 4337 vmcs_writel(GUEST_RFLAGS, 0x02);
4140 if (kvm_vcpu_is_bsp(&vmx->vcpu)) 4338 kvm_rip_write(vcpu, 0xfff0);
4141 kvm_rip_write(vcpu, 0xfff0);
4142 else
4143 kvm_rip_write(vcpu, 0);
4144 4339
4145 vmcs_writel(GUEST_GDTR_BASE, 0); 4340 vmcs_writel(GUEST_GDTR_BASE, 0);
4146 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4341 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4171,23 +4366,20 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4171 vmcs_write64(APIC_ACCESS_ADDR, 4366 vmcs_write64(APIC_ACCESS_ADDR,
4172 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); 4367 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
4173 4368
4369 if (vmx_vm_has_apicv(vcpu->kvm))
4370 memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
4371
4174 if (vmx->vpid != 0) 4372 if (vmx->vpid != 0)
4175 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4373 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4176 4374
4177 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 4375 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
4178 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4179 vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ 4376 vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
4180 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4181 vmx_set_cr4(&vmx->vcpu, 0); 4377 vmx_set_cr4(&vmx->vcpu, 0);
4182 vmx_set_efer(&vmx->vcpu, 0); 4378 vmx_set_efer(&vmx->vcpu, 0);
4183 vmx_fpu_activate(&vmx->vcpu); 4379 vmx_fpu_activate(&vmx->vcpu);
4184 update_exception_bitmap(&vmx->vcpu); 4380 update_exception_bitmap(&vmx->vcpu);
4185 4381
4186 vpid_sync_context(vmx); 4382 vpid_sync_context(vmx);
4187
4188 ret = 0;
4189
4190 return ret;
4191} 4383}
4192 4384
4193/* 4385/*
@@ -4200,40 +4392,45 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
4200 PIN_BASED_EXT_INTR_MASK; 4392 PIN_BASED_EXT_INTR_MASK;
4201} 4393}
4202 4394
4203static void enable_irq_window(struct kvm_vcpu *vcpu) 4395static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
4396{
4397 return get_vmcs12(vcpu)->pin_based_vm_exec_control &
4398 PIN_BASED_NMI_EXITING;
4399}
4400
4401static int enable_irq_window(struct kvm_vcpu *vcpu)
4204{ 4402{
4205 u32 cpu_based_vm_exec_control; 4403 u32 cpu_based_vm_exec_control;
4206 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { 4404
4405 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
4207 /* 4406 /*
4208 * We get here if vmx_interrupt_allowed() said we can't 4407 * We get here if vmx_interrupt_allowed() said we can't
4209 * inject to L1 now because L2 must run. Ask L2 to exit 4408 * inject to L1 now because L2 must run. The caller will have
4210 * right after entry, so we can inject to L1 more promptly. 4409 * to make L2 exit right after entry, so we can inject to L1
4410 * more promptly.
4211 */ 4411 */
4212 kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); 4412 return -EBUSY;
4213 return;
4214 }
4215 4413
4216 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 4414 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4217 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; 4415 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
4218 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 4416 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4417 return 0;
4219} 4418}
4220 4419
4221static void enable_nmi_window(struct kvm_vcpu *vcpu) 4420static int enable_nmi_window(struct kvm_vcpu *vcpu)
4222{ 4421{
4223 u32 cpu_based_vm_exec_control; 4422 u32 cpu_based_vm_exec_control;
4224 4423
4225 if (!cpu_has_virtual_nmis()) { 4424 if (!cpu_has_virtual_nmis())
4226 enable_irq_window(vcpu); 4425 return enable_irq_window(vcpu);
4227 return; 4426
4228 } 4427 if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI)
4428 return enable_irq_window(vcpu);
4229 4429
4230 if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4231 enable_irq_window(vcpu);
4232 return;
4233 }
4234 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 4430 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4235 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 4431 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
4236 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 4432 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4433 return 0;
4237} 4434}
4238 4435
4239static void vmx_inject_irq(struct kvm_vcpu *vcpu) 4436static void vmx_inject_irq(struct kvm_vcpu *vcpu)
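enable_irq_window and enable_nmi_window now return an error (-EBUSY) instead of requesting an immediate exit themselves, pushing that decision to the caller in x86.c, which is not part of this hunk. A hedged sketch of how such a caller can react (the surrounding function is illustrative; KVM_REQ_IMMEDIATE_EXIT appears in the removed lines above):

	/* illustrative caller: if the window cannot be opened now because
	 * L2 must run first, force an exit right after the next entry */
	static void request_irq_window(struct kvm_vcpu *vcpu)
	{
		if (kvm_x86_ops->enable_irq_window(vcpu) != 0)
			kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
	}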
@@ -4294,16 +4491,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4294 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4491 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4295} 4492}
4296 4493
4297static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
4298{
4299 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
4300 return 0;
4301
4302 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4303 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
4304 | GUEST_INTR_STATE_NMI));
4305}
4306
4307static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 4494static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4308{ 4495{
4309 if (!cpu_has_virtual_nmis()) 4496 if (!cpu_has_virtual_nmis())
@@ -4333,18 +4520,52 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4333 } 4520 }
4334} 4521}
4335 4522
4523static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
4524{
4525 if (is_guest_mode(vcpu)) {
4526 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4527
4528 if (to_vmx(vcpu)->nested.nested_run_pending)
4529 return 0;
4530 if (nested_exit_on_nmi(vcpu)) {
4531 nested_vmx_vmexit(vcpu);
4532 vmcs12->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI;
4533 vmcs12->vm_exit_intr_info = NMI_VECTOR |
4534 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK;
4535 /*
4536 * The NMI-triggered VM exit counts as injection:
4537 * clear this one and block further NMIs.
4538 */
4539 vcpu->arch.nmi_pending = 0;
4540 vmx_set_nmi_mask(vcpu, true);
4541 return 0;
4542 }
4543 }
4544
4545 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
4546 return 0;
4547
4548 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4549 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
4550 | GUEST_INTR_STATE_NMI));
4551}
4552
4336static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 4553static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
4337{ 4554{
4338 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { 4555 if (is_guest_mode(vcpu)) {
4339 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4556 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4340 if (to_vmx(vcpu)->nested.nested_run_pending || 4557
4341 (vmcs12->idt_vectoring_info_field & 4558 if (to_vmx(vcpu)->nested.nested_run_pending)
4342 VECTORING_INFO_VALID_MASK))
4343 return 0; 4559 return 0;
4344 nested_vmx_vmexit(vcpu); 4560 if (nested_exit_on_intr(vcpu)) {
4345 vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; 4561 nested_vmx_vmexit(vcpu);
4346 vmcs12->vm_exit_intr_info = 0; 4562 vmcs12->vm_exit_reason =
4347 /* fall through to normal code, but now in L1, not L2 */ 4563 EXIT_REASON_EXTERNAL_INTERRUPT;
4564 vmcs12->vm_exit_intr_info = 0;
4565 /*
4566 * fall through to normal code, but now in L1, not L2
4567 */
4568 }
4348 } 4569 }
4349 4570
4350 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 4571 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -4362,7 +4583,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4362 .flags = 0, 4583 .flags = 0,
4363 }; 4584 };
4364 4585
4365 ret = kvm_set_memory_region(kvm, &tss_mem, false); 4586 ret = kvm_set_memory_region(kvm, &tss_mem);
4366 if (ret) 4587 if (ret)
4367 return ret; 4588 return ret;
4368 kvm->arch.tss_addr = addr; 4589 kvm->arch.tss_addr = addr;
@@ -4603,34 +4824,50 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4603/* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 4824/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4604static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 4825static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4605{ 4826{
4606 if (to_vmx(vcpu)->nested.vmxon &&
4607 ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
4608 return 1;
4609
4610 if (is_guest_mode(vcpu)) { 4827 if (is_guest_mode(vcpu)) {
4828 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4829 unsigned long orig_val = val;
4830
4611 /* 4831 /*
4612 * We get here when L2 changed cr0 in a way that did not change 4832 * We get here when L2 changed cr0 in a way that did not change
4613 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 4833 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
4614 * but did change L0 shadowed bits. This can currently happen 4834 * but did change L0 shadowed bits. So we first calculate the
4615 * with the TS bit: L0 may want to leave TS on (for lazy fpu 4835 * effective cr0 value that L1 would like to write into the
4616 * loading) while pretending to allow the guest to change it. 4836 * hardware. It consists of the L2-owned bits from the new
4837 * value combined with the L1-owned bits from L1's guest_cr0.
4617 */ 4838 */
4618 if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) | 4839 val = (val & ~vmcs12->cr0_guest_host_mask) |
4619 (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits))) 4840 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
4841
4842 /* TODO: will have to take unrestricted guest mode into
4843 * account */
4844 if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)
4620 return 1; 4845 return 1;
4621 vmcs_writel(CR0_READ_SHADOW, val); 4846
4847 if (kvm_set_cr0(vcpu, val))
4848 return 1;
4849 vmcs_writel(CR0_READ_SHADOW, orig_val);
4622 return 0; 4850 return 0;
4623 } else 4851 } else {
4852 if (to_vmx(vcpu)->nested.vmxon &&
4853 ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
4854 return 1;
4624 return kvm_set_cr0(vcpu, val); 4855 return kvm_set_cr0(vcpu, val);
4856 }
4625} 4857}
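
The rewritten handle_set_cr0() first reconstructs the CR0 value L1 intends to see: bits covered by cr0_guest_host_mask are taken from L1's guest_cr0 shadow, everything else from the value L2 just wrote. A standalone sketch of that bit merge, with made-up example values:

/* Minimal sketch of the bit merge above. Bits set in guest_host_mask are
 * owned by L1 (taken from its guest_cr0 shadow); the rest come from the
 * value L2 just tried to write. Example values are invented. */
#include <stdint.h>
#include <stdio.h>

static uint64_t effective_cr0(uint64_t l2_val, uint64_t guest_host_mask,
			      uint64_t l1_guest_cr0)
{
	return (l2_val & ~guest_host_mask) | (l1_guest_cr0 & guest_host_mask);
}

int main(void)
{
	/* example: L1 shadows only CR0.TS (bit 3); L2 tries to clear it */
	uint64_t mask = 1u << 3;

	printf("0x%llx\n", (unsigned long long)
	       effective_cr0(0x80000031ull, mask, 0x80000039ull));
	return 0;
}
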
4626 4858
4627static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 4859static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
4628{ 4860{
4629 if (is_guest_mode(vcpu)) { 4861 if (is_guest_mode(vcpu)) {
4630 if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) | 4862 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4631 (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits))) 4863 unsigned long orig_val = val;
4864
4865 /* analogously to handle_set_cr0 */
4866 val = (val & ~vmcs12->cr4_guest_host_mask) |
4867 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
4868 if (kvm_set_cr4(vcpu, val))
4632 return 1; 4869 return 1;
4633 vmcs_writel(CR4_READ_SHADOW, val); 4870 vmcs_writel(CR4_READ_SHADOW, orig_val);
4634 return 0; 4871 return 0;
4635 } else 4872 } else
4636 return kvm_set_cr4(vcpu, val); 4873 return kvm_set_cr4(vcpu, val);
@@ -5183,7 +5420,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5183 if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) 5420 if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
5184 return 1; 5421 return 1;
5185 5422
5186 err = emulate_instruction(vcpu, 0); 5423 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
5187 5424
5188 if (err == EMULATE_DO_MMIO) { 5425 if (err == EMULATE_DO_MMIO) {
5189 ret = 0; 5426 ret = 0;
@@ -5259,8 +5496,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
5259 } 5496 }
5260 5497
5261 /* Create a new VMCS */ 5498 /* Create a new VMCS */
5262 item = (struct vmcs02_list *) 5499 item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
5263 kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
5264 if (!item) 5500 if (!item)
5265 return NULL; 5501 return NULL;
5266 item->vmcs02.vmcs = alloc_vmcs(); 5502 item->vmcs02.vmcs = alloc_vmcs();
@@ -5309,6 +5545,9 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
5309 free_loaded_vmcs(&vmx->vmcs01); 5545 free_loaded_vmcs(&vmx->vmcs01);
5310} 5546}
5311 5547
5548static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5549 u32 vm_instruction_error);
5550
5312/* 5551/*
5313 * Emulate the VMXON instruction. 5552 * Emulate the VMXON instruction.
5314 * Currently, we just remember that VMX is active, and do not save or even 5553 * Currently, we just remember that VMX is active, and do not save or even
@@ -5321,6 +5560,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5321{ 5560{
5322 struct kvm_segment cs; 5561 struct kvm_segment cs;
5323 struct vcpu_vmx *vmx = to_vmx(vcpu); 5562 struct vcpu_vmx *vmx = to_vmx(vcpu);
5563 struct vmcs *shadow_vmcs;
5324 5564
5325 /* The Intel VMX Instruction Reference lists a bunch of bits that 5565 /* The Intel VMX Instruction Reference lists a bunch of bits that
5326 * are prerequisite to running VMXON, most notably cr4.VMXE must be 5566 * are prerequisite to running VMXON, most notably cr4.VMXE must be
@@ -5344,6 +5584,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5344 kvm_inject_gp(vcpu, 0); 5584 kvm_inject_gp(vcpu, 0);
5345 return 1; 5585 return 1;
5346 } 5586 }
5587 if (vmx->nested.vmxon) {
5588 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
5589 skip_emulated_instruction(vcpu);
5590 return 1;
5591 }
5592 if (enable_shadow_vmcs) {
5593 shadow_vmcs = alloc_vmcs();
5594 if (!shadow_vmcs)
5595 return -ENOMEM;
5596 /* mark vmcs as shadow */
5597 shadow_vmcs->revision_id |= (1u << 31);
5598 /* init shadow vmcs */
5599 vmcs_clear(shadow_vmcs);
5600 vmx->nested.current_shadow_vmcs = shadow_vmcs;
5601 }
5347 5602
5348 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); 5603 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
5349 vmx->nested.vmcs02_num = 0; 5604 vmx->nested.vmcs02_num = 0;
@@ -5384,6 +5639,25 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
5384 return 1; 5639 return 1;
5385} 5640}
5386 5641
5642static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
5643{
5644 u32 exec_control;
5645 if (enable_shadow_vmcs) {
5646 if (vmx->nested.current_vmcs12 != NULL) {
5647 /* copy to memory all shadowed fields in case
5648 they were modified */
5649 copy_shadow_to_vmcs12(vmx);
5650 vmx->nested.sync_shadow_vmcs = false;
5651 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
5652 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
5653 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
5654 vmcs_write64(VMCS_LINK_POINTER, -1ull);
5655 }
5656 }
5657 kunmap(vmx->nested.current_vmcs12_page);
5658 nested_release_page(vmx->nested.current_vmcs12_page);
5659}
5660
5387/* 5661/*
5388 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 5662 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
5389 * just stops using VMX. 5663 * just stops using VMX.
@@ -5394,11 +5668,12 @@ static void free_nested(struct vcpu_vmx *vmx)
5394 return; 5668 return;
5395 vmx->nested.vmxon = false; 5669 vmx->nested.vmxon = false;
5396 if (vmx->nested.current_vmptr != -1ull) { 5670 if (vmx->nested.current_vmptr != -1ull) {
5397 kunmap(vmx->nested.current_vmcs12_page); 5671 nested_release_vmcs12(vmx);
5398 nested_release_page(vmx->nested.current_vmcs12_page);
5399 vmx->nested.current_vmptr = -1ull; 5672 vmx->nested.current_vmptr = -1ull;
5400 vmx->nested.current_vmcs12 = NULL; 5673 vmx->nested.current_vmcs12 = NULL;
5401 } 5674 }
5675 if (enable_shadow_vmcs)
5676 free_vmcs(vmx->nested.current_shadow_vmcs);
5402 /* Unpin physical memory we referred to in current vmcs02 */ 5677 /* Unpin physical memory we referred to in current vmcs02 */
5403 if (vmx->nested.apic_access_page) { 5678 if (vmx->nested.apic_access_page) {
5404 nested_release_page(vmx->nested.apic_access_page); 5679 nested_release_page(vmx->nested.apic_access_page);
@@ -5507,6 +5782,10 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5507 X86_EFLAGS_SF | X86_EFLAGS_OF)) 5782 X86_EFLAGS_SF | X86_EFLAGS_OF))
5508 | X86_EFLAGS_ZF); 5783 | X86_EFLAGS_ZF);
5509 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 5784 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5785 /*
5786 * We don't need to force a shadow sync because
5787 * VM_INSTRUCTION_ERROR is not shadowed
5788 */
5510} 5789}
5511 5790
5512/* Emulate the VMCLEAR instruction */ 5791/* Emulate the VMCLEAR instruction */
@@ -5539,8 +5818,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
5539 } 5818 }
5540 5819
5541 if (vmptr == vmx->nested.current_vmptr) { 5820 if (vmptr == vmx->nested.current_vmptr) {
5542 kunmap(vmx->nested.current_vmcs12_page); 5821 nested_release_vmcs12(vmx);
5543 nested_release_page(vmx->nested.current_vmcs12_page);
5544 vmx->nested.current_vmptr = -1ull; 5822 vmx->nested.current_vmptr = -1ull;
5545 vmx->nested.current_vmcs12 = NULL; 5823 vmx->nested.current_vmcs12 = NULL;
5546 } 5824 }
@@ -5639,6 +5917,111 @@ static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
5639 } 5917 }
5640} 5918}
5641 5919
5920
5921static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu,
5922 unsigned long field, u64 field_value){
5923 short offset = vmcs_field_to_offset(field);
5924 char *p = ((char *) get_vmcs12(vcpu)) + offset;
5925 if (offset < 0)
5926 return false;
5927
5928 switch (vmcs_field_type(field)) {
5929 case VMCS_FIELD_TYPE_U16:
5930 *(u16 *)p = field_value;
5931 return true;
5932 case VMCS_FIELD_TYPE_U32:
5933 *(u32 *)p = field_value;
5934 return true;
5935 case VMCS_FIELD_TYPE_U64:
5936 *(u64 *)p = field_value;
5937 return true;
5938 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5939 *(natural_width *)p = field_value;
5940 return true;
5941 default:
5942 return false; /* can never happen. */
5943 }
5944
5945}
5946
5947static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
5948{
5949 int i;
5950 unsigned long field;
5951 u64 field_value;
5952 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
5953 unsigned long *fields = (unsigned long *)shadow_read_write_fields;
5954 int num_fields = max_shadow_read_write_fields;
5955
5956 vmcs_load(shadow_vmcs);
5957
5958 for (i = 0; i < num_fields; i++) {
5959 field = fields[i];
5960 switch (vmcs_field_type(field)) {
5961 case VMCS_FIELD_TYPE_U16:
5962 field_value = vmcs_read16(field);
5963 break;
5964 case VMCS_FIELD_TYPE_U32:
5965 field_value = vmcs_read32(field);
5966 break;
5967 case VMCS_FIELD_TYPE_U64:
5968 field_value = vmcs_read64(field);
5969 break;
5970 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5971 field_value = vmcs_readl(field);
5972 break;
5973 }
5974 vmcs12_write_any(&vmx->vcpu, field, field_value);
5975 }
5976
5977 vmcs_clear(shadow_vmcs);
5978 vmcs_load(vmx->loaded_vmcs->vmcs);
5979}
5980
5981static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
5982{
5983 unsigned long *fields[] = {
5984 (unsigned long *)shadow_read_write_fields,
5985 (unsigned long *)shadow_read_only_fields
5986 };
5987 int num_lists = ARRAY_SIZE(fields);
5988 int max_fields[] = {
5989 max_shadow_read_write_fields,
5990 max_shadow_read_only_fields
5991 };
5992 int i, q;
5993 unsigned long field;
5994 u64 field_value = 0;
5995 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
5996
5997 vmcs_load(shadow_vmcs);
5998
5999 for (q = 0; q < num_lists; q++) {
6000 for (i = 0; i < max_fields[q]; i++) {
6001 field = fields[q][i];
6002 vmcs12_read_any(&vmx->vcpu, field, &field_value);
6003
6004 switch (vmcs_field_type(field)) {
6005 case VMCS_FIELD_TYPE_U16:
6006 vmcs_write16(field, (u16)field_value);
6007 break;
6008 case VMCS_FIELD_TYPE_U32:
6009 vmcs_write32(field, (u32)field_value);
6010 break;
6011 case VMCS_FIELD_TYPE_U64:
6012 vmcs_write64(field, (u64)field_value);
6013 break;
6014 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
6015 vmcs_writel(field, (long)field_value);
6016 break;
6017 }
6018 }
6019 }
6020
6021 vmcs_clear(shadow_vmcs);
6022 vmcs_load(vmx->loaded_vmcs->vmcs);
6023}
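
Both copy routines walk a table of shadowed field encodings and move each value at its natural width (16, 32, 64 bits or pointer-sized). A userspace sketch of the same width-dispatched copy, with an invented field table standing in for the shadow field arrays; the real code reads from the shadow VMCS and writes into struct vmcs12:

/* Sketch of a width-dispatched field copy; field table and offsets invented. */
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

enum field_type { TYPE_U16, TYPE_U32, TYPE_U64 };

struct field_desc {
	enum field_type type;
	size_t offset;                   /* offset inside the destination blob */
};

static void write_field(uint8_t *dst, const struct field_desc *f, uint64_t val)
{
	switch (f->type) {
	case TYPE_U16: { uint16_t v = (uint16_t)val; memcpy(dst + f->offset, &v, 2); break; }
	case TYPE_U32: { uint32_t v = (uint32_t)val; memcpy(dst + f->offset, &v, 4); break; }
	case TYPE_U64: memcpy(dst + f->offset, &val, 8); break;
	}
}

int main(void)
{
	uint8_t vmcs12_blob[32] = {0};
	struct field_desc fields[] = { {TYPE_U16, 0}, {TYPE_U32, 4}, {TYPE_U64, 8} };
	uint64_t values[] = { 0xbeef, 0xdeadbeef, 0x1122334455667788ull };

	for (size_t i = 0; i < 3; i++)   /* same shape as the shadow field loop */
		write_field(vmcs12_blob, &fields[i], values[i]);

	printf("first u16: 0x%x\n", *(uint16_t *)vmcs12_blob);
	return 0;
}
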
6024
5642/* 6025/*
5643 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was 6026 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
5644 * used before) all generate the same failure when it is missing. 6027 * used before) all generate the same failure when it is missing.
@@ -5703,8 +6086,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
5703 gva_t gva; 6086 gva_t gva;
5704 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 6087 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5705 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6088 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5706 char *p;
5707 short offset;
5708 /* The value to write might be 32 or 64 bits, depending on L1's long 6089 /* The value to write might be 32 or 64 bits, depending on L1's long
5709 * mode, and eventually we need to write that into a field of several 6090 * mode, and eventually we need to write that into a field of several
5710 * possible lengths. The code below first zero-extends the value to 64 6091 * possible lengths. The code below first zero-extends the value to 64
@@ -5741,28 +6122,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
5741 return 1; 6122 return 1;
5742 } 6123 }
5743 6124
5744 offset = vmcs_field_to_offset(field); 6125 if (!vmcs12_write_any(vcpu, field, field_value)) {
5745 if (offset < 0) {
5746 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5747 skip_emulated_instruction(vcpu);
5748 return 1;
5749 }
5750 p = ((char *) get_vmcs12(vcpu)) + offset;
5751
5752 switch (vmcs_field_type(field)) {
5753 case VMCS_FIELD_TYPE_U16:
5754 *(u16 *)p = field_value;
5755 break;
5756 case VMCS_FIELD_TYPE_U32:
5757 *(u32 *)p = field_value;
5758 break;
5759 case VMCS_FIELD_TYPE_U64:
5760 *(u64 *)p = field_value;
5761 break;
5762 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5763 *(natural_width *)p = field_value;
5764 break;
5765 default:
5766 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 6126 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5767 skip_emulated_instruction(vcpu); 6127 skip_emulated_instruction(vcpu);
5768 return 1; 6128 return 1;
@@ -5780,6 +6140,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
5780 gva_t gva; 6140 gva_t gva;
5781 gpa_t vmptr; 6141 gpa_t vmptr;
5782 struct x86_exception e; 6142 struct x86_exception e;
6143 u32 exec_control;
5783 6144
5784 if (!nested_vmx_check_permission(vcpu)) 6145 if (!nested_vmx_check_permission(vcpu))
5785 return 1; 6146 return 1;
@@ -5818,14 +6179,20 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
5818 skip_emulated_instruction(vcpu); 6179 skip_emulated_instruction(vcpu);
5819 return 1; 6180 return 1;
5820 } 6181 }
5821 if (vmx->nested.current_vmptr != -1ull) { 6182 if (vmx->nested.current_vmptr != -1ull)
5822 kunmap(vmx->nested.current_vmcs12_page); 6183 nested_release_vmcs12(vmx);
5823 nested_release_page(vmx->nested.current_vmcs12_page);
5824 }
5825 6184
5826 vmx->nested.current_vmptr = vmptr; 6185 vmx->nested.current_vmptr = vmptr;
5827 vmx->nested.current_vmcs12 = new_vmcs12; 6186 vmx->nested.current_vmcs12 = new_vmcs12;
5828 vmx->nested.current_vmcs12_page = page; 6187 vmx->nested.current_vmcs12_page = page;
6188 if (enable_shadow_vmcs) {
6189 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6190 exec_control |= SECONDARY_EXEC_SHADOW_VMCS;
6191 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
6192 vmcs_write64(VMCS_LINK_POINTER,
6193 __pa(vmx->nested.current_shadow_vmcs));
6194 vmx->nested.sync_shadow_vmcs = true;
6195 }
5829 } 6196 }
5830 6197
5831 nested_vmx_succeed(vcpu); 6198 nested_vmx_succeed(vcpu);
@@ -5908,6 +6275,52 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
5908static const int kvm_vmx_max_exit_handlers = 6275static const int kvm_vmx_max_exit_handlers =
5909 ARRAY_SIZE(kvm_vmx_exit_handlers); 6276 ARRAY_SIZE(kvm_vmx_exit_handlers);
5910 6277
6278static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
6279 struct vmcs12 *vmcs12)
6280{
6281 unsigned long exit_qualification;
6282 gpa_t bitmap, last_bitmap;
6283 unsigned int port;
6284 int size;
6285 u8 b;
6286
6287 if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING))
6288 return 1;
6289
6290 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
6291 return 0;
6292
6293 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6294
6295 port = exit_qualification >> 16;
6296 size = (exit_qualification & 7) + 1;
6297
6298 last_bitmap = (gpa_t)-1;
6299 b = -1;
6300
6301 while (size > 0) {
6302 if (port < 0x8000)
6303 bitmap = vmcs12->io_bitmap_a;
6304 else if (port < 0x10000)
6305 bitmap = vmcs12->io_bitmap_b;
6306 else
6307 return 1;
6308 bitmap += (port & 0x7fff) / 8;
6309
6310 if (last_bitmap != bitmap)
6311 if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
6312 return 1;
6313 if (b & (1 << (port & 7)))
6314 return 1;
6315
6316 port++;
6317 size--;
6318 last_bitmap = bitmap;
6319 }
6320
6321 return 0;
6322}
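
nested_vmx_exit_handled_io() decides whether an L2 I/O exit belongs to L1 by consulting L1's two 4 KB I/O bitmaps: bitmap A covers ports 0x0000-0x7fff, bitmap B the rest, one bit per port, and a multi-byte access is forwarded if any covered port has its bit set. A self-contained sketch of that lookup on in-memory bitmaps (the real code fetches the bitmap bytes from guest memory with kvm_read_guest):

/* Sketch only: in-memory bitmaps stand in for the guest-physical ones. */
#include <stdint.h>
#include <stdio.h>

static uint8_t io_bitmap_a[0x1000];   /* ports 0x0000-0x7fff, one bit per port */
static uint8_t io_bitmap_b[0x1000];   /* ports 0x8000-0xffff */

static int io_access_intercepted(unsigned int port, int size)
{
	while (size > 0) {
		const uint8_t *bitmap;

		if (port < 0x8000)
			bitmap = io_bitmap_a;
		else if (port < 0x10000)
			bitmap = io_bitmap_b;
		else
			return 1;                        /* out of range: intercept */

		if (bitmap[(port & 0x7fff) / 8] & (1 << (port & 7)))
			return 1;                        /* bit set: L1 wants the exit */
		port++;
		size--;
	}
	return 0;
}

int main(void)
{
	io_bitmap_a[0x60 / 8] |= 1 << (0x60 & 7);        /* intercept port 0x60 */
	printf("outb 0x60 -> %d, outw 0x5f -> %d\n",
	       io_access_intercepted(0x60, 1), io_access_intercepted(0x5f, 2));
	return 0;
}
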
6323
5911/* 6324/*
5912 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6325
5913 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6326 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
@@ -5939,7 +6352,8 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5939 /* Then read the msr_index'th bit from this bitmap: */ 6352 /* Then read the msr_index'th bit from this bitmap: */
5940 if (msr_index < 1024*8) { 6353 if (msr_index < 1024*8) {
5941 unsigned char b; 6354 unsigned char b;
5942 kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1); 6355 if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
6356 return 1;
5943 return 1 & (b >> (msr_index & 7)); 6357 return 1 & (b >> (msr_index & 7));
5944 } else 6358 } else
5945 return 1; /* let L1 handle the wrong parameter */ 6359 return 1; /* let L1 handle the wrong parameter */
@@ -6033,10 +6447,10 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
6033 */ 6447 */
6034static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) 6448static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6035{ 6449{
6036 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
6037 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 6450 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6038 struct vcpu_vmx *vmx = to_vmx(vcpu); 6451 struct vcpu_vmx *vmx = to_vmx(vcpu);
6039 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6452 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6453 u32 exit_reason = vmx->exit_reason;
6040 6454
6041 if (vmx->nested.nested_run_pending) 6455 if (vmx->nested.nested_run_pending)
6042 return 0; 6456 return 0;
@@ -6060,14 +6474,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6060 case EXIT_REASON_TRIPLE_FAULT: 6474 case EXIT_REASON_TRIPLE_FAULT:
6061 return 1; 6475 return 1;
6062 case EXIT_REASON_PENDING_INTERRUPT: 6476 case EXIT_REASON_PENDING_INTERRUPT:
6477 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
6063 case EXIT_REASON_NMI_WINDOW: 6478 case EXIT_REASON_NMI_WINDOW:
6064 /* 6479 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
6065 * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
6066 * (aka Interrupt Window Exiting) only when L1 turned it on,
6067 * so if we got a PENDING_INTERRUPT exit, this must be for L1.
6068 * Same for NMI Window Exiting.
6069 */
6070 return 1;
6071 case EXIT_REASON_TASK_SWITCH: 6480 case EXIT_REASON_TASK_SWITCH:
6072 return 1; 6481 return 1;
6073 case EXIT_REASON_CPUID: 6482 case EXIT_REASON_CPUID:
@@ -6097,8 +6506,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6097 case EXIT_REASON_DR_ACCESS: 6506 case EXIT_REASON_DR_ACCESS:
6098 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6507 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
6099 case EXIT_REASON_IO_INSTRUCTION: 6508 case EXIT_REASON_IO_INSTRUCTION:
6100 /* TODO: support IO bitmaps */ 6509 return nested_vmx_exit_handled_io(vcpu, vmcs12);
6101 return 1;
6102 case EXIT_REASON_MSR_READ: 6510 case EXIT_REASON_MSR_READ:
6103 case EXIT_REASON_MSR_WRITE: 6511 case EXIT_REASON_MSR_WRITE:
6104 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6512 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
@@ -6122,6 +6530,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6122 case EXIT_REASON_EPT_VIOLATION: 6530 case EXIT_REASON_EPT_VIOLATION:
6123 case EXIT_REASON_EPT_MISCONFIG: 6531 case EXIT_REASON_EPT_MISCONFIG:
6124 return 0; 6532 return 0;
6533 case EXIT_REASON_PREEMPTION_TIMER:
6534 return vmcs12->pin_based_vm_exec_control &
6535 PIN_BASED_VMX_PREEMPTION_TIMER;
6125 case EXIT_REASON_WBINVD: 6536 case EXIT_REASON_WBINVD:
6126 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6537 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
6127 case EXIT_REASON_XSETBV: 6538 case EXIT_REASON_XSETBV:
@@ -6316,6 +6727,9 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6316 6727
6317static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 6728static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6318{ 6729{
6730 if (!vmx_vm_has_apicv(vcpu->kvm))
6731 return;
6732
6319 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 6733 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6320 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 6734 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6321 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 6735 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
@@ -6346,6 +6760,52 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
6346 } 6760 }
6347} 6761}
6348 6762
6763static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
6764{
6765 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6766
6767 /*
6768 * If external interrupt exists, IF bit is set in rflags/eflags on the
6769 * interrupt stack frame, and interrupt will be enabled on a return
6770 * from interrupt handler.
6771 */
6772 if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
6773 == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
6774 unsigned int vector;
6775 unsigned long entry;
6776 gate_desc *desc;
6777 struct vcpu_vmx *vmx = to_vmx(vcpu);
6778#ifdef CONFIG_X86_64
6779 unsigned long tmp;
6780#endif
6781
6782 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
6783 desc = (gate_desc *)vmx->host_idt_base + vector;
6784 entry = gate_offset(*desc);
6785 asm volatile(
6786#ifdef CONFIG_X86_64
6787 "mov %%" _ASM_SP ", %[sp]\n\t"
6788 "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
6789 "push $%c[ss]\n\t"
6790 "push %[sp]\n\t"
6791#endif
6792 "pushf\n\t"
6793 "orl $0x200, (%%" _ASM_SP ")\n\t"
6794 __ASM_SIZE(push) " $%c[cs]\n\t"
6795 "call *%[entry]\n\t"
6796 :
6797#ifdef CONFIG_X86_64
6798 [sp]"=&r"(tmp)
6799#endif
6800 :
6801 [entry]"r"(entry),
6802 [ss]"i"(__KERNEL_DS),
6803 [cs]"i"(__KERNEL_CS)
6804 );
6805 } else
6806 local_irq_enable();
6807}
6808
6349static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 6809static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6350{ 6810{
6351 u32 exit_intr_info; 6811 u32 exit_intr_info;
@@ -6388,7 +6848,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6388 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 6848 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
6389} 6849}
6390 6850
6391static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, 6851static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
6392 u32 idt_vectoring_info, 6852 u32 idt_vectoring_info,
6393 int instr_len_field, 6853 int instr_len_field,
6394 int error_code_field) 6854 int error_code_field)
@@ -6399,46 +6859,43 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
6399 6859
6400 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 6860 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6401 6861
6402 vmx->vcpu.arch.nmi_injected = false; 6862 vcpu->arch.nmi_injected = false;
6403 kvm_clear_exception_queue(&vmx->vcpu); 6863 kvm_clear_exception_queue(vcpu);
6404 kvm_clear_interrupt_queue(&vmx->vcpu); 6864 kvm_clear_interrupt_queue(vcpu);
6405 6865
6406 if (!idtv_info_valid) 6866 if (!idtv_info_valid)
6407 return; 6867 return;
6408 6868
6409 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 6869 kvm_make_request(KVM_REQ_EVENT, vcpu);
6410 6870
6411 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 6871 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
6412 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 6872 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
6413 6873
6414 switch (type) { 6874 switch (type) {
6415 case INTR_TYPE_NMI_INTR: 6875 case INTR_TYPE_NMI_INTR:
6416 vmx->vcpu.arch.nmi_injected = true; 6876 vcpu->arch.nmi_injected = true;
6417 /* 6877 /*
6418 * SDM 3: 27.7.1.2 (September 2008) 6878 * SDM 3: 27.7.1.2 (September 2008)
6419 * Clear bit "block by NMI" before VM entry if a NMI 6879 * Clear bit "block by NMI" before VM entry if a NMI
6420 * delivery faulted. 6880 * delivery faulted.
6421 */ 6881 */
6422 vmx_set_nmi_mask(&vmx->vcpu, false); 6882 vmx_set_nmi_mask(vcpu, false);
6423 break; 6883 break;
6424 case INTR_TYPE_SOFT_EXCEPTION: 6884 case INTR_TYPE_SOFT_EXCEPTION:
6425 vmx->vcpu.arch.event_exit_inst_len = 6885 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
6426 vmcs_read32(instr_len_field);
6427 /* fall through */ 6886 /* fall through */
6428 case INTR_TYPE_HARD_EXCEPTION: 6887 case INTR_TYPE_HARD_EXCEPTION:
6429 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 6888 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
6430 u32 err = vmcs_read32(error_code_field); 6889 u32 err = vmcs_read32(error_code_field);
6431 kvm_queue_exception_e(&vmx->vcpu, vector, err); 6890 kvm_queue_exception_e(vcpu, vector, err);
6432 } else 6891 } else
6433 kvm_queue_exception(&vmx->vcpu, vector); 6892 kvm_queue_exception(vcpu, vector);
6434 break; 6893 break;
6435 case INTR_TYPE_SOFT_INTR: 6894 case INTR_TYPE_SOFT_INTR:
6436 vmx->vcpu.arch.event_exit_inst_len = 6895 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
6437 vmcs_read32(instr_len_field);
6438 /* fall through */ 6896 /* fall through */
6439 case INTR_TYPE_EXT_INTR: 6897 case INTR_TYPE_EXT_INTR:
6440 kvm_queue_interrupt(&vmx->vcpu, vector, 6898 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
6441 type == INTR_TYPE_SOFT_INTR);
6442 break; 6899 break;
6443 default: 6900 default:
6444 break; 6901 break;
@@ -6447,18 +6904,14 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
6447 6904
6448static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 6905static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
6449{ 6906{
6450 if (is_guest_mode(&vmx->vcpu)) 6907 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
6451 return;
6452 __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
6453 VM_EXIT_INSTRUCTION_LEN, 6908 VM_EXIT_INSTRUCTION_LEN,
6454 IDT_VECTORING_ERROR_CODE); 6909 IDT_VECTORING_ERROR_CODE);
6455} 6910}
6456 6911
6457static void vmx_cancel_injection(struct kvm_vcpu *vcpu) 6912static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
6458{ 6913{
6459 if (is_guest_mode(vcpu)) 6914 __vmx_complete_interrupts(vcpu,
6460 return;
6461 __vmx_complete_interrupts(to_vmx(vcpu),
6462 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6915 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6463 VM_ENTRY_INSTRUCTION_LEN, 6916 VM_ENTRY_INSTRUCTION_LEN,
6464 VM_ENTRY_EXCEPTION_ERROR_CODE); 6917 VM_ENTRY_EXCEPTION_ERROR_CODE);
@@ -6489,21 +6942,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6489 struct vcpu_vmx *vmx = to_vmx(vcpu); 6942 struct vcpu_vmx *vmx = to_vmx(vcpu);
6490 unsigned long debugctlmsr; 6943 unsigned long debugctlmsr;
6491 6944
6492 if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
6493 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6494 if (vmcs12->idt_vectoring_info_field &
6495 VECTORING_INFO_VALID_MASK) {
6496 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
6497 vmcs12->idt_vectoring_info_field);
6498 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
6499 vmcs12->vm_exit_instruction_len);
6500 if (vmcs12->idt_vectoring_info_field &
6501 VECTORING_INFO_DELIVER_CODE_MASK)
6502 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
6503 vmcs12->idt_vectoring_error_code);
6504 }
6505 }
6506
6507 /* Record the guest's net vcpu time for enforced NMI injections. */ 6945 /* Record the guest's net vcpu time for enforced NMI injections. */
6508 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 6946 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
6509 vmx->entry_time = ktime_get(); 6947 vmx->entry_time = ktime_get();
@@ -6513,6 +6951,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6513 if (vmx->emulation_required) 6951 if (vmx->emulation_required)
6514 return; 6952 return;
6515 6953
6954 if (vmx->nested.sync_shadow_vmcs) {
6955 copy_vmcs12_to_shadow(vmx);
6956 vmx->nested.sync_shadow_vmcs = false;
6957 }
6958
6516 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 6959 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
6517 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 6960 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
6518 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) 6961 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
@@ -6662,17 +7105,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6662 7105
6663 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 7106 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
6664 7107
6665 if (is_guest_mode(vcpu)) {
6666 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6667 vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
6668 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
6669 vmcs12->idt_vectoring_error_code =
6670 vmcs_read32(IDT_VECTORING_ERROR_CODE);
6671 vmcs12->vm_exit_instruction_len =
6672 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6673 }
6674 }
6675
6676 vmx->loaded_vmcs->launched = 1; 7108 vmx->loaded_vmcs->launched = 1;
6677 7109
6678 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 7110 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
@@ -6734,10 +7166,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
6734 put_cpu(); 7166 put_cpu();
6735 if (err) 7167 if (err)
6736 goto free_vmcs; 7168 goto free_vmcs;
6737 if (vm_need_virtualize_apic_accesses(kvm)) 7169 if (vm_need_virtualize_apic_accesses(kvm)) {
6738 err = alloc_apic_access_page(kvm); 7170 err = alloc_apic_access_page(kvm);
6739 if (err) 7171 if (err)
6740 goto free_vmcs; 7172 goto free_vmcs;
7173 }
6741 7174
6742 if (enable_ept) { 7175 if (enable_ept) {
6743 if (!kvm->arch.ept_identity_map_addr) 7176 if (!kvm->arch.ept_identity_map_addr)
@@ -6931,9 +7364,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6931 vmcs12->vm_entry_instruction_len); 7364 vmcs12->vm_entry_instruction_len);
6932 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 7365 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
6933 vmcs12->guest_interruptibility_info); 7366 vmcs12->guest_interruptibility_info);
6934 vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
6935 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 7367 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
6936 vmcs_writel(GUEST_DR7, vmcs12->guest_dr7); 7368 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
6937 vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); 7369 vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
6938 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 7370 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
6939 vmcs12->guest_pending_dbg_exceptions); 7371 vmcs12->guest_pending_dbg_exceptions);
@@ -6946,6 +7378,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6946 (vmcs_config.pin_based_exec_ctrl | 7378 (vmcs_config.pin_based_exec_ctrl |
6947 vmcs12->pin_based_vm_exec_control)); 7379 vmcs12->pin_based_vm_exec_control));
6948 7380
7381 if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
7382 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
7383 vmcs12->vmx_preemption_timer_value);
7384
6949 /* 7385 /*
6950 * Whether page-faults are trapped is determined by a combination of 7386 * Whether page-faults are trapped is determined by a combination of
6951 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. 7387 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
@@ -7016,7 +7452,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7016 * Other fields are different per CPU, and will be set later when 7452 * Other fields are different per CPU, and will be set later when
7017 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. 7453 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
7018 */ 7454 */
7019 vmx_set_constant_host_state(); 7455 vmx_set_constant_host_state(vmx);
7020 7456
7021 /* 7457 /*
7022 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before 7458 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
@@ -7082,7 +7518,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7082 7518
7083 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) 7519 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
7084 vcpu->arch.efer = vmcs12->guest_ia32_efer; 7520 vcpu->arch.efer = vmcs12->guest_ia32_efer;
7085 if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 7521 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
7086 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 7522 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
7087 else 7523 else
7088 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 7524 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
@@ -7121,6 +7557,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
7121 struct vcpu_vmx *vmx = to_vmx(vcpu); 7557 struct vcpu_vmx *vmx = to_vmx(vcpu);
7122 int cpu; 7558 int cpu;
7123 struct loaded_vmcs *vmcs02; 7559 struct loaded_vmcs *vmcs02;
7560 bool ia32e;
7124 7561
7125 if (!nested_vmx_check_permission(vcpu) || 7562 if (!nested_vmx_check_permission(vcpu) ||
7126 !nested_vmx_check_vmcs12(vcpu)) 7563 !nested_vmx_check_vmcs12(vcpu))
@@ -7129,6 +7566,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
7129 skip_emulated_instruction(vcpu); 7566 skip_emulated_instruction(vcpu);
7130 vmcs12 = get_vmcs12(vcpu); 7567 vmcs12 = get_vmcs12(vcpu);
7131 7568
7569 if (enable_shadow_vmcs)
7570 copy_shadow_to_vmcs12(vmx);
7571
7132 /* 7572 /*
7133 * The nested entry process starts with enforcing various prerequisites 7573 * The nested entry process starts with enforcing various prerequisites
7134 * on vmcs12 as required by the Intel SDM, and act appropriately when 7574 * on vmcs12 as required by the Intel SDM, and act appropriately when
@@ -7146,6 +7586,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
7146 return 1; 7586 return 1;
7147 } 7587 }
7148 7588
7589 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE) {
7590 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
7591 return 1;
7592 }
7593
7149 if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && 7594 if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
7150 !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) { 7595 !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
7151 /*TODO: Also verify bits beyond physical address width are 0*/ 7596 /*TODO: Also verify bits beyond physical address width are 0*/
@@ -7204,6 +7649,45 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
7204 } 7649 }
7205 7650
7206 /* 7651 /*
7652 * If the load IA32_EFER VM-entry control is 1, the following checks
7653 * are performed on the field for the IA32_EFER MSR:
7654 * - Bits reserved in the IA32_EFER MSR must be 0.
7655 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
7656 * the IA-32e mode guest VM-exit control. It must also be identical
7657 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
7658 * CR0.PG) is 1.
7659 */
7660 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) {
7661 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
7662 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
7663 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
7664 ((vmcs12->guest_cr0 & X86_CR0_PG) &&
7665 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
7666 nested_vmx_entry_failure(vcpu, vmcs12,
7667 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
7668 return 1;
7669 }
7670 }
7671
7672 /*
7673 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
7674 * IA32_EFER MSR must be 0 in the field for that register. In addition,
7675 * the values of the LMA and LME bits in the field must each be that of
7676 * the host address-space size VM-exit control.
7677 */
7678 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
7679 ia32e = (vmcs12->vm_exit_controls &
7680 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
7681 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
7682 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
7683 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
7684 nested_vmx_entry_failure(vcpu, vmcs12,
7685 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
7686 return 1;
7687 }
7688 }
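
The two checks above encode the SDM's EFER consistency rules: with the load-EFER entry control set, EFER.LMA must match the IA-32e mode guest control, and EFER.LME must match it too whenever CR0.PG is set; the exit-side check applies the same rule against the host address-space size control. A hedged, standalone sketch of the guest-side rule (constants defined locally; this is not the kernel's kvm_valid_efer()):

/* Illustrative only; bit positions follow the architectural definitions. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EFER_LME (1ull << 8)
#define EFER_LMA (1ull << 10)
#define CR0_PG   (1ull << 31)

static bool efer_matches_entry_controls(uint64_t efer, uint64_t guest_cr0,
					bool ia32e_mode_guest)
{
	if (ia32e_mode_guest != !!(efer & EFER_LMA))
		return false;                    /* LMA must mirror the control */
	if ((guest_cr0 & CR0_PG) && ia32e_mode_guest != !!(efer & EFER_LME))
		return false;                    /* with paging on, LME as well */
	return true;
}

int main(void)
{
	printf("%d %d\n",
	       efer_matches_entry_controls(EFER_LMA | EFER_LME, CR0_PG, true),
	       efer_matches_entry_controls(EFER_LMA, CR0_PG, true));
	return 0;
}
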
7689
7690 /*
7207 * We're finally done with prerequisite checking, and can start with 7691 * We're finally done with prerequisite checking, and can start with
7208 * the nested entry. 7692 * the nested entry.
7209 */ 7693 */
@@ -7223,6 +7707,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
7223 vcpu->cpu = cpu; 7707 vcpu->cpu = cpu;
7224 put_cpu(); 7708 put_cpu();
7225 7709
7710 vmx_segment_cache_clear(vmx);
7711
7226 vmcs12->launch_state = 1; 7712 vmcs12->launch_state = 1;
7227 7713
7228 prepare_vmcs02(vcpu, vmcs12); 7714 prepare_vmcs02(vcpu, vmcs12);
@@ -7273,6 +7759,48 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7273 vcpu->arch.cr4_guest_owned_bits)); 7759 vcpu->arch.cr4_guest_owned_bits));
7274} 7760}
7275 7761
7762static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
7763 struct vmcs12 *vmcs12)
7764{
7765 u32 idt_vectoring;
7766 unsigned int nr;
7767
7768 if (vcpu->arch.exception.pending) {
7769 nr = vcpu->arch.exception.nr;
7770 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
7771
7772 if (kvm_exception_is_soft(nr)) {
7773 vmcs12->vm_exit_instruction_len =
7774 vcpu->arch.event_exit_inst_len;
7775 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
7776 } else
7777 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
7778
7779 if (vcpu->arch.exception.has_error_code) {
7780 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
7781 vmcs12->idt_vectoring_error_code =
7782 vcpu->arch.exception.error_code;
7783 }
7784
7785 vmcs12->idt_vectoring_info_field = idt_vectoring;
7786 } else if (vcpu->arch.nmi_pending) {
7787 vmcs12->idt_vectoring_info_field =
7788 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
7789 } else if (vcpu->arch.interrupt.pending) {
7790 nr = vcpu->arch.interrupt.nr;
7791 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
7792
7793 if (vcpu->arch.interrupt.soft) {
7794 idt_vectoring |= INTR_TYPE_SOFT_INTR;
7795 vmcs12->vm_entry_instruction_len =
7796 vcpu->arch.event_exit_inst_len;
7797 } else
7798 idt_vectoring |= INTR_TYPE_EXT_INTR;
7799
7800 vmcs12->idt_vectoring_info_field = idt_vectoring;
7801 }
7802}
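
vmcs12_save_pending_event() re-encodes whatever event was still queued for L2 into the IDT-vectoring-info format so L1 can re-inject it. The sketch below assumes the usual VMX layout of that word (vector in bits 0-7, type in bits 8-10, error-code flag in bit 11, valid in bit 31) and is only an illustration of the packing:

/* Packing sketch; constants defined locally, not taken from kernel headers. */
#include <stdint.h>
#include <stdio.h>

#define VECTORING_VALID        (1u << 31)
#define VECTORING_DELIVER_CODE (1u << 11)
#define TYPE_NMI               (2u << 8)
#define TYPE_HARD_EXCEPTION    (3u << 8)
#define TYPE_SOFT_INTR         (4u << 8)

static uint32_t pack_vectoring_info(unsigned int vector, uint32_t type,
				    int has_error_code)
{
	uint32_t info = vector | type | VECTORING_VALID;

	if (has_error_code)
		info |= VECTORING_DELIVER_CODE;
	return info;
}

int main(void)
{
	/* pending #PF (vector 14) with an error code */
	printf("0x%08x\n", pack_vectoring_info(14, TYPE_HARD_EXCEPTION, 1));
	return 0;
}
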
7803
7276/* 7804/*
7277 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 7805 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
7278 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 7806 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
@@ -7284,7 +7812,7 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7284 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 7812 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
7285 * which already writes to vmcs12 directly. 7813 * which already writes to vmcs12 directly.
7286 */ 7814 */
7287void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 7815static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7288{ 7816{
7289 /* update guest state fields: */ 7817 /* update guest state fields: */
7290 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 7818 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
@@ -7332,16 +7860,19 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7332 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 7860 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
7333 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 7861 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
7334 7862
7335 vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
7336 vmcs12->guest_interruptibility_info = 7863 vmcs12->guest_interruptibility_info =
7337 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 7864 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
7338 vmcs12->guest_pending_dbg_exceptions = 7865 vmcs12->guest_pending_dbg_exceptions =
7339 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 7866 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
7340 7867
7868 vmcs12->vm_entry_controls =
7869 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
7870 (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
7871
7341 /* TODO: These cannot have changed unless we have MSR bitmaps and 7872 /* TODO: These cannot have changed unless we have MSR bitmaps and
7342 * the relevant bit asks not to trap the change */ 7873 * the relevant bit asks not to trap the change */
7343 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 7874 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
7344 if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT) 7875 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
7345 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); 7876 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
7346 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); 7877 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
7347 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); 7878 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
@@ -7349,21 +7880,38 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7349 7880
7350 /* update exit information fields: */ 7881 /* update exit information fields: */
7351 7882
7352 vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON); 7883 vmcs12->vm_exit_reason = to_vmx(vcpu)->exit_reason;
7353 vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7884 vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7354 7885
7355 vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 7886 vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7356 vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 7887 if ((vmcs12->vm_exit_intr_info &
7357 vmcs12->idt_vectoring_info_field = 7888 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
7358 vmcs_read32(IDT_VECTORING_INFO_FIELD); 7889 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
7359 vmcs12->idt_vectoring_error_code = 7890 vmcs12->vm_exit_intr_error_code =
7360 vmcs_read32(IDT_VECTORING_ERROR_CODE); 7891 vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
7892 vmcs12->idt_vectoring_info_field = 0;
7361 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 7893 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
7362 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7894 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7363 7895
7364 /* clear vm-entry fields which are to be cleared on exit */ 7896 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
7365 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) 7897 /* vm_entry_intr_info_field is cleared on exit. Emulate this
7898 * instead of reading the real value. */
7366 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 7899 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
7900
7901 /*
7902 * Transfer the event that L0 or L1 may have wanted to inject into
7903 * L2 to IDT_VECTORING_INFO_FIELD.
7904 */
7905 vmcs12_save_pending_event(vcpu, vmcs12);
7906 }
7907
7908 /*
7909 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
7910 * preserved above and would only end up incorrectly in L1.
7911 */
7912 vcpu->arch.nmi_injected = false;
7913 kvm_clear_exception_queue(vcpu);
7914 kvm_clear_interrupt_queue(vcpu);
7367} 7915}
7368 7916
7369/* 7917/*
@@ -7375,11 +7923,12 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7375 * Failures During or After Loading Guest State"). 7923 * Failures During or After Loading Guest State").
7376 * This function should be called when the active VMCS is L1's (vmcs01). 7924 * This function should be called when the active VMCS is L1's (vmcs01).
7377 */ 7925 */
7378void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 7926static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
7927 struct vmcs12 *vmcs12)
7379{ 7928{
7380 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 7929 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
7381 vcpu->arch.efer = vmcs12->host_ia32_efer; 7930 vcpu->arch.efer = vmcs12->host_ia32_efer;
7382 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 7931 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
7383 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 7932 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
7384 else 7933 else
7385 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 7934 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
@@ -7387,6 +7936,7 @@ void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7387 7936
7388 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); 7937 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
7389 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); 7938 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
7939 vmx_set_rflags(vcpu, X86_EFLAGS_BIT1);
7390 /* 7940 /*
7391 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 7941 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
7392 * actually changed, because it depends on the current state of 7942 * actually changed, because it depends on the current state of
@@ -7445,6 +7995,9 @@ void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7445 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 7995 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
7446 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 7996 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
7447 vmcs12->host_ia32_perf_global_ctrl); 7997 vmcs12->host_ia32_perf_global_ctrl);
7998
7999 kvm_set_dr(vcpu, 7, 0x400);
8000 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
7448} 8001}
7449 8002
7450/* 8003/*
@@ -7458,6 +8011,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
7458 int cpu; 8011 int cpu;
7459 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8012 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7460 8013
8014 /* trying to cancel vmlaunch/vmresume is a bug */
8015 WARN_ON_ONCE(vmx->nested.nested_run_pending);
8016
7461 leave_guest_mode(vcpu); 8017 leave_guest_mode(vcpu);
7462 prepare_vmcs12(vcpu, vmcs12); 8018 prepare_vmcs12(vcpu, vmcs12);
7463 8019
@@ -7468,6 +8024,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
7468 vcpu->cpu = cpu; 8024 vcpu->cpu = cpu;
7469 put_cpu(); 8025 put_cpu();
7470 8026
8027 vmx_segment_cache_clear(vmx);
8028
7471 /* if no vmcs02 cache requested, remove the one we used */ 8029 /* if no vmcs02 cache requested, remove the one we used */
7472 if (VMCS02_POOL_SIZE == 0) 8030 if (VMCS02_POOL_SIZE == 0)
7473 nested_free_vmcs02(vmx, vmx->nested.current_vmptr); 8031 nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
@@ -7496,6 +8054,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
7496 nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); 8054 nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
7497 } else 8055 } else
7498 nested_vmx_succeed(vcpu); 8056 nested_vmx_succeed(vcpu);
8057 if (enable_shadow_vmcs)
8058 vmx->nested.sync_shadow_vmcs = true;
7499} 8059}
7500 8060
7501/* 8061/*
@@ -7513,6 +8073,8 @@ static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
7513 vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; 8073 vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
7514 vmcs12->exit_qualification = qualification; 8074 vmcs12->exit_qualification = qualification;
7515 nested_vmx_succeed(vcpu); 8075 nested_vmx_succeed(vcpu);
8076 if (enable_shadow_vmcs)
8077 to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
7516} 8078}
7517 8079
7518static int vmx_check_intercept(struct kvm_vcpu *vcpu, 8080static int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -7590,6 +8152,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
7590 .load_eoi_exitmap = vmx_load_eoi_exitmap, 8152 .load_eoi_exitmap = vmx_load_eoi_exitmap,
7591 .hwapic_irr_update = vmx_hwapic_irr_update, 8153 .hwapic_irr_update = vmx_hwapic_irr_update,
7592 .hwapic_isr_update = vmx_hwapic_isr_update, 8154 .hwapic_isr_update = vmx_hwapic_isr_update,
8155 .sync_pir_to_irr = vmx_sync_pir_to_irr,
8156 .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
7593 8157
7594 .set_tss_addr = vmx_set_tss_addr, 8158 .set_tss_addr = vmx_set_tss_addr,
7595 .get_tdp_level = get_ept_level, 8159 .get_tdp_level = get_ept_level,
@@ -7618,6 +8182,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7618 .set_tdp_cr3 = vmx_set_cr3, 8182 .set_tdp_cr3 = vmx_set_cr3,
7619 8183
7620 .check_intercept = vmx_check_intercept, 8184 .check_intercept = vmx_check_intercept,
8185 .handle_external_intr = vmx_handle_external_intr,
7621}; 8186};
7622 8187
7623static int __init vmx_init(void) 8188static int __init vmx_init(void)
@@ -7656,6 +8221,24 @@ static int __init vmx_init(void)
7656 (unsigned long *)__get_free_page(GFP_KERNEL); 8221 (unsigned long *)__get_free_page(GFP_KERNEL);
7657 if (!vmx_msr_bitmap_longmode_x2apic) 8222 if (!vmx_msr_bitmap_longmode_x2apic)
7658 goto out4; 8223 goto out4;
8224 vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
8225 if (!vmx_vmread_bitmap)
8226 goto out5;
8227
8228 vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
8229 if (!vmx_vmwrite_bitmap)
8230 goto out6;
8231
8232 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
8233 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
8234 /* shadowed read/write fields */
8235 for (i = 0; i < max_shadow_read_write_fields; i++) {
8236 clear_bit(shadow_read_write_fields[i], vmx_vmwrite_bitmap);
8237 clear_bit(shadow_read_write_fields[i], vmx_vmread_bitmap);
8238 }
8239 /* shadowed read only fields */
8240 for (i = 0; i < max_shadow_read_only_fields; i++)
8241 clear_bit(shadow_read_only_fields[i], vmx_vmread_bitmap);
7659 8242
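
The VMREAD/VMWRITE bitmaps above are initialized to all ones, so every field access from L1 still causes a VM exit, and then the bits for the shadowed fields are cleared so the CPU satisfies those accesses from the shadow VMCS directly. A minimal model of that whitelist setup (field numbers here are invented):

/* Sketch of the set-everything-then-whitelist pattern; not the kernel code. */
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

#define BITMAP_BYTES 4096

static uint8_t vmread_bitmap[BITMAP_BYTES];
static uint8_t vmwrite_bitmap[BITMAP_BYTES];

static void allow_field(uint8_t *bitmap, unsigned long field)
{
	bitmap[field / 8] &= ~(1u << (field % 8));       /* clear bit = no VM exit */
}

int main(void)
{
	unsigned long shadowed[] = { 0x6800, 0x6802, 0x440c };   /* hypothetical */

	memset(vmread_bitmap, 0xff, sizeof(vmread_bitmap));
	memset(vmwrite_bitmap, 0xff, sizeof(vmwrite_bitmap));
	for (size_t i = 0; i < sizeof(shadowed) / sizeof(shadowed[0]); i++) {
		allow_field(vmread_bitmap, shadowed[i] & 0x7fff);
		allow_field(vmwrite_bitmap, shadowed[i] & 0x7fff);
	}
	printf("field 0x6800 traps reads: %d\n",
	       !!(vmread_bitmap[0x6800 / 8] & (1 << (0x6800 % 8))));
	return 0;
}
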
7660 /* 8243 /*
7661 * Allow direct access to the PC debug port (it is often used for I/O 8244 * Allow direct access to the PC debug port (it is often used for I/O
@@ -7674,7 +8257,7 @@ static int __init vmx_init(void)
7674 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), 8257 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
7675 __alignof__(struct vcpu_vmx), THIS_MODULE); 8258 __alignof__(struct vcpu_vmx), THIS_MODULE);
7676 if (r) 8259 if (r)
7677 goto out3; 8260 goto out7;
7678 8261
7679#ifdef CONFIG_KEXEC 8262#ifdef CONFIG_KEXEC
7680 rcu_assign_pointer(crash_vmclear_loaded_vmcss, 8263 rcu_assign_pointer(crash_vmclear_loaded_vmcss,
@@ -7692,7 +8275,7 @@ static int __init vmx_init(void)
7692 memcpy(vmx_msr_bitmap_longmode_x2apic, 8275 memcpy(vmx_msr_bitmap_longmode_x2apic,
7693 vmx_msr_bitmap_longmode, PAGE_SIZE); 8276 vmx_msr_bitmap_longmode, PAGE_SIZE);
7694 8277
7695 if (enable_apicv_reg_vid) { 8278 if (enable_apicv) {
7696 for (msr = 0x800; msr <= 0x8ff; msr++) 8279 for (msr = 0x800; msr <= 0x8ff; msr++)
7697 vmx_disable_intercept_msr_read_x2apic(msr); 8280 vmx_disable_intercept_msr_read_x2apic(msr);
7698 8281
@@ -7722,6 +8305,12 @@ static int __init vmx_init(void)
7722 8305
7723 return 0; 8306 return 0;
7724 8307
8308out7:
8309 free_page((unsigned long)vmx_vmwrite_bitmap);
8310out6:
8311 free_page((unsigned long)vmx_vmread_bitmap);
8312out5:
8313 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
7725out4: 8314out4:
7726 free_page((unsigned long)vmx_msr_bitmap_longmode); 8315 free_page((unsigned long)vmx_msr_bitmap_longmode);
7727out3: 8316out3:
@@ -7743,6 +8332,8 @@ static void __exit vmx_exit(void)
7743 free_page((unsigned long)vmx_msr_bitmap_longmode); 8332 free_page((unsigned long)vmx_msr_bitmap_longmode);
7744 free_page((unsigned long)vmx_io_bitmap_b); 8333 free_page((unsigned long)vmx_io_bitmap_b);
7745 free_page((unsigned long)vmx_io_bitmap_a); 8334 free_page((unsigned long)vmx_io_bitmap_a);
8335 free_page((unsigned long)vmx_vmwrite_bitmap);
8336 free_page((unsigned long)vmx_vmread_bitmap);
7746 8337
7747#ifdef CONFIG_KEXEC 8338#ifdef CONFIG_KEXEC
7748 rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); 8339 rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e1721324c271..05a8b1a2300d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0;
162 162
163static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); 163static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
164 164
165static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
166
167static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) 165static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
168{ 166{
169 int i; 167 int i;
@@ -263,6 +261,13 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
263} 261}
264EXPORT_SYMBOL_GPL(kvm_set_apic_base); 262EXPORT_SYMBOL_GPL(kvm_set_apic_base);
265 263
264asmlinkage void kvm_spurious_fault(void)
265{
266 /* Fault while not rebooting. We want the trace. */
267 BUG();
268}
269EXPORT_SYMBOL_GPL(kvm_spurious_fault);
270
266#define EXCPT_BENIGN 0 271#define EXCPT_BENIGN 0
267#define EXCPT_CONTRIBUTORY 1 272#define EXCPT_CONTRIBUTORY 1
268#define EXCPT_PF 2 273#define EXCPT_PF 2
@@ -840,23 +845,17 @@ static const u32 emulated_msrs[] = {
840 MSR_IA32_MCG_CTL, 845 MSR_IA32_MCG_CTL,
841}; 846};
842 847
843static int set_efer(struct kvm_vcpu *vcpu, u64 efer) 848bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
844{ 849{
845 u64 old_efer = vcpu->arch.efer;
846
847 if (efer & efer_reserved_bits) 850 if (efer & efer_reserved_bits)
848 return 1; 851 return false;
849
850 if (is_paging(vcpu)
851 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
852 return 1;
853 852
854 if (efer & EFER_FFXSR) { 853 if (efer & EFER_FFXSR) {
855 struct kvm_cpuid_entry2 *feat; 854 struct kvm_cpuid_entry2 *feat;
856 855
857 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 856 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
858 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) 857 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
859 return 1; 858 return false;
860 } 859 }
861 860
862 if (efer & EFER_SVME) { 861 if (efer & EFER_SVME) {
@@ -864,9 +863,24 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
-			return 1;
+			return false;
 	}
 
+	return true;
+}
+EXPORT_SYMBOL_GPL(kvm_valid_efer);
+
+static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+	u64 old_efer = vcpu->arch.efer;
+
+	if (!kvm_valid_efer(vcpu, efer))
+		return 1;
+
+	if (is_paging(vcpu)
+	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+		return 1;
+
 	efer &= ~EFER_LMA;
 	efer |= vcpu->arch.efer & EFER_LMA;
 
@@ -1079,6 +1093,10 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
 	u32 thresh_lo, thresh_hi;
 	int use_scaling = 0;
 
+	/* tsc_khz can be zero if TSC calibration fails */
+	if (this_tsc_khz == 0)
+		return;
+
 	/* Compute a scale to convert nanoseconds in TSC cycles */
 	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
 			   &vcpu->arch.virtual_tsc_shift,
@@ -1156,20 +1174,23 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	ns = get_kernel_ns();
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 
-	/* n.b - signed multiplication and division required */
-	usdiff = data - kvm->arch.last_tsc_write;
+	if (vcpu->arch.virtual_tsc_khz) {
+		/* n.b - signed multiplication and division required */
+		usdiff = data - kvm->arch.last_tsc_write;
 #ifdef CONFIG_X86_64
-	usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+		usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
 #else
-	/* do_div() only does unsigned */
-	asm("idivl %2; xor %%edx, %%edx"
-	    : "=A"(usdiff)
-	    : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+		/* do_div() only does unsigned */
+		asm("idivl %2; xor %%edx, %%edx"
+		    : "=A"(usdiff)
+		    : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
 #endif
-	do_div(elapsed, 1000);
-	usdiff -= elapsed;
-	if (usdiff < 0)
-		usdiff = -usdiff;
+		do_div(elapsed, 1000);
+		usdiff -= elapsed;
+		if (usdiff < 0)
+			usdiff = -usdiff;
+	} else
+		usdiff = USEC_PER_SEC; /* disable TSC match window below */
 
 	/*
 	 * Special case: TSC write with a small delta (1 second) of virtual
@@ -2034,7 +2055,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_P6_EVNTSEL0:
 	case MSR_P6_EVNTSEL1:
 		if (kvm_pmu_msr(vcpu, msr))
-			return kvm_pmu_set_msr(vcpu, msr, data);
+			return kvm_pmu_set_msr(vcpu, msr_info);
 
 		if (pr || data != 0)
 			vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
@@ -2080,7 +2101,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
 			return xen_hvm_config(vcpu, data);
 		if (kvm_pmu_msr(vcpu, msr))
-			return kvm_pmu_set_msr(vcpu, msr, data);
+			return kvm_pmu_set_msr(vcpu, msr_info);
 		if (!ignore_msrs) {
 			vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
 				    msr, data);
@@ -2479,7 +2500,6 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_USER_NMI:
 	case KVM_CAP_REINJECT_CONTROL:
 	case KVM_CAP_IRQ_INJECT_STATUS:
-	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_IRQFD:
 	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_PIT2:
@@ -2497,10 +2517,12 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
 	case KVM_CAP_GET_TSC_KHZ:
-	case KVM_CAP_PCI_2_3:
 	case KVM_CAP_KVMCLOCK_CTRL:
 	case KVM_CAP_READONLY_MEM:
-	case KVM_CAP_IRQFD_RESAMPLE:
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
+	case KVM_CAP_ASSIGN_DEV_IRQ:
+	case KVM_CAP_PCI_2_3:
+#endif
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -2521,9 +2543,11 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PV_MMU:	/* obsolete */
 		r = 0;
 		break;
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 	case KVM_CAP_IOMMU:
 		r = iommu_present(&pci_bus_type);
 		break;
+#endif
 	case KVM_CAP_MCE:
 		r = KVM_MAX_MCE_BANKS;
 		break;
@@ -2679,6 +2703,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
+	kvm_x86_ops->sync_pir_to_irr(vcpu);
 	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
 
 	return 0;
@@ -2696,7 +2721,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 				    struct kvm_interrupt *irq)
 {
-	if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS)
+	if (irq->irq >= KVM_NR_INTERRUPTS)
 		return -EINVAL;
 	if (irqchip_in_kernel(vcpu->kvm))
 		return -ENXIO;
@@ -2819,10 +2844,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
 	events->nmi.pad = 0;
 
-	events->sipi_vector = vcpu->arch.sipi_vector;
+	events->sipi_vector = 0; /* never valid when reporting to user space */
 
 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
-			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
 			 | KVM_VCPUEVENT_VALID_SHADOW);
 	memset(&events->reserved, 0, sizeof(events->reserved));
 }
@@ -2853,8 +2877,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	vcpu->arch.nmi_pending = events->nmi.pending;
 	kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
 
-	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
-		vcpu->arch.sipi_vector = events->sipi_vector;
+	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
+	    kvm_vcpu_has_lapic(vcpu))
+		vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
@@ -3478,13 +3503,15 @@ out:
 	return r;
 }
 
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+			  bool line_status)
 {
 	if (!irqchip_in_kernel(kvm))
 		return -ENXIO;
 
 	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
-					irq_event->irq, irq_event->level);
+					irq_event->irq, irq_event->level,
+					line_status);
 	return 0;
 }
 
@@ -4752,11 +4779,15 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 }
 
 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
-				  bool write_fault_to_shadow_pgtable)
+				  bool write_fault_to_shadow_pgtable,
+				  int emulation_type)
 {
 	gpa_t gpa = cr2;
 	pfn_t pfn;
 
+	if (emulation_type & EMULTYPE_NO_REEXECUTE)
+		return false;
+
 	if (!vcpu->arch.mmu.direct_map) {
 		/*
 		 * Write permission should be allowed since only
@@ -4899,8 +4930,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	if (r != EMULATION_OK) {
 		if (emulation_type & EMULTYPE_TRAP_UD)
 			return EMULATE_FAIL;
-		if (reexecute_instruction(vcpu, cr2,
-					  write_fault_to_spt))
+		if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+					  emulation_type))
 			return EMULATE_DONE;
 		if (emulation_type & EMULTYPE_SKIP)
 			return EMULATE_FAIL;
@@ -4930,7 +4961,8 @@ restart:
 		return EMULATE_DONE;
 
 	if (r == EMULATION_FAILED) {
-		if (reexecute_instruction(vcpu, cr2, write_fault_to_spt))
+		if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+					  emulation_type))
 			return EMULATE_DONE;
 
 		return handle_emulation_failure(vcpu);
@@ -5641,14 +5673,20 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
-static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
+static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 {
 	u64 eoi_exit_bitmap[4];
+	u32 tmr[8];
+
+	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+		return;
 
 	memset(eoi_exit_bitmap, 0, 32);
+	memset(tmr, 0, 32);
 
-	kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
+	kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr);
 	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+	kvm_apic_update_tmr(vcpu, tmr);
 }
 
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
@@ -5656,7 +5694,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	int r;
 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
 		vcpu->run->request_interrupt_window;
-	bool req_immediate_exit = 0;
+	bool req_immediate_exit = false;
 
 	if (vcpu->requests) {
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -5698,24 +5736,30 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			record_steal_time(vcpu);
 		if (kvm_check_request(KVM_REQ_NMI, vcpu))
 			process_nmi(vcpu);
-		req_immediate_exit =
-			kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
 		if (kvm_check_request(KVM_REQ_PMU, vcpu))
 			kvm_handle_pmu_event(vcpu);
 		if (kvm_check_request(KVM_REQ_PMI, vcpu))
 			kvm_deliver_pmi(vcpu);
-		if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
-			update_eoi_exitmap(vcpu);
+		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
+			vcpu_scan_ioapic(vcpu);
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+		kvm_apic_accept_events(vcpu);
+		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+			r = 1;
+			goto out;
+		}
+
 		inject_pending_event(vcpu);
 
 		/* enable NMI/IRQ window open exits if needed */
 		if (vcpu->arch.nmi_pending)
-			kvm_x86_ops->enable_nmi_window(vcpu);
+			req_immediate_exit =
+				kvm_x86_ops->enable_nmi_window(vcpu) != 0;
 		else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
-			kvm_x86_ops->enable_irq_window(vcpu);
+			req_immediate_exit =
+				kvm_x86_ops->enable_irq_window(vcpu) != 0;
 
 		if (kvm_lapic_enabled(vcpu)) {
 			/*
@@ -5794,7 +5838,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
-	local_irq_enable();
+
+	/* Interrupt is enabled by handle_external_intr() */
+	kvm_x86_ops->handle_external_intr(vcpu);
 
 	++vcpu->stat.exits;
 
@@ -5843,16 +5889,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 	int r;
 	struct kvm *kvm = vcpu->kvm;
 
-	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
-		pr_debug("vcpu %d received sipi with vector # %x\n",
-			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
-		kvm_lapic_reset(vcpu);
-		r = kvm_vcpu_reset(vcpu);
-		if (r)
-			return r;
-		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-	}
-
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 	r = vapic_enter(vcpu);
 	if (r) {
@@ -5869,8 +5905,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 			kvm_vcpu_block(vcpu);
 			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-			if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
-			{
+			if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
+				kvm_apic_accept_events(vcpu);
 				switch(vcpu->arch.mp_state) {
 				case KVM_MP_STATE_HALTED:
 					vcpu->arch.mp_state =
@@ -5878,7 +5914,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 				case KVM_MP_STATE_RUNNABLE:
 					vcpu->arch.apf.halted = false;
 					break;
-				case KVM_MP_STATE_SIPI_RECEIVED:
+				case KVM_MP_STATE_INIT_RECEIVED:
+					break;
 				default:
 					r = -EINTR;
 					break;
@@ -6013,6 +6050,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
 		kvm_vcpu_block(vcpu);
+		kvm_apic_accept_events(vcpu);
 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		r = -EAGAIN;
 		goto out;
@@ -6169,6 +6207,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
+	kvm_apic_accept_events(vcpu);
 	mp_state->mp_state = vcpu->arch.mp_state;
 	return 0;
 }
@@ -6176,7 +6215,15 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	vcpu->arch.mp_state = mp_state->mp_state;
+	if (!kvm_vcpu_has_lapic(vcpu) &&
+	    mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
+		return -EINVAL;
+
+	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
+		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+		set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
+	} else
+		vcpu->arch.mp_state = mp_state->mp_state;
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	return 0;
 }
@@ -6475,9 +6522,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	r = vcpu_load(vcpu);
 	if (r)
 		return r;
-	r = kvm_vcpu_reset(vcpu);
-	if (r == 0)
-		r = kvm_mmu_setup(vcpu);
+	kvm_vcpu_reset(vcpu);
+	r = kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
 
 	return r;
@@ -6514,7 +6560,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
@@ -6541,7 +6587,18 @@ static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.regs_avail = ~0;
 	vcpu->arch.regs_dirty = ~0;
 
-	return kvm_x86_ops->vcpu_reset(vcpu);
+	kvm_x86_ops->vcpu_reset(vcpu);
+}
+
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector)
+{
+	struct kvm_segment cs;
+
+	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+	cs.selector = vector << 8;
+	cs.base = vector << 12;
+	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+	kvm_rip_write(vcpu, 0);
 }
 
 int kvm_arch_hardware_enable(void *garbage)
@@ -6706,8 +6763,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
-	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
+		r = -ENOMEM;
 		goto fail_free_mce_banks;
+	}
 
 	r = fx_init(vcpu);
 	if (r)
@@ -6811,6 +6870,23 @@ void kvm_arch_sync_events(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+	if (current->mm == kvm->mm) {
+		/*
+		 * Free memory regions allocated on behalf of userspace,
+		 * unless the memory map has changed due to process exit
+		 * or fd copying.
+		 */
+		struct kvm_userspace_memory_region mem;
+		memset(&mem, 0, sizeof(mem));
+		mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
+		kvm_set_memory_region(kvm, &mem);
+
+		mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
+		kvm_set_memory_region(kvm, &mem);
+
+		mem.slot = TSS_PRIVATE_MEMSLOT;
+		kvm_set_memory_region(kvm, &mem);
+	}
 	kvm_iommu_unmap_guest(kvm);
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
@@ -6903,24 +6979,21 @@ out_free:
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
-				   struct kvm_memory_slot old,
 				   struct kvm_userspace_memory_region *mem,
-				   bool user_alloc)
+				   enum kvm_mr_change change)
 {
-	int npages = memslot->npages;
-
 	/*
 	 * Only private memory slots need to be mapped here since
 	 * KVM_SET_MEMORY_REGION ioctl is no longer supported.
 	 */
-	if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) {
+	if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) {
 		unsigned long userspace_addr;
 
 		/*
 		 * MAP_SHARED to prevent internal slot pages from being moved
 		 * by fork()/COW.
 		 */
-		userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE,
+		userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE,
 					 PROT_READ | PROT_WRITE,
 					 MAP_SHARED | MAP_ANONYMOUS, 0);
 
@@ -6935,17 +7008,17 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
-				struct kvm_memory_slot old,
-				bool user_alloc)
+				const struct kvm_memory_slot *old,
+				enum kvm_mr_change change)
 {
 
-	int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
+	int nr_mmu_pages = 0;
 
-	if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) {
+	if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
 		int ret;
 
-		ret = vm_munmap(old.userspace_addr,
-				old.npages * PAGE_SIZE);
+		ret = vm_munmap(old->userspace_addr,
+				old->npages * PAGE_SIZE);
 		if (ret < 0)
 			printk(KERN_WARNING
 			       "kvm_vm_ioctl_set_memory_region: "
@@ -6962,14 +7035,14 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 * Existing largepage mappings are destroyed here and new ones will
 	 * not be created until the end of the logging.
 	 */
-	if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
+	if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
 		kvm_mmu_slot_remove_write_access(kvm, mem->slot);
 	/*
 	 * If memory slot is created, or moved, we need to clear all
 	 * mmio sptes.
 	 */
-	if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
-		kvm_mmu_zap_all(kvm);
+	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
+		kvm_mmu_zap_mmio_sptes(kvm);
 		kvm_reload_remote_mmus(kvm);
 	}
 }
@@ -6991,7 +7064,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 		!vcpu->arch.apf.halted)
 		|| !list_empty_careful(&vcpu->async_pf.done)
-		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+		|| kvm_apic_has_events(vcpu)
 		|| atomic_read(&vcpu->arch.nmi_queued) ||
 		(kvm_arch_interrupt_allowed(vcpu) &&
 		 kvm_cpu_has_interrupt(vcpu));