author     Alexander Graf <agraf@suse.de>  2013-02-13 06:56:14 -0500
committer  Alexander Graf <agraf@suse.de>  2013-02-13 06:56:14 -0500
commit     dd92d6f2749c43ebab91c4762a1bc79e6523e936 (patch)
tree       6e6730bdd09284679c0861df6d0fcbec08ea7a87
parent     b9e3e208935e95ad62bd1b1bc4408c23a9ae3ada (diff)
parent     b0da5bec30eca7ffbb2c89afa6fe503fd418d3a6 (diff)

Merge commit 'origin/next' into kvm-ppc-next
-rw-r--r--  Documentation/virtual/kvm/api.txt |  25
-rw-r--r--  arch/ia64/kvm/lapic.h             |   6
-rw-r--r--  arch/s390/kvm/kvm-s390.c          |   8
-rw-r--r--  arch/s390/kvm/kvm-s390.h          |  25
-rw-r--r--  arch/x86/include/asm/kvm_host.h   |   6
-rw-r--r--  arch/x86/include/asm/vmx.h        |  21
-rw-r--r--  arch/x86/kvm/emulate.c            |   2
-rw-r--r--  arch/x86/kvm/irq.c                |  56
-rw-r--r--  arch/x86/kvm/lapic.c              | 140
-rw-r--r--  arch/x86/kvm/lapic.h              |  34
-rw-r--r--  arch/x86/kvm/mmu.c                |  32
-rw-r--r--  arch/x86/kvm/paging_tmpl.h        |   3
-rw-r--r--  arch/x86/kvm/svm.c                |  24
-rw-r--r--  arch/x86/kvm/vmx.c                | 336
-rw-r--r--  arch/x86/kvm/x86.c                |  25
-rw-r--r--  drivers/s390/kvm/virtio_ccw.c     |  20
-rw-r--r--  include/linux/kvm_host.h          |   3
-rw-r--r--  kernel/sched/core.c               |  25
-rw-r--r--  virt/kvm/ioapic.c                 |  39
-rw-r--r--  virt/kvm/ioapic.h                 |   4
-rw-r--r--  virt/kvm/iommu.c                  |   4
-rw-r--r--  virt/kvm/irq_comm.c               |  25
-rw-r--r--  virt/kvm/kvm_main.c               | 106
23 files changed, 803 insertions, 166 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 09905cbcbb0b..c2534c300a45 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -219,19 +219,6 @@ allocation of vcpu ids. For example, if userspace wants
 single-threaded guest vcpus, it should make all vcpu ids be a multiple
 of the number of vcpus per vcore.

-On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
-threads in one or more virtual CPU cores. (This is because the
-hardware requires all the hardware threads in a CPU core to be in the
-same partition.) The KVM_CAP_PPC_SMT capability indicates the number
-of vcpus per virtual core (vcore). The vcore id is obtained by
-dividing the vcpu id by the number of vcpus per vcore. The vcpus in a
-given vcore will always be in the same physical core as each other
-(though that might be a different physical core from time to time).
-Userspace can control the threading (SMT) mode of the guest by its
-allocation of vcpu ids. For example, if userspace wants
-single-threaded guest vcpus, it should make all vcpu ids be a multiple
-of the number of vcpus per vcore.
-
 For virtual cpus that have been created with S390 user controlled virtual
 machines, the resulting vcpu fd can be memory mapped at page offset
 KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual
@@ -874,12 +861,12 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
 be identical. This allows large pages in the guest to be backed by large
 pages in the host.

-The flags field supports two flag, KVM_MEM_LOG_DIRTY_PAGES, which instructs
-kvm to keep track of writes to memory within the slot. See KVM_GET_DIRTY_LOG
-ioctl. The KVM_CAP_READONLY_MEM capability indicates the availability of the
-KVM_MEM_READONLY flag. When this flag is set for a memory region, KVM only
-allows read accesses. Writes will be posted to userspace as KVM_EXIT_MMIO
-exits.
+The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and
+KVM_MEM_READONLY. The former can be set to instruct KVM to keep track of
+writes to memory within the slot. See KVM_GET_DIRTY_LOG ioctl to know how to
+use it. The latter can be set, if KVM_CAP_READONLY_MEM capability allows it,
+to make a new slot read-only. In this case, writes to this memory will be
+posted to userspace as KVM_EXIT_MMIO exits.

 When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of
 the memory region are automatically reflected into the guest. For example, an
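
The documentation change above introduces KVM_MEM_READONLY. As a rough illustration of how userspace would use it (not part of the patch; the slot number, guest physical address and size are made up, and KVM_CAP_READONLY_MEM should be probed with KVM_CHECK_EXTENSION first), a read-only slot can be registered like this:

/* Illustrative only: registering a read-only memory slot with KVM.
 * Assumes kvm_vm_fd is an already-created VM fd. */
#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

static int set_readonly_slot(int kvm_vm_fd)
{
	struct kvm_userspace_memory_region region;
	void *backing = mmap(NULL, 0x200000, PROT_READ | PROT_WRITE,
			     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (backing == MAP_FAILED)
		return -1;

	memset(&region, 0, sizeof(region));
	region.slot = 1;			/* arbitrary slot number */
	region.flags = KVM_MEM_READONLY;	/* guest writes exit as KVM_EXIT_MMIO */
	region.guest_phys_addr = 0xc0000000;	/* illustrative GPA */
	region.memory_size = 0x200000;
	region.userspace_addr = (unsigned long)backing;

	/* Guest reads are served from 'backing'; writes are posted to userspace. */
	return ioctl(kvm_vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}
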
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index c5f92a926a9a..c3e2935b6db4 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -27,4 +27,10 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
 #define kvm_apic_present(x) (true)
 #define kvm_lapic_enabled(x) (true)

+static inline bool kvm_apic_vid_enabled(void)
+{
+	/* IA64 has no apicv supporting, do nothing here */
+	return false;
+}
+
 #endif
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 5b01f0953900..4377d1886631 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -770,6 +770,14 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
 	} else
 		prefix = 0;

+	/*
+	 * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy
+	 * copying in vcpu load/put. Lets update our copies before we save
+	 * it into the save area
+	 */
+	save_fp_regs(&vcpu->arch.guest_fpregs);
+	save_access_regs(vcpu->run->s.regs.acrs);
+
 	if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs),
 			vcpu->arch.guest_fpregs.fprs, 128, prefix))
 		return -EFAULT;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 3e05deff21b6..4d89d64a8161 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -67,8 +67,8 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)

 static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu)
 {
-	int base2 = vcpu->arch.sie_block->ipb >> 28;
-	int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
+	u32 base2 = vcpu->arch.sie_block->ipb >> 28;
+	u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);

 	return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
 }
@@ -76,10 +76,10 @@ static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu)
 static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu,
 					      u64 *address1, u64 *address2)
 {
-	int base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
-	int disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
-	int base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12;
-	int disp2 = vcpu->arch.sie_block->ipb & 0x0fff;
+	u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
+	u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
+	u32 base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12;
+	u32 disp2 = vcpu->arch.sie_block->ipb & 0x0fff;

 	*address1 = (base1 ? vcpu->run->s.regs.gprs[base1] : 0) + disp1;
 	*address2 = (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
@@ -87,17 +87,20 @@ static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu,

 static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu)
 {
-	int base2 = vcpu->arch.sie_block->ipb >> 28;
-	int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) +
+	u32 base2 = vcpu->arch.sie_block->ipb >> 28;
+	u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) +
 			((vcpu->arch.sie_block->ipb & 0xff00) << 4);
+	/* The displacement is a 20bit _SIGNED_ value */
+	if (disp2 & 0x80000)
+		disp2+=0xfff00000;

-	return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
+	return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2;
 }

 static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu)
 {
-	int base2 = vcpu->arch.sie_block->ipb >> 28;
-	int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
+	u32 base2 = vcpu->arch.sie_block->ipb >> 28;
+	u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);

 	return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
 }
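
The RSY fix above sign-extends the 20-bit displacement before adding it to the base register. A stand-alone sketch of the same arithmetic, with made-up input values, for reference (not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Sign-extend a 20-bit displacement the same way the hunk above does:
 * if bit 19 is set, fill bits 20..31 so the u32 holds the two's
 * complement value, then widen via (long)(int). */
static long extend_disp20(uint32_t disp2)
{
	if (disp2 & 0x80000)		/* bit 19 is the sign bit */
		disp2 += 0xfff00000;	/* fill bits 20..31 */
	return (long)(int32_t)disp2;
}

int main(void)
{
	/* 0xfffff encodes -1, 0x80000 encodes -524288, 0x7ffff stays positive */
	printf("%ld %ld %ld\n", extend_disp20(0xfffff),
	       extend_disp20(0x80000), extend_disp20(0x7ffff));
	return 0;
}
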
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 77d56a4ba89c..635a74d22409 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -699,6 +699,11 @@ struct kvm_x86_ops {
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+	int (*vm_has_apicv)(struct kvm *kvm);
+	void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
+	void (*hwapic_isr_update)(struct kvm *kvm, int isr);
+	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
@@ -993,6 +998,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index e385df97bfdc..5c9dbadd364a 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -62,10 +62,12 @@
 #define EXIT_REASON_MCE_DURING_VMENTRY 41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_EOI_INDUCED 45
 #define EXIT_REASON_EPT_VIOLATION 48
 #define EXIT_REASON_EPT_MISCONFIG 49
 #define EXIT_REASON_WBINVD 54
 #define EXIT_REASON_XSETBV 55
+#define EXIT_REASON_APIC_WRITE 56
 #define EXIT_REASON_INVPCID 58

 #define VMX_EXIT_REASONS \
@@ -103,7 +105,12 @@
 	{ EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
 	{ EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
 	{ EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
-	{ EXIT_REASON_WBINVD, "WBINVD" }
+	{ EXIT_REASON_WBINVD, "WBINVD" }, \
+	{ EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
+	{ EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
+	{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
+	{ EXIT_REASON_INVD, "INVD" }, \
+	{ EXIT_REASON_INVPCID, "INVPCID" }

 #ifdef __KERNEL__

@@ -138,9 +145,12 @@
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002
 #define SECONDARY_EXEC_RDTSCP 0x00000008
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020
 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
+#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000

@@ -178,6 +188,7 @@ enum vmcs_field {
 	GUEST_GS_SELECTOR = 0x0000080a,
 	GUEST_LDTR_SELECTOR = 0x0000080c,
 	GUEST_TR_SELECTOR = 0x0000080e,
+	GUEST_INTR_STATUS = 0x00000810,
 	HOST_ES_SELECTOR = 0x00000c00,
 	HOST_CS_SELECTOR = 0x00000c02,
 	HOST_SS_SELECTOR = 0x00000c04,
@@ -205,6 +216,14 @@ enum vmcs_field {
 	APIC_ACCESS_ADDR_HIGH = 0x00002015,
 	EPT_POINTER = 0x0000201a,
 	EPT_POINTER_HIGH = 0x0000201b,
+	EOI_EXIT_BITMAP0 = 0x0000201c,
+	EOI_EXIT_BITMAP0_HIGH = 0x0000201d,
+	EOI_EXIT_BITMAP1 = 0x0000201e,
+	EOI_EXIT_BITMAP1_HIGH = 0x0000201f,
+	EOI_EXIT_BITMAP2 = 0x00002020,
+	EOI_EXIT_BITMAP2_HIGH = 0x00002021,
+	EOI_EXIT_BITMAP3 = 0x00002022,
+	EOI_EXIT_BITMAP3_HIGH = 0x00002023,
 	GUEST_PHYSICAL_ADDRESS = 0x00002400,
 	GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
 	VMCS_LINK_POINTER = 0x00002800,
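
The eight EOI_EXIT_BITMAPn{,_HIGH} encodings added above back a single 256-bit bitmap indexed by interrupt vector. A small sketch of the vector-to-field mapping implied by those encodings (illustrative helper, not part of the patch; the +2 stride between bitmap fields follows the values listed above):

#include <stdint.h>

/* Illustrative mapping from an interrupt vector (0-255) to the VMCS
 * EOI-exit-bitmap field that covers it and the bit within that field.
 * EOI_EXIT_BITMAP0..3 are 0x201c, 0x201e, 0x2020, 0x2022 as in the hunk. */
struct eoi_bitmap_slot {
	uint32_t vmcs_field;	/* which EOI_EXIT_BITMAPn to write */
	uint32_t bit;		/* bit index inside that 64-bit field */
};

static struct eoi_bitmap_slot eoi_slot_for_vector(uint8_t vector)
{
	struct eoi_bitmap_slot slot;

	slot.vmcs_field = 0x201c + 2 * (vector / 64);
	slot.bit = vector % 64;
	return slot;
}
/* Example: vector 0xb1 (177) lands in EOI_EXIT_BITMAP2 (0x2020), bit 49. */
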
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e99fb72cd4c5..2b11318151a4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1013,7 +1013,7 @@ static u8 test_cc(unsigned int condition, unsigned long flags)
 	void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);

 	flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
-	asm("pushq %[flags]; popf; call *%[fastop]"
+	asm("push %[flags]; popf; call *%[fastop]"
 	    : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags));
 	return rc;
 }
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index b111aee815f8..484bc874688b 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -38,6 +38,38 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 EXPORT_SYMBOL(kvm_cpu_has_pending_timer);

 /*
+ * check if there is pending interrupt from
+ * non-APIC source without intack.
+ */
+static int kvm_cpu_has_extint(struct kvm_vcpu *v)
+{
+	if (kvm_apic_accept_pic_intr(v))
+		return pic_irqchip(v->kvm)->output;	/* PIC */
+	else
+		return 0;
+}
+
+/*
+ * check if there is injectable interrupt:
+ * when virtual interrupt delivery enabled,
+ * interrupt from apic will handled by hardware,
+ * we don't need to check it here.
+ */
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+{
+	if (!irqchip_in_kernel(v->kvm))
+		return v->arch.interrupt.pending;
+
+	if (kvm_cpu_has_extint(v))
+		return 1;
+
+	if (kvm_apic_vid_enabled(v->kvm))
+		return 0;
+
+	return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+
+/*
  * check if there is pending interrupt without
  * intack.
  */
@@ -46,27 +78,41 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 	if (!irqchip_in_kernel(v->kvm))
 		return v->arch.interrupt.pending;

-	if (kvm_apic_accept_pic_intr(v) && pic_irqchip(v->kvm)->output)
-		return pic_irqchip(v->kvm)->output;	/* PIC */
+	if (kvm_cpu_has_extint(v))
+		return 1;

 	return kvm_apic_has_interrupt(v) != -1;	/* LAPIC */
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);

 /*
+ * Read pending interrupt(from non-APIC source)
+ * vector and intack.
+ */
+static int kvm_cpu_get_extint(struct kvm_vcpu *v)
+{
+	if (kvm_cpu_has_extint(v))
+		return kvm_pic_read_irq(v->kvm); /* PIC */
+	return -1;
+}
+
+/*
  * Read pending interrupt vector and intack.
  */
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 {
+	int vector;
+
 	if (!irqchip_in_kernel(v->kvm))
 		return v->arch.interrupt.nr;

-	if (kvm_apic_accept_pic_intr(v) && pic_irqchip(v->kvm)->output)
-		return kvm_pic_read_irq(v->kvm);	/* PIC */
+	vector = kvm_cpu_get_extint(v);
+
+	if (kvm_apic_vid_enabled(v->kvm) || vector != -1)
+		return vector;			/* PIC */

 	return kvm_get_apic_interrupt(v);	/* APIC */
 }
-EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);

 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9392f527f107..02b51dd4e4ad 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -140,31 +140,56 @@ static inline int apic_enabled(struct kvm_lapic *apic)
 	(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
 	 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)

-static inline int apic_x2apic_mode(struct kvm_lapic *apic)
-{
-	return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
-}
-
 static inline int kvm_apic_id(struct kvm_lapic *apic)
 {
 	return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
 }

-static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
+void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
+			       struct kvm_lapic_irq *irq,
+			       u64 *eoi_exit_bitmap)
 {
-	u16 cid;
-	ldr >>= 32 - map->ldr_bits;
-	cid = (ldr >> map->cid_shift) & map->cid_mask;
+	struct kvm_lapic **dst;
+	struct kvm_apic_map *map;
+	unsigned long bitmap = 1;
+	int i;

-	BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
+	rcu_read_lock();
+	map = rcu_dereference(vcpu->kvm->arch.apic_map);

-	return cid;
-}
+	if (unlikely(!map)) {
+		__set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap);
+		goto out;
+	}

-static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
-{
-	ldr >>= (32 - map->ldr_bits);
-	return ldr & map->lid_mask;
+	if (irq->dest_mode == 0) { /* physical mode */
+		if (irq->delivery_mode == APIC_DM_LOWEST ||
+				irq->dest_id == 0xff) {
+			__set_bit(irq->vector,
+				  (unsigned long *)eoi_exit_bitmap);
+			goto out;
+		}
+		dst = &map->phys_map[irq->dest_id & 0xff];
+	} else {
+		u32 mda = irq->dest_id << (32 - map->ldr_bits);
+
+		dst = map->logical_map[apic_cluster_id(map, mda)];
+
+		bitmap = apic_logical_id(map, mda);
+	}
+
+	for_each_set_bit(i, &bitmap, 16) {
+		if (!dst[i])
+			continue;
+		if (dst[i]->vcpu == vcpu) {
+			__set_bit(irq->vector,
+				  (unsigned long *)eoi_exit_bitmap);
+			break;
+		}
+	}
+
+out:
+	rcu_read_unlock();
 }

 static void recalculate_apic_map(struct kvm *kvm)
@@ -230,6 +255,8 @@ out:

 	if (old)
 		kfree_rcu(old, rcu);
+
+	kvm_ioapic_make_eoibitmap_request(kvm);
 }

 static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
@@ -345,6 +372,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
 {
 	int result;

+	/*
+	 * Note that irr_pending is just a hint. It will be always
+	 * true with virtual interrupt delivery enabled.
+	 */
 	if (!apic->irr_pending)
 		return -1;

@@ -461,6 +492,8 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
 static inline int apic_find_highest_isr(struct kvm_lapic *apic)
 {
 	int result;
+
+	/* Note that isr_count is always 1 with vid enabled */
 	if (!apic->isr_count)
 		return -1;
 	if (likely(apic->highest_isr_cache != -1))
@@ -740,6 +773,19 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 	return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
 }

+static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
+{
+	if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
+	    kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
+		int trigger_mode;
+		if (apic_test_vector(vector, apic->regs + APIC_TMR))
+			trigger_mode = IOAPIC_LEVEL_TRIG;
+		else
+			trigger_mode = IOAPIC_EDGE_TRIG;
+		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+	}
+}
+
 static int apic_set_eoi(struct kvm_lapic *apic)
 {
 	int vector = apic_find_highest_isr(apic);
@@ -756,19 +802,26 @@ static int apic_set_eoi(struct kvm_lapic *apic)
 	apic_clear_isr(vector, apic);
 	apic_update_ppr(apic);

-	if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
-	    kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
-		int trigger_mode;
-		if (apic_test_vector(vector, apic->regs + APIC_TMR))
-			trigger_mode = IOAPIC_LEVEL_TRIG;
-		else
-			trigger_mode = IOAPIC_EDGE_TRIG;
-		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
-	}
+	kvm_ioapic_send_eoi(apic, vector);
 	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
 	return vector;
 }

+/*
+ * this interface assumes a trap-like exit, which has already finished
+ * desired side effect including vISR and vPPR update.
+ */
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	trace_kvm_eoi(apic, vector);
+
+	kvm_ioapic_send_eoi(apic, vector);
+	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
+
 static void apic_send_ipi(struct kvm_lapic *apic)
 {
 	u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
@@ -1212,6 +1265,21 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);

+/* emulate APIC access in a trap manner */
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+{
+	u32 val = 0;
+
+	/* hw has done the conditional check and inst decode */
+	offset &= 0xff0;
+
+	apic_reg_read(vcpu->arch.apic, offset, 4, &val);
+
+	/* TODO: optimize to just emulate side effect w/o one more write */
+	apic_reg_write(vcpu->arch.apic, offset, val);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
+
 void kvm_free_lapic(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1288,6 +1356,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)

 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 {
+	u64 old_value = vcpu->arch.apic_base;
 	struct kvm_lapic *apic = vcpu->arch.apic;

 	if (!apic) {
@@ -1309,11 +1378,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 		value &= ~MSR_IA32_APICBASE_BSP;

 	vcpu->arch.apic_base = value;
-	if (apic_x2apic_mode(apic)) {
-		u32 id = kvm_apic_id(apic);
-		u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
-		kvm_apic_set_ldr(apic, ldr);
+	if ((old_value ^ value) & X2APIC_ENABLE) {
+		if (value & X2APIC_ENABLE) {
+			u32 id = kvm_apic_id(apic);
+			u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+			kvm_apic_set_ldr(apic, ldr);
+			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
+		} else
+			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
 	}
+
 	apic->base_address = apic->vcpu->arch.apic_base &
 			     MSR_IA32_APICBASE_BASE;

@@ -1359,8 +1433,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 		apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
 		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
 	}
-	apic->irr_pending = false;
-	apic->isr_count = 0;
+	apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
+	apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm);
 	apic->highest_isr_cache = -1;
 	update_divide_count(apic);
 	atomic_set(&apic->lapic_timer.pending, 0);
@@ -1575,8 +1649,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
 	update_divide_count(apic);
 	start_apic_timer(apic);
 	apic->irr_pending = true;
-	apic->isr_count = count_vectors(apic->regs + APIC_ISR);
+	apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ?
+				1 : count_vectors(apic->regs + APIC_ISR);
 	apic->highest_isr_cache = -1;
+	kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }

diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e5ebf9f3571f..1676d34ddb4e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -64,6 +64,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
 void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);

+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
+
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
@@ -124,4 +127,35 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
 	return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
 }

+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+	return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
+static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
+{
+	return kvm_x86_ops->vm_has_apicv(kvm);
+}
+
+static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
+{
+	u16 cid;
+	ldr >>= 32 - map->ldr_bits;
+	cid = (ldr >> map->cid_shift) & map->cid_mask;
+
+	BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
+
+	return cid;
+}
+
+static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
+{
+	ldr >>= (32 - map->ldr_bits);
+	return ldr & map->lid_mask;
+}
+
+void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
+				struct kvm_lapic_irq *irq,
+				u64 *eoi_bitmap);
+
 #endif
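
The helpers gathered here decode the logical destination value that kvm_lapic_set_base() now programs for x2APIC mode, ldr = ((id >> 4) << 16) | (1 << (id & 0xf)). A stand-alone sketch of that packing with an illustrative APIC id (not part of the patch; the cluster/bit split shown assumes the x2APIC map parameters):

#include <stdint.h>
#include <stdio.h>

/* Sketch of the x2APIC logical-destination encoding used above:
 * LDR = (cluster << 16) | (1 << (id & 0xf)), with cluster = id >> 4.
 * The cluster/bit split mirrors apic_cluster_id()/apic_logical_id()
 * under the x2APIC map parameters (an assumption for this example). */
int main(void)
{
	uint32_t id = 0x25;			/* illustrative x2APIC id */
	uint32_t ldr = ((id >> 4) << 16) | (1 << (id & 0xf));

	printf("ldr=0x%08x cluster=%u bit=%u\n",
	       ldr, ldr >> 16, __builtin_ctz(ldr & 0xffff));
	return 0;	/* prints ldr=0x00020020 cluster=2 bit=5 */
}
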
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9f628f7a40b2..0242a8a1b2e2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -448,7 +448,8 @@ static bool __check_direct_spte_mmio_pf(u64 spte)

 static bool spte_is_locklessly_modifiable(u64 spte)
 {
-	return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+	return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
+		(SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
 }

 static bool spte_has_volatile_bits(u64 spte)
@@ -1460,28 +1461,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
 	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
 }

-/*
- * Remove the sp from shadow page cache, after call it,
- * we can not find this sp from the cache, and the shadow
- * page table is still valid.
- * It should be under the protection of mmu lock.
- */
-static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
 	ASSERT(is_empty_shadow_page(sp->spt));
 	hlist_del(&sp->hash_link);
-	if (!sp->role.direct)
-		free_page((unsigned long)sp->gfns);
-}
-
-/*
- * Free the shadow page table and the sp, we can do it
- * out of the protection of mmu lock.
- */
-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
-{
 	list_del(&sp->link);
 	free_page((unsigned long)sp->spt);
+	if (!sp->role.direct)
+		free_page((unsigned long)sp->gfns);
 	kmem_cache_free(mmu_page_header_cache, sp);
 }

@@ -2125,7 +2112,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 	do {
 		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
 		WARN_ON(!sp->role.invalid || sp->root_count);
-		kvm_mmu_isolate_page(sp);
 		kvm_mmu_free_page(sp);
 	} while (!list_empty(invalid_list));
 }
@@ -2327,9 +2313,8 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 		if (s->role.level != PT_PAGE_TABLE_LEVEL)
 			return 1;

-		if (!need_unsync && !s->unsync) {
+		if (!s->unsync)
 			need_unsync = true;
-		}
 	}
 	if (need_unsync)
 		kvm_unsync_pages(vcpu, gfn);
@@ -3687,6 +3672,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 	else
 		r = paging32_init_context(vcpu, context);

+	vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
 	vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
 	vcpu->arch.mmu.base_role.smep_andnot_wp
@@ -3853,7 +3839,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
 	/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
 	*gpa &= ~(gpa_t)7;
 	*bytes = 8;
-	r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
+	r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8);
 	if (r)
 		gentry = 0;
 	new = (const u8 *)&gentry;
@@ -4007,7 +3993,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
 		      & mask.word) && rmap_can_add(vcpu))
 			mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
-		if (!remote_flush && need_remote_flush(entry, *spte))
+		if (need_remote_flush(entry, *spte))
 			remote_flush = true;
 		++spte;
 	}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ca69dcccbe31..34c5c99323f4 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -409,9 +409,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	unsigned direct_access, access = gw->pt_access;
 	int top_level, emulate = 0;

-	if (!is_present_gpte(gw->ptes[gw->level - 1]))
-		return 0;
-
 	direct_access = gw->pte_access;

 	top_level = vcpu->arch.mmu.root_level;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d29d3cd1c156..e1b1ce21bc00 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3571,6 +3571,26 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 }

+static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+{
+	return;
+}
+
+static int svm_vm_has_apicv(struct kvm *kvm)
+{
+	return 0;
+}
+
+static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+	return;
+}
+
+static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
+{
+	return;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4290,6 +4310,10 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
+	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
+	.vm_has_apicv = svm_vm_has_apicv,
+	.load_eoi_exitmap = svm_load_eoi_exitmap,
+	.hwapic_isr_update = svm_hwapic_isr_update,

 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 02eeba86328d..fe9a9cfadbd6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,6 +84,9 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);

+static bool __read_mostly enable_apicv_reg_vid = 1;
+module_param(enable_apicv_reg_vid, bool, S_IRUGO);
+
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -640,6 +643,8 @@ static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
+static unsigned long *vmx_msr_bitmap_legacy_x2apic;
+static unsigned long *vmx_msr_bitmap_longmode_x2apic;

 static bool cpu_has_load_ia32_efer;
 static bool cpu_has_load_perf_global_ctrl;
@@ -764,6 +769,24 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 }

+static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+}
+
+static inline bool cpu_has_vmx_apic_register_virt(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_APIC_REGISTER_VIRT;
+}
+
+static inline bool cpu_has_vmx_virtual_intr_delivery(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+}
+
 static inline bool cpu_has_vmx_flexpriority(void)
 {
 	return cpu_has_vmx_tpr_shadow() &&
@@ -1821,6 +1844,25 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 	vmx->guest_msrs[from] = tmp;
 }

+static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+	unsigned long *msr_bitmap;
+
+	if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
+		if (is_long_mode(vcpu))
+			msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+		else
+			msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+	} else {
+		if (is_long_mode(vcpu))
+			msr_bitmap = vmx_msr_bitmap_longmode;
+		else
+			msr_bitmap = vmx_msr_bitmap_legacy;
+	}
+
+	vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
+}
+
 /*
  * Set up the vmcs to automatically save and restore system
  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
@@ -1829,7 +1871,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 static void setup_msrs(struct vcpu_vmx *vmx)
 {
 	int save_nmsrs, index;
-	unsigned long *msr_bitmap;

 	save_nmsrs = 0;
 #ifdef CONFIG_X86_64
@@ -1861,14 +1902,8 @@ static void setup_msrs(struct vcpu_vmx *vmx)

 	vmx->save_nmsrs = save_nmsrs;

-	if (cpu_has_vmx_msr_bitmap()) {
-		if (is_long_mode(&vmx->vcpu))
-			msr_bitmap = vmx_msr_bitmap_longmode;
-		else
-			msr_bitmap = vmx_msr_bitmap_legacy;
-
-		vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
-	}
+	if (cpu_has_vmx_msr_bitmap())
+		vmx_set_msr_bitmap(&vmx->vcpu);
 }

 /*
@@ -2534,13 +2569,16 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
 		min2 = 0;
 		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 			SECONDARY_EXEC_WBINVD_EXITING |
 			SECONDARY_EXEC_ENABLE_VPID |
 			SECONDARY_EXEC_ENABLE_EPT |
 			SECONDARY_EXEC_UNRESTRICTED_GUEST |
 			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
 			SECONDARY_EXEC_RDTSCP |
-			SECONDARY_EXEC_ENABLE_INVPCID;
+			SECONDARY_EXEC_ENABLE_INVPCID |
+			SECONDARY_EXEC_APIC_REGISTER_VIRT |
+			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 		if (adjust_vmx_controls(min2, opt2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
 					&_cpu_based_2nd_exec_control) < 0)
@@ -2551,6 +2589,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
 		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
 #endif
+
+	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+		_cpu_based_2nd_exec_control &= ~(
+				SECONDARY_EXEC_APIC_REGISTER_VIRT |
+				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+
 	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
 		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
 		   enabled */
@@ -2748,6 +2793,15 @@ static __init int hardware_setup(void)
 	if (!cpu_has_vmx_ple())
 		ple_gap = 0;

+	if (!cpu_has_vmx_apic_register_virt() ||
+	    !cpu_has_vmx_virtual_intr_delivery())
+		enable_apicv_reg_vid = 0;
+
+	if (enable_apicv_reg_vid)
+		kvm_x86_ops->update_cr8_intercept = NULL;
+	else
+		kvm_x86_ops->hwapic_irr_update = NULL;
+
 	if (nested)
 		nested_vmx_setup_ctls_msrs();

@@ -3173,6 +3227,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	if (!is_paging(vcpu)) {
 		hw_cr4 &= ~X86_CR4_PAE;
 		hw_cr4 |= X86_CR4_PSE;
+		/*
+		 * SMEP is disabled if CPU is in non-paging mode in
+		 * hardware. However KVM always uses paging mode to
+		 * emulate guest non-paging mode with TDP.
+		 * To emulate this behavior, SMEP needs to be manually
+		 * disabled when guest switches to non-paging mode.
+		 */
+		hw_cr4 &= ~X86_CR4_SMEP;
 	} else if (!(cr4 & X86_CR4_PAE)) {
 		hw_cr4 &= ~X86_CR4_PAE;
 	}
@@ -3707,7 +3769,10 @@ static void free_vpid(struct vcpu_vmx *vmx)
 	spin_unlock(&vmx_vpid_lock);
 }

-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
+#define MSR_TYPE_R	1
+#define MSR_TYPE_W	2
+static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+						u32 msr, int type)
 {
 	int f = sizeof(unsigned long);

@@ -3720,20 +3785,93 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
 	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 	 */
 	if (msr <= 0x1fff) {
-		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
-		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
+		if (type & MSR_TYPE_R)
+			/* read-low */
+			__clear_bit(msr, msr_bitmap + 0x000 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-low */
+			__clear_bit(msr, msr_bitmap + 0x800 / f);
+
 	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 		msr &= 0x1fff;
-		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
-		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
+		if (type & MSR_TYPE_R)
+			/* read-high */
+			__clear_bit(msr, msr_bitmap + 0x400 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-high */
+			__clear_bit(msr, msr_bitmap + 0xc00 / f);
+
+	}
+}
+
+static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+						u32 msr, int type)
+{
+	int f = sizeof(unsigned long);
+
+	if (!cpu_has_vmx_msr_bitmap())
+		return;
+
+	/*
+	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+	 * have the write-low and read-high bitmap offsets the wrong way round.
+	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+	 */
+	if (msr <= 0x1fff) {
+		if (type & MSR_TYPE_R)
+			/* read-low */
+			__set_bit(msr, msr_bitmap + 0x000 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-low */
+			__set_bit(msr, msr_bitmap + 0x800 / f);
+
+	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+		msr &= 0x1fff;
+		if (type & MSR_TYPE_R)
+			/* read-high */
+			__set_bit(msr, msr_bitmap + 0x400 / f);
+
+		if (type & MSR_TYPE_W)
+			/* write-high */
+			__set_bit(msr, msr_bitmap + 0xc00 / f);
+
 	}
 }

 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 {
 	if (!longmode_only)
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+						msr, MSR_TYPE_R | MSR_TYPE_W);
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+					msr, MSR_TYPE_R | MSR_TYPE_W);
+}
+
+static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
+{
+	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+			msr, MSR_TYPE_R);
+	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+			msr, MSR_TYPE_R);
+}
+
+static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
+{
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+			msr, MSR_TYPE_R);
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+			msr, MSR_TYPE_R);
+}
+
+static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
+{
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+			msr, MSR_TYPE_W);
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+			msr, MSR_TYPE_W);
 }

 /*
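
As an aside to the comment carried over above (read-low at offset 0x000, read-high at 0x400, write-low at 0x800, write-high at 0xc00, each quarter covering 0x2000 MSR indices), here is a small sketch of the byte/bit arithmetic behind the bitmap, for reference only and not part of the patch:

#include <stdint.h>

/* Illustrative position of an MSR's read-intercept bit inside the
 * 4 KiB VMX MSR bitmap page, following the quadrant layout above. */
struct msr_bitmap_pos {
	uint32_t byte;		/* offset into the bitmap page */
	uint32_t bit;		/* bit within that byte */
};

static struct msr_bitmap_pos msr_read_intercept_pos(uint32_t msr)
{
	uint32_t base = (msr <= 0x1fff) ? 0x000 : 0x400;	/* read-low : read-high */
	uint32_t idx = msr & 0x1fff;
	struct msr_bitmap_pos pos = { base + idx / 8, idx % 8 };

	return pos;
}
/* Example: the read intercept for MSR_EFER (0xc0000080) sits at
 * byte 0x400 + 0x80/8 = 0x410, bit 0. */
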
@@ -3812,6 +3950,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 	return exec_control;
 }

+static int vmx_vm_has_apicv(struct kvm *kvm)
+{
+	return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
+}
+
 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 {
 	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
@@ -3829,6 +3972,10 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 	if (!ple_gap)
 		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+	if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
+				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 	return exec_control;
 }

@@ -3873,6 +4020,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 				vmx_secondary_exec_control(vmx));
 	}

+	if (enable_apicv_reg_vid) {
+		vmcs_write64(EOI_EXIT_BITMAP0, 0);
+		vmcs_write64(EOI_EXIT_BITMAP1, 0);
+		vmcs_write64(EOI_EXIT_BITMAP2, 0);
+		vmcs_write64(EOI_EXIT_BITMAP3, 0);
+
+		vmcs_write16(GUEST_INTR_STATUS, 0);
+	}
+
 	if (ple_gap) {
 		vmcs_write32(PLE_GAP, ple_gap);
 		vmcs_write32(PLE_WINDOW, ple_window);
@@ -4787,6 +4943,26 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
 	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 }

+static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
+{
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	int vector = exit_qualification & 0xff;
+
+	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
+	kvm_apic_set_eoi_accelerated(vcpu, vector);
+	return 1;
+}
+
+static int handle_apic_write(struct kvm_vcpu *vcpu)
+{
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	u32 offset = exit_qualification & 0xfff;
+
+	/* APIC-write VM exit is trap-like and thus no need to adjust IP */
+	kvm_apic_write_nodecode(vcpu, offset);
+	return 1;
+}
+
 static int handle_task_switch(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5721,6 +5897,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMON] = handle_vmon,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
+	[EXIT_REASON_APIC_WRITE] = handle_apic_write,
+	[EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
 	[EXIT_REASON_WBINVD] = handle_wbinvd,
 	[EXIT_REASON_XSETBV] = handle_xsetbv,
 	[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
@@ -6070,6 +6248,85 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 		vmcs_write32(TPR_THRESHOLD, irr);
 }

+static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+{
+	u32 sec_exec_control;
+
+	/*
+	 * There is not point to enable virtualize x2apic without enable
+	 * apicv
+	 */
+	if (!cpu_has_vmx_virtualize_x2apic_mode() ||
+	    !vmx_vm_has_apicv(vcpu->kvm))
+		return;
+
+	if (!vm_need_tpr_shadow(vcpu->kvm))
+		return;
+
+	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+	if (set) {
+		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+	} else {
+		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+	}
+	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
+
+	vmx_set_msr_bitmap(vcpu);
+}
+
+static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
+{
+	u16 status;
+	u8 old;
+
+	if (!vmx_vm_has_apicv(kvm))
+		return;
+
+	if (isr == -1)
+		isr = 0;
+
+	status = vmcs_read16(GUEST_INTR_STATUS);
+	old = status >> 8;
+	if (isr != old) {
+		status &= 0xff;
+		status |= isr << 8;
+		vmcs_write16(GUEST_INTR_STATUS, status);
+	}
+}
+
+static void vmx_set_rvi(int vector)
+{
+	u16 status;
+	u8 old;
+
+	status = vmcs_read16(GUEST_INTR_STATUS);
+	old = (u8)status & 0xff;
+	if ((u8)vector != old) {
+		status &= ~0xff;
+		status |= (u8)vector;
+		vmcs_write16(GUEST_INTR_STATUS, status);
+	}
+}
+
+static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+{
+	if (max_irr == -1)
+		return;
+
+	vmx_set_rvi(max_irr);
+}
+
+static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
+	vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
+	vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
+	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
+}
+
 static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
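
vmx_set_rvi() and vmx_hwapic_isr_update() above treat GUEST_INTR_STATUS as a pair of bytes: the requesting virtual interrupt (RVI) in bits 7:0 and the servicing virtual interrupt (SVI) in bits 15:8. A minimal sketch of that packing, with no VMCS access and not part of the patch:

#include <stdint.h>

/* GUEST_INTR_STATUS as used above: RVI in bits 7:0, SVI in bits 15:8. */
static uint16_t pack_guest_intr_status(uint8_t rvi, uint8_t svi)
{
	return (uint16_t)svi << 8 | rvi;
}

static uint16_t update_rvi(uint16_t status, uint8_t vector)
{
	/* Mirrors vmx_set_rvi(): replace only the low byte. */
	return (status & ~0xff) | vector;
}
/* Example: status 0x0000 with max_irr 0x31 becomes 0x0031; a later
 * ISR update to 0x31 turns it into 0x3131. */
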
@@ -7333,6 +7590,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
7333 .enable_nmi_window = enable_nmi_window, 7590 .enable_nmi_window = enable_nmi_window,
7334 .enable_irq_window = enable_irq_window, 7591 .enable_irq_window = enable_irq_window,
7335 .update_cr8_intercept = update_cr8_intercept, 7592 .update_cr8_intercept = update_cr8_intercept,
7593 .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
7594 .vm_has_apicv = vmx_vm_has_apicv,
7595 .load_eoi_exitmap = vmx_load_eoi_exitmap,
7596 .hwapic_irr_update = vmx_hwapic_irr_update,
7597 .hwapic_isr_update = vmx_hwapic_isr_update,
7336 7598
7337 .set_tss_addr = vmx_set_tss_addr, 7599 .set_tss_addr = vmx_set_tss_addr,
7338 .get_tdp_level = get_ept_level, 7600 .get_tdp_level = get_ept_level,
@@ -7365,7 +7627,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7365 7627
7366static int __init vmx_init(void) 7628static int __init vmx_init(void)
7367{ 7629{
7368 int r, i; 7630 int r, i, msr;
7369 7631
7370 rdmsrl_safe(MSR_EFER, &host_efer); 7632 rdmsrl_safe(MSR_EFER, &host_efer);
7371 7633
@@ -7386,11 +7648,19 @@ static int __init vmx_init(void)
7386 if (!vmx_msr_bitmap_legacy) 7648 if (!vmx_msr_bitmap_legacy)
7387 goto out1; 7649 goto out1;
7388 7650
7651 vmx_msr_bitmap_legacy_x2apic =
7652 (unsigned long *)__get_free_page(GFP_KERNEL);
7653 if (!vmx_msr_bitmap_legacy_x2apic)
7654 goto out2;
7389 7655
7390 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); 7656 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
7391 if (!vmx_msr_bitmap_longmode) 7657 if (!vmx_msr_bitmap_longmode)
7392 goto out2; 7658 goto out3;
7393 7659
7660 vmx_msr_bitmap_longmode_x2apic =
7661 (unsigned long *)__get_free_page(GFP_KERNEL);
7662 if (!vmx_msr_bitmap_longmode_x2apic)
7663 goto out4;
7394 7664
7395 /* 7665 /*
7396 * Allow direct access to the PC debug port (it is often used for I/O 7666 * Allow direct access to the PC debug port (it is often used for I/O
@@ -7422,6 +7692,28 @@ static int __init vmx_init(void)
7422 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); 7692 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
7423 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); 7693 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
7424 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 7694 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
7695 memcpy(vmx_msr_bitmap_legacy_x2apic,
7696 vmx_msr_bitmap_legacy, PAGE_SIZE);
7697 memcpy(vmx_msr_bitmap_longmode_x2apic,
7698 vmx_msr_bitmap_longmode, PAGE_SIZE);
7699
7700 if (enable_apicv_reg_vid) {
7701 for (msr = 0x800; msr <= 0x8ff; msr++)
7702 vmx_disable_intercept_msr_read_x2apic(msr);
7703
7704			/* According to the SDM, in x2apic mode the whole ID
7705			 * register is used, but KVM only uses its highest eight
7706			 * bits, so reads of it still need to be intercepted. */
7707 vmx_enable_intercept_msr_read_x2apic(0x802);
7708 /* TMCCT */
7709 vmx_enable_intercept_msr_read_x2apic(0x839);
7710 /* TPR */
7711 vmx_disable_intercept_msr_write_x2apic(0x808);
7712 /* EOI */
7713 vmx_disable_intercept_msr_write_x2apic(0x80b);
7714 /* SELF-IPI */
7715 vmx_disable_intercept_msr_write_x2apic(0x83f);
7716 }
7425 7717
7426 if (enable_ept) { 7718 if (enable_ept) {
7427 kvm_mmu_set_mask_ptes(0ull, 7719 kvm_mmu_set_mask_ptes(0ull,
@@ -7435,8 +7727,10 @@ static int __init vmx_init(void)
7435 7727
7436 return 0; 7728 return 0;
7437 7729
7438out3: 7730out4:
7439 free_page((unsigned long)vmx_msr_bitmap_longmode); 7731 free_page((unsigned long)vmx_msr_bitmap_longmode);
7732out3:
7733 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
7440out2: 7734out2:
7441 free_page((unsigned long)vmx_msr_bitmap_legacy); 7735 free_page((unsigned long)vmx_msr_bitmap_legacy);
7442out1: 7736out1:
@@ -7448,6 +7742,8 @@ out:
7448 7742
7449static void __exit vmx_exit(void) 7743static void __exit vmx_exit(void)
7450{ 7744{
7745 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
7746 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
7451 free_page((unsigned long)vmx_msr_bitmap_legacy); 7747 free_page((unsigned long)vmx_msr_bitmap_legacy);
7452 free_page((unsigned long)vmx_msr_bitmap_longmode); 7748 free_page((unsigned long)vmx_msr_bitmap_longmode);
7453 free_page((unsigned long)vmx_io_bitmap_b); 7749 free_page((unsigned long)vmx_io_bitmap_b);
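The x2apic passthrough setup in the hunks above relies on the VMX MSR-bitmap layout: a single 4 KiB page whose first 1024 bytes form the read bitmap for MSRs 0x00000000-0x00001fff (one bit per MSR, so the x2APIC range 0x800-0x8ff lands here) and whose bytes at offset 0x800 form the matching write bitmap. A minimal sketch of that addressing, assuming that layout; the helper names are illustrative, not the vmx_{enable,disable}_intercept_* helpers the hunk actually calls:

/* Sketch only: make reads or writes of a low MSR (0x0-0x1fff) pass through
 * by clearing its bit in the VMX MSR bitmap; setting the bit would intercept.
 * The write bitmap for the same MSR range sits 0x800 bytes into the page. */
static void example_msr_read_passthrough(unsigned long *msr_bitmap, u32 msr)
{
	int f = sizeof(unsigned long);

	if (msr <= 0x1fff)
		__clear_bit(msr, msr_bitmap + 0x000 / f);	/* read-low bitmap */
}

static void example_msr_write_passthrough(unsigned long *msr_bitmap, u32 msr)
{
	int f = sizeof(unsigned long);

	if (msr <= 0x1fff)
		__clear_bit(msr, msr_bitmap + 0x800 / f);	/* write-low bitmap */
}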
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b9f55299ed7e..373e17a0d398 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -870,8 +870,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
870 870
871 kvm_x86_ops->set_efer(vcpu, efer); 871 kvm_x86_ops->set_efer(vcpu, efer);
872 872
873 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
874
875 /* Update reserved bits */ 873 /* Update reserved bits */
876 if ((efer ^ old_efer) & EFER_NX) 874 if ((efer ^ old_efer) & EFER_NX)
877 kvm_mmu_reset_context(vcpu); 875 kvm_mmu_reset_context(vcpu);
@@ -5565,7 +5563,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
5565 vcpu->arch.nmi_injected = true; 5563 vcpu->arch.nmi_injected = true;
5566 kvm_x86_ops->set_nmi(vcpu); 5564 kvm_x86_ops->set_nmi(vcpu);
5567 } 5565 }
5568 } else if (kvm_cpu_has_interrupt(vcpu)) { 5566 } else if (kvm_cpu_has_injectable_intr(vcpu)) {
5569 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 5567 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
5570 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 5568 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
5571 false); 5569 false);
@@ -5633,6 +5631,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
5633#endif 5631#endif
5634} 5632}
5635 5633
5634static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
5635{
5636 u64 eoi_exit_bitmap[4];
5637
5638 memset(eoi_exit_bitmap, 0, 32);
5639
5640 kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
5641 kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
5642}
5643
5636static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5644static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5637{ 5645{
5638 int r; 5646 int r;
@@ -5686,6 +5694,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5686 kvm_handle_pmu_event(vcpu); 5694 kvm_handle_pmu_event(vcpu);
5687 if (kvm_check_request(KVM_REQ_PMI, vcpu)) 5695 if (kvm_check_request(KVM_REQ_PMI, vcpu))
5688 kvm_deliver_pmi(vcpu); 5696 kvm_deliver_pmi(vcpu);
5697 if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
5698 update_eoi_exitmap(vcpu);
5689 } 5699 }
5690 5700
5691 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 5701 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -5694,10 +5704,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5694 /* enable NMI/IRQ window open exits if needed */ 5704 /* enable NMI/IRQ window open exits if needed */
5695 if (vcpu->arch.nmi_pending) 5705 if (vcpu->arch.nmi_pending)
5696 kvm_x86_ops->enable_nmi_window(vcpu); 5706 kvm_x86_ops->enable_nmi_window(vcpu);
5697 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 5707 else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
5698 kvm_x86_ops->enable_irq_window(vcpu); 5708 kvm_x86_ops->enable_irq_window(vcpu);
5699 5709
5700 if (kvm_lapic_enabled(vcpu)) { 5710 if (kvm_lapic_enabled(vcpu)) {
5711 /*
5712 * Update architecture specific hints for APIC
5713 * virtual interrupt delivery.
5714 */
5715 if (kvm_x86_ops->hwapic_irr_update)
5716 kvm_x86_ops->hwapic_irr_update(vcpu,
5717 kvm_lapic_find_highest_irr(vcpu));
5701 update_cr8_intercept(vcpu); 5718 update_cr8_intercept(vcpu);
5702 kvm_lapic_sync_to_vapic(vcpu); 5719 kvm_lapic_sync_to_vapic(vcpu);
5703 } 5720 }
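The new update_eoi_exitmap() above builds a 256-bit map (u64 eoi_exit_bitmap[4], one bit per interrupt vector) and hands it to load_eoi_exitmap(), which the VMX backend writes into EOI_EXIT_BITMAP0-3. A small sketch of the index math, with a hypothetical helper name; the real map is filled by kvm_calculate_eoi_exitmap() on the lapic side:

/* Sketch only: mark a vector as requiring an EOI-induced exit.  Vectors
 * 0-63 map to EOI_EXIT_BITMAP0, 64-127 to EOI_EXIT_BITMAP1, and so on. */
static inline void example_mark_eoi_exit(u64 *eoi_exit_bitmap, u8 vector)
{
	eoi_exit_bitmap[vector >> 6] |= 1ULL << (vector & 0x3f);
}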
diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c
index 2edd94af131c..3217dfe5cb8b 100644
--- a/drivers/s390/kvm/virtio_ccw.c
+++ b/drivers/s390/kvm/virtio_ccw.c
@@ -244,9 +244,9 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
244{ 244{
245 struct virtio_ccw_device *vcdev = to_vc_device(vdev); 245 struct virtio_ccw_device *vcdev = to_vc_device(vdev);
246 int err; 246 int err;
247 struct virtqueue *vq; 247 struct virtqueue *vq = NULL;
248 struct virtio_ccw_vq_info *info; 248 struct virtio_ccw_vq_info *info;
249 unsigned long size; 249 unsigned long size = 0; /* silence the compiler */
250 unsigned long flags; 250 unsigned long flags;
251 251
252 /* Allocate queue. */ 252 /* Allocate queue. */
@@ -279,11 +279,8 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
279 /* For now, we fail if we can't get the requested size. */ 279 /* For now, we fail if we can't get the requested size. */
280 dev_warn(&vcdev->cdev->dev, "no vq\n"); 280 dev_warn(&vcdev->cdev->dev, "no vq\n");
281 err = -ENOMEM; 281 err = -ENOMEM;
282 free_pages_exact(info->queue, size);
283 goto out_err; 282 goto out_err;
284 } 283 }
285 info->vq = vq;
286 vq->priv = info;
287 284
288 /* Register it with the host. */ 285 /* Register it with the host. */
289 info->info_block->queue = (__u64)info->queue; 286 info->info_block->queue = (__u64)info->queue;
@@ -297,12 +294,12 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
297 err = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_VQ | i); 294 err = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_VQ | i);
298 if (err) { 295 if (err) {
299 dev_warn(&vcdev->cdev->dev, "SET_VQ failed\n"); 296 dev_warn(&vcdev->cdev->dev, "SET_VQ failed\n");
300 free_pages_exact(info->queue, size);
301 info->vq = NULL;
302 vq->priv = NULL;
303 goto out_err; 297 goto out_err;
304 } 298 }
305 299
300 info->vq = vq;
301 vq->priv = info;
302
306 /* Save it to our list. */ 303 /* Save it to our list. */
307 spin_lock_irqsave(&vcdev->lock, flags); 304 spin_lock_irqsave(&vcdev->lock, flags);
308 list_add(&info->node, &vcdev->virtqueues); 305 list_add(&info->node, &vcdev->virtqueues);
@@ -311,8 +308,13 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
311 return vq; 308 return vq;
312 309
313out_err: 310out_err:
314 if (info) 311 if (vq)
312 vring_del_virtqueue(vq);
313 if (info) {
314 if (info->queue)
315 free_pages_exact(info->queue, size);
315 kfree(info->info_block); 316 kfree(info->info_block);
317 }
316 kfree(info); 318 kfree(info);
317 return ERR_PTR(err); 319 return ERR_PTR(err);
318} 320}
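The virtio_ccw_setup_vq() rework above replaces per-call-site cleanup with one out_err path; the pattern works because every resource starts from a known-empty value, so the error path can tear down only what was actually set up. A generic sketch of the idiom under those assumptions (example_ctx and example_setup are illustrative, not virtio_ccw code):

/* Sketch only: initialize resources to an empty state, then let a single
 * error label free whatever was allocated before the failure. */
struct example_ctx {
	void *queue;
};

static struct example_ctx *example_setup(size_t size)
{
	struct example_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	void *queue = NULL;

	if (!ctx)
		return ERR_PTR(-ENOMEM);

	queue = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
	if (!queue)
		goto out_err;

	ctx->queue = queue;
	return ctx;

out_err:
	if (queue)				/* frees only what exists */
		free_pages_exact(queue, size);
	kfree(ctx);
	return ERR_PTR(-ENOMEM);
}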
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4dd7d7531e69..0350e0d5e031 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -123,6 +123,7 @@ static inline bool is_error_page(struct page *page)
123#define KVM_REQ_MASTERCLOCK_UPDATE 19 123#define KVM_REQ_MASTERCLOCK_UPDATE 19
124#define KVM_REQ_MCLOCK_INPROGRESS 20 124#define KVM_REQ_MCLOCK_INPROGRESS 20
125#define KVM_REQ_EPR_EXIT 21 125#define KVM_REQ_EPR_EXIT 21
126#define KVM_REQ_EOIBITMAP 22
126 127
127#define KVM_USERSPACE_IRQ_SOURCE_ID 0 128#define KVM_USERSPACE_IRQ_SOURCE_ID 0
128#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 129#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
@@ -538,6 +539,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
538void kvm_flush_remote_tlbs(struct kvm *kvm); 539void kvm_flush_remote_tlbs(struct kvm *kvm);
539void kvm_reload_remote_mmus(struct kvm *kvm); 540void kvm_reload_remote_mmus(struct kvm *kvm);
540void kvm_make_mclock_inprogress_request(struct kvm *kvm); 541void kvm_make_mclock_inprogress_request(struct kvm *kvm);
542void kvm_make_update_eoibitmap_request(struct kvm *kvm);
541 543
542long kvm_arch_dev_ioctl(struct file *filp, 544long kvm_arch_dev_ioctl(struct file *filp,
543 unsigned int ioctl, unsigned long arg); 545 unsigned int ioctl, unsigned long arg);
@@ -691,6 +693,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
691int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); 693int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level);
692int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, 694int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
693 int irq_source_id, int level); 695 int irq_source_id, int level);
696bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
694void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); 697void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
695void kvm_register_irq_ack_notifier(struct kvm *kvm, 698void kvm_register_irq_ack_notifier(struct kvm *kvm,
696 struct kvm_irq_ack_notifier *kian); 699 struct kvm_irq_ack_notifier *kian);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0533496b6228..01edad9b5d71 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4316,7 +4316,10 @@ EXPORT_SYMBOL(yield);
4316 * It's the caller's job to ensure that the target task struct 4316 * It's the caller's job to ensure that the target task struct
4317 * can't go away on us before we can do any checks. 4317 * can't go away on us before we can do any checks.
4318 * 4318 *
4319 * Returns true if we indeed boosted the target task. 4319 * Returns:
4320 * true (>0) if we indeed boosted the target task.
4321 * false (0) if we failed to boost the target.
4322 * -ESRCH if there's no task to yield to.
4320 */ 4323 */
4321bool __sched yield_to(struct task_struct *p, bool preempt) 4324bool __sched yield_to(struct task_struct *p, bool preempt)
4322{ 4325{
@@ -4330,6 +4333,15 @@ bool __sched yield_to(struct task_struct *p, bool preempt)
4330 4333
4331again: 4334again:
4332 p_rq = task_rq(p); 4335 p_rq = task_rq(p);
4336 /*
4337 * If we're the only runnable task on the rq and target rq also
4338 * has only one task, there's absolutely no point in yielding.
4339 */
4340 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4341 yielded = -ESRCH;
4342 goto out_irq;
4343 }
4344
4333 double_rq_lock(rq, p_rq); 4345 double_rq_lock(rq, p_rq);
4334 while (task_rq(p) != p_rq) { 4346 while (task_rq(p) != p_rq) {
4335 double_rq_unlock(rq, p_rq); 4347 double_rq_unlock(rq, p_rq);
@@ -4337,13 +4349,13 @@ again:
4337 } 4349 }
4338 4350
4339 if (!curr->sched_class->yield_to_task) 4351 if (!curr->sched_class->yield_to_task)
4340 goto out; 4352 goto out_unlock;
4341 4353
4342 if (curr->sched_class != p->sched_class) 4354 if (curr->sched_class != p->sched_class)
4343 goto out; 4355 goto out_unlock;
4344 4356
4345 if (task_running(p_rq, p) || p->state) 4357 if (task_running(p_rq, p) || p->state)
4346 goto out; 4358 goto out_unlock;
4347 4359
4348 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4360 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4349 if (yielded) { 4361 if (yielded) {
@@ -4356,11 +4368,12 @@ again:
4356 resched_task(p_rq->curr); 4368 resched_task(p_rq->curr);
4357 } 4369 }
4358 4370
4359out: 4371out_unlock:
4360 double_rq_unlock(rq, p_rq); 4372 double_rq_unlock(rq, p_rq);
4373out_irq:
4361 local_irq_restore(flags); 4374 local_irq_restore(flags);
4362 4375
4363 if (yielded) 4376 if (yielded > 0)
4364 schedule(); 4377 schedule();
4365 4378
4366 return yielded; 4379 return yielded;
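With the updated comment, yield_to() has a documented tri-state contract: greater than zero when the target was boosted, zero when boosting failed, and -ESRCH when there is no one worth yielding to. A caller-side sketch of consuming that contract, assuming the documented return values; pick_next_candidate(), the retry budget, and the loop itself are illustrative only (the kvm_main.c hunk later in this diff applies the same pattern through kvm_vcpu_yield_to()):

/* Sketch only: interpret yield_to()'s tri-state result. */
extern struct task_struct *pick_next_candidate(void);	/* hypothetical helper */

static void example_boost_one(void)
{
	struct task_struct *p;
	int yielded, tries = 3;

	while ((p = pick_next_candidate()) != NULL) {
		yielded = yield_to(p, true);
		if (yielded > 0)
			break;			/* boosted the target, done */
		if (yielded < 0 && !--tries)
			break;			/* repeated -ESRCH: stop trying */
		/* yielded == 0: this candidate could not be boosted, move on */
	}
}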
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index f3abbef46c42..ce82b9401958 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -35,6 +35,7 @@
35#include <linux/hrtimer.h> 35#include <linux/hrtimer.h>
36#include <linux/io.h> 36#include <linux/io.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/export.h>
38#include <asm/processor.h> 39#include <asm/processor.h>
39#include <asm/page.h> 40#include <asm/page.h>
40#include <asm/current.h> 41#include <asm/current.h>
@@ -115,6 +116,42 @@ static void update_handled_vectors(struct kvm_ioapic *ioapic)
115 smp_wmb(); 116 smp_wmb();
116} 117}
117 118
119void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
120 u64 *eoi_exit_bitmap)
121{
122 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
123 union kvm_ioapic_redirect_entry *e;
124 struct kvm_lapic_irq irqe;
125 int index;
126
127 spin_lock(&ioapic->lock);
128	/* traverse the ioapic entries to set the eoi exit bitmap */
129 for (index = 0; index < IOAPIC_NUM_PINS; index++) {
130 e = &ioapic->redirtbl[index];
131 if (!e->fields.mask &&
132 (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
133 kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC,
134 index))) {
135 irqe.dest_id = e->fields.dest_id;
136 irqe.vector = e->fields.vector;
137 irqe.dest_mode = e->fields.dest_mode;
138 irqe.delivery_mode = e->fields.delivery_mode << 8;
139 kvm_calculate_eoi_exitmap(vcpu, &irqe, eoi_exit_bitmap);
140 }
141 }
142 spin_unlock(&ioapic->lock);
143}
144EXPORT_SYMBOL_GPL(kvm_ioapic_calculate_eoi_exitmap);
145
146void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm)
147{
148 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
149
150 if (!kvm_apic_vid_enabled(kvm) || !ioapic)
151 return;
152 kvm_make_update_eoibitmap_request(kvm);
153}
154
118static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) 155static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
119{ 156{
120 unsigned index; 157 unsigned index;
@@ -156,6 +193,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
156 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG 193 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
157 && ioapic->irr & (1 << index)) 194 && ioapic->irr & (1 << index))
158 ioapic_service(ioapic, index); 195 ioapic_service(ioapic, index);
196 kvm_ioapic_make_eoibitmap_request(ioapic->kvm);
159 break; 197 break;
160 } 198 }
161} 199}
@@ -455,6 +493,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
455 spin_lock(&ioapic->lock); 493 spin_lock(&ioapic->lock);
456 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); 494 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
457 update_handled_vectors(ioapic); 495 update_handled_vectors(ioapic);
496 kvm_ioapic_make_eoibitmap_request(kvm);
458 spin_unlock(&ioapic->lock); 497 spin_unlock(&ioapic->lock);
459 return 0; 498 return 0;
460} 499}
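kvm_ioapic_calculate_eoi_exitmap() above walks the redirection table and includes a pin only when it is unmasked and either level-triggered or covered by a registered ack notifier, since those are the EOIs that must still trap with virtual interrupt delivery enabled. The per-pin test, pulled out as a predicate for readability (this helper does not exist in the kernel; it is a sketch of the condition in the loop):

static bool example_pin_needs_eoi_exit(struct kvm *kvm,
				       union kvm_ioapic_redirect_entry *e,
				       int pin)
{
	if (e->fields.mask)
		return false;
	return e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
	       kvm_irq_has_notifier(kvm, KVM_IRQCHIP_IOAPIC, pin);
}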
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index a30abfe6ed16..0400a466c50c 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -82,5 +82,9 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
82 struct kvm_lapic_irq *irq); 82 struct kvm_lapic_irq *irq);
83int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 83int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
84int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 84int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
85void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm);
86void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
87 u64 *eoi_exit_bitmap);
88
85 89
86#endif 90#endif
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 4a340cb23013..72a130bc448a 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -76,7 +76,9 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
76 gfn = slot->base_gfn; 76 gfn = slot->base_gfn;
77 end_gfn = gfn + slot->npages; 77 end_gfn = gfn + slot->npages;
78 78
79 flags = IOMMU_READ | IOMMU_WRITE; 79 flags = IOMMU_READ;
80 if (!(slot->flags & KVM_MEM_READONLY))
81 flags |= IOMMU_WRITE;
80 if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY) 82 if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
81 flags |= IOMMU_CACHE; 83 flags |= IOMMU_CACHE;
82 84
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 656fa455e154..ff6d40e2c06d 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/export.h>
25#include <trace/events/kvm.h> 26#include <trace/events/kvm.h>
26 27
27#include <asm/msidef.h> 28#include <asm/msidef.h>
@@ -237,6 +238,28 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
237 return ret; 238 return ret;
238} 239}
239 240
241bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
242{
243 struct kvm_irq_ack_notifier *kian;
244 struct hlist_node *n;
245 int gsi;
246
247 rcu_read_lock();
248 gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
249 if (gsi != -1)
250 hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list,
251 link)
252 if (kian->gsi == gsi) {
253 rcu_read_unlock();
254 return true;
255 }
256
257 rcu_read_unlock();
258
259 return false;
260}
261EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
262
240void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) 263void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
241{ 264{
242 struct kvm_irq_ack_notifier *kian; 265 struct kvm_irq_ack_notifier *kian;
@@ -261,6 +284,7 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm,
261 mutex_lock(&kvm->irq_lock); 284 mutex_lock(&kvm->irq_lock);
262 hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); 285 hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
263 mutex_unlock(&kvm->irq_lock); 286 mutex_unlock(&kvm->irq_lock);
287 kvm_ioapic_make_eoibitmap_request(kvm);
264} 288}
265 289
266void kvm_unregister_irq_ack_notifier(struct kvm *kvm, 290void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
@@ -270,6 +294,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
270 hlist_del_init_rcu(&kian->link); 294 hlist_del_init_rcu(&kian->link);
271 mutex_unlock(&kvm->irq_lock); 295 mutex_unlock(&kvm->irq_lock);
272 synchronize_rcu(); 296 synchronize_rcu();
297 kvm_ioapic_make_eoibitmap_request(kvm);
273} 298}
274 299
275int kvm_request_irq_source_id(struct kvm *kvm) 300int kvm_request_irq_source_id(struct kvm *kvm)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5e709ebb7c40..2e93630b4add 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -217,6 +217,11 @@ void kvm_make_mclock_inprogress_request(struct kvm *kvm)
217 make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); 217 make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
218} 218}
219 219
220void kvm_make_update_eoibitmap_request(struct kvm *kvm)
221{
222 make_all_cpus_request(kvm, KVM_REQ_EOIBITMAP);
223}
224
220int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 225int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
221{ 226{
222 struct page *page; 227 struct page *page;
@@ -714,6 +719,24 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
714} 719}
715 720
716/* 721/*
722 * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations:
723 * - create a new memory slot
724 * - delete an existing memory slot
725 * - modify an existing memory slot
726 * -- move it in the guest physical memory space
727 * -- just change its flags
728 *
729 * Since flags can be changed by some of these operations, the following
730 * differentiation is the best we can do for __kvm_set_memory_region():
731 */
732enum kvm_mr_change {
733 KVM_MR_CREATE,
734 KVM_MR_DELETE,
735 KVM_MR_MOVE,
736 KVM_MR_FLAGS_ONLY,
737};
738
739/*
717 * Allocate some memory and give it an address in the guest physical address 740 * Allocate some memory and give it an address in the guest physical address
718 * space. 741 * space.
719 * 742 *
@@ -731,6 +754,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
731 struct kvm_memory_slot *slot; 754 struct kvm_memory_slot *slot;
732 struct kvm_memory_slot old, new; 755 struct kvm_memory_slot old, new;
733 struct kvm_memslots *slots = NULL, *old_memslots; 756 struct kvm_memslots *slots = NULL, *old_memslots;
757 enum kvm_mr_change change;
734 758
735 r = check_memory_region_flags(mem); 759 r = check_memory_region_flags(mem);
736 if (r) 760 if (r)
@@ -772,17 +796,31 @@ int __kvm_set_memory_region(struct kvm *kvm,
772 new.npages = npages; 796 new.npages = npages;
773 new.flags = mem->flags; 797 new.flags = mem->flags;
774 798
775 /*
776 * Disallow changing a memory slot's size or changing anything about
777 * zero sized slots that doesn't involve making them non-zero.
778 */
779 r = -EINVAL; 799 r = -EINVAL;
780 if (npages && old.npages && npages != old.npages) 800 if (npages) {
781 goto out; 801 if (!old.npages)
782 if (!npages && !old.npages) 802 change = KVM_MR_CREATE;
803 else { /* Modify an existing slot. */
804 if ((mem->userspace_addr != old.userspace_addr) ||
805 (npages != old.npages) ||
806 ((new.flags ^ old.flags) & KVM_MEM_READONLY))
807 goto out;
808
809 if (base_gfn != old.base_gfn)
810 change = KVM_MR_MOVE;
811 else if (new.flags != old.flags)
812 change = KVM_MR_FLAGS_ONLY;
813 else { /* Nothing to change. */
814 r = 0;
815 goto out;
816 }
817 }
818 } else if (old.npages) {
819 change = KVM_MR_DELETE;
820 } else /* Modify a non-existent slot: disallowed. */
783 goto out; 821 goto out;
784 822
785 if ((npages && !old.npages) || (base_gfn != old.base_gfn)) { 823 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
786 /* Check for overlaps */ 824 /* Check for overlaps */
787 r = -EEXIST; 825 r = -EEXIST;
788 kvm_for_each_memslot(slot, kvm->memslots) { 826 kvm_for_each_memslot(slot, kvm->memslots) {
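The rewritten block above replaces the old ad-hoc size checks with an explicit classification: creating a slot, deleting one, moving its base_gfn, or changing only its flags, with size, userspace_addr and read-only transitions rejected outright. A sketch of that decision expressed as a pure helper, under the assumption that old and new are the slots being compared (__kvm_set_memory_region() does this inline; example_classify is illustrative):

/* Sketch only: returns 0 and sets *change on success, -EINVAL for a
 * disallowed transition, and 1 when there is nothing to change. */
static int example_classify(const struct kvm_memory_slot *old,
			    const struct kvm_memory_slot *new,
			    enum kvm_mr_change *change)
{
	if (!new->npages) {
		if (!old->npages)
			return -EINVAL;		/* modify a non-existent slot */
		*change = KVM_MR_DELETE;
		return 0;
	}
	if (!old->npages) {
		*change = KVM_MR_CREATE;
		return 0;
	}
	if (new->userspace_addr != old->userspace_addr ||
	    new->npages != old->npages ||
	    ((new->flags ^ old->flags) & KVM_MEM_READONLY))
		return -EINVAL;			/* size/addr/readonly cannot change */
	if (new->base_gfn != old->base_gfn)
		*change = KVM_MR_MOVE;
	else if (new->flags != old->flags)
		*change = KVM_MR_FLAGS_ONLY;
	else
		return 1;			/* nothing to change */
	return 0;
}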
@@ -800,20 +838,12 @@ int __kvm_set_memory_region(struct kvm *kvm,
800 new.dirty_bitmap = NULL; 838 new.dirty_bitmap = NULL;
801 839
802 r = -ENOMEM; 840 r = -ENOMEM;
803 841 if (change == KVM_MR_CREATE) {
804 /*
805 * Allocate if a slot is being created. If modifying a slot,
806 * the userspace_addr cannot change.
807 */
808 if (!old.npages) {
809 new.user_alloc = user_alloc; 842 new.user_alloc = user_alloc;
810 new.userspace_addr = mem->userspace_addr; 843 new.userspace_addr = mem->userspace_addr;
811 844
812 if (kvm_arch_create_memslot(&new, npages)) 845 if (kvm_arch_create_memslot(&new, npages))
813 goto out_free; 846 goto out_free;
814 } else if (npages && mem->userspace_addr != old.userspace_addr) {
815 r = -EINVAL;
816 goto out_free;
817 } 847 }
818 848
819 /* Allocate page dirty bitmap if needed */ 849 /* Allocate page dirty bitmap if needed */
@@ -822,7 +852,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
822 goto out_free; 852 goto out_free;
823 } 853 }
824 854
825 if (!npages || base_gfn != old.base_gfn) { 855 if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
826 r = -ENOMEM; 856 r = -ENOMEM;
827 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), 857 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
828 GFP_KERNEL); 858 GFP_KERNEL);
@@ -863,15 +893,23 @@ int __kvm_set_memory_region(struct kvm *kvm,
863 goto out_free; 893 goto out_free;
864 } 894 }
865 895
866 /* map new memory slot into the iommu */ 896 /*
867 if (npages) { 897 * IOMMU mapping: New slots need to be mapped. Old slots need to be
898 * un-mapped and re-mapped if their base changes. Since base change
899 * unmapping is handled above with slot deletion, mapping alone is
900 * needed here. Anything else the iommu might care about for existing
901 * slots (size changes, userspace addr changes and read-only flag
902 * changes) is disallowed above, so any other attribute changes getting
903 * here can be skipped.
904 */
905 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
868 r = kvm_iommu_map_pages(kvm, &new); 906 r = kvm_iommu_map_pages(kvm, &new);
869 if (r) 907 if (r)
870 goto out_slots; 908 goto out_slots;
871 } 909 }
872 910
873 /* actual memory is freed via old in kvm_free_physmem_slot below */ 911 /* actual memory is freed via old in kvm_free_physmem_slot below */
874 if (!npages) { 912 if (change == KVM_MR_DELETE) {
875 new.dirty_bitmap = NULL; 913 new.dirty_bitmap = NULL;
876 memset(&new.arch, 0, sizeof(new.arch)); 914 memset(&new.arch, 0, sizeof(new.arch));
877 } 915 }
@@ -1669,6 +1707,7 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
1669{ 1707{
1670 struct pid *pid; 1708 struct pid *pid;
1671 struct task_struct *task = NULL; 1709 struct task_struct *task = NULL;
1710 bool ret = false;
1672 1711
1673 rcu_read_lock(); 1712 rcu_read_lock();
1674 pid = rcu_dereference(target->pid); 1713 pid = rcu_dereference(target->pid);
@@ -1676,17 +1715,15 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
1676 task = get_pid_task(target->pid, PIDTYPE_PID); 1715 task = get_pid_task(target->pid, PIDTYPE_PID);
1677 rcu_read_unlock(); 1716 rcu_read_unlock();
1678 if (!task) 1717 if (!task)
1679 return false; 1718 return ret;
1680 if (task->flags & PF_VCPU) { 1719 if (task->flags & PF_VCPU) {
1681 put_task_struct(task); 1720 put_task_struct(task);
1682 return false; 1721 return ret;
1683 }
1684 if (yield_to(task, 1)) {
1685 put_task_struct(task);
1686 return true;
1687 } 1722 }
1723 ret = yield_to(task, 1);
1688 put_task_struct(task); 1724 put_task_struct(task);
1689 return false; 1725
1726 return ret;
1690} 1727}
1691EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 1728EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
1692 1729
@@ -1727,12 +1764,14 @@ bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
1727 return eligible; 1764 return eligible;
1728} 1765}
1729#endif 1766#endif
1767
1730void kvm_vcpu_on_spin(struct kvm_vcpu *me) 1768void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1731{ 1769{
1732 struct kvm *kvm = me->kvm; 1770 struct kvm *kvm = me->kvm;
1733 struct kvm_vcpu *vcpu; 1771 struct kvm_vcpu *vcpu;
1734 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 1772 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
1735 int yielded = 0; 1773 int yielded = 0;
1774 int try = 3;
1736 int pass; 1775 int pass;
1737 int i; 1776 int i;
1738 1777
@@ -1744,7 +1783,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1744 * VCPU is holding the lock that we need and will release it. 1783 * VCPU is holding the lock that we need and will release it.
1745 * We approximate round-robin by starting at the last boosted VCPU. 1784 * We approximate round-robin by starting at the last boosted VCPU.
1746 */ 1785 */
1747 for (pass = 0; pass < 2 && !yielded; pass++) { 1786 for (pass = 0; pass < 2 && !yielded && try; pass++) {
1748 kvm_for_each_vcpu(i, vcpu, kvm) { 1787 kvm_for_each_vcpu(i, vcpu, kvm) {
1749 if (!pass && i <= last_boosted_vcpu) { 1788 if (!pass && i <= last_boosted_vcpu) {
1750 i = last_boosted_vcpu; 1789 i = last_boosted_vcpu;
@@ -1757,10 +1796,15 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1757 continue; 1796 continue;
1758 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 1797 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
1759 continue; 1798 continue;
1760 if (kvm_vcpu_yield_to(vcpu)) { 1799
1800 yielded = kvm_vcpu_yield_to(vcpu);
1801 if (yielded > 0) {
1761 kvm->last_boosted_vcpu = i; 1802 kvm->last_boosted_vcpu = i;
1762 yielded = 1;
1763 break; 1803 break;
1804 } else if (yielded < 0) {
1805 try--;
1806 if (!try)
1807 break;
1764 } 1808 }
1765 } 1809 }
1766 } 1810 }