author	Linus Torvalds <torvalds@linux-foundation.org>	2017-09-15 18:43:55 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-09-15 18:43:55 -0400
commit	9db59599ae502b38b27cff6462273f84acd59927 (patch)
tree	96d90a2f7bcddc837987579ad2d3e58b891db716
parent	b38923a068c10fc36ca8f596d650d095ce390b85 (diff)
parent	4f350c6dbcb9000e18907515ec8a7b205ac33c69 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull more KVM updates from Paolo Bonzini:

 - PPC bugfixes

 - RCU splat fix

 - swait races fix

 - pointless userspace-triggerable BUG() fix

 - misc fixes for KVM_RUN corner cases

 - nested virt correctness fixes + one host DoS

 - some cleanups

 - clang build fix

 - fix AMD AVIC with default QEMU command line options

 - x86 bugfixes

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (28 commits)
  kvm: nVMX: Handle deferred early VMLAUNCH/VMRESUME failure properly
  kvm: vmx: Handle VMLAUNCH/VMRESUME failure properly
  kvm: nVMX: Remove nested_vmx_succeed after successful VM-entry
  kvm,mips: Fix potential swait_active() races
  kvm,powerpc: Serialize wq active checks in ops->vcpu_kick
  kvm: Serialize wq active checks in kvm_vcpu_wake_up()
  kvm,x86: Fix apf_task_wake_one() wq serialization
  kvm,lapic: Justify use of swait_active()
  kvm,async_pf: Use swq_has_sleeper()
  sched/wait: Add swq_has_sleeper()
  KVM: VMX: Do not BUG() on out-of-bounds guest IRQ
  KVM: Don't accept obviously wrong gsi values via KVM_IRQFD
  kvm: nVMX: Don't allow L2 to access the hardware CR8
  KVM: trace events: update list of exit reasons
  KVM: async_pf: Fix #DF due to inject "Page not Present" and "Page Ready" exceptions simultaneously
  KVM: X86: Don't block vCPU if there is pending exception
  KVM: SVM: Add irqchip_split() checks before enabling AVIC
  KVM: Add struct kvm_vcpu pointer parameter to get_enable_apicv()
  KVM: SVM: Refactor AVIC vcpu initialization into avic_init_vcpu()
  KVM: x86: fix clang build
  ...
-rw-r--r--	arch/mips/kvm/mips.c	4
-rw-r--r--	arch/powerpc/kvm/book3s_hv.c	4
-rw-r--r--	arch/powerpc/kvm/book3s_hv_rm_xive.c	1
-rw-r--r--	arch/powerpc/kvm/book3s_hv_rmhandlers.S	17
-rw-r--r--	arch/powerpc/kvm/book3s_xive.c	1
-rw-r--r--	arch/powerpc/kvm/book3s_xive_template.c	7
-rw-r--r--	arch/x86/include/asm/kvm_host.h	3
-rw-r--r--	arch/x86/kernel/kvm.c	2
-rw-r--r--	arch/x86/kvm/cpuid.h	1
-rw-r--r--	arch/x86/kvm/lapic.c	4
-rw-r--r--	arch/x86/kvm/svm.c	38
-rw-r--r--	arch/x86/kvm/vmx.c	162
-rw-r--r--	arch/x86/kvm/x86.c	51
-rw-r--r--	include/linux/swait.h	58
-rw-r--r--	include/trace/events/kvm.h	4
-rw-r--r--	virt/kvm/async_pf.c	6
-rw-r--r--	virt/kvm/eventfd.c	2
-rw-r--r--	virt/kvm/kvm_main.c	3
18 files changed, 257 insertions(+), 111 deletions(-)
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index bce2a6431430..d535edc01434 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -514,7 +514,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 
 	dvcpu->arch.wait = 0;
 
-	if (swait_active(&dvcpu->wq))
+	if (swq_has_sleeper(&dvcpu->wq))
 		swake_up(&dvcpu->wq);
 
 	return 0;
@@ -1179,7 +1179,7 @@ static void kvm_mips_comparecount_func(unsigned long data)
 	kvm_mips_callbacks->queue_timer_int(vcpu);
 
 	vcpu->arch.wait = 0;
-	if (swait_active(&vcpu->wq))
+	if (swq_has_sleeper(&vcpu->wq))
 		swake_up(&vcpu->wq);
 }
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 18e974a34fce..73bf1ebfa78f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -181,7 +181,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 	struct swait_queue_head *wqp;
 
 	wqp = kvm_arch_vcpu_wq(vcpu);
-	if (swait_active(wqp)) {
+	if (swq_has_sleeper(wqp)) {
 		swake_up(wqp);
 		++vcpu->stat.halt_wakeup;
 	}
@@ -4212,11 +4212,13 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
 	if ((cfg->process_table & PRTS_MASK) > 24)
 		return -EINVAL;
 
+	mutex_lock(&kvm->lock);
 	kvm->arch.process_table = cfg->process_table;
 	kvmppc_setup_partition_table(kvm);
 
 	lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
 	kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
+	mutex_unlock(&kvm->lock);
 
 	return 0;
 }
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c b/arch/powerpc/kvm/book3s_hv_rm_xive.c
index abf5f01b6eb1..5b81a807d742 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xive.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xive.c
@@ -38,7 +38,6 @@ static inline void __iomem *get_tima_phys(void)
 #define __x_tima		get_tima_phys()
 #define __x_eoi_page(xd)	((void __iomem *)((xd)->eoi_page))
 #define __x_trig_page(xd)	((void __iomem *)((xd)->trig_page))
-#define __x_readb	__raw_rm_readb
 #define __x_writeb	__raw_rm_writeb
 #define __x_readw	__raw_rm_readw
 #define __x_readq	__raw_rm_readq
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 663a4a861e7f..17936f82d3c7 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -771,6 +771,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
+	/*
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+	 */
 	bl	kvmppc_restore_tm
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
@@ -1630,6 +1633,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
+	/*
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+	 */
 	bl	kvmppc_save_tm
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
@@ -1749,7 +1755,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	/*
 	 * Are we running hash or radix ?
 	 */
-	beq	cr2,3f
+	ld	r5, VCPU_KVM(r9)
+	lbz	r0, KVM_RADIX(r5)
+	cmpwi	cr2, r0, 0
+	beq	cr2, 3f
 
 	/* Radix: Handle the case where the guest used an illegal PID */
 	LOAD_REG_ADDR(r4, mmu_base_pid)
@@ -2466,6 +2475,9 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
+	/*
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+	 */
 	ld	r9, HSTATE_KVM_VCPU(r13)
 	bl	kvmppc_save_tm
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
@@ -2578,6 +2590,9 @@ kvm_end_cede:
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
+	/*
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+	 */
 	bl	kvmppc_restore_tm
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 08b200a0bbce..13304622ab1c 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -48,7 +48,6 @@
 #define __x_tima		xive_tima
 #define __x_eoi_page(xd)	((void __iomem *)((xd)->eoi_mmio))
 #define __x_trig_page(xd)	((void __iomem *)((xd)->trig_mmio))
-#define __x_readb	__raw_readb
 #define __x_writeb	__raw_writeb
 #define __x_readw	__raw_readw
 #define __x_readq	__raw_readq
diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
index d1ed2c41b5d2..c7a5deadd1cc 100644
--- a/arch/powerpc/kvm/book3s_xive_template.c
+++ b/arch/powerpc/kvm/book3s_xive_template.c
@@ -28,7 +28,8 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc)
 	 * bit.
 	 */
 	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
-		u8 pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR);
+		__be64 qw1 = __x_readq(__x_tima + TM_QW1_OS);
+		u8 pipr = be64_to_cpu(qw1) & 0xff;
 		if (pipr >= xc->hw_cppr)
 			return;
 	}
@@ -336,7 +337,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long
 	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
 	u8 pending = xc->pending;
 	u32 hirq;
-	u8 pipr;
 
 	pr_devel("H_IPOLL(server=%ld)\n", server);
 
@@ -353,7 +353,8 @@ X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long
 		pending = 0xff;
 	} else {
 		/* Grab pending interrupt if any */
-		pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR);
+		__be64 qw1 = __x_readq(__x_tima + TM_QW1_OS);
+		u8 pipr = be64_to_cpu(qw1) & 0xff;
 		if (pipr < 8)
 			pending |= 1 << pipr;
 	}
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8844eee290b2..c73e493adf07 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -951,7 +951,6 @@ struct kvm_x86_ops {
 	void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
 	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
-	u32 (*get_pkru)(struct kvm_vcpu *vcpu);
 
 	void (*tlb_flush)(struct kvm_vcpu *vcpu);
 
@@ -973,7 +972,7 @@ struct kvm_x86_ops {
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
-	bool (*get_enable_apicv)(void);
+	bool (*get_enable_apicv)(struct kvm_vcpu *vcpu);
 	void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
 	void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
 	void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 874827b0d7ca..aa60a08b65b1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -180,7 +180,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
 	hlist_del_init(&n->link);
 	if (n->halted)
 		smp_send_reschedule(n->cpu);
-	else if (swait_active(&n->wq))
+	else if (swq_has_sleeper(&n->wq))
 		swake_up(&n->wq);
 }
 
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 1ea3c0e1e3a9..0bc5c1315708 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -59,7 +59,6 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
 {
 	unsigned x86_leaf = x86_feature / 32;
 
-	BUILD_BUG_ON(!__builtin_constant_p(x86_leaf));
 	BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
 	BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index aaf10b6f5380..69c5612be786 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1324,6 +1324,10 @@ static void apic_timer_expired(struct kvm_lapic *apic)
 	atomic_inc(&apic->lapic_timer.pending);
 	kvm_set_pending_timer(vcpu);
 
+	/*
+	 * For x86, the atomic_inc() is serialized, thus
+	 * using swait_active() is safe.
+	 */
 	if (swait_active(q))
 		swake_up(q);
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2c1cfe68a9af..0e68f0b3cbf7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1200,7 +1200,6 @@ static void avic_init_vmcb(struct vcpu_svm *svm)
 	vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
 	vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
 	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
-	svm->vcpu.arch.apicv_active = true;
 }
 
 static void init_vmcb(struct vcpu_svm *svm)
@@ -1316,7 +1315,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 		set_intercept(svm, INTERCEPT_PAUSE);
 	}
 
-	if (avic)
+	if (kvm_vcpu_apicv_active(&svm->vcpu))
 		avic_init_vmcb(svm);
 
 	/*
@@ -1600,6 +1599,23 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
 }
 
+static int avic_init_vcpu(struct vcpu_svm *svm)
+{
+	int ret;
+
+	if (!kvm_vcpu_apicv_active(&svm->vcpu))
+		return 0;
+
+	ret = avic_init_backing_page(&svm->vcpu);
+	if (ret)
+		return ret;
+
+	INIT_LIST_HEAD(&svm->ir_list);
+	spin_lock_init(&svm->ir_list_lock);
+
+	return ret;
+}
+
 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 {
 	struct vcpu_svm *svm;
@@ -1636,14 +1652,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	if (!hsave_page)
 		goto free_page3;
 
-	if (avic) {
-		err = avic_init_backing_page(&svm->vcpu);
-		if (err)
-			goto free_page4;
-
-		INIT_LIST_HEAD(&svm->ir_list);
-		spin_lock_init(&svm->ir_list_lock);
-	}
+	err = avic_init_vcpu(svm);
+	if (err)
+		goto free_page4;
 
 	/* We initialize this flag to true to make sure that the is_running
 	 * bit would be set the first time the vcpu is loaded.
@@ -4395,9 +4406,9 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 	return;
 }
 
-static bool svm_get_enable_apicv(void)
+static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu)
 {
-	return avic;
+	return avic && irqchip_split(vcpu->kvm);
 }
 
 static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
@@ -4414,7 +4425,7 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
 
-	if (!avic)
+	if (!kvm_vcpu_apicv_active(&svm->vcpu))
 		return;
 
 	vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
@@ -5302,6 +5313,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
 		 */
 		if (info->rep_prefix != REPE_PREFIX)
 			goto out;
+		break;
 	case SVM_EXIT_IOIO: {
 		u64 exit_info;
 		u32 bytes;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 699704d4bc9e..06c0c6d0541e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5012,7 +5012,7 @@ static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_activ
 	}
 }
 
-static bool vmx_get_enable_apicv(void)
+static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
 {
 	return enable_apicv;
 }
@@ -8344,12 +8344,14 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
-	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
-				vmcs_readl(EXIT_QUALIFICATION),
-				vmx->idt_vectoring_info,
-				intr_info,
-				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
-				KVM_ISA_VMX);
+	if (vmx->nested.nested_run_pending)
+		return false;
+
+	if (unlikely(vmx->fail)) {
+		pr_info_ratelimited("%s failed vm entry %x\n", __func__,
+				    vmcs_read32(VM_INSTRUCTION_ERROR));
+		return true;
+	}
 
 	/*
 	 * The host physical addresses of some pages of guest memory
@@ -8363,14 +8365,12 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 	 */
 	nested_mark_vmcs12_pages_dirty(vcpu);
 
-	if (vmx->nested.nested_run_pending)
-		return false;
-
-	if (unlikely(vmx->fail)) {
-		pr_info_ratelimited("%s failed vm entry %x\n", __func__,
-				    vmcs_read32(VM_INSTRUCTION_ERROR));
-		return true;
-	}
+	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
+				vmcs_readl(EXIT_QUALIFICATION),
+				vmx->idt_vectoring_info,
+				intr_info,
+				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+				KVM_ISA_VMX);
 
 	switch (exit_reason) {
 	case EXIT_REASON_EXCEPTION_NMI:
@@ -9424,12 +9424,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 				  | (1 << VCPU_EXREG_CR3));
 	vcpu->arch.regs_dirty = 0;
 
-	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-
-	vmx->loaded_vmcs->launched = 1;
-
-	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
-
 	/*
 	 * eager fpu is enabled if PKEY is supported and CR4 is switched
 	 * back on host, so it is safe to read guest PKRU from current
@@ -9451,6 +9445,14 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 	vmx->nested.nested_run_pending = 0;
+	vmx->idt_vectoring_info = 0;
+
+	vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
+	if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+		return;
+
+	vmx->loaded_vmcs->launched = 1;
+	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
 	vmx_complete_atomic_exit(vmx);
 	vmx_recover_nmi_blocking(vmx);
@@ -10525,6 +10527,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	if (exec_control & CPU_BASED_TPR_SHADOW) {
 		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
 		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+	} else {
+#ifdef CONFIG_X86_64
+		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+				CPU_BASED_CR8_STORE_EXITING;
+#endif
 	}
 
 	/*
@@ -11388,46 +11395,30 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	u32 vm_inst_error = 0;
 
 	/* trying to cancel vmlaunch/vmresume is a bug */
 	WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
+	/*
+	 * The only expected VM-instruction error is "VM entry with
+	 * invalid control field(s)." Anything else indicates a
+	 * problem with L0.
+	 */
+	WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
+				   VMXERR_ENTRY_INVALID_CONTROL_FIELD));
+
 	leave_guest_mode(vcpu);
-	prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
-		       exit_qualification);
 
-	if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
-				 vmcs12->vm_exit_msr_store_count))
-		nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
-
-	if (unlikely(vmx->fail))
-		vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+	if (likely(!vmx->fail)) {
+		prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
+			       exit_qualification);
+
+		if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
+					 vmcs12->vm_exit_msr_store_count))
+			nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+	}
 
 	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-
-	/*
-	 * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of
-	 * the VM-exit interrupt information (valid interrupt) is always set to
-	 * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need
-	 * kvm_cpu_has_interrupt(). See the commit message for details.
-	 */
-	if (nested_exit_intr_ack_set(vcpu) &&
-	    exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
-	    kvm_cpu_has_interrupt(vcpu)) {
-		int irq = kvm_cpu_get_interrupt(vcpu);
-		WARN_ON(irq < 0);
-		vmcs12->vm_exit_intr_info = irq |
-			INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
-	}
-
-	trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
-				       vmcs12->exit_qualification,
-				       vmcs12->idt_vectoring_info_field,
-				       vmcs12->vm_exit_intr_info,
-				       vmcs12->vm_exit_intr_error_code,
-				       KVM_ISA_VMX);
-
 	vm_entry_controls_reset_shadow(vmx);
 	vm_exit_controls_reset_shadow(vmx);
 	vmx_segment_cache_clear(vmx);
@@ -11436,8 +11427,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	if (VMCS02_POOL_SIZE == 0)
 		nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
 
-	load_vmcs12_host_state(vcpu, vmcs12);
-
 	/* Update any VMCS fields that might have changed while L2 ran */
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
@@ -11486,21 +11475,57 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	 */
 	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 
-	/*
-	 * Exiting from L2 to L1, we're now back to L1 which thinks it just
-	 * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
-	 * success or failure flag accordingly.
-	 */
-	if (unlikely(vmx->fail)) {
-		vmx->fail = 0;
-		nested_vmx_failValid(vcpu, vm_inst_error);
-	} else
-		nested_vmx_succeed(vcpu);
 	if (enable_shadow_vmcs)
 		vmx->nested.sync_shadow_vmcs = true;
 
 	/* in case we halted in L2 */
 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+	if (likely(!vmx->fail)) {
+		/*
+		 * TODO: SDM says that with acknowledge interrupt on
+		 * exit, bit 31 of the VM-exit interrupt information
+		 * (valid interrupt) is always set to 1 on
+		 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
+		 * need kvm_cpu_has_interrupt(). See the commit
+		 * message for details.
+		 */
+		if (nested_exit_intr_ack_set(vcpu) &&
+		    exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+		    kvm_cpu_has_interrupt(vcpu)) {
+			int irq = kvm_cpu_get_interrupt(vcpu);
+			WARN_ON(irq < 0);
+			vmcs12->vm_exit_intr_info = irq |
+				INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
+		}
+
+		trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+					       vmcs12->exit_qualification,
+					       vmcs12->idt_vectoring_info_field,
+					       vmcs12->vm_exit_intr_info,
+					       vmcs12->vm_exit_intr_error_code,
+					       KVM_ISA_VMX);
+
+		load_vmcs12_host_state(vcpu, vmcs12);
+
+		return;
+	}
+
+	/*
+	 * After an early L2 VM-entry failure, we're now back
+	 * in L1 which thinks it just finished a VMLAUNCH or
+	 * VMRESUME instruction, so we need to set the failure
+	 * flag and the VM-instruction error field of the VMCS
+	 * accordingly.
+	 */
+	nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+	/*
+	 * The emulated instruction was already skipped in
+	 * nested_vmx_run, but the updated RIP was never
+	 * written back to the vmcs01.
+	 */
+	skip_emulated_instruction(vcpu);
+	vmx->fail = 0;
 }
 
 /*
@@ -11829,7 +11854,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 	struct kvm_lapic_irq irq;
 	struct kvm_vcpu *vcpu;
 	struct vcpu_data vcpu_info;
-	int idx, ret = -EINVAL;
+	int idx, ret = 0;
 
 	if (!kvm_arch_has_assigned_device(kvm) ||
 	    !irq_remapping_cap(IRQ_POSTING_CAP) ||
@@ -11838,7 +11863,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 
 	idx = srcu_read_lock(&kvm->irq_srcu);
 	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-	BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+	if (guest_irq >= irq_rt->nr_rt_entries ||
+	    hlist_empty(&irq_rt->map[guest_irq])) {
+		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+			     guest_irq, irq_rt->nr_rt_entries);
+		goto out;
+	}
 
 	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
 		if (e->type != KVM_IRQ_ROUTING_MSI)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6069af86da3b..cd17b7d9a107 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7231,10 +7231,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
+		if (kvm_run->immediate_exit) {
+			r = -EINTR;
+			goto out;
+		}
 		kvm_vcpu_block(vcpu);
 		kvm_apic_accept_events(vcpu);
 		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 		r = -EAGAIN;
+		if (signal_pending(current)) {
+			r = -EINTR;
+			vcpu->run->exit_reason = KVM_EXIT_INTR;
+			++vcpu->stat.signal_exits;
+		}
 		goto out;
 	}
 
@@ -7971,7 +7980,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	BUG_ON(vcpu->kvm == NULL);
 	kvm = vcpu->kvm;
 
-	vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv();
+	vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
 	vcpu->arch.pv.pv_unhalted = false;
 	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
 	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
@@ -8452,6 +8461,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.pv.pv_unhalted)
 		return true;
 
+	if (vcpu->arch.exception.pending)
+		return true;
+
 	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
 	    (vcpu->arch.nmi_pending &&
 	     kvm_x86_ops->nmi_allowed(vcpu)))
@@ -8619,6 +8631,13 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
 				      sizeof(val));
 }
 
+static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
+{
+
+	return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
+				     sizeof(u32));
+}
+
 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 				     struct kvm_async_pf *work)
 {
@@ -8646,6 +8665,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 				 struct kvm_async_pf *work)
 {
 	struct x86_exception fault;
+	u32 val;
 
 	if (work->wakeup_all)
 		work->arch.token = ~0; /* broadcast wakeup */
@@ -8653,15 +8673,26 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 	kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
 	trace_kvm_async_pf_ready(work->arch.token, work->gva);
 
-	if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
-	    !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
-		fault.vector = PF_VECTOR;
-		fault.error_code_valid = true;
-		fault.error_code = 0;
-		fault.nested_page_fault = false;
-		fault.address = work->arch.token;
-		fault.async_page_fault = true;
-		kvm_inject_page_fault(vcpu, &fault);
+	if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
+	    !apf_get_user(vcpu, &val)) {
+		if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
+		    vcpu->arch.exception.pending &&
+		    vcpu->arch.exception.nr == PF_VECTOR &&
+		    !apf_put_user(vcpu, 0)) {
+			vcpu->arch.exception.injected = false;
+			vcpu->arch.exception.pending = false;
+			vcpu->arch.exception.nr = 0;
+			vcpu->arch.exception.has_error_code = false;
+			vcpu->arch.exception.error_code = 0;
+		} else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
+			fault.vector = PF_VECTOR;
+			fault.error_code_valid = true;
+			fault.error_code = 0;
+			fault.nested_page_fault = false;
+			fault.address = work->arch.token;
+			fault.async_page_fault = true;
+			kvm_inject_page_fault(vcpu, &fault);
+		}
 	}
 	vcpu->arch.apf.halted = false;
 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
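
Editor's note: the first x86.c hunk above makes the userspace-visible contract of KVM_RUN consistent: kvm_run->immediate_exit and pending signals now produce -EINTR even for a vCPU that is still in KVM_MP_STATE_UNINITIALIZED, instead of blocking. A rough userspace sketch of that contract follows; it assumes an x86 host kernel with KVM_CAP_IMMEDIATE_EXIT, and all error handling and cleanup are omitted for brevity (this is illustrative, not part of the patch).

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
	long sz = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
				   MAP_SHARED, vcpu, 0);

	/* Ask KVM_RUN to return immediately instead of entering the guest
	 * (or, with the hunk above, instead of blocking in kvm_vcpu_block()
	 * when the vCPU has not been started yet). */
	run->immediate_exit = 1;
	if (ioctl(vcpu, KVM_RUN, 0) < 0)
		perror("KVM_RUN");	/* expected: Interrupted system call */
	return 0;
}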
diff --git a/include/linux/swait.h b/include/linux/swait.h
index 4a4e180d0a35..73e97a08d3d0 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -79,9 +79,63 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name
 	DECLARE_SWAIT_QUEUE_HEAD(name)
 #endif
 
-static inline int swait_active(struct swait_queue_head *q)
+/**
+ * swait_active -- locklessly test for waiters on the queue
+ * @wq: the waitqueue to test for waiters
+ *
+ * returns true if the wait list is not empty
+ *
+ * NOTE: this function is lockless and requires care, incorrect usage _will_
+ * lead to sporadic and non-obvious failure.
+ *
+ * NOTE2: this function has the same above implications as regular waitqueues.
+ *
+ * Use either while holding swait_queue_head::lock or when used for wakeups
+ * with an extra smp_mb() like:
+ *
+ *      CPU0 - waker                    CPU1 - waiter
+ *
+ *                                      for (;;) {
+ *      @cond = true;                     prepare_to_swait(&wq_head, &wait, state);
+ *      smp_mb();                         // smp_mb() from set_current_state()
+ *      if (swait_active(wq_head))        if (@cond)
+ *        wake_up(wq_head);                 break;
+ *                                        schedule();
+ *                                      }
+ *                                      finish_swait(&wq_head, &wait);
+ *
+ * Because without the explicit smp_mb() it's possible for the
+ * swait_active() load to get hoisted over the @cond store such that we'll
+ * observe an empty wait list while the waiter might not observe @cond.
+ * This, in turn, can trigger missing wakeups.
+ *
+ * Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
+ * which (when the lock is uncontended) are of roughly equal cost.
+ */
+static inline int swait_active(struct swait_queue_head *wq)
+{
+	return !list_empty(&wq->task_list);
+}
+
+/**
+ * swq_has_sleeper - check if there are any waiting processes
+ * @wq: the waitqueue to test for waiters
+ *
+ * Returns true if @wq has waiting processes
+ *
+ * Please refer to the comment for swait_active.
+ */
+static inline bool swq_has_sleeper(struct swait_queue_head *wq)
 {
-	return !list_empty(&q->task_list);
+	/*
+	 * We need to be sure we are in sync with the list_add()
+	 * modifications to the wait queue (task_list).
+	 *
+	 * This memory barrier should be paired with one on the
+	 * waiting side.
+	 */
+	smp_mb();
+	return swait_active(wq);
 }
 
 extern void swake_up(struct swait_queue_head *q);
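
Editor's note: the new swq_has_sleeper() folds in the smp_mb() that callers such as async_pf_execute() previously had to supply by hand, which is why the swait_active() call sites in the KVM hunks above switch over to it. A minimal sketch of the waker/waiter pairing described in the kerneldoc comment follows; demo_wq, wakeup_cond, demo_wait() and demo_wake() are made-up names standing in for vcpu->wq, vcpu->arch.wait and friends, not part of the patch.

#include <linux/sched.h>
#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static bool wakeup_cond;

/* Waiter: sleep until wakeup_cond becomes true. */
static void demo_wait(void)
{
	DECLARE_SWAITQUEUE(wait);

	for (;;) {
		/* set_current_state() inside provides the waiter-side barrier */
		prepare_to_swait(&demo_wq, &wait, TASK_INTERRUPTIBLE);
		if (READ_ONCE(wakeup_cond))
			break;
		schedule();
	}
	finish_swait(&demo_wq, &wait);
}

/* Waker: swq_has_sleeper() issues the smp_mb() that keeps the queue
 * check from being hoisted above the store to wakeup_cond, so the
 * wakeup cannot be lost. */
static void demo_wake(void)
{
	WRITE_ONCE(wakeup_cond, true);
	if (swq_has_sleeper(&demo_wq))
		swake_up(&demo_wq);
}

The design choice is the one spelled out in the comment: a plain swait_active() check remains legal only where the store the waiter polls is already serialized (as the lapic.c hunk argues for atomic_inc()); everywhere else the barrier-carrying helper is the safe default.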
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 8ade3eb6c640..dcffedfac431 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -14,7 +14,9 @@
 	ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \
 	ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\
 	ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \
-	ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH)
+	ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\
+	ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \
+	ERSN(HYPERV)
 
 TRACE_EVENT(kvm_userspace_exit,
 	    TP_PROTO(__u32 reason, int errno),
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index bb298a200cd3..57bcb27dcf30 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -106,11 +106,7 @@ static void async_pf_execute(struct work_struct *work)
 
 	trace_kvm_async_pf_completed(addr, gva);
 
-	/*
-	 * This memory barrier pairs with prepare_to_wait's set_current_state()
-	 */
-	smp_mb();
-	if (swait_active(&vcpu->wq))
+	if (swq_has_sleeper(&vcpu->wq))
 		swake_up(&vcpu->wq);
 
 	mmput(mm);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index f2ac53ab8243..c608ab495282 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -565,6 +565,8 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
 {
 	if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
 		return -EINVAL;
+	if (args->gsi >= KVM_MAX_IRQ_ROUTES)
+		return -EINVAL;
 
 	if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
 		return kvm_irqfd_deassign(kvm, args);
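
Editor's note: this bounds check, together with the removal of the BUG_ON() in vmx_update_pi_irte() above, turns an obviously bogus gsi from userspace into a plain -EINVAL instead of a potential host BUG(). A hypothetical userspace probe of that behaviour follows; it is a sketch only, with no error handling, and assumes nothing beyond the documented KVM_IRQFD ioctl.

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	struct kvm_irqfd irqfd = {
		.fd = eventfd(0, 0),
		.gsi = 0xffffffff,	/* far past KVM_MAX_IRQ_ROUTES */
	};

	/* With the check above, this fails up front with EINVAL. */
	if (ioctl(vm, KVM_IRQFD, &irqfd) < 0)
		perror("KVM_IRQFD");	/* expected: Invalid argument */
	return 0;
}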
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6ed1c2021198..9deb5a245b83 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -674,6 +674,7 @@ out_err_no_irq_srcu:
 out_err_no_srcu:
 	hardware_disable_all();
 out_err_no_disable:
+	refcount_set(&kvm->users_count, 0);
 	for (i = 0; i < KVM_NR_BUSES; i++)
 		kfree(kvm_get_bus(kvm, i));
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
@@ -2186,7 +2187,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
 	struct swait_queue_head *wqp;
 
 	wqp = kvm_arch_vcpu_wq(vcpu);
-	if (swait_active(wqp)) {
+	if (swq_has_sleeper(wqp)) {
 		swake_up(wqp);
 		++vcpu->stat.halt_wakeup;
 		return true;