aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaul Mackerras <paulus@ozlabs.org>2019-08-13 06:06:48 -0400
committerMichael Ellerman <mpe@ellerman.id.au>2019-08-16 00:16:59 -0400
commitda15c03b047dca891d37b9f4ef9ca14d84a6484f (patch)
tree2ea7be961ab577536dfded9c88a117386c316646
parent8d4ba9c931bc384bcc6889a43915aaaf19d3e499 (diff)
powerpc/xive: Implement get_irqchip_state method for XIVE to fix shutdown race
Testing has revealed the existence of a race condition where a XIVE interrupt being shut down can be in one of the XIVE interrupt queues (of which there are up to 8 per CPU, one for each priority) at the point where free_irq() is called. If this happens, the interrupt fetch code (xive_get_irq()) can return an interrupt number which has been shut down. This can lead to various symptoms: - irq_to_desc(irq) can be NULL. In this case, no end-of-interrupt function gets called, resulting in the CPU's elevated interrupt priority (numerically lowered CPPR) never getting reset. That then means that the CPU stops processing interrupts, causing device timeouts and other errors in various device drivers. - The irq descriptor or related data structures can be in the process of being freed as the interrupt code is using them. This typically leads to crashes due to bad pointer dereferences. This race is basically what commit 62e0468650c3 ("genirq: Add optional hardware synchronization for shutdown", 2019-06-28) is intended to fix, given a get_irqchip_state() method for the interrupt controller being used. It works by polling the interrupt controller when an interrupt is being freed until the controller says it is not pending. With XIVE, the PQ bits of the interrupt source indicate the state of the interrupt source, and in particular the P bit goes from 0 to 1 at the point where the hardware writes an entry into the interrupt queue that this interrupt is directed towards. Normally, the code will then process the interrupt and do an end-of-interrupt (EOI) operation which will reset PQ to 00 (assuming another interrupt hasn't been generated in the meantime). However, there are situations where the code resets P even though a queue entry exists (for example, by setting PQ to 01, which disables the interrupt source), and also situations where the code leaves P at 1 after removing the queue entry (for example, this is done for escalation interrupts so they cannot fire again until they are explicitly re-enabled). 
The code already has a 'saved_p' flag for the interrupt source which indicates that a queue entry exists, although it isn't maintained consistently. This patch adds a 'stale_p' flag to indicate that P has been left at 1 after processing a queue entry, and adds code to set and clear saved_p and stale_p as necessary to maintain a consistent indication of whether a queue entry may or may not exist. With this, we can implement xive_get_irqchip_state() by looking at stale_p, saved_p and the ESB PQ bits for the interrupt. There is some additional code to handle escalation interrupts properly, because they are enabled and disabled in KVM assembly code, which does not have access to the xive_irq_data struct for the escalation interrupt. Hence, stale_p may be incorrect when the escalation interrupt is freed in kvmppc_xive_{,native_}cleanup_vcpu(). Fortunately, we can fix it up by looking at vcpu->arch.xive_esc_on, with some careful attention to barriers in order to ensure the correct result if xive_esc_irq() races with kvmppc_xive_cleanup_vcpu(). Finally, this adds code to make noise on the console (pr_crit and WARN_ON(1)) if we find an interrupt queue entry for an interrupt which does not have a descriptor. While this won't catch the race reliably, if it does get triggered it will be an indication that the race is occurring and needs to be debugged. Fixes: 243e25112d06 ("powerpc/xive: Native exploitation of the XIVE interrupt controller") Cc: stable@vger.kernel.org # v4.12+ Signed-off-by: Paul Mackerras <paulus@ozlabs.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20190813100648.GE9567@blackberry
-rw-r--r--arch/powerpc/include/asm/xive.h8
-rw-r--r--arch/powerpc/kvm/book3s_xive.c31
-rw-r--r--arch/powerpc/kvm/book3s_xive.h2
-rw-r--r--arch/powerpc/kvm/book3s_xive_native.c3
-rw-r--r--arch/powerpc/sysdev/xive/common.c87
5 files changed, 108 insertions, 23 deletions
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index e4016985764e..efb0e597b272 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -46,7 +46,15 @@ struct xive_irq_data {
46 46
47 /* Setup/used by frontend */ 47 /* Setup/used by frontend */
48 int target; 48 int target;
49 /*
50 * saved_p means that there is a queue entry for this interrupt
51 * in some CPU's queue (not including guest vcpu queues), even
52 * if P is not set in the source ESB.
53 * stale_p means that there is no queue entry for this interrupt
54 * in some CPU's queue, even if P is set in the source ESB.
55 */
49 bool saved_p; 56 bool saved_p;
57 bool stale_p;
50}; 58};
51#define XIVE_IRQ_FLAG_STORE_EOI 0x01 59#define XIVE_IRQ_FLAG_STORE_EOI 0x01
52#define XIVE_IRQ_FLAG_LSI 0x02 60#define XIVE_IRQ_FLAG_LSI 0x02
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 586867e46e51..591bfb4bfd0f 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -166,6 +166,9 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
166 */ 166 */
167 vcpu->arch.xive_esc_on = false; 167 vcpu->arch.xive_esc_on = false;
168 168
169 /* This orders xive_esc_on = false vs. subsequent stale_p = true */
170 smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */
171
169 return IRQ_HANDLED; 172 return IRQ_HANDLED;
170} 173}
171 174
@@ -1119,6 +1122,31 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
1119 vcpu->arch.xive_esc_raddr = 0; 1122 vcpu->arch.xive_esc_raddr = 0;
1120} 1123}
1121 1124
1125/*
1126 * In single escalation mode, the escalation interrupt is marked so
1127 * that EOI doesn't re-enable it, but just sets the stale_p flag to
1128 * indicate that the P bit has already been dealt with. However, the
1129 * assembly code that enters the guest sets PQ to 00 without clearing
1130 * stale_p (because it has no easy way to address it). Hence we have
1131 * to adjust stale_p before shutting down the interrupt.
1132 */
1133void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
1134 struct kvmppc_xive_vcpu *xc, int irq)
1135{
1136 struct irq_data *d = irq_get_irq_data(irq);
1137 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
1138
1139 /*
1140 * This slightly odd sequence gives the right result
1141 * (i.e. stale_p set if xive_esc_on is false) even if
1142 * we race with xive_esc_irq() and xive_irq_eoi().
1143 */
1144 xd->stale_p = false;
1145 smp_mb(); /* paired with smb_wmb in xive_esc_irq */
1146 if (!vcpu->arch.xive_esc_on)
1147 xd->stale_p = true;
1148}
1149
1122void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) 1150void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
1123{ 1151{
1124 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 1152 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
@@ -1143,6 +1171,9 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
1143 /* Free escalations */ 1171 /* Free escalations */
1144 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { 1172 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
1145 if (xc->esc_virq[i]) { 1173 if (xc->esc_virq[i]) {
1174 if (xc->xive->single_escalation)
1175 xive_cleanup_single_escalation(vcpu, xc,
1176 xc->esc_virq[i]);
1146 free_irq(xc->esc_virq[i], vcpu); 1177 free_irq(xc->esc_virq[i], vcpu);
1147 irq_dispose_mapping(xc->esc_virq[i]); 1178 irq_dispose_mapping(xc->esc_virq[i]);
1148 kfree(xc->esc_virq_names[i]); 1179 kfree(xc->esc_virq_names[i]);
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
index 50494d0ee375..955b820ffd6d 100644
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -282,6 +282,8 @@ int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio);
282int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, 282int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
283 bool single_escalation); 283 bool single_escalation);
284struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type); 284struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type);
285void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
286 struct kvmppc_xive_vcpu *xc, int irq);
285 287
286#endif /* CONFIG_KVM_XICS */ 288#endif /* CONFIG_KVM_XICS */
287#endif /* _KVM_PPC_BOOK3S_XICS_H */ 289#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index 11b91b46fc39..f0cab43e6f4b 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -71,6 +71,9 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
71 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { 71 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
72 /* Free the escalation irq */ 72 /* Free the escalation irq */
73 if (xc->esc_virq[i]) { 73 if (xc->esc_virq[i]) {
74 if (xc->xive->single_escalation)
75 xive_cleanup_single_escalation(vcpu, xc,
76 xc->esc_virq[i]);
74 free_irq(xc->esc_virq[i], vcpu); 77 free_irq(xc->esc_virq[i], vcpu);
75 irq_dispose_mapping(xc->esc_virq[i]); 78 irq_dispose_mapping(xc->esc_virq[i]);
76 kfree(xc->esc_virq_names[i]); 79 kfree(xc->esc_virq_names[i]);
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 1cdb39575eae..be86fce1a84e 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -135,7 +135,7 @@ static u32 xive_read_eq(struct xive_q *q, bool just_peek)
135static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) 135static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
136{ 136{
137 u32 irq = 0; 137 u32 irq = 0;
138 u8 prio; 138 u8 prio = 0;
139 139
140 /* Find highest pending priority */ 140 /* Find highest pending priority */
141 while (xc->pending_prio != 0) { 141 while (xc->pending_prio != 0) {
@@ -148,8 +148,19 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
148 irq = xive_read_eq(&xc->queue[prio], just_peek); 148 irq = xive_read_eq(&xc->queue[prio], just_peek);
149 149
150 /* Found something ? That's it */ 150 /* Found something ? That's it */
151 if (irq) 151 if (irq) {
152 break; 152 if (just_peek || irq_to_desc(irq))
153 break;
154 /*
155 * We should never get here; if we do then we must
156 * have failed to synchronize the interrupt properly
157 * when shutting it down.
158 */
159 pr_crit("xive: got interrupt %d without descriptor, dropping\n",
160 irq);
161 WARN_ON(1);
162 continue;
163 }
153 164
154 /* Clear pending bits */ 165 /* Clear pending bits */
155 xc->pending_prio &= ~(1 << prio); 166 xc->pending_prio &= ~(1 << prio);
@@ -307,6 +318,7 @@ static void xive_do_queue_eoi(struct xive_cpu *xc)
307 */ 318 */
308static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) 319static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
309{ 320{
321 xd->stale_p = false;
310 /* If the XIVE supports the new "store EOI facility, use it */ 322 /* If the XIVE supports the new "store EOI facility, use it */
311 if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) 323 if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
312 xive_esb_write(xd, XIVE_ESB_STORE_EOI, 0); 324 xive_esb_write(xd, XIVE_ESB_STORE_EOI, 0);
@@ -350,7 +362,7 @@ static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
350 } 362 }
351} 363}
352 364
353/* irq_chip eoi callback */ 365/* irq_chip eoi callback, called with irq descriptor lock held */
354static void xive_irq_eoi(struct irq_data *d) 366static void xive_irq_eoi(struct irq_data *d)
355{ 367{
356 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 368 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
@@ -366,6 +378,8 @@ static void xive_irq_eoi(struct irq_data *d)
366 if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d) && 378 if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d) &&
367 !(xd->flags & XIVE_IRQ_NO_EOI)) 379 !(xd->flags & XIVE_IRQ_NO_EOI))
368 xive_do_source_eoi(irqd_to_hwirq(d), xd); 380 xive_do_source_eoi(irqd_to_hwirq(d), xd);
381 else
382 xd->stale_p = true;
369 383
370 /* 384 /*
371 * Clear saved_p to indicate that it's no longer occupying 385 * Clear saved_p to indicate that it's no longer occupying
@@ -397,11 +411,16 @@ static void xive_do_source_set_mask(struct xive_irq_data *xd,
397 */ 411 */
398 if (mask) { 412 if (mask) {
399 val = xive_esb_read(xd, XIVE_ESB_SET_PQ_01); 413 val = xive_esb_read(xd, XIVE_ESB_SET_PQ_01);
400 xd->saved_p = !!(val & XIVE_ESB_VAL_P); 414 if (!xd->stale_p && !!(val & XIVE_ESB_VAL_P))
401 } else if (xd->saved_p) 415 xd->saved_p = true;
416 xd->stale_p = false;
417 } else if (xd->saved_p) {
402 xive_esb_read(xd, XIVE_ESB_SET_PQ_10); 418 xive_esb_read(xd, XIVE_ESB_SET_PQ_10);
403 else 419 xd->saved_p = false;
420 } else {
404 xive_esb_read(xd, XIVE_ESB_SET_PQ_00); 421 xive_esb_read(xd, XIVE_ESB_SET_PQ_00);
422 xd->stale_p = false;
423 }
405} 424}
406 425
407/* 426/*
@@ -541,6 +560,8 @@ static unsigned int xive_irq_startup(struct irq_data *d)
541 unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); 560 unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
542 int target, rc; 561 int target, rc;
543 562
563 xd->saved_p = false;
564 xd->stale_p = false;
544 pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n", 565 pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n",
545 d->irq, hw_irq, d); 566 d->irq, hw_irq, d);
546 567
@@ -587,6 +608,7 @@ static unsigned int xive_irq_startup(struct irq_data *d)
587 return 0; 608 return 0;
588} 609}
589 610
611/* called with irq descriptor lock held */
590static void xive_irq_shutdown(struct irq_data *d) 612static void xive_irq_shutdown(struct irq_data *d)
591{ 613{
592 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 614 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
@@ -602,16 +624,6 @@ static void xive_irq_shutdown(struct irq_data *d)
602 xive_do_source_set_mask(xd, true); 624 xive_do_source_set_mask(xd, true);
603 625
604 /* 626 /*
605 * The above may have set saved_p. We clear it otherwise it
606 * will prevent re-enabling later on. It is ok to forget the
607 * fact that the interrupt might be in a queue because we are
608 * accounting that already in xive_dec_target_count() and will
609 * be re-routing it to a new queue with proper accounting when
610 * it's started up again
611 */
612 xd->saved_p = false;
613
614 /*
615 * Mask the interrupt in HW in the IVT/EAS and set the number 627 * Mask the interrupt in HW in the IVT/EAS and set the number
616 * to be the "bad" IRQ number 628 * to be the "bad" IRQ number
617 */ 629 */
@@ -797,6 +809,10 @@ static int xive_irq_retrigger(struct irq_data *d)
797 return 1; 809 return 1;
798} 810}
799 811
812/*
813 * Caller holds the irq descriptor lock, so this won't be called
814 * concurrently with xive_get_irqchip_state on the same interrupt.
815 */
800static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) 816static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
801{ 817{
802 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 818 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
@@ -820,6 +836,10 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
820 836
821 /* Set it to PQ=10 state to prevent further sends */ 837 /* Set it to PQ=10 state to prevent further sends */
822 pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_10); 838 pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_10);
839 if (!xd->stale_p) {
840 xd->saved_p = !!(pq & XIVE_ESB_VAL_P);
841 xd->stale_p = !xd->saved_p;
842 }
823 843
824 /* No target ? nothing to do */ 844 /* No target ? nothing to do */
825 if (xd->target == XIVE_INVALID_TARGET) { 845 if (xd->target == XIVE_INVALID_TARGET) {
@@ -827,7 +847,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
827 * An untargetted interrupt should have been 847 * An untargetted interrupt should have been
828 * also masked at the source 848 * also masked at the source
829 */ 849 */
830 WARN_ON(pq & 2); 850 WARN_ON(xd->saved_p);
831 851
832 return 0; 852 return 0;
833 } 853 }
@@ -847,9 +867,8 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
847 * This saved_p is cleared by the host EOI, when we know 867 * This saved_p is cleared by the host EOI, when we know
848 * for sure the queue slot is no longer in use. 868 * for sure the queue slot is no longer in use.
849 */ 869 */
850 if (pq & 2) { 870 if (xd->saved_p) {
851 pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_11); 871 xive_esb_read(xd, XIVE_ESB_SET_PQ_11);
852 xd->saved_p = true;
853 872
854 /* 873 /*
855 * Sync the XIVE source HW to ensure the interrupt 874 * Sync the XIVE source HW to ensure the interrupt
@@ -862,8 +881,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
862 */ 881 */
863 if (xive_ops->sync_source) 882 if (xive_ops->sync_source)
864 xive_ops->sync_source(hw_irq); 883 xive_ops->sync_source(hw_irq);
865 } else 884 }
866 xd->saved_p = false;
867 } else { 885 } else {
868 irqd_clr_forwarded_to_vcpu(d); 886 irqd_clr_forwarded_to_vcpu(d);
869 887
@@ -914,6 +932,23 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
914 return 0; 932 return 0;
915} 933}
916 934
935/* Called with irq descriptor lock held. */
936static int xive_get_irqchip_state(struct irq_data *data,
937 enum irqchip_irq_state which, bool *state)
938{
939 struct xive_irq_data *xd = irq_data_get_irq_handler_data(data);
940
941 switch (which) {
942 case IRQCHIP_STATE_ACTIVE:
943 *state = !xd->stale_p &&
944 (xd->saved_p ||
945 !!(xive_esb_read(xd, XIVE_ESB_GET) & XIVE_ESB_VAL_P));
946 return 0;
947 default:
948 return -EINVAL;
949 }
950}
951
917static struct irq_chip xive_irq_chip = { 952static struct irq_chip xive_irq_chip = {
918 .name = "XIVE-IRQ", 953 .name = "XIVE-IRQ",
919 .irq_startup = xive_irq_startup, 954 .irq_startup = xive_irq_startup,
@@ -925,6 +960,7 @@ static struct irq_chip xive_irq_chip = {
925 .irq_set_type = xive_irq_set_type, 960 .irq_set_type = xive_irq_set_type,
926 .irq_retrigger = xive_irq_retrigger, 961 .irq_retrigger = xive_irq_retrigger,
927 .irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity, 962 .irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity,
963 .irq_get_irqchip_state = xive_get_irqchip_state,
928}; 964};
929 965
930bool is_xive_irq(struct irq_chip *chip) 966bool is_xive_irq(struct irq_chip *chip)
@@ -1338,6 +1374,11 @@ static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc)
1338 xd = irq_desc_get_handler_data(desc); 1374 xd = irq_desc_get_handler_data(desc);
1339 1375
1340 /* 1376 /*
1377 * Clear saved_p to indicate that it's no longer pending
1378 */
1379 xd->saved_p = false;
1380
1381 /*
1341 * For LSIs, we EOI, this will cause a resend if it's 1382 * For LSIs, we EOI, this will cause a resend if it's
1342 * still asserted. Otherwise do an MSI retrigger. 1383 * still asserted. Otherwise do an MSI retrigger.
1343 */ 1384 */