aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2016-03-14 04:40:46 -0400
committerThomas Gleixner <tglx@linutronix.de>2016-03-18 09:51:06 -0400
commit551adc60573cb68e3d55cacca9ba1b7437313df7 (patch)
treeee3fd31639e6e49cc73014ac3fcfc0915b8f7d0f
parentf508a5ba7a4570418df6cfd68fe663ffdef2be63 (diff)
x86/irq: Cure live lock in fixup_irqs()
Harry reported, that he's able to trigger a system freeze with cpu hot unplug. The freeze turned out to be a live lock caused by recent changes in irq_force_complete_move(). When fixup_irqs() and from there irq_force_complete_move() is called on the dying cpu, then all other cpus are in stop machine an wait for the dying cpu to complete the teardown. If there is a move of an interrupt pending then irq_force_complete_move() sends the cleanup IPI to the cpus in the old_domain mask and waits for them to clear the mask. That's obviously impossible as those cpus are firmly stuck in stop machine with interrupts disabled. I should have known that, but I completely overlooked it being concentrated on the locking issues around the vectors. And the existance of the call to __irq_complete_move() in the code, which actually sends the cleanup IPI made it reasonable to wait for that cleanup to complete. That call was bogus even before the recent changes as it was just a pointless distraction. We have to look at two cases: 1) The move_in_progress flag of the interrupt is set This means the ioapic has been updated with the new vector, but it has not fired yet. In theory there is a race: set_ioapic(new_vector) <-- Interrupt is raised before update is effective, i.e. it's raised on the old vector. So if the target cpu cannot handle that interrupt before the old vector is cleaned up, we get a spurious interrupt and in the worst case the ioapic irq line becomes stale, but my experiments so far have only resulted in spurious interrupts. But in case of cpu hotplug this should be a non issue because if the affinity update happens right before all cpus rendevouz in stop machine, there is no way that the interrupt can be blocked on the target cpu because all cpus loops first with interrupts enabled in stop machine, so the old vector is not yet cleaned up when the interrupt fires. So the only way to run into this issue is if the delivery of the interrupt on the apic/system bus would be delayed beyond the point where the target cpu disables interrupts in stop machine. I doubt that it can happen, but at least there is a theroretical chance. Virtualization might be able to expose this, but AFAICT the IOAPIC emulation is not as stupid as the real hardware. I've spent quite some time over the weekend to enforce that situation, though I was not able to trigger the delayed case. 2) The move_in_progress flag is not set and the old_domain cpu mask is not empty. That means, that an interrupt was delivered after the change and the cleanup IPI has been sent to the cpus in old_domain, but not all CPUs have responded to it yet. In both cases we can assume that the next interrupt will arrive on the new vector, so we can cleanup the old vectors on the cpus in the old_domain cpu mask. Fixes: 98229aa36caa "x86/irq: Plug vector cleanup race" Reported-by: Harry Junior <harryjr@outlook.fr> Tested-by: Tony Luck <tony.luck@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Joe Lawrence <joe.lawrence@stratus.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603140931430.3657@nanos Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-rw-r--r--arch/x86/include/asm/hw_irq.h1
-rw-r--r--arch/x86/kernel/apic/vector.c88
2 files changed, 71 insertions, 18 deletions
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index f564d2b2275f..b90e1053049b 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -141,6 +141,7 @@ struct irq_alloc_info {
141struct irq_cfg { 141struct irq_cfg {
142 unsigned int dest_apicid; 142 unsigned int dest_apicid;
143 u8 vector; 143 u8 vector;
144 u8 old_vector;
144}; 145};
145 146
146extern struct irq_cfg *irq_cfg(unsigned int irq); 147extern struct irq_cfg *irq_cfg(unsigned int irq);
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 3b670df4ba7b..ad59d70bcb1a 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -213,6 +213,7 @@ update:
213 */ 213 */
214 cpumask_and(d->old_domain, d->old_domain, cpu_online_mask); 214 cpumask_and(d->old_domain, d->old_domain, cpu_online_mask);
215 d->move_in_progress = !cpumask_empty(d->old_domain); 215 d->move_in_progress = !cpumask_empty(d->old_domain);
216 d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0;
216 d->cfg.vector = vector; 217 d->cfg.vector = vector;
217 cpumask_copy(d->domain, vector_cpumask); 218 cpumask_copy(d->domain, vector_cpumask);
218success: 219success:
@@ -655,46 +656,97 @@ void irq_complete_move(struct irq_cfg *cfg)
655} 656}
656 657
657/* 658/*
658 * Called with @desc->lock held and interrupts disabled. 659 * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
659 */ 660 */
660void irq_force_complete_move(struct irq_desc *desc) 661void irq_force_complete_move(struct irq_desc *desc)
661{ 662{
662 struct irq_data *irqdata = irq_desc_get_irq_data(desc); 663 struct irq_data *irqdata = irq_desc_get_irq_data(desc);
663 struct apic_chip_data *data = apic_chip_data(irqdata); 664 struct apic_chip_data *data = apic_chip_data(irqdata);
664 struct irq_cfg *cfg = data ? &data->cfg : NULL; 665 struct irq_cfg *cfg = data ? &data->cfg : NULL;
666 unsigned int cpu;
665 667
666 if (!cfg) 668 if (!cfg)
667 return; 669 return;
668 670
669 __irq_complete_move(cfg, cfg->vector);
670
671 /* 671 /*
672 * This is tricky. If the cleanup of @data->old_domain has not been 672 * This is tricky. If the cleanup of @data->old_domain has not been
673 * done yet, then the following setaffinity call will fail with 673 * done yet, then the following setaffinity call will fail with
674 * -EBUSY. This can leave the interrupt in a stale state. 674 * -EBUSY. This can leave the interrupt in a stale state.
675 * 675 *
676 * The cleanup cannot make progress because we hold @desc->lock. So in 676 * All CPUs are stuck in stop machine with interrupts disabled so
677 * case @data->old_domain is not yet cleaned up, we need to drop the 677 * calling __irq_complete_move() would be completely pointless.
678 * lock and acquire it again. @desc cannot go away, because the
679 * hotplug code holds the sparse irq lock.
680 */ 678 */
681 raw_spin_lock(&vector_lock); 679 raw_spin_lock(&vector_lock);
682 /* Clean out all offline cpus (including ourself) first. */ 680 /*
681 * Clean out all offline cpus (including the outgoing one) from the
682 * old_domain mask.
683 */
683 cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); 684 cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
684 while (!cpumask_empty(data->old_domain)) { 685
686 /*
687 * If move_in_progress is cleared and the old_domain mask is empty,
688 * then there is nothing to cleanup. fixup_irqs() will take care of
689 * the stale vectors on the outgoing cpu.
690 */
691 if (!data->move_in_progress && cpumask_empty(data->old_domain)) {
685 raw_spin_unlock(&vector_lock); 692 raw_spin_unlock(&vector_lock);
686 raw_spin_unlock(&desc->lock); 693 return;
687 cpu_relax(); 694 }
688 raw_spin_lock(&desc->lock); 695
696 /*
697 * 1) The interrupt is in move_in_progress state. That means that we
698 * have not seen an interrupt since the io_apic was reprogrammed to
699 * the new vector.
700 *
701 * 2) The interrupt has fired on the new vector, but the cleanup IPIs
702 * have not been processed yet.
703 */
704 if (data->move_in_progress) {
689 /* 705 /*
690 * Reevaluate apic_chip_data. It might have been cleared after 706 * In theory there is a race:
691 * we dropped @desc->lock. 707 *
708 * set_ioapic(new_vector) <-- Interrupt is raised before update
709 * is effective, i.e. it's raised on
710 * the old vector.
711 *
712 * So if the target cpu cannot handle that interrupt before
713 * the old vector is cleaned up, we get a spurious interrupt
714 * and in the worst case the ioapic irq line becomes stale.
715 *
716 * But in case of cpu hotplug this should be a non issue
717 * because if the affinity update happens right before all
718 * cpus rendevouz in stop machine, there is no way that the
719 * interrupt can be blocked on the target cpu because all cpus
720 * loops first with interrupts enabled in stop machine, so the
721 * old vector is not yet cleaned up when the interrupt fires.
722 *
723 * So the only way to run into this issue is if the delivery
724 * of the interrupt on the apic/system bus would be delayed
725 * beyond the point where the target cpu disables interrupts
726 * in stop machine. I doubt that it can happen, but at least
727 * there is a theroretical chance. Virtualization might be
728 * able to expose this, but AFAICT the IOAPIC emulation is not
729 * as stupid as the real hardware.
730 *
731 * Anyway, there is nothing we can do about that at this point
732 * w/o refactoring the whole fixup_irq() business completely.
733 * We print at least the irq number and the old vector number,
734 * so we have the necessary information when a problem in that
735 * area arises.
692 */ 736 */
693 data = apic_chip_data(irqdata); 737 pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
694 if (!data) 738 irqdata->irq, cfg->old_vector);
695 return;
696 raw_spin_lock(&vector_lock);
697 } 739 }
740 /*
741 * If old_domain is not empty, then other cpus still have the irq
742 * descriptor set in their vector array. Clean it up.
743 */
744 for_each_cpu(cpu, data->old_domain)
745 per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED;
746
747 /* Cleanup the left overs of the (half finished) move */
748 cpumask_clear(data->old_domain);
749 data->move_in_progress = 0;
698 raw_spin_unlock(&vector_lock); 750 raw_spin_unlock(&vector_lock);
699} 751}
700#endif 752#endif