aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinas Vepstas <linas@linas.org>2005-11-03 19:49:23 -0500
committerPaul Mackerras <paulus@samba.org>2005-11-09 19:33:32 -0500
commitfd761fd876e4d1c0d07b6d93bc45c999fa596cb0 (patch)
tree9b117c197cd011e66cd623b54bee8586426fdabf
parent76e6faf7a3a3ad3e18a1b70f9e4cd96cdf58140d (diff)
[PATCH] ppc64: serialize reports of PCI errors
07-eeh-report-race.patch When a PCI slot is isolated, all PCI functions under that slot are affected. If these functions have separate device drivers, the EEH isolation event might be reported multiple times. This patch adds a lock to prevent the racing of such multiple reports. It also marks every device under the slot as having experienced an EEH event, so that multiple reports may be recognized more easily. Signed-off-by: Linas Vepstas <linas@linas.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
-rw-r--r--arch/ppc64/kernel/eeh.c98
1 files changed, 85 insertions, 13 deletions
diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c
index 0060934dffd2..e7522f6da69d 100644
--- a/arch/ppc64/kernel/eeh.c
+++ b/arch/ppc64/kernel/eeh.c
@@ -96,6 +96,9 @@ static int ibm_slot_error_detail;
96 96
97static int eeh_subsystem_enabled; 97static int eeh_subsystem_enabled;
98 98
99/* Lock to avoid races due to multiple reports of an error */
100static DEFINE_SPINLOCK(confirm_error_lock);
101
99/* Buffer for reporting slot-error-detail rtas calls */ 102/* Buffer for reporting slot-error-detail rtas calls */
100static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; 103static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
101static DEFINE_SPINLOCK(slot_errbuf_lock); 104static DEFINE_SPINLOCK(slot_errbuf_lock);
@@ -544,6 +547,55 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
544 return pa | (token & (PAGE_SIZE-1)); 547 return pa | (token & (PAGE_SIZE-1));
545} 548}
546 549
550/**
551 * Return the "partitionable endpoint" (pe) under which this device lies
552 */
553static struct device_node * find_device_pe(struct device_node *dn)
554{
555 while ((dn->parent) && PCI_DN(dn->parent) &&
556 (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
557 dn = dn->parent;
558 }
559 return dn;
560}
561
562/** Mark all devices that are peers of this device as failed.
563 * Mark the device driver too, so that it can see the failure
564 * immediately; this is critical, since some drivers poll
565 * status registers in interrupts ... If a driver is polling,
566 * and the slot is frozen, then the driver can deadlock in
567 * an interrupt context, which is bad.
568 */
569
570static inline void __eeh_mark_slot (struct device_node *dn)
571{
572 while (dn) {
573 PCI_DN(dn)->eeh_mode |= EEH_MODE_ISOLATED;
574
575 if (dn->child)
576 __eeh_mark_slot (dn->child);
577 dn = dn->sibling;
578 }
579}
580
581static inline void __eeh_clear_slot (struct device_node *dn)
582{
583 while (dn) {
584 PCI_DN(dn)->eeh_mode &= ~EEH_MODE_ISOLATED;
585 if (dn->child)
586 __eeh_clear_slot (dn->child);
587 dn = dn->sibling;
588 }
589}
590
591static inline void eeh_clear_slot (struct device_node *dn)
592{
593 unsigned long flags;
594 spin_lock_irqsave(&confirm_error_lock, flags);
595 __eeh_clear_slot (dn);
596 spin_unlock_irqrestore(&confirm_error_lock, flags);
597}
598
547/** 599/**
548 * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze 600 * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
549 * @dn device node 601 * @dn device node
@@ -567,6 +619,8 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
567 int reset_state; 619 int reset_state;
568 struct eeh_event *event; 620 struct eeh_event *event;
569 struct pci_dn *pdn; 621 struct pci_dn *pdn;
622 struct device_node *pe_dn;
623 int rc = 0;
570 624
571 __get_cpu_var(total_mmio_ffs)++; 625 __get_cpu_var(total_mmio_ffs)++;
572 626
@@ -594,10 +648,14 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
594 return 0; 648 return 0;
595 } 649 }
596 650
597 /* 651 /* If we already have a pending isolation event for this
598 * If we already have a pending isolation event for this 652 * slot, we know it's bad already, we don't need to check.
599 * slot, we know it's bad already, we don't need to check... 653 * Do this checking under a lock; as multiple PCI devices
654 * in one slot might report errors simultaneously, and we
655 * only want one error recovery routine running.
600 */ 656 */
657 spin_lock_irqsave(&confirm_error_lock, flags);
658 rc = 1;
601 if (pdn->eeh_mode & EEH_MODE_ISOLATED) { 659 if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
602 atomic_inc(&eeh_fail_count); 660 atomic_inc(&eeh_fail_count);
603 if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) { 661 if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {
@@ -606,7 +664,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
606 rets[0] = -1; /* reset state unknown */ 664 rets[0] = -1; /* reset state unknown */
607 eeh_panic(dev, rets[0]); 665 eeh_panic(dev, rets[0]);
608 } 666 }
609 return 0; 667 goto dn_unlock;
610 } 668 }
611 669
612 /* 670 /*
@@ -623,7 +681,8 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
623 printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n", 681 printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",
624 ret, dn->full_name); 682 ret, dn->full_name);
625 __get_cpu_var(false_positives)++; 683 __get_cpu_var(false_positives)++;
626 return 0; 684 rc = 0;
685 goto dn_unlock;
627 } 686 }
628 687
629 /* If EEH is not supported on this device, punt. */ 688 /* If EEH is not supported on this device, punt. */
@@ -631,25 +690,33 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
631 printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n", 690 printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",
632 ret, dn->full_name); 691 ret, dn->full_name);
633 __get_cpu_var(false_positives)++; 692 __get_cpu_var(false_positives)++;
634 return 0; 693 rc = 0;
694 goto dn_unlock;
635 } 695 }
636 696
637 /* If not the kind of error we know about, punt. */ 697 /* If not the kind of error we know about, punt. */
638 if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) { 698 if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {
639 __get_cpu_var(false_positives)++; 699 __get_cpu_var(false_positives)++;
640 return 0; 700 rc = 0;
701 goto dn_unlock;
641 } 702 }
642 703
643 /* Note that config-io to empty slots may fail; 704 /* Note that config-io to empty slots may fail;
644 * we recognize empty because they don't have children. */ 705 * we recognize empty because they don't have children. */
645 if ((rets[0] == 5) && (dn->child == NULL)) { 706 if ((rets[0] == 5) && (dn->child == NULL)) {
646 __get_cpu_var(false_positives)++; 707 __get_cpu_var(false_positives)++;
647 return 0; 708 rc = 0;
709 goto dn_unlock;
648 } 710 }
649 711
650 /* prevent repeated reports of this failure */ 712 __get_cpu_var(slot_resets)++;
651 pdn->eeh_mode |= EEH_MODE_ISOLATED; 713
652 __get_cpu_var(slot_resets)++; 714 /* Avoid repeated reports of this failure, including problems
715 * with other functions on this device, and functions under
716 * bridges. */
717 pe_dn = find_device_pe (dn);
718 __eeh_mark_slot (pe_dn);
719 spin_unlock_irqrestore(&confirm_error_lock, flags);
653 720
654 reset_state = rets[0]; 721 reset_state = rets[0];
655 722
@@ -678,10 +745,14 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
678 if (rets[0] != 5) dump_stack(); 745 if (rets[0] != 5) dump_stack();
679 schedule_work(&eeh_event_wq); 746 schedule_work(&eeh_event_wq);
680 747
681 return 0; 748 return 1;
749
750dn_unlock:
751 spin_unlock_irqrestore(&confirm_error_lock, flags);
752 return rc;
682} 753}
683 754
684EXPORT_SYMBOL(eeh_dn_check_failure); 755EXPORT_SYMBOL_GPL(eeh_dn_check_failure);
685 756
686/** 757/**
687 * eeh_check_failure - check if all 1's data is due to EEH slot freeze 758 * eeh_check_failure - check if all 1's data is due to EEH slot freeze
@@ -820,6 +891,7 @@ void __init eeh_init(void)
820 struct device_node *phb, *np; 891 struct device_node *phb, *np;
821 struct eeh_early_enable_info info; 892 struct eeh_early_enable_info info;
822 893
894 spin_lock_init(&confirm_error_lock);
823 spin_lock_init(&slot_errbuf_lock); 895 spin_lock_init(&slot_errbuf_lock);
824 896
825 np = of_find_node_by_path("/rtas"); 897 np = of_find_node_by_path("/rtas");