aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/ppc64/kernel/eeh.c98
1 files changed, 85 insertions, 13 deletions
diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c
index 0060934dffd2..e7522f6da69d 100644
--- a/arch/ppc64/kernel/eeh.c
+++ b/arch/ppc64/kernel/eeh.c
@@ -96,6 +96,9 @@ static int ibm_slot_error_detail;
96 96
97static int eeh_subsystem_enabled; 97static int eeh_subsystem_enabled;
98 98
99/* Lock to avoid races due to multiple reports of an error */
100static DEFINE_SPINLOCK(confirm_error_lock);
101
99/* Buffer for reporting slot-error-detail rtas calls */ 102/* Buffer for reporting slot-error-detail rtas calls */
100static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; 103static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
101static DEFINE_SPINLOCK(slot_errbuf_lock); 104static DEFINE_SPINLOCK(slot_errbuf_lock);
@@ -544,6 +547,55 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
544 return pa | (token & (PAGE_SIZE-1)); 547 return pa | (token & (PAGE_SIZE-1));
545} 548}
546 549
550/**
551 * Return the "partitionable endpoint" (pe) under which this device lies
552 */
553static struct device_node * find_device_pe(struct device_node *dn)
554{
555 while ((dn->parent) && PCI_DN(dn->parent) &&
556 (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
557 dn = dn->parent;
558 }
559 return dn;
560}
561
562/** Mark all devices that are peers of this device as failed.
563 * Mark the device driver too, so that it can see the failure
564 * immediately; this is critical, since some drivers poll
565 * status registers in interrupts ... If a driver is polling,
566 * and the slot is frozen, then the driver can deadlock in
567 * an interrupt context, which is bad.
568 */
569
570static inline void __eeh_mark_slot (struct device_node *dn)
571{
572 while (dn) {
573 PCI_DN(dn)->eeh_mode |= EEH_MODE_ISOLATED;
574
575 if (dn->child)
576 __eeh_mark_slot (dn->child);
577 dn = dn->sibling;
578 }
579}
580
581static inline void __eeh_clear_slot (struct device_node *dn)
582{
583 while (dn) {
584 PCI_DN(dn)->eeh_mode &= ~EEH_MODE_ISOLATED;
585 if (dn->child)
586 __eeh_clear_slot (dn->child);
587 dn = dn->sibling;
588 }
589}
590
591static inline void eeh_clear_slot (struct device_node *dn)
592{
593 unsigned long flags;
594 spin_lock_irqsave(&confirm_error_lock, flags);
595 __eeh_clear_slot (dn);
596 spin_unlock_irqrestore(&confirm_error_lock, flags);
597}
598
547/** 599/**
548 * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze 600 * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
549 * @dn device node 601 * @dn device node
@@ -567,6 +619,8 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
567 int reset_state; 619 int reset_state;
568 struct eeh_event *event; 620 struct eeh_event *event;
569 struct pci_dn *pdn; 621 struct pci_dn *pdn;
622 struct device_node *pe_dn;
623 int rc = 0;
570 624
571 __get_cpu_var(total_mmio_ffs)++; 625 __get_cpu_var(total_mmio_ffs)++;
572 626
@@ -594,10 +648,14 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
594 return 0; 648 return 0;
595 } 649 }
596 650
597 /* 651 /* If we already have a pending isolation event for this
598 * If we already have a pending isolation event for this 652 * slot, we know it's bad already, we don't need to check.
599 * slot, we know it's bad already, we don't need to check... 653 * Do this checking under a lock; as multiple PCI devices
654 * in one slot might report errors simultaneously, and we
655 * only want one error recovery routine running.
600 */ 656 */
657 spin_lock_irqsave(&confirm_error_lock, flags);
658 rc = 1;
601 if (pdn->eeh_mode & EEH_MODE_ISOLATED) { 659 if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
602 atomic_inc(&eeh_fail_count); 660 atomic_inc(&eeh_fail_count);
603 if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) { 661 if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {
@@ -606,7 +664,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
606 rets[0] = -1; /* reset state unknown */ 664 rets[0] = -1; /* reset state unknown */
607 eeh_panic(dev, rets[0]); 665 eeh_panic(dev, rets[0]);
608 } 666 }
609 return 0; 667 goto dn_unlock;
610 } 668 }
611 669
612 /* 670 /*
@@ -623,7 +681,8 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
623 printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n", 681 printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",
624 ret, dn->full_name); 682 ret, dn->full_name);
625 __get_cpu_var(false_positives)++; 683 __get_cpu_var(false_positives)++;
626 return 0; 684 rc = 0;
685 goto dn_unlock;
627 } 686 }
628 687
629 /* If EEH is not supported on this device, punt. */ 688 /* If EEH is not supported on this device, punt. */
@@ -631,25 +690,33 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
631 printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n", 690 printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",
632 ret, dn->full_name); 691 ret, dn->full_name);
633 __get_cpu_var(false_positives)++; 692 __get_cpu_var(false_positives)++;
634 return 0; 693 rc = 0;
694 goto dn_unlock;
635 } 695 }
636 696
637 /* If not the kind of error we know about, punt. */ 697 /* If not the kind of error we know about, punt. */
638 if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) { 698 if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {
639 __get_cpu_var(false_positives)++; 699 __get_cpu_var(false_positives)++;
640 return 0; 700 rc = 0;
701 goto dn_unlock;
641 } 702 }
642 703
643 /* Note that config-io to empty slots may fail; 704 /* Note that config-io to empty slots may fail;
644 * we recognize empty because they don't have children. */ 705 * we recognize empty because they don't have children. */
645 if ((rets[0] == 5) && (dn->child == NULL)) { 706 if ((rets[0] == 5) && (dn->child == NULL)) {
646 __get_cpu_var(false_positives)++; 707 __get_cpu_var(false_positives)++;
647 return 0; 708 rc = 0;
709 goto dn_unlock;
648 } 710 }
649 711
650 /* prevent repeated reports of this failure */ 712 __get_cpu_var(slot_resets)++;
651 pdn->eeh_mode |= EEH_MODE_ISOLATED; 713
652 __get_cpu_var(slot_resets)++; 714 /* Avoid repeated reports of this failure, including problems
715 * with other functions on this device, and functions under
716 * bridges. */
717 pe_dn = find_device_pe (dn);
718 __eeh_mark_slot (pe_dn);
719 spin_unlock_irqrestore(&confirm_error_lock, flags);
653 720
654 reset_state = rets[0]; 721 reset_state = rets[0];
655 722
@@ -678,10 +745,14 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
678 if (rets[0] != 5) dump_stack(); 745 if (rets[0] != 5) dump_stack();
679 schedule_work(&eeh_event_wq); 746 schedule_work(&eeh_event_wq);
680 747
681 return 0; 748 return 1;
749
750dn_unlock:
751 spin_unlock_irqrestore(&confirm_error_lock, flags);
752 return rc;
682} 753}
683 754
684EXPORT_SYMBOL(eeh_dn_check_failure); 755EXPORT_SYMBOL_GPL(eeh_dn_check_failure);
685 756
686/** 757/**
687 * eeh_check_failure - check if all 1's data is due to EEH slot freeze 758 * eeh_check_failure - check if all 1's data is due to EEH slot freeze
@@ -820,6 +891,7 @@ void __init eeh_init(void)
820 struct device_node *phb, *np; 891 struct device_node *phb, *np;
821 struct eeh_early_enable_info info; 892 struct eeh_early_enable_info info;
822 893
894 spin_lock_init(&confirm_error_lock);
823 spin_lock_init(&slot_errbuf_lock); 895 spin_lock_init(&slot_errbuf_lock);
824 896
825 np = of_find_node_by_path("/rtas"); 897 np = of_find_node_by_path("/rtas");