diff options
-rw-r--r-- | arch/ppc64/kernel/eeh.c | 98 |
1 files changed, 85 insertions, 13 deletions
diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c index 0060934dffd2..e7522f6da69d 100644 --- a/arch/ppc64/kernel/eeh.c +++ b/arch/ppc64/kernel/eeh.c | |||
@@ -96,6 +96,9 @@ static int ibm_slot_error_detail; | |||
96 | 96 | ||
97 | static int eeh_subsystem_enabled; | 97 | static int eeh_subsystem_enabled; |
98 | 98 | ||
99 | /* Lock to avoid races due to multiple reports of an error */ | ||
100 | static DEFINE_SPINLOCK(confirm_error_lock); | ||
101 | |||
99 | /* Buffer for reporting slot-error-detail rtas calls */ | 102 | /* Buffer for reporting slot-error-detail rtas calls */ |
100 | static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; | 103 | static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; |
101 | static DEFINE_SPINLOCK(slot_errbuf_lock); | 104 | static DEFINE_SPINLOCK(slot_errbuf_lock); |
@@ -544,6 +547,55 @@ static inline unsigned long eeh_token_to_phys(unsigned long token) | |||
544 | return pa | (token & (PAGE_SIZE-1)); | 547 | return pa | (token & (PAGE_SIZE-1)); |
545 | } | 548 | } |
546 | 549 | ||
550 | /** | ||
551 | * Return the "partitionable endpoint" (pe) under which this device lies | ||
552 | */ | ||
553 | static struct device_node * find_device_pe(struct device_node *dn) | ||
554 | { | ||
555 | while ((dn->parent) && PCI_DN(dn->parent) && | ||
556 | (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) { | ||
557 | dn = dn->parent; | ||
558 | } | ||
559 | return dn; | ||
560 | } | ||
561 | |||
562 | /** Mark all devices that are peers of this device as failed. | ||
563 | * Mark the device driver too, so that it can see the failure | ||
564 | * immediately; this is critical, since some drivers poll | ||
565 | * status registers in interrupts ... If a driver is polling, | ||
566 | * and the slot is frozen, then the driver can deadlock in | ||
567 | * an interrupt context, which is bad. | ||
568 | */ | ||
569 | |||
570 | static inline void __eeh_mark_slot (struct device_node *dn) | ||
571 | { | ||
572 | while (dn) { | ||
573 | PCI_DN(dn)->eeh_mode |= EEH_MODE_ISOLATED; | ||
574 | |||
575 | if (dn->child) | ||
576 | __eeh_mark_slot (dn->child); | ||
577 | dn = dn->sibling; | ||
578 | } | ||
579 | } | ||
580 | |||
581 | static inline void __eeh_clear_slot (struct device_node *dn) | ||
582 | { | ||
583 | while (dn) { | ||
584 | PCI_DN(dn)->eeh_mode &= ~EEH_MODE_ISOLATED; | ||
585 | if (dn->child) | ||
586 | __eeh_clear_slot (dn->child); | ||
587 | dn = dn->sibling; | ||
588 | } | ||
589 | } | ||
590 | |||
591 | static inline void eeh_clear_slot (struct device_node *dn) | ||
592 | { | ||
593 | unsigned long flags; | ||
594 | spin_lock_irqsave(&confirm_error_lock, flags); | ||
595 | __eeh_clear_slot (dn); | ||
596 | spin_unlock_irqrestore(&confirm_error_lock, flags); | ||
597 | } | ||
598 | |||
547 | /** | 599 | /** |
548 | * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze | 600 | * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze |
549 | * @dn device node | 601 | * @dn device node |
@@ -567,6 +619,8 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) | |||
567 | int reset_state; | 619 | int reset_state; |
568 | struct eeh_event *event; | 620 | struct eeh_event *event; |
569 | struct pci_dn *pdn; | 621 | struct pci_dn *pdn; |
622 | struct device_node *pe_dn; | ||
623 | int rc = 0; | ||
570 | 624 | ||
571 | __get_cpu_var(total_mmio_ffs)++; | 625 | __get_cpu_var(total_mmio_ffs)++; |
572 | 626 | ||
@@ -594,10 +648,14 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) | |||
594 | return 0; | 648 | return 0; |
595 | } | 649 | } |
596 | 650 | ||
597 | /* | 651 | /* If we already have a pending isolation event for this |
598 | * If we already have a pending isolation event for this | 652 | * slot, we know it's bad already, we don't need to check. |
599 | * slot, we know it's bad already, we don't need to check... | 653 | * Do this checking under a lock; as multiple PCI devices |
654 | * in one slot might report errors simultaneously, and we | ||
655 | * only want one error recovery routine running. | ||
600 | */ | 656 | */ |
657 | spin_lock_irqsave(&confirm_error_lock, flags); | ||
658 | rc = 1; | ||
601 | if (pdn->eeh_mode & EEH_MODE_ISOLATED) { | 659 | if (pdn->eeh_mode & EEH_MODE_ISOLATED) { |
602 | atomic_inc(&eeh_fail_count); | 660 | atomic_inc(&eeh_fail_count); |
603 | if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) { | 661 | if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) { |
@@ -606,7 +664,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) | |||
606 | rets[0] = -1; /* reset state unknown */ | 664 | rets[0] = -1; /* reset state unknown */ |
607 | eeh_panic(dev, rets[0]); | 665 | eeh_panic(dev, rets[0]); |
608 | } | 666 | } |
609 | return 0; | 667 | goto dn_unlock; |
610 | } | 668 | } |
611 | 669 | ||
612 | /* | 670 | /* |
@@ -623,7 +681,8 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) | |||
623 | printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n", | 681 | printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n", |
624 | ret, dn->full_name); | 682 | ret, dn->full_name); |
625 | __get_cpu_var(false_positives)++; | 683 | __get_cpu_var(false_positives)++; |
626 | return 0; | 684 | rc = 0; |
685 | goto dn_unlock; | ||
627 | } | 686 | } |
628 | 687 | ||
629 | /* If EEH is not supported on this device, punt. */ | 688 | /* If EEH is not supported on this device, punt. */ |
@@ -631,25 +690,33 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) | |||
631 | printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n", | 690 | printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n", |
632 | ret, dn->full_name); | 691 | ret, dn->full_name); |
633 | __get_cpu_var(false_positives)++; | 692 | __get_cpu_var(false_positives)++; |
634 | return 0; | 693 | rc = 0; |
694 | goto dn_unlock; | ||
635 | } | 695 | } |
636 | 696 | ||
637 | /* If not the kind of error we know about, punt. */ | 697 | /* If not the kind of error we know about, punt. */ |
638 | if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) { | 698 | if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) { |
639 | __get_cpu_var(false_positives)++; | 699 | __get_cpu_var(false_positives)++; |
640 | return 0; | 700 | rc = 0; |
701 | goto dn_unlock; | ||
641 | } | 702 | } |
642 | 703 | ||
643 | /* Note that config-io to empty slots may fail; | 704 | /* Note that config-io to empty slots may fail; |
644 | * we recognize empty because they don't have children. */ | 705 | * we recognize empty because they don't have children. */ |
645 | if ((rets[0] == 5) && (dn->child == NULL)) { | 706 | if ((rets[0] == 5) && (dn->child == NULL)) { |
646 | __get_cpu_var(false_positives)++; | 707 | __get_cpu_var(false_positives)++; |
647 | return 0; | 708 | rc = 0; |
709 | goto dn_unlock; | ||
648 | } | 710 | } |
649 | 711 | ||
650 | /* prevent repeated reports of this failure */ | 712 | __get_cpu_var(slot_resets)++; |
651 | pdn->eeh_mode |= EEH_MODE_ISOLATED; | 713 | |
652 | __get_cpu_var(slot_resets)++; | 714 | /* Avoid repeated reports of this failure, including problems |
715 | * with other functions on this device, and functions under | ||
716 | * bridges. */ | ||
717 | pe_dn = find_device_pe (dn); | ||
718 | __eeh_mark_slot (pe_dn); | ||
719 | spin_unlock_irqrestore(&confirm_error_lock, flags); | ||
653 | 720 | ||
654 | reset_state = rets[0]; | 721 | reset_state = rets[0]; |
655 | 722 | ||
@@ -678,10 +745,14 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) | |||
678 | if (rets[0] != 5) dump_stack(); | 745 | if (rets[0] != 5) dump_stack(); |
679 | schedule_work(&eeh_event_wq); | 746 | schedule_work(&eeh_event_wq); |
680 | 747 | ||
681 | return 0; | 748 | return 1; |
749 | |||
750 | dn_unlock: | ||
751 | spin_unlock_irqrestore(&confirm_error_lock, flags); | ||
752 | return rc; | ||
682 | } | 753 | } |
683 | 754 | ||
684 | EXPORT_SYMBOL(eeh_dn_check_failure); | 755 | EXPORT_SYMBOL_GPL(eeh_dn_check_failure); |
685 | 756 | ||
686 | /** | 757 | /** |
687 | * eeh_check_failure - check if all 1's data is due to EEH slot freeze | 758 | * eeh_check_failure - check if all 1's data is due to EEH slot freeze |
@@ -820,6 +891,7 @@ void __init eeh_init(void) | |||
820 | struct device_node *phb, *np; | 891 | struct device_node *phb, *np; |
821 | struct eeh_early_enable_info info; | 892 | struct eeh_early_enable_info info; |
822 | 893 | ||
894 | spin_lock_init(&confirm_error_lock); | ||
823 | spin_lock_init(&slot_errbuf_lock); | 895 | spin_lock_init(&slot_errbuf_lock); |
824 | 896 | ||
825 | np = of_find_node_by_path("/rtas"); | 897 | np = of_find_node_by_path("/rtas"); |