1 files changed, 85 insertions, 13 deletions
diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c
index 0060934dffd2..e7522f6da69d 100644
--- a/arch/ppc64/kernel/eeh.c
+++ b/arch/ppc64/kernel/eeh.c
@@ -96,6 +96,9 @@ static int ibm_slot_error_detail;
 static int eeh_subsystem_enabled;
+/* Lock to avoid races due to multiple reports of an error */
+static DEFINE_SPINLOCK(confirm_error_lock);
 /* Buffer for reporting slot-error-detail rtas calls */
 static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
 static DEFINE_SPINLOCK(slot_errbuf_lock);
@@ -544,6 +547,55 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
        return pa | (token & (PAGE_SIZE-1));
 }
+/**
+ * find_device_pe - locate the "partitionable endpoint" (pe) above a node
+ * @dn: device node to start from
+ *
+ * Climbs parent links for as long as the parent exists, has PCI data,
+ * and that data has EEH_MODE_SUPPORTED set.  The last node reached is
+ * returned; this is @dn itself when its parent does not qualify.
+ */
+static struct device_node * find_device_pe(struct device_node *dn)
+{
+        for (;;) {
+                struct device_node *up = dn->parent;
+
+                /* Stop at the tree root or at a parent without PCI data */
+                if (!up || !PCI_DN(up))
+                        break;
+                /* Stop once the parent is no longer EEH-capable */
+                if (!(PCI_DN(up)->eeh_mode & EEH_MODE_SUPPORTED))
+                        break;
+                dn = up;
+        }
+        return dn;
+}
+/**
+ * __eeh_mark_slot - flag a device-node subtree as isolated
+ * @dn: first node to mark; may be NULL, in which case nothing is done
+ *
+ * Sets EEH_MODE_ISOLATED on @dn, on every following sibling of @dn,
+ * and recursively on all of their children, so that all peers of a
+ * failed device are seen as failed immediately.  This is critical,
+ * since some drivers poll status registers in interrupts: if a driver
+ * is polling and the slot is frozen, the driver can deadlock in
+ * interrupt context.
+ *
+ * Nodes that carry no PCI data are skipped rather than dereferenced;
+ * their children are still visited.
+ *
+ * No locking is done here; the caller is expected to serialize
+ * (the call site in this file holds confirm_error_lock).
+ *
+ * NOTE(review): only the device nodes are flagged here; no driver
+ * structure is touched.  Also, because the walk follows ->sibling from
+ * the starting node, passing a pe root marks that root's siblings and
+ * their subtrees as well -- confirm this is intended.
+ */
+static inline void __eeh_mark_slot (struct device_node *dn)
+{
+        while (dn) {
+                /* Not every device node has PCI data attached */
+                if (PCI_DN(dn))
+                        PCI_DN(dn)->eeh_mode |= EEH_MODE_ISOLATED;
+                if (dn->child)
+                        __eeh_mark_slot (dn->child);
+                dn = dn->sibling;
+        }
+}
+/**
+ * __eeh_clear_slot - remove the isolated flag from a device-node subtree
+ * @dn: first node to clear; may be NULL, in which case nothing is done
+ *
+ * Clears EEH_MODE_ISOLATED on @dn, on every following sibling of @dn,
+ * and recursively on all of their children.  Nodes that carry no PCI
+ * data are skipped rather than dereferenced; their children are still
+ * visited.
+ *
+ * No locking is done here; the caller is expected to serialize.
+ */
+static inline void __eeh_clear_slot (struct device_node *dn)
+{
+        while (dn) {
+                /* Not every device node has PCI data attached */
+                if (PCI_DN(dn))
+                        PCI_DN(dn)->eeh_mode &= ~EEH_MODE_ISOLATED;
+                if (dn->child)
+                        __eeh_clear_slot (dn->child);
+                dn = dn->sibling;
+        }
+}
+/**
+ * eeh_clear_slot - locked wrapper around __eeh_clear_slot()
+ * @dn: first node of the subtree to clear
+ *
+ * Takes confirm_error_lock (saving and disabling local interrupts),
+ * clears the isolated flag through __eeh_clear_slot(), then drops
+ * the lock and restores the saved interrupt state.
+ */
+static inline void eeh_clear_slot (struct device_node *dn)
+{
+        unsigned long irq_state;
+
+        spin_lock_irqsave(&confirm_error_lock, irq_state);
+        __eeh_clear_slot (dn);
+        spin_unlock_irqrestore(&confirm_error_lock, irq_state);
+}
 /**
 * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
 * @dn device node
@@ -567,6 +619,8 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
        int reset_state;
        struct eeh_event  *event;
        struct pci_dn *pdn;
+        struct device_node *pe_dn;
+        int rc = 0;
        __get_cpu_var(total_mmio_ffs)++;
@@ -594,10 +648,14 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
                return 0;
        }
-        /*
+        /* If we already have a pending isolation event for this
-         * If we already have a pending isolation event for this
+         * slot, we know it's bad already, we don't need to check.
-         * slot, we know it's bad already, we don't need to check...
+         * Do this checking under a lock; as multiple PCI devices
+         * in one slot might report errors simultaneously, and we
+         * only want one error recovery routine running.
         */
+        spin_lock_irqsave(&confirm_error_lock, flags);
+        rc = 1;
        if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
                atomic_inc(&eeh_fail_count);
                if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {
@@ -606,7 +664,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
                                rets[0] = -1;   /* reset state unknown */
                        eeh_panic(dev, rets[0]);
                }
-                return 0;
+                goto dn_unlock;
        }
        /*
@@ -623,7 +681,8 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
                printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",
                       ret, dn->full_name);
                __get_cpu_var(false_positives)++;
-                return 0;
+                rc = 0;
+                goto dn_unlock;
        }
        /* If EEH is not supported on this device, punt. */
@@ -631,25 +690,33 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
                printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",
                       ret, dn->full_name);
                __get_cpu_var(false_positives)++;
-                return 0;
+                rc = 0;
+                goto dn_unlock;
        }
        /* If not the kind of error we know about, punt. */
        if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {
                __get_cpu_var(false_positives)++;
-                return 0;
+                rc = 0;
+                goto dn_unlock;
        }
        /* Note that config-io to empty slots may fail;
         * we recognize empty because they don't have children. */
        if ((rets[0] == 5) && (dn->child == NULL)) {
                __get_cpu_var(false_positives)++;
-                return 0;
+                rc = 0;
+                goto dn_unlock;
        }
-        /* prevent repeated reports of this failure */
+        __get_cpu_var(slot_resets)++;
-        pdn->eeh_mode |= EEH_MODE_ISOLATED;
+ 
-         __get_cpu_var(slot_resets)++;
+        /* Avoid repeated reports of this failure, including problems
+         * with other functions on this device, and functions under
+         * bridges. */
+        pe_dn = find_device_pe (dn);
+        __eeh_mark_slot (pe_dn);
+        spin_unlock_irqrestore(&confirm_error_lock, flags);
        reset_state = rets[0];
@@ -678,10 +745,14 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
        if (rets[0] != 5) dump_stack();
        schedule_work(&eeh_event_wq);
-        return 0;
+        return 1;
+dn_unlock:
+        spin_unlock_irqrestore(&confirm_error_lock, flags);
+        return rc;
 }
-EXPORT_SYMBOL(eeh_dn_check_failure);
+EXPORT_SYMBOL_GPL(eeh_dn_check_failure);
 /**
 * eeh_check_failure - check if all 1's data is due to EEH slot freeze
@@ -820,6 +891,7 @@ void __init eeh_init(void)
        struct device_node *phb, *np;
        struct eeh_early_enable_info info;
+        spin_lock_init(&confirm_error_lock);
        spin_lock_init(&slot_errbuf_lock);
        np = of_find_node_by_path("/rtas");

diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c index 0060934dffd2..e7522f6da69d 100644 --- a/arch/ppc64/kernel/eeh.c +++ b/arch/ppc64/kernel/eeh.c
@@ -96,6 +96,9 @@ static int ibm_slot_error_detail;
96		96
97	static int eeh_subsystem_enabled;	97	static int eeh_subsystem_enabled;
98		98
		99	/* Lock to avoid races due to multiple reports of an error */
		100	static DEFINE_SPINLOCK(confirm_error_lock);
		101
99	/* Buffer for reporting slot-error-detail rtas calls */	102	/* Buffer for reporting slot-error-detail rtas calls */
100	static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];	103	static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
101	static DEFINE_SPINLOCK(slot_errbuf_lock);	104	static DEFINE_SPINLOCK(slot_errbuf_lock);
@@ -544,6 +547,55 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
544	return pa \| (token & (PAGE_SIZE-1));	547	return pa \| (token & (PAGE_SIZE-1));
545	}	548	}
546		549
		550	/**
		551	* Return the "partitionable endpoint" (pe) under which this device lies
		552	*/
		553	static struct device_node * find_device_pe(struct device_node *dn)
		554	{
		555	while ((dn->parent) && PCI_DN(dn->parent) &&
		556	(PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
		557	dn = dn->parent;
		558	}
		559	return dn;
		560	}
		561
		562	/** Mark all devices that are peers of this device as failed.
		563	* Mark the device driver too, so that it can see the failure
		564	* immediately; this is critical, since some drivers poll
		565	* status registers in interrupts ... If a driver is polling,
		566	* and the slot is frozen, then the driver can deadlock in
		567	* an interrupt context, which is bad.
		568	*/
		569
		570	static inline void __eeh_mark_slot (struct device_node *dn)
		571	{
		572	while (dn) {
		573	PCI_DN(dn)->eeh_mode \|= EEH_MODE_ISOLATED;
		574
		575	if (dn->child)
		576	__eeh_mark_slot (dn->child);
		577	dn = dn->sibling;
		578	}
		579	}
		580
		581	static inline void __eeh_clear_slot (struct device_node *dn)
		582	{
		583	while (dn) {
		584	PCI_DN(dn)->eeh_mode &= ~EEH_MODE_ISOLATED;
		585	if (dn->child)
		586	__eeh_clear_slot (dn->child);
		587	dn = dn->sibling;
		588	}
		589	}
		590
		591	static inline void eeh_clear_slot (struct device_node *dn)
		592	{
		593	unsigned long flags;
		594	spin_lock_irqsave(&confirm_error_lock, flags);
		595	__eeh_clear_slot (dn);
		596	spin_unlock_irqrestore(&confirm_error_lock, flags);
		597	}
		598
547	/**	599	/**
548	* eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze	600	* eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
549	* @dn device node	601	* @dn device node
@@ -567,6 +619,8 @@ int eeh_dn_check_failure(struct device_node dn, struct pci_dev dev)
567	int reset_state;	619	int reset_state;
568	struct eeh_event *event;	620	struct eeh_event *event;
569	struct pci_dn *pdn;	621	struct pci_dn *pdn;
		622	struct device_node *pe_dn;
		623	int rc = 0;
570		624
571	__get_cpu_var(total_mmio_ffs)++;	625	__get_cpu_var(total_mmio_ffs)++;
572		626
@@ -594,10 +648,14 @@ int eeh_dn_check_failure(struct device_node dn, struct pci_dev dev)
594	return 0;	648	return 0;
595	}	649	}
596		650
597	/*	651	/* If we already have a pending isolation event for this
598	* If we already have a pending isolation event for this	652	* slot, we know it's bad already, we don't need to check.
599	* slot, we know it's bad already, we don't need to check...	653	* Do this checking under a lock; as multiple PCI devices
		654	* in one slot might report errors simultaneously, and we
		655	* only want one error recovery routine running.
600	*/	656	*/
		657	spin_lock_irqsave(&confirm_error_lock, flags);
		658	rc = 1;
601	if (pdn->eeh_mode & EEH_MODE_ISOLATED) {	659	if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
602	atomic_inc(&eeh_fail_count);	660	atomic_inc(&eeh_fail_count);
603	if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {	661	if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {
@@ -606,7 +664,7 @@ int eeh_dn_check_failure(struct device_node dn, struct pci_dev dev)
606	rets[0] = -1; /* reset state unknown */	664	rets[0] = -1; /* reset state unknown */
607	eeh_panic(dev, rets[0]);	665	eeh_panic(dev, rets[0]);
608	}	666	}
609	return 0;	667	goto dn_unlock;
610	}	668	}
611		669
612	/*	670	/*
@@ -623,7 +681,8 @@ int eeh_dn_check_failure(struct device_node dn, struct pci_dev dev)
623	printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",	681	printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",
624	ret, dn->full_name);	682	ret, dn->full_name);
625	__get_cpu_var(false_positives)++;	683	__get_cpu_var(false_positives)++;
626	return 0;	684	rc = 0;
		685	goto dn_unlock;
627	}	686	}
628		687
629	/* If EEH is not supported on this device, punt. */	688	/* If EEH is not supported on this device, punt. */
@@ -631,25 +690,33 @@ int eeh_dn_check_failure(struct device_node dn, struct pci_dev dev)
631	printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",	690	printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",
632	ret, dn->full_name);	691	ret, dn->full_name);
633	__get_cpu_var(false_positives)++;	692	__get_cpu_var(false_positives)++;
634	return 0;	693	rc = 0;
		694	goto dn_unlock;
635	}	695	}
636		696
637	/* If not the kind of error we know about, punt. */	697	/* If not the kind of error we know about, punt. */
638	if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {	698	if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {
639	__get_cpu_var(false_positives)++;	699	__get_cpu_var(false_positives)++;
640	return 0;	700	rc = 0;
		701	goto dn_unlock;
641	}	702	}
642		703
643	/* Note that config-io to empty slots may fail;	704	/* Note that config-io to empty slots may fail;
644	* we recognize empty because they don't have children. */	705	* we recognize empty because they don't have children. */
645	if ((rets[0] == 5) && (dn->child == NULL)) {	706	if ((rets[0] == 5) && (dn->child == NULL)) {
646	__get_cpu_var(false_positives)++;	707	__get_cpu_var(false_positives)++;
647	return 0;	708	rc = 0;
		709	goto dn_unlock;
648	}	710	}
649		711
650	/* prevent repeated reports of this failure */	712	__get_cpu_var(slot_resets)++;
651	pdn->eeh_mode \|= EEH_MODE_ISOLATED;	713
652	__get_cpu_var(slot_resets)++;	714	/* Avoid repeated reports of this failure, including problems
		715	* with other functions on this device, and functions under
		716	* bridges. */
		717	pe_dn = find_device_pe (dn);
		718	__eeh_mark_slot (pe_dn);
		719	spin_unlock_irqrestore(&confirm_error_lock, flags);
653		720
654	reset_state = rets[0];	721	reset_state = rets[0];
655		722
@@ -678,10 +745,14 @@ int eeh_dn_check_failure(struct device_node dn, struct pci_dev dev)
678	if (rets[0] != 5) dump_stack();	745	if (rets[0] != 5) dump_stack();
679	schedule_work(&eeh_event_wq);	746	schedule_work(&eeh_event_wq);
680		747
681	return 0;	748	return 1;
		749
		750	dn_unlock:
		751	spin_unlock_irqrestore(&confirm_error_lock, flags);
		752	return rc;
682	}	753	}
683		754
684	EXPORT_SYMBOL(eeh_dn_check_failure);	755	EXPORT_SYMBOL_GPL(eeh_dn_check_failure);
685		756
686	/**	757	/**
687	* eeh_check_failure - check if all 1's data is due to EEH slot freeze	758	* eeh_check_failure - check if all 1's data is due to EEH slot freeze
@@ -820,6 +891,7 @@ void __init eeh_init(void)
820	struct device_node phb, np;	891	struct device_node phb, np;
821	struct eeh_early_enable_info info;	892	struct eeh_early_enable_info info;
822		893
		894	spin_lock_init(&confirm_error_lock);
823	spin_lock_init(&slot_errbuf_lock);	895	spin_lock_init(&slot_errbuf_lock);
824		896
825	np = of_find_node_by_path("/rtas");	897	np = of_find_node_by_path("/rtas");