[PATCH] ppc64: serialize reports of PCI errors

07-eeh-report-race.patch When a PCI slot is isolated, all PCI functions under that slot are affected. If these functions have separate device drivers, the EEH isolation event might be reported multiple times. This patch adds a lock to prevent the racing of such multiple reports. It also marks every device under the slot as having experienced an EEH event, so that multiple reports may be recognized more easily. Signed-off-by: Linas Vepstas <linas@linas.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
author: Linas Vepstas <linas@linas.org> 2005-11-03 19:49:23 -0500
committer: Paul Mackerras <paulus@samba.org> 2005-11-09 19:33:32 -0500
commit: fd761fd876e4d1c0d07b6d93bc45c999fa596cb0 (patch)
tree: 9b117c197cd011e66cd623b54bee8586426fdabf /arch
parent: 76e6faf7a3a3ad3e18a1b70f9e4cd96cdf58140d (diff)
1 files changed, 85 insertions, 13 deletions
diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c
index 0060934dffd2..e7522f6da69d 100644
--- a/arch/ppc64/kernel/eeh.c
+++ b/arch/ppc64/kernel/eeh.c
@@ -96,6 +96,9 @@ static int ibm_slot_error_detail;
 static int eeh_subsystem_enabled;
+/* Lock to avoid races due to multiple reports of an error */
+static DEFINE_SPINLOCK(confirm_error_lock);
 /* Buffer for reporting slot-error-detail rtas calls */
 static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
 static DEFINE_SPINLOCK(slot_errbuf_lock);
@@ -544,6 +547,55 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
        return pa | (token & (PAGE_SIZE-1));
 }
+/** 
+ * Return the "partitionable endpoint" (pe) under which this device lies
+ */
+static struct device_node * find_device_pe(struct device_node *dn)
+{
+        while ((dn->parent) && PCI_DN(dn->parent) &&
+              (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
+                dn = dn->parent;
+        }
+        return dn;
+}
+/** Mark all devices that are peers of this device as failed.
+ *  Mark the device driver too, so that it can see the failure
+ *  immediately; this is critical, since some drivers poll
+ *  status registers in interrupts ... If a driver is polling,
+ *  and the slot is frozen, then the driver can deadlock in
+ *  an interrupt context, which is bad.
+ */
+static inline void __eeh_mark_slot (struct device_node *dn)
+{
+        while (dn) {
+                PCI_DN(dn)->eeh_mode |= EEH_MODE_ISOLATED;
+                if (dn->child)
+                        __eeh_mark_slot (dn->child);
+                dn = dn->sibling;
+        }
+}
+static inline void __eeh_clear_slot (struct device_node *dn)
+{
+        while (dn) {
+                PCI_DN(dn)->eeh_mode &= ~EEH_MODE_ISOLATED;
+                if (dn->child)
+                        __eeh_clear_slot (dn->child);
+                dn = dn->sibling;
+        }
+}
+static inline void eeh_clear_slot (struct device_node *dn)
+{
+        unsigned long flags;
+        spin_lock_irqsave(&confirm_error_lock, flags);
+        __eeh_clear_slot (dn);
+        spin_unlock_irqrestore(&confirm_error_lock, flags);
+}
 /**
 * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
 * @dn device node
@@ -567,6 +619,8 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
        int reset_state;
        struct eeh_event  *event;
        struct pci_dn *pdn;
+        struct device_node *pe_dn;
+        int rc = 0;
        __get_cpu_var(total_mmio_ffs)++;
@@ -594,10 +648,14 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
                return 0;
        }
-        /*
+        /* If we already have a pending isolation event for this
-         * If we already have a pending isolation event for this
+         * slot, we know it's bad already, we don't need to check.
-         * slot, we know it's bad already, we don't need to check...
+         * Do this checking under a lock; as multiple PCI devices
+         * in one slot might report errors simultaneously, and we
+         * only want one error recovery routine running.
         */
+        spin_lock_irqsave(&confirm_error_lock, flags);
+        rc = 1;
        if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
                atomic_inc(&eeh_fail_count);
                if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {
@@ -606,7 +664,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
                                rets[0] = -1;   /* reset state unknown */
                        eeh_panic(dev, rets[0]);
                }
-                return 0;
+                goto dn_unlock;
        }
        /*
@@ -623,7 +681,8 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
                printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",
                       ret, dn->full_name);
                __get_cpu_var(false_positives)++;
-                return 0;
+                rc = 0;
+                goto dn_unlock;
        }
        /* If EEH is not supported on this device, punt. */
@@ -631,25 +690,33 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
                printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",
                       ret, dn->full_name);
                __get_cpu_var(false_positives)++;
-                return 0;
+                rc = 0;
+                goto dn_unlock;
        }
        /* If not the kind of error we know about, punt. */
        if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {
                __get_cpu_var(false_positives)++;
-                return 0;
+                rc = 0;
+                goto dn_unlock;
        }
        /* Note that config-io to empty slots may fail;
         * we recognize empty because they don't have children. */
        if ((rets[0] == 5) && (dn->child == NULL)) {
                __get_cpu_var(false_positives)++;
-                return 0;
+                rc = 0;
+                goto dn_unlock;
        }
-        /* prevent repeated reports of this failure */
+        __get_cpu_var(slot_resets)++;
-        pdn->eeh_mode |= EEH_MODE_ISOLATED;
+ 
-         __get_cpu_var(slot_resets)++;
+        /* Avoid repeated reports of this failure, including problems
+         * with other functions on this device, and functions under
+         * bridges. */
+        pe_dn = find_device_pe (dn);
+        __eeh_mark_slot (pe_dn);
+        spin_unlock_irqrestore(&confirm_error_lock, flags);
        reset_state = rets[0];
@@ -678,10 +745,14 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
        if (rets[0] != 5) dump_stack();
        schedule_work(&eeh_event_wq);
-        return 0;
+        return 1;
+dn_unlock:
+        spin_unlock_irqrestore(&confirm_error_lock, flags);
+        return rc;
 }
-EXPORT_SYMBOL(eeh_dn_check_failure);
+EXPORT_SYMBOL_GPL(eeh_dn_check_failure);
 /**
 * eeh_check_failure - check if all 1's data is due to EEH slot freeze
@@ -820,6 +891,7 @@ void __init eeh_init(void)
        struct device_node *phb, *np;
        struct eeh_early_enable_info info;
+        spin_lock_init(&confirm_error_lock);
        spin_lock_init(&slot_errbuf_lock);
        np = of_find_node_by_path("/rtas");
author	Linas Vepstas <linas@linas.org>	2005-11-03 19:49:23 -0500
committer	Paul Mackerras <paulus@samba.org>	2005-11-09 19:33:32 -0500
commit	fd761fd876e4d1c0d07b6d93bc45c999fa596cb0 (patch)
tree	9b117c197cd011e66cd623b54bee8586426fdabf /arch
parent	76e6faf7a3a3ad3e18a1b70f9e4cd96cdf58140d (diff)

diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c index 0060934dffd2..e7522f6da69d 100644 --- a/arch/ppc64/kernel/eeh.c +++ b/arch/ppc64/kernel/eeh.c
@@ -96,6 +96,9 @@ static int ibm_slot_error_detail;
96		96
97	static int eeh_subsystem_enabled;	97	static int eeh_subsystem_enabled;
98		98
		99	/* Lock to avoid races due to multiple reports of an error */
		100	static DEFINE_SPINLOCK(confirm_error_lock);
		101
99	/* Buffer for reporting slot-error-detail rtas calls */	102	/* Buffer for reporting slot-error-detail rtas calls */
100	static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];	103	static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
101	static DEFINE_SPINLOCK(slot_errbuf_lock);	104	static DEFINE_SPINLOCK(slot_errbuf_lock);
@@ -544,6 +547,55 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
544	return pa | (token & (PAGE_SIZE-1));	547	return pa | (token & (PAGE_SIZE-1));
545	}	548	}
546		549
		550	/**
		551	* Return the "partitionable endpoint" (pe) under which this device lies
		552	*/
		553	static struct device_node * find_device_pe(struct device_node *dn)
		554	{
		555	while ((dn->parent) && PCI_DN(dn->parent) &&
		556	(PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
		557	dn = dn->parent;
		558	}
		559	return dn;
		560	}
		561
		562	/** Mark all devices that are peers of this device as failed.
		563	* Mark the device driver too, so that it can see the failure
		564	* immediately; this is critical, since some drivers poll
		565	* status registers in interrupts ... If a driver is polling,
		566	* and the slot is frozen, then the driver can deadlock in
		567	* an interrupt context, which is bad.
		568	*/
		569
		570	static inline void __eeh_mark_slot (struct device_node *dn)
		571	{
		572	while (dn) {
		573	PCI_DN(dn)->eeh_mode |= EEH_MODE_ISOLATED;
		574
		575	if (dn->child)
		576	__eeh_mark_slot (dn->child);
		577	dn = dn->sibling;
		578	}
		579	}
		580
		581	static inline void __eeh_clear_slot (struct device_node *dn)
		582	{
		583	while (dn) {
		584	PCI_DN(dn)->eeh_mode &= ~EEH_MODE_ISOLATED;
		585	if (dn->child)
		586	__eeh_clear_slot (dn->child);
		587	dn = dn->sibling;
		588	}
		589	}
		590
		591	static inline void eeh_clear_slot (struct device_node *dn)
		592	{
		593	unsigned long flags;
		594	spin_lock_irqsave(&confirm_error_lock, flags);
		595	__eeh_clear_slot (dn);
		596	spin_unlock_irqrestore(&confirm_error_lock, flags);
		597	}
		598
547	/**	599	/**
548	* eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze	600	* eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
549	* @dn device node	601	* @dn device node
@@ -567,6 +619,8 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
567	int reset_state;	619	int reset_state;
568	struct eeh_event *event;	620	struct eeh_event *event;
569	struct pci_dn *pdn;	621	struct pci_dn *pdn;
		622	struct device_node *pe_dn;
		623	int rc = 0;
570		624
571	__get_cpu_var(total_mmio_ffs)++;	625	__get_cpu_var(total_mmio_ffs)++;
572		626
@@ -594,10 +648,14 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
594	return 0;	648	return 0;
595	}	649	}
596		650
597	/*	651	/* If we already have a pending isolation event for this
598	* If we already have a pending isolation event for this	652	* slot, we know it's bad already, we don't need to check.
599	* slot, we know it's bad already, we don't need to check...	653	* Do this checking under a lock; as multiple PCI devices
		654	* in one slot might report errors simultaneously, and we
		655	* only want one error recovery routine running.
600	*/	656	*/
		657	spin_lock_irqsave(&confirm_error_lock, flags);
		658	rc = 1;
601	if (pdn->eeh_mode & EEH_MODE_ISOLATED) {	659	if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
602	atomic_inc(&eeh_fail_count);	660	atomic_inc(&eeh_fail_count);
603	if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {	661	if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {
@@ -606,7 +664,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
606	rets[0] = -1; /* reset state unknown */	664	rets[0] = -1; /* reset state unknown */
607	eeh_panic(dev, rets[0]);	665	eeh_panic(dev, rets[0]);
608	}	666	}
609	return 0;	667	goto dn_unlock;
610	}	668	}
611		669
612	/*	670	/*
@@ -623,7 +681,8 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
623	printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",	681	printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",
624	ret, dn->full_name);	682	ret, dn->full_name);
625	__get_cpu_var(false_positives)++;	683	__get_cpu_var(false_positives)++;
626	return 0;	684	rc = 0;
		685	goto dn_unlock;
627	}	686	}
628		687
629	/* If EEH is not supported on this device, punt. */	688	/* If EEH is not supported on this device, punt. */
@@ -631,25 +690,33 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
631	printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",	690	printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",
632	ret, dn->full_name);	691	ret, dn->full_name);
633	__get_cpu_var(false_positives)++;	692	__get_cpu_var(false_positives)++;
634	return 0;	693	rc = 0;
		694	goto dn_unlock;
635	}	695	}
636		696
637	/* If not the kind of error we know about, punt. */	697	/* If not the kind of error we know about, punt. */
638	if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {	698	if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {
639	__get_cpu_var(false_positives)++;	699	__get_cpu_var(false_positives)++;
640	return 0;	700	rc = 0;
		701	goto dn_unlock;
641	}	702	}
642		703
643	/* Note that config-io to empty slots may fail;	704	/* Note that config-io to empty slots may fail;
644	* we recognize empty because they don't have children. */	705	* we recognize empty because they don't have children. */
645	if ((rets[0] == 5) && (dn->child == NULL)) {	706	if ((rets[0] == 5) && (dn->child == NULL)) {
646	__get_cpu_var(false_positives)++;	707	__get_cpu_var(false_positives)++;
647	return 0;	708	rc = 0;
		709	goto dn_unlock;
648	}	710	}
649		711
650	/* prevent repeated reports of this failure */	712	__get_cpu_var(slot_resets)++;
651	pdn->eeh_mode |= EEH_MODE_ISOLATED;	713
652	__get_cpu_var(slot_resets)++;	714	/* Avoid repeated reports of this failure, including problems
		715	* with other functions on this device, and functions under
		716	* bridges. */
		717	pe_dn = find_device_pe (dn);
		718	__eeh_mark_slot (pe_dn);
		719	spin_unlock_irqrestore(&confirm_error_lock, flags);
653		720
654	reset_state = rets[0];	721	reset_state = rets[0];
655		722
@@ -678,10 +745,14 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
678	if (rets[0] != 5) dump_stack();	745	if (rets[0] != 5) dump_stack();
679	schedule_work(&eeh_event_wq);	746	schedule_work(&eeh_event_wq);
680		747
681	return 0;	748	return 1;
		749
		750	dn_unlock:
		751	spin_unlock_irqrestore(&confirm_error_lock, flags);
		752	return rc;
682	}	753	}
683		754
684	EXPORT_SYMBOL(eeh_dn_check_failure);	755	EXPORT_SYMBOL_GPL(eeh_dn_check_failure);
685		756
686	/**	757	/**
687	* eeh_check_failure - check if all 1's data is due to EEH slot freeze	758	* eeh_check_failure - check if all 1's data is due to EEH slot freeze
@@ -820,6 +891,7 @@ void __init eeh_init(void)
820	struct device_node *phb, *np;	891	struct device_node *phb, *np;
821	struct eeh_early_enable_info info;	892	struct eeh_early_enable_info info;
822		893
		894	spin_lock_init(&confirm_error_lock);
823	spin_lock_init(&slot_errbuf_lock);	895	spin_lock_init(&slot_errbuf_lock);
824		896
825	np = of_find_node_by_path("/rtas");	897	np = of_find_node_by_path("/rtas");