2 files changed, 112 insertions, 18 deletions
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 0c0ac93f422f..a0b11fb3237e 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -53,6 +53,7 @@ struct device_node;
 #define EEH_PE_ISOLATED         (1 << 0)        /* Isolated PE          */
 #define EEH_PE_RECOVERING       (1 << 1)        /* Recovering PE        */
+#define EEH_PE_PHB_DEAD         (1 << 2)        /* Dead PHB             */
 struct eeh_pe {
        int type;                       /* PE type: PHB/Bus/Device      */
@@ -145,6 +146,7 @@ struct eeh_ops {
        int (*configure_bridge)(struct eeh_pe *pe);
        int (*read_config)(struct device_node *dn, int where, int size, u32 *val);
        int (*write_config)(struct device_node *dn, int where, int size, u32 val);
+        int (*next_error)(struct eeh_pe **pe);
 };
 extern struct eeh_ops *eeh_ops;
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 678bc6cddf82..0974e1326842 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -399,24 +399,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
 */
 #define MAX_WAIT_FOR_RECOVERY 150
-/**
+static void eeh_handle_normal_event(struct eeh_pe *pe)
- * eeh_handle_event - Reset a PCI device after hard lockup.
- * @pe: EEH PE
- *
- * While PHB detects address or data parity errors on particular PCI
- * slot, the associated PE will be frozen. Besides, DMA's occurring
- * to wild addresses (which usually happen due to bugs in device
- * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
- * #PERR or other misc PCI-related errors also can trigger EEH errors.
- *
- * Recovery process consists of unplugging the device driver (which
- * generated hotplug events to userspace), then issuing a PCI #RST to
- * the device, then reconfiguring the PCI config space for all bridges
- * & devices under this slot, and then finally restarting the device
- * drivers (which cause a second set of hotplug events to go out to
- * userspace).
- */
-void eeh_handle_event(struct eeh_pe *pe)
 {
        struct pci_bus *frozen_bus;
        int rc = 0;
@@ -554,3 +537,112 @@ perm_error:
        if (frozen_bus)
                pcibios_remove_pci_devices(frozen_bus);
 }
+/**
+ * eeh_handle_special_event - Handle an EEH event that carries no PE
+ *
+ * Asks the platform, through eeh_ops->next_error(), which error is
+ * pending and reacts according to its severity:
+ *
+ * 4 - Dead IOC:   every PHB PE on hose_list is marked isolated and dead,
+ *                 all queued events are purged, and the devices below
+ *                 each dead PHB are notified and removed.
+ * 3 - Dead PHB:   the returned PE is marked isolated and dead, its
+ *                 queued events are purged, and the devices below each
+ *                 dead PHB are notified and removed.
+ * 2 - Fenced PHB: the returned PE is marked isolated and recovering,
+ * 1 - Frozen PE   its queued events are purged, and it is then handed
+ *                 to eeh_handle_normal_event().
+ * 0 - No error found: nothing is done.
+ */
+static void eeh_handle_special_event(void)
+{
+        /*
+         * NULL-initialised: nothing visible here assigns pe when a dead
+         * IOC (rc == 4) is reported, so do not leave it holding stack
+         * garbage.
+         */
+        struct eeh_pe *pe = NULL, *phb_pe;
+        struct pci_bus *bus;
+        struct pci_controller *hose, *tmp;
+        unsigned long flags;
+        int rc;
+        /*
+         * The return value from next_error() has been classified as follows.
+         * It might be good to enumerate them. However, next_error() is only
+         * supported by PowerNV platform for now. So it would be fine to use
+         * integer directly:
+         *
+         * 4 - Dead IOC           3 - Dead PHB
+         * 2 - Fenced PHB         1 - Frozen PE
+         * 0 - No error found
+         *
+         */
+        rc = eeh_ops->next_error(&pe);
+        if (rc <= 0)
+                return;
+        switch (rc) {
+        case 4:
+                /* Mark all PHBs in dead state */
+                eeh_serialize_lock(&flags);
+                list_for_each_entry_safe(hose, tmp,
+                                &hose_list, list_node) {
+                        phb_pe = eeh_phb_pe_get(hose);
+                        if (!phb_pe)
+                                continue;
+                        eeh_pe_state_mark(phb_pe,
+                                EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+                }
+                eeh_serialize_unlock(flags);
+                /* Purge all events */
+                eeh_remove_event(NULL);
+                break;
+        case 3:
+        case 2:
+        case 1:
+                /* Mark the PE in dead (3) or fenced/frozen (2, 1) state */
+                eeh_serialize_lock(&flags);
+                if (rc == 3)
+                        eeh_pe_state_mark(pe,
+                                EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+                else
+                        eeh_pe_state_mark(pe,
+                                EEH_PE_ISOLATED | EEH_PE_RECOVERING);
+                eeh_serialize_unlock(flags);
+                /* Purge all events of the PHB */
+                eeh_remove_event(pe);
+                break;
+        default:
+                pr_err("%s: Invalid value %d from next_error()\n",
+                       __func__, rc);
+                return;
+        }
+        /*
+         * For fenced PHB and frozen PE, it's handled as normal
+         * event. We have to remove the affected PHBs for dead
+         * PHB and IOC
+         */
+        if (rc == 2 || rc == 1)
+                eeh_handle_normal_event(pe);
+        else {
+                list_for_each_entry_safe(hose, tmp,
+                        &hose_list, list_node) {
+                        phb_pe = eeh_phb_pe_get(hose);
+                        if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD))
+                                continue;
+                        bus = eeh_pe_bus_get(phb_pe);
+                        /*
+                         * Notify all devices below this dead PHB that
+                         * they're about to go down. Traverse phb_pe, the
+                         * PE looked up for this hose, rather than pe,
+                         * which is not assigned here for a dead IOC and
+                         * names only one PE when several PHBs are dead.
+                         */
+                        eeh_pe_dev_traverse(phb_pe, eeh_report_failure, NULL);
+                        pcibios_remove_pci_devices(bus);
+                }
+        }
+}
+/**
+ * eeh_handle_event - Entry point for handling a queued EEH event
+ * @pe: EEH PE the event refers to, or NULL for an event without a PE
+ *
+ * An EEH error freezes the PE behind the affected PCI slot. Typical
+ * triggers are address or data parity errors seen by the PHB, DMA to
+ * wild addresses (usually a driver or adapter firmware bug), #SERR,
+ * #PERR and other miscellaneous PCI errors.
+ *
+ * Recovering from one means unplugging the device driver (userspace
+ * sees hotplug events), issuing a PCI #RST to the device, rewriting
+ * the PCI config space of every bridge and device under the slot, and
+ * finally restarting the drivers (a second round of hotplug events).
+ *
+ * An event with a PE is passed to eeh_handle_normal_event(); an event
+ * without one is passed to eeh_handle_special_event().
+ */
+void eeh_handle_event(struct eeh_pe *pe)
+{
+        /* No PE attached: take the special-event path. */
+        if (!pe) {
+                eeh_handle_special_event();
+                return;
+        }
+        eeh_handle_normal_event(pe);
+}

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 0c0ac93f422f..a0b11fb3237e 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h
@@ -53,6 +53,7 @@ struct device_node;
53		53
54	#define EEH_PE_ISOLATED (1 << 0) /* Isolated PE */	54	#define EEH_PE_ISOLATED (1 << 0) /* Isolated PE */
55	#define EEH_PE_RECOVERING (1 << 1) /* Recovering PE */	55	#define EEH_PE_RECOVERING (1 << 1) /* Recovering PE */
		56	#define EEH_PE_PHB_DEAD (1 << 2) /* Dead PHB */
56		57
57	struct eeh_pe {	58	struct eeh_pe {
58	int type; /* PE type: PHB/Bus/Device */	59	int type; /* PE type: PHB/Bus/Device */
@@ -145,6 +146,7 @@ struct eeh_ops {
145	int (*configure_bridge)(struct eeh_pe *pe);	146	int (*configure_bridge)(struct eeh_pe *pe);
146	int (*read_config)(struct device_node *dn, int where, int size, u32 *val);	147	int (*read_config)(struct device_node *dn, int where, int size, u32 *val);
147	int (*write_config)(struct device_node *dn, int where, int size, u32 val);	148	int (*write_config)(struct device_node *dn, int where, int size, u32 val);
		149	int (*next_error)(struct eeh_pe **pe);
148	};	150	};
149		151
150	extern struct eeh_ops *eeh_ops;	152	extern struct eeh_ops *eeh_ops;


diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 678bc6cddf82..0974e1326842 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c
@@ -399,24 +399,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
399	*/	399	*/
400	#define MAX_WAIT_FOR_RECOVERY 150	400	#define MAX_WAIT_FOR_RECOVERY 150
401		401
402	/**	402	static void eeh_handle_normal_event(struct eeh_pe *pe)
403	* eeh_handle_event - Reset a PCI device after hard lockup.
404	* @pe: EEH PE
405	*
406	* While PHB detects address or data parity errors on particular PCI
407	* slot, the associated PE will be frozen. Besides, DMA's occurring
408	* to wild addresses (which usually happen due to bugs in device
409	* drivers or in PCI adapter firmware) can cause EEH error. #SERR,
410	* #PERR or other misc PCI-related errors also can trigger EEH errors.
411	*
412	* Recovery process consists of unplugging the device driver (which
413	* generated hotplug events to userspace), then issuing a PCI #RST to
414	* the device, then reconfiguring the PCI config space for all bridges
415	* & devices under this slot, and then finally restarting the device
416	* drivers (which cause a second set of hotplug events to go out to
417	* userspace).
418	*/
419	void eeh_handle_event(struct eeh_pe *pe)
420	{	403	{
421	struct pci_bus *frozen_bus;	404	struct pci_bus *frozen_bus;
422	int rc = 0;	405	int rc = 0;
@@ -554,3 +537,112 @@ perm_error:
554	if (frozen_bus)	537	if (frozen_bus)
555	pcibios_remove_pci_devices(frozen_bus);	538	pcibios_remove_pci_devices(frozen_bus);
556	}	539	}
		540
		541	static void eeh_handle_special_event(void)
		542	{
		543	struct eeh_pe *pe, *phb_pe;
		544	struct pci_bus *bus;
		545	struct pci_controller *hose, *tmp;
		546	unsigned long flags;
		547	int rc = 0;
		548
		549	/*
		550	* The return value from next_error() has been classified as follows.
		551	* It might be good to enumerate them. However, next_error() is only
		552	* supported by PowerNV platform for now. So it would be fine to use
		553	* integer directly:
		554	*
		555	* 4 - Dead IOC 3 - Dead PHB
		556	* 2 - Fenced PHB 1 - Frozen PE
		557	* 0 - No error found
		558	*
		559	*/
		560	rc = eeh_ops->next_error(&pe);
		561	if (rc <= 0)
		562	return;
		563
		564	switch (rc) {
		565	case 4:
		566	/* Mark all PHBs in dead state */
		567	eeh_serialize_lock(&flags);
		568	list_for_each_entry_safe(hose, tmp,
		569	&hose_list, list_node) {
		570	phb_pe = eeh_phb_pe_get(hose);
		571	if (!phb_pe) continue;
		572
		573	eeh_pe_state_mark(phb_pe,
		574	EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
		575	}
		576	eeh_serialize_unlock(flags);
		577
		578	/* Purge all events */
		579	eeh_remove_event(NULL);
		580	break;
		581	case 3:
		582	case 2:
		583	case 1:
		584	/* Mark the PE in fenced state */
		585	eeh_serialize_lock(&flags);
		586	if (rc == 3)
		587	eeh_pe_state_mark(pe,
		588	EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
		589	else
		590	eeh_pe_state_mark(pe,
		591	EEH_PE_ISOLATED | EEH_PE_RECOVERING);
		592	eeh_serialize_unlock(flags);
		593
		594	/* Purge all events of the PHB */
		595	eeh_remove_event(pe);
		596	break;
		597	default:
		598	pr_err("%s: Invalid value %d from next_error()\n",
		599	__func__, rc);
		600	return;
		601	}
		602
		603	/*
		604	* For fenced PHB and frozen PE, it's handled as normal
		605	* event. We have to remove the affected PHBs for dead
		606	* PHB and IOC
		607	*/
		608	if (rc == 2 || rc == 1)
		609	eeh_handle_normal_event(pe);
		610	else {
		611	list_for_each_entry_safe(hose, tmp,
		612	&hose_list, list_node) {
		613	phb_pe = eeh_phb_pe_get(hose);
		614	if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD))
		615	continue;
		616
		617	bus = eeh_pe_bus_get(phb_pe);
		618	/* Notify all devices that they're about to go down. */
		619	eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
		620	pcibios_remove_pci_devices(bus);
		621	}
		622	}
		623	}
		624
		625	/**
		626	* eeh_handle_event - Reset a PCI device after hard lockup.
		627	* @pe: EEH PE
		628	*
		629	* While PHB detects address or data parity errors on particular PCI
		630	* slot, the associated PE will be frozen. Besides, DMA's occurring
		631	* to wild addresses (which usually happen due to bugs in device
		632	* drivers or in PCI adapter firmware) can cause EEH error. #SERR,
		633	* #PERR or other misc PCI-related errors also can trigger EEH errors.
		634	*
		635	* Recovery process consists of unplugging the device driver (which
		636	* generated hotplug events to userspace), then issuing a PCI #RST to
		637	* the device, then reconfiguring the PCI config space for all bridges
		638	* & devices under this slot, and then finally restarting the device
		639	* drivers (which cause a second set of hotplug events to go out to
		640	* userspace).
		641	*/
		642	void eeh_handle_event(struct eeh_pe *pe)
		643	{
		644	if (pe)
		645	eeh_handle_normal_event(pe);
		646	else
		647	eeh_handle_special_event();
		648	}