powerpc/eeh: Allow to check fenced PHB proactively

It's meaningless to handle a frozen PE if the PHB has already been fenced. This patch checks the PHB state before checking the PE. If the PHB has been put into the fenced state, we need to take care of that first.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
author: Gavin Shan <shangw@linux.vnet.ibm.com> 2013-06-20 01:21:16 -0400
committer: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2013-06-20 03:06:53 -0400
commit: b95cd2cd44b39cf11087b15f74e29ef9f2c6bf0f (patch)
tree: a89c79c3dd68b32003f96ba47cee56f6072caebe /arch/powerpc/kernel
parent: be7e744607175fb1620f0390d20c880e16de163b (diff)
1 files changed, 60 insertions, 0 deletions
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 81cd0311dee8..7c567be3dd03 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -269,6 +269,58 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
        return pa | (token & (PAGE_SIZE-1));
 }
+/*
+ * eeh_phb_check_failure - Check whether the PHB owning a PE has failed
+ * @pe: EEH PE whose PHB (pe->phb) is examined
+ *
+ * On PowerNV platform, we might already have a fenced PHB there.
+ * For that case, it's meaningless to recover the frozen PE. Instead,
+ * we have to handle the fenced PHB first.
+ *
+ * Return: -EPERM if eeh_probe_mode_dev() is false; -EEXIST if no PE is
+ * found for the PHB; 0 if the PHB PE is already marked isolated or dead,
+ * or if get_state() returns an error, EEH_STATE_NOT_SUPPORT, or a state
+ * with both MMIO and DMA active; 1 after the PHB PE has been marked
+ * isolated and a failure event has been sent for it.
+ */
+static int eeh_phb_check_failure(struct eeh_pe *pe)
+{
+        struct eeh_pe *phb_pe;
+        unsigned long flags;
+        int ret;
+        if (!eeh_probe_mode_dev())
+                return -EPERM;
+        /* Look up the PE that represents the PHB this PE belongs to */
+        phb_pe = eeh_phb_pe_get(pe->phb);
+        if (!phb_pe) {
+                pr_warning("%s Can't find PE for PHB#%d\n",
+                           __func__, pe->phb->global_number);
+                return -EEXIST;
+        }
+        /* Under the serialize lock: nothing to do if the PHB PE is already isolated or dead */
+        eeh_serialize_lock(&flags);
+        if (phb_pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)) {
+                ret = 0;
+                goto out;
+        }
+        /* Query the PHB PE state; error, unsupported, or MMIO+DMA both active => report 0 */
+        ret = eeh_ops->get_state(phb_pe, NULL);
+        if ((ret < 0) ||
+            (ret == EEH_STATE_NOT_SUPPORT) ||
+            (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
+            (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+                ret = 0;
+                goto out;
+        }
+        /* Mark the PHB PE isolated, release the lock, then send the failure event */
+        eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
+        eeh_serialize_unlock(flags);
+        eeh_send_failure_event(phb_pe);
+        WARN(1, "EEH: PHB failure detected\n");
+        return 1;
+out:
+        eeh_serialize_unlock(flags); /* shared exit for the paths that still hold the lock */
+        return ret;
+}
 /**
 * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
 * @edev: eeh device
@@ -319,6 +371,14 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
                return 0;
        }
+        /*
+         * On PowerNV platform, we might already have a fenced PHB
+         * there, and we need to take care of that first.
+         */
+        ret = eeh_phb_check_failure(pe);
+        if (ret > 0)
+                return ret;
        /* If we already have a pending isolation event for this
         * slot, we know it's bad already, we don't need to check.
         * Do this checking under a lock; as multiple PCI devices
author	Gavin Shan <shangw@linux.vnet.ibm.com>	2013-06-20 01:21:16 -0400
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2013-06-20 03:06:53 -0400
commit	b95cd2cd44b39cf11087b15f74e29ef9f2c6bf0f (patch)
tree	a89c79c3dd68b32003f96ba47cee56f6072caebe /arch/powerpc/kernel
parent	be7e744607175fb1620f0390d20c880e16de163b (diff)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 81cd0311dee8..7c567be3dd03 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c
@@ -269,6 +269,58 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
269	return pa \| (token & (PAGE_SIZE-1));	269	return pa \| (token & (PAGE_SIZE-1));
270	}	270	}
271		271
		272	/*
		273	* On PowerNV platform, we might already have fenced PHB there.
		274	* For that case, it's meaningless to recover frozen PE. Instead,
		275	* we have to handle fenced PHB first.
		276	*/
		277	static int eeh_phb_check_failure(struct eeh_pe *pe)
		278	{
		279	struct eeh_pe *phb_pe;
		280	unsigned long flags;
		281	int ret;
		282
		283	if (!eeh_probe_mode_dev())
		284	return -EPERM;
		285
		286	/* Find the PHB PE */
		287	phb_pe = eeh_phb_pe_get(pe->phb);
		288	if (!phb_pe) {
		289	pr_warning("%s Can't find PE for PHB#%d\n",
		290	__func__, pe->phb->global_number);
		291	return -EEXIST;
		292	}
		293
		294	/* If the PHB has been in problematic state */
		295	eeh_serialize_lock(&flags);
		296	if (phb_pe->state & (EEH_PE_ISOLATED \| EEH_PE_PHB_DEAD)) {
		297	ret = 0;
		298	goto out;
		299	}
		300
		301	/* Check PHB state */
		302	ret = eeh_ops->get_state(phb_pe, NULL);
		303	if ((ret < 0) \|\|
		304	(ret == EEH_STATE_NOT_SUPPORT) \|\|
		305	(ret & (EEH_STATE_MMIO_ACTIVE \| EEH_STATE_DMA_ACTIVE)) ==
		306	(EEH_STATE_MMIO_ACTIVE \| EEH_STATE_DMA_ACTIVE)) {
		307	ret = 0;
		308	goto out;
		309	}
		310
		311	/* Isolate the PHB and send event */
		312	eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
		313	eeh_serialize_unlock(flags);
		314	eeh_send_failure_event(phb_pe);
		315
		316	WARN(1, "EEH: PHB failure detected\n");
		317
		318	return 1;
		319	out:
		320	eeh_serialize_unlock(flags);
		321	return ret;
		322	}
		323
272	/**	324	/**
273	* eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze	325	* eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
274	* @edev: eeh device	326	* @edev: eeh device
@@ -319,6 +371,14 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
319	return 0;	371	return 0;
320	}	372	}
321		373
		374	/*
		375	* On PowerNV platform, we might already have a fenced PHB
		376	* there, and we need to take care of that first.
		377	*/
		378	ret = eeh_phb_check_failure(pe);
		379	if (ret > 0)
		380	return ret;
		381
322	/* If we already have a pending isolation event for this	382	/* If we already have a pending isolation event for this
323	* slot, we know it's bad already, we don't need to check.	383	* slot, we know it's bad already, we don't need to check.
324	* Do this checking under a lock; as multiple PCI devices	384	* Do this checking under a lock; as multiple PCI devices