wimax/i2400m: add the error recovery mechanism on TX path

This patch adds an error recovery mechanism on the TX path. The intention is to bring the device back to a known state whenever TX sees -110 (-ETIMEDOUT) while copying data to the HW FIFO. Such a TX failure could mean the device bus or function is stuck, so the current error recovery implementation triggers a bus reset and expects that to bring the device back. Since the TX work is done in a thread context, there may already be a queue of TX works that all hit the -ETIMEDOUT error condition because the device is already stuck. We don't want consecutive bus resets simply because multiple TX works in the queue all hit the same device erratum, so the flag "error_recovery" is introduced to denote whether we are ready to take an error recovery. See the @error_recovery doc in i2400m.h. Signed-off-by: Cindy H Kao <cindy.h.kao@intel.com>
author: Cindy H Kao <cindy.h.kao@intel.com> 2010-04-07 23:07:47 -0400
committer: Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> 2010-05-11 17:05:39 -0400
commit: 599e59538448ee49d5470f226bb191b2f78aa3a2 (patch)
tree: 03c8c92a907c19e28bb5f9eef0a1121081515b31 /drivers/net/wimax/i2400m
parent: f4e413458104210bc29aa5c437882c68b4b20100 (diff)
3 files changed, 92 insertions, 0 deletions
diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c
index 1674dba43f83..d83fe84407bf 100644
--- a/drivers/net/wimax/i2400m/driver.c
+++ b/drivers/net/wimax/i2400m/driver.c
@@ -395,6 +395,16 @@ retry:
        result = i2400m_dev_initialize(i2400m);
        if (result < 0)
                goto error_dev_initialize;
+        /* We don't want any additional unwanted error recovery triggered
+         * from any other context so if anything went wrong before we come
+         * here, let's keep i2400m->error_recovery untouched and leave it to
+         * dev_reset_handle(). See dev_reset_handle(). */
+        atomic_dec(&i2400m->error_recovery);
+        /* Everything works so far, ok, now we are ready to
+         * take error recovery if it's required. */
        /* At this point, reports will come for the device and set it
         * to the right state if it is different than UNINITIALIZED */
        d_fnend(3, dev, "(net_dev %p [i2400m %p]) = %d\n",
@@ -770,6 +780,66 @@ int i2400m_dev_reset_handle(struct i2400m *i2400m, const char *reason)
 EXPORT_SYMBOL_GPL(i2400m_dev_reset_handle);
+ /*
+ * The actual work of error recovery.
+ *
+ * The current implementation of error recovery is to trigger a bus reset.
+ *
+ * This is the work handler that i2400m_error_recovery() passes to
+ * i2400m_schedule_work(). @ws is embedded in a struct i2400m_work, from
+ * which the owning i2400m is recovered with container_of(). Any return
+ * value of i2400m_reset() is ignored here. After requesting the reset,
+ * the handler calls i2400m_put() on the device and frees the work item.
+ *
+ * NOTE(review): the i2400m_put() presumably balances a reference taken by
+ * i2400m_schedule_work() -- confirm against that helper.
+ */
+static
+void __i2400m_error_recovery(struct work_struct *ws)
+{
+        struct i2400m_work *iw = container_of(ws, struct i2400m_work, ws);
+        struct i2400m *i2400m = iw->i2400m;
+        i2400m_reset(i2400m, I2400M_RT_BUS);
+        i2400m_put(i2400m);
+        kfree(iw);
+        return;
+}
+/*
+ * Schedule a work struct for error recovery.
+ *
+ * The intention of error recovery is to bring back the device to some
+ * known state whenever TX sees -110 (-ETIMEDOUT) on copying the data to
+ * the device. The TX failure could mean a device bus stuck, so the current
+ * error recovery implementation is to trigger a bus reset to the device
+ * and hopefully it can bring back the device.
+ *
+ * The actual work of error recovery has to be deferred to a separate work
+ * item because it is kicked off from the TX thread (i2400ms->tx_workqueue),
+ * which is to be destroyed by the error recovery mechanism (currently a
+ * bus reset).
+ *
+ * Also, there may be already a queue of TX works that all hit
+ * the -ETIMEDOUT error condition because the device is stuck already.
+ * Since bus reset is used as the error recovery mechanism and we don't
+ * want consecutive bus resets simply because the multiple TX works
+ * in the queue all hit the same device erratum, the flag "error_recovery"
+ * is introduced for preventing unwanted consecutive bus resets.
+ *
+ * Error recovery shall only be invoked again if previous one was completed.
+ * The flag error_recovery is set when error recovery mechanism is scheduled,
+ * and is checked when we need to schedule another error recovery. If it is
+ * in place already, then we shouldn't schedule another one.
+ *
+ * How the flag is handled here: i2400m->error_recovery is incremented with
+ * atomic_add_return(). Only the caller that gets 1 back (the counter was 0,
+ * i.e. "ready") schedules __i2400m_error_recovery(). Any other caller, and
+ * the scheduling caller when i2400m_schedule_work() fails, undoes its own
+ * increment with atomic_dec(). After a successful schedule this function
+ * leaves the counter at 1 ("not ready"); it is decremented again when
+ * dev_start() succeeds.
+ *
+ * @i2400m: device descriptor on which to schedule the error recovery.
+ */
+void i2400m_error_recovery(struct i2400m *i2400m)
+{
+        struct device *dev = i2400m_dev(i2400m);
+        if (atomic_add_return(1, &i2400m->error_recovery) == 1) {
+                if (i2400m_schedule_work(i2400m, __i2400m_error_recovery,
+                        GFP_ATOMIC, NULL, 0) < 0) {
+                        dev_err(dev, "run out of memory for "
+                                "scheduling an error recovery ?\n");
+                        atomic_dec(&i2400m->error_recovery);
+                }
+        } else
+                atomic_dec(&i2400m->error_recovery);
+        return;
+}
+EXPORT_SYMBOL_GPL(i2400m_error_recovery);
 /*
 * Alloc the command and ack buffers for boot mode
 *
@@ -839,6 +909,10 @@ void i2400m_init(struct i2400m *i2400m)
        atomic_set(&i2400m->bus_reset_retries, 0);
        i2400m->alive = 0;
+        /* initialize error_recovery to 1 for denoting we
+         * are not yet ready to take any error recovery */
+        atomic_set(&i2400m->error_recovery, 1);
 }
 EXPORT_SYMBOL_GPL(i2400m_init);
diff --git a/drivers/net/wimax/i2400m/i2400m.h b/drivers/net/wimax/i2400m/i2400m.h
index ad8e6a3be1e3..7a9c2c5b25cb 100644
--- a/drivers/net/wimax/i2400m/i2400m.h
+++ b/drivers/net/wimax/i2400m/i2400m.h
@@ -545,6 +545,15 @@ struct i2400m_barker_db;
 *      all the device reboot events detected can be still handled properly
 *      by either dev_reset_handle() or .pre_reset/.post_reset as long as
 *      the driver presents. It is set 0 along with @updown in dev_stop().
+ *
+ * @error_recovery: flag to denote if we are ready to take an error recovery.
+ *      0 for ready to take an error recovery; 1 for not ready. It is
+ *      initialized to 1 during probe() since we don't intend to take any
+ *      error recovery during probe(). It is decremented by 1 whenever
+ *      dev_start() succeeds to indicate we are ready to take error recovery
+ *      from now on. It is checked every time we want to schedule an error
+ *      recovery. If an error recovery is already in place (error_recovery
+ *      is 1), we should not schedule another one until the last one is done.
 */
 struct i2400m {
        struct wimax_dev wimax_dev;     /* FIRST! See doc */
@@ -625,6 +634,10 @@ struct i2400m {
        /* if the device is expected to be alive */
        unsigned alive;
+        /* 0 if we are ready for error recovery; 1 if not ready  */
+        atomic_t error_recovery;
 };
@@ -847,6 +860,7 @@ void i2400m_put(struct i2400m *i2400m)
 extern int i2400m_dev_reset_handle(struct i2400m *, const char *);
 extern int i2400m_pre_reset(struct i2400m *);
 extern int i2400m_post_reset(struct i2400m *);
+extern void i2400m_error_recovery(struct i2400m *);
 /*
 * _setup()/_release() are called by the probe/disconnect functions of
diff --git a/drivers/net/wimax/i2400m/sdio-tx.c b/drivers/net/wimax/i2400m/sdio-tx.c
index 412b6a8eaef2..b53cd1c80e3e 100644
--- a/drivers/net/wimax/i2400m/sdio-tx.c
+++ b/drivers/net/wimax/i2400m/sdio-tx.c
@@ -98,6 +98,10 @@ void i2400ms_tx_submit(struct work_struct *ws)
                                tx_msg_size, result);
                }
+                if (result == -ETIMEDOUT) {
+                        i2400m_error_recovery(i2400m);
+                        break;
+                }
                d_printf(2, dev, "TX: %zub submitted\n", tx_msg_size);
        }
author	Cindy H Kao <cindy.h.kao@intel.com>	2010-04-07 23:07:47 -0400
committer	Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>	2010-05-11 17:05:39 -0400
commit	599e59538448ee49d5470f226bb191b2f78aa3a2 (patch)
tree	03c8c92a907c19e28bb5f9eef0a1121081515b31 /drivers/net/wimax/i2400m
parent	f4e413458104210bc29aa5c437882c68b4b20100 (diff)

diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c index 1674dba43f83..d83fe84407bf 100644 --- a/drivers/net/wimax/i2400m/driver.c +++ b/drivers/net/wimax/i2400m/driver.c
@@ -395,6 +395,16 @@ retry:
395	result = i2400m_dev_initialize(i2400m);	395	result = i2400m_dev_initialize(i2400m);
396	if (result < 0)	396	if (result < 0)
397	goto error_dev_initialize;	397	goto error_dev_initialize;
		398
		399	/* We don't want any additional unwanted error recovery triggered
		400	* from any other context so if anything went wrong before we come
		401	* here, let's keep i2400m->error_recovery untouched and leave it to
		402	* dev_reset_handle(). See dev_reset_handle(). */
		403
		404	atomic_dec(&i2400m->error_recovery);
		405	/* Every thing works so far, ok, now we are ready to
		406	* take error recovery if it's required. */
		407
398	/* At this point, reports will come for the device and set it	408	/* At this point, reports will come for the device and set it
399	* to the right state if it is different than UNINITIALIZED */	409	* to the right state if it is different than UNINITIALIZED */
400	d_fnend(3, dev, "(net_dev %p [i2400m %p]) = %d\n",	410	d_fnend(3, dev, "(net_dev %p [i2400m %p]) = %d\n",
@@ -770,6 +780,66 @@ int i2400m_dev_reset_handle(struct i2400m i2400m, const char reason)
770	EXPORT_SYMBOL_GPL(i2400m_dev_reset_handle);	780	EXPORT_SYMBOL_GPL(i2400m_dev_reset_handle);
771		781
772		782
		783	/*
		784	* The actual work of error recovery.
		785	*
		786	* The current implementation of error recovery is to trigger a bus reset.
		787	*/
		788	static
		789	void __i2400m_error_recovery(struct work_struct *ws)
		790	{
		791	struct i2400m_work *iw = container_of(ws, struct i2400m_work, ws);
		792	struct i2400m *i2400m = iw->i2400m;
		793
		794	i2400m_reset(i2400m, I2400M_RT_BUS);
		795
		796	i2400m_put(i2400m);
		797	kfree(iw);
		798	return;
		799	}
		800
		801	/*
		802	* Schedule a work struct for error recovery.
		803	*
		804	* The intention of error recovery is to bring back the device to some
		805	* known state whenever TX sees -110 (-ETIMEOUT) on copying the data to
		806	* the device. The TX failure could mean a device bus stuck, so the current
		807	* error recovery implementation is to trigger a bus reset to the device
		808	* and hopefully it can bring back the device.
		809	*
		810	* The actual work of error recovery has to be in a thread context because
		811	* it is kicked off in the TX thread (i2400ms->tx_workqueue) which is to be
		812	* destroyed by the error recovery mechanism (currently a bus reset).
		813	*
		814	* Also, there may be already a queue of TX works that all hit
		815	* the -ETIMEOUT error condition because the device is stuck already.
		816	* Since bus reset is used as the error recovery mechanism and we don't
		817	* want consecutive bus resets simply because the multiple TX works
		818	* in the queue all hit the same device erratum, the flag "error_recovery"
		819	* is introduced for preventing unwanted consecutive bus resets.
		820	*
		821	* Error recovery shall only be invoked again if previous one was completed.
		822	* The flag error_recovery is set when error recovery mechanism is scheduled,
		823	* and is checked when we need to schedule another error recovery. If it is
		824	* in place already, then we shouldn't schedule another one.
		825	*/
		826	void i2400m_error_recovery(struct i2400m *i2400m)
		827	{
		828	struct device *dev = i2400m_dev(i2400m);
		829
		830	if (atomic_add_return(1, &i2400m->error_recovery) == 1) {
		831	if (i2400m_schedule_work(i2400m, __i2400m_error_recovery,
		832	GFP_ATOMIC, NULL, 0) < 0) {
		833	dev_err(dev, "run out of memory for "
		834	"scheduling an error recovery ?\n");
		835	atomic_dec(&i2400m->error_recovery);
		836	}
		837	} else
		838	atomic_dec(&i2400m->error_recovery);
		839	return;
		840	}
		841	EXPORT_SYMBOL_GPL(i2400m_error_recovery);
		842
773	/*	843	/*
774	* Alloc the command and ack buffers for boot mode	844	* Alloc the command and ack buffers for boot mode
775	*	845	*
@@ -839,6 +909,10 @@ void i2400m_init(struct i2400m *i2400m)
839	atomic_set(&i2400m->bus_reset_retries, 0);	909	atomic_set(&i2400m->bus_reset_retries, 0);
840		910
841	i2400m->alive = 0;	911	i2400m->alive = 0;
		912
		913	/* initialize error_recovery to 1 for denoting we
		914	* are not yet ready to take any error recovery */
		915	atomic_set(&i2400m->error_recovery, 1);
842	}	916	}
843	EXPORT_SYMBOL_GPL(i2400m_init);	917	EXPORT_SYMBOL_GPL(i2400m_init);
844		918


diff --git a/drivers/net/wimax/i2400m/i2400m.h b/drivers/net/wimax/i2400m/i2400m.h index ad8e6a3be1e3..7a9c2c5b25cb 100644 --- a/drivers/net/wimax/i2400m/i2400m.h +++ b/drivers/net/wimax/i2400m/i2400m.h
@@ -545,6 +545,15 @@ struct i2400m_barker_db;
545	* all the device reboot events detected can be still handled properly	545	* all the device reboot events detected can be still handled properly
546	* by either dev_reset_handle() or .pre_reset/.post_reset as long as	546	* by either dev_reset_handle() or .pre_reset/.post_reset as long as
547	* the driver presents. It is set 0 along with @updown in dev_stop().	547	* the driver presents. It is set 0 along with @updown in dev_stop().
		548	*
		549	* @error_recovery: flag to denote if we are ready to take an error recovery.
		550	* 0 for ready to take an error recovery; 1 for not ready. It is
		551	* initialized to 1 while probe() since we don't tend to take any error
		552	* recovery during probe(). It is decremented by 1 whenever dev_start()
		553	* succeeds to indicate we are ready to take error recovery from now on.
		554	* It is checked every time we wanna schedule an error recovery. If an
		555	* error recovery is already in place (error_recovery was set 1), we
		556	* should not schedule another one until the last one is done.
548	*/	557	*/
549	struct i2400m {	558	struct i2400m {
550	struct wimax_dev wimax_dev; /* FIRST! See doc */	559	struct wimax_dev wimax_dev; /* FIRST! See doc */
@@ -625,6 +634,10 @@ struct i2400m {
625		634
626	/* if the device is expected to be alive */	635	/* if the device is expected to be alive */
627	unsigned alive;	636	unsigned alive;
		637
		638	/* 0 if we are ready for error recovery; 1 if not ready */
		639	atomic_t error_recovery;
		640
628	};	641	};
629		642
630		643
@@ -847,6 +860,7 @@ void i2400m_put(struct i2400m *i2400m)
847	extern int i2400m_dev_reset_handle(struct i2400m , const char );	860	extern int i2400m_dev_reset_handle(struct i2400m , const char );
848	extern int i2400m_pre_reset(struct i2400m *);	861	extern int i2400m_pre_reset(struct i2400m *);
849	extern int i2400m_post_reset(struct i2400m *);	862	extern int i2400m_post_reset(struct i2400m *);
		863	extern void i2400m_error_recovery(struct i2400m *);
850		864
851	/*	865	/*
852	* _setup()/_release() are called by the probe/disconnect functions of	866	* _setup()/_release() are called by the probe/disconnect functions of


diff --git a/drivers/net/wimax/i2400m/sdio-tx.c b/drivers/net/wimax/i2400m/sdio-tx.c index 412b6a8eaef2..b53cd1c80e3e 100644 --- a/drivers/net/wimax/i2400m/sdio-tx.c +++ b/drivers/net/wimax/i2400m/sdio-tx.c
@@ -98,6 +98,10 @@ void i2400ms_tx_submit(struct work_struct *ws)
98	tx_msg_size, result);	98	tx_msg_size, result);
99	}	99	}
100		100
		101	if (result == -ETIMEDOUT) {
		102	i2400m_error_recovery(i2400m);
		103	break;
		104	}
101	d_printf(2, dev, "TX: %zub submitted\n", tx_msg_size);	105	d_printf(2, dev, "TX: %zub submitted\n", tx_msg_size);
102	}	106	}
103		107