myri10ge: rework parity error check and cleanup

Clean up watchdog reset code:
 - move code that checks for stuck slice to a common routine
 - unless there is a confirmed h/w fault, verify that a stuck slice is still stuck in the watchdog worker; if the slice is no longer stuck, abort the reset.
 - this removes an egregious 2000ms pause in the watchdog worker that was a diagnostic aid (to look for spurious resets) that snuck into production code.

v3 includes corrections from Joe Perches

Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
author: Jon Mason <mason@myri.com> 2011-06-27 13:57:28 -0400
committer: David S. Miller <davem@davemloft.net> 2011-06-29 09:02:04 -0400
commit: c689b81b4267b1335b11f18fe8a79c56880d9d43 (patch)
tree: ccfba7f4a8003a63194ab4eca263e6574b80f762 /drivers/net/myri10ge/myri10ge.c
parent: 7539a613c646f9e870bbedfa753a54cf13b98d22 (diff)
1 files changed, 60 insertions, 40 deletions
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index 0f0f83d50ddc..ca0345795fa6 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -193,6 +193,7 @@ struct myri10ge_slice_state {
        int watchdog_tx_done;
        int watchdog_tx_req;
        int watchdog_rx_done;
+        int stuck;
 #ifdef CONFIG_MYRI10GE_DCA
        int cached_dca_tag;
        int cpu;
@@ -3442,6 +3443,42 @@ static u32 myri10ge_read_reboot(struct myri10ge_priv *mgp)
        return reboot;
 }
+static void
+myri10ge_check_slice(struct myri10ge_slice_state *ss, int *reset_needed,
+                     int *busy_slice_cnt, u32 rx_pause_cnt)
+{
+        struct myri10ge_priv *mgp = ss->mgp;
+        int slice = ss - mgp->ss;
+        if (ss->tx.req != ss->tx.done &&
+            ss->tx.done == ss->watchdog_tx_done &&
+            ss->watchdog_tx_req != ss->watchdog_tx_done) {
+                /* nic seems like it might be stuck.. */
+                if (rx_pause_cnt != mgp->watchdog_pause) {
+                        if (net_ratelimit())
+                                netdev_warn(mgp->dev, "slice %d: TX paused, "
+                                            "check link partner\n", slice);
+                } else {
+                        netdev_warn(mgp->dev,
+                                    "slice %d: TX stuck %d %d %d %d %d %d\n",
+                                    slice, ss->tx.queue_active, ss->tx.req,
+                                    ss->tx.done, ss->tx.pkt_start,
+                                    ss->tx.pkt_done,
+                                    (int)ntohl(mgp->ss[slice].fw_stats->
+                                               send_done_count));
+                        *reset_needed = 1;
+                        ss->stuck = 1;
+                }
+        }
+        if (ss->watchdog_tx_done != ss->tx.done ||
+            ss->watchdog_rx_done != ss->rx_done.cnt) {
+                *busy_slice_cnt += 1;
+        }
+        ss->watchdog_tx_done = ss->tx.done;
+        ss->watchdog_tx_req = ss->tx.req;
+        ss->watchdog_rx_done = ss->rx_done.cnt;
+}
 /*
 * This watchdog is used to check whether the board has suffered
 * from a parity error and needs to be recovered.
@@ -3450,10 +3487,12 @@ static void myri10ge_watchdog(struct work_struct *work)
 {
        struct myri10ge_priv *mgp =
            container_of(work, struct myri10ge_priv, watchdog_work);
-        struct myri10ge_tx_buf *tx;
+        struct myri10ge_slice_state *ss;
-        u32 reboot;
+        u32 reboot, rx_pause_cnt;
        int status, rebooted;
        int i;
+        int reset_needed = 0;
+        int busy_slice_cnt = 0;
        u16 cmd, vendor;
        mgp->watchdog_resets++;
@@ -3465,8 +3504,7 @@ static void myri10ge_watchdog(struct work_struct *work)
                 * For now, just report it */
                reboot = myri10ge_read_reboot(mgp);
                netdev_err(mgp->dev, "NIC rebooted (0x%x),%s resetting\n",
-                           reboot,
+                           reboot, myri10ge_reset_recover ? "" : " not");
-                           myri10ge_reset_recover ? "" : " not");
                if (myri10ge_reset_recover == 0)
                        return;
                rtnl_lock();
@@ -3498,23 +3536,24 @@ static void myri10ge_watchdog(struct work_struct *work)
                                return;
                        }
                }
-                /* Perhaps it is a software error.  Try to reset */
+                /* Perhaps it is a software error. See if stuck slice
+                 * has recovered, reset if not */
-                netdev_err(mgp->dev, "device timeout, resetting\n");
+                rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause);
                for (i = 0; i < mgp->num_slices; i++) {
-                        tx = &mgp->ss[i].tx;
+                        ss = mgp->ss;
-                        netdev_err(mgp->dev, "(%d): %d %d %d %d %d %d\n",
+                        if (ss->stuck) {
-                                   i, tx->queue_active, tx->req,
+                                myri10ge_check_slice(ss, &reset_needed,
-                                   tx->done, tx->pkt_start, tx->pkt_done,
+                                                     &busy_slice_cnt,
-                                   (int)ntohl(mgp->ss[i].fw_stats->
+                                                     rx_pause_cnt);
-                                              send_done_count));
+                                ss->stuck = 0;
-                        msleep(2000);
+                        }
-                        netdev_info(mgp->dev, "(%d): %d %d %d %d %d %d\n",
-                                    i, tx->queue_active, tx->req,
-                                    tx->done, tx->pkt_start, tx->pkt_done,
-                                    (int)ntohl(mgp->ss[i].fw_stats->
-                                               send_done_count));
                }
+                if (!reset_needed) {
+                        netdev_dbg(mgp->dev, "not resetting\n");
+                        return;
+                }
+                netdev_err(mgp->dev, "device timeout, resetting\n");
        }
        if (!rebooted) {
@@ -3567,27 +3606,8 @@ static void myri10ge_watchdog_timer(unsigned long arg)
                            myri10ge_fill_thresh)
                                ss->rx_big.watchdog_needed = 0;
                }
+                myri10ge_check_slice(ss, &reset_needed, &busy_slice_cnt,
-                if (ss->tx.req != ss->tx.done &&
+                                     rx_pause_cnt);
-                    ss->tx.done == ss->watchdog_tx_done &&
-                    ss->watchdog_tx_req != ss->watchdog_tx_done) {
-                        /* nic seems like it might be stuck.. */
-                        if (rx_pause_cnt != mgp->watchdog_pause) {
-                                if (net_ratelimit())
-                                        netdev_err(mgp->dev, "slice %d: TX paused, check link partner\n",
-                                                   i);
-                        } else {
-                                netdev_warn(mgp->dev, "slice %d stuck:", i);
-                                reset_needed = 1;
-                        }
-                }
-                if (ss->watchdog_tx_done != ss->tx.done ||
-                    ss->watchdog_rx_done != ss->rx_done.cnt) {
-                        busy_slice_cnt++;
-                }
-                ss->watchdog_tx_done = ss->tx.done;
-                ss->watchdog_tx_req = ss->tx.req;
-                ss->watchdog_rx_done = ss->rx_done.cnt;
        }
        /* if we've sent or received no traffic, poll the NIC to
         * ensure it is still there.  Otherwise, we risk not noticing
author	Jon Mason <mason@myri.com>	2011-06-27 13:57:28 -0400
committer	David S. Miller <davem@davemloft.net>	2011-06-29 09:02:04 -0400
commit	c689b81b4267b1335b11f18fe8a79c56880d9d43 (patch)
tree	ccfba7f4a8003a63194ab4eca263e6574b80f762 /drivers/net/myri10ge/myri10ge.c
parent	7539a613c646f9e870bbedfa753a54cf13b98d22 (diff)

diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index 0f0f83d50ddc..ca0345795fa6 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c
@@ -193,6 +193,7 @@ struct myri10ge_slice_state {
193	int watchdog_tx_done;	193	int watchdog_tx_done;
194	int watchdog_tx_req;	194	int watchdog_tx_req;
195	int watchdog_rx_done;	195	int watchdog_rx_done;
		196	int stuck;
196	#ifdef CONFIG_MYRI10GE_DCA	197	#ifdef CONFIG_MYRI10GE_DCA
197	int cached_dca_tag;	198	int cached_dca_tag;
198	int cpu;	199	int cpu;
@@ -3442,6 +3443,42 @@ static u32 myri10ge_read_reboot(struct myri10ge_priv *mgp)
3442	return reboot;	3443	return reboot;
3443	}	3444	}
3444		3445
		3446	static void
		3447	myri10ge_check_slice(struct myri10ge_slice_state *ss, int *reset_needed,
		3448	int *busy_slice_cnt, u32 rx_pause_cnt)
		3449	{
		3450	struct myri10ge_priv *mgp = ss->mgp;
		3451	int slice = ss - mgp->ss;
		3452
		3453	if (ss->tx.req != ss->tx.done &&
		3454	ss->tx.done == ss->watchdog_tx_done &&
		3455	ss->watchdog_tx_req != ss->watchdog_tx_done) {
		3456	/* nic seems like it might be stuck.. */
		3457	if (rx_pause_cnt != mgp->watchdog_pause) {
		3458	if (net_ratelimit())
		3459	netdev_warn(mgp->dev, "slice %d: TX paused, "
		3460	"check link partner\n", slice);
		3461	} else {
		3462	netdev_warn(mgp->dev,
		3463	"slice %d: TX stuck %d %d %d %d %d %d\n",
		3464	slice, ss->tx.queue_active, ss->tx.req,
		3465	ss->tx.done, ss->tx.pkt_start,
		3466	ss->tx.pkt_done,
		3467	(int)ntohl(mgp->ss[slice].fw_stats->
		3468	send_done_count));
		3469	*reset_needed = 1;
		3470	ss->stuck = 1;
		3471	}
		3472	}
		3473	if (ss->watchdog_tx_done != ss->tx.done ||
		3474	ss->watchdog_rx_done != ss->rx_done.cnt) {
		3475	*busy_slice_cnt += 1;
		3476	}
		3477	ss->watchdog_tx_done = ss->tx.done;
		3478	ss->watchdog_tx_req = ss->tx.req;
		3479	ss->watchdog_rx_done = ss->rx_done.cnt;
		3480	}
		3481
3445	/*	3482	/*
3446	* This watchdog is used to check whether the board has suffered	3483	* This watchdog is used to check whether the board has suffered
3447	* from a parity error and needs to be recovered.	3484	* from a parity error and needs to be recovered.
@@ -3450,10 +3487,12 @@ static void myri10ge_watchdog(struct work_struct *work)
3450	{	3487	{
3451	struct myri10ge_priv *mgp =	3488	struct myri10ge_priv *mgp =
3452	container_of(work, struct myri10ge_priv, watchdog_work);	3489	container_of(work, struct myri10ge_priv, watchdog_work);
3453	struct myri10ge_tx_buf *tx;	3490	struct myri10ge_slice_state *ss;
3454	u32 reboot;	3491	u32 reboot, rx_pause_cnt;
3455	int status, rebooted;	3492	int status, rebooted;
3456	int i;	3493	int i;
		3494	int reset_needed = 0;
		3495	int busy_slice_cnt = 0;
3457	u16 cmd, vendor;	3496	u16 cmd, vendor;
3458		3497
3459	mgp->watchdog_resets++;	3498	mgp->watchdog_resets++;
@@ -3465,8 +3504,7 @@ static void myri10ge_watchdog(struct work_struct *work)
3465	* For now, just report it */	3504	* For now, just report it */
3466	reboot = myri10ge_read_reboot(mgp);	3505	reboot = myri10ge_read_reboot(mgp);
3467	netdev_err(mgp->dev, "NIC rebooted (0x%x),%s resetting\n",	3506	netdev_err(mgp->dev, "NIC rebooted (0x%x),%s resetting\n",
3468	reboot,	3507	reboot, myri10ge_reset_recover ? "" : " not");
3469	myri10ge_reset_recover ? "" : " not");
3470	if (myri10ge_reset_recover == 0)	3508	if (myri10ge_reset_recover == 0)
3471	return;	3509	return;
3472	rtnl_lock();	3510	rtnl_lock();
@@ -3498,23 +3536,24 @@ static void myri10ge_watchdog(struct work_struct *work)
3498	return;	3536	return;
3499	}	3537	}
3500	}	3538	}
3501	/* Perhaps it is a software error. Try to reset */	3539	/* Perhaps it is a software error. See if stuck slice
3502		3540	* has recovered, reset if not */
3503	netdev_err(mgp->dev, "device timeout, resetting\n");	3541	rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause);
3504	for (i = 0; i < mgp->num_slices; i++) {	3542	for (i = 0; i < mgp->num_slices; i++) {
3505	tx = &mgp->ss[i].tx;	3543	ss = mgp->ss;
3506	netdev_err(mgp->dev, "(%d): %d %d %d %d %d %d\n",	3544	if (ss->stuck) {
3507	i, tx->queue_active, tx->req,	3545	myri10ge_check_slice(ss, &reset_needed,
3508	tx->done, tx->pkt_start, tx->pkt_done,	3546	&busy_slice_cnt,
3509	(int)ntohl(mgp->ss[i].fw_stats->	3547	rx_pause_cnt);
3510	send_done_count));	3548	ss->stuck = 0;
3511	msleep(2000);	3549	}
3512	netdev_info(mgp->dev, "(%d): %d %d %d %d %d %d\n",
3513	i, tx->queue_active, tx->req,
3514	tx->done, tx->pkt_start, tx->pkt_done,
3515	(int)ntohl(mgp->ss[i].fw_stats->
3516	send_done_count));
3517	}	3550	}
		3551	if (!reset_needed) {
		3552	netdev_dbg(mgp->dev, "not resetting\n");
		3553	return;
		3554	}
		3555
		3556	netdev_err(mgp->dev, "device timeout, resetting\n");
3518	}	3557	}
3519		3558
3520	if (!rebooted) {	3559	if (!rebooted) {
@@ -3567,27 +3606,8 @@ static void myri10ge_watchdog_timer(unsigned long arg)
3567	myri10ge_fill_thresh)	3606	myri10ge_fill_thresh)
3568	ss->rx_big.watchdog_needed = 0;	3607	ss->rx_big.watchdog_needed = 0;
3569	}	3608	}
3570		3609	myri10ge_check_slice(ss, &reset_needed, &busy_slice_cnt,
3571	if (ss->tx.req != ss->tx.done &&	3610	rx_pause_cnt);
3572	ss->tx.done == ss->watchdog_tx_done &&
3573	ss->watchdog_tx_req != ss->watchdog_tx_done) {
3574	/* nic seems like it might be stuck.. */
3575	if (rx_pause_cnt != mgp->watchdog_pause) {
3576	if (net_ratelimit())
3577	netdev_err(mgp->dev, "slice %d: TX paused, check link partner\n",
3578	i);
3579	} else {
3580	netdev_warn(mgp->dev, "slice %d stuck:", i);
3581	reset_needed = 1;
3582	}
3583	}
3584	if (ss->watchdog_tx_done != ss->tx.done ||
3585	ss->watchdog_rx_done != ss->rx_done.cnt) {
3586	busy_slice_cnt++;
3587	}
3588	ss->watchdog_tx_done = ss->tx.done;
3589	ss->watchdog_tx_req = ss->tx.req;
3590	ss->watchdog_rx_done = ss->rx_done.cnt;
3591	}	3611	}
3592	/* if we've sent or received no traffic, poll the NIC to	3612	/* if we've sent or received no traffic, poll the NIC to
3593	* ensure it is still there. Otherwise, we risk not noticing	3613	* ensure it is still there. Otherwise, we risk not noticing