diff options
author | Jon Mason <mason@myri.com> | 2011-06-27 13:57:28 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2011-06-29 09:02:04 -0400 |
commit | c689b81b4267b1335b11f18fe8a79c56880d9d43 (patch) | |
tree | ccfba7f4a8003a63194ab4eca263e6574b80f762 /drivers/net/myri10ge/myri10ge.c | |
parent | 7539a613c646f9e870bbedfa753a54cf13b98d22 (diff) |
myri10ge: rework parity error check and cleanup
Clean up watchdog reset code:
- move code that checks for stuck slice to a common routine
- unless there is a confirmed h/w fault, verify that a stuck
slice is still stuck in the watchdog worker; if the slice is no
longer stuck, abort the reset.
- this removes an egregious 2000ms pause in the watchdog worker that
was a diagnostic aid (to look for spurious resets) that snuck into
production code.
v3 includes corrections from Joe Perches
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/myri10ge/myri10ge.c')
-rw-r--r-- | drivers/net/myri10ge/myri10ge.c | 100 |
1 files changed, 60 insertions, 40 deletions
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index 0f0f83d50ddc..ca0345795fa6 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c | |||
@@ -193,6 +193,7 @@ struct myri10ge_slice_state { | |||
193 | int watchdog_tx_done; | 193 | int watchdog_tx_done; |
194 | int watchdog_tx_req; | 194 | int watchdog_tx_req; |
195 | int watchdog_rx_done; | 195 | int watchdog_rx_done; |
196 | int stuck; | ||
196 | #ifdef CONFIG_MYRI10GE_DCA | 197 | #ifdef CONFIG_MYRI10GE_DCA |
197 | int cached_dca_tag; | 198 | int cached_dca_tag; |
198 | int cpu; | 199 | int cpu; |
@@ -3442,6 +3443,42 @@ static u32 myri10ge_read_reboot(struct myri10ge_priv *mgp) | |||
3442 | return reboot; | 3443 | return reboot; |
3443 | } | 3444 | } |
3444 | 3445 | ||
3446 | static void | ||
3447 | myri10ge_check_slice(struct myri10ge_slice_state *ss, int *reset_needed, | ||
3448 | int *busy_slice_cnt, u32 rx_pause_cnt) | ||
3449 | { | ||
3450 | struct myri10ge_priv *mgp = ss->mgp; | ||
3451 | int slice = ss - mgp->ss; | ||
3452 | |||
3453 | if (ss->tx.req != ss->tx.done && | ||
3454 | ss->tx.done == ss->watchdog_tx_done && | ||
3455 | ss->watchdog_tx_req != ss->watchdog_tx_done) { | ||
3456 | /* nic seems like it might be stuck.. */ | ||
3457 | if (rx_pause_cnt != mgp->watchdog_pause) { | ||
3458 | if (net_ratelimit()) | ||
3459 | netdev_warn(mgp->dev, "slice %d: TX paused, " | ||
3460 | "check link partner\n", slice); | ||
3461 | } else { | ||
3462 | netdev_warn(mgp->dev, | ||
3463 | "slice %d: TX stuck %d %d %d %d %d %d\n", | ||
3464 | slice, ss->tx.queue_active, ss->tx.req, | ||
3465 | ss->tx.done, ss->tx.pkt_start, | ||
3466 | ss->tx.pkt_done, | ||
3467 | (int)ntohl(mgp->ss[slice].fw_stats-> | ||
3468 | send_done_count)); | ||
3469 | *reset_needed = 1; | ||
3470 | ss->stuck = 1; | ||
3471 | } | ||
3472 | } | ||
3473 | if (ss->watchdog_tx_done != ss->tx.done || | ||
3474 | ss->watchdog_rx_done != ss->rx_done.cnt) { | ||
3475 | *busy_slice_cnt += 1; | ||
3476 | } | ||
3477 | ss->watchdog_tx_done = ss->tx.done; | ||
3478 | ss->watchdog_tx_req = ss->tx.req; | ||
3479 | ss->watchdog_rx_done = ss->rx_done.cnt; | ||
3480 | } | ||
3481 | |||
3445 | /* | 3482 | /* |
3446 | * This watchdog is used to check whether the board has suffered | 3483 | * This watchdog is used to check whether the board has suffered |
3447 | * from a parity error and needs to be recovered. | 3484 | * from a parity error and needs to be recovered. |
@@ -3450,10 +3487,12 @@ static void myri10ge_watchdog(struct work_struct *work) | |||
3450 | { | 3487 | { |
3451 | struct myri10ge_priv *mgp = | 3488 | struct myri10ge_priv *mgp = |
3452 | container_of(work, struct myri10ge_priv, watchdog_work); | 3489 | container_of(work, struct myri10ge_priv, watchdog_work); |
3453 | struct myri10ge_tx_buf *tx; | 3490 | struct myri10ge_slice_state *ss; |
3454 | u32 reboot; | 3491 | u32 reboot, rx_pause_cnt; |
3455 | int status, rebooted; | 3492 | int status, rebooted; |
3456 | int i; | 3493 | int i; |
3494 | int reset_needed = 0; | ||
3495 | int busy_slice_cnt = 0; | ||
3457 | u16 cmd, vendor; | 3496 | u16 cmd, vendor; |
3458 | 3497 | ||
3459 | mgp->watchdog_resets++; | 3498 | mgp->watchdog_resets++; |
@@ -3465,8 +3504,7 @@ static void myri10ge_watchdog(struct work_struct *work) | |||
3465 | * For now, just report it */ | 3504 | * For now, just report it */ |
3466 | reboot = myri10ge_read_reboot(mgp); | 3505 | reboot = myri10ge_read_reboot(mgp); |
3467 | netdev_err(mgp->dev, "NIC rebooted (0x%x),%s resetting\n", | 3506 | netdev_err(mgp->dev, "NIC rebooted (0x%x),%s resetting\n", |
3468 | reboot, | 3507 | reboot, myri10ge_reset_recover ? "" : " not"); |
3469 | myri10ge_reset_recover ? "" : " not"); | ||
3470 | if (myri10ge_reset_recover == 0) | 3508 | if (myri10ge_reset_recover == 0) |
3471 | return; | 3509 | return; |
3472 | rtnl_lock(); | 3510 | rtnl_lock(); |
@@ -3498,23 +3536,24 @@ static void myri10ge_watchdog(struct work_struct *work) | |||
3498 | return; | 3536 | return; |
3499 | } | 3537 | } |
3500 | } | 3538 | } |
3501 | /* Perhaps it is a software error. Try to reset */ | 3539 | /* Perhaps it is a software error. See if stuck slice |
3502 | 3540 | * has recovered, reset if not */ | |
3503 | netdev_err(mgp->dev, "device timeout, resetting\n"); | 3541 | rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause); |
3504 | for (i = 0; i < mgp->num_slices; i++) { | 3542 | for (i = 0; i < mgp->num_slices; i++) { |
3505 | tx = &mgp->ss[i].tx; | 3543 | ss = mgp->ss; |
3506 | netdev_err(mgp->dev, "(%d): %d %d %d %d %d %d\n", | 3544 | if (ss->stuck) { |
3507 | i, tx->queue_active, tx->req, | 3545 | myri10ge_check_slice(ss, &reset_needed, |
3508 | tx->done, tx->pkt_start, tx->pkt_done, | 3546 | &busy_slice_cnt, |
3509 | (int)ntohl(mgp->ss[i].fw_stats-> | 3547 | rx_pause_cnt); |
3510 | send_done_count)); | 3548 | ss->stuck = 0; |
3511 | msleep(2000); | 3549 | } |
3512 | netdev_info(mgp->dev, "(%d): %d %d %d %d %d %d\n", | ||
3513 | i, tx->queue_active, tx->req, | ||
3514 | tx->done, tx->pkt_start, tx->pkt_done, | ||
3515 | (int)ntohl(mgp->ss[i].fw_stats-> | ||
3516 | send_done_count)); | ||
3517 | } | 3550 | } |
3551 | if (!reset_needed) { | ||
3552 | netdev_dbg(mgp->dev, "not resetting\n"); | ||
3553 | return; | ||
3554 | } | ||
3555 | |||
3556 | netdev_err(mgp->dev, "device timeout, resetting\n"); | ||
3518 | } | 3557 | } |
3519 | 3558 | ||
3520 | if (!rebooted) { | 3559 | if (!rebooted) { |
@@ -3567,27 +3606,8 @@ static void myri10ge_watchdog_timer(unsigned long arg) | |||
3567 | myri10ge_fill_thresh) | 3606 | myri10ge_fill_thresh) |
3568 | ss->rx_big.watchdog_needed = 0; | 3607 | ss->rx_big.watchdog_needed = 0; |
3569 | } | 3608 | } |
3570 | 3609 | myri10ge_check_slice(ss, &reset_needed, &busy_slice_cnt, | |
3571 | if (ss->tx.req != ss->tx.done && | 3610 | rx_pause_cnt); |
3572 | ss->tx.done == ss->watchdog_tx_done && | ||
3573 | ss->watchdog_tx_req != ss->watchdog_tx_done) { | ||
3574 | /* nic seems like it might be stuck.. */ | ||
3575 | if (rx_pause_cnt != mgp->watchdog_pause) { | ||
3576 | if (net_ratelimit()) | ||
3577 | netdev_err(mgp->dev, "slice %d: TX paused, check link partner\n", | ||
3578 | i); | ||
3579 | } else { | ||
3580 | netdev_warn(mgp->dev, "slice %d stuck:", i); | ||
3581 | reset_needed = 1; | ||
3582 | } | ||
3583 | } | ||
3584 | if (ss->watchdog_tx_done != ss->tx.done || | ||
3585 | ss->watchdog_rx_done != ss->rx_done.cnt) { | ||
3586 | busy_slice_cnt++; | ||
3587 | } | ||
3588 | ss->watchdog_tx_done = ss->tx.done; | ||
3589 | ss->watchdog_tx_req = ss->tx.req; | ||
3590 | ss->watchdog_rx_done = ss->rx_done.cnt; | ||
3591 | } | 3611 | } |
3592 | /* if we've sent or received no traffic, poll the NIC to | 3612 | /* if we've sent or received no traffic, poll the NIC to |
3593 | * ensure it is still there. Otherwise, we risk not noticing | 3613 | * ensure it is still there. Otherwise, we risk not noticing |