diff options
author | Brice Goglin <brice@myri.com> | 2009-08-07 06:44:22 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2009-08-13 00:54:59 -0400 |
commit | d02342151c51344034fbdeceff8effcb0a77c573 (patch) | |
tree | 62d215202b8afe9e04f857d60185d40aa1a6c229 | |
parent | c9145a2df072f75d97592ddac1624baeb7bad195 (diff) |
myri10ge: improve parity error detection and recovery
Improve myri10ge parity error detection and recovery:
1) Don't restore PCI config space to a rebooted NIC until AFTER the
host is quiescent.
2) Let myri10ge_close() know the NIC is dead, so it won't waste time
waiting for a dead NIC to respond to MXGEFW_CMD_ETHERNET_DOWN.
3) When the NIC is quiet (link down, or otherwise idle link), use
a PCI config space read to detect a rebooted NIC. Otherwise
we might never notice that a NIC rebooted.
Signed-off-by: Andrew Gallatin <gallatin@myri.com>
Signed-off-by: Brice Goglin <brice@myri.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | drivers/net/myri10ge/myri10ge.c | 63 |
1 files changed, 46 insertions, 17 deletions
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index 1a34f7e11d98..75deef35b1e0 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c | |||
@@ -75,7 +75,7 @@ | |||
75 | #include "myri10ge_mcp.h" | 75 | #include "myri10ge_mcp.h" |
76 | #include "myri10ge_mcp_gen_header.h" | 76 | #include "myri10ge_mcp_gen_header.h" |
77 | 77 | ||
78 | #define MYRI10GE_VERSION_STR "1.5.0-1.418" | 78 | #define MYRI10GE_VERSION_STR "1.5.0-1.432" |
79 | 79 | ||
80 | MODULE_DESCRIPTION("Myricom 10G driver (10GbE)"); | 80 | MODULE_DESCRIPTION("Myricom 10G driver (10GbE)"); |
81 | MODULE_AUTHOR("Maintainer: help@myri.com"); | 81 | MODULE_AUTHOR("Maintainer: help@myri.com"); |
@@ -188,6 +188,7 @@ struct myri10ge_slice_state { | |||
188 | dma_addr_t fw_stats_bus; | 188 | dma_addr_t fw_stats_bus; |
189 | int watchdog_tx_done; | 189 | int watchdog_tx_done; |
190 | int watchdog_tx_req; | 190 | int watchdog_tx_req; |
191 | int watchdog_rx_done; | ||
191 | #ifdef CONFIG_MYRI10GE_DCA | 192 | #ifdef CONFIG_MYRI10GE_DCA |
192 | int cached_dca_tag; | 193 | int cached_dca_tag; |
193 | int cpu; | 194 | int cpu; |
@@ -256,6 +257,7 @@ struct myri10ge_priv { | |||
256 | u32 link_changes; | 257 | u32 link_changes; |
257 | u32 msg_enable; | 258 | u32 msg_enable; |
258 | unsigned int board_number; | 259 | unsigned int board_number; |
260 | int rebooted; | ||
259 | }; | 261 | }; |
260 | 262 | ||
261 | static char *myri10ge_fw_unaligned = "myri10ge_ethp_z8e.dat"; | 263 | static char *myri10ge_fw_unaligned = "myri10ge_ethp_z8e.dat"; |
@@ -2552,17 +2554,22 @@ static int myri10ge_close(struct net_device *dev) | |||
2552 | netif_carrier_off(dev); | 2554 | netif_carrier_off(dev); |
2553 | 2555 | ||
2554 | netif_tx_stop_all_queues(dev); | 2556 | netif_tx_stop_all_queues(dev); |
2555 | old_down_cnt = mgp->down_cnt; | 2557 | if (mgp->rebooted == 0) { |
2556 | mb(); | 2558 | old_down_cnt = mgp->down_cnt; |
2557 | status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0); | 2559 | mb(); |
2558 | if (status) | 2560 | status = |
2559 | printk(KERN_ERR "myri10ge: %s: Couldn't bring down link\n", | 2561 | myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0); |
2560 | dev->name); | 2562 | if (status) |
2561 | 2563 | printk(KERN_ERR | |
2562 | wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt, HZ); | 2564 | "myri10ge: %s: Couldn't bring down link\n", |
2563 | if (old_down_cnt == mgp->down_cnt) | 2565 | dev->name); |
2564 | printk(KERN_ERR "myri10ge: %s never got down irq\n", dev->name); | ||
2565 | 2566 | ||
2567 | wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt, | ||
2568 | HZ); | ||
2569 | if (old_down_cnt == mgp->down_cnt) | ||
2570 | printk(KERN_ERR "myri10ge: %s never got down irq\n", | ||
2571 | dev->name); | ||
2572 | } | ||
2566 | netif_tx_disable(dev); | 2573 | netif_tx_disable(dev); |
2567 | myri10ge_free_irq(mgp); | 2574 | myri10ge_free_irq(mgp); |
2568 | for (i = 0; i < mgp->num_slices; i++) | 2575 | for (i = 0; i < mgp->num_slices; i++) |
@@ -3427,12 +3434,13 @@ static void myri10ge_watchdog(struct work_struct *work) | |||
3427 | container_of(work, struct myri10ge_priv, watchdog_work); | 3434 | container_of(work, struct myri10ge_priv, watchdog_work); |
3428 | struct myri10ge_tx_buf *tx; | 3435 | struct myri10ge_tx_buf *tx; |
3429 | u32 reboot; | 3436 | u32 reboot; |
3430 | int status; | 3437 | int status, rebooted; |
3431 | int i; | 3438 | int i; |
3432 | u16 cmd, vendor; | 3439 | u16 cmd, vendor; |
3433 | 3440 | ||
3434 | mgp->watchdog_resets++; | 3441 | mgp->watchdog_resets++; |
3435 | pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd); | 3442 | pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd); |
3443 | rebooted = 0; | ||
3436 | if ((cmd & PCI_COMMAND_MASTER) == 0) { | 3444 | if ((cmd & PCI_COMMAND_MASTER) == 0) { |
3437 | /* Bus master DMA disabled? Check to see | 3445 | /* Bus master DMA disabled? Check to see |
3438 | * if the card rebooted due to a parity error | 3446 | * if the card rebooted due to a parity error |
@@ -3444,9 +3452,12 @@ static void myri10ge_watchdog(struct work_struct *work) | |||
3444 | myri10ge_reset_recover ? " " : " not"); | 3452 | myri10ge_reset_recover ? " " : " not"); |
3445 | if (myri10ge_reset_recover == 0) | 3453 | if (myri10ge_reset_recover == 0) |
3446 | return; | 3454 | return; |
3447 | 3455 | rtnl_lock(); | |
3456 | mgp->rebooted = 1; | ||
3457 | rebooted = 1; | ||
3458 | myri10ge_close(mgp->dev); | ||
3448 | myri10ge_reset_recover--; | 3459 | myri10ge_reset_recover--; |
3449 | 3460 | mgp->rebooted = 0; | |
3450 | /* | 3461 | /* |
3451 | * A rebooted nic will come back with config space as | 3462 | * A rebooted nic will come back with config space as |
3452 | * it was after power was applied to PCIe bus. | 3463 | * it was after power was applied to PCIe bus. |
@@ -3494,8 +3505,10 @@ static void myri10ge_watchdog(struct work_struct *work) | |||
3494 | } | 3505 | } |
3495 | } | 3506 | } |
3496 | 3507 | ||
3497 | rtnl_lock(); | 3508 | if (!rebooted) { |
3498 | myri10ge_close(mgp->dev); | 3509 | rtnl_lock(); |
3510 | myri10ge_close(mgp->dev); | ||
3511 | } | ||
3499 | status = myri10ge_load_firmware(mgp, 1); | 3512 | status = myri10ge_load_firmware(mgp, 1); |
3500 | if (status != 0) | 3513 | if (status != 0) |
3501 | printk(KERN_ERR "myri10ge: %s: failed to load firmware\n", | 3514 | printk(KERN_ERR "myri10ge: %s: failed to load firmware\n", |
@@ -3516,12 +3529,14 @@ static void myri10ge_watchdog_timer(unsigned long arg) | |||
3516 | { | 3529 | { |
3517 | struct myri10ge_priv *mgp; | 3530 | struct myri10ge_priv *mgp; |
3518 | struct myri10ge_slice_state *ss; | 3531 | struct myri10ge_slice_state *ss; |
3519 | int i, reset_needed; | 3532 | int i, reset_needed, busy_slice_cnt; |
3520 | u32 rx_pause_cnt; | 3533 | u32 rx_pause_cnt; |
3534 | u16 cmd; | ||
3521 | 3535 | ||
3522 | mgp = (struct myri10ge_priv *)arg; | 3536 | mgp = (struct myri10ge_priv *)arg; |
3523 | 3537 | ||
3524 | rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause); | 3538 | rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause); |
3539 | busy_slice_cnt = 0; | ||
3525 | for (i = 0, reset_needed = 0; | 3540 | for (i = 0, reset_needed = 0; |
3526 | i < mgp->num_slices && reset_needed == 0; ++i) { | 3541 | i < mgp->num_slices && reset_needed == 0; ++i) { |
3527 | 3542 | ||
@@ -3559,8 +3574,22 @@ static void myri10ge_watchdog_timer(unsigned long arg) | |||
3559 | reset_needed = 1; | 3574 | reset_needed = 1; |
3560 | } | 3575 | } |
3561 | } | 3576 | } |
3577 | if (ss->watchdog_tx_done != ss->tx.done || | ||
3578 | ss->watchdog_rx_done != ss->rx_done.cnt) { | ||
3579 | busy_slice_cnt++; | ||
3580 | } | ||
3562 | ss->watchdog_tx_done = ss->tx.done; | 3581 | ss->watchdog_tx_done = ss->tx.done; |
3563 | ss->watchdog_tx_req = ss->tx.req; | 3582 | ss->watchdog_tx_req = ss->tx.req; |
3583 | ss->watchdog_rx_done = ss->rx_done.cnt; | ||
3584 | } | ||
3585 | /* if we've sent or received no traffic, poll the NIC to | ||
3586 | * ensure it is still there. Otherwise, we risk not noticing | ||
3587 | * an error in a timely fashion */ | ||
3588 | if (busy_slice_cnt == 0) { | ||
3589 | pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd); | ||
3590 | if ((cmd & PCI_COMMAND_MASTER) == 0) { | ||
3591 | reset_needed = 1; | ||
3592 | } | ||
3564 | } | 3593 | } |
3565 | mgp->watchdog_pause = rx_pause_cnt; | 3594 | mgp->watchdog_pause = rx_pause_cnt; |
3566 | 3595 | ||