aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Brice Goglin <brice@myri.com>, 2009-08-07 06:44:22 -0400
committer: David S. Miller <davem@davemloft.net>, 2009-08-13 00:54:59 -0400
commit: d02342151c51344034fbdeceff8effcb0a77c573 (patch)
tree: 62d215202b8afe9e04f857d60185d40aa1a6c229
parent: c9145a2df072f75d97592ddac1624baeb7bad195 (diff)
myri10ge: improve parity error detection and recovery
Improve myri10ge parity error detection and recovery: 1) Don't restore PCI config space to a rebooted NIC until AFTER the host is quiescent. 2) Let myri10ge_close() know the NIC is dead, so it won't waste time waiting for a dead NIC to respond to MXGEFW_CMD_ETHERNET_DOWN. 3) When the NIC is quiet (link down, or otherwise idle link), use a PCI config space read to detect a rebooted NIC. Otherwise we might never notice that a NIC rebooted. Signed-off-by: Andrew Gallatin <gallatin@myri.com> Signed-off-by: Brice Goglin <brice@myri.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- drivers/net/myri10ge/myri10ge.c 63
1 files changed, 46 insertions, 17 deletions
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index 1a34f7e11d98..75deef35b1e0 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -75,7 +75,7 @@
75#include "myri10ge_mcp.h" 75#include "myri10ge_mcp.h"
76#include "myri10ge_mcp_gen_header.h" 76#include "myri10ge_mcp_gen_header.h"
77 77
78#define MYRI10GE_VERSION_STR "1.5.0-1.418" 78#define MYRI10GE_VERSION_STR "1.5.0-1.432"
79 79
80MODULE_DESCRIPTION("Myricom 10G driver (10GbE)"); 80MODULE_DESCRIPTION("Myricom 10G driver (10GbE)");
81MODULE_AUTHOR("Maintainer: help@myri.com"); 81MODULE_AUTHOR("Maintainer: help@myri.com");
@@ -188,6 +188,7 @@ struct myri10ge_slice_state {
188 dma_addr_t fw_stats_bus; 188 dma_addr_t fw_stats_bus;
189 int watchdog_tx_done; 189 int watchdog_tx_done;
190 int watchdog_tx_req; 190 int watchdog_tx_req;
191 int watchdog_rx_done;
191#ifdef CONFIG_MYRI10GE_DCA 192#ifdef CONFIG_MYRI10GE_DCA
192 int cached_dca_tag; 193 int cached_dca_tag;
193 int cpu; 194 int cpu;
@@ -256,6 +257,7 @@ struct myri10ge_priv {
256 u32 link_changes; 257 u32 link_changes;
257 u32 msg_enable; 258 u32 msg_enable;
258 unsigned int board_number; 259 unsigned int board_number;
260 int rebooted;
259}; 261};
260 262
261static char *myri10ge_fw_unaligned = "myri10ge_ethp_z8e.dat"; 263static char *myri10ge_fw_unaligned = "myri10ge_ethp_z8e.dat";
@@ -2552,17 +2554,22 @@ static int myri10ge_close(struct net_device *dev)
2552 netif_carrier_off(dev); 2554 netif_carrier_off(dev);
2553 2555
2554 netif_tx_stop_all_queues(dev); 2556 netif_tx_stop_all_queues(dev);
2555 old_down_cnt = mgp->down_cnt; 2557 if (mgp->rebooted == 0) {
2556 mb(); 2558 old_down_cnt = mgp->down_cnt;
2557 status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0); 2559 mb();
2558 if (status) 2560 status =
2559 printk(KERN_ERR "myri10ge: %s: Couldn't bring down link\n", 2561 myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0);
2560 dev->name); 2562 if (status)
2561 2563 printk(KERN_ERR
2562 wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt, HZ); 2564 "myri10ge: %s: Couldn't bring down link\n",
2563 if (old_down_cnt == mgp->down_cnt) 2565 dev->name);
2564 printk(KERN_ERR "myri10ge: %s never got down irq\n", dev->name);
2565 2566
2567 wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt,
2568 HZ);
2569 if (old_down_cnt == mgp->down_cnt)
2570 printk(KERN_ERR "myri10ge: %s never got down irq\n",
2571 dev->name);
2572 }
2566 netif_tx_disable(dev); 2573 netif_tx_disable(dev);
2567 myri10ge_free_irq(mgp); 2574 myri10ge_free_irq(mgp);
2568 for (i = 0; i < mgp->num_slices; i++) 2575 for (i = 0; i < mgp->num_slices; i++)
@@ -3427,12 +3434,13 @@ static void myri10ge_watchdog(struct work_struct *work)
3427 container_of(work, struct myri10ge_priv, watchdog_work); 3434 container_of(work, struct myri10ge_priv, watchdog_work);
3428 struct myri10ge_tx_buf *tx; 3435 struct myri10ge_tx_buf *tx;
3429 u32 reboot; 3436 u32 reboot;
3430 int status; 3437 int status, rebooted;
3431 int i; 3438 int i;
3432 u16 cmd, vendor; 3439 u16 cmd, vendor;
3433 3440
3434 mgp->watchdog_resets++; 3441 mgp->watchdog_resets++;
3435 pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd); 3442 pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
3443 rebooted = 0;
3436 if ((cmd & PCI_COMMAND_MASTER) == 0) { 3444 if ((cmd & PCI_COMMAND_MASTER) == 0) {
3437 /* Bus master DMA disabled? Check to see 3445 /* Bus master DMA disabled? Check to see
3438 * if the card rebooted due to a parity error 3446 * if the card rebooted due to a parity error
@@ -3444,9 +3452,12 @@ static void myri10ge_watchdog(struct work_struct *work)
3444 myri10ge_reset_recover ? " " : " not"); 3452 myri10ge_reset_recover ? " " : " not");
3445 if (myri10ge_reset_recover == 0) 3453 if (myri10ge_reset_recover == 0)
3446 return; 3454 return;
3447 3455 rtnl_lock();
3456 mgp->rebooted = 1;
3457 rebooted = 1;
3458 myri10ge_close(mgp->dev);
3448 myri10ge_reset_recover--; 3459 myri10ge_reset_recover--;
3449 3460 mgp->rebooted = 0;
3450 /* 3461 /*
3451 * A rebooted nic will come back with config space as 3462 * A rebooted nic will come back with config space as
3452 * it was after power was applied to PCIe bus. 3463 * it was after power was applied to PCIe bus.
@@ -3494,8 +3505,10 @@ static void myri10ge_watchdog(struct work_struct *work)
3494 } 3505 }
3495 } 3506 }
3496 3507
3497 rtnl_lock(); 3508 if (!rebooted) {
3498 myri10ge_close(mgp->dev); 3509 rtnl_lock();
3510 myri10ge_close(mgp->dev);
3511 }
3499 status = myri10ge_load_firmware(mgp, 1); 3512 status = myri10ge_load_firmware(mgp, 1);
3500 if (status != 0) 3513 if (status != 0)
3501 printk(KERN_ERR "myri10ge: %s: failed to load firmware\n", 3514 printk(KERN_ERR "myri10ge: %s: failed to load firmware\n",
@@ -3516,12 +3529,14 @@ static void myri10ge_watchdog_timer(unsigned long arg)
3516{ 3529{
3517 struct myri10ge_priv *mgp; 3530 struct myri10ge_priv *mgp;
3518 struct myri10ge_slice_state *ss; 3531 struct myri10ge_slice_state *ss;
3519 int i, reset_needed; 3532 int i, reset_needed, busy_slice_cnt;
3520 u32 rx_pause_cnt; 3533 u32 rx_pause_cnt;
3534 u16 cmd;
3521 3535
3522 mgp = (struct myri10ge_priv *)arg; 3536 mgp = (struct myri10ge_priv *)arg;
3523 3537
3524 rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause); 3538 rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause);
3539 busy_slice_cnt = 0;
3525 for (i = 0, reset_needed = 0; 3540 for (i = 0, reset_needed = 0;
3526 i < mgp->num_slices && reset_needed == 0; ++i) { 3541 i < mgp->num_slices && reset_needed == 0; ++i) {
3527 3542
@@ -3559,8 +3574,22 @@ static void myri10ge_watchdog_timer(unsigned long arg)
3559 reset_needed = 1; 3574 reset_needed = 1;
3560 } 3575 }
3561 } 3576 }
3577 if (ss->watchdog_tx_done != ss->tx.done ||
3578 ss->watchdog_rx_done != ss->rx_done.cnt) {
3579 busy_slice_cnt++;
3580 }
3562 ss->watchdog_tx_done = ss->tx.done; 3581 ss->watchdog_tx_done = ss->tx.done;
3563 ss->watchdog_tx_req = ss->tx.req; 3582 ss->watchdog_tx_req = ss->tx.req;
3583 ss->watchdog_rx_done = ss->rx_done.cnt;
3584 }
3585 /* if we've sent or received no traffic, poll the NIC to
3586 * ensure it is still there. Otherwise, we risk not noticing
3587 * an error in a timely fashion */
3588 if (busy_slice_cnt == 0) {
3589 pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
3590 if ((cmd & PCI_COMMAND_MASTER) == 0) {
3591 reset_needed = 1;
3592 }
3564 } 3593 }
3565 mgp->watchdog_pause = rx_pause_cnt; 3594 mgp->watchdog_pause = rx_pause_cnt;
3566 3595