aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/net/ethernet/mellanox
diff options
context:
space:
mode:
author Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com> 2012-07-20 05:55:43 -0400
committer David S. Miller <davem@davemloft.net> 2012-07-25 18:24:13 -0400
commit 57dbf29a54bda5773f9ed1d00e3cc633294259da (patch)
tree 311f1973354a10fa997db812299982b560dc821f /drivers/net/ethernet/mellanox
parent f94898ea6682977f15c5a8f9ffb293a14f95455a (diff)
mlx4: Add support for EEH error recovery
Currently the mlx4 drivers don't have the necessary callbacks to implement EEH error detection and recovery, so the PCI layer uses the probe and remove callbacks to try to recover the device after an error on the bus. However, these callbacks have race conditions with the internal catastrophic error recovery functions, which will also detect the error, and this can cause the system to crash if both EEH and catas functions try to reset the device.

This patch adds the necessary error recovery callbacks and makes sure that the internal catastrophic error functions will not try to reset the device in such scenarios. It also adds some calls to pci_channel_offline() to suppress reads/writes on the bus when the slot cannot accept I/O operations, so we prevent unnecessary accesses to the bus and speed up the device removal.

Signed-off-by: Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com>
Acked-by: Shlomo Pongratz <shlomop@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/ethernet/mellanox')
-rw-r--r-- drivers/net/ethernet/mellanox/mlx4/catas.c | 25
-rw-r--r-- drivers/net/ethernet/mellanox/mlx4/cmd.c | 49
-rw-r--r-- drivers/net/ethernet/mellanox/mlx4/main.c | 30
3 files changed, 93 insertions, 11 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c
index 915e947b422d..9c656fe4983d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/catas.c
+++ b/drivers/net/ethernet/mellanox/mlx4/catas.c
@@ -69,16 +69,21 @@ static void poll_catas(unsigned long dev_ptr)
69 struct mlx4_priv *priv = mlx4_priv(dev); 69 struct mlx4_priv *priv = mlx4_priv(dev);
70 70
71 if (readl(priv->catas_err.map)) { 71 if (readl(priv->catas_err.map)) {
72 dump_err_buf(dev); 72 /* If the device is off-line, we cannot try to recover it */
73 73 if (pci_channel_offline(dev->pdev))
74 mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); 74 mod_timer(&priv->catas_err.timer,
75 round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
76 else {
77 dump_err_buf(dev);
78 mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
75 79
76 if (internal_err_reset) { 80 if (internal_err_reset) {
77 spin_lock(&catas_lock); 81 spin_lock(&catas_lock);
78 list_add(&priv->catas_err.list, &catas_list); 82 list_add(&priv->catas_err.list, &catas_list);
79 spin_unlock(&catas_lock); 83 spin_unlock(&catas_lock);
80 84
81 queue_work(mlx4_wq, &catas_work); 85 queue_work(mlx4_wq, &catas_work);
86 }
82 } 87 }
83 } else 88 } else
84 mod_timer(&priv->catas_err.timer, 89 mod_timer(&priv->catas_err.timer,
@@ -100,6 +105,10 @@ static void catas_reset(struct work_struct *work)
100 list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) { 105 list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) {
101 struct pci_dev *pdev = priv->dev.pdev; 106 struct pci_dev *pdev = priv->dev.pdev;
102 107
108 /* If the device is off-line, we cannot reset it */
109 if (pci_channel_offline(pdev))
110 continue;
111
103 ret = mlx4_restart_one(priv->dev.pdev); 112 ret = mlx4_restart_one(priv->dev.pdev);
104 /* 'priv' now is not valid */ 113 /* 'priv' now is not valid */
105 if (ret) 114 if (ret)
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 7e94987d030c..c8fef4353021 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -296,7 +296,12 @@ int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param,
296 296
297static int cmd_pending(struct mlx4_dev *dev) 297static int cmd_pending(struct mlx4_dev *dev)
298{ 298{
299 u32 status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET); 299 u32 status;
300
301 if (pci_channel_offline(dev->pdev))
302 return -EIO;
303
304 status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET);
300 305
301 return (status & swab32(1 << HCR_GO_BIT)) || 306 return (status & swab32(1 << HCR_GO_BIT)) ||
302 (mlx4_priv(dev)->cmd.toggle == 307 (mlx4_priv(dev)->cmd.toggle ==
@@ -314,11 +319,29 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param,
314 319
315 mutex_lock(&cmd->hcr_mutex); 320 mutex_lock(&cmd->hcr_mutex);
316 321
322 if (pci_channel_offline(dev->pdev)) {
323 /*
324 * Device is going through error recovery
325 * and cannot accept commands.
326 */
327 ret = -EIO;
328 goto out;
329 }
330
317 end = jiffies; 331 end = jiffies;
318 if (event) 332 if (event)
319 end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS); 333 end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS);
320 334
321 while (cmd_pending(dev)) { 335 while (cmd_pending(dev)) {
336 if (pci_channel_offline(dev->pdev)) {
337 /*
338 * Device is going through error recovery
339 * and cannot accept commands.
340 */
341 ret = -EIO;
342 goto out;
343 }
344
322 if (time_after_eq(jiffies, end)) { 345 if (time_after_eq(jiffies, end)) {
323 mlx4_err(dev, "%s:cmd_pending failed\n", __func__); 346 mlx4_err(dev, "%s:cmd_pending failed\n", __func__);
324 goto out; 347 goto out;
@@ -431,14 +454,33 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
431 454
432 down(&priv->cmd.poll_sem); 455 down(&priv->cmd.poll_sem);
433 456
457 if (pci_channel_offline(dev->pdev)) {
458 /*
459 * Device is going through error recovery
460 * and cannot accept commands.
461 */
462 err = -EIO;
463 goto out;
464 }
465
434 err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, 466 err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
435 in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0); 467 in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0);
436 if (err) 468 if (err)
437 goto out; 469 goto out;
438 470
439 end = msecs_to_jiffies(timeout) + jiffies; 471 end = msecs_to_jiffies(timeout) + jiffies;
440 while (cmd_pending(dev) && time_before(jiffies, end)) 472 while (cmd_pending(dev) && time_before(jiffies, end)) {
473 if (pci_channel_offline(dev->pdev)) {
474 /*
475 * Device is going through error recovery
476 * and cannot accept commands.
477 */
478 err = -EIO;
479 goto out;
480 }
481
441 cond_resched(); 482 cond_resched();
483 }
442 484
443 if (cmd_pending(dev)) { 485 if (cmd_pending(dev)) {
444 err = -ETIMEDOUT; 486 err = -ETIMEDOUT;
@@ -532,6 +574,9 @@ int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
532 int out_is_imm, u32 in_modifier, u8 op_modifier, 574 int out_is_imm, u32 in_modifier, u8 op_modifier,
533 u16 op, unsigned long timeout, int native) 575 u16 op, unsigned long timeout, int native)
534{ 576{
577 if (pci_channel_offline(dev->pdev))
578 return -EIO;
579
535 if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) { 580 if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) {
536 if (mlx4_priv(dev)->cmd.use_events) 581 if (mlx4_priv(dev)->cmd.use_events)
537 return mlx4_cmd_wait(dev, in_param, out_param, 582 return mlx4_cmd_wait(dev, in_param, out_param,
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 42645166bae2..e717091734d0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -1775,6 +1775,9 @@ static int mlx4_get_ownership(struct mlx4_dev *dev)
1775 void __iomem *owner; 1775 void __iomem *owner;
1776 u32 ret; 1776 u32 ret;
1777 1777
1778 if (pci_channel_offline(dev->pdev))
1779 return -EIO;
1780
1778 owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, 1781 owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE,
1779 MLX4_OWNER_SIZE); 1782 MLX4_OWNER_SIZE);
1780 if (!owner) { 1783 if (!owner) {
@@ -1791,6 +1794,9 @@ static void mlx4_free_ownership(struct mlx4_dev *dev)
1791{ 1794{
1792 void __iomem *owner; 1795 void __iomem *owner;
1793 1796
1797 if (pci_channel_offline(dev->pdev))
1798 return;
1799
1794 owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, 1800 owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE,
1795 MLX4_OWNER_SIZE); 1801 MLX4_OWNER_SIZE);
1796 if (!owner) { 1802 if (!owner) {
@@ -2237,11 +2243,33 @@ static DEFINE_PCI_DEVICE_TABLE(mlx4_pci_table) = {
2237 2243
2238MODULE_DEVICE_TABLE(pci, mlx4_pci_table); 2244MODULE_DEVICE_TABLE(pci, mlx4_pci_table);
2239 2245
2246static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev,
2247 pci_channel_state_t state)
2248{
2249 mlx4_remove_one(pdev);
2250
2251 return state == pci_channel_io_perm_failure ?
2252 PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
2253}
2254
2255static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev)
2256{
2257 int ret = __mlx4_init_one(pdev, NULL);
2258
2259 return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
2260}
2261
2262static struct pci_error_handlers mlx4_err_handler = {
2263 .error_detected = mlx4_pci_err_detected,
2264 .slot_reset = mlx4_pci_slot_reset,
2265};
2266
2240static struct pci_driver mlx4_driver = { 2267static struct pci_driver mlx4_driver = {
2241 .name = DRV_NAME, 2268 .name = DRV_NAME,
2242 .id_table = mlx4_pci_table, 2269 .id_table = mlx4_pci_table,
2243 .probe = mlx4_init_one, 2270 .probe = mlx4_init_one,
2244 .remove = __devexit_p(mlx4_remove_one) 2271 .remove = __devexit_p(mlx4_remove_one),
2272 .err_handler = &mlx4_err_handler,
2245}; 2273};
2246 2274
2247static int __init mlx4_verify_params(void) 2275static int __init mlx4_verify_params(void)