aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Mohamad Haj Yahia <mohamad@mellanox.com> 2016-10-25 11:36:33 -0400
committer David S. Miller <davem@davemloft.net> 2016-10-29 12:00:39 -0400
commit 05ac2c0b7438ea08c5d54b48797acf9b22cb2f6f (patch)
tree e25e0ebfe1a447c0de7a32fea33212457debcb20
parent 2241007b3d783cbdbaa78c30bdb1994278b6f9b9 (diff)
net/mlx5: Fix race between PCI error handlers and health work
Currently there is a race between the health care work and the kernel PCI error handlers: both of them detect the error, and whichever is called first does the error handling. There is a chance that the health care work will disable the PCI device after the PCI slot has been resumed.

Also create a separate WQ, because now we will have two types of health works: one for the error detection and one for the recovery.

Fixes: 89d44f0a6c73 ('net/mlx5_core: Add pci error handlers to mlx5_core driver')
Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/health.c | 30
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/main.c | 9
-rw-r--r-- include/linux/mlx5/driver.h | 4
3 files changed, 38 insertions, 5 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 2cb4094c9c49..2d00022c92d8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -64,6 +64,10 @@ enum {
64 MLX5_NIC_IFC_NO_DRAM_NIC = 2 64 MLX5_NIC_IFC_NO_DRAM_NIC = 2
65}; 65};
66 66
67enum {
68 MLX5_DROP_NEW_HEALTH_WORK,
69};
70
67static u8 get_nic_interface(struct mlx5_core_dev *dev) 71static u8 get_nic_interface(struct mlx5_core_dev *dev)
68{ 72{
69 return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; 73 return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
@@ -272,7 +276,13 @@ static void poll_health(unsigned long data)
272 if (in_fatal(dev) && !health->sick) { 276 if (in_fatal(dev) && !health->sick) {
273 health->sick = true; 277 health->sick = true;
274 print_health_info(dev); 278 print_health_info(dev);
275 schedule_work(&health->work); 279 spin_lock(&health->wq_lock);
280 if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
281 queue_work(health->wq, &health->work);
282 else
283 dev_err(&dev->pdev->dev,
284 "new health works are not permitted at this stage\n");
285 spin_unlock(&health->wq_lock);
276 } 286 }
277} 287}
278 288
@@ -282,6 +292,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
282 292
283 init_timer(&health->timer); 293 init_timer(&health->timer);
284 health->sick = 0; 294 health->sick = 0;
295 clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
285 health->health = &dev->iseg->health; 296 health->health = &dev->iseg->health;
286 health->health_counter = &dev->iseg->health_counter; 297 health->health_counter = &dev->iseg->health_counter;
287 298
@@ -298,11 +309,21 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
298 del_timer_sync(&health->timer); 309 del_timer_sync(&health->timer);
299} 310}
300 311
312void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
313{
314 struct mlx5_core_health *health = &dev->priv.health;
315
316 spin_lock(&health->wq_lock);
317 set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
318 spin_unlock(&health->wq_lock);
319 cancel_work_sync(&health->work);
320}
321
301void mlx5_health_cleanup(struct mlx5_core_dev *dev) 322void mlx5_health_cleanup(struct mlx5_core_dev *dev)
302{ 323{
303 struct mlx5_core_health *health = &dev->priv.health; 324 struct mlx5_core_health *health = &dev->priv.health;
304 325
305 flush_work(&health->work); 326 destroy_workqueue(health->wq);
306} 327}
307 328
308int mlx5_health_init(struct mlx5_core_dev *dev) 329int mlx5_health_init(struct mlx5_core_dev *dev)
@@ -317,8 +338,11 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
317 338
318 strcpy(name, "mlx5_health"); 339 strcpy(name, "mlx5_health");
319 strcat(name, dev_name(&dev->pdev->dev)); 340 strcat(name, dev_name(&dev->pdev->dev));
341 health->wq = create_singlethread_workqueue(name);
320 kfree(name); 342 kfree(name);
321 343 if (!health->wq)
344 return -ENOMEM;
345 spin_lock_init(&health->wq_lock);
322 INIT_WORK(&health->work, health_care); 346 INIT_WORK(&health->work, health_care);
323 347
324 return 0; 348 return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 8a63910bfccf..9f90226bc120 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1315,8 +1315,13 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
1315 dev_info(&pdev->dev, "%s was called\n", __func__); 1315 dev_info(&pdev->dev, "%s was called\n", __func__);
1316 mlx5_enter_error_state(dev); 1316 mlx5_enter_error_state(dev);
1317 mlx5_unload_one(dev, priv, false); 1317 mlx5_unload_one(dev, priv, false);
1318 pci_save_state(pdev); 1318 /* In case of kernel call save the pci state and drain health wq */
1319 mlx5_pci_disable_device(dev); 1319 if (state) {
1320 pci_save_state(pdev);
1321 mlx5_drain_health_wq(dev);
1322 mlx5_pci_disable_device(dev);
1323 }
1324
1320 return state == pci_channel_io_perm_failure ? 1325 return state == pci_channel_io_perm_failure ?
1321 PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; 1326 PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
1322} 1327}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5dbda60a09f4..7d9a5d08eb59 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -418,7 +418,10 @@ struct mlx5_core_health {
418 u32 prev; 418 u32 prev;
419 int miss_counter; 419 int miss_counter;
420 bool sick; 420 bool sick;
421 /* wq spinlock to synchronize draining */
422 spinlock_t wq_lock;
421 struct workqueue_struct *wq; 423 struct workqueue_struct *wq;
424 unsigned long flags;
422 struct work_struct work; 425 struct work_struct work;
423}; 426};
424 427
@@ -778,6 +781,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev);
778int mlx5_health_init(struct mlx5_core_dev *dev); 781int mlx5_health_init(struct mlx5_core_dev *dev);
779void mlx5_start_health_poll(struct mlx5_core_dev *dev); 782void mlx5_start_health_poll(struct mlx5_core_dev *dev);
780void mlx5_stop_health_poll(struct mlx5_core_dev *dev); 783void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
784void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
781int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, 785int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
782 struct mlx5_buf *buf, int node); 786 struct mlx5_buf *buf, int node);
783int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf); 787int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf);