aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Jack Morgenstein <jackm@dev.mellanox.co.il> 2009-09-05 23:24:50 -0400
committer: Roland Dreier <rolandd@cisco.com> 2009-09-05 23:24:50 -0400
commit: 3b4a8cd51e59c1c342c51b241bbb96c6ac24a147 (patch)
tree: f185d61b515a21e93159c2c6d50efd8ebf2ac7c7
parent: f5f5951c7494b6ae89ec53ca7ca6b0177ebd1308 (diff)
IB/mlx4: Don't allow userspace open while recovering from catastrophic error
Userspace apps are supposed to release all ib device resources if they receive a fatal async event (IBV_EVENT_DEVICE_FATAL). However, the app has no way of knowing when the device has come back up, except to repeatedly attempt ibv_open_device() until it succeeds. Currently, there is no protection against the open succeeding while the device is being removed following the fatal event. In this case, the open will succeed, but as a result the device waits in the middle of its removal until the new app releases its resources -- and the new app will not do so, since the open succeeded at a point following the fatal event generation. This patch adds an "active" flag to the device. The active flag is set to false (in the fatal event flow) before the "fatal" event is generated, so any subsequent ibv_open_device() call to the device will fail until the device comes back up, thus preventing the above deadlock. -- Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il> -- Signed-off-by: Roland Dreier <rolandd@cisco.com>
-rw-r--r-- drivers/infiniband/hw/mlx4/main.c | 6
-rw-r--r-- drivers/infiniband/hw/mlx4/mlx4_ib.h | 1
2 files changed, 7 insertions, 0 deletions
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index ae3d7590346e..313ce7fb2735 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -342,6 +342,9 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
342 struct mlx4_ib_alloc_ucontext_resp resp; 342 struct mlx4_ib_alloc_ucontext_resp resp;
343 int err; 343 int err;
344 344
345 if (!dev->ib_active)
346 return ERR_PTR(-EAGAIN);
347
345 resp.qp_tab_size = dev->dev->caps.num_qps; 348 resp.qp_tab_size = dev->dev->caps.num_qps;
346 resp.bf_reg_size = dev->dev->caps.bf_reg_size; 349 resp.bf_reg_size = dev->dev->caps.bf_reg_size;
347 resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; 350 resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
@@ -673,6 +676,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
673 goto err_reg; 676 goto err_reg;
674 } 677 }
675 678
679 ibdev->ib_active = true;
680
676 return ibdev; 681 return ibdev;
677 682
678err_reg: 683err_reg:
@@ -729,6 +734,7 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
729 break; 734 break;
730 735
731 case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: 736 case MLX4_DEV_EVENT_CATASTROPHIC_ERROR:
737 ibdev->ib_active = false;
732 ibev.event = IB_EVENT_DEVICE_FATAL; 738 ibev.event = IB_EVENT_DEVICE_FATAL;
733 break; 739 break;
734 740
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 8a7dd6795fa0..3486d7675e56 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -175,6 +175,7 @@ struct mlx4_ib_dev {
175 spinlock_t sm_lock; 175 spinlock_t sm_lock;
176 176
177 struct mutex cap_mask_mutex; 177 struct mutex cap_mask_mutex;
178 bool ib_active;
178}; 179};
179 180
180static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) 181static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)