diff options
author | Jack Morgenstein <jackm@dev.mellanox.co.il> | 2009-09-05 23:24:50 -0400 |
---|---|---|
committer | Roland Dreier <rolandd@cisco.com> | 2009-09-05 23:24:50 -0400 |
commit | 3b4a8cd51e59c1c342c51b241bbb96c6ac24a147 (patch) | |
tree | f185d61b515a21e93159c2c6d50efd8ebf2ac7c7 | |
parent | f5f5951c7494b6ae89ec53ca7ca6b0177ebd1308 (diff) |
IB/mlx4: Don't allow userspace open while recovering from catastrophic error
Userspace apps are supposed to release all ib device resources if they
receive a fatal async event (IBV_EVENT_DEVICE_FATAL). However, the
app has no way of knowing when the device has come back up, except to
repeatedly attempt ibv_open_device() until it succeeds.
However, currently there is no protection against the open succeeding
while the device is in being removed following the fatal event. In
this case, the open will succeed, but as a result the device waits in
the middle of its removal until the new app releases its resources --
and the new app will not do so, since the open succeeded at a point
following the fatal event generation.
This patch adds an "active" flag to the device. The active flag is set
to false (in the fatal event flow) before the "fatal" event is
generated, so any subsequent ibv_dev_open() call to the device will
fail until the device comes back up, thus preventing the above
deadlock.
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
-rw-r--r-- | drivers/infiniband/hw/mlx4/main.c | 6 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx4/mlx4_ib.h | 1 |
2 files changed, 7 insertions, 0 deletions
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ae3d7590346e..313ce7fb2735 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c | |||
@@ -342,6 +342,9 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, | |||
342 | struct mlx4_ib_alloc_ucontext_resp resp; | 342 | struct mlx4_ib_alloc_ucontext_resp resp; |
343 | int err; | 343 | int err; |
344 | 344 | ||
345 | if (!dev->ib_active) | ||
346 | return ERR_PTR(-EAGAIN); | ||
347 | |||
345 | resp.qp_tab_size = dev->dev->caps.num_qps; | 348 | resp.qp_tab_size = dev->dev->caps.num_qps; |
346 | resp.bf_reg_size = dev->dev->caps.bf_reg_size; | 349 | resp.bf_reg_size = dev->dev->caps.bf_reg_size; |
347 | resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; | 350 | resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; |
@@ -673,6 +676,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) | |||
673 | goto err_reg; | 676 | goto err_reg; |
674 | } | 677 | } |
675 | 678 | ||
679 | ibdev->ib_active = true; | ||
680 | |||
676 | return ibdev; | 681 | return ibdev; |
677 | 682 | ||
678 | err_reg: | 683 | err_reg: |
@@ -729,6 +734,7 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, | |||
729 | break; | 734 | break; |
730 | 735 | ||
731 | case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: | 736 | case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: |
737 | ibdev->ib_active = false; | ||
732 | ibev.event = IB_EVENT_DEVICE_FATAL; | 738 | ibev.event = IB_EVENT_DEVICE_FATAL; |
733 | break; | 739 | break; |
734 | 740 | ||
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 8a7dd6795fa0..3486d7675e56 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h | |||
@@ -175,6 +175,7 @@ struct mlx4_ib_dev { | |||
175 | spinlock_t sm_lock; | 175 | spinlock_t sm_lock; |
176 | 176 | ||
177 | struct mutex cap_mask_mutex; | 177 | struct mutex cap_mask_mutex; |
178 | bool ib_active; | ||
178 | }; | 179 | }; |
179 | 180 | ||
180 | static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) | 181 | static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) |