Diffstat (limited to 'net/rds/ib.c')
 -rw-r--r--  net/rds/ib.c | 194
 1 file changed, 154 insertions(+), 40 deletions(-)
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 8f2d6dd7700a..b12a3951167d 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -53,12 +53,71 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
 module_param(rds_ib_retry_count, int, 0444);
 MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
 
+/*
+ * we have a clumsy combination of RCU and a rwsem protecting this list
+ * because it is used both in the get_mr fast path and while blocking in
+ * the FMR flushing path.
+ */
+DECLARE_RWSEM(rds_ib_devices_lock);
 struct list_head rds_ib_devices;
 
 /* NOTE: if also grabbing ibdev lock, grab this first */
 DEFINE_SPINLOCK(ib_nodev_conns_lock);
 LIST_HEAD(ib_nodev_conns);
 
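
To make the division of labor in that comment concrete, here is a minimal sketch of how such a hybrid scheme is typically used; the functions example_dev_lookup(), example_flush_all() and example_blocking_flush() are hypothetical and not part of this patch. RCU covers the non-blocking get_mr-style lookup, the rwsem covers walkers that may sleep, and writers take the rwsem exclusively around the _rcu list operations.

/* Hypothetical fast-path lookup: no sleeping, so RCU is enough. */
static struct rds_ib_device *example_dev_lookup(struct ib_device *dev)
{
	struct rds_ib_device *rds_ibdev;

	rcu_read_lock();
	list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
		if (rds_ibdev->dev == dev) {
			atomic_inc(&rds_ibdev->refcount);
			rcu_read_unlock();
			return rds_ibdev;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/* Hypothetical blocking walk (e.g. an FMR flush): RCU cannot span a
 * sleeping region, so these readers take the rwsem shared instead. */
static void example_flush_all(void)
{
	struct rds_ib_device *rds_ibdev;

	down_read(&rds_ib_devices_lock);
	list_for_each_entry(rds_ibdev, &rds_ib_devices, list)
		example_blocking_flush(rds_ibdev);	/* may sleep */
	up_read(&rds_ib_devices_lock);
}
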
+void rds_ib_nodev_connect(void)
+{
+	struct rds_ib_connection *ic;
+
+	spin_lock(&ib_nodev_conns_lock);
+	list_for_each_entry(ic, &ib_nodev_conns, ib_node)
+		rds_conn_connect_if_down(ic->conn);
+	spin_unlock(&ib_nodev_conns_lock);
+}
+
+void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
+{
+	struct rds_ib_connection *ic;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rds_ibdev->spinlock, flags);
+	list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
+		rds_conn_drop(ic->conn);
+	spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
+}
+
+/*
+ * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
+ * from interrupt context so we push the freeing off into a work struct in krdsd.
+ */
+static void rds_ib_dev_free(struct work_struct *work)
+{
+	struct rds_ib_ipaddr *i_ipaddr, *i_next;
+	struct rds_ib_device *rds_ibdev = container_of(work,
+					struct rds_ib_device, free_work);
+
+	if (rds_ibdev->mr_pool)
+		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
+	if (rds_ibdev->mr)
+		ib_dereg_mr(rds_ibdev->mr);
+	if (rds_ibdev->pd)
+		ib_dealloc_pd(rds_ibdev->pd);
+
+	list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
+		list_del(&i_ipaddr->list);
+		kfree(i_ipaddr);
+	}
+
+	kfree(rds_ibdev);
+}
+
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
+{
+	BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
+	if (atomic_dec_and_test(&rds_ibdev->refcount))
+		queue_work(rds_wq, &rds_ibdev->free_work);
+}
+
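
The pair above is the standard refcount-plus-deferred-free idiom: the last put may arrive from interrupt context, so instead of freeing inline it queues the blocking teardown on krdsd. Reduced to its skeleton (the example_obj type and helpers are hypothetical; rds_wq is the real RDS workqueue):

struct example_obj {
	atomic_t refcount;
	struct work_struct free_work;
};

static void example_free_worker(struct work_struct *work)
{
	struct example_obj *obj = container_of(work, struct example_obj,
					       free_work);
	/* Blocking cleanup is safe here: we run in process context. */
	kfree(obj);
}

static void example_obj_put(struct example_obj *obj)
{
	/* Never blocks, so callers in IRQ context are fine. */
	if (atomic_dec_and_test(&obj->refcount))
		queue_work(rds_wq, &obj->free_work);
}
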
 void rds_ib_add_one(struct ib_device *device)
 {
 	struct rds_ib_device *rds_ibdev;
@@ -77,11 +136,14 @@ void rds_ib_add_one(struct ib_device *device)
 		goto free_attr;
 	}
 
-	rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
+	rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
+				 ibdev_to_node(device));
 	if (!rds_ibdev)
 		goto free_attr;
 
 	spin_lock_init(&rds_ibdev->spinlock);
+	atomic_set(&rds_ibdev->refcount, 1);
+	INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
 
 	rds_ibdev->max_wrs = dev_attr->max_qp_wr;
 	rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
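
Two properties of the new allocation matter later in the patch: kzalloc_node() zeroes the structure, so the NULL tests in rds_ib_dev_free() correctly skip whatever was never set up, and ibdev_to_node() places the allocation on the HCA's NUMA node. A contrived sketch of the same idiom (example_state and its field are hypothetical):

struct example_state {
	struct ib_device *dev;	/* hypothetical field */
};

static struct example_state *example_alloc(struct ib_device *device)
{
	struct example_state *st;

	/* Zeroed, and placed on the NUMA node closest to the HCA. */
	st = kzalloc_node(sizeof(*st), GFP_KERNEL, ibdev_to_node(device));
	if (!st)
		return NULL;
	st->dev = device;
	/* All other fields start zeroed; partial-teardown checks rely on it. */
	return st;
}
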
@@ -91,68 +153,107 @@ void rds_ib_add_one(struct ib_device *device)
 		min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
 		fmr_pool_size;
 
+	rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
+	rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
+
 	rds_ibdev->dev = device;
 	rds_ibdev->pd = ib_alloc_pd(device);
-	if (IS_ERR(rds_ibdev->pd))
-		goto free_dev;
+	if (IS_ERR(rds_ibdev->pd)) {
+		rds_ibdev->pd = NULL;
+		goto put_dev;
+	}
 
-	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
-				      IB_ACCESS_LOCAL_WRITE);
-	if (IS_ERR(rds_ibdev->mr))
-		goto err_pd;
+	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(rds_ibdev->mr)) {
+		rds_ibdev->mr = NULL;
+		goto put_dev;
+	}
 
 	rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
 	if (IS_ERR(rds_ibdev->mr_pool)) {
 		rds_ibdev->mr_pool = NULL;
-		goto err_mr;
+		goto put_dev;
 	}
 
 	INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
 	INIT_LIST_HEAD(&rds_ibdev->conn_list);
-	list_add_tail(&rds_ibdev->list, &rds_ib_devices);
+
+	down_write(&rds_ib_devices_lock);
+	list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
+	up_write(&rds_ib_devices_lock);
+	atomic_inc(&rds_ibdev->refcount);
 
 	ib_set_client_data(device, &rds_ib_client, rds_ibdev);
+	atomic_inc(&rds_ibdev->refcount);
 
-	goto free_attr;
+	rds_ib_nodev_connect();
 
-err_mr:
-	ib_dereg_mr(rds_ibdev->mr);
-err_pd:
-	ib_dealloc_pd(rds_ibdev->pd);
-free_dev:
-	kfree(rds_ibdev);
+put_dev:
+	rds_ib_dev_put(rds_ibdev);
 free_attr:
 	kfree(dev_attr);
 }
 
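
Worth noting: the three old error labels (err_mr, err_pd, free_dev) collapse into a single put_dev because teardown is now entirely refcount-driven. Each IS_ERR branch NULLs the field before jumping, so rds_ib_dev_free() releases exactly what exists, and even the success path falls through put_dev to drop the initial self-reference; the device list and client_data each took a reference of their own above. The shape of the pattern, reduced to a sketch (reusing the hypothetical example_obj from earlier, here extended with a pd field):

static void example_add_one(struct ib_device *device)
{
	struct example_obj *obj;

	obj = kzalloc(sizeof(*obj), GFP_KERNEL);	/* fields start NULL */
	if (!obj)
		return;
	atomic_set(&obj->refcount, 1);			/* self-reference */

	obj->pd = ib_alloc_pd(device);
	if (IS_ERR(obj->pd)) {
		obj->pd = NULL;		/* free path tests for NULL */
		goto put;
	}

	example_publish(obj);	/* takes references of its own */

put:
	example_obj_put(obj);	/* error: frees; success: drops self-ref */
}
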
+/*
+ * New connections use this to find the device to associate with the
+ * connection.  It's not in the fast path so we're not concerned about the
+ * performance of the IB call.  (As of this writing, it uses an interrupt
+ * blocking spinlock to serialize walking a per-device list of all registered
+ * clients.)
+ *
+ * RCU is used to handle incoming connections racing with device teardown.
+ * Rather than use a lock to serialize removal from the client_data and
+ * getting a new reference, we use an RCU grace period.  The destruction
+ * path removes the device from client_data and then waits for all RCU
+ * readers to finish.
+ *
+ * A new connection can get NULL from this if it's arriving on a
+ * device that is in the process of being removed.
+ */
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
+{
+	struct rds_ib_device *rds_ibdev;
+
+	rcu_read_lock();
+	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
+	if (rds_ibdev)
+		atomic_inc(&rds_ibdev->refcount);
+	rcu_read_unlock();
+	return rds_ibdev;
+}
+
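
A caller in the connection-setup path pairs this get with a put and must be prepared for NULL, which is exactly the race window the comment describes. A hypothetical caller (example_conn_setup is not in this patch):

static int example_conn_setup(struct ib_device *device)
{
	struct rds_ib_device *rds_ibdev;

	rds_ibdev = rds_ib_get_client_data(device);
	if (!rds_ibdev)
		return -ENODEV;		/* device is mid-removal; give up */

	/* ... associate the new connection with rds_ibdev ... */

	rds_ib_dev_put(rds_ibdev);	/* balances the get above */
	return 0;
}
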
+/*
+ * The IB stack is letting us know that a device is going away.  This can
+ * happen if the underlying HCA driver is removed or if PCI hotplug is removing
+ * the pci function, for example.
+ *
+ * This can be called at any time and can be racing with any other RDS path.
+ */
 void rds_ib_remove_one(struct ib_device *device)
 {
 	struct rds_ib_device *rds_ibdev;
-	struct rds_ib_ipaddr *i_ipaddr, *i_next;
 
 	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
 	if (!rds_ibdev)
 		return;
 
-	list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
-		list_del(&i_ipaddr->list);
-		kfree(i_ipaddr);
-	}
+	rds_ib_dev_shutdown(rds_ibdev);
 
-	rds_ib_destroy_conns(rds_ibdev);
+	/* stop connection attempts from getting a reference to this device. */
+	ib_set_client_data(device, &rds_ib_client, NULL);
 
-	if (rds_ibdev->mr_pool)
-		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
-
-	ib_dereg_mr(rds_ibdev->mr);
-
-	while (ib_dealloc_pd(rds_ibdev->pd)) {
-		rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
-		msleep(1);
-	}
+	down_write(&rds_ib_devices_lock);
+	list_del_rcu(&rds_ibdev->list);
+	up_write(&rds_ib_devices_lock);
 
-	list_del(&rds_ibdev->list);
-	kfree(rds_ibdev);
+	/*
+	 * This synchronize_rcu() is waiting for readers of both the ib
+	 * client data and the devices list to finish before we drop
+	 * both of those references.
+	 */
+	synchronize_rcu();
+	rds_ib_dev_put(rds_ibdev);
+	rds_ib_dev_put(rds_ibdev);
 }
 
 struct ib_client rds_ib_client = {
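
The removal path is the writer side of the classic RCU retract-then-wait pattern: unpublish from both lookup paths, wait one grace period so no reader can still be inside rds_ib_get_client_data() or a devices-list walk holding a stale pointer, then drop the two references those publication points owned. Compressed to its essentials (again with the hypothetical example_obj, extended with a list field):

static void example_remove(struct example_obj *obj, struct ib_device *device)
{
	/* 1. Retract both publication points. */
	ib_set_client_data(device, &rds_ib_client, NULL);
	down_write(&rds_ib_devices_lock);
	list_del_rcu(&obj->list);
	up_write(&rds_ib_devices_lock);

	/* 2. Wait out every reader that may have seen the old pointers. */
	synchronize_rcu();

	/* 3. Drop the references the list and client_data held. */
	example_obj_put(obj);
	example_obj_put(obj);
}
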
@@ -186,7 +287,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
 	rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
 	rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
 
-	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+	rds_ibdev = ic->rds_ibdev;
 	iinfo->max_send_wr = ic->i_send_ring.w_nr;
 	iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
 	iinfo->max_send_sge = rds_ibdev->max_sge;
@@ -248,29 +349,36 @@ static int rds_ib_laddr_check(__be32 addr)
 	return ret;
 }
 
+static void rds_ib_unregister_client(void)
+{
+	ib_unregister_client(&rds_ib_client);
+	/* wait for rds_ib_dev_free() to complete */
+	flush_workqueue(rds_wq);
+}
+
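
Wrapping ib_unregister_client() this way matters for module unload: a final rds_ib_dev_put() may already have queued rds_ib_dev_free() on rds_wq, and that work must finish before the module text can go away. flush_workqueue() provides exactly that barrier. The general unload shape, as a sketch:

static void example_unload(void)
{
	ib_unregister_client(&rds_ib_client);	/* no new devices or puts */
	flush_workqueue(rds_wq);		/* drain any pending free_work */
	/* only now is it safe to let the module's code be discarded */
}
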
 void rds_ib_exit(void)
 {
 	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+	rds_ib_unregister_client();
 	rds_ib_destroy_nodev_conns();
-	ib_unregister_client(&rds_ib_client);
 	rds_ib_sysctl_exit();
 	rds_ib_recv_exit();
 	rds_trans_unregister(&rds_ib_transport);
+	rds_ib_fmr_exit();
 }
 
 struct rds_transport rds_ib_transport = {
 	.laddr_check		= rds_ib_laddr_check,
 	.xmit_complete		= rds_ib_xmit_complete,
 	.xmit			= rds_ib_xmit,
-	.xmit_cong_map		= NULL,
 	.xmit_rdma		= rds_ib_xmit_rdma,
+	.xmit_atomic		= rds_ib_xmit_atomic,
 	.recv			= rds_ib_recv,
 	.conn_alloc		= rds_ib_conn_alloc,
 	.conn_free		= rds_ib_conn_free,
 	.conn_connect		= rds_ib_conn_connect,
 	.conn_shutdown		= rds_ib_conn_shutdown,
 	.inc_copy_to_user	= rds_ib_inc_copy_to_user,
-	.inc_purge		= rds_ib_inc_purge,
 	.inc_free		= rds_ib_inc_free,
 	.cm_initiate_connect	= rds_ib_cm_initiate_connect,
 	.cm_handle_connect	= rds_ib_cm_handle_connect,
@@ -286,16 +394,20 @@ struct rds_transport rds_ib_transport = {
 	.t_type			= RDS_TRANS_IB
 };
 
-int __init rds_ib_init(void)
+int rds_ib_init(void)
 {
 	int ret;
 
 	INIT_LIST_HEAD(&rds_ib_devices);
 
-	ret = ib_register_client(&rds_ib_client);
+	ret = rds_ib_fmr_init();
 	if (ret)
 		goto out;
 
+	ret = ib_register_client(&rds_ib_client);
+	if (ret)
+		goto out_fmr_exit;
+
 	ret = rds_ib_sysctl_init();
 	if (ret)
 		goto out_ibreg;
@@ -317,7 +429,9 @@ out_recv:
 out_sysctl:
 	rds_ib_sysctl_exit();
 out_ibreg:
-	ib_unregister_client(&rds_ib_client);
+	rds_ib_unregister_client();
+out_fmr_exit:
+	rds_ib_fmr_exit();
 out:
 	return ret;
 }
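
The init path keeps the kernel's usual goto-unwinding ladder: each stage that initialized successfully gets a cleanup label, in reverse order, so a failure at stage N unwinds stages N-1 down to 1 and nothing else. With rds_ib_fmr_init() now the first stage, its cleanup label correctly lands last. The skeleton of the pattern, with hypothetical stage names:

static int example_init(void)
{
	int ret;

	ret = stage_a_init();
	if (ret)
		goto out;
	ret = stage_b_init();
	if (ret)
		goto out_a;
	ret = stage_c_init();
	if (ret)
		goto out_b;
	return 0;

out_b:
	stage_b_exit();
out_a:
	stage_a_exit();
out:
	return ret;
}
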
