diff options
Diffstat (limited to 'net/rds/ib.c')
-rw-r--r-- | net/rds/ib.c | 200 |
1 files changed, 157 insertions, 43 deletions
diff --git a/net/rds/ib.c b/net/rds/ib.c index 8f2d6dd7700a..4123967d4d65 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c | |||
@@ -42,7 +42,7 @@ | |||
42 | #include "rds.h" | 42 | #include "rds.h" |
43 | #include "ib.h" | 43 | #include "ib.h" |
44 | 44 | ||
45 | unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; | 45 | static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; |
46 | unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ | 46 | unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ |
47 | unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; | 47 | unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; |
48 | 48 | ||
@@ -53,13 +53,72 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); | |||
53 | module_param(rds_ib_retry_count, int, 0444); | 53 | module_param(rds_ib_retry_count, int, 0444); |
54 | MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); | 54 | MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); |
55 | 55 | ||
56 | /* | ||
57 | * we have a clumsy combination of RCU and a rwsem protecting this list | ||
58 | * because it is used both in the get_mr fast path and while blocking in | ||
59 | * the FMR flushing path. | ||
60 | */ | ||
61 | DECLARE_RWSEM(rds_ib_devices_lock); | ||
56 | struct list_head rds_ib_devices; | 62 | struct list_head rds_ib_devices; |
57 | 63 | ||
58 | /* NOTE: if also grabbing ibdev lock, grab this first */ | 64 | /* NOTE: if also grabbing ibdev lock, grab this first */ |
59 | DEFINE_SPINLOCK(ib_nodev_conns_lock); | 65 | DEFINE_SPINLOCK(ib_nodev_conns_lock); |
60 | LIST_HEAD(ib_nodev_conns); | 66 | LIST_HEAD(ib_nodev_conns); |
61 | 67 | ||
62 | void rds_ib_add_one(struct ib_device *device) | 68 | static void rds_ib_nodev_connect(void) |
69 | { | ||
70 | struct rds_ib_connection *ic; | ||
71 | |||
72 | spin_lock(&ib_nodev_conns_lock); | ||
73 | list_for_each_entry(ic, &ib_nodev_conns, ib_node) | ||
74 | rds_conn_connect_if_down(ic->conn); | ||
75 | spin_unlock(&ib_nodev_conns_lock); | ||
76 | } | ||
77 | |||
78 | static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) | ||
79 | { | ||
80 | struct rds_ib_connection *ic; | ||
81 | unsigned long flags; | ||
82 | |||
83 | spin_lock_irqsave(&rds_ibdev->spinlock, flags); | ||
84 | list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) | ||
85 | rds_conn_drop(ic->conn); | ||
86 | spin_unlock_irqrestore(&rds_ibdev->spinlock, flags); | ||
87 | } | ||
88 | |||
89 | /* | ||
90 | * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references | ||
91 | * from interrupt context so we push freeing off into a work struct in krdsd. | ||
92 | */ | ||
93 | static void rds_ib_dev_free(struct work_struct *work) | ||
94 | { | ||
95 | struct rds_ib_ipaddr *i_ipaddr, *i_next; | ||
96 | struct rds_ib_device *rds_ibdev = container_of(work, | ||
97 | struct rds_ib_device, free_work); | ||
98 | |||
99 | if (rds_ibdev->mr_pool) | ||
100 | rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); | ||
101 | if (rds_ibdev->mr) | ||
102 | ib_dereg_mr(rds_ibdev->mr); | ||
103 | if (rds_ibdev->pd) | ||
104 | ib_dealloc_pd(rds_ibdev->pd); | ||
105 | |||
106 | list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { | ||
107 | list_del(&i_ipaddr->list); | ||
108 | kfree(i_ipaddr); | ||
109 | } | ||
110 | |||
111 | kfree(rds_ibdev); | ||
112 | } | ||
113 | |||
114 | void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) | ||
115 | { | ||
116 | BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); | ||
117 | if (atomic_dec_and_test(&rds_ibdev->refcount)) | ||
118 | queue_work(rds_wq, &rds_ibdev->free_work); | ||
119 | } | ||
120 | |||
121 | static void rds_ib_add_one(struct ib_device *device) | ||
63 | { | 122 | { |
64 | struct rds_ib_device *rds_ibdev; | 123 | struct rds_ib_device *rds_ibdev; |
65 | struct ib_device_attr *dev_attr; | 124 | struct ib_device_attr *dev_attr; |
@@ -77,11 +136,14 @@ void rds_ib_add_one(struct ib_device *device) | |||
77 | goto free_attr; | 136 | goto free_attr; |
78 | } | 137 | } |
79 | 138 | ||
80 | rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); | 139 | rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, |
140 | ibdev_to_node(device)); | ||
81 | if (!rds_ibdev) | 141 | if (!rds_ibdev) |
82 | goto free_attr; | 142 | goto free_attr; |
83 | 143 | ||
84 | spin_lock_init(&rds_ibdev->spinlock); | 144 | spin_lock_init(&rds_ibdev->spinlock); |
145 | atomic_set(&rds_ibdev->refcount, 1); | ||
146 | INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); | ||
85 | 147 | ||
86 | rds_ibdev->max_wrs = dev_attr->max_qp_wr; | 148 | rds_ibdev->max_wrs = dev_attr->max_qp_wr; |
87 | rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); | 149 | rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); |
@@ -91,68 +153,107 @@ void rds_ib_add_one(struct ib_device *device) | |||
91 | min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : | 153 | min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : |
92 | fmr_pool_size; | 154 | fmr_pool_size; |
93 | 155 | ||
156 | rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; | ||
157 | rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; | ||
158 | |||
94 | rds_ibdev->dev = device; | 159 | rds_ibdev->dev = device; |
95 | rds_ibdev->pd = ib_alloc_pd(device); | 160 | rds_ibdev->pd = ib_alloc_pd(device); |
96 | if (IS_ERR(rds_ibdev->pd)) | 161 | if (IS_ERR(rds_ibdev->pd)) { |
97 | goto free_dev; | 162 | rds_ibdev->pd = NULL; |
163 | goto put_dev; | ||
164 | } | ||
98 | 165 | ||
99 | rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, | 166 | rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE); |
100 | IB_ACCESS_LOCAL_WRITE); | 167 | if (IS_ERR(rds_ibdev->mr)) { |
101 | if (IS_ERR(rds_ibdev->mr)) | 168 | rds_ibdev->mr = NULL; |
102 | goto err_pd; | 169 | goto put_dev; |
170 | } | ||
103 | 171 | ||
104 | rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); | 172 | rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); |
105 | if (IS_ERR(rds_ibdev->mr_pool)) { | 173 | if (IS_ERR(rds_ibdev->mr_pool)) { |
106 | rds_ibdev->mr_pool = NULL; | 174 | rds_ibdev->mr_pool = NULL; |
107 | goto err_mr; | 175 | goto put_dev; |
108 | } | 176 | } |
109 | 177 | ||
110 | INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); | 178 | INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); |
111 | INIT_LIST_HEAD(&rds_ibdev->conn_list); | 179 | INIT_LIST_HEAD(&rds_ibdev->conn_list); |
112 | list_add_tail(&rds_ibdev->list, &rds_ib_devices); | 180 | |
181 | down_write(&rds_ib_devices_lock); | ||
182 | list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); | ||
183 | up_write(&rds_ib_devices_lock); | ||
184 | atomic_inc(&rds_ibdev->refcount); | ||
113 | 185 | ||
114 | ib_set_client_data(device, &rds_ib_client, rds_ibdev); | 186 | ib_set_client_data(device, &rds_ib_client, rds_ibdev); |
187 | atomic_inc(&rds_ibdev->refcount); | ||
115 | 188 | ||
116 | goto free_attr; | 189 | rds_ib_nodev_connect(); |
117 | 190 | ||
118 | err_mr: | 191 | put_dev: |
119 | ib_dereg_mr(rds_ibdev->mr); | 192 | rds_ib_dev_put(rds_ibdev); |
120 | err_pd: | ||
121 | ib_dealloc_pd(rds_ibdev->pd); | ||
122 | free_dev: | ||
123 | kfree(rds_ibdev); | ||
124 | free_attr: | 193 | free_attr: |
125 | kfree(dev_attr); | 194 | kfree(dev_attr); |
126 | } | 195 | } |
127 | 196 | ||
128 | void rds_ib_remove_one(struct ib_device *device) | 197 | /* |
198 | * New connections use this to find the device to associate with the | ||
199 | * connection. It's not in the fast path so we're not concerned about the | ||
200 | * performance of the IB call. (As of this writing, it uses an interrupt | ||
201 | * blocking spinlock to serialize walking a per-device list of all registered | ||
202 | * clients.) | ||
203 | * | ||
204 | * RCU is used to handle incoming connections racing with device teardown. | ||
205 | * Rather than use a lock to serialize removal from the client_data and | ||
206 | * getting a new reference, we use an RCU grace period. The destruction | ||
207 | * path removes the device from client_data and then waits for all RCU | ||
208 | * readers to finish. | ||
209 | * | ||
210 | * A new connection can get NULL from this if it's arriving on a | ||
211 | * device that is in the process of being removed. | ||
212 | */ | ||
213 | struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) | ||
129 | { | 214 | { |
130 | struct rds_ib_device *rds_ibdev; | 215 | struct rds_ib_device *rds_ibdev; |
131 | struct rds_ib_ipaddr *i_ipaddr, *i_next; | ||
132 | 216 | ||
217 | rcu_read_lock(); | ||
133 | rds_ibdev = ib_get_client_data(device, &rds_ib_client); | 218 | rds_ibdev = ib_get_client_data(device, &rds_ib_client); |
134 | if (!rds_ibdev) | 219 | if (rds_ibdev) |
135 | return; | 220 | atomic_inc(&rds_ibdev->refcount); |
221 | rcu_read_unlock(); | ||
222 | return rds_ibdev; | ||
223 | } | ||
136 | 224 | ||
137 | list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { | 225 | /* |
138 | list_del(&i_ipaddr->list); | 226 | * The IB stack is letting us know that a device is going away. This can |
139 | kfree(i_ipaddr); | 227 | * happen if the underlying HCA driver is removed or if PCI hotplug is removing |
140 | } | 228 | * the pci function, for example. |
229 | * | ||
230 | * This can be called at any time and can be racing with any other RDS path. | ||
231 | */ | ||
232 | static void rds_ib_remove_one(struct ib_device *device) | ||
233 | { | ||
234 | struct rds_ib_device *rds_ibdev; | ||
141 | 235 | ||
142 | rds_ib_destroy_conns(rds_ibdev); | 236 | rds_ibdev = ib_get_client_data(device, &rds_ib_client); |
237 | if (!rds_ibdev) | ||
238 | return; | ||
143 | 239 | ||
144 | if (rds_ibdev->mr_pool) | 240 | rds_ib_dev_shutdown(rds_ibdev); |
145 | rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); | ||
146 | 241 | ||
147 | ib_dereg_mr(rds_ibdev->mr); | 242 | /* stop connection attempts from getting a reference to this device. */ |
243 | ib_set_client_data(device, &rds_ib_client, NULL); | ||
148 | 244 | ||
149 | while (ib_dealloc_pd(rds_ibdev->pd)) { | 245 | down_write(&rds_ib_devices_lock); |
150 | rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); | 246 | list_del_rcu(&rds_ibdev->list); |
151 | msleep(1); | 247 | up_write(&rds_ib_devices_lock); |
152 | } | ||
153 | 248 | ||
154 | list_del(&rds_ibdev->list); | 249 | /* |
155 | kfree(rds_ibdev); | 250 | * This synchronize rcu is waiting for readers of both the ib |
251 | * client data and the devices list to finish before we drop | ||
252 | * both of those references. | ||
253 | */ | ||
254 | synchronize_rcu(); | ||
255 | rds_ib_dev_put(rds_ibdev); | ||
256 | rds_ib_dev_put(rds_ibdev); | ||
156 | } | 257 | } |
157 | 258 | ||
158 | struct ib_client rds_ib_client = { | 259 | struct ib_client rds_ib_client = { |
@@ -186,7 +287,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, | |||
186 | rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); | 287 | rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); |
187 | rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); | 288 | rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); |
188 | 289 | ||
189 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | 290 | rds_ibdev = ic->rds_ibdev; |
190 | iinfo->max_send_wr = ic->i_send_ring.w_nr; | 291 | iinfo->max_send_wr = ic->i_send_ring.w_nr; |
191 | iinfo->max_recv_wr = ic->i_recv_ring.w_nr; | 292 | iinfo->max_recv_wr = ic->i_recv_ring.w_nr; |
192 | iinfo->max_send_sge = rds_ibdev->max_sge; | 293 | iinfo->max_send_sge = rds_ibdev->max_sge; |
@@ -248,29 +349,36 @@ static int rds_ib_laddr_check(__be32 addr) | |||
248 | return ret; | 349 | return ret; |
249 | } | 350 | } |
250 | 351 | ||
352 | static void rds_ib_unregister_client(void) | ||
353 | { | ||
354 | ib_unregister_client(&rds_ib_client); | ||
355 | /* wait for rds_ib_dev_free() to complete */ | ||
356 | flush_workqueue(rds_wq); | ||
357 | } | ||
358 | |||
251 | void rds_ib_exit(void) | 359 | void rds_ib_exit(void) |
252 | { | 360 | { |
253 | rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); | 361 | rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); |
362 | rds_ib_unregister_client(); | ||
254 | rds_ib_destroy_nodev_conns(); | 363 | rds_ib_destroy_nodev_conns(); |
255 | ib_unregister_client(&rds_ib_client); | ||
256 | rds_ib_sysctl_exit(); | 364 | rds_ib_sysctl_exit(); |
257 | rds_ib_recv_exit(); | 365 | rds_ib_recv_exit(); |
258 | rds_trans_unregister(&rds_ib_transport); | 366 | rds_trans_unregister(&rds_ib_transport); |
367 | rds_ib_fmr_exit(); | ||
259 | } | 368 | } |
260 | 369 | ||
261 | struct rds_transport rds_ib_transport = { | 370 | struct rds_transport rds_ib_transport = { |
262 | .laddr_check = rds_ib_laddr_check, | 371 | .laddr_check = rds_ib_laddr_check, |
263 | .xmit_complete = rds_ib_xmit_complete, | 372 | .xmit_complete = rds_ib_xmit_complete, |
264 | .xmit = rds_ib_xmit, | 373 | .xmit = rds_ib_xmit, |
265 | .xmit_cong_map = NULL, | ||
266 | .xmit_rdma = rds_ib_xmit_rdma, | 374 | .xmit_rdma = rds_ib_xmit_rdma, |
375 | .xmit_atomic = rds_ib_xmit_atomic, | ||
267 | .recv = rds_ib_recv, | 376 | .recv = rds_ib_recv, |
268 | .conn_alloc = rds_ib_conn_alloc, | 377 | .conn_alloc = rds_ib_conn_alloc, |
269 | .conn_free = rds_ib_conn_free, | 378 | .conn_free = rds_ib_conn_free, |
270 | .conn_connect = rds_ib_conn_connect, | 379 | .conn_connect = rds_ib_conn_connect, |
271 | .conn_shutdown = rds_ib_conn_shutdown, | 380 | .conn_shutdown = rds_ib_conn_shutdown, |
272 | .inc_copy_to_user = rds_ib_inc_copy_to_user, | 381 | .inc_copy_to_user = rds_ib_inc_copy_to_user, |
273 | .inc_purge = rds_ib_inc_purge, | ||
274 | .inc_free = rds_ib_inc_free, | 382 | .inc_free = rds_ib_inc_free, |
275 | .cm_initiate_connect = rds_ib_cm_initiate_connect, | 383 | .cm_initiate_connect = rds_ib_cm_initiate_connect, |
276 | .cm_handle_connect = rds_ib_cm_handle_connect, | 384 | .cm_handle_connect = rds_ib_cm_handle_connect, |
@@ -286,16 +394,20 @@ struct rds_transport rds_ib_transport = { | |||
286 | .t_type = RDS_TRANS_IB | 394 | .t_type = RDS_TRANS_IB |
287 | }; | 395 | }; |
288 | 396 | ||
289 | int __init rds_ib_init(void) | 397 | int rds_ib_init(void) |
290 | { | 398 | { |
291 | int ret; | 399 | int ret; |
292 | 400 | ||
293 | INIT_LIST_HEAD(&rds_ib_devices); | 401 | INIT_LIST_HEAD(&rds_ib_devices); |
294 | 402 | ||
295 | ret = ib_register_client(&rds_ib_client); | 403 | ret = rds_ib_fmr_init(); |
296 | if (ret) | 404 | if (ret) |
297 | goto out; | 405 | goto out; |
298 | 406 | ||
407 | ret = ib_register_client(&rds_ib_client); | ||
408 | if (ret) | ||
409 | goto out_fmr_exit; | ||
410 | |||
299 | ret = rds_ib_sysctl_init(); | 411 | ret = rds_ib_sysctl_init(); |
300 | if (ret) | 412 | if (ret) |
301 | goto out_ibreg; | 413 | goto out_ibreg; |
@@ -317,7 +429,9 @@ out_recv: | |||
317 | out_sysctl: | 429 | out_sysctl: |
318 | rds_ib_sysctl_exit(); | 430 | rds_ib_sysctl_exit(); |
319 | out_ibreg: | 431 | out_ibreg: |
320 | ib_unregister_client(&rds_ib_client); | 432 | rds_ib_unregister_client(); |
433 | out_fmr_exit: | ||
434 | rds_ib_fmr_exit(); | ||
321 | out: | 435 | out: |
322 | return ret; | 436 | return ret; |
323 | } | 437 | } |