about summary refs log tree commit diff stats
path: root/drivers/infiniband
diff options
context:
space:
mode:
authorSagi Grimberg <sagig@mellanox.com>2013-07-28 05:35:42 -0400
committerRoland Dreier <roland@purestorage.com>2013-08-09 20:18:10 -0400
commit5587856c9659ac2d6ab201141aa8a5c2ff3be4cd (patch)
tree4a3b63aedd212e74fa67ee95b9ad5afab39c434c /drivers/infiniband
parente657571b76faf96a1de1aaf40b2111adcf76c673 (diff)
IB/iser: Introduce fast memory registration model (FRWR)
Newer HCAs and Virtual functions may not support FMRs but rather a fast registration model, which we call FRWR - "Fast Registration Work Requests".

This model was introduced in 00f7ec36c ("RDMA/core: Add memory management extensions support") and works when the IB device supports the IB_DEVICE_MEM_MGT_EXTENSIONS capability.

Upon creating the iser device iser will test whether the HCA supports FMRs. If no support for FMRs, check if IB_DEVICE_MEM_MGT_EXTENSIONS is supported and assign function pointers that handle fast registration and allocation of appropriate resources (fast_reg descriptors).

Registration is done using posting IB_WR_FAST_REG_MR to the QP and invalidations using posting IB_WR_LOCAL_INV.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
Diffstat (limited to 'drivers/infiniband')
-rw-r--r-- drivers/infiniband/ulp/iser/iscsi_iser.h  |  21
-rw-r--r-- drivers/infiniband/ulp/iser/iser_memory.c | 140
-rw-r--r-- drivers/infiniband/ulp/iser/iser_verbs.c  | 138
3 files changed, 287 insertions(+), 12 deletions(-)
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 75c535260e78..67914027c614 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -211,7 +211,7 @@ struct iser_mem_reg {
211 u64 va; 211 u64 va;
212 u64 len; 212 u64 len;
213 void *mem_h; 213 void *mem_h;
214 int is_fmr; 214 int is_mr;
215}; 215};
216 216
217struct iser_regd_buf { 217struct iser_regd_buf {
@@ -277,6 +277,15 @@ struct iser_device {
277 enum iser_data_dir cmd_dir); 277 enum iser_data_dir cmd_dir);
278}; 278};
279 279
280struct fast_reg_descriptor {
281 struct list_head list;
282 /* For fast registration - FRWR */
283 struct ib_mr *data_mr;
284 struct ib_fast_reg_page_list *data_frpl;
285 /* Valid for fast registration flag */
286 bool valid;
287};
288
280struct iser_conn { 289struct iser_conn {
281 struct iscsi_iser_conn *iser_conn; /* iser conn for upcalls */ 290 struct iscsi_iser_conn *iser_conn; /* iser conn for upcalls */
282 struct iscsi_endpoint *ep; 291 struct iscsi_endpoint *ep;
@@ -307,6 +316,10 @@ struct iser_conn {
307 struct iser_page_vec *page_vec; /* represents SG to fmr maps* 316 struct iser_page_vec *page_vec; /* represents SG to fmr maps*
308 * maps serialized as tx is*/ 317 * maps serialized as tx is*/
309 } fmr; 318 } fmr;
319 struct {
320 struct list_head pool;
321 int pool_size;
322 } frwr;
310 } fastreg; 323 } fastreg;
311}; 324};
312 325
@@ -393,6 +406,8 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *task,
393 406
394int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task, 407int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task,
395 enum iser_data_dir cmd_dir); 408 enum iser_data_dir cmd_dir);
409int iser_reg_rdma_mem_frwr(struct iscsi_iser_task *task,
410 enum iser_data_dir cmd_dir);
396 411
397int iser_connect(struct iser_conn *ib_conn, 412int iser_connect(struct iser_conn *ib_conn,
398 struct sockaddr_in *src_addr, 413 struct sockaddr_in *src_addr,
@@ -405,6 +420,8 @@ int iser_reg_page_vec(struct iser_conn *ib_conn,
405 420
406void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, 421void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
407 enum iser_data_dir cmd_dir); 422 enum iser_data_dir cmd_dir);
423void iser_unreg_mem_frwr(struct iscsi_iser_task *iser_task,
424 enum iser_data_dir cmd_dir);
408 425
409int iser_post_recvl(struct iser_conn *ib_conn); 426int iser_post_recvl(struct iser_conn *ib_conn);
410int iser_post_recvm(struct iser_conn *ib_conn, int count); 427int iser_post_recvm(struct iser_conn *ib_conn, int count);
@@ -421,4 +438,6 @@ int iser_initialize_task_headers(struct iscsi_task *task,
421int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *session); 438int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *session);
422int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max); 439int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max);
423void iser_free_fmr_pool(struct iser_conn *ib_conn); 440void iser_free_fmr_pool(struct iser_conn *ib_conn);
441int iser_create_frwr_pool(struct iser_conn *ib_conn, unsigned cmds_max);
442void iser_free_frwr_pool(struct iser_conn *ib_conn);
424#endif 443#endif
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 1985e907f03a..1ce0c97d2ccb 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -395,8 +395,7 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
395 regd_buf = &iser_task->rdma_regd[cmd_dir]; 395 regd_buf = &iser_task->rdma_regd[cmd_dir];
396 396
397 aligned_len = iser_data_buf_aligned_len(mem, ibdev); 397 aligned_len = iser_data_buf_aligned_len(mem, ibdev);
398 if (aligned_len != mem->dma_nents || 398 if (aligned_len != mem->dma_nents) {
399 (!ib_conn->fastreg.fmr.pool && mem->dma_nents > 1)) {
400 err = fall_to_bounce_buf(iser_task, ibdev, 399 err = fall_to_bounce_buf(iser_task, ibdev,
401 cmd_dir, aligned_len); 400 cmd_dir, aligned_len);
402 if (err) { 401 if (err) {
@@ -414,7 +413,7 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
414 regd_buf->reg.rkey = device->mr->rkey; 413 regd_buf->reg.rkey = device->mr->rkey;
415 regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); 414 regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]);
416 regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); 415 regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]);
417 regd_buf->reg.is_fmr = 0; 416 regd_buf->reg.is_mr = 0;
418 417
419 iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " 418 iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X "
420 "va: 0x%08lX sz: %ld]\n", 419 "va: 0x%08lX sz: %ld]\n",
@@ -444,3 +443,138 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
444 } 443 }
445 return 0; 444 return 0;
446} 445}
446
447static int iser_fast_reg_mr(struct fast_reg_descriptor *desc,
448 struct iser_conn *ib_conn,
449 struct iser_regd_buf *regd_buf,
450 u32 offset, unsigned int data_size,
451 unsigned int page_list_len)
452{
453 struct ib_send_wr fastreg_wr, inv_wr;
454 struct ib_send_wr *bad_wr, *wr = NULL;
455 u8 key;
456 int ret;
457
458 if (!desc->valid) {
459 memset(&inv_wr, 0, sizeof(inv_wr));
460 inv_wr.opcode = IB_WR_LOCAL_INV;
461 inv_wr.send_flags = IB_SEND_SIGNALED;
462 inv_wr.ex.invalidate_rkey = desc->data_mr->rkey;
463 wr = &inv_wr;
464 /* Bump the key */
465 key = (u8)(desc->data_mr->rkey & 0x000000FF);
466 ib_update_fast_reg_key(desc->data_mr, ++key);
467 }
468
469 /* Prepare FASTREG WR */
470 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
471 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
472 fastreg_wr.send_flags = IB_SEND_SIGNALED;
473 fastreg_wr.wr.fast_reg.iova_start = desc->data_frpl->page_list[0] + offset;
474 fastreg_wr.wr.fast_reg.page_list = desc->data_frpl;
475 fastreg_wr.wr.fast_reg.page_list_len = page_list_len;
476 fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K;
477 fastreg_wr.wr.fast_reg.length = data_size;
478 fastreg_wr.wr.fast_reg.rkey = desc->data_mr->rkey;
479 fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE |
480 IB_ACCESS_REMOTE_WRITE |
481 IB_ACCESS_REMOTE_READ);
482
483 if (!wr) {
484 wr = &fastreg_wr;
485 atomic_inc(&ib_conn->post_send_buf_count);
486 } else {
487 wr->next = &fastreg_wr;
488 atomic_add(2, &ib_conn->post_send_buf_count);
489 }
490
491 ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
492 if (ret) {
493 if (bad_wr->next)
494 atomic_sub(2, &ib_conn->post_send_buf_count);
495 else
496 atomic_dec(&ib_conn->post_send_buf_count);
497 iser_err("fast registration failed, ret:%d\n", ret);
498 return ret;
499 }
500 desc->valid = false;
501
502 regd_buf->reg.mem_h = desc;
503 regd_buf->reg.lkey = desc->data_mr->lkey;
504 regd_buf->reg.rkey = desc->data_mr->rkey;
505 regd_buf->reg.va = desc->data_frpl->page_list[0] + offset;
506 regd_buf->reg.len = data_size;
507 regd_buf->reg.is_mr = 1;
508
509 return ret;
510}
511
512/**
513 * iser_reg_rdma_mem_frwr - Registers memory intended for RDMA,
514 * using Fast Registration WR (if possible) obtaining rkey and va
515 *
516 * returns 0 on success, errno code on failure
517 */
518int iser_reg_rdma_mem_frwr(struct iscsi_iser_task *iser_task,
519 enum iser_data_dir cmd_dir)
520{
521 struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn;
522 struct iser_device *device = ib_conn->device;
523 struct ib_device *ibdev = device->ib_device;
524 struct iser_data_buf *mem = &iser_task->data[cmd_dir];
525 struct iser_regd_buf *regd_buf = &iser_task->rdma_regd[cmd_dir];
526 struct fast_reg_descriptor *desc;
527 unsigned int data_size, page_list_len;
528 int err, aligned_len;
529 unsigned long flags;
530 u32 offset;
531
532 aligned_len = iser_data_buf_aligned_len(mem, ibdev);
533 if (aligned_len != mem->dma_nents) {
534 err = fall_to_bounce_buf(iser_task, ibdev,
535 cmd_dir, aligned_len);
536 if (err) {
537 iser_err("failed to allocate bounce buffer\n");
538 return err;
539 }
540 mem = &iser_task->data_copy[cmd_dir];
541 }
542
543 /* if there a single dma entry, dma mr suffices */
544 if (mem->dma_nents == 1) {
545 struct scatterlist *sg = (struct scatterlist *)mem->buf;
546
547 regd_buf->reg.lkey = device->mr->lkey;
548 regd_buf->reg.rkey = device->mr->rkey;
549 regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]);
550 regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]);
551 regd_buf->reg.is_mr = 0;
552 } else {
553 spin_lock_irqsave(&ib_conn->lock, flags);
554 desc = list_first_entry(&ib_conn->fastreg.frwr.pool,
555 struct fast_reg_descriptor, list);
556 list_del(&desc->list);
557 spin_unlock_irqrestore(&ib_conn->lock, flags);
558 page_list_len = iser_sg_to_page_vec(mem, device->ib_device,
559 desc->data_frpl->page_list,
560 &offset, &data_size);
561
562 if (page_list_len * SIZE_4K < data_size) {
563 iser_err("fast reg page_list too short to hold this SG\n");
564 err = -EINVAL;
565 goto err_reg;
566 }
567
568 err = iser_fast_reg_mr(desc, ib_conn, regd_buf,
569 offset, data_size, page_list_len);
570 if (err)
571 goto err_reg;
572 }
573
574 return 0;
575err_reg:
576 spin_lock_irqsave(&ib_conn->lock, flags);
577 list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool);
578 spin_unlock_irqrestore(&ib_conn->lock, flags);
579 return err;
580}
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index d9a47b91fe43..28badacb0134 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -73,12 +73,36 @@ static int iser_create_device_ib_res(struct iser_device *device)
73{ 73{
74 int i, j; 74 int i, j;
75 struct iser_cq_desc *cq_desc; 75 struct iser_cq_desc *cq_desc;
76 struct ib_device_attr *dev_attr;
76 77
77 /* Assign function handles */ 78 dev_attr = kmalloc(sizeof(*dev_attr), GFP_KERNEL);
78 device->iser_alloc_rdma_reg_res = iser_create_fmr_pool; 79 if (!dev_attr)
79 device->iser_free_rdma_reg_res = iser_free_fmr_pool; 80 return -ENOMEM;
80 device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr; 81
81 device->iser_unreg_rdma_mem = iser_unreg_mem_fmr; 82 if (ib_query_device(device->ib_device, dev_attr)) {
83 pr_warn("Query device failed for %s\n", device->ib_device->name);
84 goto dev_attr_err;
85 }
86
87 /* Assign function handles - based on FMR support */
88 if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr &&
89 device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) {
90 iser_info("FMR supported, using FMR for registration\n");
91 device->iser_alloc_rdma_reg_res = iser_create_fmr_pool;
92 device->iser_free_rdma_reg_res = iser_free_fmr_pool;
93 device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr;
94 device->iser_unreg_rdma_mem = iser_unreg_mem_fmr;
95 } else
96 if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
97 iser_info("FRWR supported, using FRWR for registration\n");
98 device->iser_alloc_rdma_reg_res = iser_create_frwr_pool;
99 device->iser_free_rdma_reg_res = iser_free_frwr_pool;
100 device->iser_reg_rdma_mem = iser_reg_rdma_mem_frwr;
101 device->iser_unreg_rdma_mem = iser_unreg_mem_frwr;
102 } else {
103 iser_err("IB device does not support FMRs nor FRWRs, can't register memory\n");
104 goto dev_attr_err;
105 }
82 106
83 device->cqs_used = min(ISER_MAX_CQ, device->ib_device->num_comp_vectors); 107 device->cqs_used = min(ISER_MAX_CQ, device->ib_device->num_comp_vectors);
84 iser_info("using %d CQs, device %s supports %d vectors\n", 108 iser_info("using %d CQs, device %s supports %d vectors\n",
@@ -134,6 +158,7 @@ static int iser_create_device_ib_res(struct iser_device *device)
134 if (ib_register_event_handler(&device->event_handler)) 158 if (ib_register_event_handler(&device->event_handler))
135 goto handler_err; 159 goto handler_err;
136 160
161 kfree(dev_attr);
137 return 0; 162 return 0;
138 163
139handler_err: 164handler_err:
@@ -153,6 +178,8 @@ pd_err:
153 kfree(device->cq_desc); 178 kfree(device->cq_desc);
154cq_desc_err: 179cq_desc_err:
155 iser_err("failed to allocate an IB resource\n"); 180 iser_err("failed to allocate an IB resource\n");
181dev_attr_err:
182 kfree(dev_attr);
156 return -1; 183 return -1;
157} 184}
158 185
@@ -253,6 +280,80 @@ void iser_free_fmr_pool(struct iser_conn *ib_conn)
253} 280}
254 281
255/** 282/**
283 * iser_create_frwr_pool - Creates pool of fast_reg descriptors
284 * for fast registration work requests.
285 * returns 0 on success, or errno code on failure
286 */
287int iser_create_frwr_pool(struct iser_conn *ib_conn, unsigned cmds_max)
288{
289 struct iser_device *device = ib_conn->device;
290 struct fast_reg_descriptor *desc;
291 int i, ret;
292
293 INIT_LIST_HEAD(&ib_conn->fastreg.frwr.pool);
294 ib_conn->fastreg.frwr.pool_size = 0;
295 for (i = 0; i < cmds_max; i++) {
296 desc = kmalloc(sizeof(*desc), GFP_KERNEL);
297 if (!desc) {
298 iser_err("Failed to allocate a new fast_reg descriptor\n");
299 ret = -ENOMEM;
300 goto err;
301 }
302
303 desc->data_frpl = ib_alloc_fast_reg_page_list(device->ib_device,
304 ISCSI_ISER_SG_TABLESIZE + 1);
305 if (IS_ERR(desc->data_frpl)) {
306 ret = PTR_ERR(desc->data_frpl);
307 iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n", ret);
308 goto err;
309 }
310
311 desc->data_mr = ib_alloc_fast_reg_mr(device->pd,
312 ISCSI_ISER_SG_TABLESIZE + 1);
313 if (IS_ERR(desc->data_mr)) {
314 ret = PTR_ERR(desc->data_mr);
315 iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
316 ib_free_fast_reg_page_list(desc->data_frpl);
317 goto err;
318 }
319 desc->valid = true;
320 list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool);
321 ib_conn->fastreg.frwr.pool_size++;
322 }
323
324 return 0;
325err:
326 iser_free_frwr_pool(ib_conn);
327 return ret;
328}
329
330/**
331 * iser_free_frwr_pool - releases the pool of fast_reg descriptors
332 */
333void iser_free_frwr_pool(struct iser_conn *ib_conn)
334{
335 struct fast_reg_descriptor *desc, *tmp;
336 int i = 0;
337
338 if (list_empty(&ib_conn->fastreg.frwr.pool))
339 return;
340
341 iser_info("freeing conn %p frwr pool\n", ib_conn);
342
343 list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.frwr.pool, list) {
344 list_del(&desc->list);
345 ib_free_fast_reg_page_list(desc->data_frpl);
346 ib_dereg_mr(desc->data_mr);
347 kfree(desc);
348 ++i;
349 }
350
351 if (i < ib_conn->fastreg.frwr.pool_size)
352 iser_warn("pool still has %d regions registered\n",
353 ib_conn->fastreg.frwr.pool_size - i);
354}
355
356/**
256 * iser_create_ib_conn_res - Queue-Pair (QP) 357 * iser_create_ib_conn_res - Queue-Pair (QP)
257 * 358 *
258 * returns 0 on success, -1 on failure 359 * returns 0 on success, -1 on failure
@@ -707,7 +808,7 @@ int iser_reg_page_vec(struct iser_conn *ib_conn,
707 mem_reg->rkey = mem->fmr->rkey; 808 mem_reg->rkey = mem->fmr->rkey;
708 mem_reg->len = page_vec->length * SIZE_4K; 809 mem_reg->len = page_vec->length * SIZE_4K;
709 mem_reg->va = io_addr; 810 mem_reg->va = io_addr;
710 mem_reg->is_fmr = 1; 811 mem_reg->is_mr = 1;
711 mem_reg->mem_h = (void *)mem; 812 mem_reg->mem_h = (void *)mem;
712 813
713 mem_reg->va += page_vec->offset; 814 mem_reg->va += page_vec->offset;
@@ -734,7 +835,7 @@ void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
734 struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; 835 struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
735 int ret; 836 int ret;
736 837
737 if (!reg->is_fmr) 838 if (!reg->is_mr)
738 return; 839 return;
739 840
740 iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h); 841 iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h);
@@ -746,6 +847,23 @@ void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
746 reg->mem_h = NULL; 847 reg->mem_h = NULL;
747} 848}
748 849
850void iser_unreg_mem_frwr(struct iscsi_iser_task *iser_task,
851 enum iser_data_dir cmd_dir)
852{
853 struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
854 struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn;
855 struct fast_reg_descriptor *desc = reg->mem_h;
856
857 if (!reg->is_mr)
858 return;
859
860 reg->mem_h = NULL;
861 reg->is_mr = 0;
862 spin_lock_bh(&ib_conn->lock);
863 list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool);
864 spin_unlock_bh(&ib_conn->lock);
865}
866
749int iser_post_recvl(struct iser_conn *ib_conn) 867int iser_post_recvl(struct iser_conn *ib_conn)
750{ 868{
751 struct ib_recv_wr rx_wr, *rx_wr_failed; 869 struct ib_recv_wr rx_wr, *rx_wr_failed;
@@ -867,7 +985,11 @@ static int iser_drain_tx_cq(struct iser_device *device, int cq_index)
867 if (wc.status == IB_WC_SUCCESS) { 985 if (wc.status == IB_WC_SUCCESS) {
868 if (wc.opcode == IB_WC_SEND) 986 if (wc.opcode == IB_WC_SEND)
869 iser_snd_completion(tx_desc, ib_conn); 987 iser_snd_completion(tx_desc, ib_conn);
870 else 988 else if (wc.opcode == IB_WC_LOCAL_INV ||
989 wc.opcode == IB_WC_FAST_REG_MR) {
990 atomic_dec(&ib_conn->post_send_buf_count);
991 continue;
992 } else
871 iser_err("expected opcode %d got %d\n", 993 iser_err("expected opcode %d got %d\n",
872 IB_WC_SEND, wc.opcode); 994 IB_WC_SEND, wc.opcode);
873 } else { 995 } else {