aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTom Tucker <tom@opengridcomputing.com>2008-10-03 16:45:03 -0400
committerTom Tucker <tom@opengridcomputing.com>2008-10-06 15:46:05 -0400
commitafd566ea080572499cc01d42d2f578bf4b54f20f (patch)
treecbd0c55f27cfd0a432bf669d6db4cbe0e595a2bf
parent146b6df6a537939570c5772ebd7db826fdbd5d82 (diff)
svcrdma: Modify the RPC reply path to use FRMR when available
Use FRMR to map local RPC reply data. This allows RDMA_WRITE to send reply data using a single WR. The FRMR is invalidated by linking the LOCAL_INV WR to the RDMA_SEND message used to complete the reply. Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c255
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c2
2 files changed, 217 insertions, 40 deletions
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 84d328329d98..9a7a8e7ae038 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -69,9 +69,127 @@
69 * array is only concerned with the reply we are assured that we have 69 * array is only concerned with the reply we are assured that we have
70 * on extra page for the RPCRMDA header. 70 * on extra page for the RPCRMDA header.
71 */ 71 */
72static void xdr_to_sge(struct svcxprt_rdma *xprt, 72int fast_reg_xdr(struct svcxprt_rdma *xprt,
73 struct xdr_buf *xdr, 73 struct xdr_buf *xdr,
74 struct svc_rdma_req_map *vec) 74 struct svc_rdma_req_map *vec)
75{
76 int sge_no;
77 u32 sge_bytes;
78 u32 page_bytes;
79 u32 page_off;
80 int page_no = 0;
81 u8 *frva;
82 struct svc_rdma_fastreg_mr *frmr;
83
84 frmr = svc_rdma_get_frmr(xprt);
85 if (IS_ERR(frmr))
86 return -ENOMEM;
87 vec->frmr = frmr;
88
89 /* Skip the RPCRDMA header */
90 sge_no = 1;
91
92 /* Map the head. */
93 frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK);
94 vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
95 vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
96 vec->count = 2;
97 sge_no++;
98
99 /* Build the FRMR */
100 frmr->kva = frva;
101 frmr->direction = DMA_TO_DEVICE;
102 frmr->access_flags = 0;
103 frmr->map_len = PAGE_SIZE;
104 frmr->page_list_len = 1;
105 frmr->page_list->page_list[page_no] =
106 ib_dma_map_single(xprt->sc_cm_id->device,
107 (void *)xdr->head[0].iov_base,
108 PAGE_SIZE, DMA_TO_DEVICE);
109 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
110 frmr->page_list->page_list[page_no]))
111 goto fatal_err;
112 atomic_inc(&xprt->sc_dma_used);
113
114 page_off = xdr->page_base;
115 page_bytes = xdr->page_len + page_off;
116 if (!page_bytes)
117 goto encode_tail;
118
119 /* Map the pages */
120 vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
121 vec->sge[sge_no].iov_len = page_bytes;
122 sge_no++;
123 while (page_bytes) {
124 struct page *page;
125
126 page = xdr->pages[page_no++];
127 sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
128 page_bytes -= sge_bytes;
129
130 frmr->page_list->page_list[page_no] =
131 ib_dma_map_page(xprt->sc_cm_id->device, page, 0,
132 PAGE_SIZE, DMA_TO_DEVICE);
133 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
134 frmr->page_list->page_list[page_no]))
135 goto fatal_err;
136
137 atomic_inc(&xprt->sc_dma_used);
138 page_off = 0; /* reset for next time through loop */
139 frmr->map_len += PAGE_SIZE;
140 frmr->page_list_len++;
141 }
142 vec->count++;
143
144 encode_tail:
145 /* Map tail */
146 if (0 == xdr->tail[0].iov_len)
147 goto done;
148
149 vec->count++;
150 vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
151
152 if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) ==
153 ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) {
154 /*
155 * If head and tail use the same page, we don't need
156 * to map it again.
157 */
158 vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
159 } else {
160 void *va;
161
162 /* Map another page for the tail */
163 page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
164 va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK);
165 vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
166
167 frmr->page_list->page_list[page_no] =
168 ib_dma_map_single(xprt->sc_cm_id->device, va, PAGE_SIZE,
169 DMA_TO_DEVICE);
170 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
171 frmr->page_list->page_list[page_no]))
172 goto fatal_err;
173 atomic_inc(&xprt->sc_dma_used);
174 frmr->map_len += PAGE_SIZE;
175 frmr->page_list_len++;
176 }
177
178 done:
179 if (svc_rdma_fastreg(xprt, frmr))
180 goto fatal_err;
181
182 return 0;
183
184 fatal_err:
185 printk("svcrdma: Error fast registering memory for xprt %p\n", xprt);
186 svc_rdma_put_frmr(xprt, frmr);
187 return -EIO;
188}
189
190static int map_xdr(struct svcxprt_rdma *xprt,
191 struct xdr_buf *xdr,
192 struct svc_rdma_req_map *vec)
75{ 193{
76 int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; 194 int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
77 int sge_no; 195 int sge_no;
@@ -83,6 +201,9 @@ static void xdr_to_sge(struct svcxprt_rdma *xprt,
83 BUG_ON(xdr->len != 201 BUG_ON(xdr->len !=
84 (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); 202 (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
85 203
204 if (xprt->sc_frmr_pg_list_len)
205 return fast_reg_xdr(xprt, xdr, vec);
206
86 /* Skip the first sge, this is for the RPCRDMA header */ 207 /* Skip the first sge, this is for the RPCRDMA header */
87 sge_no = 1; 208 sge_no = 1;
88 209
@@ -116,9 +237,12 @@ static void xdr_to_sge(struct svcxprt_rdma *xprt,
116 237
117 BUG_ON(sge_no > sge_max); 238 BUG_ON(sge_no > sge_max);
118 vec->count = sge_no; 239 vec->count = sge_no;
240 return 0;
119} 241}
120 242
121/* Assumptions: 243/* Assumptions:
244 * - We are using FRMR
245 * - or -
122 * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE 246 * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
123 */ 247 */
124static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, 248static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
@@ -158,30 +282,35 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
158 sge_no = 0; 282 sge_no = 0;
159 283
160 /* Copy the remaining SGE */ 284 /* Copy the remaining SGE */
161 while (bc != 0 && xdr_sge_no < vec->count) { 285 while (bc != 0) {
162 sge[sge_no].lkey = xprt->sc_phys_mr->lkey; 286 sge_bytes = min_t(size_t,
163 sge_bytes = min((size_t)bc, 287 bc, vec->sge[xdr_sge_no].iov_len-sge_off);
164 (size_t)(vec->sge[xdr_sge_no].iov_len-sge_off));
165 sge[sge_no].length = sge_bytes; 288 sge[sge_no].length = sge_bytes;
166 atomic_inc(&xprt->sc_dma_used); 289 if (!vec->frmr) {
167 sge[sge_no].addr = 290 sge[sge_no].addr =
168 ib_dma_map_single(xprt->sc_cm_id->device, 291 ib_dma_map_single(xprt->sc_cm_id->device,
169 (void *) 292 (void *)
170 vec->sge[xdr_sge_no].iov_base + sge_off, 293 vec->sge[xdr_sge_no].iov_base + sge_off,
171 sge_bytes, DMA_TO_DEVICE); 294 sge_bytes, DMA_TO_DEVICE);
172 if (dma_mapping_error(xprt->sc_cm_id->device->dma_device, 295 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
173 sge[sge_no].addr)) 296 sge[sge_no].addr))
174 goto err; 297 goto err;
298 atomic_inc(&xprt->sc_dma_used);
299 sge[sge_no].lkey = xprt->sc_dma_lkey;
300 } else {
301 sge[sge_no].addr = (unsigned long)
302 vec->sge[xdr_sge_no].iov_base + sge_off;
303 sge[sge_no].lkey = vec->frmr->mr->lkey;
304 }
305 ctxt->count++;
306 ctxt->frmr = vec->frmr;
175 sge_off = 0; 307 sge_off = 0;
176 sge_no++; 308 sge_no++;
177 ctxt->count++;
178 xdr_sge_no++; 309 xdr_sge_no++;
310 BUG_ON(xdr_sge_no > vec->count);
179 bc -= sge_bytes; 311 bc -= sge_bytes;
180 } 312 }
181 313
182 BUG_ON(bc != 0);
183 BUG_ON(xdr_sge_no > vec->count);
184
185 /* Prepare WRITE WR */ 314 /* Prepare WRITE WR */
186 memset(&write_wr, 0, sizeof write_wr); 315 memset(&write_wr, 0, sizeof write_wr);
187 ctxt->wr_op = IB_WR_RDMA_WRITE; 316 ctxt->wr_op = IB_WR_RDMA_WRITE;
@@ -226,7 +355,10 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
226 res_ary = (struct rpcrdma_write_array *) 355 res_ary = (struct rpcrdma_write_array *)
227 &rdma_resp->rm_body.rm_chunks[1]; 356 &rdma_resp->rm_body.rm_chunks[1];
228 357
229 max_write = xprt->sc_max_sge * PAGE_SIZE; 358 if (vec->frmr)
359 max_write = vec->frmr->map_len;
360 else
361 max_write = xprt->sc_max_sge * PAGE_SIZE;
230 362
231 /* Write chunks start at the pagelist */ 363 /* Write chunks start at the pagelist */
232 for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; 364 for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
@@ -297,7 +429,10 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
297 res_ary = (struct rpcrdma_write_array *) 429 res_ary = (struct rpcrdma_write_array *)
298 &rdma_resp->rm_body.rm_chunks[2]; 430 &rdma_resp->rm_body.rm_chunks[2];
299 431
300 max_write = xprt->sc_max_sge * PAGE_SIZE; 432 if (vec->frmr)
433 max_write = vec->frmr->map_len;
434 else
435 max_write = xprt->sc_max_sge * PAGE_SIZE;
301 436
302 /* xdr offset starts at RPC message */ 437 /* xdr offset starts at RPC message */
303 for (xdr_off = 0, chunk_no = 0; 438 for (xdr_off = 0, chunk_no = 0;
@@ -307,7 +442,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
307 ch = &arg_ary->wc_array[chunk_no].wc_target; 442 ch = &arg_ary->wc_array[chunk_no].wc_target;
308 write_len = min(xfer_len, ch->rs_length); 443 write_len = min(xfer_len, ch->rs_length);
309 444
310
311 /* Prepare the reply chunk given the length actually 445 /* Prepare the reply chunk given the length actually
312 * written */ 446 * written */
313 rs_offset = get_unaligned(&(ch->rs_offset)); 447 rs_offset = get_unaligned(&(ch->rs_offset));
@@ -366,6 +500,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
366 int byte_count) 500 int byte_count)
367{ 501{
368 struct ib_send_wr send_wr; 502 struct ib_send_wr send_wr;
503 struct ib_send_wr inv_wr;
369 int sge_no; 504 int sge_no;
370 int sge_bytes; 505 int sge_bytes;
371 int page_no; 506 int page_no;
@@ -385,27 +520,45 @@ static int send_reply(struct svcxprt_rdma *rdma,
385 /* Prepare the context */ 520 /* Prepare the context */
386 ctxt->pages[0] = page; 521 ctxt->pages[0] = page;
387 ctxt->count = 1; 522 ctxt->count = 1;
523 ctxt->frmr = vec->frmr;
524 if (vec->frmr)
525 set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
526 else
527 clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
388 528
389 /* Prepare the SGE for the RPCRDMA Header */ 529 /* Prepare the SGE for the RPCRDMA Header */
390 atomic_inc(&rdma->sc_dma_used);
391 ctxt->sge[0].addr = 530 ctxt->sge[0].addr =
392 ib_dma_map_page(rdma->sc_cm_id->device, 531 ib_dma_map_page(rdma->sc_cm_id->device,
393 page, 0, PAGE_SIZE, DMA_TO_DEVICE); 532 page, 0, PAGE_SIZE, DMA_TO_DEVICE);
533 if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
534 goto err;
535 atomic_inc(&rdma->sc_dma_used);
536
394 ctxt->direction = DMA_TO_DEVICE; 537 ctxt->direction = DMA_TO_DEVICE;
538
395 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); 539 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
396 ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey; 540 ctxt->sge[0].lkey = rdma->sc_dma_lkey;
397 541
398 /* Determine how many of our SGE are to be transmitted */ 542 /* Determine how many of our SGE are to be transmitted */
399 for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { 543 for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
400 sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); 544 sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
401 byte_count -= sge_bytes; 545 byte_count -= sge_bytes;
402 atomic_inc(&rdma->sc_dma_used); 546 if (!vec->frmr) {
403 ctxt->sge[sge_no].addr = 547 ctxt->sge[sge_no].addr =
404 ib_dma_map_single(rdma->sc_cm_id->device, 548 ib_dma_map_single(rdma->sc_cm_id->device,
405 vec->sge[sge_no].iov_base, 549 vec->sge[sge_no].iov_base,
406 sge_bytes, DMA_TO_DEVICE); 550 sge_bytes, DMA_TO_DEVICE);
551 if (ib_dma_mapping_error(rdma->sc_cm_id->device,
552 ctxt->sge[sge_no].addr))
553 goto err;
554 atomic_inc(&rdma->sc_dma_used);
555 ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
556 } else {
557 ctxt->sge[sge_no].addr = (unsigned long)
558 vec->sge[sge_no].iov_base;
559 ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey;
560 }
407 ctxt->sge[sge_no].length = sge_bytes; 561 ctxt->sge[sge_no].length = sge_bytes;
408 ctxt->sge[sge_no].lkey = rdma->sc_phys_mr->lkey;
409 } 562 }
410 BUG_ON(byte_count != 0); 563 BUG_ON(byte_count != 0);
411 564
@@ -417,11 +570,16 @@ static int send_reply(struct svcxprt_rdma *rdma,
417 ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; 570 ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
418 ctxt->count++; 571 ctxt->count++;
419 rqstp->rq_respages[page_no] = NULL; 572 rqstp->rq_respages[page_no] = NULL;
420 /* If there are more pages than SGE, terminate SGE list */ 573 /*
574 * If there are more pages than SGE, terminate SGE
575 * list so that svc_rdma_unmap_dma doesn't attempt to
576 * unmap garbage.
577 */
421 if (page_no+1 >= sge_no) 578 if (page_no+1 >= sge_no)
422 ctxt->sge[page_no+1].length = 0; 579 ctxt->sge[page_no+1].length = 0;
423 } 580 }
424 BUG_ON(sge_no > rdma->sc_max_sge); 581 BUG_ON(sge_no > rdma->sc_max_sge);
582 BUG_ON(sge_no > ctxt->count);
425 memset(&send_wr, 0, sizeof send_wr); 583 memset(&send_wr, 0, sizeof send_wr);
426 ctxt->wr_op = IB_WR_SEND; 584 ctxt->wr_op = IB_WR_SEND;
427 send_wr.wr_id = (unsigned long)ctxt; 585 send_wr.wr_id = (unsigned long)ctxt;
@@ -429,12 +587,26 @@ static int send_reply(struct svcxprt_rdma *rdma,
429 send_wr.num_sge = sge_no; 587 send_wr.num_sge = sge_no;
430 send_wr.opcode = IB_WR_SEND; 588 send_wr.opcode = IB_WR_SEND;
431 send_wr.send_flags = IB_SEND_SIGNALED; 589 send_wr.send_flags = IB_SEND_SIGNALED;
590 if (vec->frmr) {
591 /* Prepare INVALIDATE WR */
592 memset(&inv_wr, 0, sizeof inv_wr);
593 inv_wr.opcode = IB_WR_LOCAL_INV;
594 inv_wr.send_flags = IB_SEND_SIGNALED;
595 inv_wr.ex.invalidate_rkey =
596 vec->frmr->mr->lkey;
597 send_wr.next = &inv_wr;
598 }
432 599
433 ret = svc_rdma_send(rdma, &send_wr); 600 ret = svc_rdma_send(rdma, &send_wr);
434 if (ret) 601 if (ret)
435 svc_rdma_put_context(ctxt, 1); 602 goto err;
436 603
437 return ret; 604 return 0;
605
606 err:
607 svc_rdma_put_frmr(rdma, vec->frmr);
608 svc_rdma_put_context(ctxt, 1);
609 return -EIO;
438} 610}
439 611
440void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) 612void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
@@ -477,8 +649,9 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
477 ctxt = svc_rdma_get_context(rdma); 649 ctxt = svc_rdma_get_context(rdma);
478 ctxt->direction = DMA_TO_DEVICE; 650 ctxt->direction = DMA_TO_DEVICE;
479 vec = svc_rdma_get_req_map(); 651 vec = svc_rdma_get_req_map();
480 xdr_to_sge(rdma, &rqstp->rq_res, vec); 652 ret = map_xdr(rdma, &rqstp->rq_res, vec);
481 653 if (ret)
654 goto err0;
482 inline_bytes = rqstp->rq_res.len; 655 inline_bytes = rqstp->rq_res.len;
483 656
484 /* Create the RDMA response header */ 657 /* Create the RDMA response header */
@@ -498,7 +671,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
498 if (ret < 0) { 671 if (ret < 0) {
499 printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", 672 printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
500 ret); 673 ret);
501 goto error; 674 goto err1;
502 } 675 }
503 inline_bytes -= ret; 676 inline_bytes -= ret;
504 677
@@ -508,7 +681,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
508 if (ret < 0) { 681 if (ret < 0) {
509 printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", 682 printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
510 ret); 683 ret);
511 goto error; 684 goto err1;
512 } 685 }
513 inline_bytes -= ret; 686 inline_bytes -= ret;
514 687
@@ -517,9 +690,11 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
517 svc_rdma_put_req_map(vec); 690 svc_rdma_put_req_map(vec);
518 dprintk("svcrdma: send_reply returns %d\n", ret); 691 dprintk("svcrdma: send_reply returns %d\n", ret);
519 return ret; 692 return ret;
520 error: 693
694 err1:
695 put_page(res_page);
696 err0:
521 svc_rdma_put_req_map(vec); 697 svc_rdma_put_req_map(vec);
522 svc_rdma_put_context(ctxt, 0); 698 svc_rdma_put_context(ctxt, 0);
523 put_page(res_page);
524 return ret; 699 return ret;
525} 700}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index fb0dff5e53ea..98f945c5a007 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -335,6 +335,8 @@ static void process_context(struct svcxprt_rdma *xprt,
335 335
336 switch (ctxt->wr_op) { 336 switch (ctxt->wr_op) {
337 case IB_WR_SEND: 337 case IB_WR_SEND:
338 if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
339 svc_rdma_put_frmr(xprt, ctxt->frmr);
338 svc_rdma_put_context(ctxt, 1); 340 svc_rdma_put_context(ctxt, 1);
339 break; 341 break;
340 342