author:    Christoph Hellwig <hch@lst.de>        2016-05-03 12:01:09 -0400
committer: Doug Ledford <dledford@redhat.com>    2016-05-13 13:37:19 -0400
commit:    a060b5629ab066dd1d321430eeb96f70939a1790
tree:      403dd22e8affcb2c92415065ceeac894374d7af3
parent:    d4a85c309b33f93cb211f2fa9d26fa77d0bb7b5e
IB/core: generic RDMA READ/WRITE API
This supports both manual mapping of large SGE lists and use of MRs from the
QP's MR pool, for iWarp or other cases where MRs are the better choice. For
now, MRs are only used for iWarp transports. The user of the RDMA-RW API must
allocate the QP MR pool as well as size the SQ accordingly.

Thanks to Steve Wise for testing, fixing and rewriting the iWarp support,
and to Sagi Grimberg for ideas, reviews and fixes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Doug Ledford <dledford@redhat.com>
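A minimal sketch of the consumer-side QP setup this API expects, assuming a
hypothetical ULP with pd and cq already created and an arbitrary queue depth
of 64; cap.max_rdma_ctxs and port_num are the fields the RW path keys off of:

	struct ib_qp_init_attr init_attr = { };
	struct ib_qp *qp;

	init_attr.cap.max_rdma_ctxs = 64;	/* rdma_rw_ctx structures in flight */
	init_attr.cap.max_send_wr = 64;		/* the ULP's own SENDs; R/W WRs are added on top */
	init_attr.cap.max_recv_wr = 64;
	init_attr.cap.max_send_sge = 1;
	init_attr.cap.max_recv_sge = 1;
	init_attr.port_num = 1;			/* required once max_rdma_ctxs is set */
	init_attr.send_cq = cq;
	init_attr.recv_cq = cq;
	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	init_attr.qp_type = IB_QPT_RC;

	qp = ib_create_qp(pd, &init_attr);
	if (IS_ERR(qp))
		return PTR_ERR(qp);

ib_create_qp() then calls rdma_rw_init_qp() to add up to three send WRs per
context (registration, R/W, invalidation) to max_send_wr, clamped to the
device's max_qp_wr, and rdma_rw_init_mrs() to fill the qp->rdma_mrs pool when
MRs are needed.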
-rw-r--r--  drivers/infiniband/core/Makefile |   2
-rw-r--r--  drivers/infiniband/core/rw.c     | 509
-rw-r--r--  drivers/infiniband/core/verbs.c  |  25
-rw-r--r--  include/rdma/ib_verbs.h          |  14
-rw-r--r--  include/rdma/rw.h                |  69
5 files changed, 617 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 48bd9d829289..26987d9d7e1c 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
				$(user_access-y)
 
-ib_core-y :=			packer.o ud_header.o verbs.o cq.o sysfs.o \
+ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
				device.o fmr_pool.o cache.o netlink.o \
				roce_gid_mgmt.o mr_pool.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
new file mode 100644
index 000000000000..bd700ff6d438
--- /dev/null
+++ b/drivers/infiniband/core/rw.c
@@ -0,0 +1,509 @@
1/*
2 * Copyright (c) 2016 HGST, a Western Digital Company.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13#include <linux/moduleparam.h>
14#include <linux/slab.h>
15#include <rdma/mr_pool.h>
16#include <rdma/rw.h>
17
18enum {
19 RDMA_RW_SINGLE_WR,
20 RDMA_RW_MULTI_WR,
21 RDMA_RW_MR,
22};
23
24static bool rdma_rw_force_mr;
25module_param_named(force_mr, rdma_rw_force_mr, bool, 0);
26MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations");
27
28/*
29 * Check if the device might use memory registration. This is currently only
30 * true for iWarp devices. In the future we can hopefully fine tune this based
31 * on HCA driver input.
32 */
33static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num)
34{
35 if (rdma_protocol_iwarp(dev, port_num))
36 return true;
37 if (unlikely(rdma_rw_force_mr))
38 return true;
39 return false;
40}
41
42/*
43 * Check if the device will use memory registration for this RW operation.
44 * We currently always use memory registrations for iWarp RDMA READs, and
45 * have a debug option to force usage of MRs.
46 *
47 * XXX: In the future we can hopefully fine tune this based on HCA driver
48 * input.
49 */
50static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
51 enum dma_data_direction dir, int dma_nents)
52{
53 if (rdma_protocol_iwarp(dev, port_num) && dir == DMA_FROM_DEVICE)
54 return true;
55 if (unlikely(rdma_rw_force_mr))
56 return true;
57 return false;
58}
59
60static inline u32 rdma_rw_max_sge(struct ib_device *dev,
61 enum dma_data_direction dir)
62{
63 return dir == DMA_TO_DEVICE ?
64 dev->attrs.max_sge : dev->attrs.max_sge_rd;
65}
66
67static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev)
68{
69 /* arbitrary limit to avoid allocating gigantic resources */
70 return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256);
71}
72
73static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
74 struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
75 u32 sg_cnt, u32 offset)
76{
77 u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
78 u32 nents = min(sg_cnt, pages_per_mr);
79 int count = 0, ret;
80
81 reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs);
82 if (!reg->mr)
83 return -EAGAIN;
84
85 if (reg->mr->need_inval) {
86 reg->inv_wr.opcode = IB_WR_LOCAL_INV;
87 reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
88 reg->inv_wr.next = &reg->reg_wr.wr;
89 count++;
90 } else {
91 reg->inv_wr.next = NULL;
92 }
93
94 ret = ib_map_mr_sg(reg->mr, sg, nents, offset, PAGE_SIZE);
95 if (ret < nents) {
96 ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr);
97 return -EINVAL;
98 }
99
100 reg->reg_wr.wr.opcode = IB_WR_REG_MR;
101 reg->reg_wr.mr = reg->mr;
102 reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
103 if (rdma_protocol_iwarp(qp->device, port_num))
104 reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
105 count++;
106
107 reg->sge.addr = reg->mr->iova;
108 reg->sge.length = reg->mr->length;
109 return count;
110}
111
112static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
113 u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
114 u64 remote_addr, u32 rkey, enum dma_data_direction dir)
115{
116 u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
117 int i, j, ret = 0, count = 0;
118
119 ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr;
120 ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL);
121 if (!ctx->reg) {
122 ret = -ENOMEM;
123 goto out;
124 }
125
126 for (i = 0; i < ctx->nr_ops; i++) {
127 struct rdma_rw_reg_ctx *prev = i ? &ctx->reg[i - 1] : NULL;
128 struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
129 u32 nents = min(sg_cnt, pages_per_mr);
130
131 ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt,
132 offset);
133 if (ret < 0)
134 goto out_free;
135 count += ret;
136
137 if (prev) {
138 if (reg->mr->need_inval)
139 prev->wr.wr.next = &reg->inv_wr;
140 else
141 prev->wr.wr.next = &reg->reg_wr.wr;
142 }
143
144 reg->reg_wr.wr.next = &reg->wr.wr;
145
146 reg->wr.wr.sg_list = &reg->sge;
147 reg->wr.wr.num_sge = 1;
148 reg->wr.remote_addr = remote_addr;
149 reg->wr.rkey = rkey;
150 if (dir == DMA_TO_DEVICE) {
151 reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
152 } else if (!rdma_cap_read_inv(qp->device, port_num)) {
153 reg->wr.wr.opcode = IB_WR_RDMA_READ;
154 } else {
155 reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
156 reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
157 }
158 count++;
159
160 remote_addr += reg->sge.length;
161 sg_cnt -= nents;
162 for (j = 0; j < nents; j++)
163 sg = sg_next(sg);
164 offset = 0;
165 }
166
167 ctx->type = RDMA_RW_MR;
168 return count;
169
170out_free:
171 while (--i >= 0)
172 ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
173 kfree(ctx->reg);
174out:
175 return ret;
176}
177
178static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
179 struct scatterlist *sg, u32 sg_cnt, u32 offset,
180 u64 remote_addr, u32 rkey, enum dma_data_direction dir)
181{
182 struct ib_device *dev = qp->pd->device;
183 u32 max_sge = rdma_rw_max_sge(dev, dir);
184 struct ib_sge *sge;
185 u32 total_len = 0, i, j;
186
187 ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge);
188
189 ctx->map.sges = sge = kcalloc(sg_cnt, sizeof(*sge), GFP_KERNEL);
190 if (!ctx->map.sges)
191 goto out;
192
193 ctx->map.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL);
194 if (!ctx->map.wrs)
195 goto out_free_sges;
196
197 for (i = 0; i < ctx->nr_ops; i++) {
198 struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
199 u32 nr_sge = min(sg_cnt, max_sge);
200
201 if (dir == DMA_TO_DEVICE)
202 rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
203 else
204 rdma_wr->wr.opcode = IB_WR_RDMA_READ;
205 rdma_wr->remote_addr = remote_addr + total_len;
206 rdma_wr->rkey = rkey;
207 rdma_wr->wr.sg_list = sge;
208
209 for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
210 rdma_wr->wr.num_sge++;
211
212 sge->addr = ib_sg_dma_address(dev, sg) + offset;
213 sge->length = ib_sg_dma_len(dev, sg) - offset;
214 sge->lkey = qp->pd->local_dma_lkey;
215
216 total_len += sge->length;
217 sge++;
218 sg_cnt--;
219 offset = 0;
220 }
221
222 if (i + 1 < ctx->nr_ops)
223 rdma_wr->wr.next = &ctx->map.wrs[i + 1].wr;
224 }
225
226 ctx->type = RDMA_RW_MULTI_WR;
227 return ctx->nr_ops;
228
229out_free_sges:
230 kfree(ctx->map.sges);
231out:
232 return -ENOMEM;
233}
234
235static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
236 struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
237 enum dma_data_direction dir)
238{
239 struct ib_device *dev = qp->pd->device;
240 struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
241
242 ctx->nr_ops = 1;
243
244 ctx->single.sge.lkey = qp->pd->local_dma_lkey;
245 ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset;
246 ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset;
247
248 memset(rdma_wr, 0, sizeof(*rdma_wr));
249 if (dir == DMA_TO_DEVICE)
250 rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
251 else
252 rdma_wr->wr.opcode = IB_WR_RDMA_READ;
253 rdma_wr->wr.sg_list = &ctx->single.sge;
254 rdma_wr->wr.num_sge = 1;
255 rdma_wr->remote_addr = remote_addr;
256 rdma_wr->rkey = rkey;
257
258 ctx->type = RDMA_RW_SINGLE_WR;
259 return 1;
260}
261
262/**
263 * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
264 * @ctx: context to initialize
265 * @qp: queue pair to operate on
266 * @port_num: port num to which the connection is bound
267 * @sg: scatterlist to READ/WRITE from/to
268 * @sg_cnt: number of entries in @sg
269 * @sg_offset: current byte offset into @sg
270 * @remote_addr: remote address to read/write (relative to @rkey)
271 * @rkey: remote key to operate on
272 * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
273 *
274 * Returns the number of WQEs that will be needed on the QP's send queue
275 * if successful, or a negative error code.
276 */
277int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
278 struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
279 u64 remote_addr, u32 rkey, enum dma_data_direction dir)
280{
281 struct ib_device *dev = qp->pd->device;
282 int ret;
283
284 ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
285 if (!ret)
286 return -ENOMEM;
287 sg_cnt = ret;
288
289 /*
290 * Skip to the S/G entry that sg_offset falls into:
291 */
292 for (;;) {
293 u32 len = ib_sg_dma_len(dev, sg);
294
295 if (sg_offset < len)
296 break;
297
298 sg = sg_next(sg);
299 sg_offset -= len;
300 sg_cnt--;
301 }
302
303 ret = -EIO;
304 if (WARN_ON_ONCE(sg_cnt == 0))
305 goto out_unmap_sg;
306
307 if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) {
308 ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt,
309 sg_offset, remote_addr, rkey, dir);
310 } else if (sg_cnt > 1) {
311 ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset,
312 remote_addr, rkey, dir);
313 } else {
314 ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset,
315 remote_addr, rkey, dir);
316 }
317
318 if (ret < 0)
319 goto out_unmap_sg;
320 return ret;
321
322out_unmap_sg:
323 ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
324 return ret;
325}
326EXPORT_SYMBOL(rdma_rw_ctx_init);
327
328/*
329 * Now that we are going to post the WRs we can update the lkey and need_inval
330 * state on the MRs. If we were doing this at init time, we would get double
331 * or missing invalidations if a context was initialized but not actually
332 * posted.
333 */
334static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval)
335{
336 reg->mr->need_inval = need_inval;
337 ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey));
338 reg->reg_wr.key = reg->mr->lkey;
339 reg->sge.lkey = reg->mr->lkey;
340}
341
342/**
343 * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation
344 * @ctx: context to operate on
345 * @qp: queue pair to operate on
346 * @port_num: port num to which the connection is bound
347 * @cqe: completion queue entry for the last WR
348 * @chain_wr: WR to append to the posted chain
349 *
350 * Return the WR chain for the set of RDMA READ/WRITE operations described by
351 * @ctx, as well as any memory registration operations needed. If @chain_wr
352 * is non-NULL the WR it points to will be appended to the chain of WRs posted.
353 * If @chain_wr is not set @cqe must be set so that the caller gets a
354 * completion notification.
355 */
356struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
357 u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
358{
359 struct ib_send_wr *first_wr, *last_wr;
360 int i;
361
362 switch (ctx->type) {
363 case RDMA_RW_MR:
364 for (i = 0; i < ctx->nr_ops; i++) {
365 rdma_rw_update_lkey(&ctx->reg[i],
366 ctx->reg[i].wr.wr.opcode !=
367 IB_WR_RDMA_READ_WITH_INV);
368 }
369
370 if (ctx->reg[0].inv_wr.next)
371 first_wr = &ctx->reg[0].inv_wr;
372 else
373 first_wr = &ctx->reg[0].reg_wr.wr;
374 last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
375 break;
376 case RDMA_RW_MULTI_WR:
377 first_wr = &ctx->map.wrs[0].wr;
378 last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
379 break;
380 case RDMA_RW_SINGLE_WR:
381 first_wr = &ctx->single.wr.wr;
382 last_wr = &ctx->single.wr.wr;
383 break;
384 default:
385 BUG();
386 }
387
388 if (chain_wr) {
389 last_wr->next = chain_wr;
390 } else {
391 last_wr->wr_cqe = cqe;
392 last_wr->send_flags |= IB_SEND_SIGNALED;
393 }
394
395 return first_wr;
396}
397EXPORT_SYMBOL(rdma_rw_ctx_wrs);
398
399/**
400 * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation
401 * @ctx: context to operate on
402 * @qp: queue pair to operate on
403 * @port_num: port num to which the connection is bound
404 * @cqe: completion queue entry for the last WR
405 * @chain_wr: WR to append to the posted chain
406 *
407 * Post the set of RDMA READ/WRITE operations described by @ctx, as well as
408 * any memory registration operations needed. If @chain_wr is non-NULL the
409 * WR it points to will be appended to the chain of WRs posted. If @chain_wr
410 * is not set @cqe must be set so that the caller gets a completion
411 * notification.
412 */
413int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
414 struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
415{
416 struct ib_send_wr *first_wr, *bad_wr;
417
418 first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr);
419 return ib_post_send(qp, first_wr, &bad_wr);
420}
421EXPORT_SYMBOL(rdma_rw_ctx_post);
422
423/**
424 * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init
425 * @ctx: context to release
426 * @qp: queue pair to operate on
427 * @port_num: port num to which the connection is bound
428 * @sg: scatterlist that was used for the READ/WRITE
429 * @sg_cnt: number of entries in @sg
430 * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
431 */
432void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
433 struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir)
434{
435 int i;
436
437 switch (ctx->type) {
438 case RDMA_RW_MR:
439 for (i = 0; i < ctx->nr_ops; i++)
440 ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
441 kfree(ctx->reg);
442 break;
443 case RDMA_RW_MULTI_WR:
444 kfree(ctx->map.wrs);
445 kfree(ctx->map.sges);
446 break;
447 case RDMA_RW_SINGLE_WR:
448 break;
449 default:
450 BUG();
451 break;
452 }
453
454 ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
455}
456EXPORT_SYMBOL(rdma_rw_ctx_destroy);
457
458void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
459{
460 u32 factor;
461
462 WARN_ON_ONCE(attr->port_num == 0);
463
464 /*
465 * Each context needs at least one RDMA READ or WRITE WR.
466 *
467 * For some hardware we might need more; eventually we should ask the
468 * HCA driver for a multiplier here.
469 */
470 factor = 1;
471
472 /*
473 * If the device needs MRs to perform RDMA READ or WRITE operations,
474 * each context also needs two additional WRs for the MR registration
475 * and invalidation.
476 */
477 if (rdma_rw_can_use_mr(dev, attr->port_num))
478 factor += 2; /* inv + reg */
479
480 attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
481
482 /*
483 * But maybe we were just too high in the sky and the device doesn't
484 * even support all we need, so we'll have to live with what we get.
485 */
486 attr->cap.max_send_wr =
487 min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr);
488}
489
490int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
491{
492 struct ib_device *dev = qp->pd->device;
493 int ret = 0;
494
495 if (rdma_rw_can_use_mr(dev, attr->port_num)) {
496 ret = ib_mr_pool_init(qp, &qp->rdma_mrs,
497 attr->cap.max_rdma_ctxs, IB_MR_TYPE_MEM_REG,
498 rdma_rw_fr_page_list_len(dev));
499 if (ret)
500 return ret;
501 }
502
503 return ret;
504}
505
506void rdma_rw_cleanup_mrs(struct ib_qp *qp)
507{
508 ib_mr_pool_destroy(qp, &qp->rdma_mrs);
509}
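A minimal sketch of the per-I/O path built on the functions above, assuming
the ULP keeps the rdma_rw_ctx in its per-request state and supplies the QP,
port, scatterlist, the peer's address/rkey and a completion entry; the
function name is hypothetical. On success the completion handler for
done_cqe is expected to call rdma_rw_ctx_destroy():

	static int ulp_issue_rdma_read(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
			u8 port_num, struct scatterlist *sgl, u32 sg_cnt,
			u64 remote_addr, u32 rkey, struct ib_cqe *done_cqe)
	{
		int ret;

		/* map the S/G list and build the READ (and, on iWarp, MR) WRs */
		ret = rdma_rw_ctx_init(ctx, qp, port_num, sgl, sg_cnt, 0,
				remote_addr, rkey, DMA_FROM_DEVICE);
		if (ret < 0)
			return ret;

		/* post the whole chain; done_cqe completes on the last WR */
		ret = rdma_rw_ctx_post(ctx, qp, port_num, done_cqe, NULL);
		if (ret)
			rdma_rw_ctx_destroy(ctx, qp, port_num, sgl, sg_cnt,
					DMA_FROM_DEVICE);
		return ret;
	}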
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 76c9c3faac20..566bfb31cadb 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -48,6 +48,7 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_cache.h>
 #include <rdma/ib_addr.h>
+#include <rdma/rw.h>
 
 #include "core_priv.h"
 
@@ -751,6 +752,16 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
 {
	struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
	struct ib_qp *qp;
+	int ret;
+
+	/*
+	 * If the caller is using the RDMA R/W API, calculate the resources
+	 * needed for the RDMA READ/WRITE operations.
+	 *
+	 * Note that these callers need to pass in a port number.
+	 */
+	if (qp_init_attr->cap.max_rdma_ctxs)
+		rdma_rw_init_qp(device, qp_init_attr);
 
	qp = device->create_qp(pd, qp_init_attr, NULL);
	if (IS_ERR(qp))
@@ -764,6 +775,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
	atomic_set(&qp->usecnt, 0);
	qp->mrs_used = 0;
	spin_lock_init(&qp->mr_lock);
+	INIT_LIST_HEAD(&qp->rdma_mrs);
 
	if (qp_init_attr->qp_type == IB_QPT_XRC_TGT)
		return ib_create_xrc_qp(qp, qp_init_attr);
@@ -787,6 +799,16 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
 
	atomic_inc(&pd->usecnt);
	atomic_inc(&qp_init_attr->send_cq->usecnt);
+
+	if (qp_init_attr->cap.max_rdma_ctxs) {
+		ret = rdma_rw_init_mrs(qp, qp_init_attr);
+		if (ret) {
+			pr_err("failed to init MR pool ret= %d\n", ret);
+			ib_destroy_qp(qp);
+			qp = ERR_PTR(ret);
+		}
+	}
+
	return qp;
 }
 EXPORT_SYMBOL(ib_create_qp);
@@ -1271,6 +1293,9 @@ int ib_destroy_qp(struct ib_qp *qp)
	rcq = qp->recv_cq;
	srq = qp->srq;
 
+	if (!qp->uobject)
+		rdma_rw_cleanup_mrs(qp);
+
	ret = qp->device->destroy_qp(qp);
	if (!ret) {
		if (pd)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 3f66647749ca..dd8e15dfc1a8 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -931,6 +931,13 @@ struct ib_qp_cap {
	u32	max_send_sge;
	u32	max_recv_sge;
	u32	max_inline_data;
+
+	/*
+	 * Maximum number of rdma_rw_ctx structures in flight at a time.
+	 * ib_create_qp() will calculate the number of WRs and MRs needed
+	 * based on this.
+	 */
+	u32	max_rdma_ctxs;
 };
 
 enum ib_sig_type {
@@ -1002,7 +1009,11 @@ struct ib_qp_init_attr {
	enum ib_sig_type	sq_sig_type;
	enum ib_qp_type		qp_type;
	enum ib_qp_create_flags	create_flags;
-	u8			port_num; /* special QP types only */
+
+	/*
+	 * Only needed for special QP types, or when using the RW API.
+	 */
+	u8			port_num;
 };
 
 struct ib_qp_open_attr {
@@ -1423,6 +1434,7 @@ struct ib_qp {
	struct ib_cq	       *recv_cq;
	spinlock_t		mr_lock;
	int			mrs_used;
+	struct list_head	rdma_mrs;
	struct ib_srq	       *srq;
	struct ib_xrcd	       *xrcd; /* XRC TGT QPs only */
	struct list_head	xrcd_list;
diff --git a/include/rdma/rw.h b/include/rdma/rw.h
new file mode 100644
index 000000000000..d3896bb9134b
--- /dev/null
+++ b/include/rdma/rw.h
@@ -0,0 +1,69 @@
1/*
2 * Copyright (c) 2016 HGST, a Western Digital Company.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13#ifndef _RDMA_RW_H
14#define _RDMA_RW_H
15
16#include <linux/dma-mapping.h>
17#include <linux/scatterlist.h>
18#include <rdma/ib_verbs.h>
19#include <rdma/rdma_cm.h>
20#include <rdma/mr_pool.h>
21
22struct rdma_rw_ctx {
23 /* number of RDMA READ/WRITE WRs (not counting MR WRs) */
24 u32 nr_ops;
25
26 /* tag for the union below: */
27 u8 type;
28
29 union {
30 /* for mapping a single SGE: */
31 struct {
32 struct ib_sge sge;
33 struct ib_rdma_wr wr;
34 } single;
35
36 /* for mapping of multiple SGEs: */
37 struct {
38 struct ib_sge *sges;
39 struct ib_rdma_wr *wrs;
40 } map;
41
42 /* for registering multiple WRs: */
43 struct rdma_rw_reg_ctx {
44 struct ib_sge sge;
45 struct ib_rdma_wr wr;
46 struct ib_reg_wr reg_wr;
47 struct ib_send_wr inv_wr;
48 struct ib_mr *mr;
49 } *reg;
50 };
51};
52
53int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
54 struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
55 u64 remote_addr, u32 rkey, enum dma_data_direction dir);
56void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
57 struct scatterlist *sg, u32 sg_cnt,
58 enum dma_data_direction dir);
59
60struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
61 u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr);
62int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
63 struct ib_cqe *cqe, struct ib_send_wr *chain_wr);
64
65void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr);
66int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr);
67void rdma_rw_cleanup_mrs(struct ib_qp *qp);
68
69#endif /* _RDMA_RW_H */
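Finally, for the chained variant documented at rdma_rw_ctx_wrs() above: a
target-style ULP can append its own reply SEND behind the R/W chain and post
everything with one ib_post_send(). A hedged sketch, with a hypothetical
function name and with send_wr assumed to carry the ULP's wr_cqe and
IB_SEND_SIGNALED, which is why @cqe may be NULL here:

	static int ulp_write_data_and_reply(struct rdma_rw_ctx *ctx,
			struct ib_qp *qp, u8 port_num, struct ib_send_wr *send_wr)
	{
		struct ib_send_wr *first_wr, *bad_wr;

		/* send_wr is appended after the RDMA WRITE (and MR) WRs */
		first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, NULL, send_wr);
		return ib_post_send(qp, first_wr, &bad_wr);
	}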