author     Israel Rukshin <israelr@mellanox.com>  2017-11-26 05:40:55 -0500
committer  Christoph Hellwig <hch@lst.de>         2017-11-26 09:33:32 -0500
commit     f41725bbe16b0773302c0cc7dc2e89f54828712d (patch)
tree       75b1568b766b158c5d4395dc1f3e60826dfab86a
parent     3ef0279bb0031f67537bd8972899a6a23d3064d7 (diff)
nvme-rdma: Use mr pool
Currently, blk_mq_tagset_iter() iterates over the initial hctx tags only. If an I/O scheduler is used, it does not iterate over the hctx scheduler tags, so the static requests are never updated. For example, on an NVMe over Fabrics RDMA host this means the scheduler requests are not reinitialized, and thus not all memory regions are re-registered during tagset re-initialization in the reconnect flow. This may lead to a memory registration error:

  "MEMREG for CQE 0xffff88044c14dce8 failed with status memory management operation error (6)"

With this commit we no longer need to reinit the requests, which fixes this failure.

Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
-rw-r--r--  drivers/nvme/host/rdma.c  |  95
1 file changed, 37 insertions(+), 58 deletions(-)
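The patch replaces per-request MR allocation (and the reinit_request path) with the generic ib_mr_pool attached to the queue pair. As a rough orientation before the patch body, here is a minimal sketch of that pool lifecycle using only the ib_mr_pool_* calls the diff itself introduces; the helper names (setup_queue_mrs, use_one_mr, teardown_queue_mrs) and error handling are illustrative, not part of the driver.

#include <rdma/ib_verbs.h>
#include <rdma/mr_pool.h>

/* Hypothetical helpers for illustration; only the ib_mr_pool_* calls mirror the patch. */

static int setup_queue_mrs(struct ib_qp *qp, int queue_size, u32 max_sg)
{
	/* Pre-allocate one MEM_REG MR per outstanding request on this QP. */
	return ib_mr_pool_init(qp, &qp->rdma_mrs, queue_size,
			       IB_MR_TYPE_MEM_REG, max_sg);
}

static int use_one_mr(struct ib_qp *qp)
{
	/* Per I/O: borrow an MR from the pool instead of allocating one. */
	struct ib_mr *mr = ib_mr_pool_get(qp, &qp->rdma_mrs);

	if (!mr)
		return -EAGAIN;	/* pool exhausted */

	/* ... ib_map_mr_sg() and post the registration WR here ... */

	/* When the request completes (or is invalidated), hand the MR back. */
	ib_mr_pool_put(qp, &qp->rdma_mrs, mr);
	return 0;
}

static void teardown_queue_mrs(struct ib_qp *qp)
{
	/* Deregisters every pooled MR; done before destroying the QP. */
	ib_mr_pool_destroy(qp, &qp->rdma_mrs);
}

Because the pool is sized to the queue depth at queue creation time, there is nothing left to re-register on reconnect, which is why the reinit_request callback and the need_inval bookkeeping can be dropped below.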
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 3a952d458e7c..02ef0771f6b9 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <rdma/mr_pool.h>
 #include <linux/err.h>
 #include <linux/string.h>
 #include <linux/atomic.h>
@@ -260,32 +261,6 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
 	return ret;
 }
 
-static int nvme_rdma_reinit_request(void *data, struct request *rq)
-{
-	struct nvme_rdma_ctrl *ctrl = data;
-	struct nvme_rdma_device *dev = ctrl->device;
-	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
-	int ret = 0;
-
-	if (WARN_ON_ONCE(!req->mr))
-		return 0;
-
-	ib_dereg_mr(req->mr);
-
-	req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
-			ctrl->max_fr_pages);
-	if (IS_ERR(req->mr)) {
-		ret = PTR_ERR(req->mr);
-		req->mr = NULL;
-		goto out;
-	}
-
-	req->mr->need_inval = false;
-
-out:
-	return ret;
-}
-
 static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
 		struct request *rq, unsigned int hctx_idx)
 {
@@ -295,9 +270,6 @@ static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
 	struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
 	struct nvme_rdma_device *dev = queue->device;
 
-	if (req->mr)
-		ib_dereg_mr(req->mr);
-
 	nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
 			DMA_TO_DEVICE);
 }
@@ -319,21 +291,9 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
 	if (ret)
 		return ret;
 
-	req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
-			ctrl->max_fr_pages);
-	if (IS_ERR(req->mr)) {
-		ret = PTR_ERR(req->mr);
-		goto out_free_qe;
-	}
-
 	req->queue = queue;
 
 	return 0;
-
-out_free_qe:
-	nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
-			DMA_TO_DEVICE);
-	return -ENOMEM;
 }
 
 static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
@@ -433,6 +393,8 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
 	struct nvme_rdma_device *dev = queue->device;
 	struct ib_device *ibdev = dev->dev;
 
+	ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
+
 	rdma_destroy_qp(queue->cm_id);
 	ib_free_cq(queue->ib_cq);
 
@@ -442,6 +404,12 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
 	nvme_rdma_dev_put(dev);
 }
 
+static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
+{
+	return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
+		     ibdev->attrs.max_fast_reg_page_list_len);
+}
+
 static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
 {
 	struct ib_device *ibdev;
@@ -484,8 +452,22 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
 		goto out_destroy_qp;
 	}
 
+	ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
+			      queue->queue_size,
+			      IB_MR_TYPE_MEM_REG,
+			      nvme_rdma_get_max_fr_pages(ibdev));
+	if (ret) {
+		dev_err(queue->ctrl->ctrl.device,
+			"failed to initialize MR pool sized %d for QID %d\n",
+			queue->queue_size, idx);
+		goto out_destroy_ring;
+	}
+
 	return 0;
 
+out_destroy_ring:
+	nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
+			sizeof(struct nvme_completion), DMA_FROM_DEVICE);
 out_destroy_qp:
 	rdma_destroy_qp(queue->cm_id);
 out_destroy_ib_cq:
@@ -757,8 +739,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 
 	ctrl->device = ctrl->queues[0].device;
 
-	ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS,
-		ctrl->device->dev->attrs.max_fast_reg_page_list_len);
+	ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
 
 	if (new) {
 		ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
@@ -772,10 +753,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 			error = PTR_ERR(ctrl->ctrl.admin_q);
 			goto out_free_tagset;
 		}
-	} else {
-		error = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
-		if (error)
-			goto out_free_queue;
 	}
 
 	error = nvme_rdma_start_queue(ctrl, 0);
@@ -855,10 +832,6 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
 			goto out_free_tag_set;
 		}
 	} else {
-		ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
-		if (ret)
-			goto out_free_io_queues;
-
 		blk_mq_update_nr_hw_queues(&ctrl->tag_set,
 			ctrl->ctrl.queue_count - 1);
 	}
@@ -1061,6 +1034,11 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
 	if (!blk_rq_bytes(rq))
 		return;
 
+	if (req->mr) {
+		ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
+		req->mr = NULL;
+	}
+
 	ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
 			req->nents, rq_data_dir(rq) ==
 		    WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
@@ -1117,12 +1095,18 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
 	struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
 	int nr;
 
+	req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs);
+	if (WARN_ON_ONCE(!req->mr))
+		return -EAGAIN;
+
 	/*
 	 * Align the MR to a 4K page size to match the ctrl page size and
 	 * the block virtual boundary.
 	 */
 	nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
 	if (unlikely(nr < count)) {
+		ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
+		req->mr = NULL;
 		if (nr < 0)
 			return nr;
 		return -EINVAL;
@@ -1141,8 +1125,6 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
 			IB_ACCESS_REMOTE_READ |
 			IB_ACCESS_REMOTE_WRITE;
 
-	req->mr->need_inval = true;
-
 	sg->addr = cpu_to_le64(req->mr->iova);
 	put_unaligned_le24(req->mr->length, sg->length);
 	put_unaligned_le32(req->mr->rkey, sg->key);
@@ -1162,7 +1144,6 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 
 	req->num_sge = 1;
 	req->inline_data = false;
-	req->mr->need_inval = false;
 	refcount_set(&req->ref, 2); /* send and recv completions */
 
 	c->common.flags |= NVME_CMD_SGL_METABUF;
@@ -1341,8 +1322,7 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
 				req->mr->rkey);
 			nvme_rdma_error_recovery(queue->ctrl);
 		}
-		req->mr->need_inval = false;
-	} else if (req->mr->need_inval) {
+	} else if (req->mr) {
 		ret = nvme_rdma_inv_rkey(queue, req);
 		if (unlikely(ret < 0)) {
 			dev_err(queue->ctrl->ctrl.device,
@@ -1650,7 +1630,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 			sizeof(struct nvme_command), DMA_TO_DEVICE);
 
 	err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
-			req->mr->need_inval ? &req->reg_wr.wr : NULL);
+			req->mr ? &req->reg_wr.wr : NULL);
 	if (unlikely(err)) {
 		nvme_rdma_unmap_data(queue, rq);
 		goto err;
@@ -1798,7 +1778,6 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
 	.submit_async_event	= nvme_rdma_submit_async_event,
 	.delete_ctrl		= nvme_rdma_delete_ctrl,
 	.get_address		= nvmf_get_address,
-	.reinit_request		= nvme_rdma_reinit_request,
 };
 
 static inline bool