author     Jens Axboe <axboe@kernel.dk>    2017-11-29 11:21:50 -0500
committer  Jens Axboe <axboe@kernel.dk>    2017-11-29 11:21:50 -0500
commit     ed565371e368f014db237aacf42b27b40b1bd247 (patch)
tree       3d950c90e216b27175ccfb193002c798af4f5fc2
parent     2967acbb257a6a9bf912f4778b727e00972eac9b (diff)
parent     7e5dd57ef3081ff6c03908d786ed5087f6fbb7ae (diff)
Merge branch 'nvme-4.15' of git://git.infradead.org/nvme into for-linus
Pull NVMe fixes from Christoph:
"A few more nvme updates for 4.15. A single small PCIe fix, and a number
of patches for RDMA that are a little larger than what I'd like to see
for -rc2, but they fix important issues seen in the wild."
-rw-r--r--   drivers/nvme/host/pci.c     1
-rw-r--r--   drivers/nvme/host/rdma.c  234
2 files changed, 119 insertions, 116 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 617374762b7c..f5800c3c9082 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1759,6 +1759,7 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
                         dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
                         dev->host_mem_descs, dev->host_mem_descs_dma);
         dev->host_mem_descs = NULL;
+        dev->nr_host_mem_descs = 0;
 }
 
 static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 2c597105a6bf..37af56596be6 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <rdma/mr_pool.h>
 #include <linux/err.h>
 #include <linux/string.h>
 #include <linux/atomic.h>
@@ -59,6 +60,9 @@ struct nvme_rdma_request {
         struct nvme_request     req;
         struct ib_mr            *mr;
         struct nvme_rdma_qe     sqe;
+        union nvme_result       result;
+        __le16                  status;
+        refcount_t              ref;
         struct ib_sge           sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
         u32                     num_sge;
         int                     nents;
@@ -73,11 +77,11 @@ struct nvme_rdma_request {
 enum nvme_rdma_queue_flags {
         NVME_RDMA_Q_ALLOCATED   = 0,
         NVME_RDMA_Q_LIVE        = 1,
+        NVME_RDMA_Q_TR_READY    = 2,
 };
 
 struct nvme_rdma_queue {
         struct nvme_rdma_qe     *rsp_ring;
-        atomic_t                sig_count;
         int                     queue_size;
         size_t                  cmnd_capsule_len;
         struct nvme_rdma_ctrl   *ctrl;
@@ -258,32 +262,6 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
         return ret;
 }
 
-static int nvme_rdma_reinit_request(void *data, struct request *rq)
-{
-        struct nvme_rdma_ctrl *ctrl = data;
-        struct nvme_rdma_device *dev = ctrl->device;
-        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
-        int ret = 0;
-
-        if (WARN_ON_ONCE(!req->mr))
-                return 0;
-
-        ib_dereg_mr(req->mr);
-
-        req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
-                        ctrl->max_fr_pages);
-        if (IS_ERR(req->mr)) {
-                ret = PTR_ERR(req->mr);
-                req->mr = NULL;
-                goto out;
-        }
-
-        req->mr->need_inval = false;
-
-out:
-        return ret;
-}
-
 static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
                 struct request *rq, unsigned int hctx_idx)
 {
@@ -293,9 +271,6 @@ static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
         struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
         struct nvme_rdma_device *dev = queue->device;
 
-        if (req->mr)
-                ib_dereg_mr(req->mr);
-
         nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
                         DMA_TO_DEVICE);
 }
@@ -317,21 +292,9 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
         if (ret)
                 return ret;
 
-        req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
-                        ctrl->max_fr_pages);
-        if (IS_ERR(req->mr)) {
-                ret = PTR_ERR(req->mr);
-                goto out_free_qe;
-        }
-
         req->queue = queue;
 
         return 0;
-
-out_free_qe:
-        nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
-                        DMA_TO_DEVICE);
-        return -ENOMEM;
 }
 
 static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
@@ -428,10 +391,23 @@ out_err:
 
 static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
 {
-        struct nvme_rdma_device *dev = queue->device;
-        struct ib_device *ibdev = dev->dev;
+        struct nvme_rdma_device *dev;
+        struct ib_device *ibdev;
 
-        rdma_destroy_qp(queue->cm_id);
+        if (!test_and_clear_bit(NVME_RDMA_Q_TR_READY, &queue->flags))
+                return;
+
+        dev = queue->device;
+        ibdev = dev->dev;
+
+        ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
+
+        /*
+         * The cm_id object might have been destroyed during RDMA connection
+         * establishment error flow to avoid getting other cma events, thus
+         * the destruction of the QP shouldn't use rdma_cm API.
+         */
+        ib_destroy_qp(queue->qp);
         ib_free_cq(queue->ib_cq);
 
         nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
@@ -440,6 +416,12 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
         nvme_rdma_dev_put(dev);
 }
 
+static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
+{
+        return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
+                     ibdev->attrs.max_fast_reg_page_list_len);
+}
+
 static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
 {
         struct ib_device *ibdev;
@@ -482,8 +464,24 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
                 goto out_destroy_qp;
         }
 
+        ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
+                              queue->queue_size,
+                              IB_MR_TYPE_MEM_REG,
+                              nvme_rdma_get_max_fr_pages(ibdev));
+        if (ret) {
+                dev_err(queue->ctrl->ctrl.device,
+                        "failed to initialize MR pool sized %d for QID %d\n",
+                        queue->queue_size, idx);
+                goto out_destroy_ring;
+        }
+
+        set_bit(NVME_RDMA_Q_TR_READY, &queue->flags);
+
         return 0;
 
+out_destroy_ring:
+        nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
+                            sizeof(struct nvme_completion), DMA_FROM_DEVICE);
 out_destroy_qp:
         rdma_destroy_qp(queue->cm_id);
 out_destroy_ib_cq:
@@ -510,7 +508,6 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
         queue->cmnd_capsule_len = sizeof(struct nvme_command);
 
         queue->queue_size = queue_size;
-        atomic_set(&queue->sig_count, 0);
 
         queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
                         RDMA_PS_TCP, IB_QPT_RC);
@@ -546,6 +543,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
 
 out_destroy_cm_id:
         rdma_destroy_id(queue->cm_id);
+        nvme_rdma_destroy_queue_ib(queue);
         return ret;
 }
 
@@ -756,8 +754,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 
         ctrl->device = ctrl->queues[0].device;
 
-        ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS,
-                ctrl->device->dev->attrs.max_fast_reg_page_list_len);
+        ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
 
         if (new) {
                 ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
@@ -771,10 +768,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
                         error = PTR_ERR(ctrl->ctrl.admin_q);
                         goto out_free_tagset;
                 }
-        } else {
-                error = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
-                if (error)
-                        goto out_free_queue;
         }
 
         error = nvme_rdma_start_queue(ctrl, 0);
@@ -854,10 +847,6 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
                         goto out_free_tag_set;
                 }
         } else {
-                ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
-                if (ret)
-                        goto out_free_io_queues;
-
                 blk_mq_update_nr_hw_queues(&ctrl->tag_set,
                         ctrl->ctrl.queue_count - 1);
         }
@@ -1018,8 +1007,18 @@ static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
 
 static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-        if (unlikely(wc->status != IB_WC_SUCCESS))
+        struct nvme_rdma_request *req =
+                container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe);
+        struct request *rq = blk_mq_rq_from_pdu(req);
+
+        if (unlikely(wc->status != IB_WC_SUCCESS)) {
                 nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
+                return;
+        }
+
+        if (refcount_dec_and_test(&req->ref))
+                nvme_end_request(rq, req->status, req->result);
+
 }
 
 static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
@@ -1030,7 +1029,7 @@ static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
                 .opcode             = IB_WR_LOCAL_INV,
                 .next               = NULL,
                 .num_sge            = 0,
-                .send_flags         = 0,
+                .send_flags         = IB_SEND_SIGNALED,
                 .ex.invalidate_rkey = req->mr->rkey,
         };
 
@@ -1044,22 +1043,15 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
                 struct request *rq)
 {
         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
-        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
         struct nvme_rdma_device *dev = queue->device;
         struct ib_device *ibdev = dev->dev;
-        int res;
 
         if (!blk_rq_bytes(rq))
                 return;
 
-        if (req->mr->need_inval && test_bit(NVME_RDMA_Q_LIVE, &req->queue->flags)) {
-                res = nvme_rdma_inv_rkey(queue, req);
-                if (unlikely(res < 0)) {
-                        dev_err(ctrl->ctrl.device,
-                                "Queueing INV WR for rkey %#x failed (%d)\n",
-                                req->mr->rkey, res);
-                        nvme_rdma_error_recovery(queue->ctrl);
-                }
+        if (req->mr) {
+                ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
+                req->mr = NULL;
         }
 
         ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
@@ -1118,12 +1110,18 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
         struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
         int nr;
 
+        req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs);
+        if (WARN_ON_ONCE(!req->mr))
+                return -EAGAIN;
+
         /*
          * Align the MR to a 4K page size to match the ctrl page size and
          * the block virtual boundary.
          */
         nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
         if (unlikely(nr < count)) {
+                ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
+                req->mr = NULL;
                 if (nr < 0)
                         return nr;
                 return -EINVAL;
@@ -1142,8 +1140,6 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
                         IB_ACCESS_REMOTE_READ |
                         IB_ACCESS_REMOTE_WRITE;
 
-        req->mr->need_inval = true;
-
         sg->addr = cpu_to_le64(req->mr->iova);
         put_unaligned_le24(req->mr->length, sg->length);
         put_unaligned_le32(req->mr->rkey, sg->key);
@@ -1163,7 +1159,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 
         req->num_sge = 1;
         req->inline_data = false;
-        req->mr->need_inval = false;
+        refcount_set(&req->ref, 2); /* send and recv completions */
 
         c->common.flags |= NVME_CMD_SGL_METABUF;
 
@@ -1200,25 +1196,24 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 
 static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-        if (unlikely(wc->status != IB_WC_SUCCESS))
-                nvme_rdma_wr_error(cq, wc, "SEND");
-}
+        struct nvme_rdma_qe *qe =
+                container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
+        struct nvme_rdma_request *req =
+                container_of(qe, struct nvme_rdma_request, sqe);
+        struct request *rq = blk_mq_rq_from_pdu(req);
 
-/*
- * We want to signal completion at least every queue depth/2. This returns the
- * largest power of two that is not above half of (queue size + 1) to optimize
- * (avoid divisions).
- */
-static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
-{
-        int limit = 1 << ilog2((queue->queue_size + 1) / 2);
+        if (unlikely(wc->status != IB_WC_SUCCESS)) {
+                nvme_rdma_wr_error(cq, wc, "SEND");
+                return;
+        }
 
-        return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0;
+        if (refcount_dec_and_test(&req->ref))
+                nvme_end_request(rq, req->status, req->result);
 }
 
 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
                 struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
-                struct ib_send_wr *first, bool flush)
+                struct ib_send_wr *first)
 {
         struct ib_send_wr wr, *bad_wr;
         int ret;
@@ -1227,31 +1222,12 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
         sge->length = sizeof(struct nvme_command),
         sge->lkey = queue->device->pd->local_dma_lkey;
 
-        qe->cqe.done = nvme_rdma_send_done;
-
         wr.next = NULL;
         wr.wr_cqe = &qe->cqe;
         wr.sg_list = sge;
         wr.num_sge = num_sge;
         wr.opcode = IB_WR_SEND;
-        wr.send_flags = 0;
-
-        /*
-         * Unsignalled send completions are another giant desaster in the
-         * IB Verbs spec: If we don't regularly post signalled sends
-         * the send queue will fill up and only a QP reset will rescue us.
-         * Would have been way to obvious to handle this in hardware or
-         * at least the RDMA stack..
-         *
-         * Always signal the flushes. The magic request used for the flush
-         * sequencer is not allocated in our driver's tagset and it's
-         * triggered to be freed by blk_cleanup_queue(). So we need to
-         * always mark it as signaled to ensure that the "wr_cqe", which is
-         * embedded in request's payload, is not freed when __ib_process_cq()
-         * calls wr_cqe->done().
-         */
-        if (nvme_rdma_queue_sig_limit(queue) || flush)
-                wr.send_flags |= IB_SEND_SIGNALED;
+        wr.send_flags = IB_SEND_SIGNALED;
 
         if (first)
                 first->next = &wr;
@@ -1301,6 +1277,12 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
         return queue->ctrl->tag_set.tags[queue_idx - 1];
 }
 
+static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+        if (unlikely(wc->status != IB_WC_SUCCESS))
+                nvme_rdma_wr_error(cq, wc, "ASYNC");
+}
+
 static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
 {
         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
@@ -1319,10 +1301,12 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
         cmd->common.flags |= NVME_CMD_SGL_METABUF;
         nvme_rdma_set_sg_null(cmd);
 
+        sqe->cqe.done = nvme_rdma_async_done;
+
         ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
                         DMA_TO_DEVICE);
 
-        ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false);
+        ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
         WARN_ON_ONCE(ret);
 }
 
@@ -1343,14 +1327,34 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
         }
         req = blk_mq_rq_to_pdu(rq);
 
-        if (rq->tag == tag)
-                ret = 1;
+        req->status = cqe->status;
+        req->result = cqe->result;
+
+        if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
+                if (unlikely(wc->ex.invalidate_rkey != req->mr->rkey)) {
+                        dev_err(queue->ctrl->ctrl.device,
+                                "Bogus remote invalidation for rkey %#x\n",
+                                req->mr->rkey);
+                        nvme_rdma_error_recovery(queue->ctrl);
+                }
+        } else if (req->mr) {
+                ret = nvme_rdma_inv_rkey(queue, req);
+                if (unlikely(ret < 0)) {
+                        dev_err(queue->ctrl->ctrl.device,
+                                "Queueing INV WR for rkey %#x failed (%d)\n",
+                                req->mr->rkey, ret);
+                        nvme_rdma_error_recovery(queue->ctrl);
+                }
+                /* the local invalidation completion will end the request */
+                return 0;
+        }
 
-        if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) &&
-            wc->ex.invalidate_rkey == req->mr->rkey)
-                req->mr->need_inval = false;
+        if (refcount_dec_and_test(&req->ref)) {
+                if (rq->tag == tag)
+                        ret = 1;
+                nvme_end_request(rq, req->status, req->result);
+        }
 
-        nvme_end_request(rq, cqe->status, cqe->result);
         return ret;
 }
 
@@ -1607,7 +1611,6 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
         struct nvme_rdma_qe *sqe = &req->sqe;
         struct nvme_command *c = sqe->data;
-        bool flush = false;
         struct ib_device *dev;
         blk_status_t ret;
         int err;
@@ -1636,13 +1639,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
                 goto err;
         }
 
+        sqe->cqe.done = nvme_rdma_send_done;
+
         ib_dma_sync_single_for_device(dev, sqe->dma,
                         sizeof(struct nvme_command), DMA_TO_DEVICE);
 
-        if (req_op(rq) == REQ_OP_FLUSH)
-                flush = true;
         err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
-                        req->mr->need_inval ? &req->reg_wr.wr : NULL, flush);
+                        req->mr ? &req->reg_wr.wr : NULL);
         if (unlikely(err)) {
                 nvme_rdma_unmap_data(queue, rq);
                 goto err;
@@ -1790,7 +1793,6 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
         .submit_async_event     = nvme_rdma_submit_async_event,
         .delete_ctrl            = nvme_rdma_delete_ctrl,
         .get_address            = nvmf_get_address,
-        .reinit_request         = nvme_rdma_reinit_request,
 };
 
 static inline bool
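
The other recurring change in the rdma.c hunks above is the completion scheme: a request now carries a refcount_t initialized to 2, and the SEND completion and the NVMe response (or LOCAL_INV completion) each drop one reference, so the request is only ended once both sides have finished. A minimal sketch of that pattern, with hypothetical example_* names:

/* At submission: one reference for the SEND completion, one for the response. */
static void example_start_request(struct nvme_rdma_request *req)
{
        refcount_set(&req->ref, 2);
}

/* Called from both the send-done and the response/LOCAL_INV-done paths. */
static void example_complete_one_side(struct nvme_rdma_request *req,
                struct request *rq)
{
        /* Whichever side drops the last reference ends the blk-mq request. */
        if (refcount_dec_and_test(&req->ref))
                nvme_end_request(rq, req->status, req->result);
}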