aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSagi Grimberg <sagi@grimberg.me>2017-11-23 10:35:21 -0500
committerChristoph Hellwig <hch@lst.de>2017-11-26 09:33:32 -0500
commitb4b591c87f2b0f4ebaf3a68d4f13873b241aa584 (patch)
treedcde717c1d834f5f5582d7c32ef50ae59e08ec27
parent6c4ca1e36cdc1a0a7a84797804b87920ccbebf51 (diff)
nvme-rdma: don't suppress send completions
The entire completion suppression mechanism is currently broken because the HCA might retry a send operation (due to a dropped ack) after the nvme transaction has completed. In order to handle this, we signal all send completions and introduce a separate done handler for async events, as they will be handled differently (they don't include in-capsule data by definition).

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
-rw-r--r--drivers/nvme/host/rdma.c54
1 files changed, 14 insertions, 40 deletions
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 2c597105a6bf..61511bed8aca 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -77,7 +77,6 @@ enum nvme_rdma_queue_flags {
77 77
78struct nvme_rdma_queue { 78struct nvme_rdma_queue {
79 struct nvme_rdma_qe *rsp_ring; 79 struct nvme_rdma_qe *rsp_ring;
80 atomic_t sig_count;
81 int queue_size; 80 int queue_size;
82 size_t cmnd_capsule_len; 81 size_t cmnd_capsule_len;
83 struct nvme_rdma_ctrl *ctrl; 82 struct nvme_rdma_ctrl *ctrl;
@@ -510,7 +509,6 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
510 queue->cmnd_capsule_len = sizeof(struct nvme_command); 509 queue->cmnd_capsule_len = sizeof(struct nvme_command);
511 510
512 queue->queue_size = queue_size; 511 queue->queue_size = queue_size;
513 atomic_set(&queue->sig_count, 0);
514 512
515 queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, 513 queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
516 RDMA_PS_TCP, IB_QPT_RC); 514 RDMA_PS_TCP, IB_QPT_RC);
@@ -1204,21 +1202,9 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
1204 nvme_rdma_wr_error(cq, wc, "SEND"); 1202 nvme_rdma_wr_error(cq, wc, "SEND");
1205} 1203}
1206 1204
1207/*
1208 * We want to signal completion at least every queue depth/2. This returns the
1209 * largest power of two that is not above half of (queue size + 1) to optimize
1210 * (avoid divisions).
1211 */
1212static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
1213{
1214 int limit = 1 << ilog2((queue->queue_size + 1) / 2);
1215
1216 return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0;
1217}
1218
1219static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, 1205static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
1220 struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, 1206 struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
1221 struct ib_send_wr *first, bool flush) 1207 struct ib_send_wr *first)
1222{ 1208{
1223 struct ib_send_wr wr, *bad_wr; 1209 struct ib_send_wr wr, *bad_wr;
1224 int ret; 1210 int ret;
@@ -1227,31 +1213,12 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
1227 sge->length = sizeof(struct nvme_command), 1213 sge->length = sizeof(struct nvme_command),
1228 sge->lkey = queue->device->pd->local_dma_lkey; 1214 sge->lkey = queue->device->pd->local_dma_lkey;
1229 1215
1230 qe->cqe.done = nvme_rdma_send_done;
1231
1232 wr.next = NULL; 1216 wr.next = NULL;
1233 wr.wr_cqe = &qe->cqe; 1217 wr.wr_cqe = &qe->cqe;
1234 wr.sg_list = sge; 1218 wr.sg_list = sge;
1235 wr.num_sge = num_sge; 1219 wr.num_sge = num_sge;
1236 wr.opcode = IB_WR_SEND; 1220 wr.opcode = IB_WR_SEND;
1237 wr.send_flags = 0; 1221 wr.send_flags = IB_SEND_SIGNALED;
1238
1239 /*
1240 * Unsignalled send completions are another giant desaster in the
1241 * IB Verbs spec: If we don't regularly post signalled sends
1242 * the send queue will fill up and only a QP reset will rescue us.
1243 * Would have been way to obvious to handle this in hardware or
1244 * at least the RDMA stack..
1245 *
1246 * Always signal the flushes. The magic request used for the flush
1247 * sequencer is not allocated in our driver's tagset and it's
1248 * triggered to be freed by blk_cleanup_queue(). So we need to
1249 * always mark it as signaled to ensure that the "wr_cqe", which is
1250 * embedded in request's payload, is not freed when __ib_process_cq()
1251 * calls wr_cqe->done().
1252 */
1253 if (nvme_rdma_queue_sig_limit(queue) || flush)
1254 wr.send_flags |= IB_SEND_SIGNALED;
1255 1222
1256 if (first) 1223 if (first)
1257 first->next = &wr; 1224 first->next = &wr;
@@ -1301,6 +1268,12 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
1301 return queue->ctrl->tag_set.tags[queue_idx - 1]; 1268 return queue->ctrl->tag_set.tags[queue_idx - 1];
1302} 1269}
1303 1270
1271static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc)
1272{
1273 if (unlikely(wc->status != IB_WC_SUCCESS))
1274 nvme_rdma_wr_error(cq, wc, "ASYNC");
1275}
1276
1304static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) 1277static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
1305{ 1278{
1306 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); 1279 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
@@ -1319,10 +1292,12 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
1319 cmd->common.flags |= NVME_CMD_SGL_METABUF; 1292 cmd->common.flags |= NVME_CMD_SGL_METABUF;
1320 nvme_rdma_set_sg_null(cmd); 1293 nvme_rdma_set_sg_null(cmd);
1321 1294
1295 sqe->cqe.done = nvme_rdma_async_done;
1296
1322 ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), 1297 ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
1323 DMA_TO_DEVICE); 1298 DMA_TO_DEVICE);
1324 1299
1325 ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false); 1300 ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
1326 WARN_ON_ONCE(ret); 1301 WARN_ON_ONCE(ret);
1327} 1302}
1328 1303
@@ -1607,7 +1582,6 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
1607 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); 1582 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1608 struct nvme_rdma_qe *sqe = &req->sqe; 1583 struct nvme_rdma_qe *sqe = &req->sqe;
1609 struct nvme_command *c = sqe->data; 1584 struct nvme_command *c = sqe->data;
1610 bool flush = false;
1611 struct ib_device *dev; 1585 struct ib_device *dev;
1612 blk_status_t ret; 1586 blk_status_t ret;
1613 int err; 1587 int err;
@@ -1636,13 +1610,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
1636 goto err; 1610 goto err;
1637 } 1611 }
1638 1612
1613 sqe->cqe.done = nvme_rdma_send_done;
1614
1639 ib_dma_sync_single_for_device(dev, sqe->dma, 1615 ib_dma_sync_single_for_device(dev, sqe->dma,
1640 sizeof(struct nvme_command), DMA_TO_DEVICE); 1616 sizeof(struct nvme_command), DMA_TO_DEVICE);
1641 1617
1642 if (req_op(rq) == REQ_OP_FLUSH)
1643 flush = true;
1644 err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, 1618 err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
1645 req->mr->need_inval ? &req->reg_wr.wr : NULL, flush); 1619 req->mr->need_inval ? &req->reg_wr.wr : NULL);
1646 if (unlikely(err)) { 1620 if (unlikely(err)) {
1647 nvme_rdma_unmap_data(queue, rq); 1621 nvme_rdma_unmap_data(queue, rq);
1648 goto err; 1622 goto err;