aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSagi Grimberg <sagi@grimberg.me>2017-11-23 10:35:21 -0500
committerChristoph Hellwig <hch@lst.de>2017-11-26 09:33:32 -0500
commitb4b591c87f2b0f4ebaf3a68d4f13873b241aa584 (patch)
treedcde717c1d834f5f5582d7c32ef50ae59e08ec27
parent6c4ca1e36cdc1a0a7a84797804b87920ccbebf51 (diff)
nvme-rdma: don't suppress send completions
The entire completion suppression mechanism is currently broken because the HCA might retry a send operation (due to a dropped ack) after the nvme transaction has completed. In order to handle this, we signal all send completions and introduce a separate done handler for async events, as they will be handled differently (they don't include in-capsule data by definition).

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
-rw-r--r--drivers/nvme/host/rdma.c54
1 files changed, 14 insertions, 40 deletions
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 2c597105a6bf..61511bed8aca 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -77,7 +77,6 @@ enum nvme_rdma_queue_flags {
77 77
78struct nvme_rdma_queue { 78struct nvme_rdma_queue {
79 struct nvme_rdma_qe *rsp_ring; 79 struct nvme_rdma_qe *rsp_ring;
80 atomic_t sig_count;
81 int queue_size; 80 int queue_size;
82 size_t cmnd_capsule_len; 81 size_t cmnd_capsule_len;
83 struct nvme_rdma_ctrl *ctrl; 82 struct nvme_rdma_ctrl *ctrl;
@@ -510,7 +509,6 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
510 queue->cmnd_capsule_len = sizeof(struct nvme_command); 509 queue->cmnd_capsule_len = sizeof(struct nvme_command);
511 510
512 queue->queue_size = queue_size; 511 queue->queue_size = queue_size;
513 atomic_set(&queue->sig_count, 0);
514 512
515 queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, 513 queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
516 RDMA_PS_TCP, IB_QPT_RC); 514 RDMA_PS_TCP, IB_QPT_RC);
@@ -1204,21 +1202,9 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
1204 nvme_rdma_wr_error(cq, wc, "SEND"); 1202 nvme_rdma_wr_error(cq, wc, "SEND");
1205} 1203}
1206 1204
1207/*
1208 * We want to signal completion at least every queue depth/2. This returns the
1209 * largest power of two that is not above half of (queue size + 1) to optimize
1210 * (avoid divisions).
1211 */
1212static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
1213{
1214 int limit = 1 << ilog2((queue->queue_size + 1) / 2);
1215
1216 return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0;
1217}
1218
1219static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, 1205static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
1220 struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, 1206 struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
1221 struct ib_send_wr *first, bool flush) 1207 struct ib_send_wr *first)
1222{ 1208{
1223 struct ib_send_wr wr, *bad_wr; 1209 struct ib_send_wr wr, *bad_wr;
1224 int ret; 1210 int ret;
@@ -1227,31 +1213,12 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
1227 sge->length = sizeof(struct nvme_command), 1213 sge->length = sizeof(struct nvme_command),
1228 sge->lkey = queue->device->pd->local_dma_lkey; 1214 sge->lkey = queue->device->pd->local_dma_lkey;
1229 1215
1230 qe->cqe.done = nvme_rdma_send_done;
1231
1232 wr.next = NULL; 1216 wr.next = NULL;
1233 wr.wr_cqe = &qe->cqe; 1217 wr.wr_cqe = &qe->cqe;
1234 wr.sg_list = sge; 1218 wr.sg_list = sge;
1235 wr.num_sge = num_sge; 1219 wr.num_sge = num_sge;
1236 wr.opcode = IB_WR_SEND; 1220 wr.opcode = IB_WR_SEND;
1237 wr.send_flags = 0; 1221 wr.send_flags = IB_SEND_SIGNALED;
1238
1239 /*
1240 * Unsignalled send completions are another giant desaster in the
1241 * IB Verbs spec: If we don't regularly post signalled sends
1242 * the send queue will fill up and only a QP reset will rescue us.
1243 * Would have been way to obvious to handle this in hardware or
1244 * at least the RDMA stack..
1245 *
1246 * Always signal the flushes. The magic request used for the flush
1247 * sequencer is not allocated in our driver's tagset and it's
1248 * triggered to be freed by blk_cleanup_queue(). So we need to
1249 * always mark it as signaled to ensure that the "wr_cqe", which is
1250 * embedded in request's payload, is not freed when __ib_process_cq()
1251 * calls wr_cqe->done().
1252 */
1253 if (nvme_rdma_queue_sig_limit(queue) || flush)
1254 wr.send_flags |= IB_SEND_SIGNALED;
1255 1222
1256 if (first) 1223 if (first)
1257 first->next = &wr; 1224 first->next = &wr;
@@ -1301,6 +1268,12 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
1301 return queue->ctrl->tag_set.tags[queue_idx - 1]; 1268 return queue->ctrl->tag_set.tags[queue_idx - 1];
1302} 1269}
1303 1270
1271static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc)
1272{
1273 if (unlikely(wc->status != IB_WC_SUCCESS))
1274 nvme_rdma_wr_error(cq, wc, "ASYNC");
1275}
1276
1304static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) 1277static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
1305{ 1278{
1306 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); 1279 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
@@ -1319,10 +1292,12 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
1319 cmd->common.flags |= NVME_CMD_SGL_METABUF; 1292 cmd->common.flags |= NVME_CMD_SGL_METABUF;
1320 nvme_rdma_set_sg_null(cmd); 1293 nvme_rdma_set_sg_null(cmd);
1321 1294
1295 sqe->cqe.done = nvme_rdma_async_done;
1296
1322 ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), 1297 ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
1323 DMA_TO_DEVICE); 1298 DMA_TO_DEVICE);
1324 1299
1325 ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false); 1300 ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
1326 WARN_ON_ONCE(ret); 1301 WARN_ON_ONCE(ret);
1327} 1302}
1328 1303
@@ -1607,7 +1582,6 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
1607 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); 1582 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1608 struct nvme_rdma_qe *sqe = &req->sqe; 1583 struct nvme_rdma_qe *sqe = &req->sqe;
1609 struct nvme_command *c = sqe->data; 1584 struct nvme_command *c = sqe->data;
1610 bool flush = false;
1611 struct ib_device *dev; 1585 struct ib_device *dev;
1612 blk_status_t ret; 1586 blk_status_t ret;
1613 int err; 1587 int err;
@@ -1636,13 +1610,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
1636 goto err; 1610 goto err;
1637 } 1611 }
1638 1612
1613 sqe->cqe.done = nvme_rdma_send_done;
1614
1639 ib_dma_sync_single_for_device(dev, sqe->dma, 1615 ib_dma_sync_single_for_device(dev, sqe->dma,
1640 sizeof(struct nvme_command), DMA_TO_DEVICE); 1616 sizeof(struct nvme_command), DMA_TO_DEVICE);
1641 1617
1642 if (req_op(rq) == REQ_OP_FLUSH)
1643 flush = true;
1644 err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, 1618 err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
1645 req->mr->need_inval ? &req->reg_wr.wr : NULL, flush); 1619 req->mr->need_inval ? &req->reg_wr.wr : NULL);
1646 if (unlikely(err)) { 1620 if (unlikely(err)) {
1647 nvme_rdma_unmap_data(queue, rq); 1621 nvme_rdma_unmap_data(queue, rq);
1648 goto err; 1622 goto err;