diff options
author | Sagi Grimberg <sagi@grimberg.me> | 2017-11-23 10:35:21 -0500 |
---|---|---|
committer | Christoph Hellwig <hch@lst.de> | 2017-11-26 09:33:32 -0500 |
commit | b4b591c87f2b0f4ebaf3a68d4f13873b241aa584 (patch) | |
tree | dcde717c1d834f5f5582d7c32ef50ae59e08ec27 | |
parent | 6c4ca1e36cdc1a0a7a84797804b87920ccbebf51 (diff) |
nvme-rdma: don't suppress send completions
The entire completion suppression mechanism is currently broken, because the
HCA might retry a send operation (due to a dropped ack) after the nvme
transaction has completed.
In order to handle this, we signal all send completions and introduce a
separate done handler for async events as they will be handled differently
(as they don't include in-capsule data by definition).
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
-rw-r--r-- | drivers/nvme/host/rdma.c | 54 |
1 files changed, 14 insertions, 40 deletions
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2c597105a6bf..61511bed8aca 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c | |||
@@ -77,7 +77,6 @@ enum nvme_rdma_queue_flags { | |||
77 | 77 | ||
78 | struct nvme_rdma_queue { | 78 | struct nvme_rdma_queue { |
79 | struct nvme_rdma_qe *rsp_ring; | 79 | struct nvme_rdma_qe *rsp_ring; |
80 | atomic_t sig_count; | ||
81 | int queue_size; | 80 | int queue_size; |
82 | size_t cmnd_capsule_len; | 81 | size_t cmnd_capsule_len; |
83 | struct nvme_rdma_ctrl *ctrl; | 82 | struct nvme_rdma_ctrl *ctrl; |
@@ -510,7 +509,6 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, | |||
510 | queue->cmnd_capsule_len = sizeof(struct nvme_command); | 509 | queue->cmnd_capsule_len = sizeof(struct nvme_command); |
511 | 510 | ||
512 | queue->queue_size = queue_size; | 511 | queue->queue_size = queue_size; |
513 | atomic_set(&queue->sig_count, 0); | ||
514 | 512 | ||
515 | queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, | 513 | queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, |
516 | RDMA_PS_TCP, IB_QPT_RC); | 514 | RDMA_PS_TCP, IB_QPT_RC); |
@@ -1204,21 +1202,9 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) | |||
1204 | nvme_rdma_wr_error(cq, wc, "SEND"); | 1202 | nvme_rdma_wr_error(cq, wc, "SEND"); |
1205 | } | 1203 | } |
1206 | 1204 | ||
1207 | /* | ||
1208 | * We want to signal completion at least every queue depth/2. This returns the | ||
1209 | * largest power of two that is not above half of (queue size + 1) to optimize | ||
1210 | * (avoid divisions). | ||
1211 | */ | ||
1212 | static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) | ||
1213 | { | ||
1214 | int limit = 1 << ilog2((queue->queue_size + 1) / 2); | ||
1215 | |||
1216 | return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0; | ||
1217 | } | ||
1218 | |||
1219 | static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, | 1205 | static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, |
1220 | struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, | 1206 | struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, |
1221 | struct ib_send_wr *first, bool flush) | 1207 | struct ib_send_wr *first) |
1222 | { | 1208 | { |
1223 | struct ib_send_wr wr, *bad_wr; | 1209 | struct ib_send_wr wr, *bad_wr; |
1224 | int ret; | 1210 | int ret; |
@@ -1227,31 +1213,12 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, | |||
1227 | sge->length = sizeof(struct nvme_command), | 1213 | sge->length = sizeof(struct nvme_command), |
1228 | sge->lkey = queue->device->pd->local_dma_lkey; | 1214 | sge->lkey = queue->device->pd->local_dma_lkey; |
1229 | 1215 | ||
1230 | qe->cqe.done = nvme_rdma_send_done; | ||
1231 | |||
1232 | wr.next = NULL; | 1216 | wr.next = NULL; |
1233 | wr.wr_cqe = &qe->cqe; | 1217 | wr.wr_cqe = &qe->cqe; |
1234 | wr.sg_list = sge; | 1218 | wr.sg_list = sge; |
1235 | wr.num_sge = num_sge; | 1219 | wr.num_sge = num_sge; |
1236 | wr.opcode = IB_WR_SEND; | 1220 | wr.opcode = IB_WR_SEND; |
1237 | wr.send_flags = 0; | 1221 | wr.send_flags = IB_SEND_SIGNALED; |
1238 | |||
1239 | /* | ||
1240 | * Unsignalled send completions are another giant desaster in the | ||
1241 | * IB Verbs spec: If we don't regularly post signalled sends | ||
1242 | * the send queue will fill up and only a QP reset will rescue us. | ||
1243 | * Would have been way to obvious to handle this in hardware or | ||
1244 | * at least the RDMA stack.. | ||
1245 | * | ||
1246 | * Always signal the flushes. The magic request used for the flush | ||
1247 | * sequencer is not allocated in our driver's tagset and it's | ||
1248 | * triggered to be freed by blk_cleanup_queue(). So we need to | ||
1249 | * always mark it as signaled to ensure that the "wr_cqe", which is | ||
1250 | * embedded in request's payload, is not freed when __ib_process_cq() | ||
1251 | * calls wr_cqe->done(). | ||
1252 | */ | ||
1253 | if (nvme_rdma_queue_sig_limit(queue) || flush) | ||
1254 | wr.send_flags |= IB_SEND_SIGNALED; | ||
1255 | 1222 | ||
1256 | if (first) | 1223 | if (first) |
1257 | first->next = &wr; | 1224 | first->next = &wr; |
@@ -1301,6 +1268,12 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue) | |||
1301 | return queue->ctrl->tag_set.tags[queue_idx - 1]; | 1268 | return queue->ctrl->tag_set.tags[queue_idx - 1]; |
1302 | } | 1269 | } |
1303 | 1270 | ||
1271 | static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc) | ||
1272 | { | ||
1273 | if (unlikely(wc->status != IB_WC_SUCCESS)) | ||
1274 | nvme_rdma_wr_error(cq, wc, "ASYNC"); | ||
1275 | } | ||
1276 | |||
1304 | static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) | 1277 | static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) |
1305 | { | 1278 | { |
1306 | struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); | 1279 | struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); |
@@ -1319,10 +1292,12 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) | |||
1319 | cmd->common.flags |= NVME_CMD_SGL_METABUF; | 1292 | cmd->common.flags |= NVME_CMD_SGL_METABUF; |
1320 | nvme_rdma_set_sg_null(cmd); | 1293 | nvme_rdma_set_sg_null(cmd); |
1321 | 1294 | ||
1295 | sqe->cqe.done = nvme_rdma_async_done; | ||
1296 | |||
1322 | ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), | 1297 | ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), |
1323 | DMA_TO_DEVICE); | 1298 | DMA_TO_DEVICE); |
1324 | 1299 | ||
1325 | ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false); | 1300 | ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL); |
1326 | WARN_ON_ONCE(ret); | 1301 | WARN_ON_ONCE(ret); |
1327 | } | 1302 | } |
1328 | 1303 | ||
@@ -1607,7 +1582,6 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, | |||
1607 | struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); | 1582 | struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); |
1608 | struct nvme_rdma_qe *sqe = &req->sqe; | 1583 | struct nvme_rdma_qe *sqe = &req->sqe; |
1609 | struct nvme_command *c = sqe->data; | 1584 | struct nvme_command *c = sqe->data; |
1610 | bool flush = false; | ||
1611 | struct ib_device *dev; | 1585 | struct ib_device *dev; |
1612 | blk_status_t ret; | 1586 | blk_status_t ret; |
1613 | int err; | 1587 | int err; |
@@ -1636,13 +1610,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, | |||
1636 | goto err; | 1610 | goto err; |
1637 | } | 1611 | } |
1638 | 1612 | ||
1613 | sqe->cqe.done = nvme_rdma_send_done; | ||
1614 | |||
1639 | ib_dma_sync_single_for_device(dev, sqe->dma, | 1615 | ib_dma_sync_single_for_device(dev, sqe->dma, |
1640 | sizeof(struct nvme_command), DMA_TO_DEVICE); | 1616 | sizeof(struct nvme_command), DMA_TO_DEVICE); |
1641 | 1617 | ||
1642 | if (req_op(rq) == REQ_OP_FLUSH) | ||
1643 | flush = true; | ||
1644 | err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, | 1618 | err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, |
1645 | req->mr->need_inval ? &req->reg_wr.wr : NULL, flush); | 1619 | req->mr->need_inval ? &req->reg_wr.wr : NULL); |
1646 | if (unlikely(err)) { | 1620 | if (unlikely(err)) { |
1647 | nvme_rdma_unmap_data(queue, rq); | 1621 | nvme_rdma_unmap_data(queue, rq); |
1648 | goto err; | 1622 | goto err; |