-rw-r--r--  drivers/nvme/host/core.c  21
-rw-r--r--  drivers/nvme/host/fc.c    20
-rw-r--r--  drivers/nvme/host/pci.c   13
-rw-r--r--  drivers/nvme/host/rdma.c  44
4 files changed, 67 insertions, 31 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index a60926410438..903d5813023a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -56,7 +56,7 @@ MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
 static int nvme_char_major;
 module_param(nvme_char_major, int, 0);
 
-static unsigned long default_ps_max_latency_us = 25000;
+static unsigned long default_ps_max_latency_us = 100000;
 module_param(default_ps_max_latency_us, ulong, 0644);
 MODULE_PARM_DESC(default_ps_max_latency_us,
 		 "max power saving latency for new devices; use PM QOS to change per device");
@@ -1342,7 +1342,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
 	 * transitioning between power states. Therefore, when running
 	 * in any given state, we will enter the next lower-power
 	 * non-operational state after waiting 50 * (enlat + exlat)
-	 * microseconds, as long as that state's total latency is under
+	 * microseconds, as long as that state's exit latency is under
 	 * the requested maximum latency.
 	 *
 	 * We will not autonomously enter any non-operational state for
@@ -1387,7 +1387,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
 	 * lowest-power state, not the number of states.
 	 */
 	for (state = (int)ctrl->npss; state >= 0; state--) {
-		u64 total_latency_us, transition_ms;
+		u64 total_latency_us, exit_latency_us, transition_ms;
 
 		if (target)
 			table->entries[state] = target;
@@ -1408,12 +1408,15 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
 		      NVME_PS_FLAGS_NON_OP_STATE))
 			continue;
 
-		total_latency_us =
-			(u64)le32_to_cpu(ctrl->psd[state].entry_lat) +
-			+ le32_to_cpu(ctrl->psd[state].exit_lat);
-		if (total_latency_us > ctrl->ps_max_latency_us)
+		exit_latency_us =
+			(u64)le32_to_cpu(ctrl->psd[state].exit_lat);
+		if (exit_latency_us > ctrl->ps_max_latency_us)
 			continue;
 
+		total_latency_us =
+			exit_latency_us +
+			le32_to_cpu(ctrl->psd[state].entry_lat);
+
 		/*
 		 * This state is good. Use it as the APST idle
 		 * target for higher power states.
@@ -2438,6 +2441,10 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 	struct nvme_ns *ns;
 
 	mutex_lock(&ctrl->namespaces_mutex);
+
+	/* Forcibly start all queues to avoid having stuck requests */
+	blk_mq_start_hw_queues(ctrl->admin_q);
+
 	list_for_each_entry(ns, &ctrl->namespaces, list) {
 		/*
 		 * Revalidating a dead namespace sets capacity to 0. This will
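
The core.c hunks change two things: the default APST latency cap rises from 25000 us to 100000 us, and a non-operational power state is now screened by its exit latency alone, while entry + exit latency still sizes the idle timeout per the "50 * (enlat + exlat)" rule quoted in the comment above. A minimal user-space sketch of that selection rule follows; the power-state table and the program itself are invented for illustration and are not driver code.

/* apst_pick.c - illustrative only; not the driver code */
#include <stdint.h>
#include <stdio.h>

struct ps {
	uint32_t entry_lat_us;
	uint32_t exit_lat_us;
	int non_operational;
};

int main(void)
{
	/* hypothetical power-state descriptors, deepest state last */
	struct ps psd[] = {
		{ .entry_lat_us = 5,    .exit_lat_us = 5,     .non_operational = 0 },
		{ .entry_lat_us = 1000, .exit_lat_us = 1000,  .non_operational = 1 },
		{ .entry_lat_us = 2000, .exit_lat_us = 60000, .non_operational = 1 },
	};
	uint64_t ps_max_latency_us = 100000;	/* the new module default */
	int npss = 2;				/* index of the lowest-power state */

	for (int state = npss; state >= 0; state--) {
		uint64_t exit_latency_us, total_latency_us, idle_timeout_ms;

		if (!psd[state].non_operational)
			continue;

		/* gate on exit latency only (the change in this hunk) */
		exit_latency_us = psd[state].exit_lat_us;
		if (exit_latency_us > ps_max_latency_us)
			continue;

		/* entry + exit still sizes the idle timeout: 50 * (enlat + exlat) us */
		total_latency_us = exit_latency_us + psd[state].entry_lat_us;
		idle_timeout_ms = (50 * total_latency_us) / 1000;

		printf("state %d usable, idle timeout %llu ms\n",
		       state, (unsigned long long)idle_timeout_ms);
	}
	return 0;
}
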
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 5b14cbefb724..92964cef0f4b 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1139,6 +1139,7 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
 /* *********************** NVME Ctrl Routines **************************** */
 
 static void __nvme_fc_final_op_cleanup(struct request *rq);
+static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);
 
 static int
 nvme_fc_reinit_request(void *data, struct request *rq)
@@ -1265,7 +1266,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 	struct nvme_command *sqe = &op->cmd_iu.sqe;
 	__le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
 	union nvme_result result;
-	bool complete_rq;
+	bool complete_rq, terminate_assoc = true;
 
 	/*
 	 * WARNING:
@@ -1294,6 +1295,14 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 	 * fabricate a CQE, the following fields will not be set as they
 	 * are not referenced:
 	 *      cqe.sqid, cqe.sqhd, cqe.command_id
+	 *
+	 * Failure or error of an individual i/o, in a transport
+	 * detected fashion unrelated to the nvme completion status,
+	 * potentially cause the initiator and target sides to get out
+	 * of sync on SQ head/tail (aka outstanding io count allowed).
+	 * Per FC-NVME spec, failure of an individual command requires
+	 * the connection to be terminated, which in turn requires the
+	 * association to be terminated.
 	 */
 
 	fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma,
@@ -1359,6 +1368,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 		goto done;
 	}
 
+	terminate_assoc = false;
+
 done:
 	if (op->flags & FCOP_FLAGS_AEN) {
 		nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
@@ -1366,7 +1377,7 @@ done:
 		atomic_set(&op->state, FCPOP_STATE_IDLE);
 		op->flags = FCOP_FLAGS_AEN;	/* clear other flags */
 		nvme_fc_ctrl_put(ctrl);
-		return;
+		goto check_error;
 	}
 
 	complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op);
@@ -1379,6 +1390,10 @@ done:
 		nvme_end_request(rq, status, result);
 	} else
 		__nvme_fc_final_op_cleanup(rq);
+
+check_error:
+	if (terminate_assoc)
+		nvme_fc_error_recovery(ctrl, "transport detected io error");
 }
 
 static int
@@ -2791,6 +2806,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 		ctrl->ctrl.opts = NULL;
 		/* initiate nvme ctrl ref counting teardown */
 		nvme_uninit_ctrl(&ctrl->ctrl);
+		nvme_put_ctrl(&ctrl->ctrl);
 
 		/* as we're past the point where we transition to the ref
 		 * counting teardown path, if we return a bad pointer here,
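
The fc.c hunks make a transport-detected I/O error terminate the association: terminate_assoc starts out true, is cleared only when the completion path ran cleanly, and every exit path now funnels through a final check that calls nvme_fc_error_recovery(). A stand-alone sketch of that control-flow pattern follows; the names and the program are invented for illustration and are not the transport code.

/* terminate_on_error.c - illustrative only */
#include <stdbool.h>
#include <stdio.h>

static void error_recovery(const char *reason)
{
	printf("terminating association: %s\n", reason);
}

static void complete_io(int transport_status, bool is_aen)
{
	bool terminate_assoc = true;	/* pessimistic default */

	if (transport_status != 0)
		goto done;	/* transport-detected error: flag stays set */

	/* ... normal completion work would happen here ... */

	terminate_assoc = false;	/* only a fully clean path clears it */

done:
	if (is_aen) {
		/* AEN ops are recycled rather than completed */
		goto check_error;
	}

	/* ... complete or clean up the request ... */

check_error:
	if (terminate_assoc)
		error_recovery("transport detected io error");
}

int main(void)
{
	complete_io(0, false);	/* clean completion: no recovery */
	complete_io(-1, false);	/* transport error: recovery triggered */
	return 0;
}
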
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d52701df7245..951042a375d6 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1367,7 +1367,7 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
 	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
 
 	/* If there is a reset ongoing, we shouldn't reset again. */
-	if (work_busy(&dev->reset_work))
+	if (dev->ctrl.state == NVME_CTRL_RESETTING)
 		return false;
 
 	/* We shouldn't reset unless the controller is on fatal error state
@@ -1903,7 +1903,7 @@ static void nvme_reset_work(struct work_struct *work)
 	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
 	int result = -ENODEV;
 
-	if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING))
+	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
 		goto out;
 
 	/*
@@ -1913,9 +1913,6 @@ static void nvme_reset_work(struct work_struct *work)
 	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
 		nvme_dev_disable(dev, false);
 
-	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
-		goto out;
-
 	result = nvme_pci_enable(dev);
 	if (result)
 		goto out;
@@ -2009,8 +2006,8 @@ static int nvme_reset(struct nvme_dev *dev)
 {
 	if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
 		return -ENODEV;
-	if (work_busy(&dev->reset_work))
-		return -ENODEV;
+	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
+		return -EBUSY;
 	if (!queue_work(nvme_workq, &dev->reset_work))
 		return -EBUSY;
 	return 0;
@@ -2136,6 +2133,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (result)
 		goto release_pools;
 
+	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
 	queue_work(nvme_workq, &dev->reset_work);
@@ -2179,6 +2177,7 @@ static void nvme_remove(struct pci_dev *pdev)
 
 	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
 
+	cancel_work_sync(&dev->reset_work);
 	pci_set_drvdata(pdev, NULL);
 
 	if (!pci_device_is_present(pdev)) {
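
The pci.c hunks serialize resets through the controller state machine instead of work_busy(): nvme_reset() must win the transition to NVME_CTRL_RESETTING before it may queue reset_work, nvme_reset_work() warns if it runs in any other state, and nvme_remove() cancels a pending reset after moving to DELETING. A small user-space sketch of the "claim the state, then queue the work" idea follows; names and program are invented and this is not the driver.

/* reset_serialize.c - illustrative only */
#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

enum ctrl_state { CTRL_LIVE, CTRL_RESETTING, CTRL_DELETING };

static _Atomic enum ctrl_state state = CTRL_LIVE;

/* analogous in spirit to nvme_change_ctrl_state(): only LIVE -> RESETTING succeeds */
static int claim_resetting(void)
{
	enum ctrl_state expected = CTRL_LIVE;

	return atomic_compare_exchange_strong(&state, &expected, CTRL_RESETTING);
}

static int request_reset(void)
{
	if (!claim_resetting())
		return -EBUSY;	/* a reset or removal already owns the state */

	printf("reset work queued\n");	/* queue_work() would go here */
	return 0;
}

int main(void)
{
	request_reset();	/* wins the LIVE -> RESETTING transition */
	request_reset();	/* loses: returns -EBUSY, no second reset queued */
	return 0;
}
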
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 28bd255c144d..24397d306d53 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -753,28 +753,26 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 	if (ret)
 		goto requeue;
 
-	blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
-
 	ret = nvmf_connect_admin_queue(&ctrl->ctrl);
 	if (ret)
-		goto stop_admin_q;
+		goto requeue;
 
 	set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
 
 	ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
 	if (ret)
-		goto stop_admin_q;
+		goto requeue;
 
 	nvme_start_keep_alive(&ctrl->ctrl);
 
 	if (ctrl->queue_count > 1) {
 		ret = nvme_rdma_init_io_queues(ctrl);
 		if (ret)
-			goto stop_admin_q;
+			goto requeue;
 
 		ret = nvme_rdma_connect_io_queues(ctrl);
 		if (ret)
-			goto stop_admin_q;
+			goto requeue;
 	}
 
 	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
@@ -782,7 +780,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 	ctrl->ctrl.opts->nr_reconnects = 0;
 
 	if (ctrl->queue_count > 1) {
-		nvme_start_queues(&ctrl->ctrl);
 		nvme_queue_scan(&ctrl->ctrl);
 		nvme_queue_async_events(&ctrl->ctrl);
 	}
@@ -791,8 +788,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 
 	return;
 
-stop_admin_q:
-	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
 requeue:
 	dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
 			ctrl->ctrl.opts->nr_reconnects);
@@ -823,6 +818,13 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
 				nvme_cancel_request, &ctrl->ctrl);
 
+	/*
+	 * queues are not a live anymore, so restart the queues to fail fast
+	 * new IO
+	 */
+	blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
+	nvme_start_queues(&ctrl->ctrl);
+
 	nvme_rdma_reconnect_or_remove(ctrl);
 }
 
@@ -1433,7 +1435,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
 /*
  * We cannot accept any other command until the Connect command has completed.
  */
-static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
+static inline int nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
 		struct request *rq)
 {
 	if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) {
@@ -1441,11 +1443,22 @@ static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
 
 		if (!blk_rq_is_passthrough(rq) ||
 		    cmd->common.opcode != nvme_fabrics_command ||
-		    cmd->fabrics.fctype != nvme_fabrics_type_connect)
-			return false;
+		    cmd->fabrics.fctype != nvme_fabrics_type_connect) {
+			/*
+			 * reconnecting state means transport disruption, which
+			 * can take a long time and even might fail permanently,
+			 * so we can't let incoming I/O be requeued forever.
+			 * fail it fast to allow upper layers a chance to
+			 * failover.
+			 */
+			if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING)
+				return -EIO;
+			else
+				return -EAGAIN;
+		}
 	}
 
-	return true;
+	return 0;
 }
 
 static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -1463,8 +1476,9 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	WARN_ON_ONCE(rq->tag < 0);
 
-	if (!nvme_rdma_queue_is_ready(queue, rq))
-		return BLK_MQ_RQ_QUEUE_BUSY;
+	ret = nvme_rdma_queue_is_ready(queue, rq);
+	if (unlikely(ret))
+		goto err;
 
 	dev = queue->device->dev;
 	ib_dma_sync_single_for_cpu(dev, sqe->dma,
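
The rdma.c hunks move queue restarting from the reconnect path into error recovery and let nvme_rdma_queue_is_ready() report why a queue cannot take a command: -EIO while the controller is reconnecting, so I/O fails fast and upper layers can fail over, -EAGAIN otherwise, and 0 when the queue is ready. A user-space sketch of that readiness contract follows; names and program are invented for illustration and are not the driver code.

/* queue_ready.c - illustrative only */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum ctrl_state { CTRL_LIVE, CTRL_RECONNECTING };

struct queue {
	bool live;		/* analogue of the NVME_RDMA_Q_LIVE bit */
	enum ctrl_state state;
};

/* 0: ok to submit; -EIO: fail fast; -EAGAIN: ask the caller to retry later */
static int queue_is_ready(const struct queue *q, bool is_connect_cmd)
{
	if (!q->live && !is_connect_cmd) {
		if (q->state == CTRL_RECONNECTING)
			return -EIO;
		return -EAGAIN;
	}
	return 0;
}

static void submit(const struct queue *q, bool is_connect_cmd)
{
	int ret = queue_is_ready(q, is_connect_cmd);

	if (ret == -EIO)
		printf("fail fast: let upper layers fail over\n");
	else if (ret == -EAGAIN)
		printf("busy: requeue and retry later\n");
	else
		printf("submit to the RDMA queue\n");
}

int main(void)
{
	struct queue down = { .live = false, .state = CTRL_RECONNECTING };
	struct queue up = { .live = true, .state = CTRL_LIVE };

	submit(&up, false);	/* normal path */
	submit(&down, true);	/* Connect is still allowed through */
	submit(&down, false);	/* reconnecting: failed immediately */
	return 0;
}
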