author     Keith Busch <keith.busch@intel.com>            2014-03-03 18:39:13 -0500
committer  Matthew Wilcox <matthew.r.wilcox@intel.com>    2014-03-24 08:54:40 -0400
commit     4f5099af4f3d5f999d8ab7784472d93e810e3912 (patch)
tree       f0dc2bc897e723037ae63a5f7f41bb3bff710399 /drivers/block
parent     5a92e700af2e5e0e6404988d6a7f2ed3dad3f46f (diff)
NVMe: IOCTL path RCU protect queue access
This adds RCU-protected access to a queue in the NVMe IOCTL path
to fix potential races between a surprise removal and queue usage in
nvme_submit_sync_cmd. The fix holds rcu_read_lock() to prevent the
nvme_queue from being freed while this path is executing; because the
path therefore cannot sleep, it no longer waits for an available
command id should they all be in use at the time a passthrough IOCTL
request is received.
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
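For context, a minimal sketch of the access pattern the IOCTL path moves to. The
lock_nvmeq()/unlock_nvmeq() helpers are the ones added in the diff below;
example_passthrough_submit() is a hypothetical caller written only for
illustration and is not part of the patch:

	/*
	 * Hypothetical caller, for illustration only.  The queue pointer is
	 * valid only inside the RCU read-side critical section, so nothing
	 * here may sleep; instead of waiting for a free command id, errors
	 * such as -ENODEV or -EBUSY are returned to the caller.
	 */
	static int example_passthrough_submit(struct nvme_dev *dev, int q_idx,
					      struct nvme_command *cmd)
	{
		struct nvme_queue *nvmeq = lock_nvmeq(dev, q_idx);
		int ret;

		if (!nvmeq) {
			/* queue already freed by a surprise removal */
			unlock_nvmeq(nvmeq);
			return -ENODEV;
		}
		ret = nvme_submit_cmd(nvmeq, cmd); /* -EBUSY if the queue is suspended */
		unlock_nvmeq(nvmeq);
		return ret;
	}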
Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/nvme-core.c | 82
-rw-r--r--  drivers/block/nvme-scsi.c | 31
2 files changed, 59 insertions, 54 deletions
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index b66ab1db4629..04664cadadfa 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -268,18 +268,30 @@ static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
 	return rcu_dereference_raw(dev->queues[qid]);
 }
 
-struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
+static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
 {
 	rcu_read_lock();
 	return rcu_dereference(dev->queues[get_cpu() + 1]);
 }
 
-void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
+static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
 {
 	put_cpu();
 	rcu_read_unlock();
 }
 
+static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
+							__acquires(RCU)
+{
+	rcu_read_lock();
+	return rcu_dereference(dev->queues[q_idx]);
+}
+
+static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
+{
+	rcu_read_unlock();
+}
+
 /**
  * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
  * @nvmeq: The queue to use
@@ -292,6 +304,10 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
 	unsigned long flags;
 	u16 tail;
 	spin_lock_irqsave(&nvmeq->q_lock, flags);
+	if (nvmeq->q_suspended) {
+		spin_unlock_irqrestore(&nvmeq->q_lock, flags);
+		return -EBUSY;
+	}
 	tail = nvmeq->sq_tail;
 	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
 	if (++tail == nvmeq->q_depth)
@@ -812,27 +828,46 @@ static void sync_completion(struct nvme_dev *dev, void *ctx,
  * Returns 0 on success.  If the result is negative, it's a Linux error code;
  * if the result is positive, it's an NVM Express status code
  */
-int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
+static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
+						struct nvme_command *cmd,
 						u32 *result, unsigned timeout)
 {
-	int cmdid;
+	int cmdid, ret;
 	struct sync_cmd_info cmdinfo;
+	struct nvme_queue *nvmeq;
+
+	nvmeq = lock_nvmeq(dev, q_idx);
+	if (!nvmeq) {
+		unlock_nvmeq(nvmeq);
+		return -ENODEV;
+	}
 
 	cmdinfo.task = current;
 	cmdinfo.status = -EINTR;
 
-	cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion,
-								timeout);
-	if (cmdid < 0)
+	cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout);
+	if (cmdid < 0) {
+		unlock_nvmeq(nvmeq);
 		return cmdid;
+	}
 	cmd->common.command_id = cmdid;
 
 	set_current_state(TASK_KILLABLE);
-	nvme_submit_cmd(nvmeq, cmd);
+	ret = nvme_submit_cmd(nvmeq, cmd);
+	if (ret) {
+		free_cmdid(nvmeq, cmdid, NULL);
+		unlock_nvmeq(nvmeq);
+		set_current_state(TASK_RUNNING);
+		return ret;
+	}
+	unlock_nvmeq(nvmeq);
 	schedule_timeout(timeout);
 
 	if (cmdinfo.status == -EINTR) {
-		nvme_abort_command(nvmeq, cmdid);
+		nvmeq = lock_nvmeq(dev, q_idx);
+		if (nvmeq)
+			nvme_abort_command(nvmeq, cmdid);
+		unlock_nvmeq(nvmeq);
 		return -EINTR;
 	}
 
@@ -853,15 +888,20 @@ static int nvme_submit_async_cmd(struct nvme_queue *nvmeq,
 		return cmdid;
 	cmdinfo->status = -EINTR;
 	cmd->common.command_id = cmdid;
-	nvme_submit_cmd(nvmeq, cmd);
-	return 0;
+	return nvme_submit_cmd(nvmeq, cmd);
 }
 
 int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
 								u32 *result)
 {
-	return nvme_submit_sync_cmd(raw_nvmeq(dev, 0), cmd, result,
-								ADMIN_TIMEOUT);
+	return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT);
+}
+
+int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
+								u32 *result)
+{
+	return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result,
+							NVME_IO_TIMEOUT);
 }
 
 static int nvme_submit_admin_cmd_async(struct nvme_dev *dev,
@@ -1434,7 +1474,6 @@ void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 {
 	struct nvme_dev *dev = ns->dev;
-	struct nvme_queue *nvmeq;
 	struct nvme_user_io io;
 	struct nvme_command c;
 	unsigned length, meta_len;
@@ -1510,20 +1549,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 
 	length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
 
-	nvmeq = get_nvmeq(dev);
-	/*
-	 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption
-	 * disabled.  We may be preempted at any point, and be rescheduled
-	 * to a different CPU.  That will cause cacheline bouncing, but no
-	 * additional races since q_lock already protects against other CPUs.
-	 */
-	put_nvmeq(nvmeq);
 	if (length != (io.nblocks + 1) << ns->lba_shift)
 		status = -ENOMEM;
-	else if (!nvmeq || nvmeq->q_suspended)
-		status = -EBUSY;
 	else
-		status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
+		status = nvme_submit_io_cmd(dev, &c, NULL);
 
 	if (meta_len) {
 		if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) {
@@ -1597,8 +1626,7 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev,
 	if (length != cmd.data_len)
 		status = -ENOMEM;
 	else
-		status = nvme_submit_sync_cmd(raw_nvmeq(dev, 0), &c,
-						&cmd.result, timeout);
+		status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout);
 
 	if (cmd.data_len) {
 		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
index 4a0ceb64e269..e157e85bb5d7 100644
--- a/drivers/block/nvme-scsi.c
+++ b/drivers/block/nvme-scsi.c
@@ -2033,7 +2033,6 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	int res = SNTI_TRANSLATION_SUCCESS;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
-	struct nvme_queue *nvmeq;
 	u32 num_cmds;
 	struct nvme_iod *iod;
 	u64 unit_len;
@@ -2106,18 +2105,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 
 		nvme_offset += unit_num_blocks;
 
-		nvmeq = get_nvmeq(dev);
-		/*
-		 * Since nvme_submit_sync_cmd sleeps, we can't keep
-		 * preemption disabled.  We may be preempted at any
-		 * point, and be rescheduled to a different CPU.  That
-		 * will cause cacheline bouncing, but no additional
-		 * races since q_lock already protects against other
-		 * CPUs.
-		 */
-		put_nvmeq(nvmeq);
-		nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL,
-				NVME_IO_TIMEOUT);
+		nvme_sc = nvme_submit_io_cmd(dev, &c, NULL);
 		if (nvme_sc != NVME_SC_SUCCESS) {
 			nvme_unmap_user_pages(dev,
 				(is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
@@ -2644,7 +2632,6 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 {
 	int res = SNTI_TRANSLATION_SUCCESS;
 	int nvme_sc;
-	struct nvme_queue *nvmeq;
 	struct nvme_command c;
 	u8 immed, pcmod, pc, no_flush, start;
 
@@ -2671,10 +2658,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		c.common.opcode = nvme_cmd_flush;
 		c.common.nsid = cpu_to_le32(ns->ns_id);
 
-		nvmeq = get_nvmeq(ns->dev);
-		put_nvmeq(nvmeq);
-		nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
-
+		nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL);
 		res = nvme_trans_status_code(hdr, nvme_sc);
 		if (res)
 			goto out;
@@ -2697,15 +2681,12 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
 	int res = SNTI_TRANSLATION_SUCCESS;
 	int nvme_sc;
 	struct nvme_command c;
-	struct nvme_queue *nvmeq;
 
 	memset(&c, 0, sizeof(c));
 	c.common.opcode = nvme_cmd_flush;
 	c.common.nsid = cpu_to_le32(ns->ns_id);
 
-	nvmeq = get_nvmeq(ns->dev);
-	put_nvmeq(nvmeq);
-	nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
+	nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL);
 
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
@@ -2872,7 +2853,6 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	struct nvme_dev *dev = ns->dev;
 	struct scsi_unmap_parm_list *plist;
 	struct nvme_dsm_range *range;
-	struct nvme_queue *nvmeq;
 	struct nvme_command c;
 	int i, nvme_sc, res = -ENOMEM;
 	u16 ndesc, list_len;
@@ -2914,10 +2894,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	c.dsm.nr = cpu_to_le32(ndesc - 1);
 	c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 
-	nvmeq = get_nvmeq(dev);
-	put_nvmeq(nvmeq);
-
-	nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
+	nvme_sc = nvme_submit_io_cmd(dev, &c, NULL);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 
 	dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),