author     Keith Busch <keith.busch@intel.com>            2014-03-03 18:39:13 -0500
committer  Matthew Wilcox <matthew.r.wilcox@intel.com>    2014-03-24 08:54:40 -0400
commit     4f5099af4f3d5f999d8ab7784472d93e810e3912 (patch)
tree       f0dc2bc897e723037ae63a5f7f41bb3bff710399 /drivers/block
parent     5a92e700af2e5e0e6404988d6a7f2ed3dad3f46f (diff)
NVMe: IOCTL path RCU protect queue access
This adds RCU-protected access to a queue in the NVMe IOCTL path
to fix potential races between a surprise removal and queue usage in
nvme_submit_sync_cmd. The fix holds rcu_read_lock() to prevent the
nvme_queue from being freed while this path is executing; because the
path therefore cannot sleep, it no longer waits for an available
command id should they all be in use at the time a passthrough IOCTL
request is received.
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
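For context, a minimal sketch of the access pattern the IOCTL path moves to. The
lock_nvmeq()/unlock_nvmeq() helpers are the ones added in the diff below;
example_passthrough_submit() is a hypothetical caller written only for
illustration and is not part of the patch:

	/*
	 * Hypothetical caller, for illustration only.  The queue pointer is
	 * valid only inside the RCU read-side critical section, so nothing
	 * here may sleep; instead of waiting for a free command id, errors
	 * such as -ENODEV or -EBUSY are returned to the caller.
	 */
	static int example_passthrough_submit(struct nvme_dev *dev, int q_idx,
					      struct nvme_command *cmd)
	{
		struct nvme_queue *nvmeq = lock_nvmeq(dev, q_idx);
		int ret;

		if (!nvmeq) {
			/* queue already freed by a surprise removal */
			unlock_nvmeq(nvmeq);
			return -ENODEV;
		}
		ret = nvme_submit_cmd(nvmeq, cmd); /* -EBUSY if the queue is suspended */
		unlock_nvmeq(nvmeq);
		return ret;
	}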
Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/nvme-core.c | 82
-rw-r--r--  drivers/block/nvme-scsi.c | 31
2 files changed, 59 insertions, 54 deletions
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index b66ab1db4629..04664cadadfa 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -268,18 +268,30 @@ static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
 	return rcu_dereference_raw(dev->queues[qid]);
 }
 
-struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
+static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
 {
 	rcu_read_lock();
 	return rcu_dereference(dev->queues[get_cpu() + 1]);
 }
 
-void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
+static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
 {
 	put_cpu();
 	rcu_read_unlock();
 }
 
+static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
+							__acquires(RCU)
+{
+	rcu_read_lock();
+	return rcu_dereference(dev->queues[q_idx]);
+}
+
+static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
+{
+	rcu_read_unlock();
+}
+
 /**
  * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
  * @nvmeq: The queue to use
@@ -292,6 +304,10 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
 	unsigned long flags;
 	u16 tail;
 	spin_lock_irqsave(&nvmeq->q_lock, flags);
+	if (nvmeq->q_suspended) {
+		spin_unlock_irqrestore(&nvmeq->q_lock, flags);
+		return -EBUSY;
+	}
 	tail = nvmeq->sq_tail;
 	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
 	if (++tail == nvmeq->q_depth)
@@ -812,27 +828,46 @@ static void sync_completion(struct nvme_dev *dev, void *ctx,
  * Returns 0 on success.  If the result is negative, it's a Linux error code;
  * if the result is positive, it's an NVM Express status code
  */
-int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
+static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
+						struct nvme_command *cmd,
 						u32 *result, unsigned timeout)
 {
-	int cmdid;
+	int cmdid, ret;
 	struct sync_cmd_info cmdinfo;
+	struct nvme_queue *nvmeq;
+
+	nvmeq = lock_nvmeq(dev, q_idx);
+	if (!nvmeq) {
+		unlock_nvmeq(nvmeq);
+		return -ENODEV;
+	}
 
 	cmdinfo.task = current;
 	cmdinfo.status = -EINTR;
 
-	cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion,
-								timeout);
-	if (cmdid < 0)
+	cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout);
+	if (cmdid < 0) {
+		unlock_nvmeq(nvmeq);
 		return cmdid;
+	}
 	cmd->common.command_id = cmdid;
 
 	set_current_state(TASK_KILLABLE);
-	nvme_submit_cmd(nvmeq, cmd);
+	ret = nvme_submit_cmd(nvmeq, cmd);
+	if (ret) {
+		free_cmdid(nvmeq, cmdid, NULL);
+		unlock_nvmeq(nvmeq);
+		set_current_state(TASK_RUNNING);
+		return ret;
+	}
+	unlock_nvmeq(nvmeq);
 	schedule_timeout(timeout);
 
 	if (cmdinfo.status == -EINTR) {
-		nvme_abort_command(nvmeq, cmdid);
+		nvmeq = lock_nvmeq(dev, q_idx);
+		if (nvmeq)
+			nvme_abort_command(nvmeq, cmdid);
+		unlock_nvmeq(nvmeq);
 		return -EINTR;
 	}
 
@@ -853,15 +888,20 @@ static int nvme_submit_async_cmd(struct nvme_queue *nvmeq,
 		return cmdid;
 	cmdinfo->status = -EINTR;
 	cmd->common.command_id = cmdid;
-	nvme_submit_cmd(nvmeq, cmd);
-	return 0;
+	return nvme_submit_cmd(nvmeq, cmd);
 }
 
 int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
 								u32 *result)
 {
-	return nvme_submit_sync_cmd(raw_nvmeq(dev, 0), cmd, result,
-								ADMIN_TIMEOUT);
+	return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT);
+}
+
+int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
+								u32 *result)
+{
+	return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result,
+							NVME_IO_TIMEOUT);
 }
 
 static int nvme_submit_admin_cmd_async(struct nvme_dev *dev,
@@ -1434,7 +1474,6 @@ void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 {
 	struct nvme_dev *dev = ns->dev;
-	struct nvme_queue *nvmeq;
 	struct nvme_user_io io;
 	struct nvme_command c;
 	unsigned length, meta_len;
@@ -1510,20 +1549,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 
 	length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
 
-	nvmeq = get_nvmeq(dev);
-	/*
-	 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption
-	 * disabled.  We may be preempted at any point, and be rescheduled
-	 * to a different CPU.  That will cause cacheline bouncing, but no
-	 * additional races since q_lock already protects against other CPUs.
-	 */
-	put_nvmeq(nvmeq);
 	if (length != (io.nblocks + 1) << ns->lba_shift)
 		status = -ENOMEM;
-	else if (!nvmeq || nvmeq->q_suspended)
-		status = -EBUSY;
 	else
-		status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
+		status = nvme_submit_io_cmd(dev, &c, NULL);
 
 	if (meta_len) {
 		if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) {
@@ -1597,8 +1626,7 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev,
 	if (length != cmd.data_len)
 		status = -ENOMEM;
 	else
-		status = nvme_submit_sync_cmd(raw_nvmeq(dev, 0), &c,
-						&cmd.result, timeout);
+		status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout);
 
 	if (cmd.data_len) {
 		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
index 4a0ceb64e269..e157e85bb5d7 100644
--- a/drivers/block/nvme-scsi.c
+++ b/drivers/block/nvme-scsi.c
@@ -2033,7 +2033,6 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	int res = SNTI_TRANSLATION_SUCCESS;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
-	struct nvme_queue *nvmeq;
 	u32 num_cmds;
 	struct nvme_iod *iod;
 	u64 unit_len;
@@ -2106,18 +2105,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 
 		nvme_offset += unit_num_blocks;
 
-		nvmeq = get_nvmeq(dev);
-		/*
-		 * Since nvme_submit_sync_cmd sleeps, we can't keep
-		 * preemption disabled.  We may be preempted at any
-		 * point, and be rescheduled to a different CPU.  That
-		 * will cause cacheline bouncing, but no additional
-		 * races since q_lock already protects against other
-		 * CPUs.
-		 */
-		put_nvmeq(nvmeq);
-		nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL,
-				NVME_IO_TIMEOUT);
+		nvme_sc = nvme_submit_io_cmd(dev, &c, NULL);
 		if (nvme_sc != NVME_SC_SUCCESS) {
 			nvme_unmap_user_pages(dev,
 				(is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
@@ -2644,7 +2632,6 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 {
 	int res = SNTI_TRANSLATION_SUCCESS;
 	int nvme_sc;
-	struct nvme_queue *nvmeq;
 	struct nvme_command c;
 	u8 immed, pcmod, pc, no_flush, start;
 
@@ -2671,10 +2658,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		c.common.opcode = nvme_cmd_flush;
 		c.common.nsid = cpu_to_le32(ns->ns_id);
 
-		nvmeq = get_nvmeq(ns->dev);
-		put_nvmeq(nvmeq);
-		nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
-
+		nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL);
 		res = nvme_trans_status_code(hdr, nvme_sc);
 		if (res)
 			goto out;
@@ -2697,15 +2681,12 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
 	int res = SNTI_TRANSLATION_SUCCESS;
 	int nvme_sc;
 	struct nvme_command c;
-	struct nvme_queue *nvmeq;
 
 	memset(&c, 0, sizeof(c));
 	c.common.opcode = nvme_cmd_flush;
 	c.common.nsid = cpu_to_le32(ns->ns_id);
 
-	nvmeq = get_nvmeq(ns->dev);
-	put_nvmeq(nvmeq);
-	nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
+	nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL);
 
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
@@ -2872,7 +2853,6 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	struct nvme_dev *dev = ns->dev;
 	struct scsi_unmap_parm_list *plist;
 	struct nvme_dsm_range *range;
-	struct nvme_queue *nvmeq;
 	struct nvme_command c;
 	int i, nvme_sc, res = -ENOMEM;
 	u16 ndesc, list_len;
@@ -2914,10 +2894,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	c.dsm.nr = cpu_to_le32(ndesc - 1);
 	c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 
-	nvmeq = get_nvmeq(dev);
-	put_nvmeq(nvmeq);
-
-	nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
+	nvme_sc = nvme_submit_io_cmd(dev, &c, NULL);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 
 	dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),