author     Linus Torvalds <torvalds@linux-foundation.org>  2018-04-13 18:15:15 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-04-13 18:15:15 -0400
commit     edda415314804c29fa07e538938fa07947012d8f (patch)
tree       0428db94253f73bb0744f52d26645c33830756f3
parent     3e565a351ed3e94352bfbe0be06c659fc8fafb19 (diff)
parent     bb06ec31452fb2da1594f88035c2ecea4e0652f4 (diff)
Merge tag 'for-linus-20180413' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe:
 "Followup fixes for this merge window. This contains:

   - Series from Ming, fixing corner cases in our CPU <-> queue mapping.
     This triggered repeated warnings on especially s390, but I also
     hit it in cpu hot plug/unplug testing while doing IO on NVMe on
     x86-64.

   - Another fix from Ming, ensuring that we always order budget and
     driver tag identically, avoiding a deadlock on QD=1 devices.

   - Loop locking regression fix from this merge window, from Omar.

   - Another loop locking fix, this time missing an unlock, from
     Tetsuo Handa.

   - Fix for racing IO submission with device removal from Bart.

   - sr reference fix from me, fixing a case where disk change or
     getevents can race with device removal.

   - Set of nvme fixes by way of Keith, from various contributors"

* tag 'for-linus-20180413' of git://git.kernel.dk/linux-block: (28 commits)
  nvme: expand nvmf_check_if_ready checks
  nvme: Use admin command effects for admin commands
  nvmet: fix space padding in serial number
  nvme: check return value of init_srcu_struct function
  nvmet: Fix nvmet_execute_write_zeroes sector count
  nvme-pci: Separate IO and admin queue IRQ vectors
  nvme-pci: Remove unused queue parameter
  nvme-pci: Skip queue deletion if there are no queues
  nvme: target: fix buffer overflow
  nvme: don't send keep-alives to the discovery controller
  nvme: unexport nvme_start_keep_alive
  nvme-loop: fix kernel oops in case of unhandled command
  nvme: enforce 64bit offset for nvme_get_log_ext fn
  sr: get/drop reference to device in revalidate and check_events
  blk-mq: Revert "blk-mq: reimplement blk_mq_hw_queue_mapped"
  blk-mq: Avoid that submitting a bio concurrently with device removal triggers a crash
  backing: silence compiler warning using __printf
  blk-mq: remove code for dealing with remapping queue
  blk-mq: reimplement blk_mq_hw_queue_mapped
  blk-mq: don't check queue mapped in __blk_mq_delay_run_hw_queue()
  ...
-rw-r--r--   block/blk-core.c                  |  35
-rw-r--r--   block/blk-mq-cpumap.c             |   5
-rw-r--r--   block/blk-mq-debugfs.c            |   1
-rw-r--r--   block/blk-mq.c                    | 122
-rw-r--r--   drivers/block/loop.c              |  45
-rw-r--r--   drivers/nvme/host/core.c          |  33
-rw-r--r--   drivers/nvme/host/fabrics.c       |  83
-rw-r--r--   drivers/nvme/host/fabrics.h       |  33
-rw-r--r--   drivers/nvme/host/fc.c            |  12
-rw-r--r--   drivers/nvme/host/nvme.h          |   4
-rw-r--r--   drivers/nvme/host/pci.c           |  35
-rw-r--r--   drivers/nvme/host/rdma.c          |  14
-rw-r--r--   drivers/nvme/target/admin-cmd.c   |   1
-rw-r--r--   drivers/nvme/target/discovery.c   |   2
-rw-r--r--   drivers/nvme/target/io-cmd.c      |   4
-rw-r--r--   drivers/nvme/target/loop.c        |  20
-rw-r--r--   drivers/scsi/sr.c                 |  19
-rw-r--r--   include/linux/backing-dev.h       |   1
-rw-r--r--   include/linux/blk-mq.h            |   2
19 files changed, 245 insertions, 226 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index abcb8684ba67..806ce2442819 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2385,8 +2385,20 @@ blk_qc_t generic_make_request(struct bio *bio)
 	 * yet.
 	 */
 	struct bio_list bio_list_on_stack[2];
+	blk_mq_req_flags_t flags = 0;
+	struct request_queue *q = bio->bi_disk->queue;
 	blk_qc_t ret = BLK_QC_T_NONE;
 
+	if (bio->bi_opf & REQ_NOWAIT)
+		flags = BLK_MQ_REQ_NOWAIT;
+	if (blk_queue_enter(q, flags) < 0) {
+		if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT))
+			bio_wouldblock_error(bio);
+		else
+			bio_io_error(bio);
+		return ret;
+	}
+
 	if (!generic_make_request_checks(bio))
 		goto out;
 
@@ -2423,11 +2435,22 @@ blk_qc_t generic_make_request(struct bio *bio)
 	bio_list_init(&bio_list_on_stack[0]);
 	current->bio_list = bio_list_on_stack;
 	do {
-		struct request_queue *q = bio->bi_disk->queue;
-		blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ?
-			BLK_MQ_REQ_NOWAIT : 0;
+		bool enter_succeeded = true;
+
+		if (unlikely(q != bio->bi_disk->queue)) {
+			if (q)
+				blk_queue_exit(q);
+			q = bio->bi_disk->queue;
+			flags = 0;
+			if (bio->bi_opf & REQ_NOWAIT)
+				flags = BLK_MQ_REQ_NOWAIT;
+			if (blk_queue_enter(q, flags) < 0) {
+				enter_succeeded = false;
+				q = NULL;
+			}
+		}
 
-		if (likely(blk_queue_enter(q, flags) == 0)) {
+		if (enter_succeeded) {
 			struct bio_list lower, same;
 
 			/* Create a fresh bio_list for all subordinate requests */
@@ -2435,8 +2458,6 @@ blk_qc_t generic_make_request(struct bio *bio)
 			bio_list_init(&bio_list_on_stack[0]);
 			ret = q->make_request_fn(q, bio);
 
-			blk_queue_exit(q);
-
 			/* sort new bios into those for a lower level
 			 * and those for the same level
 			 */
@@ -2463,6 +2484,8 @@ blk_qc_t generic_make_request(struct bio *bio)
 	current->bio_list = NULL; /* deactivate */
 
 out:
+	if (q)
+		blk_queue_exit(q);
 	return ret;
 }
 EXPORT_SYMBOL(generic_make_request);
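
The hunks above make generic_make_request() hold a blk_queue_enter() reference on the queue it is currently submitting to for the whole submission loop, swap that reference only when a bio targets a different queue, and drop it once at the out: label, which is what closes the race with device removal. A minimal user-space sketch of that hold/swap/release pattern; the struct, queue_enter()/queue_exit() and the usage counter are stand-ins for illustration, not kernel code:

#include <stdio.h>

struct queue {
	const char *name;
	int users;			/* stands in for q_usage_counter */
};

static int queue_enter(struct queue *q)
{
	q->users++;
	return 0;			/* a dying queue would return < 0 */
}

static void queue_exit(struct queue *q)
{
	q->users--;
}

static void submit_all(struct queue **bio_queues, int nr)
{
	struct queue *q = bio_queues[0];
	int i;

	if (queue_enter(q) < 0)
		return;			/* error the bio and bail out */

	for (i = 0; i < nr; i++) {
		if (q != bio_queues[i]) {	/* bio targets another queue */
			if (q)
				queue_exit(q);
			q = bio_queues[i];
			if (queue_enter(q) < 0)
				q = NULL;
		}
		if (q)
			printf("submit to %s (users=%d)\n", q->name, q->users);
	}

	if (q)
		queue_exit(q);		/* single exit point, as in the hunk */
}

int main(void)
{
	struct queue qa = { "qa", 0 }, qb = { "qb", 0 };
	struct queue *bio_queues[] = { &qa, &qa, &qb };

	submit_all(bio_queues, 3);
	printf("qa users=%d, qb users=%d\n", qa.users, qb.users);
	return 0;
}
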
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 9f8cffc8a701..3eb169f15842 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -16,11 +16,6 @@
 
 static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
 {
-	/*
-	 * Non present CPU will be mapped to queue index 0.
-	 */
-	if (!cpu_present(cpu))
-		return 0;
 	return cpu % nr_queues;
 }
 
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 58b3b79cbe83..3080e18cb859 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -235,7 +235,6 @@ static const char *const hctx_state_name[] = {
 	HCTX_STATE_NAME(STOPPED),
 	HCTX_STATE_NAME(TAG_ACTIVE),
 	HCTX_STATE_NAME(SCHED_RESTART),
-	HCTX_STATE_NAME(START_ON_RUN),
 };
 #undef HCTX_STATE_NAME
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f5c7dbcb954f..0dc9e341c2a7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1180,7 +1180,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 		struct blk_mq_queue_data bd;
 
 		rq = list_first_entry(list, struct request, queuelist);
-		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+
+		hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+		if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
+			break;
+
+		if (!blk_mq_get_driver_tag(rq, NULL, false)) {
 			/*
 			 * The initial allocation attempt failed, so we need to
 			 * rerun the hardware queue when a tag is freed. The
@@ -1189,8 +1194,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 			 * we'll re-run it below.
 			 */
 			if (!blk_mq_mark_tag_wait(&hctx, rq)) {
-				if (got_budget)
-					blk_mq_put_dispatch_budget(hctx);
+				blk_mq_put_dispatch_budget(hctx);
 				/*
 				 * For non-shared tags, the RESTART check
 				 * will suffice.
@@ -1201,11 +1205,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 			}
 		}
 
-		if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
-			blk_mq_put_driver_tag(rq);
-			break;
-		}
-
 		list_del_init(&rq->queuelist);
 
 		bd.rq = rq;
@@ -1336,6 +1335,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	hctx_unlock(hctx, srcu_idx);
 }
 
+static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
+{
+	int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
+
+	if (cpu >= nr_cpu_ids)
+		cpu = cpumask_first(hctx->cpumask);
+	return cpu;
+}
+
 /*
  * It'd be great if the workqueue API had a way to pass
  * in a mask and had some smarts for more clever placement.
@@ -1345,26 +1353,17 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
 	bool tried = false;
+	int next_cpu = hctx->next_cpu;
 
 	if (hctx->queue->nr_hw_queues == 1)
 		return WORK_CPU_UNBOUND;
 
 	if (--hctx->next_cpu_batch <= 0) {
-		int next_cpu;
 select_cpu:
-		next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
+		next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
 				cpu_online_mask);
 		if (next_cpu >= nr_cpu_ids)
-			next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
-
-		/*
-		 * No online CPU is found, so have to make sure hctx->next_cpu
-		 * is set correctly for not breaking workqueue.
-		 */
-		if (next_cpu >= nr_cpu_ids)
-			hctx->next_cpu = cpumask_first(hctx->cpumask);
-		else
-			hctx->next_cpu = next_cpu;
+			next_cpu = blk_mq_first_mapped_cpu(hctx);
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 	}
 
@@ -1372,7 +1371,7 @@ select_cpu:
 	 * Do unbound schedule if we can't find a online CPU for this hctx,
 	 * and it should only happen in the path of handling CPU DEAD.
 	 */
-	if (!cpu_online(hctx->next_cpu)) {
+	if (!cpu_online(next_cpu)) {
 		if (!tried) {
 			tried = true;
 			goto select_cpu;
@@ -1382,18 +1381,18 @@ select_cpu:
 		 * Make sure to re-select CPU next time once after CPUs
 		 * in hctx->cpumask become online again.
 		 */
+		hctx->next_cpu = next_cpu;
 		hctx->next_cpu_batch = 1;
 		return WORK_CPU_UNBOUND;
 	}
-	return hctx->next_cpu;
+
+	hctx->next_cpu = next_cpu;
+	return next_cpu;
 }
 
 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
 					unsigned long msecs)
 {
-	if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
-		return;
-
 	if (unlikely(blk_mq_hctx_stopped(hctx)))
 		return;
 
@@ -1560,40 +1559,14 @@ static void blk_mq_run_work_fn(struct work_struct *work)
 	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
 
 	/*
-	 * If we are stopped, don't run the queue. The exception is if
-	 * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear
-	 * the STOPPED bit and run it.
+	 * If we are stopped, don't run the queue.
 	 */
-	if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
-		if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state))
-			return;
-
-		clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
+	if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
-	}
 
 	__blk_mq_run_hw_queue(hctx);
 }
 
-
-void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
-{
-	if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
-		return;
-
-	/*
-	 * Stop the hw queue, then modify currently delayed work.
-	 * This should prevent us from running the queue prematurely.
-	 * Mark the queue as auto-clearing STOPPED when it runs.
-	 */
-	blk_mq_stop_hw_queue(hctx);
-	set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
-	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
-					&hctx->run_work,
-					msecs_to_jiffies(msecs));
-}
-EXPORT_SYMBOL(blk_mq_delay_queue);
-
 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
 					    struct request *rq,
 					    bool at_head)
@@ -1804,11 +1777,11 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	if (q->elevator && !bypass_insert)
 		goto insert;
 
-	if (!blk_mq_get_driver_tag(rq, NULL, false))
+	if (!blk_mq_get_dispatch_budget(hctx))
 		goto insert;
 
-	if (!blk_mq_get_dispatch_budget(hctx)) {
-		blk_mq_put_driver_tag(rq);
+	if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+		blk_mq_put_dispatch_budget(hctx);
 		goto insert;
 	}
 
@@ -2356,7 +2329,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
 
 static void blk_mq_map_swqueue(struct request_queue *q)
 {
-	unsigned int i, hctx_idx;
+	unsigned int i;
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
 	struct blk_mq_tag_set *set = q->tag_set;
@@ -2373,23 +2346,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
 	/*
 	 * Map software to hardware queues.
-	 *
-	 * If the cpu isn't present, the cpu is mapped to first hctx.
 	 */
 	for_each_possible_cpu(i) {
-		hctx_idx = q->mq_map[i];
-		/* unmapped hw queue can be remapped after CPU topo changed */
-		if (!set->tags[hctx_idx] &&
-		    !__blk_mq_alloc_rq_map(set, hctx_idx)) {
-			/*
-			 * If tags initialization fail for some hctx,
-			 * that hctx won't be brought online. In this
-			 * case, remap the current ctx to hctx[0] which
-			 * is guaranteed to always have tags allocated
-			 */
-			q->mq_map[i] = 0;
-		}
-
 		ctx = per_cpu_ptr(q->queue_ctx, i);
 		hctx = blk_mq_map_queue(q, i);
 
@@ -2401,21 +2359,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	mutex_unlock(&q->sysfs_lock);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		/*
-		 * If no software queues are mapped to this hardware queue,
-		 * disable it and free the request entries.
-		 */
-		if (!hctx->nr_ctx) {
-			/* Never unmap queue 0. We need it as a
-			 * fallback in case of a new remap fails
-			 * allocation
-			 */
-			if (i && set->tags[i])
-				blk_mq_free_map_and_requests(set, i);
-
-			hctx->tags = NULL;
-			continue;
-		}
+		/* every hctx should get mapped by at least one CPU */
+		WARN_ON(!hctx->nr_ctx);
 
 		hctx->tags = set->tags[i];
 		WARN_ON(!hctx->tags);
@@ -2430,8 +2375,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		/*
 		 * Initialize batch roundrobin counts
 		 */
-		hctx->next_cpu = cpumask_first_and(hctx->cpumask,
-				cpu_online_mask);
+		hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 	}
 }
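
These blk-mq hunks make every dispatch path take the dispatch budget before the driver tag and release whatever was taken in reverse order on failure, which is the consistent ordering the merge message credits with avoiding the QD=1 deadlock. A minimal user-space sketch of that acquire-in-order / release-in-reverse rule, with plain counters standing in for the budget and tag pools (illustrative only, not kernel code):

#include <stdbool.h>
#include <stdio.h>

static int budget_left = 1;	/* e.g. a queue-depth-1 device */
static int tags_left = 1;

static bool get_budget(void)
{
	if (!budget_left)
		return false;
	budget_left--;
	return true;
}

static void put_budget(void)
{
	budget_left++;
}

static bool get_driver_tag(void)
{
	if (!tags_left)
		return false;
	tags_left--;
	return true;
}

static bool dispatch_one(void)
{
	if (!get_budget())
		return false;		/* nothing held, nothing to undo */
	if (!get_driver_tag()) {
		put_budget();		/* undo in reverse order */
		return false;
	}
	/* ->queue_rq() would run here */
	return true;
}

int main(void)
{
	printf("first dispatch:  %s\n", dispatch_one() ? "issued" : "deferred");
	printf("second dispatch: %s\n", dispatch_one() ? "issued" : "deferred");
	return 0;
}
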
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 264abaaff662..c9d04497a415 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1103,11 +1103,15 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	if (info->lo_encrypt_type) {
 		unsigned int type = info->lo_encrypt_type;
 
-		if (type >= MAX_LO_CRYPT)
-			return -EINVAL;
+		if (type >= MAX_LO_CRYPT) {
+			err = -EINVAL;
+			goto exit;
+		}
 		xfer = xfer_funcs[type];
-		if (xfer == NULL)
-			return -EINVAL;
+		if (xfer == NULL) {
+			err = -EINVAL;
+			goto exit;
+		}
 	} else
 		xfer = NULL;
 
@@ -1283,12 +1287,13 @@ static int
 loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
 	struct loop_info info;
 	struct loop_info64 info64;
-	int err = 0;
+	int err;
 
-	if (!arg)
-		err = -EINVAL;
-	if (!err)
-		err = loop_get_status(lo, &info64);
+	if (!arg) {
+		mutex_unlock(&lo->lo_ctl_mutex);
+		return -EINVAL;
+	}
+	err = loop_get_status(lo, &info64);
 	if (!err)
 		err = loop_info64_to_old(&info64, &info);
 	if (!err && copy_to_user(arg, &info, sizeof(info)))
@@ -1300,12 +1305,13 @@ loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
 static int
 loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
 	struct loop_info64 info64;
-	int err = 0;
+	int err;
 
-	if (!arg)
-		err = -EINVAL;
-	if (!err)
-		err = loop_get_status(lo, &info64);
+	if (!arg) {
+		mutex_unlock(&lo->lo_ctl_mutex);
+		return -EINVAL;
+	}
+	err = loop_get_status(lo, &info64);
 	if (!err && copy_to_user(arg, &info64, sizeof(info64)))
 		err = -EFAULT;
 
@@ -1529,12 +1535,13 @@ loop_get_status_compat(struct loop_device *lo,
 		       struct compat_loop_info __user *arg)
 {
 	struct loop_info64 info64;
-	int err = 0;
+	int err;
 
-	if (!arg)
-		err = -EINVAL;
-	if (!err)
-		err = loop_get_status(lo, &info64);
+	if (!arg) {
+		mutex_unlock(&lo->lo_ctl_mutex);
+		return -EINVAL;
+	}
+	err = loop_get_status(lo, &info64);
 	if (!err)
 		err = loop_info64_to_compat(&info64, arg);
 	return err;
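
The loop hunks above restore the rule that these helpers, which run with lo_ctl_mutex held, must drop the mutex on every return path: the !arg bail-outs now unlock before returning, and loop_set_status() routes its early errors through a single exit label. A minimal user-space sketch of that convention, with a pthread mutex and a made-up helper standing in for the driver's code (illustrative only):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ctl_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Called with ctl_mutex held; must release it before returning. */
static int get_status(const int *arg)
{
	int err;

	if (!arg) {
		pthread_mutex_unlock(&ctl_mutex);	/* do not leak the lock */
		return -EINVAL;
	}

	err = 0;			/* ... do the real work ... */
	pthread_mutex_unlock(&ctl_mutex);
	return err;
}

int main(void)
{
	pthread_mutex_lock(&ctl_mutex);
	printf("status: %d\n", get_status(NULL));
	return 0;
}
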
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 197a6ba9700f..9df4f71e58ca 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -376,6 +376,15 @@ static void nvme_put_ns(struct nvme_ns *ns)
 	kref_put(&ns->kref, nvme_free_ns);
 }
 
+static inline void nvme_clear_nvme_request(struct request *req)
+{
+	if (!(req->rq_flags & RQF_DONTPREP)) {
+		nvme_req(req)->retries = 0;
+		nvme_req(req)->flags = 0;
+		req->rq_flags |= RQF_DONTPREP;
+	}
+}
+
 struct request *nvme_alloc_request(struct request_queue *q,
 		struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
 {
@@ -392,6 +401,7 @@ struct request *nvme_alloc_request(struct request_queue *q,
 		return req;
 
 	req->cmd_flags |= REQ_FAILFAST_DRIVER;
+	nvme_clear_nvme_request(req);
 	nvme_req(req)->cmd = cmd;
 
 	return req;
@@ -608,11 +618,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 {
 	blk_status_t ret = BLK_STS_OK;
 
-	if (!(req->rq_flags & RQF_DONTPREP)) {
-		nvme_req(req)->retries = 0;
-		nvme_req(req)->flags = 0;
-		req->rq_flags |= RQF_DONTPREP;
-	}
+	nvme_clear_nvme_request(req);
 
 	switch (req_op(req)) {
 	case REQ_OP_DRV_IN:
@@ -742,6 +748,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 		return PTR_ERR(req);
 
 	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
+	nvme_req(req)->flags |= NVME_REQ_USERCMD;
 
 	if (ubuffer && bufflen) {
 		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
@@ -826,7 +833,7 @@ static void nvme_keep_alive_work(struct work_struct *work)
 	}
 }
 
-void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
+static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
 {
 	if (unlikely(ctrl->kato == 0))
 		return;
@@ -836,7 +843,6 @@ void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
 	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
 	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 }
-EXPORT_SYMBOL_GPL(nvme_start_keep_alive);
 
 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
 {
@@ -1103,7 +1109,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	}
 
 	if (ctrl->effects)
-		effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
+		effects = le32_to_cpu(ctrl->effects->acs[opcode]);
 	else
 		effects = nvme_known_admin_effects(opcode);
 
@@ -2220,7 +2226,7 @@ out_unlock:
 
 int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 		     u8 log_page, void *log,
-		     size_t size, size_t offset)
+		     size_t size, u64 offset)
 {
 	struct nvme_command c = { };
 	unsigned long dwlen = size / 4 - 1;
@@ -2235,8 +2241,8 @@ int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	c.get_log_page.lid = log_page;
 	c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
 	c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
-	c.get_log_page.lpol = cpu_to_le32(offset & ((1ULL << 32) - 1));
-	c.get_log_page.lpou = cpu_to_le32(offset >> 32ULL);
+	c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
+	c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
 
 	return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
 }
@@ -2833,7 +2839,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
 		goto out_free_head;
 	head->instance = ret;
 	INIT_LIST_HEAD(&head->list);
-	init_srcu_struct(&head->srcu);
+	ret = init_srcu_struct(&head->srcu);
+	if (ret)
+		goto out_ida_remove;
 	head->subsys = ctrl->subsys;
 	head->ns_id = nsid;
 	kref_init(&head->ref);
@@ -2855,6 +2863,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
 	return head;
 out_cleanup_srcu:
 	cleanup_srcu_struct(&head->srcu);
+out_ida_remove:
 	ida_simple_remove(&ctrl->subsys->ns_ida, head->instance);
 out_free_head:
 	kfree(head);
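
Two of the core.c hunks widen the log-page offset to a u64 and split it into the 32-bit lpol/lpou fields. A minimal user-space sketch of that split, with local lower_32_bits()/upper_32_bits() helpers mirroring the kernel macros (illustrative only); a size_t offset would silently truncate on 32-bit builds, which is what the prototype change prevents:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static inline uint32_t lower_32_bits(uint64_t v) { return (uint32_t)v; }
static inline uint32_t upper_32_bits(uint64_t v) { return (uint32_t)(v >> 32); }

int main(void)
{
	uint64_t offset = 0x123456789abcdef0ULL;	/* example log-page offset */

	/* These are the values that land in lpol and lpou. */
	printf("lpol=0x%08" PRIx32 " lpou=0x%08" PRIx32 "\n",
	       lower_32_bits(offset), upper_32_bits(offset));
	return 0;
}
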
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 8f0f34d06d46..124c458806df 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -536,6 +536,85 @@ static struct nvmf_transport_ops *nvmf_lookup_transport(
 	return NULL;
 }
 
+blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq,
+		bool queue_live, bool is_connected)
+{
+	struct nvme_command *cmd = nvme_req(rq)->cmd;
+
+	if (likely(ctrl->state == NVME_CTRL_LIVE && is_connected))
+		return BLK_STS_OK;
+
+	switch (ctrl->state) {
+	case NVME_CTRL_DELETING:
+		goto reject_io;
+
+	case NVME_CTRL_NEW:
+	case NVME_CTRL_CONNECTING:
+		if (!is_connected)
+			/*
+			 * This is the case of starting a new
+			 * association but connectivity was lost
+			 * before it was fully created. We need to
+			 * error the commands used to initialize the
+			 * controller so the reconnect can go into a
+			 * retry attempt. The commands should all be
+			 * marked REQ_FAILFAST_DRIVER, which will hit
+			 * the reject path below. Anything else will
+			 * be queued while the state settles.
+			 */
+			goto reject_or_queue_io;
+
+		if ((queue_live &&
+		     !(nvme_req(rq)->flags & NVME_REQ_USERCMD)) ||
+		    (!queue_live && blk_rq_is_passthrough(rq) &&
+		     cmd->common.opcode == nvme_fabrics_command &&
+		     cmd->fabrics.fctype == nvme_fabrics_type_connect))
+			/*
+			 * If queue is live, allow only commands that
+			 * are internally generated pass through. These
+			 * are commands on the admin queue to initialize
+			 * the controller. This will reject any ioctl
+			 * admin cmds received while initializing.
+			 *
+			 * If the queue is not live, allow only a
+			 * connect command. This will reject any ioctl
+			 * admin cmd as well as initialization commands
+			 * if the controller reverted the queue to non-live.
+			 */
+			return BLK_STS_OK;
+
+		/*
+		 * fall-thru to the reject_or_queue_io clause
+		 */
+		break;
+
+	/* these cases fall-thru
+	 * case NVME_CTRL_LIVE:
+	 * case NVME_CTRL_RESETTING:
+	 */
+	default:
+		break;
+	}
+
+reject_or_queue_io:
+	/*
+	 * Any other new io is something we're not in a state to send
+	 * to the device. Default action is to busy it and retry it
+	 * after the controller state is recovered. However, anything
+	 * marked for failfast or nvme multipath is immediately failed.
+	 * Note: commands used to initialize the controller will be
+	 * marked for failfast.
+	 * Note: nvme cli/ioctl commands are marked for failfast.
+	 */
+	if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
+		return BLK_STS_RESOURCE;
+
+reject_io:
+	nvme_req(rq)->status = NVME_SC_ABORT_REQ;
+	return BLK_STS_IOERR;
+}
+EXPORT_SYMBOL_GPL(nvmf_check_if_ready);
+
 static const match_table_t opt_tokens = {
 	{ NVMF_OPT_TRANSPORT, "transport=%s" },
 	{ NVMF_OPT_TRADDR, "traddr=%s" },
@@ -608,8 +687,10 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 			opts->discovery_nqn =
 				!(strcmp(opts->subsysnqn,
 					NVME_DISC_SUBSYS_NAME));
-			if (opts->discovery_nqn)
+			if (opts->discovery_nqn) {
+				opts->kato = 0;
 				opts->nr_io_queues = 0;
+			}
 			break;
 		case NVMF_OPT_TRADDR:
 			p = match_strdup(args);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index a3145d90c1d2..ef46c915b7b5 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -157,36 +157,7 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
 void nvmf_free_options(struct nvmf_ctrl_options *opts);
 int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
 bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
-
-static inline blk_status_t nvmf_check_init_req(struct nvme_ctrl *ctrl,
-		struct request *rq)
-{
-	struct nvme_command *cmd = nvme_req(rq)->cmd;
-
-	/*
-	 * We cannot accept any other command until the connect command has
-	 * completed, so only allow connect to pass.
-	 */
-	if (!blk_rq_is_passthrough(rq) ||
-	    cmd->common.opcode != nvme_fabrics_command ||
-	    cmd->fabrics.fctype != nvme_fabrics_type_connect) {
-		/*
-		 * Connecting state means transport disruption or initial
-		 * establishment, which can take a long time and even might
-		 * fail permanently, fail fast to give upper layers a chance
-		 * to failover.
-		 * Deleting state means that the ctrl will never accept commands
-		 * again, fail it permanently.
-		 */
-		if (ctrl->state == NVME_CTRL_CONNECTING ||
-		    ctrl->state == NVME_CTRL_DELETING) {
-			nvme_req(rq)->status = NVME_SC_ABORT_REQ;
-			return BLK_STS_IOERR;
-		}
-		return BLK_STS_RESOURCE; /* try again later */
-	}
-
-	return BLK_STS_OK;
-}
+blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl,
+	struct request *rq, bool queue_live, bool is_connected);
 
 #endif /* _NVME_FABRICS_H */
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index c6e719b2f3ca..6cb26bcf6ec0 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2277,14 +2277,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
 	return BLK_STS_OK;
 }
 
-static inline blk_status_t nvme_fc_is_ready(struct nvme_fc_queue *queue,
-		struct request *rq)
-{
-	if (unlikely(!test_bit(NVME_FC_Q_LIVE, &queue->flags)))
-		return nvmf_check_init_req(&queue->ctrl->ctrl, rq);
-	return BLK_STS_OK;
-}
-
 static blk_status_t
 nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
 		 const struct blk_mq_queue_data *bd)
@@ -2300,7 +2292,9 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
 	u32 data_len;
 	blk_status_t ret;
 
-	ret = nvme_fc_is_ready(queue, rq);
+	ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq,
+		test_bit(NVME_FC_Q_LIVE, &queue->flags),
+		ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE);
 	if (unlikely(ret))
 		return ret;
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cf93690b3ffc..061fecfd44f5 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -105,6 +105,7 @@ struct nvme_request {
 
 enum {
 	NVME_REQ_CANCELLED		= (1 << 0),
+	NVME_REQ_USERCMD		= (1 << 1),
 };
 
 static inline struct nvme_request *nvme_req(struct request *req)
@@ -422,7 +423,6 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		unsigned timeout, int qid, int at_head,
 		blk_mq_req_flags_t flags);
 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
-void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
@@ -430,7 +430,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
 int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
 
 int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
-		u8 log_page, void *log, size_t size, size_t offset);
+		u8 log_page, void *log, size_t size, u64 offset);
 
 extern const struct attribute_group nvme_ns_id_attr_group;
 extern const struct block_device_operations nvme_ns_head_ops;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 295fbec1e5f2..fbc71fac6f1e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -84,6 +84,7 @@ struct nvme_dev {
 	struct dma_pool *prp_small_pool;
 	unsigned online_queues;
 	unsigned max_qid;
+	unsigned int num_vecs;
 	int q_depth;
 	u32 db_stride;
 	void __iomem *bar;
@@ -414,7 +415,8 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 {
 	struct nvme_dev *dev = set->driver_data;
 
-	return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), 0);
+	return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev),
+			dev->num_vecs > 1 ? 1 /* admin queue */ : 0);
 }
 
 /**
@@ -1380,8 +1382,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
 	return 0;
 }
 
-static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
-		int depth, int node)
+static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
 {
 	struct nvme_queue *nvmeq = &dev->queues[qid];
 
@@ -1457,7 +1458,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 		nvmeq->sq_cmds_io = dev->cmb + offset;
 	}
 
-	nvmeq->cq_vector = qid - 1;
+	/*
+	 * A queue's vector matches the queue identifier unless the controller
+	 * has only one vector available.
+	 */
+	nvmeq->cq_vector = dev->num_vecs == 1 ? 0 : qid;
 	result = adapter_alloc_cq(dev, qid, nvmeq);
 	if (result < 0)
 		goto release_vector;
@@ -1596,8 +1601,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 	if (result < 0)
 		return result;
 
-	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
-			dev_to_node(dev->dev));
+	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
 	if (result)
 		return result;
 
@@ -1630,9 +1634,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
 	int ret = 0;
 
 	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
-		/* vector == qid - 1, match nvme_create_queue */
-		if (nvme_alloc_queue(dev, i, dev->q_depth,
-		     pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
+		if (nvme_alloc_queue(dev, i, dev->q_depth)) {
 			ret = -ENOMEM;
 			break;
 		}
@@ -1914,6 +1916,10 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	int result, nr_io_queues;
 	unsigned long size;
 
+	struct irq_affinity affd = {
+		.pre_vectors = 1
+	};
+
 	nr_io_queues = num_possible_cpus();
 	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
 	if (result < 0)
@@ -1949,11 +1955,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	 * setting up the full range we need.
 	 */
 	pci_free_irq_vectors(pdev);
-	nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues,
-			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY);
-	if (nr_io_queues <= 0)
+	result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1,
+			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
+	if (result <= 0)
 		return -EIO;
-	dev->max_qid = nr_io_queues;
+	dev->num_vecs = result;
+	dev->max_qid = max(result - 1, 1);
 
 	/*
 	 * Should investigate if there's a performance win from allocating
@@ -2201,7 +2208,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 
 	nvme_stop_queues(&dev->ctrl);
 
-	if (!dead) {
+	if (!dead && dev->ctrl.queue_count > 0) {
 		/*
 		 * If the controller is still alive tell it to stop using the
 		 * host memory buffer. In theory the shutdown / reset should
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 758537e9ba07..1eb4438a8763 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1601,17 +1601,6 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
 	return BLK_EH_HANDLED;
 }
 
-/*
- * We cannot accept any other command until the Connect command has completed.
- */
-static inline blk_status_t
-nvme_rdma_is_ready(struct nvme_rdma_queue *queue, struct request *rq)
-{
-	if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags)))
-		return nvmf_check_init_req(&queue->ctrl->ctrl, rq);
-	return BLK_STS_OK;
-}
-
 static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
 {
@@ -1627,7 +1616,8 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	WARN_ON_ONCE(rq->tag < 0);
 
-	ret = nvme_rdma_is_ready(queue, rq);
+	ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq,
+		test_bit(NVME_RDMA_Q_LIVE, &queue->flags), true);
 	if (unlikely(ret))
 		return ret;
 
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 90dcdc40ac71..5e0e9fcc0d4d 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -178,6 +178,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 	id->vid = 0;
 	id->ssvid = 0;
 
+	memset(id->sn, ' ', sizeof(id->sn));
 	bin2hex(id->sn, &ctrl->subsys->serial,
 		min(sizeof(ctrl->subsys->serial), sizeof(id->sn) / 2));
 	memcpy_and_pad(id->mn, sizeof(id->mn), model, sizeof(model) - 1, ' ');
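
The added memset() pre-fills the ASCII serial-number field with spaces because bin2hex() only overwrites 2 * min(sizeof(serial), sizeof(id->sn) / 2) characters, and the NVMe spec expects ASCII fields to be space-padded. A minimal user-space sketch of that padding, with a local bin2hex() mimicking the kernel helper and example sizes (8-byte serial, 20-byte field) chosen for illustration:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void bin2hex(char *dst, const uint8_t *src, size_t count)
{
	static const char hex[] = "0123456789abcdef";

	while (count--) {
		*dst++ = hex[*src >> 4];
		*dst++ = hex[*src++ & 0xf];
	}
}

int main(void)
{
	uint8_t serial[8] = { 0xde, 0xad, 0xbe, 0xef, 0x00, 0x11, 0x22, 0x33 };
	char sn[20];

	memset(sn, ' ', sizeof(sn));		/* space-pad the whole field */
	bin2hex(sn, serial, sizeof(serial));	/* fills only the first 16 bytes */

	printf("[%.*s]\n", (int)sizeof(sn), sn);
	return 0;
}
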
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index a72425d8bce0..231e04e0a496 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -59,7 +59,7 @@ static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr,
 	memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE);
 	memcpy(e->traddr, traddr, NVMF_TRADDR_SIZE);
 	memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE);
-	memcpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
+	strncpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
 }
 
 /*
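
The switch from memcpy() to strncpy() matters because the discovery entry's subnqn field is a fixed NVMF_NQN_SIZE buffer while the source NQN is a NUL-terminated string that is usually shorter: memcpy() of the full field size keeps reading past the end of the source, whereas strncpy() stops at the NUL and zero-fills the rest. A minimal user-space sketch (illustrative only; NVMF_NQN_SIZE is defined locally here just for the demo):

#include <stdio.h>
#include <string.h>

#define NVMF_NQN_SIZE 223	/* local stand-in for the demo */

int main(void)
{
	const char *subsys_nqn = "nqn.2014-08.org.nvmexpress.discovery";
	char subnqn[NVMF_NQN_SIZE];

	strncpy(subnqn, subsys_nqn, NVMF_NQN_SIZE);

	/* Everything after the copied string is zero-filled. */
	printf("copied \"%s\", last byte = %d\n",
	       subnqn, subnqn[NVMF_NQN_SIZE - 1]);
	return 0;
}
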
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index 28bbdff4a88b..cd2344179673 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -173,8 +173,8 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req)
 
 	sector = le64_to_cpu(write_zeroes->slba) <<
 			(req->ns->blksize_shift - 9);
-	nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length)) <<
-			(req->ns->blksize_shift - 9)) + 1;
+	nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) <<
+			(req->ns->blksize_shift - 9));
 
 	if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
 			GFP_KERNEL, &bio, 0))
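
This hunk moves the +1 inside the shift: the Write Zeroes length field is a 0's-based count of logical blocks, so the increment has to happen before the block-to-512-byte-sector conversion, not after. A small user-space check of the two formulas with made-up example values (illustrative only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t length = 7;			/* 0's-based: 8 logical blocks */
	unsigned int blksize_shift = 12;	/* 4096-byte blocks */
	unsigned int shift = blksize_shift - 9;

	uint64_t old_nr = (length << shift) + 1;	/* 57 sectors: wrong */
	uint64_t new_nr = (length + 1) << shift;	/* 64 sectors: 8 * 4096 / 512 */

	printf("old=%llu new=%llu\n",
	       (unsigned long long)old_nr, (unsigned long long)new_nr);
	return 0;
}
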
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index a350765d2d5c..31fdfba556a8 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -149,14 +149,6 @@ nvme_loop_timeout(struct request *rq, bool reserved)
 	return BLK_EH_HANDLED;
 }
 
-static inline blk_status_t nvme_loop_is_ready(struct nvme_loop_queue *queue,
-		struct request *rq)
-{
-	if (unlikely(!test_bit(NVME_LOOP_Q_LIVE, &queue->flags)))
-		return nvmf_check_init_req(&queue->ctrl->ctrl, rq);
-	return BLK_STS_OK;
-}
-
 static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
 {
@@ -166,7 +158,8 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
 	blk_status_t ret;
 
-	ret = nvme_loop_is_ready(queue, req);
+	ret = nvmf_check_if_ready(&queue->ctrl->ctrl, req,
+		test_bit(NVME_LOOP_Q_LIVE, &queue->flags), true);
 	if (unlikely(ret))
 		return ret;
 
@@ -174,15 +167,12 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (ret)
 		return ret;
 
+	blk_mq_start_request(req);
 	iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
 	iod->req.port = nvmet_loop_port;
 	if (!nvmet_req_init(&iod->req, &queue->nvme_cq,
-			&queue->nvme_sq, &nvme_loop_ops)) {
-		nvme_cleanup_cmd(req);
-		blk_mq_start_request(req);
-		nvme_loop_queue_response(&iod->req);
+			&queue->nvme_sq, &nvme_loop_ops))
 		return BLK_STS_OK;
-	}
 
 	if (blk_rq_payload_bytes(req)) {
 		iod->sg_table.sgl = iod->first_sgl;
@@ -196,8 +186,6 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		iod->req.transfer_len = blk_rq_payload_bytes(req);
 	}
 
-	blk_mq_start_request(req);
-
 	schedule_work(&iod->work);
 	return BLK_STS_OK;
 }
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 0cf25d789d05..3f3cb72e0c0c 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -587,18 +587,28 @@ out:
 static unsigned int sr_block_check_events(struct gendisk *disk,
 					  unsigned int clearing)
 {
-	struct scsi_cd *cd = scsi_cd(disk);
+	unsigned int ret = 0;
+	struct scsi_cd *cd;
 
-	if (atomic_read(&cd->device->disk_events_disable_depth))
+	cd = scsi_cd_get(disk);
+	if (!cd)
 		return 0;
 
-	return cdrom_check_events(&cd->cdi, clearing);
+	if (!atomic_read(&cd->device->disk_events_disable_depth))
+		ret = cdrom_check_events(&cd->cdi, clearing);
+
+	scsi_cd_put(cd);
+	return ret;
 }
 
 static int sr_block_revalidate_disk(struct gendisk *disk)
 {
-	struct scsi_cd *cd = scsi_cd(disk);
 	struct scsi_sense_hdr sshdr;
+	struct scsi_cd *cd;
+
+	cd = scsi_cd_get(disk);
+	if (!cd)
+		return -ENXIO;
 
 	/* if the unit is not ready, nothing more to do */
 	if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr))
@@ -607,6 +617,7 @@ static int sr_block_revalidate_disk(struct gendisk *disk)
 	sr_cd_check(&cd->cdi);
 	get_sectorsize(cd);
 out:
+	scsi_cd_put(cd);
 	return 0;
 }
 
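
Both sr hunks wrap the work in a get/put pair so a racing device removal cannot free the scsi_cd while check_events or revalidate is still using it. A minimal user-space sketch of that take-a-reference-before-use pattern, with a plain counter and made-up helpers standing in for scsi_cd_get()/scsi_cd_put() (illustrative only):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct cd {
	int refcount;
	bool removed;
};

static struct cd *cd_get(struct cd *cd)
{
	if (!cd || cd->removed)
		return NULL;		/* device already gone */
	cd->refcount++;
	return cd;
}

static void cd_put(struct cd *cd)
{
	cd->refcount--;
}

static int check_events(struct cd *disk)
{
	struct cd *cd = cd_get(disk);
	int ret;

	if (!cd)
		return 0;

	/* Safe to touch the device while the reference is held. */
	ret = 1;
	cd_put(cd);
	return ret;
}

int main(void)
{
	struct cd drive = { .refcount = 1, .removed = false };

	printf("events: %d, refcount back to %d\n",
	       check_events(&drive), drive.refcount);
	return 0;
}
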
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 09da0f124699..f6be4b0b6c18 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -28,6 +28,7 @@ void bdi_put(struct backing_dev_info *bdi);
 
 __printf(2, 3)
 int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...);
+__printf(2, 0)
 int bdi_register_va(struct backing_dev_info *bdi, const char *fmt,
 		    va_list args);
 int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner);
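
The added __printf(2, 0) expands to __attribute__((format(printf, 2, 0))): argument 2 is a printf-style format whose arguments arrive through a va_list, so format checking still flows through the wrapper and the compiler warning mentioned in the "backing: silence compiler warning using __printf" shortlog entry goes away. A minimal user-space sketch with stand-in functions (register_va/register_dev are hypothetical names, not the bdi API):

#include <stdarg.h>
#include <stdio.h>

__attribute__((format(printf, 2, 0)))
static int register_va(void *dev, const char *fmt, va_list args)
{
	(void)dev;
	return vprintf(fmt, args);	/* forwards the checked format */
}

__attribute__((format(printf, 2, 3)))
static int register_dev(void *dev, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = register_va(dev, fmt, args);
	va_end(args);
	return ret;
}

int main(void)
{
	return register_dev(NULL, "bdi-%d\n", 7) < 0;
}
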
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8efcf49796a3..e3986f4b3461 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -183,7 +183,6 @@ enum {
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
-	BLK_MQ_S_START_ON_RUN	= 3,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
 
@@ -270,7 +269,6 @@ void blk_mq_unquiesce_queue(struct request_queue *q);
 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
-void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 		busy_tag_iter_fn *fn, void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);