Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--	block/blk-mq.c	228
1 file changed, 141 insertions(+), 87 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ade8a2d1b0aa..f53779692c77 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
 			return -EBUSY;
 
 		ret = wait_event_interruptible(q->mq_freeze_wq,
-				!q->mq_freeze_depth || blk_queue_dying(q));
+				!atomic_read(&q->mq_freeze_depth) ||
+				blk_queue_dying(q));
 		if (blk_queue_dying(q))
 			return -ENODEV;
 		if (ret)
@@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
 
 void blk_mq_freeze_queue_start(struct request_queue *q)
 {
-	bool freeze;
+	int freeze_depth;
 
-	spin_lock_irq(q->queue_lock);
-	freeze = !q->mq_freeze_depth++;
-	spin_unlock_irq(q->queue_lock);
-
-	if (freeze) {
+	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
+	if (freeze_depth == 1) {
 		percpu_ref_kill(&q->mq_usage_counter);
 		blk_mq_run_hw_queues(q, false);
 	}
@@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	bool wake;
+	int freeze_depth;
 
-	spin_lock_irq(q->queue_lock);
-	wake = !--q->mq_freeze_depth;
-	WARN_ON_ONCE(q->mq_freeze_depth < 0);
-	spin_unlock_irq(q->queue_lock);
-	if (wake) {
+	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
+	WARN_ON_ONCE(freeze_depth < 0);
+	if (!freeze_depth) {
 		percpu_ref_reinit(&q->mq_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
@@ -677,8 +673,11 @@ static void blk_mq_rq_timer(unsigned long priv)
 		data.next = blk_rq_timeout(round_jiffies_up(data.next));
 		mod_timer(&q->timeout, data.next);
 	} else {
-		queue_for_each_hw_ctx(q, hctx, i)
-			blk_mq_tag_idle(hctx);
+		queue_for_each_hw_ctx(q, hctx, i) {
+			/* the hctx may be unmapped, so check it here */
+			if (blk_mq_hw_queue_mapped(hctx))
+				blk_mq_tag_idle(hctx);
+		}
 	}
 }
 
@@ -855,6 +854,16 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 		spin_lock(&hctx->lock);
 		list_splice(&rq_list, &hctx->dispatch);
 		spin_unlock(&hctx->lock);
+		/*
+		 * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
+		 * it's possible the queue is stopped and restarted again
+		 * before this. Queue restart will dispatch requests. And since
+		 * requests in rq_list aren't added into hctx->dispatch yet,
+		 * the requests in rq_list might get lost.
+		 *
+		 * blk_mq_run_hw_queue() already checks the STOPPED bit
+		 **/
+		blk_mq_run_hw_queue(hctx, true);
 	}
 }
 
@@ -1224,6 +1233,38 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	return rq;
 }
 
+static int blk_mq_direct_issue_request(struct request *rq)
+{
+	int ret;
+	struct request_queue *q = rq->q;
+	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q,
+			rq->mq_ctx->cpu);
+	struct blk_mq_queue_data bd = {
+		.rq = rq,
+		.list = NULL,
+		.last = 1
+	};
+
+	/*
+	 * For OK queue, we are done. For error, kill it. Any other
+	 * error (busy), just add it to our list as we previously
+	 * would have done
+	 */
+	ret = q->mq_ops->queue_rq(hctx, &bd);
+	if (ret == BLK_MQ_RQ_QUEUE_OK)
+		return 0;
+	else {
+		__blk_mq_requeue_request(rq);
+
+		if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
+			rq->errors = -EIO;
+			blk_mq_end_request(rq, rq->errors);
+			return 0;
+		}
+		return -1;
+	}
+}
+
 /*
  * Multiple hardware queue variant. This will not use per-process plugs,
  * but will attempt to bypass the hctx queueing if we can go straight to
@@ -1235,6 +1276,9 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
 	struct blk_map_ctx data;
 	struct request *rq;
+	unsigned int request_count = 0;
+	struct blk_plug *plug;
+	struct request *same_queue_rq = NULL;
 
 	blk_queue_bounce(q, &bio);
 
@@ -1243,6 +1287,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
+	if (!is_flush_fua && !blk_queue_nomerges(q) &&
+	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
+		return;
+
 	rq = blk_mq_map_request(q, bio, &data);
 	if (unlikely(!rq))
 		return;
@@ -1253,38 +1301,42 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		goto run_queue;
 	}
 
+	plug = current->plug;
 	/*
 	 * If the driver supports defer issued based on 'last', then
 	 * queue it up like normal since we can potentially save some
 	 * CPU this way.
 	 */
-	if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
-		struct blk_mq_queue_data bd = {
-			.rq = rq,
-			.list = NULL,
-			.last = 1
-		};
-		int ret;
+	if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
+	    !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+		struct request *old_rq = NULL;
 
 		blk_mq_bio_to_request(rq, bio);
 
-		/*
-		 * For OK queue, we are done. For error, kill it. Any other
-		 * error (busy), just add it to our list as we previously
-		 * would have done
-		 */
-		ret = q->mq_ops->queue_rq(data.hctx, &bd);
-		if (ret == BLK_MQ_RQ_QUEUE_OK)
-			goto done;
-		else {
-			__blk_mq_requeue_request(rq);
-
-			if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
-				rq->errors = -EIO;
-				blk_mq_end_request(rq, rq->errors);
-				goto done;
-			}
-		}
+		/*
+		 * we do limited pluging. If bio can be merged, do merge.
+		 * Otherwise the existing request in the plug list will be
+		 * issued. So the plug list will have one request at most
+		 */
+		if (plug) {
+			/*
+			 * The plug list might get flushed before this. If that
+			 * happens, same_queue_rq is invalid and plug list is empty
+			 **/
+			if (same_queue_rq && !list_empty(&plug->mq_list)) {
+				old_rq = same_queue_rq;
+				list_del_init(&old_rq->queuelist);
+			}
+			list_add_tail(&rq->queuelist, &plug->mq_list);
+		} else /* is_sync */
+			old_rq = rq;
+		blk_mq_put_ctx(data.ctx);
+		if (!old_rq)
+			return;
+		if (!blk_mq_direct_issue_request(old_rq))
+			return;
+		blk_mq_insert_request(old_rq, false, true, true);
+		return;
 	}
 
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1297,7 +1349,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 run_queue:
 		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
 	}
-done:
 	blk_mq_put_ctx(data.ctx);
 }
 
@@ -1309,16 +1360,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = rw_is_sync(bio->bi_rw);
 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
-	unsigned int use_plug, request_count = 0;
+	struct blk_plug *plug;
+	unsigned int request_count = 0;
 	struct blk_map_ctx data;
 	struct request *rq;
 
-	/*
-	 * If we have multiple hardware queues, just go directly to
-	 * one of those for sync IO.
-	 */
-	use_plug = !is_flush_fua && !is_sync;
-
 	blk_queue_bounce(q, &bio);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
@@ -1326,8 +1372,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
-	if (use_plug && !blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count))
+	if (!is_flush_fua && !blk_queue_nomerges(q) &&
+	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
 		return;
 
 	rq = blk_mq_map_request(q, bio, &data);
@@ -1345,21 +1391,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	 * utilize that to temporarily store requests until the task is
 	 * either done or scheduled away.
 	 */
-	if (use_plug) {
-		struct blk_plug *plug = current->plug;
-
-		if (plug) {
-			blk_mq_bio_to_request(rq, bio);
-			if (list_empty(&plug->mq_list))
-				trace_block_plug(q);
-			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
-				blk_flush_plug_list(plug, false);
-				trace_block_plug(q);
-			}
-			list_add_tail(&rq->queuelist, &plug->mq_list);
-			blk_mq_put_ctx(data.ctx);
-			return;
-		}
+	plug = current->plug;
+	if (plug) {
+		blk_mq_bio_to_request(rq, bio);
+		if (list_empty(&plug->mq_list))
+			trace_block_plug(q);
+		else if (request_count >= BLK_MAX_REQUEST_COUNT) {
+			blk_flush_plug_list(plug, false);
+			trace_block_plug(q);
+		}
+		list_add_tail(&rq->queuelist, &plug->mq_list);
+		blk_mq_put_ctx(data.ctx);
+		return;
 	}
 
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1495,7 +1538,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			i++;
 		}
 	}
-
 	return tags;
 
 fail:
@@ -1571,22 +1613,6 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
 	return NOTIFY_OK;
 }
 
-static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
-{
-	struct request_queue *q = hctx->queue;
-	struct blk_mq_tag_set *set = q->tag_set;
-
-	if (set->tags[hctx->queue_num])
-		return NOTIFY_OK;
-
-	set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
-	if (!set->tags[hctx->queue_num])
-		return NOTIFY_STOP;
-
-	hctx->tags = set->tags[hctx->queue_num];
-	return NOTIFY_OK;
-}
-
 static int blk_mq_hctx_notify(void *data, unsigned long action,
 			      unsigned int cpu)
 {
@@ -1594,12 +1620,16 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
 
 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
 		return blk_mq_hctx_cpu_offline(hctx, cpu);
-	else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
-		return blk_mq_hctx_cpu_online(hctx, cpu);
+
+	/*
+	 * In case of CPU online, tags may be reallocated
+	 * in blk_mq_map_swqueue() after mapping is updated.
+	 */
 
 	return NOTIFY_OK;
 }
 
+/* hctx->ctxs will be freed in queue's release handler */
 static void blk_mq_exit_hctx(struct request_queue *q,
 		struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
@@ -1618,7 +1648,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 
 	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 	blk_free_flush_queue(hctx->fq);
-	kfree(hctx->ctxs);
 	blk_mq_free_bitmap(&hctx->ctx_map);
 }
 
@@ -1775,6 +1804,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	unsigned int i;
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
+	struct blk_mq_tag_set *set = q->tag_set;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		cpumask_clear(hctx->cpumask);
@@ -1791,6 +1821,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
 		hctx = q->mq_ops->map_queue(q, i);
 		cpumask_set_cpu(i, hctx->cpumask);
+		cpumask_set_cpu(i, hctx->tags->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}
@@ -1803,16 +1834,20 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		 * disable it and free the request entries.
 		 */
 		if (!hctx->nr_ctx) {
-			struct blk_mq_tag_set *set = q->tag_set;
-
 			if (set->tags[i]) {
 				blk_mq_free_rq_map(set, set->tags[i], i);
 				set->tags[i] = NULL;
-				hctx->tags = NULL;
 			}
+			hctx->tags = NULL;
 			continue;
 		}
 
+		/* unmapped hw queue can be remapped after CPU topo changed */
+		if (!set->tags[i])
+			set->tags[i] = blk_mq_init_rq_map(set, i);
+		hctx->tags = set->tags[i];
+		WARN_ON(!hctx->tags);
+
 		/*
 		 * Set the map size to the number of mapped software queues.
 		 * This is more accurate and more efficient than looping
@@ -1886,8 +1921,12 @@ void blk_mq_release(struct request_queue *q)
 	unsigned int i;
 
 	/* hctx kobj stays in hctx */
-	queue_for_each_hw_ctx(q, hctx, i)
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (!hctx)
+			continue;
+		kfree(hctx->ctxs);
 		kfree(hctx);
+	}
 
 	kfree(q->queue_hw_ctx);
 
@@ -2047,7 +2086,7 @@ void blk_mq_free_queue(struct request_queue *q)
 /* Basically redo blk_mq_init_queue with queue frozen */
 static void blk_mq_queue_reinit(struct request_queue *q)
 {
-	WARN_ON_ONCE(!q->mq_freeze_depth);
+	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
 
 	blk_mq_sysfs_unregister(q);
 
@@ -2090,9 +2129,16 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 	 */
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		blk_mq_freeze_queue_start(q);
-	list_for_each_entry(q, &all_q_list, all_q_node)
+	list_for_each_entry(q, &all_q_list, all_q_node) {
 		blk_mq_freeze_queue_wait(q);
 
+		/*
+		 * timeout handler can't touch hw queue during the
+		 * reinitialization
+		 */
+		del_timer_sync(&q->timeout);
+	}
+
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		blk_mq_queue_reinit(q);
 
@@ -2157,6 +2203,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 	return 0;
 }
 
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
+{
+	return tags->cpumask;
+}
+EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
+
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
@@ -2218,8 +2270,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 	int i;
 
 	for (i = 0; i < set->nr_hw_queues; i++) {
-		if (set->tags[i])
+		if (set->tags[i]) {
 			blk_mq_free_rq_map(set, set->tags[i], i);
+			free_cpumask_var(set->tags[i]->cpumask);
+		}
 	}
 
 	kfree(set->tags);