author     Linus Torvalds <torvalds@linux-foundation.org>  2014-06-02 12:29:34 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-06-02 12:29:34 -0400
commit     681a2895486243a82547d8c9f53043eb54b53da0 (patch)
tree       464273280aed6db55a99cc0d8614d4393f94fc48
parent     6c52486dedbb30a1313da64945dcd686b4579c51 (diff)
parent     ed851860b4552fc8963ecf71eab9f6f7a5c19d74 (diff)
Merge branch 'for-3.16/core' of git://git.kernel.dk/linux-block into next
Pull block core updates from Jens Axboe:
 "It's a big(ish) round this time, lots of development effort has gone
  into blk-mq in the last 3 months.  Generally we're heading to where
  3.16 will be a feature complete and performant blk-mq.  scsi-mq is
  progressing nicely and will hopefully be in 3.17.  An NVMe port is in
  progress, and the Micron pci-e flash driver, mtip32xx, is converted
  and will be sent in with the driver pull request for 3.16.

  This pull request contains:

   - Lots of prep and support patches for scsi-mq have been integrated.
     All from Christoph.

   - API and code cleanups for blk-mq from Christoph.

   - Lots of good corner case and error handling cleanup fixes for
     blk-mq from Ming Lei.

   - A slew of blk-mq updates from me:

     * Provide strict mappings so that the driver can rely on the CPU
       to queue mapping.  This enables optimizations in the driver.

     * Provide bitmap tagging instead of percpu_ida, which never really
       worked well for blk-mq.  percpu_ida relies on the fact that we
       have a lot more tags available than we really need; it fails
       miserably for cases where we exhaust (or are close to
       exhausting) the tag space.

     * Provide sane support for shared tag maps, as utilized by scsi-mq.

     * Various fixes for IO timeouts.

     * API cleanups, and lots of perf tweaks and optimizations.

   - Remove 'buffer' from struct request.  This is ancient code, from
     when requests were always virtually mapped.  Kill it, to reclaim
     some space in struct request.  From me.

   - Remove 'magic' from blk_plug.  Since we store these on the stack
     and since we've never caught any actual bugs with this, let's just
     get rid of it.  From me.

   - Only call part_in_flight() once for IO completion, as it includes
     two atomic reads.  Hopefully we'll get a better implementation
     soon, as the part IO stats are now one of the more expensive parts
     of doing IO on blk-mq.  From me.

   - File migration of block code from {mm,fs}/ to block/.  This
     includes bio.c, bio-integrity.c, bounce.c, and ioprio.c.  From me,
     from a discussion on lkml.

  That should describe the meat of the pull request.  Also has various
  little fixes and cleanups from Dave Jones, Shaohua Li, Duan Jiong,
  Fengguang Wu, Fabian Frederick, Randy Dunlap, Robert Elliott, and Sam
  Bradshaw"

* 'for-3.16/core' of git://git.kernel.dk/linux-block: (100 commits)
  blk-mq: push IPI or local end_io decision to __blk_mq_complete_request()
  blk-mq: remember to start timeout handler for direct queue
  block: ensure that the timer is always added
  blk-mq: blk_mq_unregister_hctx() can be static
  blk-mq: make the sysfs mq/ layout reflect current mappings
  blk-mq: blk_mq_tag_to_rq should handle flush request
  block: remove dead code in scsi_ioctl:blk_verify_command
  blk-mq: request initialization optimizations
  block: add queue flag for disabling SG merging
  block: remove 'magic' from struct blk_plug
  blk-mq: remove alloc_hctx and free_hctx methods
  blk-mq: add file comments and update copyright notices
  blk-mq: remove blk_mq_alloc_request_pinned
  blk-mq: do not use blk_mq_alloc_request_pinned in blk_mq_map_request
  blk-mq: remove blk_mq_wait_for_tags
  blk-mq: initialize request in __blk_mq_alloc_request
  blk-mq: merge blk_mq_alloc_reserved_request into blk_mq_alloc_request
  blk-mq: add helper to insert requests from irq context
  blk-mq: remove stale comment for blk_mq_complete_request()
  blk-mq: allow non-softirq completions
  ...
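The bitmap tagging change described above is the core of the blk-mq rework in
this pull.  As a rough illustration of the idea, here is a minimal user-space
sketch (hypothetical code, not the kernel implementation: the names get_tag
and put_tag, the single-word tag_word map, and TAG_DEPTH are invented for
brevity).  A tag is just a bit, cleared means free and set means busy, and
each submitter keeps a cached starting offset so concurrent callers tend to
probe different bits instead of contending on one shared counter.

    /*
     * Minimal sketch of bitmap tag allocation (assumption: a single word
     * of tags; the kernel code uses an array of cache-line-aligned words).
     */
    #include <stdatomic.h>

    #define TAG_DEPTH (8 * sizeof(unsigned long))

    static _Atomic unsigned long tag_word;      /* one bit per tag */

    static int get_tag(unsigned int *last_tag)
    {
            unsigned int i, tag;

            for (i = 0; i < TAG_DEPTH; i++) {
                    /* start probing at the caller's cached hint */
                    tag = (*last_tag + i) % TAG_DEPTH;
                    /* atomically claim the bit; move on if it was already set */
                    if (!(atomic_fetch_or(&tag_word, 1UL << tag) & (1UL << tag))) {
                            *last_tag = tag + 1;  /* remember where to start next time */
                            return (int)tag;
                    }
            }
            /* tag space exhausted; the real code sleeps on a wait queue */
            return -1;
    }

    static void put_tag(unsigned int tag)
    {
            /* clearing the bit frees the tag */
            atomic_fetch_and(&tag_word, ~(1UL << tag));
    }

The implementation added in block/blk-mq-tag.c further down spreads the bits
over multiple cache-line-aligned words (struct blk_align_bitmap) and adds
rolling wait queues for the out-of-tags case, but the allocate/free core is
the same find-a-zero-bit-and-set / clear-and-wake pattern.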
-rw-r--r--  Documentation/DocBook/filesystems.tmpl | 2
-rw-r--r--  block/Makefile | 7
-rw-r--r--  block/bio-integrity.c (renamed from fs/bio-integrity.c) | 2
-rw-r--r--  block/bio.c (renamed from fs/bio.c) | 11
-rw-r--r--  block/blk-core.c | 113
-rw-r--r--  block/blk-flush.c | 40
-rw-r--r--  block/blk-iopoll.c | 4
-rw-r--r--  block/blk-lib.c | 4
-rw-r--r--  block/blk-map.c | 3
-rw-r--r--  block/blk-merge.c | 28
-rw-r--r--  block/blk-mq-cpu.c | 17
-rw-r--r--  block/blk-mq-cpumap.c | 27
-rw-r--r--  block/blk-mq-sysfs.c | 160
-rw-r--r--  block/blk-mq-tag.c | 561
-rw-r--r--  block/blk-mq-tag.h | 71
-rw-r--r--  block/blk-mq.c | 1415
-rw-r--r--  block/blk-mq.h | 32
-rw-r--r--  block/blk-sysfs.c | 47
-rw-r--r--  block/blk-throttle.c | 10
-rw-r--r--  block/blk-timeout.c | 60
-rw-r--r--  block/blk.h | 9
-rw-r--r--  block/bounce.c (renamed from mm/bounce.c) | 0
-rw-r--r--  block/cfq-iosched.c | 4
-rw-r--r--  block/ioprio.c (renamed from fs/ioprio.c) | 0
-rw-r--r--  block/scsi_ioctl.c | 4
-rw-r--r--  drivers/block/amiflop.c | 2
-rw-r--r--  drivers/block/ataflop.c | 2
-rw-r--r--  drivers/block/floppy.c | 18
-rw-r--r--  drivers/block/hd.c | 10
-rw-r--r--  drivers/block/mg_disk.c | 12
-rw-r--r--  drivers/block/null_blk.c | 117
-rw-r--r--  drivers/block/paride/pcd.c | 2
-rw-r--r--  drivers/block/paride/pd.c | 4
-rw-r--r--  drivers/block/paride/pf.c | 4
-rw-r--r--  drivers/block/skd_main.c | 5
-rw-r--r--  drivers/block/swim.c | 2
-rw-r--r--  drivers/block/swim3.c | 6
-rw-r--r--  drivers/block/virtio_blk.c | 75
-rw-r--r--  drivers/block/xen-blkfront.c | 4
-rw-r--r--  drivers/block/xsysace.c | 4
-rw-r--r--  drivers/block/z2ram.c | 6
-rw-r--r--  drivers/cdrom/gdrom.c | 2
-rw-r--r--  drivers/char/random.c | 1
-rw-r--r--  drivers/ide/ide-disk.c | 5
-rw-r--r--  drivers/md/dm.c | 1
-rw-r--r--  drivers/mtd/mtd_blkdevs.c | 3
-rw-r--r--  drivers/mtd/ubi/block.c | 2
-rw-r--r--  drivers/sbus/char/jsflash.c | 2
-rw-r--r--  drivers/scsi/scsi_lib.c | 5
-rw-r--r--  drivers/scsi/sd.c | 13
-rw-r--r--  fs/Makefile | 3
-rw-r--r--  include/linux/bio.h | 2
-rw-r--r--  include/linux/blk-mq.h | 101
-rw-r--r--  include/linux/blk_types.h | 2
-rw-r--r--  include/linux/blkdev.h | 27
-rw-r--r--  mm/Makefile | 1
56 files changed, 2088 insertions, 986 deletions
diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl
index 4f676838da06..bcdfdb9a9277 100644
--- a/Documentation/DocBook/filesystems.tmpl
+++ b/Documentation/DocBook/filesystems.tmpl
@@ -62,7 +62,7 @@
62!Efs/mpage.c 62!Efs/mpage.c
63!Efs/namei.c 63!Efs/namei.c
64!Efs/buffer.c 64!Efs/buffer.c
65!Efs/bio.c 65!Eblock/bio.c
66!Efs/seq_file.c 66!Efs/seq_file.c
67!Efs/filesystems.c 67!Efs/filesystems.c
68!Efs/fs-writeback.c 68!Efs/fs-writeback.c
diff --git a/block/Makefile b/block/Makefile
index 20645e88fb57..a2ce6ac935ec 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -2,13 +2,15 @@
2# Makefile for the kernel block layer 2# Makefile for the kernel block layer
3# 3#
4 4
5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 5obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ 6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
8 blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ 8 blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
9 blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ 9 blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
10 genhd.o scsi_ioctl.o partition-generic.o partitions/ 10 genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
11 partitions/
11 12
13obj-$(CONFIG_BOUNCE) += bounce.o
12obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 14obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
13obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o 15obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
14obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o 16obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
@@ -20,3 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
20obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o 22obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
21obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o 23obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
22obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o 24obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
25obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
diff --git a/fs/bio-integrity.c b/block/bio-integrity.c
index 1c2ce0c87711..9e241063a616 100644
--- a/fs/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -617,7 +617,7 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
617 if (!bs->bio_integrity_pool) 617 if (!bs->bio_integrity_pool)
618 return -1; 618 return -1;
619 619
620 bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); 620 bs->bvec_integrity_pool = biovec_create_pool(pool_size);
621 if (!bs->bvec_integrity_pool) { 621 if (!bs->bvec_integrity_pool) {
622 mempool_destroy(bs->bio_integrity_pool); 622 mempool_destroy(bs->bio_integrity_pool);
623 return -1; 623 return -1;
diff --git a/fs/bio.c b/block/bio.c
index 6f0362b77806..96d28eee8a1e 100644
--- a/fs/bio.c
+++ b/block/bio.c
@@ -305,6 +305,8 @@ static void bio_chain_endio(struct bio *bio, int error)
305 305
306/** 306/**
307 * bio_chain - chain bio completions 307 * bio_chain - chain bio completions
308 * @bio: the target bio
309 * @parent: the @bio's parent bio
308 * 310 *
309 * The caller won't have a bi_end_io called when @bio completes - instead, 311 * The caller won't have a bi_end_io called when @bio completes - instead,
310 * @parent's bi_end_io won't be called until both @parent and @bio have 312 * @parent's bi_end_io won't be called until both @parent and @bio have
@@ -1011,8 +1013,7 @@ static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
1011 bio->bi_private = bmd; 1013 bio->bi_private = bmd;
1012} 1014}
1013 1015
1014static struct bio_map_data *bio_alloc_map_data(int nr_segs, 1016static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count,
1015 unsigned int iov_count,
1016 gfp_t gfp_mask) 1017 gfp_t gfp_mask)
1017{ 1018{
1018 if (iov_count > UIO_MAXIOV) 1019 if (iov_count > UIO_MAXIOV)
@@ -1154,7 +1155,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
1154 if (offset) 1155 if (offset)
1155 nr_pages++; 1156 nr_pages++;
1156 1157
1157 bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); 1158 bmd = bio_alloc_map_data(iov_count, gfp_mask);
1158 if (!bmd) 1159 if (!bmd)
1159 return ERR_PTR(-ENOMEM); 1160 return ERR_PTR(-ENOMEM);
1160 1161
@@ -1859,7 +1860,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
1859 * create memory pools for biovec's in a bio_set. 1860 * create memory pools for biovec's in a bio_set.
1860 * use the global biovec slabs created for general use. 1861 * use the global biovec slabs created for general use.
1861 */ 1862 */
1862mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) 1863mempool_t *biovec_create_pool(int pool_entries)
1863{ 1864{
1864 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; 1865 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
1865 1866
@@ -1922,7 +1923,7 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1922 if (!bs->bio_pool) 1923 if (!bs->bio_pool)
1923 goto bad; 1924 goto bad;
1924 1925
1925 bs->bvec_pool = biovec_create_pool(bs, pool_size); 1926 bs->bvec_pool = biovec_create_pool(pool_size);
1926 if (!bs->bvec_pool) 1927 if (!bs->bvec_pool)
1927 goto bad; 1928 goto bad;
1928 1929
diff --git a/block/blk-core.c b/block/blk-core.c
index a0e3096c4bb5..40d654861c33 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -146,8 +146,8 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
146 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", 146 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
147 (unsigned long long)blk_rq_pos(rq), 147 (unsigned long long)blk_rq_pos(rq),
148 blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); 148 blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
149 printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", 149 printk(KERN_INFO " bio %p, biotail %p, len %u\n",
150 rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); 150 rq->bio, rq->biotail, blk_rq_bytes(rq));
151 151
152 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 152 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
153 printk(KERN_INFO " cdb: "); 153 printk(KERN_INFO " cdb: ");
@@ -251,8 +251,10 @@ void blk_sync_queue(struct request_queue *q)
251 struct blk_mq_hw_ctx *hctx; 251 struct blk_mq_hw_ctx *hctx;
252 int i; 252 int i;
253 253
254 queue_for_each_hw_ctx(q, hctx, i) 254 queue_for_each_hw_ctx(q, hctx, i) {
255 cancel_delayed_work_sync(&hctx->delayed_work); 255 cancel_delayed_work_sync(&hctx->run_work);
256 cancel_delayed_work_sync(&hctx->delay_work);
257 }
256 } else { 258 } else {
257 cancel_delayed_work_sync(&q->delay_work); 259 cancel_delayed_work_sync(&q->delay_work);
258 } 260 }
@@ -574,12 +576,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
574 if (!q) 576 if (!q)
575 return NULL; 577 return NULL;
576 578
577 if (percpu_counter_init(&q->mq_usage_counter, 0))
578 goto fail_q;
579
580 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); 579 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
581 if (q->id < 0) 580 if (q->id < 0)
582 goto fail_c; 581 goto fail_q;
583 582
584 q->backing_dev_info.ra_pages = 583 q->backing_dev_info.ra_pages =
585 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 584 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@@ -637,8 +636,6 @@ fail_bdi:
637 bdi_destroy(&q->backing_dev_info); 636 bdi_destroy(&q->backing_dev_info);
638fail_id: 637fail_id:
639 ida_simple_remove(&blk_queue_ida, q->id); 638 ida_simple_remove(&blk_queue_ida, q->id);
640fail_c:
641 percpu_counter_destroy(&q->mq_usage_counter);
642fail_q: 639fail_q:
643 kmem_cache_free(blk_requestq_cachep, q); 640 kmem_cache_free(blk_requestq_cachep, q);
644 return NULL; 641 return NULL;
@@ -846,6 +843,47 @@ static void freed_request(struct request_list *rl, unsigned int flags)
846 __freed_request(rl, sync ^ 1); 843 __freed_request(rl, sync ^ 1);
847} 844}
848 845
846int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
847{
848 struct request_list *rl;
849
850 spin_lock_irq(q->queue_lock);
851 q->nr_requests = nr;
852 blk_queue_congestion_threshold(q);
853
854 /* congestion isn't cgroup aware and follows root blkcg for now */
855 rl = &q->root_rl;
856
857 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
858 blk_set_queue_congested(q, BLK_RW_SYNC);
859 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
860 blk_clear_queue_congested(q, BLK_RW_SYNC);
861
862 if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
863 blk_set_queue_congested(q, BLK_RW_ASYNC);
864 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
865 blk_clear_queue_congested(q, BLK_RW_ASYNC);
866
867 blk_queue_for_each_rl(rl, q) {
868 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
869 blk_set_rl_full(rl, BLK_RW_SYNC);
870 } else {
871 blk_clear_rl_full(rl, BLK_RW_SYNC);
872 wake_up(&rl->wait[BLK_RW_SYNC]);
873 }
874
875 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
876 blk_set_rl_full(rl, BLK_RW_ASYNC);
877 } else {
878 blk_clear_rl_full(rl, BLK_RW_ASYNC);
879 wake_up(&rl->wait[BLK_RW_ASYNC]);
880 }
881 }
882
883 spin_unlock_irq(q->queue_lock);
884 return 0;
885}
886
849/* 887/*
850 * Determine if elevator data should be initialized when allocating the 888 * Determine if elevator data should be initialized when allocating the
851 * request associated with @bio. 889 * request associated with @bio.
@@ -1135,7 +1173,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
1135struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1173struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1136{ 1174{
1137 if (q->mq_ops) 1175 if (q->mq_ops)
1138 return blk_mq_alloc_request(q, rw, gfp_mask); 1176 return blk_mq_alloc_request(q, rw, gfp_mask, false);
1139 else 1177 else
1140 return blk_old_get_request(q, rw, gfp_mask); 1178 return blk_old_get_request(q, rw, gfp_mask);
1141} 1179}
@@ -1231,12 +1269,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq,
1231static void part_round_stats_single(int cpu, struct hd_struct *part, 1269static void part_round_stats_single(int cpu, struct hd_struct *part,
1232 unsigned long now) 1270 unsigned long now)
1233{ 1271{
1272 int inflight;
1273
1234 if (now == part->stamp) 1274 if (now == part->stamp)
1235 return; 1275 return;
1236 1276
1237 if (part_in_flight(part)) { 1277 inflight = part_in_flight(part);
1278 if (inflight) {
1238 __part_stat_add(cpu, part, time_in_queue, 1279 __part_stat_add(cpu, part, time_in_queue,
1239 part_in_flight(part) * (now - part->stamp)); 1280 inflight * (now - part->stamp));
1240 __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); 1281 __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1241 } 1282 }
1242 part->stamp = now; 1283 part->stamp = now;
@@ -1360,7 +1401,6 @@ void blk_add_request_payload(struct request *rq, struct page *page,
1360 1401
1361 rq->__data_len = rq->resid_len = len; 1402 rq->__data_len = rq->resid_len = len;
1362 rq->nr_phys_segments = 1; 1403 rq->nr_phys_segments = 1;
1363 rq->buffer = bio_data(bio);
1364} 1404}
1365EXPORT_SYMBOL_GPL(blk_add_request_payload); 1405EXPORT_SYMBOL_GPL(blk_add_request_payload);
1366 1406
@@ -1402,12 +1442,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1402 bio->bi_next = req->bio; 1442 bio->bi_next = req->bio;
1403 req->bio = bio; 1443 req->bio = bio;
1404 1444
1405 /*
1406 * may not be valid. if the low level driver said
1407 * it didn't need a bounce buffer then it better
1408 * not touch req->buffer either...
1409 */
1410 req->buffer = bio_data(bio);
1411 req->__sector = bio->bi_iter.bi_sector; 1445 req->__sector = bio->bi_iter.bi_sector;
1412 req->__data_len += bio->bi_iter.bi_size; 1446 req->__data_len += bio->bi_iter.bi_size;
1413 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1447 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
@@ -1432,6 +1466,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1432 * added on the elevator at this point. In addition, we don't have 1466 * added on the elevator at this point. In addition, we don't have
1433 * reliable access to the elevator outside queue lock. Only check basic 1467 * reliable access to the elevator outside queue lock. Only check basic
1434 * merging parameters without querying the elevator. 1468 * merging parameters without querying the elevator.
1469 *
1470 * Caller must ensure !blk_queue_nomerges(q) beforehand.
1435 */ 1471 */
1436bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 1472bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1437 unsigned int *request_count) 1473 unsigned int *request_count)
@@ -1441,9 +1477,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1441 bool ret = false; 1477 bool ret = false;
1442 struct list_head *plug_list; 1478 struct list_head *plug_list;
1443 1479
1444 if (blk_queue_nomerges(q))
1445 goto out;
1446
1447 plug = current->plug; 1480 plug = current->plug;
1448 if (!plug) 1481 if (!plug)
1449 goto out; 1482 goto out;
@@ -1522,7 +1555,8 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
1522 * Check if we can merge with the plugged list before grabbing 1555 * Check if we can merge with the plugged list before grabbing
1523 * any locks. 1556 * any locks.
1524 */ 1557 */
1525 if (blk_attempt_plug_merge(q, bio, &request_count)) 1558 if (!blk_queue_nomerges(q) &&
1559 blk_attempt_plug_merge(q, bio, &request_count))
1526 return; 1560 return;
1527 1561
1528 spin_lock_irq(q->queue_lock); 1562 spin_lock_irq(q->queue_lock);
@@ -1654,7 +1688,7 @@ static int __init fail_make_request_debugfs(void)
1654 struct dentry *dir = fault_create_debugfs_attr("fail_make_request", 1688 struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
1655 NULL, &fail_make_request); 1689 NULL, &fail_make_request);
1656 1690
1657 return IS_ERR(dir) ? PTR_ERR(dir) : 0; 1691 return PTR_ERR_OR_ZERO(dir);
1658} 1692}
1659 1693
1660late_initcall(fail_make_request_debugfs); 1694late_initcall(fail_make_request_debugfs);
@@ -2434,7 +2468,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2434 } 2468 }
2435 2469
2436 req->__data_len -= total_bytes; 2470 req->__data_len -= total_bytes;
2437 req->buffer = bio_data(req->bio);
2438 2471
2439 /* update sector only for requests with clear definition of sector */ 2472 /* update sector only for requests with clear definition of sector */
2440 if (req->cmd_type == REQ_TYPE_FS) 2473 if (req->cmd_type == REQ_TYPE_FS)
@@ -2503,7 +2536,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request);
2503/* 2536/*
2504 * queue lock must be held 2537 * queue lock must be held
2505 */ 2538 */
2506static void blk_finish_request(struct request *req, int error) 2539void blk_finish_request(struct request *req, int error)
2507{ 2540{
2508 if (blk_rq_tagged(req)) 2541 if (blk_rq_tagged(req))
2509 blk_queue_end_tag(req->q, req); 2542 blk_queue_end_tag(req->q, req);
@@ -2529,6 +2562,7 @@ static void blk_finish_request(struct request *req, int error)
2529 __blk_put_request(req->q, req); 2562 __blk_put_request(req->q, req);
2530 } 2563 }
2531} 2564}
2565EXPORT_SYMBOL(blk_finish_request);
2532 2566
2533/** 2567/**
2534 * blk_end_bidi_request - Complete a bidi request 2568 * blk_end_bidi_request - Complete a bidi request
@@ -2752,10 +2786,9 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2752 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ 2786 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
2753 rq->cmd_flags |= bio->bi_rw & REQ_WRITE; 2787 rq->cmd_flags |= bio->bi_rw & REQ_WRITE;
2754 2788
2755 if (bio_has_data(bio)) { 2789 if (bio_has_data(bio))
2756 rq->nr_phys_segments = bio_phys_segments(q, bio); 2790 rq->nr_phys_segments = bio_phys_segments(q, bio);
2757 rq->buffer = bio_data(bio); 2791
2758 }
2759 rq->__data_len = bio->bi_iter.bi_size; 2792 rq->__data_len = bio->bi_iter.bi_size;
2760 rq->bio = rq->biotail = bio; 2793 rq->bio = rq->biotail = bio;
2761 2794
@@ -2831,7 +2864,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
2831 2864
2832/* 2865/*
2833 * Copy attributes of the original request to the clone request. 2866 * Copy attributes of the original request to the clone request.
2834 * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. 2867 * The actual data parts (e.g. ->cmd, ->sense) are not copied.
2835 */ 2868 */
2836static void __blk_rq_prep_clone(struct request *dst, struct request *src) 2869static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2837{ 2870{
@@ -2857,7 +2890,7 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2857 * 2890 *
2858 * Description: 2891 * Description:
2859 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. 2892 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
2860 * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense) 2893 * The actual data parts of @rq_src (e.g. ->cmd, ->sense)
2861 * are not copied, and copying such parts is the caller's responsibility. 2894 * are not copied, and copying such parts is the caller's responsibility.
2862 * Also, pages which the original bios are pointing to are not copied 2895 * Also, pages which the original bios are pointing to are not copied
2863 * and the cloned bios just point same pages. 2896 * and the cloned bios just point same pages.
@@ -2904,20 +2937,25 @@ free_and_out:
2904} 2937}
2905EXPORT_SYMBOL_GPL(blk_rq_prep_clone); 2938EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
2906 2939
2907int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) 2940int kblockd_schedule_work(struct work_struct *work)
2908{ 2941{
2909 return queue_work(kblockd_workqueue, work); 2942 return queue_work(kblockd_workqueue, work);
2910} 2943}
2911EXPORT_SYMBOL(kblockd_schedule_work); 2944EXPORT_SYMBOL(kblockd_schedule_work);
2912 2945
2913int kblockd_schedule_delayed_work(struct request_queue *q, 2946int kblockd_schedule_delayed_work(struct delayed_work *dwork,
2914 struct delayed_work *dwork, unsigned long delay) 2947 unsigned long delay)
2915{ 2948{
2916 return queue_delayed_work(kblockd_workqueue, dwork, delay); 2949 return queue_delayed_work(kblockd_workqueue, dwork, delay);
2917} 2950}
2918EXPORT_SYMBOL(kblockd_schedule_delayed_work); 2951EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2919 2952
2920#define PLUG_MAGIC 0x91827364 2953int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
2954 unsigned long delay)
2955{
2956 return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
2957}
2958EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
2921 2959
2922/** 2960/**
2923 * blk_start_plug - initialize blk_plug and track it inside the task_struct 2961 * blk_start_plug - initialize blk_plug and track it inside the task_struct
@@ -2937,7 +2975,6 @@ void blk_start_plug(struct blk_plug *plug)
2937{ 2975{
2938 struct task_struct *tsk = current; 2976 struct task_struct *tsk = current;
2939 2977
2940 plug->magic = PLUG_MAGIC;
2941 INIT_LIST_HEAD(&plug->list); 2978 INIT_LIST_HEAD(&plug->list);
2942 INIT_LIST_HEAD(&plug->mq_list); 2979 INIT_LIST_HEAD(&plug->mq_list);
2943 INIT_LIST_HEAD(&plug->cb_list); 2980 INIT_LIST_HEAD(&plug->cb_list);
@@ -3034,8 +3071,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3034 LIST_HEAD(list); 3071 LIST_HEAD(list);
3035 unsigned int depth; 3072 unsigned int depth;
3036 3073
3037 BUG_ON(plug->magic != PLUG_MAGIC);
3038
3039 flush_plug_callbacks(plug, from_schedule); 3074 flush_plug_callbacks(plug, from_schedule);
3040 3075
3041 if (!list_empty(&plug->mq_list)) 3076 if (!list_empty(&plug->mq_list))
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 43e6b4755e9a..ff87c664b7df 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,21 +130,13 @@ static void blk_flush_restore_request(struct request *rq)
130 blk_clear_rq_complete(rq); 130 blk_clear_rq_complete(rq);
131} 131}
132 132
133static void mq_flush_run(struct work_struct *work)
134{
135 struct request *rq;
136
137 rq = container_of(work, struct request, mq_flush_work);
138
139 memset(&rq->csd, 0, sizeof(rq->csd));
140 blk_mq_insert_request(rq, false, true, false);
141}
142
143static bool blk_flush_queue_rq(struct request *rq, bool add_front) 133static bool blk_flush_queue_rq(struct request *rq, bool add_front)
144{ 134{
145 if (rq->q->mq_ops) { 135 if (rq->q->mq_ops) {
146 INIT_WORK(&rq->mq_flush_work, mq_flush_run); 136 struct request_queue *q = rq->q;
147 kblockd_schedule_work(rq->q, &rq->mq_flush_work); 137
138 blk_mq_add_to_requeue_list(rq, add_front);
139 blk_mq_kick_requeue_list(q);
148 return false; 140 return false;
149 } else { 141 } else {
150 if (add_front) 142 if (add_front)
@@ -231,8 +223,10 @@ static void flush_end_io(struct request *flush_rq, int error)
231 struct request *rq, *n; 223 struct request *rq, *n;
232 unsigned long flags = 0; 224 unsigned long flags = 0;
233 225
234 if (q->mq_ops) 226 if (q->mq_ops) {
235 spin_lock_irqsave(&q->mq_flush_lock, flags); 227 spin_lock_irqsave(&q->mq_flush_lock, flags);
228 q->flush_rq->cmd_flags = 0;
229 }
236 230
237 running = &q->flush_queue[q->flush_running_idx]; 231 running = &q->flush_queue[q->flush_running_idx];
238 BUG_ON(q->flush_pending_idx == q->flush_running_idx); 232 BUG_ON(q->flush_pending_idx == q->flush_running_idx);
@@ -306,23 +300,9 @@ static bool blk_kick_flush(struct request_queue *q)
306 */ 300 */
307 q->flush_pending_idx ^= 1; 301 q->flush_pending_idx ^= 1;
308 302
309 if (q->mq_ops) { 303 blk_rq_init(q, q->flush_rq);
310 struct blk_mq_ctx *ctx = first_rq->mq_ctx; 304 if (q->mq_ops)
311 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); 305 blk_mq_clone_flush_request(q->flush_rq, first_rq);
312
313 blk_mq_rq_init(hctx, q->flush_rq);
314 q->flush_rq->mq_ctx = ctx;
315
316 /*
317 * Reuse the tag value from the fist waiting request,
318 * with blk-mq the tag is generated during request
319 * allocation and drivers can rely on it being inside
320 * the range they asked for.
321 */
322 q->flush_rq->tag = first_rq->tag;
323 } else {
324 blk_rq_init(q, q->flush_rq);
325 }
326 306
327 q->flush_rq->cmd_type = REQ_TYPE_FS; 307 q->flush_rq->cmd_type = REQ_TYPE_FS;
328 q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; 308 q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index c11d24e379e2..d828b44a404b 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -64,12 +64,12 @@ EXPORT_SYMBOL(__blk_iopoll_complete);
64 * iopoll handler will not be invoked again before blk_iopoll_sched_prep() 64 * iopoll handler will not be invoked again before blk_iopoll_sched_prep()
65 * is called. 65 * is called.
66 **/ 66 **/
67void blk_iopoll_complete(struct blk_iopoll *iopoll) 67void blk_iopoll_complete(struct blk_iopoll *iop)
68{ 68{
69 unsigned long flags; 69 unsigned long flags;
70 70
71 local_irq_save(flags); 71 local_irq_save(flags);
72 __blk_iopoll_complete(iopoll); 72 __blk_iopoll_complete(iop);
73 local_irq_restore(flags); 73 local_irq_restore(flags);
74} 74}
75EXPORT_SYMBOL(blk_iopoll_complete); 75EXPORT_SYMBOL(blk_iopoll_complete);
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 97a733cf3d5f..8411be3c19d3 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -226,8 +226,8 @@ EXPORT_SYMBOL(blkdev_issue_write_same);
226 * Generate and issue number of bios with zerofiled pages. 226 * Generate and issue number of bios with zerofiled pages.
227 */ 227 */
228 228
229int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 229static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
230 sector_t nr_sects, gfp_t gfp_mask) 230 sector_t nr_sects, gfp_t gfp_mask)
231{ 231{
232 int ret; 232 int ret;
233 struct bio *bio; 233 struct bio *bio;
diff --git a/block/blk-map.c b/block/blk-map.c
index f7b22bc21518..f890d4345b0c 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -155,7 +155,6 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
155 if (!bio_flagged(bio, BIO_USER_MAPPED)) 155 if (!bio_flagged(bio, BIO_USER_MAPPED))
156 rq->cmd_flags |= REQ_COPY_USER; 156 rq->cmd_flags |= REQ_COPY_USER;
157 157
158 rq->buffer = NULL;
159 return 0; 158 return 0;
160unmap_rq: 159unmap_rq:
161 blk_rq_unmap_user(bio); 160 blk_rq_unmap_user(bio);
@@ -238,7 +237,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
238 blk_queue_bounce(q, &bio); 237 blk_queue_bounce(q, &bio);
239 bio_get(bio); 238 bio_get(bio);
240 blk_rq_bio_prep(q, rq, bio); 239 blk_rq_bio_prep(q, rq, bio);
241 rq->buffer = NULL;
242 return 0; 240 return 0;
243} 241}
244EXPORT_SYMBOL(blk_rq_map_user_iov); 242EXPORT_SYMBOL(blk_rq_map_user_iov);
@@ -325,7 +323,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
325 } 323 }
326 324
327 blk_queue_bounce(q, &rq->bio); 325 blk_queue_bounce(q, &rq->bio);
328 rq->buffer = NULL;
329 return 0; 326 return 0;
330} 327}
331EXPORT_SYMBOL(blk_rq_map_kern); 328EXPORT_SYMBOL(blk_rq_map_kern);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6c583f9c5b65..b3bf0df0f4c2 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -13,7 +13,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
13 struct bio *bio) 13 struct bio *bio)
14{ 14{
15 struct bio_vec bv, bvprv = { NULL }; 15 struct bio_vec bv, bvprv = { NULL };
16 int cluster, high, highprv = 1; 16 int cluster, high, highprv = 1, no_sg_merge;
17 unsigned int seg_size, nr_phys_segs; 17 unsigned int seg_size, nr_phys_segs;
18 struct bio *fbio, *bbio; 18 struct bio *fbio, *bbio;
19 struct bvec_iter iter; 19 struct bvec_iter iter;
@@ -35,12 +35,21 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
35 cluster = blk_queue_cluster(q); 35 cluster = blk_queue_cluster(q);
36 seg_size = 0; 36 seg_size = 0;
37 nr_phys_segs = 0; 37 nr_phys_segs = 0;
38 no_sg_merge = test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
39 high = 0;
38 for_each_bio(bio) { 40 for_each_bio(bio) {
39 bio_for_each_segment(bv, bio, iter) { 41 bio_for_each_segment(bv, bio, iter) {
40 /* 42 /*
43 * If SG merging is disabled, each bio vector is
44 * a segment
45 */
46 if (no_sg_merge)
47 goto new_segment;
48
49 /*
41 * the trick here is making sure that a high page is 50 * the trick here is making sure that a high page is
42 * never considered part of another segment, since that 51 * never considered part of another segment, since
43 * might change with the bounce page. 52 * that might change with the bounce page.
44 */ 53 */
45 high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); 54 high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q);
46 if (!high && !highprv && cluster) { 55 if (!high && !highprv && cluster) {
@@ -84,11 +93,16 @@ void blk_recalc_rq_segments(struct request *rq)
84 93
85void blk_recount_segments(struct request_queue *q, struct bio *bio) 94void blk_recount_segments(struct request_queue *q, struct bio *bio)
86{ 95{
87 struct bio *nxt = bio->bi_next; 96 if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags))
97 bio->bi_phys_segments = bio->bi_vcnt;
98 else {
99 struct bio *nxt = bio->bi_next;
100
101 bio->bi_next = NULL;
102 bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
103 bio->bi_next = nxt;
104 }
88 105
89 bio->bi_next = NULL;
90 bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
91 bio->bi_next = nxt;
92 bio->bi_flags |= (1 << BIO_SEG_VALID); 106 bio->bi_flags |= (1 << BIO_SEG_VALID);
93} 107}
94EXPORT_SYMBOL(blk_recount_segments); 108EXPORT_SYMBOL(blk_recount_segments);
diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
index 136ef8643bba..bb3ed488f7b5 100644
--- a/block/blk-mq-cpu.c
+++ b/block/blk-mq-cpu.c
@@ -1,3 +1,8 @@
1/*
2 * CPU notifier helper code for blk-mq
3 *
4 * Copyright (C) 2013-2014 Jens Axboe
5 */
1#include <linux/kernel.h> 6#include <linux/kernel.h>
2#include <linux/module.h> 7#include <linux/module.h>
3#include <linux/init.h> 8#include <linux/init.h>
@@ -18,14 +23,18 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
18{ 23{
19 unsigned int cpu = (unsigned long) hcpu; 24 unsigned int cpu = (unsigned long) hcpu;
20 struct blk_mq_cpu_notifier *notify; 25 struct blk_mq_cpu_notifier *notify;
26 int ret = NOTIFY_OK;
21 27
22 raw_spin_lock(&blk_mq_cpu_notify_lock); 28 raw_spin_lock(&blk_mq_cpu_notify_lock);
23 29
24 list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) 30 list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
25 notify->notify(notify->data, action, cpu); 31 ret = notify->notify(notify->data, action, cpu);
32 if (ret != NOTIFY_OK)
33 break;
34 }
26 35
27 raw_spin_unlock(&blk_mq_cpu_notify_lock); 36 raw_spin_unlock(&blk_mq_cpu_notify_lock);
28 return NOTIFY_OK; 37 return ret;
29} 38}
30 39
31void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) 40void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
@@ -45,7 +54,7 @@ void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
45} 54}
46 55
47void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, 56void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
48 void (*fn)(void *, unsigned long, unsigned int), 57 int (*fn)(void *, unsigned long, unsigned int),
49 void *data) 58 void *data)
50{ 59{
51 notifier->notify = fn; 60 notifier->notify = fn;
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 097921329619..1065d7c65fa1 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -1,3 +1,8 @@
1/*
2 * CPU <-> hardware queue mapping helpers
3 *
4 * Copyright (C) 2013-2014 Jens Axboe
5 */
1#include <linux/kernel.h> 6#include <linux/kernel.h>
2#include <linux/threads.h> 7#include <linux/threads.h>
3#include <linux/module.h> 8#include <linux/module.h>
@@ -80,19 +85,35 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
80 return 0; 85 return 0;
81} 86}
82 87
83unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) 88unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
84{ 89{
85 unsigned int *map; 90 unsigned int *map;
86 91
87 /* If cpus are offline, map them to first hctx */ 92 /* If cpus are offline, map them to first hctx */
88 map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, 93 map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
89 reg->numa_node); 94 set->numa_node);
90 if (!map) 95 if (!map)
91 return NULL; 96 return NULL;
92 97
93 if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) 98 if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
94 return map; 99 return map;
95 100
96 kfree(map); 101 kfree(map);
97 return NULL; 102 return NULL;
98} 103}
104
105/*
106 * We have no quick way of doing reverse lookups. This is only used at
107 * queue init time, so runtime isn't important.
108 */
109int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
110{
111 int i;
112
113 for_each_possible_cpu(i) {
114 if (index == mq_map[i])
115 return cpu_to_node(i);
116 }
117
118 return NUMA_NO_NODE;
119}
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index b0ba264b0522..ed5217867555 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -203,59 +203,24 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
203 return ret; 203 return ret;
204} 204}
205 205
206static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) 206static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
207{
208 ssize_t ret;
209
210 spin_lock(&hctx->lock);
211 ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI));
212 spin_unlock(&hctx->lock);
213
214 return ret;
215}
216
217static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx,
218 const char *page, size_t len)
219{ 207{
220 struct blk_mq_ctx *ctx; 208 return blk_mq_tag_sysfs_show(hctx->tags, page);
221 unsigned long ret;
222 unsigned int i;
223
224 if (kstrtoul(page, 10, &ret)) {
225 pr_err("blk-mq-sysfs: invalid input '%s'\n", page);
226 return -EINVAL;
227 }
228
229 spin_lock(&hctx->lock);
230 if (ret)
231 hctx->flags |= BLK_MQ_F_SHOULD_IPI;
232 else
233 hctx->flags &= ~BLK_MQ_F_SHOULD_IPI;
234 spin_unlock(&hctx->lock);
235
236 hctx_for_each_ctx(hctx, ctx, i)
237 ctx->ipi_redirect = !!ret;
238
239 return len;
240} 209}
241 210
242static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) 211static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
243{ 212{
244 return blk_mq_tag_sysfs_show(hctx->tags, page); 213 return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
245} 214}
246 215
247static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) 216static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
248{ 217{
249 unsigned int i, queue_num, first = 1; 218 unsigned int i, first = 1;
250 ssize_t ret = 0; 219 ssize_t ret = 0;
251 220
252 blk_mq_disable_hotplug(); 221 blk_mq_disable_hotplug();
253 222
254 for_each_online_cpu(i) { 223 for_each_cpu(i, hctx->cpumask) {
255 queue_num = hctx->queue->mq_map[i];
256 if (queue_num != hctx->queue_num)
257 continue;
258
259 if (first) 224 if (first)
260 ret += sprintf(ret + page, "%u", i); 225 ret += sprintf(ret + page, "%u", i);
261 else 226 else
@@ -307,15 +272,14 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
307 .attr = {.name = "dispatched", .mode = S_IRUGO }, 272 .attr = {.name = "dispatched", .mode = S_IRUGO },
308 .show = blk_mq_hw_sysfs_dispatched_show, 273 .show = blk_mq_hw_sysfs_dispatched_show,
309}; 274};
275static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
276 .attr = {.name = "active", .mode = S_IRUGO },
277 .show = blk_mq_hw_sysfs_active_show,
278};
310static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { 279static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
311 .attr = {.name = "pending", .mode = S_IRUGO }, 280 .attr = {.name = "pending", .mode = S_IRUGO },
312 .show = blk_mq_hw_sysfs_rq_list_show, 281 .show = blk_mq_hw_sysfs_rq_list_show,
313}; 282};
314static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = {
315 .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR},
316 .show = blk_mq_hw_sysfs_ipi_show,
317 .store = blk_mq_hw_sysfs_ipi_store,
318};
319static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { 283static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
320 .attr = {.name = "tags", .mode = S_IRUGO }, 284 .attr = {.name = "tags", .mode = S_IRUGO },
321 .show = blk_mq_hw_sysfs_tags_show, 285 .show = blk_mq_hw_sysfs_tags_show,
@@ -330,9 +294,9 @@ static struct attribute *default_hw_ctx_attrs[] = {
330 &blk_mq_hw_sysfs_run.attr, 294 &blk_mq_hw_sysfs_run.attr,
331 &blk_mq_hw_sysfs_dispatched.attr, 295 &blk_mq_hw_sysfs_dispatched.attr,
332 &blk_mq_hw_sysfs_pending.attr, 296 &blk_mq_hw_sysfs_pending.attr,
333 &blk_mq_hw_sysfs_ipi.attr,
334 &blk_mq_hw_sysfs_tags.attr, 297 &blk_mq_hw_sysfs_tags.attr,
335 &blk_mq_hw_sysfs_cpus.attr, 298 &blk_mq_hw_sysfs_cpus.attr,
299 &blk_mq_hw_sysfs_active.attr,
336 NULL, 300 NULL,
337}; 301};
338 302
@@ -363,6 +327,42 @@ static struct kobj_type blk_mq_hw_ktype = {
363 .release = blk_mq_sysfs_release, 327 .release = blk_mq_sysfs_release,
364}; 328};
365 329
330static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
331{
332 struct blk_mq_ctx *ctx;
333 int i;
334
335 if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
336 return;
337
338 hctx_for_each_ctx(hctx, ctx, i)
339 kobject_del(&ctx->kobj);
340
341 kobject_del(&hctx->kobj);
342}
343
344static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
345{
346 struct request_queue *q = hctx->queue;
347 struct blk_mq_ctx *ctx;
348 int i, ret;
349
350 if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
351 return 0;
352
353 ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
354 if (ret)
355 return ret;
356
357 hctx_for_each_ctx(hctx, ctx, i) {
358 ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
359 if (ret)
360 break;
361 }
362
363 return ret;
364}
365
366void blk_mq_unregister_disk(struct gendisk *disk) 366void blk_mq_unregister_disk(struct gendisk *disk)
367{ 367{
368 struct request_queue *q = disk->queue; 368 struct request_queue *q = disk->queue;
@@ -371,11 +371,11 @@ void blk_mq_unregister_disk(struct gendisk *disk)
371 int i, j; 371 int i, j;
372 372
373 queue_for_each_hw_ctx(q, hctx, i) { 373 queue_for_each_hw_ctx(q, hctx, i) {
374 hctx_for_each_ctx(hctx, ctx, j) { 374 blk_mq_unregister_hctx(hctx);
375 kobject_del(&ctx->kobj); 375
376 hctx_for_each_ctx(hctx, ctx, j)
376 kobject_put(&ctx->kobj); 377 kobject_put(&ctx->kobj);
377 } 378
378 kobject_del(&hctx->kobj);
379 kobject_put(&hctx->kobj); 379 kobject_put(&hctx->kobj);
380 } 380 }
381 381
@@ -386,15 +386,30 @@ void blk_mq_unregister_disk(struct gendisk *disk)
386 kobject_put(&disk_to_dev(disk)->kobj); 386 kobject_put(&disk_to_dev(disk)->kobj);
387} 387}
388 388
389static void blk_mq_sysfs_init(struct request_queue *q)
390{
391 struct blk_mq_hw_ctx *hctx;
392 struct blk_mq_ctx *ctx;
393 int i, j;
394
395 kobject_init(&q->mq_kobj, &blk_mq_ktype);
396
397 queue_for_each_hw_ctx(q, hctx, i) {
398 kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
399
400 hctx_for_each_ctx(hctx, ctx, j)
401 kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
402 }
403}
404
389int blk_mq_register_disk(struct gendisk *disk) 405int blk_mq_register_disk(struct gendisk *disk)
390{ 406{
391 struct device *dev = disk_to_dev(disk); 407 struct device *dev = disk_to_dev(disk);
392 struct request_queue *q = disk->queue; 408 struct request_queue *q = disk->queue;
393 struct blk_mq_hw_ctx *hctx; 409 struct blk_mq_hw_ctx *hctx;
394 struct blk_mq_ctx *ctx; 410 int ret, i;
395 int ret, i, j;
396 411
397 kobject_init(&q->mq_kobj, &blk_mq_ktype); 412 blk_mq_sysfs_init(q);
398 413
399 ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); 414 ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
400 if (ret < 0) 415 if (ret < 0)
@@ -403,20 +418,10 @@ int blk_mq_register_disk(struct gendisk *disk)
403 kobject_uevent(&q->mq_kobj, KOBJ_ADD); 418 kobject_uevent(&q->mq_kobj, KOBJ_ADD);
404 419
405 queue_for_each_hw_ctx(q, hctx, i) { 420 queue_for_each_hw_ctx(q, hctx, i) {
406 kobject_init(&hctx->kobj, &blk_mq_hw_ktype); 421 hctx->flags |= BLK_MQ_F_SYSFS_UP;
407 ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i); 422 ret = blk_mq_register_hctx(hctx);
408 if (ret) 423 if (ret)
409 break; 424 break;
410
411 if (!hctx->nr_ctx)
412 continue;
413
414 hctx_for_each_ctx(hctx, ctx, j) {
415 kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
416 ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
417 if (ret)
418 break;
419 }
420 } 425 }
421 426
422 if (ret) { 427 if (ret) {
@@ -426,3 +431,26 @@ int blk_mq_register_disk(struct gendisk *disk)
426 431
427 return 0; 432 return 0;
428} 433}
434
435void blk_mq_sysfs_unregister(struct request_queue *q)
436{
437 struct blk_mq_hw_ctx *hctx;
438 int i;
439
440 queue_for_each_hw_ctx(q, hctx, i)
441 blk_mq_unregister_hctx(hctx);
442}
443
444int blk_mq_sysfs_register(struct request_queue *q)
445{
446 struct blk_mq_hw_ctx *hctx;
447 int i, ret = 0;
448
449 queue_for_each_hw_ctx(q, hctx, i) {
450 ret = blk_mq_register_hctx(hctx);
451 if (ret)
452 break;
453 }
454
455 return ret;
456}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 83ae96c51a27..d90c4aeb7dd3 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -1,78 +1,345 @@
1/*
2 * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread
3 * over multiple cachelines to avoid ping-pong between multiple submitters
4 * or submitter and completer. Uses rolling wakeups to avoid falling off
5 * the scaling cliff when we run out of tags and have to start putting
6 * submitters to sleep.
7 *
8 * Uses active queue tracking to support fairer distribution of tags
9 * between multiple submitters when a shared tag map is used.
10 *
11 * Copyright (C) 2013-2014 Jens Axboe
12 */
1#include <linux/kernel.h> 13#include <linux/kernel.h>
2#include <linux/module.h> 14#include <linux/module.h>
3#include <linux/percpu_ida.h> 15#include <linux/random.h>
4 16
5#include <linux/blk-mq.h> 17#include <linux/blk-mq.h>
6#include "blk.h" 18#include "blk.h"
7#include "blk-mq.h" 19#include "blk-mq.h"
8#include "blk-mq-tag.h" 20#include "blk-mq-tag.h"
9 21
22static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
23{
24 int i;
25
26 for (i = 0; i < bt->map_nr; i++) {
27 struct blk_align_bitmap *bm = &bt->map[i];
28 int ret;
29
30 ret = find_first_zero_bit(&bm->word, bm->depth);
31 if (ret < bm->depth)
32 return true;
33 }
34
35 return false;
36}
37
38bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
39{
40 if (!tags)
41 return true;
42
43 return bt_has_free_tags(&tags->bitmap_tags);
44}
45
46static inline void bt_index_inc(unsigned int *index)
47{
48 *index = (*index + 1) & (BT_WAIT_QUEUES - 1);
49}
50
10/* 51/*
11 * Per tagged queue (tag address space) map 52 * If a previously inactive queue goes active, bump the active user count.
12 */ 53 */
13struct blk_mq_tags { 54bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
14 unsigned int nr_tags; 55{
15 unsigned int nr_reserved_tags; 56 if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
16 unsigned int nr_batch_move; 57 !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
17 unsigned int nr_max_cache; 58 atomic_inc(&hctx->tags->active_queues);
18 59
19 struct percpu_ida free_tags; 60 return true;
20 struct percpu_ida reserved_tags; 61}
21};
22 62
23void blk_mq_wait_for_tags(struct blk_mq_tags *tags) 63/*
64 * Wakeup all potentially sleeping on normal (non-reserved) tags
65 */
66static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
24{ 67{
25 int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); 68 struct blk_mq_bitmap_tags *bt;
26 blk_mq_put_tag(tags, tag); 69 int i, wake_index;
70
71 bt = &tags->bitmap_tags;
72 wake_index = bt->wake_index;
73 for (i = 0; i < BT_WAIT_QUEUES; i++) {
74 struct bt_wait_state *bs = &bt->bs[wake_index];
75
76 if (waitqueue_active(&bs->wait))
77 wake_up(&bs->wait);
78
79 bt_index_inc(&wake_index);
80 }
27} 81}
28 82
29bool blk_mq_has_free_tags(struct blk_mq_tags *tags) 83/*
84 * If a previously busy queue goes inactive, potential waiters could now
85 * be allowed to queue. Wake them up and check.
86 */
87void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
88{
89 struct blk_mq_tags *tags = hctx->tags;
90
91 if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
92 return;
93
94 atomic_dec(&tags->active_queues);
95
96 blk_mq_tag_wakeup_all(tags);
97}
98
99/*
100 * For shared tag users, we track the number of currently active users
101 * and attempt to provide a fair share of the tag depth for each of them.
102 */
103static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
104 struct blk_mq_bitmap_tags *bt)
105{
106 unsigned int depth, users;
107
108 if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
109 return true;
110 if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
111 return true;
112
113 /*
114 * Don't try dividing an ant
115 */
116 if (bt->depth == 1)
117 return true;
118
119 users = atomic_read(&hctx->tags->active_queues);
120 if (!users)
121 return true;
122
123 /*
124 * Allow at least some tags
125 */
126 depth = max((bt->depth + users - 1) / users, 4U);
127 return atomic_read(&hctx->nr_active) < depth;
128}
129
130static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
30{ 131{
31 return !tags || 132 int tag, org_last_tag, end;
32 percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; 133
134 org_last_tag = last_tag;
135 end = bm->depth;
136 do {
137restart:
138 tag = find_next_zero_bit(&bm->word, end, last_tag);
139 if (unlikely(tag >= end)) {
140 /*
141 * We started with an offset, start from 0 to
142 * exhaust the map.
143 */
144 if (org_last_tag && last_tag) {
145 end = last_tag;
146 last_tag = 0;
147 goto restart;
148 }
149 return -1;
150 }
151 last_tag = tag + 1;
152 } while (test_and_set_bit_lock(tag, &bm->word));
153
154 return tag;
33} 155}
34 156
35static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) 157/*
158 * Straight forward bitmap tag implementation, where each bit is a tag
159 * (cleared == free, and set == busy). The small twist is using per-cpu
160 * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
161 * contexts. This enables us to drastically limit the space searched,
162 * without dirtying an extra shared cacheline like we would if we stored
163 * the cache value inside the shared blk_mq_bitmap_tags structure. On top
164 * of that, each word of tags is in a separate cacheline. This means that
165 * multiple users will tend to stick to different cachelines, at least
166 * until the map is exhausted.
167 */
168static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
169 unsigned int *tag_cache)
36{ 170{
171 unsigned int last_tag, org_last_tag;
172 int index, i, tag;
173
174 if (!hctx_may_queue(hctx, bt))
175 return -1;
176
177 last_tag = org_last_tag = *tag_cache;
178 index = TAG_TO_INDEX(bt, last_tag);
179
180 for (i = 0; i < bt->map_nr; i++) {
181 tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag));
182 if (tag != -1) {
183 tag += (index << bt->bits_per_word);
184 goto done;
185 }
186
187 last_tag = 0;
188 if (++index >= bt->map_nr)
189 index = 0;
190 }
191
192 *tag_cache = 0;
193 return -1;
194
195 /*
196 * Only update the cache from the allocation path, if we ended
197 * up using the specific cached tag.
198 */
199done:
200 if (tag == org_last_tag) {
201 last_tag = tag + 1;
202 if (last_tag >= bt->depth - 1)
203 last_tag = 0;
204
205 *tag_cache = last_tag;
206 }
207
208 return tag;
209}
210
211static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
212 struct blk_mq_hw_ctx *hctx)
213{
214 struct bt_wait_state *bs;
215
216 if (!hctx)
217 return &bt->bs[0];
218
219 bs = &bt->bs[hctx->wait_index];
220 bt_index_inc(&hctx->wait_index);
221 return bs;
222}
223
224static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
225 unsigned int *last_tag, gfp_t gfp)
226{
227 struct bt_wait_state *bs;
228 DEFINE_WAIT(wait);
37 int tag; 229 int tag;
38 230
39 tag = percpu_ida_alloc(&tags->free_tags, (gfp & __GFP_WAIT) ? 231 tag = __bt_get(hctx, bt, last_tag);
40 TASK_UNINTERRUPTIBLE : TASK_RUNNING); 232 if (tag != -1)
41 if (tag < 0) 233 return tag;
42 return BLK_MQ_TAG_FAIL; 234
43 return tag + tags->nr_reserved_tags; 235 if (!(gfp & __GFP_WAIT))
236 return -1;
237
238 bs = bt_wait_ptr(bt, hctx);
239 do {
240 bool was_empty;
241
242 was_empty = list_empty(&wait.task_list);
243 prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
244
245 tag = __bt_get(hctx, bt, last_tag);
246 if (tag != -1)
247 break;
248
249 if (was_empty)
250 atomic_set(&bs->wait_cnt, bt->wake_cnt);
251
252 io_schedule();
253 } while (1);
254
255 finish_wait(&bs->wait, &wait);
256 return tag;
257}
258
259static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags,
260 struct blk_mq_hw_ctx *hctx,
261 unsigned int *last_tag, gfp_t gfp)
262{
263 int tag;
264
265 tag = bt_get(&tags->bitmap_tags, hctx, last_tag, gfp);
266 if (tag >= 0)
267 return tag + tags->nr_reserved_tags;
268
269 return BLK_MQ_TAG_FAIL;
44} 270}
45 271
46static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, 272static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
47 gfp_t gfp) 273 gfp_t gfp)
48{ 274{
49 int tag; 275 int tag, zero = 0;
50 276
51 if (unlikely(!tags->nr_reserved_tags)) { 277 if (unlikely(!tags->nr_reserved_tags)) {
52 WARN_ON_ONCE(1); 278 WARN_ON_ONCE(1);
53 return BLK_MQ_TAG_FAIL; 279 return BLK_MQ_TAG_FAIL;
54 } 280 }
55 281
56 tag = percpu_ida_alloc(&tags->reserved_tags, (gfp & __GFP_WAIT) ? 282 tag = bt_get(&tags->breserved_tags, NULL, &zero, gfp);
57 TASK_UNINTERRUPTIBLE : TASK_RUNNING);
58 if (tag < 0) 283 if (tag < 0)
59 return BLK_MQ_TAG_FAIL; 284 return BLK_MQ_TAG_FAIL;
285
60 return tag; 286 return tag;
61} 287}
62 288
63unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) 289unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
290 gfp_t gfp, bool reserved)
64{ 291{
65 if (!reserved) 292 if (!reserved)
66 return __blk_mq_get_tag(tags, gfp); 293 return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp);
67 294
68 return __blk_mq_get_reserved_tag(tags, gfp); 295 return __blk_mq_get_reserved_tag(hctx->tags, gfp);
296}
297
298static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
299{
300 int i, wake_index;
301
302 wake_index = bt->wake_index;
303 for (i = 0; i < BT_WAIT_QUEUES; i++) {
304 struct bt_wait_state *bs = &bt->bs[wake_index];
305
306 if (waitqueue_active(&bs->wait)) {
307 if (wake_index != bt->wake_index)
308 bt->wake_index = wake_index;
309
310 return bs;
311 }
312
313 bt_index_inc(&wake_index);
314 }
315
316 return NULL;
317}
318
319static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
320{
321 const int index = TAG_TO_INDEX(bt, tag);
322 struct bt_wait_state *bs;
323
324 /*
325 * The unlock memory barrier need to order access to req in free
326 * path and clearing tag bit
327 */
328 clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
329
330 bs = bt_wake_ptr(bt);
331 if (bs && atomic_dec_and_test(&bs->wait_cnt)) {
332 atomic_set(&bs->wait_cnt, bt->wake_cnt);
333 bt_index_inc(&bt->wake_index);
334 wake_up(&bs->wait);
335 }
69} 336}
70 337
71static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) 338static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
72{ 339{
73 BUG_ON(tag >= tags->nr_tags); 340 BUG_ON(tag >= tags->nr_tags);
74 341
75 percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); 342 bt_clear_tag(&tags->bitmap_tags, tag);
76} 343}
77 344
78static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, 345static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
@@ -80,22 +347,43 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
80{ 347{
81 BUG_ON(tag >= tags->nr_reserved_tags); 348 BUG_ON(tag >= tags->nr_reserved_tags);
82 349
83 percpu_ida_free(&tags->reserved_tags, tag); 350 bt_clear_tag(&tags->breserved_tags, tag);
84} 351}
85 352
86void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) 353void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
354 unsigned int *last_tag)
87{ 355{
88 if (tag >= tags->nr_reserved_tags) 356 struct blk_mq_tags *tags = hctx->tags;
89 __blk_mq_put_tag(tags, tag); 357
90 else 358 if (tag >= tags->nr_reserved_tags) {
359 const int real_tag = tag - tags->nr_reserved_tags;
360
361 __blk_mq_put_tag(tags, real_tag);
362 *last_tag = real_tag;
363 } else
91 __blk_mq_put_reserved_tag(tags, tag); 364 __blk_mq_put_reserved_tag(tags, tag);
92} 365}
93 366
94static int __blk_mq_tag_iter(unsigned id, void *data) 367static void bt_for_each_free(struct blk_mq_bitmap_tags *bt,
368 unsigned long *free_map, unsigned int off)
95{ 369{
96 unsigned long *tag_map = data; 370 int i;
97 __set_bit(id, tag_map); 371
98 return 0; 372 for (i = 0; i < bt->map_nr; i++) {
373 struct blk_align_bitmap *bm = &bt->map[i];
374 int bit = 0;
375
376 do {
377 bit = find_next_zero_bit(&bm->word, bm->depth, bit);
378 if (bit >= bm->depth)
379 break;
380
381 __set_bit(bit + off, free_map);
382 bit++;
383 } while (1);
384
385 off += (1 << bt->bits_per_word);
386 }
99} 387}
100 388
101void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, 389void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
@@ -109,21 +397,128 @@ void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
109 if (!tag_map) 397 if (!tag_map)
110 return; 398 return;
111 399
112 percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); 400 bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags);
113 if (tags->nr_reserved_tags) 401 if (tags->nr_reserved_tags)
114 percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, 402 bt_for_each_free(&tags->breserved_tags, tag_map, 0);
115 tag_map);
116 403
117 fn(data, tag_map); 404 fn(data, tag_map);
118 kfree(tag_map); 405 kfree(tag_map);
119} 406}
407EXPORT_SYMBOL(blk_mq_tag_busy_iter);
408
409static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
410{
411 unsigned int i, used;
412
413 for (i = 0, used = 0; i < bt->map_nr; i++) {
414 struct blk_align_bitmap *bm = &bt->map[i];
415
416 used += bitmap_weight(&bm->word, bm->depth);
417 }
418
419 return bt->depth - used;
420}
421
422static void bt_update_count(struct blk_mq_bitmap_tags *bt,
423 unsigned int depth)
424{
425 unsigned int tags_per_word = 1U << bt->bits_per_word;
426 unsigned int map_depth = depth;
427
428 if (depth) {
429 int i;
430
431 for (i = 0; i < bt->map_nr; i++) {
432 bt->map[i].depth = min(map_depth, tags_per_word);
433 map_depth -= bt->map[i].depth;
434 }
435 }
436
437 bt->wake_cnt = BT_WAIT_BATCH;
438 if (bt->wake_cnt > depth / 4)
439 bt->wake_cnt = max(1U, depth / 4);
440
441 bt->depth = depth;
442}
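
The resize arithmetic in bt_update_count() spreads the new depth across the per-word maps and caps the wakeup batch at a quarter of the depth, but never below one. A userspace sketch with made-up values (depth 10, 4 tags per word):

#include <stdio.h>

#define BT_WAIT_BATCH	8

int main(void)
{
	unsigned int depth = 10, tags_per_word = 4;	/* illustrative only */
	unsigned int map_depth = depth, wake_cnt;

	/* hand out at most tags_per_word tags to each map word */
	for (int i = 0; map_depth; i++) {
		unsigned int d = map_depth < tags_per_word ? map_depth : tags_per_word;

		printf("map[%d].depth = %u\n", i, d);
		map_depth -= d;
	}

	/* wake_cnt = min(BT_WAIT_BATCH, max(1, depth / 4)) */
	wake_cnt = BT_WAIT_BATCH;
	if (wake_cnt > depth / 4)
		wake_cnt = depth / 4 ? depth / 4 : 1;
	printf("wake_cnt = %u\n", wake_cnt);	/* 2 for a depth of 10 */
	return 0;
}
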
443
444static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
445 int node, bool reserved)
446{
447 int i;
448
449 bt->bits_per_word = ilog2(BITS_PER_LONG);
450
451 /*
452 * Depth can be zero for reserved tags, that's not a failure
453 * condition.
454 */
455 if (depth) {
456 unsigned int nr, tags_per_word;
457
458 tags_per_word = (1 << bt->bits_per_word);
459
460 /*
461 * If the tag space is small, shrink the number of tags
462 * per word so we spread over a few cachelines, at least.
463 * If less than 4 tags, just forget about it, it's not
464 * going to work optimally anyway.
465 */
466 if (depth >= 4) {
467 while (tags_per_word * 4 > depth) {
468 bt->bits_per_word--;
469 tags_per_word = (1 << bt->bits_per_word);
470 }
471 }
472
473 nr = ALIGN(depth, tags_per_word) / tags_per_word;
474 bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
475 GFP_KERNEL, node);
476 if (!bt->map)
477 return -ENOMEM;
478
479 bt->map_nr = nr;
480 }
481
482 bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
483 if (!bt->bs) {
484 kfree(bt->map);
485 return -ENOMEM;
486 }
487
488 for (i = 0; i < BT_WAIT_QUEUES; i++)
489 init_waitqueue_head(&bt->bs[i].wait);
490
491 bt_update_count(bt, depth);
492 return 0;
493}
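
For small tag spaces, bt_alloc() keeps halving the per-word width until the tags spread over at least four words (and hence a few cachelines). A rough standalone sketch of that sizing, with an illustrative depth of 32:

#include <stdio.h>

int main(void)
{
	unsigned int depth = 32;			/* illustrative */
	unsigned int bits_per_word = 6;			/* ilog2(64) on a 64-bit build */
	unsigned int tags_per_word = 1U << bits_per_word;

	if (depth >= 4) {
		/* shrink until at least four words are in use */
		while (tags_per_word * 4 > depth) {
			bits_per_word--;
			tags_per_word = 1U << bits_per_word;
		}
	}

	/* nr = ALIGN(depth, tags_per_word) / tags_per_word */
	unsigned int nr = (depth + tags_per_word - 1) / tags_per_word;

	printf("depth=%u -> %u tags/word, %u words\n", depth, tags_per_word, nr);
	return 0;	/* prints: depth=32 -> 8 tags/word, 4 words */
}
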
494
495static void bt_free(struct blk_mq_bitmap_tags *bt)
496{
497 kfree(bt->map);
498 kfree(bt->bs);
499}
500
501static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
502 int node)
503{
504 unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
505
506 if (bt_alloc(&tags->bitmap_tags, depth, node, false))
507 goto enomem;
508 if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
509 goto enomem;
510
511 return tags;
512enomem:
513 bt_free(&tags->bitmap_tags);
514 kfree(tags);
515 return NULL;
516}
120 517
121struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, 518struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
122 unsigned int reserved_tags, int node) 519 unsigned int reserved_tags, int node)
123{ 520{
124 unsigned int nr_tags, nr_cache;
125 struct blk_mq_tags *tags; 521 struct blk_mq_tags *tags;
126 int ret;
127 522
128 if (total_tags > BLK_MQ_TAG_MAX) { 523 if (total_tags > BLK_MQ_TAG_MAX) {
129 pr_err("blk-mq: tag depth too large\n"); 524 pr_err("blk-mq: tag depth too large\n");
@@ -134,73 +529,59 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
134 if (!tags) 529 if (!tags)
135 return NULL; 530 return NULL;
136 531
137 nr_tags = total_tags - reserved_tags;
138 nr_cache = nr_tags / num_possible_cpus();
139
140 if (nr_cache < BLK_MQ_TAG_CACHE_MIN)
141 nr_cache = BLK_MQ_TAG_CACHE_MIN;
142 else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
143 nr_cache = BLK_MQ_TAG_CACHE_MAX;
144
145 tags->nr_tags = total_tags; 532 tags->nr_tags = total_tags;
146 tags->nr_reserved_tags = reserved_tags; 533 tags->nr_reserved_tags = reserved_tags;
147 tags->nr_max_cache = nr_cache;
148 tags->nr_batch_move = max(1u, nr_cache / 2);
149 534
150 ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - 535 return blk_mq_init_bitmap_tags(tags, node);
151 tags->nr_reserved_tags, 536}
152 tags->nr_max_cache,
153 tags->nr_batch_move);
154 if (ret)
155 goto err_free_tags;
156 537
157 if (reserved_tags) { 538void blk_mq_free_tags(struct blk_mq_tags *tags)
158 /* 539{
159 * With max_cache and batch set to 1, the allocator falls back to 540 bt_free(&tags->bitmap_tags);
160 * uncached. It's fine if reserved tag allocation is slow. 541 bt_free(&tags->breserved_tags);
161 */ 542 kfree(tags);
162 ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, 543}
163 1, 1);
164 if (ret)
165 goto err_reserved_tags;
166 }
167 544
168 return tags; 545void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
546{
547 unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
169 548
170err_reserved_tags: 549 *tag = prandom_u32() % depth;
171 percpu_ida_destroy(&tags->free_tags);
172err_free_tags:
173 kfree(tags);
174 return NULL;
175} 550}
176 551
177void blk_mq_free_tags(struct blk_mq_tags *tags) 552int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
178{ 553{
179 percpu_ida_destroy(&tags->free_tags); 554 tdepth -= tags->nr_reserved_tags;
180 percpu_ida_destroy(&tags->reserved_tags); 555 if (tdepth > tags->nr_tags)
181 kfree(tags); 556 return -EINVAL;
557
558 /*
559 * Don't need (or can't) update reserved tags here, they remain
560 * static and should never need resizing.
561 */
562 bt_update_count(&tags->bitmap_tags, tdepth);
563 blk_mq_tag_wakeup_all(tags);
564 return 0;
182} 565}
183 566
184ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) 567ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
185{ 568{
186 char *orig_page = page; 569 char *orig_page = page;
187 unsigned int cpu; 570 unsigned int free, res;
188 571
189 if (!tags) 572 if (!tags)
190 return 0; 573 return 0;
191 574
192 page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," 575 page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
193 " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, 576 "bits_per_word=%u\n",
194 tags->nr_batch_move, tags->nr_max_cache); 577 tags->nr_tags, tags->nr_reserved_tags,
578 tags->bitmap_tags.bits_per_word);
195 579
196 page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", 580 free = bt_unused_tags(&tags->bitmap_tags);
197 percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), 581 res = bt_unused_tags(&tags->breserved_tags);
198 percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids));
199 582
200 for_each_possible_cpu(cpu) { 583 page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
201 page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, 584 page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));
202 percpu_ida_free_tags(&tags->free_tags, cpu));
203 }
204 585
205 return page - orig_page; 586 return page - orig_page;
206} 587}
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 947ba2c6148e..c959de58d2a5 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -1,17 +1,59 @@
1#ifndef INT_BLK_MQ_TAG_H 1#ifndef INT_BLK_MQ_TAG_H
2#define INT_BLK_MQ_TAG_H 2#define INT_BLK_MQ_TAG_H
3 3
4struct blk_mq_tags; 4#include "blk-mq.h"
5
6enum {
7 BT_WAIT_QUEUES = 8,
8 BT_WAIT_BATCH = 8,
9};
10
11struct bt_wait_state {
12 atomic_t wait_cnt;
13 wait_queue_head_t wait;
14} ____cacheline_aligned_in_smp;
15
16#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word)
17#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1))
18
19struct blk_mq_bitmap_tags {
20 unsigned int depth;
21 unsigned int wake_cnt;
22 unsigned int bits_per_word;
23
24 unsigned int map_nr;
25 struct blk_align_bitmap *map;
26
27 unsigned int wake_index;
28 struct bt_wait_state *bs;
29};
30
31/*
32 * Tag address space map.
33 */
34struct blk_mq_tags {
35 unsigned int nr_tags;
36 unsigned int nr_reserved_tags;
37
38 atomic_t active_queues;
39
40 struct blk_mq_bitmap_tags bitmap_tags;
41 struct blk_mq_bitmap_tags breserved_tags;
42
43 struct request **rqs;
44 struct list_head page_list;
45};
46
5 47
6extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); 48extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
7extern void blk_mq_free_tags(struct blk_mq_tags *tags); 49extern void blk_mq_free_tags(struct blk_mq_tags *tags);
8 50
9extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); 51extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
10extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); 52extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
11extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag);
12extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
13extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); 53extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
14extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); 54extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
55extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
56extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
15 57
16enum { 58enum {
17 BLK_MQ_TAG_CACHE_MIN = 1, 59 BLK_MQ_TAG_CACHE_MIN = 1,
@@ -24,4 +66,23 @@ enum {
24 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, 66 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
25}; 67};
26 68
69extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
70extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
71
72static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
73{
74 if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
75 return false;
76
77 return __blk_mq_tag_busy(hctx);
78}
79
80static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
81{
82 if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
83 return;
84
85 __blk_mq_tag_idle(hctx);
86}
87
27#endif 88#endif
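
The TAG_TO_INDEX()/TAG_TO_BIT() macros added above split a tag number into a blk_align_bitmap word index (high bits) and a bit offset inside that word (low bits). A userspace sketch of the mapping, assuming an illustrative bits_per_word of 4:

#include <stdio.h>

int main(void)
{
	unsigned int bits_per_word = 4;		/* 16 tags per word, made up */
	unsigned int tags[] = { 0, 5, 16, 37 };

	for (unsigned int i = 0; i < sizeof(tags) / sizeof(tags[0]); i++) {
		unsigned int index = tags[i] >> bits_per_word;			/* TAG_TO_INDEX() */
		unsigned int bit = tags[i] & ((1 << bits_per_word) - 1);	/* TAG_TO_BIT() */

		printf("tag %2u -> map[%u], bit %u\n", tags[i], index, bit);
	}
	return 0;	/* e.g. tag 37 -> map[2], bit 5 */
}
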
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1d2a9bdbee57..0f5879c42dcd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1,3 +1,9 @@
1/*
2 * Block multiqueue core code
3 *
4 * Copyright (C) 2013-2014 Jens Axboe
5 * Copyright (C) 2013-2014 Christoph Hellwig
6 */
1#include <linux/kernel.h> 7#include <linux/kernel.h>
2#include <linux/module.h> 8#include <linux/module.h>
3#include <linux/backing-dev.h> 9#include <linux/backing-dev.h>
@@ -56,38 +62,40 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
56{ 62{
57 unsigned int i; 63 unsigned int i;
58 64
59 for (i = 0; i < hctx->nr_ctx_map; i++) 65 for (i = 0; i < hctx->ctx_map.map_size; i++)
60 if (hctx->ctx_map[i]) 66 if (hctx->ctx_map.map[i].word)
61 return true; 67 return true;
62 68
63 return false; 69 return false;
64} 70}
65 71
72static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
73 struct blk_mq_ctx *ctx)
74{
75 return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
76}
77
78#define CTX_TO_BIT(hctx, ctx) \
79 ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
80
66/* 81/*
67 * Mark this ctx as having pending work in this hardware queue 82 * Mark this ctx as having pending work in this hardware queue
68 */ 83 */
69static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 84static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
70 struct blk_mq_ctx *ctx) 85 struct blk_mq_ctx *ctx)
71{ 86{
72 if (!test_bit(ctx->index_hw, hctx->ctx_map)) 87 struct blk_align_bitmap *bm = get_bm(hctx, ctx);
73 set_bit(ctx->index_hw, hctx->ctx_map); 88
89 if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
90 set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
74} 91}
75 92
76static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, 93static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
77 gfp_t gfp, bool reserved) 94 struct blk_mq_ctx *ctx)
78{ 95{
79 struct request *rq; 96 struct blk_align_bitmap *bm = get_bm(hctx, ctx);
80 unsigned int tag;
81 97
82 tag = blk_mq_get_tag(hctx->tags, gfp, reserved); 98 clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
83 if (tag != BLK_MQ_TAG_FAIL) {
84 rq = hctx->rqs[tag];
85 rq->tag = tag;
86
87 return rq;
88 }
89
90 return NULL;
91} 99}
92 100
93static int blk_mq_queue_enter(struct request_queue *q) 101static int blk_mq_queue_enter(struct request_queue *q)
@@ -186,78 +194,95 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
186 if (blk_queue_io_stat(q)) 194 if (blk_queue_io_stat(q))
187 rw_flags |= REQ_IO_STAT; 195 rw_flags |= REQ_IO_STAT;
188 196
197 INIT_LIST_HEAD(&rq->queuelist);
198 /* csd/requeue_work/fifo_time is initialized before use */
199 rq->q = q;
189 rq->mq_ctx = ctx; 200 rq->mq_ctx = ctx;
190 rq->cmd_flags = rw_flags; 201 rq->cmd_flags |= rw_flags;
191 rq->start_time = jiffies; 202 /* do not touch atomic flags, it needs atomic ops against the timer */
203 rq->cpu = -1;
204 INIT_HLIST_NODE(&rq->hash);
205 RB_CLEAR_NODE(&rq->rb_node);
206 rq->rq_disk = NULL;
207 rq->part = NULL;
208#ifdef CONFIG_BLK_CGROUP
209 rq->rl = NULL;
192 set_start_time_ns(rq); 210 set_start_time_ns(rq);
211 rq->io_start_time_ns = 0;
212#endif
213 rq->nr_phys_segments = 0;
214#if defined(CONFIG_BLK_DEV_INTEGRITY)
215 rq->nr_integrity_segments = 0;
216#endif
217 rq->special = NULL;
218 /* tag was already set */
219 rq->errors = 0;
220
221 rq->extra_len = 0;
222 rq->sense_len = 0;
223 rq->resid_len = 0;
224 rq->sense = NULL;
225
226 INIT_LIST_HEAD(&rq->timeout_list);
227 rq->end_io = NULL;
228 rq->end_io_data = NULL;
229 rq->next_rq = NULL;
230
193 ctx->rq_dispatched[rw_is_sync(rw_flags)]++; 231 ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
194} 232}
195 233
196static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, 234static struct request *
197 int rw, gfp_t gfp, 235__blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
198 bool reserved) 236 struct blk_mq_ctx *ctx, int rw, gfp_t gfp, bool reserved)
199{ 237{
200 struct request *rq; 238 struct request *rq;
239 unsigned int tag;
201 240
202 do { 241 tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved);
203 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); 242 if (tag != BLK_MQ_TAG_FAIL) {
204 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); 243 rq = hctx->tags->rqs[tag];
205 244
206 rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); 245 rq->cmd_flags = 0;
207 if (rq) { 246 if (blk_mq_tag_busy(hctx)) {
208 blk_mq_rq_ctx_init(q, ctx, rq, rw); 247 rq->cmd_flags = REQ_MQ_INFLIGHT;
209 break; 248 atomic_inc(&hctx->nr_active);
210 } 249 }
211 250
212 blk_mq_put_ctx(ctx); 251 rq->tag = tag;
213 if (!(gfp & __GFP_WAIT)) 252 blk_mq_rq_ctx_init(q, ctx, rq, rw);
214 break; 253 return rq;
215 254 }
216 __blk_mq_run_hw_queue(hctx);
217 blk_mq_wait_for_tags(hctx->tags);
218 } while (1);
219 255
220 return rq; 256 return NULL;
221} 257}
222 258
223struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) 259struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
260 bool reserved)
224{ 261{
262 struct blk_mq_ctx *ctx;
263 struct blk_mq_hw_ctx *hctx;
225 struct request *rq; 264 struct request *rq;
226 265
227 if (blk_mq_queue_enter(q)) 266 if (blk_mq_queue_enter(q))
228 return NULL; 267 return NULL;
229 268
230 rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); 269 ctx = blk_mq_get_ctx(q);
231 if (rq) 270 hctx = q->mq_ops->map_queue(q, ctx->cpu);
232 blk_mq_put_ctx(rq->mq_ctx);
233 return rq;
234}
235
236struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
237 gfp_t gfp)
238{
239 struct request *rq;
240 271
241 if (blk_mq_queue_enter(q)) 272 rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp & ~__GFP_WAIT,
242 return NULL; 273 reserved);
274 if (!rq && (gfp & __GFP_WAIT)) {
275 __blk_mq_run_hw_queue(hctx);
276 blk_mq_put_ctx(ctx);
243 277
244 rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); 278 ctx = blk_mq_get_ctx(q);
245 if (rq) 279 hctx = q->mq_ops->map_queue(q, ctx->cpu);
246 blk_mq_put_ctx(rq->mq_ctx); 280 rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp, reserved);
281 }
282 blk_mq_put_ctx(ctx);
247 return rq; 283 return rq;
248} 284}
249EXPORT_SYMBOL(blk_mq_alloc_reserved_request); 285EXPORT_SYMBOL(blk_mq_alloc_request);
250
251/*
252 * Re-init and set pdu, if we have it
253 */
254void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
255{
256 blk_rq_init(hctx->queue, rq);
257
258 if (hctx->cmd_size)
259 rq->special = blk_mq_rq_to_pdu(rq);
260}
261 286
262static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, 287static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
263 struct blk_mq_ctx *ctx, struct request *rq) 288 struct blk_mq_ctx *ctx, struct request *rq)
@@ -265,9 +290,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
265 const int tag = rq->tag; 290 const int tag = rq->tag;
266 struct request_queue *q = rq->q; 291 struct request_queue *q = rq->q;
267 292
268 blk_mq_rq_init(hctx, rq); 293 if (rq->cmd_flags & REQ_MQ_INFLIGHT)
269 blk_mq_put_tag(hctx->tags, tag); 294 atomic_dec(&hctx->nr_active);
270 295
296 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
297 blk_mq_put_tag(hctx, tag, &ctx->last_tag);
271 blk_mq_queue_exit(q); 298 blk_mq_queue_exit(q);
272} 299}
273 300
@@ -283,20 +310,47 @@ void blk_mq_free_request(struct request *rq)
283 __blk_mq_free_request(hctx, ctx, rq); 310 __blk_mq_free_request(hctx, ctx, rq);
284} 311}
285 312
286bool blk_mq_end_io_partial(struct request *rq, int error, unsigned int nr_bytes) 313/*
314 * Clone all relevant state from a request that has been put on hold in
315 * the flush state machine into the preallocated flush request that hangs
316 * off the request queue.
317 *
318 * For a driver the flush request should be invisible, that's why we are
319 * impersonating the original request here.
320 */
321void blk_mq_clone_flush_request(struct request *flush_rq,
322 struct request *orig_rq)
287{ 323{
288 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 324 struct blk_mq_hw_ctx *hctx =
289 return true; 325 orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu);
326
327 flush_rq->mq_ctx = orig_rq->mq_ctx;
328 flush_rq->tag = orig_rq->tag;
329 memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq),
330 hctx->cmd_size);
331}
290 332
333inline void __blk_mq_end_io(struct request *rq, int error)
334{
291 blk_account_io_done(rq); 335 blk_account_io_done(rq);
292 336
293 if (rq->end_io) 337 if (rq->end_io) {
294 rq->end_io(rq, error); 338 rq->end_io(rq, error);
295 else 339 } else {
340 if (unlikely(blk_bidi_rq(rq)))
341 blk_mq_free_request(rq->next_rq);
296 blk_mq_free_request(rq); 342 blk_mq_free_request(rq);
297 return false; 343 }
344}
345EXPORT_SYMBOL(__blk_mq_end_io);
346
347void blk_mq_end_io(struct request *rq, int error)
348{
349 if (blk_update_request(rq, error, blk_rq_bytes(rq)))
350 BUG();
351 __blk_mq_end_io(rq, error);
298} 352}
299EXPORT_SYMBOL(blk_mq_end_io_partial); 353EXPORT_SYMBOL(blk_mq_end_io);
300 354
301static void __blk_mq_complete_request_remote(void *data) 355static void __blk_mq_complete_request_remote(void *data)
302{ 356{
@@ -305,18 +359,22 @@ static void __blk_mq_complete_request_remote(void *data)
305 rq->q->softirq_done_fn(rq); 359 rq->q->softirq_done_fn(rq);
306} 360}
307 361
308void __blk_mq_complete_request(struct request *rq) 362static void blk_mq_ipi_complete_request(struct request *rq)
309{ 363{
310 struct blk_mq_ctx *ctx = rq->mq_ctx; 364 struct blk_mq_ctx *ctx = rq->mq_ctx;
365 bool shared = false;
311 int cpu; 366 int cpu;
312 367
313 if (!ctx->ipi_redirect) { 368 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
314 rq->q->softirq_done_fn(rq); 369 rq->q->softirq_done_fn(rq);
315 return; 370 return;
316 } 371 }
317 372
318 cpu = get_cpu(); 373 cpu = get_cpu();
319 if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { 374 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
375 shared = cpus_share_cache(cpu, ctx->cpu);
376
377 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
320 rq->csd.func = __blk_mq_complete_request_remote; 378 rq->csd.func = __blk_mq_complete_request_remote;
321 rq->csd.info = rq; 379 rq->csd.info = rq;
322 rq->csd.flags = 0; 380 rq->csd.flags = 0;
@@ -327,6 +385,16 @@ void __blk_mq_complete_request(struct request *rq)
327 put_cpu(); 385 put_cpu();
328} 386}
329 387
388void __blk_mq_complete_request(struct request *rq)
389{
390 struct request_queue *q = rq->q;
391
392 if (!q->softirq_done_fn)
393 blk_mq_end_io(rq, rq->errors);
394 else
395 blk_mq_ipi_complete_request(rq);
396}
397
330/** 398/**
331 * blk_mq_complete_request - end I/O on a request 399 * blk_mq_complete_request - end I/O on a request
332 * @rq: the request being processed 400 * @rq: the request being processed
@@ -337,7 +405,9 @@ void __blk_mq_complete_request(struct request *rq)
337 **/ 405 **/
338void blk_mq_complete_request(struct request *rq) 406void blk_mq_complete_request(struct request *rq)
339{ 407{
340 if (unlikely(blk_should_fake_timeout(rq->q))) 408 struct request_queue *q = rq->q;
409
410 if (unlikely(blk_should_fake_timeout(q)))
341 return; 411 return;
342 if (!blk_mark_rq_complete(rq)) 412 if (!blk_mark_rq_complete(rq))
343 __blk_mq_complete_request(rq); 413 __blk_mq_complete_request(rq);
@@ -350,13 +420,31 @@ static void blk_mq_start_request(struct request *rq, bool last)
350 420
351 trace_block_rq_issue(q, rq); 421 trace_block_rq_issue(q, rq);
352 422
423 rq->resid_len = blk_rq_bytes(rq);
424 if (unlikely(blk_bidi_rq(rq)))
425 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
426
353 /* 427 /*
354 * Just mark start time and set the started bit. Due to memory 428 * Just mark start time and set the started bit. Due to memory
355 * ordering, we know we'll see the correct deadline as long as 429 * ordering, we know we'll see the correct deadline as long as
356 * REQ_ATOM_STARTED is seen. 430 * REQ_ATOM_STARTED is seen. Use the default queue timeout,
431 * unless one has been set in the request.
432 */
433 if (!rq->timeout)
434 rq->deadline = jiffies + q->rq_timeout;
435 else
436 rq->deadline = jiffies + rq->timeout;
437
438 /*
439 * Mark us as started and clear complete. Complete might have been
440 * set if requeue raced with timeout, which then marked it as
441 * complete. So be sure to clear complete again when we start
442 * the request, otherwise we'll ignore the completion event.
357 */ 443 */
358 rq->deadline = jiffies + q->rq_timeout; 444 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
359 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 445 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
446 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
447 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
360 448
361 if (q->dma_drain_size && blk_rq_bytes(rq)) { 449 if (q->dma_drain_size && blk_rq_bytes(rq)) {
362 /* 450 /*
@@ -378,7 +466,7 @@ static void blk_mq_start_request(struct request *rq, bool last)
378 rq->cmd_flags |= REQ_END; 466 rq->cmd_flags |= REQ_END;
379} 467}
380 468
381static void blk_mq_requeue_request(struct request *rq) 469static void __blk_mq_requeue_request(struct request *rq)
382{ 470{
383 struct request_queue *q = rq->q; 471 struct request_queue *q = rq->q;
384 472
@@ -391,6 +479,86 @@ static void blk_mq_requeue_request(struct request *rq)
391 rq->nr_phys_segments--; 479 rq->nr_phys_segments--;
392} 480}
393 481
482void blk_mq_requeue_request(struct request *rq)
483{
484 __blk_mq_requeue_request(rq);
485 blk_clear_rq_complete(rq);
486
487 BUG_ON(blk_queued_rq(rq));
488 blk_mq_add_to_requeue_list(rq, true);
489}
490EXPORT_SYMBOL(blk_mq_requeue_request);
491
492static void blk_mq_requeue_work(struct work_struct *work)
493{
494 struct request_queue *q =
495 container_of(work, struct request_queue, requeue_work);
496 LIST_HEAD(rq_list);
497 struct request *rq, *next;
498 unsigned long flags;
499
500 spin_lock_irqsave(&q->requeue_lock, flags);
501 list_splice_init(&q->requeue_list, &rq_list);
502 spin_unlock_irqrestore(&q->requeue_lock, flags);
503
504 list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
505 if (!(rq->cmd_flags & REQ_SOFTBARRIER))
506 continue;
507
508 rq->cmd_flags &= ~REQ_SOFTBARRIER;
509 list_del_init(&rq->queuelist);
510 blk_mq_insert_request(rq, true, false, false);
511 }
512
513 while (!list_empty(&rq_list)) {
514 rq = list_entry(rq_list.next, struct request, queuelist);
515 list_del_init(&rq->queuelist);
516 blk_mq_insert_request(rq, false, false, false);
517 }
518
519 blk_mq_run_queues(q, false);
520}
521
522void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
523{
524 struct request_queue *q = rq->q;
525 unsigned long flags;
526
527 /*
528 * We abuse this flag that is otherwise used by the I/O scheduler to
529 * request head insertion from the workqueue.
530 */
531 BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
532
533 spin_lock_irqsave(&q->requeue_lock, flags);
534 if (at_head) {
535 rq->cmd_flags |= REQ_SOFTBARRIER;
536 list_add(&rq->queuelist, &q->requeue_list);
537 } else {
538 list_add_tail(&rq->queuelist, &q->requeue_list);
539 }
540 spin_unlock_irqrestore(&q->requeue_lock, flags);
541}
542EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
543
544void blk_mq_kick_requeue_list(struct request_queue *q)
545{
546 kblockd_schedule_work(&q->requeue_work);
547}
548EXPORT_SYMBOL(blk_mq_kick_requeue_list);
549
550struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag)
551{
552 struct request_queue *q = hctx->queue;
553
554 if ((q->flush_rq->cmd_flags & REQ_FLUSH_SEQ) &&
555 q->flush_rq->tag == tag)
556 return q->flush_rq;
557
558 return hctx->tags->rqs[tag];
559}
560EXPORT_SYMBOL(blk_mq_tag_to_rq);
561
394struct blk_mq_timeout_data { 562struct blk_mq_timeout_data {
395 struct blk_mq_hw_ctx *hctx; 563 struct blk_mq_hw_ctx *hctx;
396 unsigned long *next; 564 unsigned long *next;
@@ -412,12 +580,13 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
412 do { 580 do {
413 struct request *rq; 581 struct request *rq;
414 582
415 tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag); 583 tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
416 if (tag >= hctx->queue_depth) 584 if (tag >= hctx->tags->nr_tags)
417 break; 585 break;
418 586
419 rq = hctx->rqs[tag++]; 587 rq = blk_mq_tag_to_rq(hctx, tag++);
420 588 if (rq->q != hctx->queue)
589 continue;
421 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 590 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
422 continue; 591 continue;
423 592
@@ -442,6 +611,28 @@ static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
442 blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); 611 blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
443} 612}
444 613
614static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
615{
616 struct request_queue *q = rq->q;
617
618 /*
619 * We know that complete is set at this point. If STARTED isn't set
620 * anymore, then the request isn't active and the "timeout" should
621 * just be ignored. This can happen due to the bitflag ordering.
622 * Timeout first checks if STARTED is set, and if it is, assumes
623 * the request is active. But if we race with completion, then
624 * both flags will get cleared. So check here again, and ignore
625 * a timeout event with a request that isn't active.
626 */
627 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
628 return BLK_EH_NOT_HANDLED;
629
630 if (!q->mq_ops->timeout)
631 return BLK_EH_RESET_TIMER;
632
633 return q->mq_ops->timeout(rq);
634}
635
445static void blk_mq_rq_timer(unsigned long data) 636static void blk_mq_rq_timer(unsigned long data)
446{ 637{
447 struct request_queue *q = (struct request_queue *) data; 638 struct request_queue *q = (struct request_queue *) data;
@@ -449,11 +640,24 @@ static void blk_mq_rq_timer(unsigned long data)
449 unsigned long next = 0; 640 unsigned long next = 0;
450 int i, next_set = 0; 641 int i, next_set = 0;
451 642
452 queue_for_each_hw_ctx(q, hctx, i) 643 queue_for_each_hw_ctx(q, hctx, i) {
644 /*
645 * If no software queues are currently mapped to this
646 * hardware queue, there's nothing to check
647 */
648 if (!hctx->nr_ctx || !hctx->tags)
649 continue;
650
453 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); 651 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
652 }
454 653
455 if (next_set) 654 if (next_set) {
456 mod_timer(&q->timeout, round_jiffies_up(next)); 655 next = blk_rq_timeout(round_jiffies_up(next));
656 mod_timer(&q->timeout, next);
657 } else {
658 queue_for_each_hw_ctx(q, hctx, i)
659 blk_mq_tag_idle(hctx);
660 }
457} 661}
458 662
459/* 663/*
@@ -495,9 +699,38 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
495 return false; 699 return false;
496} 700}
497 701
498void blk_mq_add_timer(struct request *rq) 702/*
703 * Process software queues that have been marked busy, splicing them
704 * to the for-dispatch list
705 */
706static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
499{ 707{
500 __blk_add_timer(rq, NULL); 708 struct blk_mq_ctx *ctx;
709 int i;
710
711 for (i = 0; i < hctx->ctx_map.map_size; i++) {
712 struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
713 unsigned int off, bit;
714
715 if (!bm->word)
716 continue;
717
718 bit = 0;
719 off = i * hctx->ctx_map.bits_per_word;
720 do {
721 bit = find_next_bit(&bm->word, bm->depth, bit);
722 if (bit >= bm->depth)
723 break;
724
725 ctx = hctx->ctxs[bit + off];
726 clear_bit(bit, &bm->word);
727 spin_lock(&ctx->lock);
728 list_splice_tail_init(&ctx->rq_list, list);
729 spin_unlock(&ctx->lock);
730
731 bit++;
732 } while (1);
733 }
501} 734}
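
flush_busy_ctxs() walks the sparse ctx map word by word; for every set bit it recovers the global software-queue index as word * bits_per_word + bit before splicing that queue's requests. A standalone sketch of the same scan over a made-up map:

#include <stdio.h>

int main(void)
{
	unsigned int bits_per_word = 8;			/* illustrative */
	unsigned long map[] = { 0x05, 0x00, 0x81 };	/* pretend ctx_map words */
	unsigned int nr_words = sizeof(map) / sizeof(map[0]);

	for (unsigned int i = 0; i < nr_words; i++) {
		unsigned int off = i * bits_per_word;

		for (unsigned int bit = 0; bit < bits_per_word; bit++) {
			if (map[i] & (1UL << bit))
				printf("busy ctx %u\n", off + bit);	/* hctx->ctxs[bit + off] */
		}
	}
	return 0;	/* prints ctx 0, 2, 16, 23 */
}
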
502 735
503/* 736/*
@@ -509,10 +742,11 @@ void blk_mq_add_timer(struct request *rq)
509static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 742static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
510{ 743{
511 struct request_queue *q = hctx->queue; 744 struct request_queue *q = hctx->queue;
512 struct blk_mq_ctx *ctx;
513 struct request *rq; 745 struct request *rq;
514 LIST_HEAD(rq_list); 746 LIST_HEAD(rq_list);
515 int bit, queued; 747 int queued;
748
749 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
516 750
517 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 751 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
518 return; 752 return;
@@ -522,15 +756,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
522 /* 756 /*
523 * Touch any software queue that has pending entries. 757 * Touch any software queue that has pending entries.
524 */ 758 */
525 for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { 759 flush_busy_ctxs(hctx, &rq_list);
526 clear_bit(bit, hctx->ctx_map);
527 ctx = hctx->ctxs[bit];
528 BUG_ON(bit != ctx->index_hw);
529
530 spin_lock(&ctx->lock);
531 list_splice_tail_init(&ctx->rq_list, &rq_list);
532 spin_unlock(&ctx->lock);
533 }
534 760
535 /* 761 /*
536 * If we have previous entries on our dispatch list, grab them 762 * If we have previous entries on our dispatch list, grab them
@@ -544,13 +770,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
544 } 770 }
545 771
546 /* 772 /*
547 * Delete and return all entries from our dispatch list
548 */
549 queued = 0;
550
551 /*
552 * Now process all the entries, sending them to the driver. 773 * Now process all the entries, sending them to the driver.
553 */ 774 */
775 queued = 0;
554 while (!list_empty(&rq_list)) { 776 while (!list_empty(&rq_list)) {
555 int ret; 777 int ret;
556 778
@@ -565,13 +787,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
565 queued++; 787 queued++;
566 continue; 788 continue;
567 case BLK_MQ_RQ_QUEUE_BUSY: 789 case BLK_MQ_RQ_QUEUE_BUSY:
568 /*
569 * FIXME: we should have a mechanism to stop the queue
570 * like blk_stop_queue, otherwise we will waste cpu
571 * time
572 */
573 list_add(&rq->queuelist, &rq_list); 790 list_add(&rq->queuelist, &rq_list);
574 blk_mq_requeue_request(rq); 791 __blk_mq_requeue_request(rq);
575 break; 792 break;
576 default: 793 default:
577 pr_err("blk-mq: bad return on queue: %d\n", ret); 794 pr_err("blk-mq: bad return on queue: %d\n", ret);
@@ -601,17 +818,44 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
601 } 818 }
602} 819}
603 820
821/*
822 * It'd be great if the workqueue API had a way to pass
823 * in a mask and had some smarts for more clever placement.
824 * For now we just round-robin here, switching once every
825 * BLK_MQ_CPU_WORK_BATCH queued items.
826 */
827static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
828{
829 int cpu = hctx->next_cpu;
830
831 if (--hctx->next_cpu_batch <= 0) {
832 int next_cpu;
833
834 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
835 if (next_cpu >= nr_cpu_ids)
836 next_cpu = cpumask_first(hctx->cpumask);
837
838 hctx->next_cpu = next_cpu;
839 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
840 }
841
842 return cpu;
843}
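
The helper above reuses the same CPU for BLK_MQ_CPU_WORK_BATCH pieces of work before stepping to the next CPU in the hardware queue's mask, wrapping around at the end. A userspace sketch of that policy (the mask and batch size below are invented):

#include <stdio.h>

#define BATCH	8	/* stands in for BLK_MQ_CPU_WORK_BATCH */

int main(void)
{
	int mask[] = { 1, 4, 6 };	/* pretend hctx->cpumask */
	int nr = sizeof(mask) / sizeof(mask[0]);
	int cur = 0, batch = BATCH;

	for (int work = 0; work < 20; work++) {
		int cpu = mask[cur];		/* hand out the current CPU... */

		if (--batch <= 0) {		/* ...and advance after a full batch */
			cur = (cur + 1) % nr;
			batch = BATCH;
		}
		printf("work %2d -> cpu %d\n", work, cpu);
	}
	return 0;
}
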
844
604void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 845void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
605{ 846{
606 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 847 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
607 return; 848 return;
608 849
609 if (!async) 850 if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
610 __blk_mq_run_hw_queue(hctx); 851 __blk_mq_run_hw_queue(hctx);
852 else if (hctx->queue->nr_hw_queues == 1)
853 kblockd_schedule_delayed_work(&hctx->run_work, 0);
611 else { 854 else {
612 struct request_queue *q = hctx->queue; 855 unsigned int cpu;
613 856
614 kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0); 857 cpu = blk_mq_hctx_next_cpu(hctx);
858 kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
615 } 859 }
616} 860}
617 861
@@ -626,14 +870,17 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
626 test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 870 test_bit(BLK_MQ_S_STOPPED, &hctx->state))
627 continue; 871 continue;
628 872
873 preempt_disable();
629 blk_mq_run_hw_queue(hctx, async); 874 blk_mq_run_hw_queue(hctx, async);
875 preempt_enable();
630 } 876 }
631} 877}
632EXPORT_SYMBOL(blk_mq_run_queues); 878EXPORT_SYMBOL(blk_mq_run_queues);
633 879
634void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 880void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
635{ 881{
636 cancel_delayed_work(&hctx->delayed_work); 882 cancel_delayed_work(&hctx->run_work);
883 cancel_delayed_work(&hctx->delay_work);
637 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 884 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
638} 885}
639EXPORT_SYMBOL(blk_mq_stop_hw_queue); 886EXPORT_SYMBOL(blk_mq_stop_hw_queue);
@@ -651,11 +898,25 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queues);
651void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 898void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
652{ 899{
653 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 900 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
901
902 preempt_disable();
654 __blk_mq_run_hw_queue(hctx); 903 __blk_mq_run_hw_queue(hctx);
904 preempt_enable();
655} 905}
656EXPORT_SYMBOL(blk_mq_start_hw_queue); 906EXPORT_SYMBOL(blk_mq_start_hw_queue);
657 907
658void blk_mq_start_stopped_hw_queues(struct request_queue *q) 908void blk_mq_start_hw_queues(struct request_queue *q)
909{
910 struct blk_mq_hw_ctx *hctx;
911 int i;
912
913 queue_for_each_hw_ctx(q, hctx, i)
914 blk_mq_start_hw_queue(hctx);
915}
916EXPORT_SYMBOL(blk_mq_start_hw_queues);
917
918
919void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
659{ 920{
660 struct blk_mq_hw_ctx *hctx; 921 struct blk_mq_hw_ctx *hctx;
661 int i; 922 int i;
@@ -665,19 +926,47 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q)
665 continue; 926 continue;
666 927
667 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 928 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
668 blk_mq_run_hw_queue(hctx, true); 929 preempt_disable();
930 blk_mq_run_hw_queue(hctx, async);
931 preempt_enable();
669 } 932 }
670} 933}
671EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 934EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
672 935
673static void blk_mq_work_fn(struct work_struct *work) 936static void blk_mq_run_work_fn(struct work_struct *work)
674{ 937{
675 struct blk_mq_hw_ctx *hctx; 938 struct blk_mq_hw_ctx *hctx;
676 939
677 hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work); 940 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
941
678 __blk_mq_run_hw_queue(hctx); 942 __blk_mq_run_hw_queue(hctx);
679} 943}
680 944
945static void blk_mq_delay_work_fn(struct work_struct *work)
946{
947 struct blk_mq_hw_ctx *hctx;
948
949 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
950
951 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
952 __blk_mq_run_hw_queue(hctx);
953}
954
955void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
956{
957 unsigned long tmo = msecs_to_jiffies(msecs);
958
959 if (hctx->queue->nr_hw_queues == 1)
960 kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
961 else {
962 unsigned int cpu;
963
964 cpu = blk_mq_hctx_next_cpu(hctx);
965 kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
966 }
967}
968EXPORT_SYMBOL(blk_mq_delay_queue);
969
681static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 970static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
682 struct request *rq, bool at_head) 971 struct request *rq, bool at_head)
683{ 972{
@@ -689,12 +978,13 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
689 list_add(&rq->queuelist, &ctx->rq_list); 978 list_add(&rq->queuelist, &ctx->rq_list);
690 else 979 else
691 list_add_tail(&rq->queuelist, &ctx->rq_list); 980 list_add_tail(&rq->queuelist, &ctx->rq_list);
981
692 blk_mq_hctx_mark_pending(hctx, ctx); 982 blk_mq_hctx_mark_pending(hctx, ctx);
693 983
694 /* 984 /*
695 * We do this early, to ensure we are on the right CPU. 985 * We do this early, to ensure we are on the right CPU.
696 */ 986 */
697 blk_mq_add_timer(rq); 987 blk_add_timer(rq);
698} 988}
699 989
700void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, 990void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
@@ -719,10 +1009,10 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
719 spin_unlock(&ctx->lock); 1009 spin_unlock(&ctx->lock);
720 } 1010 }
721 1011
722 blk_mq_put_ctx(current_ctx);
723
724 if (run_queue) 1012 if (run_queue)
725 blk_mq_run_hw_queue(hctx, async); 1013 blk_mq_run_hw_queue(hctx, async);
1014
1015 blk_mq_put_ctx(current_ctx);
726} 1016}
727 1017
728static void blk_mq_insert_requests(struct request_queue *q, 1018static void blk_mq_insert_requests(struct request_queue *q,
@@ -758,9 +1048,8 @@ static void blk_mq_insert_requests(struct request_queue *q,
758 } 1048 }
759 spin_unlock(&ctx->lock); 1049 spin_unlock(&ctx->lock);
760 1050
761 blk_mq_put_ctx(current_ctx);
762
763 blk_mq_run_hw_queue(hctx, from_schedule); 1051 blk_mq_run_hw_queue(hctx, from_schedule);
1052 blk_mq_put_ctx(current_ctx);
764} 1053}
765 1054
766static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 1055static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -823,24 +1112,169 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
823static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 1112static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
824{ 1113{
825 init_request_from_bio(rq, bio); 1114 init_request_from_bio(rq, bio);
826 blk_account_io_start(rq, 1); 1115
1116 if (blk_do_io_stat(rq)) {
1117 rq->start_time = jiffies;
1118 blk_account_io_start(rq, 1);
1119 }
827} 1120}
828 1121
829static void blk_mq_make_request(struct request_queue *q, struct bio *bio) 1122static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
1123 struct blk_mq_ctx *ctx,
1124 struct request *rq, struct bio *bio)
1125{
1126 struct request_queue *q = hctx->queue;
1127
1128 if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) {
1129 blk_mq_bio_to_request(rq, bio);
1130 spin_lock(&ctx->lock);
1131insert_rq:
1132 __blk_mq_insert_request(hctx, rq, false);
1133 spin_unlock(&ctx->lock);
1134 return false;
1135 } else {
1136 spin_lock(&ctx->lock);
1137 if (!blk_mq_attempt_merge(q, ctx, bio)) {
1138 blk_mq_bio_to_request(rq, bio);
1139 goto insert_rq;
1140 }
1141
1142 spin_unlock(&ctx->lock);
1143 __blk_mq_free_request(hctx, ctx, rq);
1144 return true;
1145 }
1146}
1147
1148struct blk_map_ctx {
1149 struct blk_mq_hw_ctx *hctx;
1150 struct blk_mq_ctx *ctx;
1151};
1152
1153static struct request *blk_mq_map_request(struct request_queue *q,
1154 struct bio *bio,
1155 struct blk_map_ctx *data)
830{ 1156{
831 struct blk_mq_hw_ctx *hctx; 1157 struct blk_mq_hw_ctx *hctx;
832 struct blk_mq_ctx *ctx; 1158 struct blk_mq_ctx *ctx;
1159 struct request *rq;
1160 int rw = bio_data_dir(bio);
1161
1162 if (unlikely(blk_mq_queue_enter(q))) {
1163 bio_endio(bio, -EIO);
1164 return NULL;
1165 }
1166
1167 ctx = blk_mq_get_ctx(q);
1168 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1169
1170 if (rw_is_sync(bio->bi_rw))
1171 rw |= REQ_SYNC;
1172
1173 trace_block_getrq(q, bio, rw);
1174 rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false);
1175 if (unlikely(!rq)) {
1176 __blk_mq_run_hw_queue(hctx);
1177 blk_mq_put_ctx(ctx);
1178 trace_block_sleeprq(q, bio, rw);
1179
1180 ctx = blk_mq_get_ctx(q);
1181 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1182 rq = __blk_mq_alloc_request(q, hctx, ctx, rw,
1183 __GFP_WAIT|GFP_ATOMIC, false);
1184 }
1185
1186 hctx->queued++;
1187 data->hctx = hctx;
1188 data->ctx = ctx;
1189 return rq;
1190}
1191
1192/*
1193 * Multiple hardware queue variant. This will not use per-process plugs,
1194 * but will attempt to bypass the hctx queueing if we can go straight to
1195 * hardware for SYNC IO.
1196 */
1197static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1198{
833 const int is_sync = rw_is_sync(bio->bi_rw); 1199 const int is_sync = rw_is_sync(bio->bi_rw);
834 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 1200 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
835 int rw = bio_data_dir(bio); 1201 struct blk_map_ctx data;
836 struct request *rq; 1202 struct request *rq;
1203
1204 blk_queue_bounce(q, &bio);
1205
1206 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1207 bio_endio(bio, -EIO);
1208 return;
1209 }
1210
1211 rq = blk_mq_map_request(q, bio, &data);
1212 if (unlikely(!rq))
1213 return;
1214
1215 if (unlikely(is_flush_fua)) {
1216 blk_mq_bio_to_request(rq, bio);
1217 blk_insert_flush(rq);
1218 goto run_queue;
1219 }
1220
1221 if (is_sync) {
1222 int ret;
1223
1224 blk_mq_bio_to_request(rq, bio);
1225 blk_mq_start_request(rq, true);
1226 blk_add_timer(rq);
1227
1228 /*
1229 * For OK queue, we are done. For error, kill it. Any other
1230 * error (busy), just add it to our list as we previously
1231 * would have done
1232 */
1233 ret = q->mq_ops->queue_rq(data.hctx, rq);
1234 if (ret == BLK_MQ_RQ_QUEUE_OK)
1235 goto done;
1236 else {
1237 __blk_mq_requeue_request(rq);
1238
1239 if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1240 rq->errors = -EIO;
1241 blk_mq_end_io(rq, rq->errors);
1242 goto done;
1243 }
1244 }
1245 }
1246
1247 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1248 /*
1249 * For a SYNC request, send it to the hardware immediately. For
1250 * an ASYNC request, just ensure that we run it later on. The
1251 * latter allows for merging opportunities and more efficient
1252 * dispatching.
1253 */
1254run_queue:
1255 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1256 }
1257done:
1258 blk_mq_put_ctx(data.ctx);
1259}
1260
1261/*
1262 * Single hardware queue variant. This will attempt to use any per-process
1263 * plug for merging and IO deferral.
1264 */
1265static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1266{
1267 const int is_sync = rw_is_sync(bio->bi_rw);
1268 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
837 unsigned int use_plug, request_count = 0; 1269 unsigned int use_plug, request_count = 0;
1270 struct blk_map_ctx data;
1271 struct request *rq;
838 1272
839 /* 1273 /*
840 * If we have multiple hardware queues, just go directly to 1274 * If we have multiple hardware queues, just go directly to
841 * one of those for sync IO. 1275 * one of those for sync IO.
842 */ 1276 */
843 use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); 1277 use_plug = !is_flush_fua && !is_sync;
844 1278
845 blk_queue_bounce(q, &bio); 1279 blk_queue_bounce(q, &bio);
846 1280
@@ -849,37 +1283,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
849 return; 1283 return;
850 } 1284 }
851 1285
852 if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) 1286 if (use_plug && !blk_queue_nomerges(q) &&
1287 blk_attempt_plug_merge(q, bio, &request_count))
853 return; 1288 return;
854 1289
855 if (blk_mq_queue_enter(q)) { 1290 rq = blk_mq_map_request(q, bio, &data);
856 bio_endio(bio, -EIO);
857 return;
858 }
859
860 ctx = blk_mq_get_ctx(q);
861 hctx = q->mq_ops->map_queue(q, ctx->cpu);
862
863 if (is_sync)
864 rw |= REQ_SYNC;
865 trace_block_getrq(q, bio, rw);
866 rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
867 if (likely(rq))
868 blk_mq_rq_ctx_init(q, ctx, rq, rw);
869 else {
870 blk_mq_put_ctx(ctx);
871 trace_block_sleeprq(q, bio, rw);
872 rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
873 false);
874 ctx = rq->mq_ctx;
875 hctx = q->mq_ops->map_queue(q, ctx->cpu);
876 }
877
878 hctx->queued++;
879 1291
880 if (unlikely(is_flush_fua)) { 1292 if (unlikely(is_flush_fua)) {
881 blk_mq_bio_to_request(rq, bio); 1293 blk_mq_bio_to_request(rq, bio);
882 blk_mq_put_ctx(ctx);
883 blk_insert_flush(rq); 1294 blk_insert_flush(rq);
884 goto run_queue; 1295 goto run_queue;
885 } 1296 }
@@ -901,31 +1312,23 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
901 trace_block_plug(q); 1312 trace_block_plug(q);
902 } 1313 }
903 list_add_tail(&rq->queuelist, &plug->mq_list); 1314 list_add_tail(&rq->queuelist, &plug->mq_list);
904 blk_mq_put_ctx(ctx); 1315 blk_mq_put_ctx(data.ctx);
905 return; 1316 return;
906 } 1317 }
907 } 1318 }
908 1319
909 spin_lock(&ctx->lock); 1320 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
910 1321 /*
911 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 1322 * For a SYNC request, send it to the hardware immediately. For
912 blk_mq_attempt_merge(q, ctx, bio)) 1323 * an ASYNC request, just ensure that we run it later on. The
913 __blk_mq_free_request(hctx, ctx, rq); 1324 * latter allows for merging opportunities and more efficient
914 else { 1325 * dispatching.
915 blk_mq_bio_to_request(rq, bio); 1326 */
916 __blk_mq_insert_request(hctx, rq, false); 1327run_queue:
1328 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
917 } 1329 }
918 1330
919 spin_unlock(&ctx->lock); 1331 blk_mq_put_ctx(data.ctx);
920 blk_mq_put_ctx(ctx);
921
922 /*
923 * For a SYNC request, send it to the hardware immediately. For an
924 * ASYNC request, just ensure that we run it later on. The latter
925 * allows for merging opportunities and more efficient dispatching.
926 */
927run_queue:
928 blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
929} 1332}
930 1333
931/* 1334/*
@@ -937,32 +1340,153 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
937} 1340}
938EXPORT_SYMBOL(blk_mq_map_queue); 1341EXPORT_SYMBOL(blk_mq_map_queue);
939 1342
940struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg, 1343static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
941 unsigned int hctx_index) 1344 struct blk_mq_tags *tags, unsigned int hctx_idx)
942{ 1345{
943 return kmalloc_node(sizeof(struct blk_mq_hw_ctx), 1346 struct page *page;
944 GFP_KERNEL | __GFP_ZERO, reg->numa_node); 1347
1348 if (tags->rqs && set->ops->exit_request) {
1349 int i;
1350
1351 for (i = 0; i < tags->nr_tags; i++) {
1352 if (!tags->rqs[i])
1353 continue;
1354 set->ops->exit_request(set->driver_data, tags->rqs[i],
1355 hctx_idx, i);
1356 }
1357 }
1358
1359 while (!list_empty(&tags->page_list)) {
1360 page = list_first_entry(&tags->page_list, struct page, lru);
1361 list_del_init(&page->lru);
1362 __free_pages(page, page->private);
1363 }
1364
1365 kfree(tags->rqs);
1366
1367 blk_mq_free_tags(tags);
945} 1368}
946EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
947 1369
948void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, 1370static size_t order_to_size(unsigned int order)
949 unsigned int hctx_index)
950{ 1371{
951 kfree(hctx); 1372 return (size_t)PAGE_SIZE << order;
952} 1373}
953EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
954 1374
955static void blk_mq_hctx_notify(void *data, unsigned long action, 1375static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
956 unsigned int cpu) 1376 unsigned int hctx_idx)
1377{
1378 struct blk_mq_tags *tags;
1379 unsigned int i, j, entries_per_page, max_order = 4;
1380 size_t rq_size, left;
1381
1382 tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
1383 set->numa_node);
1384 if (!tags)
1385 return NULL;
1386
1387 INIT_LIST_HEAD(&tags->page_list);
1388
1389 tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *),
1390 GFP_KERNEL, set->numa_node);
1391 if (!tags->rqs) {
1392 blk_mq_free_tags(tags);
1393 return NULL;
1394 }
1395
1396 /*
1397 * rq_size is the size of the request plus driver payload, rounded
1398 * to the cacheline size
1399 */
1400 rq_size = round_up(sizeof(struct request) + set->cmd_size,
1401 cache_line_size());
1402 left = rq_size * set->queue_depth;
1403
1404 for (i = 0; i < set->queue_depth; ) {
1405 int this_order = max_order;
1406 struct page *page;
1407 int to_do;
1408 void *p;
1409
1410 while (left < order_to_size(this_order - 1) && this_order)
1411 this_order--;
1412
1413 do {
1414 page = alloc_pages_node(set->numa_node, GFP_KERNEL,
1415 this_order);
1416 if (page)
1417 break;
1418 if (!this_order--)
1419 break;
1420 if (order_to_size(this_order) < rq_size)
1421 break;
1422 } while (1);
1423
1424 if (!page)
1425 goto fail;
1426
1427 page->private = this_order;
1428 list_add_tail(&page->lru, &tags->page_list);
1429
1430 p = page_address(page);
1431 entries_per_page = order_to_size(this_order) / rq_size;
1432 to_do = min(entries_per_page, set->queue_depth - i);
1433 left -= to_do * rq_size;
1434 for (j = 0; j < to_do; j++) {
1435 tags->rqs[i] = p;
1436 if (set->ops->init_request) {
1437 if (set->ops->init_request(set->driver_data,
1438 tags->rqs[i], hctx_idx, i,
1439 set->numa_node))
1440 goto fail;
1441 }
1442
1443 p += rq_size;
1444 i++;
1445 }
1446 }
1447
1448 return tags;
1449
1450fail:
1451 pr_warn("%s: failed to allocate requests\n", __func__);
1452 blk_mq_free_rq_map(set, tags, hctx_idx);
1453 return NULL;
1454}
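
The allocation loop above rounds each request plus driver pdu up to a cache line, then carves requests out of power-of-two chunks no bigger than what is still needed, so large queue depths avoid one giant allocation. A userspace sketch of just the sizing math (all sizes are made up):

#include <stdio.h>

#define PAGE_SZ		4096UL
#define CACHE_LINE	64UL

static unsigned long order_to_size(unsigned int order)
{
	return PAGE_SZ << order;
}

int main(void)
{
	unsigned long request_size = 384;	/* stand-in for sizeof(struct request) */
	unsigned long cmd_size = 200;		/* pretend driver pdu size */
	unsigned int queue_depth = 128, allocated = 0, max_order = 4;

	/* rq_size = round_up(sizeof(struct request) + cmd_size, cache_line_size()) */
	unsigned long rq_size = (request_size + cmd_size + CACHE_LINE - 1) & ~(CACHE_LINE - 1);
	unsigned long left = rq_size * queue_depth;

	while (allocated < queue_depth) {
		unsigned int order = max_order;

		/* don't grab a chunk much bigger than what is still needed */
		while (order && left < order_to_size(order - 1))
			order--;

		unsigned int per_chunk = order_to_size(order) / rq_size;
		unsigned int to_do = per_chunk < queue_depth - allocated ?
					per_chunk : queue_depth - allocated;

		printf("order-%u chunk: %u requests\n", order, to_do);
		left -= (unsigned long)to_do * rq_size;
		allocated += to_do;
	}
	return 0;	/* order-4 chunk: 102 requests, order-3 chunk: 26 requests */
}
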
1455
1456static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
1457{
1458 kfree(bitmap->map);
1459}
1460
1461static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
1462{
1463 unsigned int bpw = 8, total, num_maps, i;
1464
1465 bitmap->bits_per_word = bpw;
1466
1467 num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
1468 bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
1469 GFP_KERNEL, node);
1470 if (!bitmap->map)
1471 return -ENOMEM;
1472
1473 bitmap->map_size = num_maps;
1474
1475 total = nr_cpu_ids;
1476 for (i = 0; i < num_maps; i++) {
1477 bitmap->map[i].depth = min(total, bitmap->bits_per_word);
1478 total -= bitmap->map[i].depth;
1479 }
1480
1481 return 0;
1482}
1483
1484static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
957{ 1485{
958 struct blk_mq_hw_ctx *hctx = data;
959 struct request_queue *q = hctx->queue; 1486 struct request_queue *q = hctx->queue;
960 struct blk_mq_ctx *ctx; 1487 struct blk_mq_ctx *ctx;
961 LIST_HEAD(tmp); 1488 LIST_HEAD(tmp);
962 1489
963 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
964 return;
965
966 /* 1490 /*
967 * Move ctx entries to new CPU, if this one is going away. 1491 * Move ctx entries to new CPU, if this one is going away.
968 */ 1492 */
@@ -971,12 +1495,12 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
971 spin_lock(&ctx->lock); 1495 spin_lock(&ctx->lock);
972 if (!list_empty(&ctx->rq_list)) { 1496 if (!list_empty(&ctx->rq_list)) {
973 list_splice_init(&ctx->rq_list, &tmp); 1497 list_splice_init(&ctx->rq_list, &tmp);
974 clear_bit(ctx->index_hw, hctx->ctx_map); 1498 blk_mq_hctx_clear_pending(hctx, ctx);
975 } 1499 }
976 spin_unlock(&ctx->lock); 1500 spin_unlock(&ctx->lock);
977 1501
978 if (list_empty(&tmp)) 1502 if (list_empty(&tmp))
979 return; 1503 return NOTIFY_OK;
980 1504
981 ctx = blk_mq_get_ctx(q); 1505 ctx = blk_mq_get_ctx(q);
982 spin_lock(&ctx->lock); 1506 spin_lock(&ctx->lock);
@@ -993,210 +1517,103 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
993 blk_mq_hctx_mark_pending(hctx, ctx); 1517 blk_mq_hctx_mark_pending(hctx, ctx);
994 1518
995 spin_unlock(&ctx->lock); 1519 spin_unlock(&ctx->lock);
996 blk_mq_put_ctx(ctx);
997 1520
998 blk_mq_run_hw_queue(hctx, true); 1521 blk_mq_run_hw_queue(hctx, true);
1522 blk_mq_put_ctx(ctx);
1523 return NOTIFY_OK;
999} 1524}
1000 1525
1001static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx, 1526static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
1002 int (*init)(void *, struct blk_mq_hw_ctx *,
1003 struct request *, unsigned int),
1004 void *data)
1005{ 1527{
1006 unsigned int i; 1528 struct request_queue *q = hctx->queue;
1007 int ret = 0; 1529 struct blk_mq_tag_set *set = q->tag_set;
1008
1009 for (i = 0; i < hctx->queue_depth; i++) {
1010 struct request *rq = hctx->rqs[i];
1011
1012 ret = init(data, hctx, rq, i);
1013 if (ret)
1014 break;
1015 }
1016
1017 return ret;
1018}
1019 1530
1020int blk_mq_init_commands(struct request_queue *q, 1531 if (set->tags[hctx->queue_num])
1021 int (*init)(void *, struct blk_mq_hw_ctx *, 1532 return NOTIFY_OK;
1022 struct request *, unsigned int),
1023 void *data)
1024{
1025 struct blk_mq_hw_ctx *hctx;
1026 unsigned int i;
1027 int ret = 0;
1028 1533
1029 queue_for_each_hw_ctx(q, hctx, i) { 1534 set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
1030 ret = blk_mq_init_hw_commands(hctx, init, data); 1535 if (!set->tags[hctx->queue_num])
1031 if (ret) 1536 return NOTIFY_STOP;
1032 break;
1033 }
1034 1537
1035 return ret; 1538 hctx->tags = set->tags[hctx->queue_num];
1539 return NOTIFY_OK;
1036} 1540}
1037EXPORT_SYMBOL(blk_mq_init_commands);
1038 1541
1039static void blk_mq_free_hw_commands(struct blk_mq_hw_ctx *hctx, 1542static int blk_mq_hctx_notify(void *data, unsigned long action,
1040 void (*free)(void *, struct blk_mq_hw_ctx *, 1543 unsigned int cpu)
1041 struct request *, unsigned int),
1042 void *data)
1043{ 1544{
1044 unsigned int i; 1545 struct blk_mq_hw_ctx *hctx = data;
1045 1546
1046 for (i = 0; i < hctx->queue_depth; i++) { 1547 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1047 struct request *rq = hctx->rqs[i]; 1548 return blk_mq_hctx_cpu_offline(hctx, cpu);
1549 else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
1550 return blk_mq_hctx_cpu_online(hctx, cpu);
1048 1551
1049 free(data, hctx, rq, i); 1552 return NOTIFY_OK;
1050 }
1051} 1553}
1052 1554
1053void blk_mq_free_commands(struct request_queue *q, 1555static void blk_mq_exit_hw_queues(struct request_queue *q,
1054 void (*free)(void *, struct blk_mq_hw_ctx *, 1556 struct blk_mq_tag_set *set, int nr_queue)
1055 struct request *, unsigned int),
1056 void *data)
1057{ 1557{
1058 struct blk_mq_hw_ctx *hctx; 1558 struct blk_mq_hw_ctx *hctx;
1059 unsigned int i; 1559 unsigned int i;
1060 1560
1061 queue_for_each_hw_ctx(q, hctx, i) 1561 queue_for_each_hw_ctx(q, hctx, i) {
1062 blk_mq_free_hw_commands(hctx, free, data); 1562 if (i == nr_queue)
1063} 1563 break;
1064EXPORT_SYMBOL(blk_mq_free_commands);
1065 1564
1066static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx) 1565 if (set->ops->exit_hctx)
1067{ 1566 set->ops->exit_hctx(hctx, i);
1068 struct page *page;
1069 1567
1070 while (!list_empty(&hctx->page_list)) { 1568 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1071 page = list_first_entry(&hctx->page_list, struct page, lru); 1569 kfree(hctx->ctxs);
1072 list_del_init(&page->lru); 1570 blk_mq_free_bitmap(&hctx->ctx_map);
1073 __free_pages(page, page->private);
1074 } 1571 }
1075 1572
1076 kfree(hctx->rqs);
1077
1078 if (hctx->tags)
1079 blk_mq_free_tags(hctx->tags);
1080}
1081
1082static size_t order_to_size(unsigned int order)
1083{
1084 size_t ret = PAGE_SIZE;
1085
1086 while (order--)
1087 ret *= 2;
1088
1089 return ret;
1090} 1573}
1091 1574
1092static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx, 1575static void blk_mq_free_hw_queues(struct request_queue *q,
1093 unsigned int reserved_tags, int node) 1576 struct blk_mq_tag_set *set)
1094{ 1577{
1095 unsigned int i, j, entries_per_page, max_order = 4; 1578 struct blk_mq_hw_ctx *hctx;
1096 size_t rq_size, left; 1579 unsigned int i;
1097
1098 INIT_LIST_HEAD(&hctx->page_list);
1099
1100 hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
1101 GFP_KERNEL, node);
1102 if (!hctx->rqs)
1103 return -ENOMEM;
1104
1105 /*
1106 * rq_size is the size of the request plus driver payload, rounded
1107 * to the cacheline size
1108 */
1109 rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
1110 cache_line_size());
1111 left = rq_size * hctx->queue_depth;
1112
1113 for (i = 0; i < hctx->queue_depth;) {
1114 int this_order = max_order;
1115 struct page *page;
1116 int to_do;
1117 void *p;
1118
1119 while (left < order_to_size(this_order - 1) && this_order)
1120 this_order--;
1121
1122 do {
1123 page = alloc_pages_node(node, GFP_KERNEL, this_order);
1124 if (page)
1125 break;
1126 if (!this_order--)
1127 break;
1128 if (order_to_size(this_order) < rq_size)
1129 break;
1130 } while (1);
1131
1132 if (!page)
1133 break;
1134
1135 page->private = this_order;
1136 list_add_tail(&page->lru, &hctx->page_list);
1137
1138 p = page_address(page);
1139 entries_per_page = order_to_size(this_order) / rq_size;
1140 to_do = min(entries_per_page, hctx->queue_depth - i);
1141 left -= to_do * rq_size;
1142 for (j = 0; j < to_do; j++) {
1143 hctx->rqs[i] = p;
1144 blk_mq_rq_init(hctx, hctx->rqs[i]);
1145 p += rq_size;
1146 i++;
1147 }
1148 }
1149
1150 if (i < (reserved_tags + BLK_MQ_TAG_MIN))
1151 goto err_rq_map;
1152 else if (i != hctx->queue_depth) {
1153 hctx->queue_depth = i;
1154 pr_warn("%s: queue depth set to %u because of low memory\n",
1155 __func__, i);
1156 }
1157 1580
1158 hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node); 1581 queue_for_each_hw_ctx(q, hctx, i) {
1159 if (!hctx->tags) { 1582 free_cpumask_var(hctx->cpumask);
1160err_rq_map: 1583 kfree(hctx);
1161 blk_mq_free_rq_map(hctx);
1162 return -ENOMEM;
1163 } 1584 }
1164
1165 return 0;
1166} 1585}
1167 1586
1168static int blk_mq_init_hw_queues(struct request_queue *q, 1587static int blk_mq_init_hw_queues(struct request_queue *q,
1169 struct blk_mq_reg *reg, void *driver_data) 1588 struct blk_mq_tag_set *set)
1170{ 1589{
1171 struct blk_mq_hw_ctx *hctx; 1590 struct blk_mq_hw_ctx *hctx;
1172 unsigned int i, j; 1591 unsigned int i;
1173 1592
1174 /* 1593 /*
1175 * Initialize hardware queues 1594 * Initialize hardware queues
1176 */ 1595 */
1177 queue_for_each_hw_ctx(q, hctx, i) { 1596 queue_for_each_hw_ctx(q, hctx, i) {
1178 unsigned int num_maps;
1179 int node; 1597 int node;
1180 1598
1181 node = hctx->numa_node; 1599 node = hctx->numa_node;
1182 if (node == NUMA_NO_NODE) 1600 if (node == NUMA_NO_NODE)
1183 node = hctx->numa_node = reg->numa_node; 1601 node = hctx->numa_node = set->numa_node;
1184 1602
1185 INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn); 1603 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1604 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1186 spin_lock_init(&hctx->lock); 1605 spin_lock_init(&hctx->lock);
1187 INIT_LIST_HEAD(&hctx->dispatch); 1606 INIT_LIST_HEAD(&hctx->dispatch);
1188 hctx->queue = q; 1607 hctx->queue = q;
1189 hctx->queue_num = i; 1608 hctx->queue_num = i;
1190 hctx->flags = reg->flags; 1609 hctx->flags = set->flags;
1191 hctx->queue_depth = reg->queue_depth; 1610 hctx->cmd_size = set->cmd_size;
1192 hctx->cmd_size = reg->cmd_size;
1193 1611
1194 blk_mq_init_cpu_notifier(&hctx->cpu_notifier, 1612 blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1195 blk_mq_hctx_notify, hctx); 1613 blk_mq_hctx_notify, hctx);
1196 blk_mq_register_cpu_notifier(&hctx->cpu_notifier); 1614 blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1197 1615
1198 if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node)) 1616 hctx->tags = set->tags[i];
1199 break;
1200 1617
1201 /* 1618 /*
1202 * Allocate space for all possible cpus to avoid allocation in 1619 * Allocate space for all possible cpus to avoid allocation in
@@ -1207,17 +1624,13 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
1207 if (!hctx->ctxs) 1624 if (!hctx->ctxs)
1208 break; 1625 break;
1209 1626
1210 num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; 1627 if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
1211 hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
1212 GFP_KERNEL, node);
1213 if (!hctx->ctx_map)
1214 break; 1628 break;
1215 1629
1216 hctx->nr_ctx_map = num_maps;
1217 hctx->nr_ctx = 0; 1630 hctx->nr_ctx = 0;
1218 1631
1219 if (reg->ops->init_hctx && 1632 if (set->ops->init_hctx &&
1220 reg->ops->init_hctx(hctx, driver_data, i)) 1633 set->ops->init_hctx(hctx, set->driver_data, i))
1221 break; 1634 break;
1222 } 1635 }
1223 1636
@@ -1227,17 +1640,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
1227 /* 1640 /*
1228 * Init failed 1641 * Init failed
1229 */ 1642 */
1230 queue_for_each_hw_ctx(q, hctx, j) { 1643 blk_mq_exit_hw_queues(q, set, i);
1231 if (i == j)
1232 break;
1233
1234 if (reg->ops->exit_hctx)
1235 reg->ops->exit_hctx(hctx, j);
1236
1237 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1238 blk_mq_free_rq_map(hctx);
1239 kfree(hctx->ctxs);
1240 }
1241 1644
1242 return 1; 1645 return 1;
1243} 1646}
@@ -1258,12 +1661,13 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
1258 __ctx->queue = q; 1661 __ctx->queue = q;
1259 1662
1260 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1663 /* If the cpu isn't online, the cpu is mapped to first hctx */
1261 hctx = q->mq_ops->map_queue(q, i);
1262 hctx->nr_ctx++;
1263
1264 if (!cpu_online(i)) 1664 if (!cpu_online(i))
1265 continue; 1665 continue;
1266 1666
1667 hctx = q->mq_ops->map_queue(q, i);
1668 cpumask_set_cpu(i, hctx->cpumask);
1669 hctx->nr_ctx++;
1670
1267 /* 1671 /*
1268 * Set local node, IFF we have more than one hw queue. If 1672 * Set local node, IFF we have more than one hw queue. If
1269 * not, we remain on the home node of the device 1673 * not, we remain on the home node of the device
@@ -1280,6 +1684,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
1280 struct blk_mq_ctx *ctx; 1684 struct blk_mq_ctx *ctx;
1281 1685
1282 queue_for_each_hw_ctx(q, hctx, i) { 1686 queue_for_each_hw_ctx(q, hctx, i) {
1687 cpumask_clear(hctx->cpumask);
1283 hctx->nr_ctx = 0; 1688 hctx->nr_ctx = 0;
1284 } 1689 }
1285 1690
@@ -1288,115 +1693,208 @@ static void blk_mq_map_swqueue(struct request_queue *q)
1288 */ 1693 */
1289 queue_for_each_ctx(q, ctx, i) { 1694 queue_for_each_ctx(q, ctx, i) {
1290 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1695 /* If the cpu isn't online, the cpu is mapped to first hctx */
1696 if (!cpu_online(i))
1697 continue;
1698
1291 hctx = q->mq_ops->map_queue(q, i); 1699 hctx = q->mq_ops->map_queue(q, i);
1700 cpumask_set_cpu(i, hctx->cpumask);
1292 ctx->index_hw = hctx->nr_ctx; 1701 ctx->index_hw = hctx->nr_ctx;
1293 hctx->ctxs[hctx->nr_ctx++] = ctx; 1702 hctx->ctxs[hctx->nr_ctx++] = ctx;
1294 } 1703 }
1704
1705 queue_for_each_hw_ctx(q, hctx, i) {
1706 /*
 1707 * If no software queues are mapped to this hardware queue,
1708 * disable it and free the request entries
1709 */
1710 if (!hctx->nr_ctx) {
1711 struct blk_mq_tag_set *set = q->tag_set;
1712
1713 if (set->tags[i]) {
1714 blk_mq_free_rq_map(set, set->tags[i], i);
1715 set->tags[i] = NULL;
1716 hctx->tags = NULL;
1717 }
1718 continue;
1719 }
1720
1721 /*
1722 * Initialize batch roundrobin counts
1723 */
1724 hctx->next_cpu = cpumask_first(hctx->cpumask);
1725 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1726 }
1295} 1727}
1296 1728
1297struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, 1729static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
1298 void *driver_data)
1299{ 1730{
1300 struct blk_mq_hw_ctx **hctxs; 1731 struct blk_mq_hw_ctx *hctx;
1301 struct blk_mq_ctx *ctx;
1302 struct request_queue *q; 1732 struct request_queue *q;
1733 bool shared;
1303 int i; 1734 int i;
1304 1735
1305 if (!reg->nr_hw_queues || 1736 if (set->tag_list.next == set->tag_list.prev)
1306 !reg->ops->queue_rq || !reg->ops->map_queue || 1737 shared = false;
1307 !reg->ops->alloc_hctx || !reg->ops->free_hctx) 1738 else
1308 return ERR_PTR(-EINVAL); 1739 shared = true;
1740
1741 list_for_each_entry(q, &set->tag_list, tag_set_list) {
1742 blk_mq_freeze_queue(q);
1309 1743
1310 if (!reg->queue_depth) 1744 queue_for_each_hw_ctx(q, hctx, i) {
1311 reg->queue_depth = BLK_MQ_MAX_DEPTH; 1745 if (shared)
1312 else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) { 1746 hctx->flags |= BLK_MQ_F_TAG_SHARED;
1313 pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth); 1747 else
1314 reg->queue_depth = BLK_MQ_MAX_DEPTH; 1748 hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
1749 }
1750 blk_mq_unfreeze_queue(q);
1315 } 1751 }
1752}
1316 1753
1317 if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) 1754static void blk_mq_del_queue_tag_set(struct request_queue *q)
1318 return ERR_PTR(-EINVAL); 1755{
1756 struct blk_mq_tag_set *set = q->tag_set;
1757
1758 blk_mq_freeze_queue(q);
1759
1760 mutex_lock(&set->tag_list_lock);
1761 list_del_init(&q->tag_set_list);
1762 blk_mq_update_tag_set_depth(set);
1763 mutex_unlock(&set->tag_list_lock);
1764
1765 blk_mq_unfreeze_queue(q);
1766}
1767
1768static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
1769 struct request_queue *q)
1770{
1771 q->tag_set = set;
1772
1773 mutex_lock(&set->tag_list_lock);
1774 list_add_tail(&q->tag_set_list, &set->tag_list);
1775 blk_mq_update_tag_set_depth(set);
1776 mutex_unlock(&set->tag_list_lock);
1777}
1778
1779struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1780{
1781 struct blk_mq_hw_ctx **hctxs;
1782 struct blk_mq_ctx *ctx;
1783 struct request_queue *q;
1784 unsigned int *map;
1785 int i;
1319 1786
1320 ctx = alloc_percpu(struct blk_mq_ctx); 1787 ctx = alloc_percpu(struct blk_mq_ctx);
1321 if (!ctx) 1788 if (!ctx)
1322 return ERR_PTR(-ENOMEM); 1789 return ERR_PTR(-ENOMEM);
1323 1790
1324 hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, 1791 hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1325 reg->numa_node); 1792 set->numa_node);
1326 1793
1327 if (!hctxs) 1794 if (!hctxs)
1328 goto err_percpu; 1795 goto err_percpu;
1329 1796
1330 for (i = 0; i < reg->nr_hw_queues; i++) { 1797 map = blk_mq_make_queue_map(set);
1331 hctxs[i] = reg->ops->alloc_hctx(reg, i); 1798 if (!map)
1799 goto err_map;
1800
1801 for (i = 0; i < set->nr_hw_queues; i++) {
1802 int node = blk_mq_hw_queue_to_node(map, i);
1803
1804 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
1805 GFP_KERNEL, node);
1332 if (!hctxs[i]) 1806 if (!hctxs[i])
1333 goto err_hctxs; 1807 goto err_hctxs;
1334 1808
1335 hctxs[i]->numa_node = NUMA_NO_NODE; 1809 if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
1810 goto err_hctxs;
1811
1812 atomic_set(&hctxs[i]->nr_active, 0);
1813 hctxs[i]->numa_node = node;
1336 hctxs[i]->queue_num = i; 1814 hctxs[i]->queue_num = i;
1337 } 1815 }
1338 1816
1339 q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node); 1817 q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1340 if (!q) 1818 if (!q)
1341 goto err_hctxs; 1819 goto err_hctxs;
1342 1820
1343 q->mq_map = blk_mq_make_queue_map(reg); 1821 if (percpu_counter_init(&q->mq_usage_counter, 0))
1344 if (!q->mq_map)
1345 goto err_map; 1822 goto err_map;
1346 1823
1347 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); 1824 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1348 blk_queue_rq_timeout(q, 30000); 1825 blk_queue_rq_timeout(q, 30000);
1349 1826
1350 q->nr_queues = nr_cpu_ids; 1827 q->nr_queues = nr_cpu_ids;
1351 q->nr_hw_queues = reg->nr_hw_queues; 1828 q->nr_hw_queues = set->nr_hw_queues;
1829 q->mq_map = map;
1352 1830
1353 q->queue_ctx = ctx; 1831 q->queue_ctx = ctx;
1354 q->queue_hw_ctx = hctxs; 1832 q->queue_hw_ctx = hctxs;
1355 1833
1356 q->mq_ops = reg->ops; 1834 q->mq_ops = set->ops;
1357 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 1835 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
1358 1836
1837 if (!(set->flags & BLK_MQ_F_SG_MERGE))
1838 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
1839
1359 q->sg_reserved_size = INT_MAX; 1840 q->sg_reserved_size = INT_MAX;
1360 1841
1361 blk_queue_make_request(q, blk_mq_make_request); 1842 INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
1362 blk_queue_rq_timed_out(q, reg->ops->timeout); 1843 INIT_LIST_HEAD(&q->requeue_list);
1363 if (reg->timeout) 1844 spin_lock_init(&q->requeue_lock);
1364 blk_queue_rq_timeout(q, reg->timeout); 1845
1846 if (q->nr_hw_queues > 1)
1847 blk_queue_make_request(q, blk_mq_make_request);
1848 else
1849 blk_queue_make_request(q, blk_sq_make_request);
1850
1851 blk_queue_rq_timed_out(q, blk_mq_rq_timed_out);
1852 if (set->timeout)
1853 blk_queue_rq_timeout(q, set->timeout);
1854
1855 /*
1856 * Do this after blk_queue_make_request() overrides it...
1857 */
1858 q->nr_requests = set->queue_depth;
1365 1859
1366 if (reg->ops->complete) 1860 if (set->ops->complete)
1367 blk_queue_softirq_done(q, reg->ops->complete); 1861 blk_queue_softirq_done(q, set->ops->complete);
1368 1862
1369 blk_mq_init_flush(q); 1863 blk_mq_init_flush(q);
1370 blk_mq_init_cpu_queues(q, reg->nr_hw_queues); 1864 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
1371 1865
1372 q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size, 1866 q->flush_rq = kzalloc(round_up(sizeof(struct request) +
1373 cache_line_size()), GFP_KERNEL); 1867 set->cmd_size, cache_line_size()),
1868 GFP_KERNEL);
1374 if (!q->flush_rq) 1869 if (!q->flush_rq)
1375 goto err_hw; 1870 goto err_hw;
1376 1871
1377 if (blk_mq_init_hw_queues(q, reg, driver_data)) 1872 if (blk_mq_init_hw_queues(q, set))
1378 goto err_flush_rq; 1873 goto err_flush_rq;
1379 1874
1380 blk_mq_map_swqueue(q);
1381
1382 mutex_lock(&all_q_mutex); 1875 mutex_lock(&all_q_mutex);
1383 list_add_tail(&q->all_q_node, &all_q_list); 1876 list_add_tail(&q->all_q_node, &all_q_list);
1384 mutex_unlock(&all_q_mutex); 1877 mutex_unlock(&all_q_mutex);
1385 1878
1879 blk_mq_add_queue_tag_set(set, q);
1880
1881 blk_mq_map_swqueue(q);
1882
1386 return q; 1883 return q;
1387 1884
1388err_flush_rq: 1885err_flush_rq:
1389 kfree(q->flush_rq); 1886 kfree(q->flush_rq);
1390err_hw: 1887err_hw:
1391 kfree(q->mq_map);
1392err_map:
1393 blk_cleanup_queue(q); 1888 blk_cleanup_queue(q);
1394err_hctxs: 1889err_hctxs:
1395 for (i = 0; i < reg->nr_hw_queues; i++) { 1890 kfree(map);
1891 for (i = 0; i < set->nr_hw_queues; i++) {
1396 if (!hctxs[i]) 1892 if (!hctxs[i])
1397 break; 1893 break;
1398 reg->ops->free_hctx(hctxs[i], i); 1894 free_cpumask_var(hctxs[i]->cpumask);
1895 kfree(hctxs[i]);
1399 } 1896 }
1897err_map:
1400 kfree(hctxs); 1898 kfree(hctxs);
1401err_percpu: 1899err_percpu:
1402 free_percpu(ctx); 1900 free_percpu(ctx);
@@ -1406,18 +1904,14 @@ EXPORT_SYMBOL(blk_mq_init_queue);
1406 1904
1407void blk_mq_free_queue(struct request_queue *q) 1905void blk_mq_free_queue(struct request_queue *q)
1408{ 1906{
1409 struct blk_mq_hw_ctx *hctx; 1907 struct blk_mq_tag_set *set = q->tag_set;
1410 int i;
1411 1908
1412 queue_for_each_hw_ctx(q, hctx, i) { 1909 blk_mq_del_queue_tag_set(q);
1413 kfree(hctx->ctx_map); 1910
1414 kfree(hctx->ctxs); 1911 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
1415 blk_mq_free_rq_map(hctx); 1912 blk_mq_free_hw_queues(q, set);
1416 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1913
1417 if (q->mq_ops->exit_hctx) 1914 percpu_counter_destroy(&q->mq_usage_counter);
1418 q->mq_ops->exit_hctx(hctx, i);
1419 q->mq_ops->free_hctx(hctx, i);
1420 }
1421 1915
1422 free_percpu(q->queue_ctx); 1916 free_percpu(q->queue_ctx);
1423 kfree(q->queue_hw_ctx); 1917 kfree(q->queue_hw_ctx);
@@ -1437,6 +1931,8 @@ static void blk_mq_queue_reinit(struct request_queue *q)
1437{ 1931{
1438 blk_mq_freeze_queue(q); 1932 blk_mq_freeze_queue(q);
1439 1933
1934 blk_mq_sysfs_unregister(q);
1935
1440 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); 1936 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
1441 1937
1442 /* 1938 /*
@@ -1447,6 +1943,8 @@ static void blk_mq_queue_reinit(struct request_queue *q)
1447 1943
1448 blk_mq_map_swqueue(q); 1944 blk_mq_map_swqueue(q);
1449 1945
1946 blk_mq_sysfs_register(q);
1947
1450 blk_mq_unfreeze_queue(q); 1948 blk_mq_unfreeze_queue(q);
1451} 1949}
1452 1950
@@ -1456,10 +1954,10 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1456 struct request_queue *q; 1954 struct request_queue *q;
1457 1955
1458 /* 1956 /*
1459 * Before new mapping is established, hotadded cpu might already start 1957 * Before new mappings are established, hotadded cpu might already
1460 * handling requests. This doesn't break anything as we map offline 1958 * start handling requests. This doesn't break anything as we map
1461 * CPUs to first hardware queue. We will re-init queue below to get 1959 * offline CPUs to first hardware queue. We will re-init the queue
1462 * optimal settings. 1960 * below to get optimal settings.
1463 */ 1961 */
1464 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && 1962 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
1465 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) 1963 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
@@ -1472,6 +1970,81 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1472 return NOTIFY_OK; 1970 return NOTIFY_OK;
1473} 1971}
1474 1972
1973int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
1974{
1975 int i;
1976
1977 if (!set->nr_hw_queues)
1978 return -EINVAL;
1979 if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH)
1980 return -EINVAL;
1981 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
1982 return -EINVAL;
1983
1984 if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
1985 return -EINVAL;
1986
1987
1988 set->tags = kmalloc_node(set->nr_hw_queues *
1989 sizeof(struct blk_mq_tags *),
1990 GFP_KERNEL, set->numa_node);
1991 if (!set->tags)
1992 goto out;
1993
1994 for (i = 0; i < set->nr_hw_queues; i++) {
1995 set->tags[i] = blk_mq_init_rq_map(set, i);
1996 if (!set->tags[i])
1997 goto out_unwind;
1998 }
1999
2000 mutex_init(&set->tag_list_lock);
2001 INIT_LIST_HEAD(&set->tag_list);
2002
2003 return 0;
2004
2005out_unwind:
2006 while (--i >= 0)
2007 blk_mq_free_rq_map(set, set->tags[i], i);
2008out:
2009 return -ENOMEM;
2010}
2011EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2012
2013void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2014{
2015 int i;
2016
2017 for (i = 0; i < set->nr_hw_queues; i++) {
2018 if (set->tags[i])
2019 blk_mq_free_rq_map(set, set->tags[i], i);
2020 }
2021
2022 kfree(set->tags);
2023}
2024EXPORT_SYMBOL(blk_mq_free_tag_set);
2025
2026int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2027{
2028 struct blk_mq_tag_set *set = q->tag_set;
2029 struct blk_mq_hw_ctx *hctx;
2030 int i, ret;
2031
2032 if (!set || nr > set->queue_depth)
2033 return -EINVAL;
2034
2035 ret = 0;
2036 queue_for_each_hw_ctx(q, hctx, i) {
2037 ret = blk_mq_tag_update_depth(hctx->tags, nr);
2038 if (ret)
2039 break;
2040 }
2041
2042 if (!ret)
2043 q->nr_requests = nr;
2044
2045 return ret;
2046}
2047
1475void blk_mq_disable_hotplug(void) 2048void blk_mq_disable_hotplug(void)
1476{ 2049{
1477 mutex_lock(&all_q_mutex); 2050 mutex_lock(&all_q_mutex);
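
A minimal sketch of the driver-facing flow after this conversion, modelled on the null_blk changes later in this same patch: the tag set is filled in and allocated once, then handed to blk_mq_init_queue(). The mydrv_* names, the single hardware queue, and the use of the stock blk_mq_map_queue helper for .map_queue are illustrative assumptions, not something these hunks define.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

struct mydrv_cmd {			/* per-request PDU, sized via set->cmd_size */
	struct request *rq;
};

struct mydrv {
	struct blk_mq_tag_set	tag_set;
	struct request_queue	*q;
};

static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

	cmd->rq = rq;
	/* hand off to hardware; completion comes later from the IRQ path */
	return BLK_MQ_RQ_QUEUE_OK;
}

static struct blk_mq_ops mydrv_mq_ops = {
	.queue_rq	= mydrv_queue_rq,
	.map_queue	= blk_mq_map_queue,	/* assumed: the stock CPU->queue map */
};

static int mydrv_init_queue(struct mydrv *drv)
{
	struct blk_mq_tag_set *set = &drv->tag_set;
	int ret;

	set->ops		= &mydrv_mq_ops;
	set->nr_hw_queues	= 1;
	set->queue_depth	= 64;
	set->numa_node		= NUMA_NO_NODE;
	set->cmd_size		= sizeof(struct mydrv_cmd);
	set->flags		= BLK_MQ_F_SHOULD_MERGE;
	set->driver_data	= drv;

	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;

	drv->q = blk_mq_init_queue(set);
	if (IS_ERR(drv->q)) {
		blk_mq_free_tag_set(set);
		return PTR_ERR(drv->q);
	}
	return 0;
}

static void mydrv_exit_queue(struct mydrv *drv)
{
	blk_cleanup_queue(drv->q);
	blk_mq_free_tag_set(&drv->tag_set);
}

Unlike the old blk_mq_reg interface being removed here, the tag set (and the requests blk_mq_init_rq_map() pre-allocates for it) lives outside any single request_queue; blk_mq_add_queue_tag_set() and BLK_MQ_F_TAG_SHARED above are what let several queues share one set.
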
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ebbe6bac9d61..de7b3bbd5bd6 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -1,6 +1,8 @@
1#ifndef INT_BLK_MQ_H 1#ifndef INT_BLK_MQ_H
2#define INT_BLK_MQ_H 2#define INT_BLK_MQ_H
3 3
4struct blk_mq_tag_set;
5
4struct blk_mq_ctx { 6struct blk_mq_ctx {
5 struct { 7 struct {
6 spinlock_t lock; 8 spinlock_t lock;
@@ -9,7 +11,8 @@ struct blk_mq_ctx {
9 11
10 unsigned int cpu; 12 unsigned int cpu;
11 unsigned int index_hw; 13 unsigned int index_hw;
12 unsigned int ipi_redirect; 14
15 unsigned int last_tag ____cacheline_aligned_in_smp;
13 16
14 /* incremented at dispatch time */ 17 /* incremented at dispatch time */
15 unsigned long rq_dispatched[2]; 18 unsigned long rq_dispatched[2];
@@ -20,21 +23,23 @@ struct blk_mq_ctx {
20 23
21 struct request_queue *queue; 24 struct request_queue *queue;
22 struct kobject kobj; 25 struct kobject kobj;
23}; 26} ____cacheline_aligned_in_smp;
24 27
25void __blk_mq_complete_request(struct request *rq); 28void __blk_mq_complete_request(struct request *rq);
26void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); 29void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
27void blk_mq_init_flush(struct request_queue *q); 30void blk_mq_init_flush(struct request_queue *q);
28void blk_mq_drain_queue(struct request_queue *q); 31void blk_mq_drain_queue(struct request_queue *q);
29void blk_mq_free_queue(struct request_queue *q); 32void blk_mq_free_queue(struct request_queue *q);
30void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq); 33void blk_mq_clone_flush_request(struct request *flush_rq,
34 struct request *orig_rq);
35int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
31 36
32/* 37/*
33 * CPU hotplug helpers 38 * CPU hotplug helpers
34 */ 39 */
35struct blk_mq_cpu_notifier; 40struct blk_mq_cpu_notifier;
36void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, 41void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
37 void (*fn)(void *, unsigned long, unsigned int), 42 int (*fn)(void *, unsigned long, unsigned int),
38 void *data); 43 void *data);
39void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); 44void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
40void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); 45void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
@@ -45,10 +50,23 @@ void blk_mq_disable_hotplug(void);
45/* 50/*
46 * CPU -> queue mappings 51 * CPU -> queue mappings
47 */ 52 */
48struct blk_mq_reg; 53extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
49extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg);
50extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); 54extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
55extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
51 56
52void blk_mq_add_timer(struct request *rq); 57/*
58 * sysfs helpers
59 */
60extern int blk_mq_sysfs_register(struct request_queue *q);
61extern void blk_mq_sysfs_unregister(struct request_queue *q);
62
63/*
64 * Basic implementation of a sparser bitmap, allowing the user to spread
65 * the bits over more cachelines.
66 */
67struct blk_align_bitmap {
68 unsigned long word;
69 unsigned long depth;
70} ____cacheline_aligned_in_smp;
53 71
54#endif 72#endif
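
The blk_align_bitmap added here backs the per-hctx ctx_map that blk_mq_alloc_bitmap() and blk_mq_hctx_clear_pending() manipulate earlier in the diff: each word sits on its own cacheline, so CPUs marking different software queues pending do not fight over one line. Below is a rough userspace illustration of the layout idea only; the names are invented and the kernel code uses atomic bit operations where this does not.

#define _POSIX_C_SOURCE 200112L
#include <stdbool.h>
#include <stdlib.h>

#define CACHELINE 64

struct sparse_word {
	unsigned long word;
	unsigned long depth;		/* useful bits carried by this word */
} __attribute__((aligned(CACHELINE)));

struct sparse_bitmap {
	unsigned int nr_words;
	unsigned int bits_per_word;
	struct sparse_word *map;
};

static int sparse_bitmap_init(struct sparse_bitmap *bm, unsigned int nr_bits,
			      unsigned int bits_per_word)
{
	unsigned int i;

	bm->bits_per_word = bits_per_word;
	bm->nr_words = (nr_bits + bits_per_word - 1) / bits_per_word;
	if (posix_memalign((void **)&bm->map, CACHELINE,
			   bm->nr_words * sizeof(*bm->map)))
		return -1;
	for (i = 0; i < bm->nr_words; i++) {
		bm->map[i].word = 0;
		bm->map[i].depth = bits_per_word;
	}
	return 0;
}

static void sparse_bit_set(struct sparse_bitmap *bm, unsigned int bit)
{
	/* one word (and one cacheline) per group of bits_per_word bits */
	bm->map[bit / bm->bits_per_word].word |= 1UL << (bit % bm->bits_per_word);
}

static bool sparse_bit_test(struct sparse_bitmap *bm, unsigned int bit)
{
	return bm->map[bit / bm->bits_per_word].word &
	       (1UL << (bit % bm->bits_per_word));
}
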
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7500f876dae4..23321fbab293 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -48,11 +48,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
48static ssize_t 48static ssize_t
49queue_requests_store(struct request_queue *q, const char *page, size_t count) 49queue_requests_store(struct request_queue *q, const char *page, size_t count)
50{ 50{
51 struct request_list *rl;
52 unsigned long nr; 51 unsigned long nr;
53 int ret; 52 int ret, err;
54 53
55 if (!q->request_fn) 54 if (!q->request_fn && !q->mq_ops)
56 return -EINVAL; 55 return -EINVAL;
57 56
58 ret = queue_var_store(&nr, page, count); 57 ret = queue_var_store(&nr, page, count);
@@ -62,40 +61,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
62 if (nr < BLKDEV_MIN_RQ) 61 if (nr < BLKDEV_MIN_RQ)
63 nr = BLKDEV_MIN_RQ; 62 nr = BLKDEV_MIN_RQ;
64 63
65 spin_lock_irq(q->queue_lock); 64 if (q->request_fn)
66 q->nr_requests = nr; 65 err = blk_update_nr_requests(q, nr);
67 blk_queue_congestion_threshold(q); 66 else
68 67 err = blk_mq_update_nr_requests(q, nr);
69 /* congestion isn't cgroup aware and follows root blkcg for now */ 68
70 rl = &q->root_rl; 69 if (err)
71 70 return err;
72 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
73 blk_set_queue_congested(q, BLK_RW_SYNC);
74 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
75 blk_clear_queue_congested(q, BLK_RW_SYNC);
76
77 if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
78 blk_set_queue_congested(q, BLK_RW_ASYNC);
79 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
80 blk_clear_queue_congested(q, BLK_RW_ASYNC);
81
82 blk_queue_for_each_rl(rl, q) {
83 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
84 blk_set_rl_full(rl, BLK_RW_SYNC);
85 } else {
86 blk_clear_rl_full(rl, BLK_RW_SYNC);
87 wake_up(&rl->wait[BLK_RW_SYNC]);
88 }
89
90 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
91 blk_set_rl_full(rl, BLK_RW_ASYNC);
92 } else {
93 blk_clear_rl_full(rl, BLK_RW_ASYNC);
94 wake_up(&rl->wait[BLK_RW_ASYNC]);
95 }
96 }
97 71
98 spin_unlock_irq(q->queue_lock);
99 return ret; 72 return ret;
100} 73}
101 74
@@ -544,8 +517,6 @@ static void blk_release_queue(struct kobject *kobj)
544 if (q->queue_tags) 517 if (q->queue_tags)
545 __blk_queue_free_tags(q); 518 __blk_queue_free_tags(q);
546 519
547 percpu_counter_destroy(&q->mq_usage_counter);
548
549 if (q->mq_ops) 520 if (q->mq_ops)
550 blk_mq_free_queue(q); 521 blk_mq_free_queue(q);
551 522
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 033745cd7fba..9353b4683359 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -744,7 +744,7 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
744static bool throtl_slice_used(struct throtl_grp *tg, bool rw) 744static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
745{ 745{
746 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 746 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
747 return 0; 747 return false;
748 748
749 return 1; 749 return 1;
750} 750}
@@ -842,7 +842,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
842 if (tg->io_disp[rw] + 1 <= io_allowed) { 842 if (tg->io_disp[rw] + 1 <= io_allowed) {
843 if (wait) 843 if (wait)
844 *wait = 0; 844 *wait = 0;
845 return 1; 845 return true;
846 } 846 }
847 847
848 /* Calc approx time to dispatch */ 848 /* Calc approx time to dispatch */
@@ -880,7 +880,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
880 if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { 880 if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
881 if (wait) 881 if (wait)
882 *wait = 0; 882 *wait = 0;
883 return 1; 883 return true;
884 } 884 }
885 885
886 /* Calc approx time to dispatch */ 886 /* Calc approx time to dispatch */
@@ -923,7 +923,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
923 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { 923 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
924 if (wait) 924 if (wait)
925 *wait = 0; 925 *wait = 0;
926 return 1; 926 return true;
927 } 927 }
928 928
929 /* 929 /*
@@ -1258,7 +1258,7 @@ out_unlock:
1258 * of throtl_data->service_queue. Those bio's are ready and issued by this 1258 * of throtl_data->service_queue. Those bio's are ready and issued by this
1259 * function. 1259 * function.
1260 */ 1260 */
1261void blk_throtl_dispatch_work_fn(struct work_struct *work) 1261static void blk_throtl_dispatch_work_fn(struct work_struct *work)
1262{ 1262{
1263 struct throtl_data *td = container_of(work, struct throtl_data, 1263 struct throtl_data *td = container_of(work, struct throtl_data,
1264 dispatch_work); 1264 dispatch_work);
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index d96f7061c6fd..95a09590ccfd 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -96,11 +96,7 @@ static void blk_rq_timed_out(struct request *req)
96 __blk_complete_request(req); 96 __blk_complete_request(req);
97 break; 97 break;
98 case BLK_EH_RESET_TIMER: 98 case BLK_EH_RESET_TIMER:
99 if (q->mq_ops) 99 blk_add_timer(req);
100 blk_mq_add_timer(req);
101 else
102 blk_add_timer(req);
103
104 blk_clear_rq_complete(req); 100 blk_clear_rq_complete(req);
105 break; 101 break;
106 case BLK_EH_NOT_HANDLED: 102 case BLK_EH_NOT_HANDLED:
@@ -170,7 +166,26 @@ void blk_abort_request(struct request *req)
170} 166}
171EXPORT_SYMBOL_GPL(blk_abort_request); 167EXPORT_SYMBOL_GPL(blk_abort_request);
172 168
173void __blk_add_timer(struct request *req, struct list_head *timeout_list) 169unsigned long blk_rq_timeout(unsigned long timeout)
170{
171 unsigned long maxt;
172
173 maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
174 if (time_after(timeout, maxt))
175 timeout = maxt;
176
177 return timeout;
178}
179
180/**
181 * blk_add_timer - Start timeout timer for a single request
182 * @req: request that is about to start running.
183 *
184 * Notes:
185 * Each request has its own timer, and as it is added to the queue, we
186 * set up the timer. When the request completes, we cancel the timer.
187 */
188void blk_add_timer(struct request *req)
174{ 189{
175 struct request_queue *q = req->q; 190 struct request_queue *q = req->q;
176 unsigned long expiry; 191 unsigned long expiry;
@@ -188,32 +203,29 @@ void __blk_add_timer(struct request *req, struct list_head *timeout_list)
188 req->timeout = q->rq_timeout; 203 req->timeout = q->rq_timeout;
189 204
190 req->deadline = jiffies + req->timeout; 205 req->deadline = jiffies + req->timeout;
191 if (timeout_list) 206 if (!q->mq_ops)
192 list_add_tail(&req->timeout_list, timeout_list); 207 list_add_tail(&req->timeout_list, &req->q->timeout_list);
193 208
194 /* 209 /*
195 * If the timer isn't already pending or this timeout is earlier 210 * If the timer isn't already pending or this timeout is earlier
196 * than an existing one, modify the timer. Round up to next nearest 211 * than an existing one, modify the timer. Round up to next nearest
197 * second. 212 * second.
198 */ 213 */
199 expiry = round_jiffies_up(req->deadline); 214 expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
200 215
201 if (!timer_pending(&q->timeout) || 216 if (!timer_pending(&q->timeout) ||
202 time_before(expiry, q->timeout.expires)) 217 time_before(expiry, q->timeout.expires)) {
203 mod_timer(&q->timeout, expiry); 218 unsigned long diff = q->timeout.expires - expiry;
204 219
205} 220 /*
221 * Due to added timer slack to group timers, the timer
222 * will often be a little in front of what we asked for.
223 * So apply some tolerance here too, otherwise we keep
224 * modifying the timer because expires for value X
225 * will be X + something.
226 */
227 if (!timer_pending(&q->timeout) || (diff >= HZ / 2))
228 mod_timer(&q->timeout, expiry);
229 }
206 230
207/**
208 * blk_add_timer - Start timeout timer for a single request
209 * @req: request that is about to start running.
210 *
211 * Notes:
212 * Each request has its own timer, and as it is added to the queue, we
213 * set up the timer. When the request completes, we cancel the timer.
214 */
215void blk_add_timer(struct request *req)
216{
217 __blk_add_timer(req, &req->q->timeout_list);
218} 231}
219
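
Two rules govern the queue-wide timeout timer after this change: blk_rq_timeout() caps how far into the future it may be armed (BLK_MAX_TIMEOUT, 5 * HZ, added to block/blk.h in the next hunk), and an already-pending timer is only moved earlier when the gain is at least HZ/2, to tolerate the slack that round_jiffies_up() introduces. A condensed, kernel-context restatement; the helper name is invented, the real logic sits in blk_add_timer() above.

#include <linux/jiffies.h>
#include <linux/timer.h>
#include "blk.h"	/* for blk_rq_timeout() */

static unsigned long pick_queue_timer_expiry(struct timer_list *queue_timer,
					     unsigned long req_deadline)
{
	unsigned long expiry = blk_rq_timeout(round_jiffies_up(req_deadline));

	if (!timer_pending(queue_timer))
		return expiry;
	/* only re-arm an already-pending timer if it wins at least HZ/2 */
	if (time_before(expiry, queue_timer->expires) &&
	    queue_timer->expires - expiry >= HZ / 2)
		return expiry;
	return queue_timer->expires;
}
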
diff --git a/block/blk.h b/block/blk.h
index 1d880f1f957f..45385e9abf6f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -9,6 +9,9 @@
9/* Number of requests a "batching" process may submit */ 9/* Number of requests a "batching" process may submit */
10#define BLK_BATCH_REQ 32 10#define BLK_BATCH_REQ 32
11 11
12/* Max future timer expiry for timeouts */
13#define BLK_MAX_TIMEOUT (5 * HZ)
14
12extern struct kmem_cache *blk_requestq_cachep; 15extern struct kmem_cache *blk_requestq_cachep;
13extern struct kmem_cache *request_cachep; 16extern struct kmem_cache *request_cachep;
14extern struct kobj_type blk_queue_ktype; 17extern struct kobj_type blk_queue_ktype;
@@ -37,9 +40,9 @@ bool __blk_end_bidi_request(struct request *rq, int error,
37void blk_rq_timed_out_timer(unsigned long data); 40void blk_rq_timed_out_timer(unsigned long data);
38void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, 41void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
39 unsigned int *next_set); 42 unsigned int *next_set);
40void __blk_add_timer(struct request *req, struct list_head *timeout_list); 43unsigned long blk_rq_timeout(unsigned long timeout);
44void blk_add_timer(struct request *req);
41void blk_delete_timer(struct request *); 45void blk_delete_timer(struct request *);
42void blk_add_timer(struct request *);
43 46
44 47
45bool bio_attempt_front_merge(struct request_queue *q, struct request *req, 48bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
@@ -185,6 +188,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
185 return q->nr_congestion_off; 188 return q->nr_congestion_off;
186} 189}
187 190
191extern int blk_update_nr_requests(struct request_queue *, unsigned int);
192
188/* 193/*
189 * Contribute to IO statistics IFF: 194 * Contribute to IO statistics IFF:
190 * 195 *
diff --git a/mm/bounce.c b/block/bounce.c
index 523918b8c6dc..523918b8c6dc 100644
--- a/mm/bounce.c
+++ b/block/bounce.c
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e0985f1955e7..22dffebc7c73 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -908,7 +908,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
908{ 908{
909 if (cfqd->busy_queues) { 909 if (cfqd->busy_queues) {
910 cfq_log(cfqd, "schedule dispatch"); 910 cfq_log(cfqd, "schedule dispatch");
911 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); 911 kblockd_schedule_work(&cfqd->unplug_work);
912 } 912 }
913} 913}
914 914
@@ -4460,7 +4460,7 @@ out_free:
4460static ssize_t 4460static ssize_t
4461cfq_var_show(unsigned int var, char *page) 4461cfq_var_show(unsigned int var, char *page)
4462{ 4462{
4463 return sprintf(page, "%d\n", var); 4463 return sprintf(page, "%u\n", var);
4464} 4464}
4465 4465
4466static ssize_t 4466static ssize_t
diff --git a/fs/ioprio.c b/block/ioprio.c
index e50170ca7c33..e50170ca7c33 100644
--- a/fs/ioprio.c
+++ b/block/ioprio.c
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 26487972ac54..9c28a5b38042 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -205,10 +205,6 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)
205 if (capable(CAP_SYS_RAWIO)) 205 if (capable(CAP_SYS_RAWIO))
206 return 0; 206 return 0;
207 207
208 /* if there's no filter set, assume we're filtering everything out */
209 if (!filter)
210 return -EPERM;
211
212 /* Anybody who can open the device can do a read-safe command */ 208 /* Anybody who can open the device can do a read-safe command */
213 if (test_bit(cmd[0], filter->read_ok)) 209 if (test_bit(cmd[0], filter->read_ok))
214 return 0; 210 return 0;
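
Every driver hunk from here on makes the same substitution: with the request 'buffer' field going away, these legacy one-segment-at-a-time drivers take the kernel-virtual address of the current chunk from bio_data(rq->bio) and re-derive it after each partial completion (their queues bounce highmem pages, so the address is directly usable). Reduced to a sketch, with mydrv_* as placeholder names:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* placeholder for the hardware-specific PIO/DMA copy of one chunk */
static void mydrv_transfer(void *buf, sector_t pos, unsigned int sectors)
{
}

/* the __blk_end_request helpers expect the queue lock to be held */
static void mydrv_do_request(struct request *rq)
{
	bool more = true;

	while (more) {
		void *buf = bio_data(rq->bio);		/* replaces rq->buffer */
		unsigned int sectors = blk_rq_cur_sectors(rq);

		mydrv_transfer(buf, blk_rq_pos(rq), sectors);
		/* completes the current chunk and advances rq->bio */
		more = __blk_end_request_cur(rq, 0);
	}
}
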
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 748dea4f34dc..758da2287d9a 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1406,7 +1406,7 @@ next_segment:
1406 1406
1407 track = block / (floppy->dtype->sects * floppy->type->sect_mult); 1407 track = block / (floppy->dtype->sects * floppy->type->sect_mult);
1408 sector = block % (floppy->dtype->sects * floppy->type->sect_mult); 1408 sector = block % (floppy->dtype->sects * floppy->type->sect_mult);
1409 data = rq->buffer + 512 * cnt; 1409 data = bio_data(rq->bio) + 512 * cnt;
1410#ifdef DEBUG 1410#ifdef DEBUG
1411 printk("access to track %d, sector %d, with buffer at " 1411 printk("access to track %d, sector %d, with buffer at "
1412 "0x%08lx\n", track, sector, data); 1412 "0x%08lx\n", track, sector, data);
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index cfa64bdf01c9..2104b1b4ccda 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1484,7 +1484,7 @@ repeat:
1484 ReqCnt = 0; 1484 ReqCnt = 0;
1485 ReqCmd = rq_data_dir(fd_request); 1485 ReqCmd = rq_data_dir(fd_request);
1486 ReqBlock = blk_rq_pos(fd_request); 1486 ReqBlock = blk_rq_pos(fd_request);
1487 ReqBuffer = fd_request->buffer; 1487 ReqBuffer = bio_data(fd_request->bio);
1488 setup_req_params( drive ); 1488 setup_req_params( drive );
1489 do_fd_action( drive ); 1489 do_fd_action( drive );
1490 1490
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index fa9bb742df6e..dc3a41c82b38 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2351,7 +2351,7 @@ static void rw_interrupt(void)
2351 } 2351 }
2352 2352
2353 if (CT(COMMAND) != FD_READ || 2353 if (CT(COMMAND) != FD_READ ||
2354 raw_cmd->kernel_data == current_req->buffer) { 2354 raw_cmd->kernel_data == bio_data(current_req->bio)) {
2355 /* transfer directly from buffer */ 2355 /* transfer directly from buffer */
2356 cont->done(1); 2356 cont->done(1);
2357 } else if (CT(COMMAND) == FD_READ) { 2357 } else if (CT(COMMAND) == FD_READ) {
@@ -2640,7 +2640,7 @@ static int make_raw_rw_request(void)
2640 raw_cmd->flags &= ~FD_RAW_WRITE; 2640 raw_cmd->flags &= ~FD_RAW_WRITE;
2641 raw_cmd->flags |= FD_RAW_READ; 2641 raw_cmd->flags |= FD_RAW_READ;
2642 COMMAND = FM_MODE(_floppy, FD_READ); 2642 COMMAND = FM_MODE(_floppy, FD_READ);
2643 } else if ((unsigned long)current_req->buffer < MAX_DMA_ADDRESS) { 2643 } else if ((unsigned long)bio_data(current_req->bio) < MAX_DMA_ADDRESS) {
2644 unsigned long dma_limit; 2644 unsigned long dma_limit;
2645 int direct, indirect; 2645 int direct, indirect;
2646 2646
@@ -2654,13 +2654,13 @@ static int make_raw_rw_request(void)
2654 */ 2654 */
2655 max_size = buffer_chain_size(); 2655 max_size = buffer_chain_size();
2656 dma_limit = (MAX_DMA_ADDRESS - 2656 dma_limit = (MAX_DMA_ADDRESS -
2657 ((unsigned long)current_req->buffer)) >> 9; 2657 ((unsigned long)bio_data(current_req->bio))) >> 9;
2658 if ((unsigned long)max_size > dma_limit) 2658 if ((unsigned long)max_size > dma_limit)
2659 max_size = dma_limit; 2659 max_size = dma_limit;
2660 /* 64 kb boundaries */ 2660 /* 64 kb boundaries */
2661 if (CROSS_64KB(current_req->buffer, max_size << 9)) 2661 if (CROSS_64KB(bio_data(current_req->bio), max_size << 9))
2662 max_size = (K_64 - 2662 max_size = (K_64 -
2663 ((unsigned long)current_req->buffer) % 2663 ((unsigned long)bio_data(current_req->bio)) %
2664 K_64) >> 9; 2664 K_64) >> 9;
2665 direct = transfer_size(ssize, max_sector, max_size) - fsector_t; 2665 direct = transfer_size(ssize, max_sector, max_size) - fsector_t;
2666 /* 2666 /*
@@ -2677,7 +2677,7 @@ static int make_raw_rw_request(void)
2677 (DP->read_track & (1 << DRS->probed_format)))))) { 2677 (DP->read_track & (1 << DRS->probed_format)))))) {
2678 max_size = blk_rq_sectors(current_req); 2678 max_size = blk_rq_sectors(current_req);
2679 } else { 2679 } else {
2680 raw_cmd->kernel_data = current_req->buffer; 2680 raw_cmd->kernel_data = bio_data(current_req->bio);
2681 raw_cmd->length = current_count_sectors << 9; 2681 raw_cmd->length = current_count_sectors << 9;
2682 if (raw_cmd->length == 0) { 2682 if (raw_cmd->length == 0) {
2683 DPRINT("%s: zero dma transfer attempted\n", __func__); 2683 DPRINT("%s: zero dma transfer attempted\n", __func__);
@@ -2731,7 +2731,7 @@ static int make_raw_rw_request(void)
2731 raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1; 2731 raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1;
2732 raw_cmd->length <<= 9; 2732 raw_cmd->length <<= 9;
2733 if ((raw_cmd->length < current_count_sectors << 9) || 2733 if ((raw_cmd->length < current_count_sectors << 9) ||
2734 (raw_cmd->kernel_data != current_req->buffer && 2734 (raw_cmd->kernel_data != bio_data(current_req->bio) &&
2735 CT(COMMAND) == FD_WRITE && 2735 CT(COMMAND) == FD_WRITE &&
2736 (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max || 2736 (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max ||
2737 aligned_sector_t < buffer_min)) || 2737 aligned_sector_t < buffer_min)) ||
@@ -2739,7 +2739,7 @@ static int make_raw_rw_request(void)
2739 raw_cmd->length <= 0 || current_count_sectors <= 0) { 2739 raw_cmd->length <= 0 || current_count_sectors <= 0) {
2740 DPRINT("fractionary current count b=%lx s=%lx\n", 2740 DPRINT("fractionary current count b=%lx s=%lx\n",
2741 raw_cmd->length, current_count_sectors); 2741 raw_cmd->length, current_count_sectors);
2742 if (raw_cmd->kernel_data != current_req->buffer) 2742 if (raw_cmd->kernel_data != bio_data(current_req->bio))
2743 pr_info("addr=%d, length=%ld\n", 2743 pr_info("addr=%d, length=%ld\n",
2744 (int)((raw_cmd->kernel_data - 2744 (int)((raw_cmd->kernel_data -
2745 floppy_track_buffer) >> 9), 2745 floppy_track_buffer) >> 9),
@@ -2756,7 +2756,7 @@ static int make_raw_rw_request(void)
2756 return 0; 2756 return 0;
2757 } 2757 }
2758 2758
2759 if (raw_cmd->kernel_data != current_req->buffer) { 2759 if (raw_cmd->kernel_data != bio_data(current_req->bio)) {
2760 if (raw_cmd->kernel_data < floppy_track_buffer || 2760 if (raw_cmd->kernel_data < floppy_track_buffer ||
2761 current_count_sectors < 0 || 2761 current_count_sectors < 0 ||
2762 raw_cmd->length < 0 || 2762 raw_cmd->length < 0 ||
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index bf397bf108b7..8a290c08262f 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -464,11 +464,11 @@ static void read_intr(void)
464 464
465ok_to_read: 465ok_to_read:
466 req = hd_req; 466 req = hd_req;
467 insw(HD_DATA, req->buffer, 256); 467 insw(HD_DATA, bio_data(req->bio), 256);
468#ifdef DEBUG 468#ifdef DEBUG
469 printk("%s: read: sector %ld, remaining = %u, buffer=%p\n", 469 printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
470 req->rq_disk->disk_name, blk_rq_pos(req) + 1, 470 req->rq_disk->disk_name, blk_rq_pos(req) + 1,
471 blk_rq_sectors(req) - 1, req->buffer+512); 471 blk_rq_sectors(req) - 1, bio_data(req->bio)+512);
472#endif 472#endif
473 if (hd_end_request(0, 512)) { 473 if (hd_end_request(0, 512)) {
474 SET_HANDLER(&read_intr); 474 SET_HANDLER(&read_intr);
@@ -505,7 +505,7 @@ static void write_intr(void)
505ok_to_write: 505ok_to_write:
506 if (hd_end_request(0, 512)) { 506 if (hd_end_request(0, 512)) {
507 SET_HANDLER(&write_intr); 507 SET_HANDLER(&write_intr);
508 outsw(HD_DATA, req->buffer, 256); 508 outsw(HD_DATA, bio_data(req->bio), 256);
509 return; 509 return;
510 } 510 }
511 511
@@ -624,7 +624,7 @@ repeat:
624 printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n", 624 printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n",
625 req->rq_disk->disk_name, 625 req->rq_disk->disk_name,
626 req_data_dir(req) == READ ? "read" : "writ", 626 req_data_dir(req) == READ ? "read" : "writ",
627 cyl, head, sec, nsect, req->buffer); 627 cyl, head, sec, nsect, bio_data(req->bio));
628#endif 628#endif
629 if (req->cmd_type == REQ_TYPE_FS) { 629 if (req->cmd_type == REQ_TYPE_FS) {
630 switch (rq_data_dir(req)) { 630 switch (rq_data_dir(req)) {
@@ -643,7 +643,7 @@ repeat:
643 bad_rw_intr(); 643 bad_rw_intr();
644 goto repeat; 644 goto repeat;
645 } 645 }
646 outsw(HD_DATA, req->buffer, 256); 646 outsw(HD_DATA, bio_data(req->bio), 256);
647 break; 647 break;
648 default: 648 default:
649 printk("unknown hd-command\n"); 649 printk("unknown hd-command\n");
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index eb59b1241366..e352cac707e8 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -479,7 +479,7 @@ static unsigned int mg_out(struct mg_host *host,
479 479
480static void mg_read_one(struct mg_host *host, struct request *req) 480static void mg_read_one(struct mg_host *host, struct request *req)
481{ 481{
482 u16 *buff = (u16 *)req->buffer; 482 u16 *buff = (u16 *)bio_data(req->bio);
483 u32 i; 483 u32 i;
484 484
485 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) 485 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
@@ -496,7 +496,7 @@ static void mg_read(struct request *req)
496 mg_bad_rw_intr(host); 496 mg_bad_rw_intr(host);
497 497
498 MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", 498 MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
499 blk_rq_sectors(req), blk_rq_pos(req), req->buffer); 499 blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio));
500 500
501 do { 501 do {
502 if (mg_wait(host, ATA_DRQ, 502 if (mg_wait(host, ATA_DRQ,
@@ -514,7 +514,7 @@ static void mg_read(struct request *req)
514 514
515static void mg_write_one(struct mg_host *host, struct request *req) 515static void mg_write_one(struct mg_host *host, struct request *req)
516{ 516{
517 u16 *buff = (u16 *)req->buffer; 517 u16 *buff = (u16 *)bio_data(req->bio);
518 u32 i; 518 u32 i;
519 519
520 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) 520 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
@@ -534,7 +534,7 @@ static void mg_write(struct request *req)
534 } 534 }
535 535
536 MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", 536 MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
537 rem, blk_rq_pos(req), req->buffer); 537 rem, blk_rq_pos(req), bio_data(req->bio));
538 538
539 if (mg_wait(host, ATA_DRQ, 539 if (mg_wait(host, ATA_DRQ,
540 MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { 540 MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
@@ -585,7 +585,7 @@ ok_to_read:
585 mg_read_one(host, req); 585 mg_read_one(host, req);
586 586
587 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", 587 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
588 blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer); 588 blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio));
589 589
590 /* send read confirm */ 590 /* send read confirm */
591 outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); 591 outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
@@ -624,7 +624,7 @@ ok_to_write:
624 /* write 1 sector and set handler if remains */ 624 /* write 1 sector and set handler if remains */
625 mg_write_one(host, req); 625 mg_write_one(host, req);
626 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", 626 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
627 blk_rq_pos(req), blk_rq_sectors(req), req->buffer); 627 blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio));
628 host->mg_do_intr = mg_write_intr; 628 host->mg_do_intr = mg_write_intr;
629 mod_timer(&host->timer, jiffies + 3 * HZ); 629 mod_timer(&host->timer, jiffies + 3 * HZ);
630 } 630 }
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 091b9ea14feb..b40af63a5476 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -32,6 +32,7 @@ struct nullb {
32 unsigned int index; 32 unsigned int index;
33 struct request_queue *q; 33 struct request_queue *q;
34 struct gendisk *disk; 34 struct gendisk *disk;
35 struct blk_mq_tag_set tag_set;
35 struct hrtimer timer; 36 struct hrtimer timer;
36 unsigned int queue_depth; 37 unsigned int queue_depth;
37 spinlock_t lock; 38 spinlock_t lock;
@@ -226,7 +227,7 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
226 227
227static void null_softirq_done_fn(struct request *rq) 228static void null_softirq_done_fn(struct request *rq)
228{ 229{
229 end_cmd(rq->special); 230 end_cmd(blk_mq_rq_to_pdu(rq));
230} 231}
231 232
232static inline void null_handle_cmd(struct nullb_cmd *cmd) 233static inline void null_handle_cmd(struct nullb_cmd *cmd)
@@ -311,7 +312,7 @@ static void null_request_fn(struct request_queue *q)
311 312
312static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) 313static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
313{ 314{
314 struct nullb_cmd *cmd = rq->special; 315 struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
315 316
316 cmd->rq = rq; 317 cmd->rq = rq;
317 cmd->nq = hctx->driver_data; 318 cmd->nq = hctx->driver_data;
@@ -320,46 +321,6 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
320 return BLK_MQ_RQ_QUEUE_OK; 321 return BLK_MQ_RQ_QUEUE_OK;
321} 322}
322 323
323static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index)
324{
325 int b_size = DIV_ROUND_UP(reg->nr_hw_queues, nr_online_nodes);
326 int tip = (reg->nr_hw_queues % nr_online_nodes);
327 int node = 0, i, n;
328
329 /*
330 * Split submit queues evenly wrt to the number of nodes. If uneven,
331 * fill the first buckets with one extra, until the rest is filled with
332 * no extra.
333 */
334 for (i = 0, n = 1; i < hctx_index; i++, n++) {
335 if (n % b_size == 0) {
336 n = 0;
337 node++;
338
339 tip--;
340 if (!tip)
341 b_size = reg->nr_hw_queues / nr_online_nodes;
342 }
343 }
344
345 /*
346 * A node might not be online, therefore map the relative node id to the
347 * real node id.
348 */
349 for_each_online_node(n) {
350 if (!node)
351 break;
352 node--;
353 }
354
355 return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, n);
356}
357
358static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index)
359{
360 kfree(hctx);
361}
362
363static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) 324static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
364{ 325{
365 BUG_ON(!nullb); 326 BUG_ON(!nullb);
@@ -389,19 +350,14 @@ static struct blk_mq_ops null_mq_ops = {
389 .complete = null_softirq_done_fn, 350 .complete = null_softirq_done_fn,
390}; 351};
391 352
392static struct blk_mq_reg null_mq_reg = {
393 .ops = &null_mq_ops,
394 .queue_depth = 64,
395 .cmd_size = sizeof(struct nullb_cmd),
396 .flags = BLK_MQ_F_SHOULD_MERGE,
397};
398
399static void null_del_dev(struct nullb *nullb) 353static void null_del_dev(struct nullb *nullb)
400{ 354{
401 list_del_init(&nullb->list); 355 list_del_init(&nullb->list);
402 356
403 del_gendisk(nullb->disk); 357 del_gendisk(nullb->disk);
404 blk_cleanup_queue(nullb->q); 358 blk_cleanup_queue(nullb->q);
359 if (queue_mode == NULL_Q_MQ)
360 blk_mq_free_tag_set(&nullb->tag_set);
405 put_disk(nullb->disk); 361 put_disk(nullb->disk);
406 kfree(nullb); 362 kfree(nullb);
407} 363}
@@ -506,7 +462,7 @@ static int null_add_dev(void)
506 462
507 nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); 463 nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
508 if (!nullb) 464 if (!nullb)
509 return -ENOMEM; 465 goto out;
510 466
511 spin_lock_init(&nullb->lock); 467 spin_lock_init(&nullb->lock);
512 468
@@ -514,49 +470,44 @@ static int null_add_dev(void)
514 submit_queues = nr_online_nodes; 470 submit_queues = nr_online_nodes;
515 471
516 if (setup_queues(nullb)) 472 if (setup_queues(nullb))
517 goto err; 473 goto out_free_nullb;
518 474
519 if (queue_mode == NULL_Q_MQ) { 475 if (queue_mode == NULL_Q_MQ) {
520 null_mq_reg.numa_node = home_node; 476 nullb->tag_set.ops = &null_mq_ops;
521 null_mq_reg.queue_depth = hw_queue_depth; 477 nullb->tag_set.nr_hw_queues = submit_queues;
522 null_mq_reg.nr_hw_queues = submit_queues; 478 nullb->tag_set.queue_depth = hw_queue_depth;
523 479 nullb->tag_set.numa_node = home_node;
524 if (use_per_node_hctx) { 480 nullb->tag_set.cmd_size = sizeof(struct nullb_cmd);
525 null_mq_reg.ops->alloc_hctx = null_alloc_hctx; 481 nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
526 null_mq_reg.ops->free_hctx = null_free_hctx; 482 nullb->tag_set.driver_data = nullb;
527 } else { 483
528 null_mq_reg.ops->alloc_hctx = blk_mq_alloc_single_hw_queue; 484 if (blk_mq_alloc_tag_set(&nullb->tag_set))
529 null_mq_reg.ops->free_hctx = blk_mq_free_single_hw_queue; 485 goto out_cleanup_queues;
530 } 486
531 487 nullb->q = blk_mq_init_queue(&nullb->tag_set);
532 nullb->q = blk_mq_init_queue(&null_mq_reg, nullb); 488 if (!nullb->q)
489 goto out_cleanup_tags;
533 } else if (queue_mode == NULL_Q_BIO) { 490 } else if (queue_mode == NULL_Q_BIO) {
534 nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node); 491 nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
492 if (!nullb->q)
493 goto out_cleanup_queues;
535 blk_queue_make_request(nullb->q, null_queue_bio); 494 blk_queue_make_request(nullb->q, null_queue_bio);
536 init_driver_queues(nullb); 495 init_driver_queues(nullb);
537 } else { 496 } else {
538 nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node); 497 nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
498 if (!nullb->q)
499 goto out_cleanup_queues;
539 blk_queue_prep_rq(nullb->q, null_rq_prep_fn); 500 blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
540 if (nullb->q) 501 blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
541 blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
542 init_driver_queues(nullb); 502 init_driver_queues(nullb);
543 } 503 }
544 504
545 if (!nullb->q)
546 goto queue_fail;
547
548 nullb->q->queuedata = nullb; 505 nullb->q->queuedata = nullb;
549 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); 506 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
550 507
551 disk = nullb->disk = alloc_disk_node(1, home_node); 508 disk = nullb->disk = alloc_disk_node(1, home_node);
552 if (!disk) { 509 if (!disk)
553queue_fail: 510 goto out_cleanup_blk_queue;
554 blk_cleanup_queue(nullb->q);
555 cleanup_queues(nullb);
556err:
557 kfree(nullb);
558 return -ENOMEM;
559 }
560 511
561 mutex_lock(&lock); 512 mutex_lock(&lock);
562 list_add_tail(&nullb->list, &nullb_list); 513 list_add_tail(&nullb->list, &nullb_list);
@@ -579,6 +530,18 @@ err:
579 sprintf(disk->disk_name, "nullb%d", nullb->index); 530 sprintf(disk->disk_name, "nullb%d", nullb->index);
580 add_disk(disk); 531 add_disk(disk);
581 return 0; 532 return 0;
533
534out_cleanup_blk_queue:
535 blk_cleanup_queue(nullb->q);
536out_cleanup_tags:
537 if (queue_mode == NULL_Q_MQ)
538 blk_mq_free_tag_set(&nullb->tag_set);
539out_cleanup_queues:
540 cleanup_queues(nullb);
541out_free_nullb:
542 kfree(nullb);
543out:
544 return -ENOMEM;
582} 545}
583 546
584static int __init null_init(void) 547static int __init null_init(void)
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index e76bdc074dbe..719cb1bc1640 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -747,7 +747,7 @@ static void do_pcd_request(struct request_queue * q)
747 pcd_current = cd; 747 pcd_current = cd;
748 pcd_sector = blk_rq_pos(pcd_req); 748 pcd_sector = blk_rq_pos(pcd_req);
749 pcd_count = blk_rq_cur_sectors(pcd_req); 749 pcd_count = blk_rq_cur_sectors(pcd_req);
750 pcd_buf = pcd_req->buffer; 750 pcd_buf = bio_data(pcd_req->bio);
751 pcd_busy = 1; 751 pcd_busy = 1;
752 ps_set_intr(do_pcd_read, NULL, 0, nice); 752 ps_set_intr(do_pcd_read, NULL, 0, nice);
753 return; 753 return;
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 19ad8f0c83ef..fea7e76a00de 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -454,7 +454,7 @@ static enum action do_pd_io_start(void)
454 if (pd_block + pd_count > get_capacity(pd_req->rq_disk)) 454 if (pd_block + pd_count > get_capacity(pd_req->rq_disk))
455 return Fail; 455 return Fail;
456 pd_run = blk_rq_sectors(pd_req); 456 pd_run = blk_rq_sectors(pd_req);
457 pd_buf = pd_req->buffer; 457 pd_buf = bio_data(pd_req->bio);
458 pd_retries = 0; 458 pd_retries = 0;
459 if (pd_cmd == READ) 459 if (pd_cmd == READ)
460 return do_pd_read_start(); 460 return do_pd_read_start();
@@ -485,7 +485,7 @@ static int pd_next_buf(void)
485 spin_lock_irqsave(&pd_lock, saved_flags); 485 spin_lock_irqsave(&pd_lock, saved_flags);
486 __blk_end_request_cur(pd_req, 0); 486 __blk_end_request_cur(pd_req, 0);
487 pd_count = blk_rq_cur_sectors(pd_req); 487 pd_count = blk_rq_cur_sectors(pd_req);
488 pd_buf = pd_req->buffer; 488 pd_buf = bio_data(pd_req->bio);
489 spin_unlock_irqrestore(&pd_lock, saved_flags); 489 spin_unlock_irqrestore(&pd_lock, saved_flags);
490 return 0; 490 return 0;
491} 491}
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index f5c86d523ba0..9a15fd3c9349 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -795,7 +795,7 @@ repeat:
795 } 795 }
796 796
797 pf_cmd = rq_data_dir(pf_req); 797 pf_cmd = rq_data_dir(pf_req);
798 pf_buf = pf_req->buffer; 798 pf_buf = bio_data(pf_req->bio);
799 pf_retries = 0; 799 pf_retries = 0;
800 800
801 pf_busy = 1; 801 pf_busy = 1;
@@ -827,7 +827,7 @@ static int pf_next_buf(void)
827 if (!pf_req) 827 if (!pf_req)
828 return 1; 828 return 1;
829 pf_count = blk_rq_cur_sectors(pf_req); 829 pf_count = blk_rq_cur_sectors(pf_req);
830 pf_buf = pf_req->buffer; 830 pf_buf = bio_data(pf_req->bio);
831 } 831 }
832 return 0; 832 return 0;
833} 833}
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index a69dd93d1bd5..c48d9084c965 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -563,7 +563,6 @@ skd_prep_discard_cdb(struct skd_scsi_request *scsi_req,
563 563
564 req = skreq->req; 564 req = skreq->req;
565 blk_add_request_payload(req, page, len); 565 blk_add_request_payload(req, page, len);
566 req->buffer = buf;
567} 566}
568 567
569static void skd_request_fn_not_online(struct request_queue *q); 568static void skd_request_fn_not_online(struct request_queue *q);
@@ -744,6 +743,7 @@ static void skd_request_fn(struct request_queue *q)
744 break; 743 break;
745 } 744 }
746 skreq->discard_page = 1; 745 skreq->discard_page = 1;
746 req->completion_data = page;
747 skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); 747 skd_prep_discard_cdb(scsi_req, skreq, page, lba, count);
748 748
749 } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { 749 } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) {
@@ -858,8 +858,7 @@ static void skd_end_request(struct skd_device *skdev,
858 (skreq->discard_page == 1)) { 858 (skreq->discard_page == 1)) {
859 pr_debug("%s:%s:%d, free the page!", 859 pr_debug("%s:%s:%d, free the page!",
860 skdev->name, __func__, __LINE__); 860 skdev->name, __func__, __LINE__);
861 free_page((unsigned long)req->buffer); 861 __free_page(req->completion_data);
862 req->buffer = NULL;
863 } 862 }
864 863
865 if (unlikely(error)) { 864 if (unlikely(error)) {
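
skd keeps the discard payload page in req->completion_data now that req->buffer is gone, and frees it with __free_page() on the struct page instead of free_page() on a kernel virtual address; the sd driver further down makes the same switch. A minimal sketch of the hand-off, with hypothetical my_prep_discard()/my_end_discard() helpers:

/*
 * Minimal sketch of the discard-payload hand-off; my_prep_discard() and
 * my_end_discard() are hypothetical, the field usage mirrors skd and sd.
 */
#include <linux/blkdev.h>
#include <linux/errno.h>
#include <linux/gfp.h>

static int my_prep_discard(struct request *req, unsigned int payload_len)
{
	struct page *page = alloc_page(GFP_ATOMIC | __GFP_ZERO);

	if (!page)
		return -ENOMEM;

	req->completion_data = page;		/* carried to completion */
	blk_add_request_payload(req, page, payload_len);
	return 0;
}

static void my_end_discard(struct request *req)
{
	/* was: free_page((unsigned long)req->buffer); req->buffer = NULL; */
	__free_page(req->completion_data);
}
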
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index b02d53a399f3..6b44bbe528b7 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -549,7 +549,7 @@ static void redo_fd_request(struct request_queue *q)
549 case READ: 549 case READ:
550 err = floppy_read_sectors(fs, blk_rq_pos(req), 550 err = floppy_read_sectors(fs, blk_rq_pos(req),
551 blk_rq_cur_sectors(req), 551 blk_rq_cur_sectors(req),
552 req->buffer); 552 bio_data(req->bio));
553 break; 553 break;
554 } 554 }
555 done: 555 done:
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index c74f7b56e7c4..523ee8fd4c15 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -342,7 +342,7 @@ static void start_request(struct floppy_state *fs)
342 swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n", 342 swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
343 req->rq_disk->disk_name, req->cmd, 343 req->rq_disk->disk_name, req->cmd,
344 (long)blk_rq_pos(req), blk_rq_sectors(req), 344 (long)blk_rq_pos(req), blk_rq_sectors(req),
345 req->buffer); 345 bio_data(req->bio));
346 swim3_dbg(" errors=%d current_nr_sectors=%u\n", 346 swim3_dbg(" errors=%d current_nr_sectors=%u\n",
347 req->errors, blk_rq_cur_sectors(req)); 347 req->errors, blk_rq_cur_sectors(req));
348#endif 348#endif
@@ -479,11 +479,11 @@ static inline void setup_transfer(struct floppy_state *fs)
479 /* Set up 3 dma commands: write preamble, data, postamble */ 479 /* Set up 3 dma commands: write preamble, data, postamble */
480 init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble)); 480 init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble));
481 ++cp; 481 ++cp;
482 init_dma(cp, OUTPUT_MORE, req->buffer, 512); 482 init_dma(cp, OUTPUT_MORE, bio_data(req->bio), 512);
483 ++cp; 483 ++cp;
484 init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble)); 484 init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble));
485 } else { 485 } else {
486 init_dma(cp, INPUT_LAST, req->buffer, n * 512); 486 init_dma(cp, INPUT_LAST, bio_data(req->bio), n * 512);
487 } 487 }
488 ++cp; 488 ++cp;
489 out_le16(&cp->command, DBDMA_STOP); 489 out_le16(&cp->command, DBDMA_STOP);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index cb9b1f8326c3..c8f286e8d80f 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -30,6 +30,9 @@ struct virtio_blk
30 /* The disk structure for the kernel. */ 30 /* The disk structure for the kernel. */
31 struct gendisk *disk; 31 struct gendisk *disk;
32 32
33 /* Block layer tags. */
34 struct blk_mq_tag_set tag_set;
35
33 /* Process context for config space updates */ 36 /* Process context for config space updates */
34 struct work_struct config_work; 37 struct work_struct config_work;
35 38
@@ -112,7 +115,7 @@ static int __virtblk_add_req(struct virtqueue *vq,
112 115
113static inline void virtblk_request_done(struct request *req) 116static inline void virtblk_request_done(struct request *req)
114{ 117{
115 struct virtblk_req *vbr = req->special; 118 struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
116 int error = virtblk_result(vbr); 119 int error = virtblk_result(vbr);
117 120
118 if (req->cmd_type == REQ_TYPE_BLOCK_PC) { 121 if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
@@ -147,14 +150,14 @@ static void virtblk_done(struct virtqueue *vq)
147 150
148 /* In case queue is stopped waiting for more buffers. */ 151 /* In case queue is stopped waiting for more buffers. */
149 if (req_done) 152 if (req_done)
150 blk_mq_start_stopped_hw_queues(vblk->disk->queue); 153 blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
151 spin_unlock_irqrestore(&vblk->vq_lock, flags); 154 spin_unlock_irqrestore(&vblk->vq_lock, flags);
152} 155}
153 156
154static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) 157static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
155{ 158{
156 struct virtio_blk *vblk = hctx->queue->queuedata; 159 struct virtio_blk *vblk = hctx->queue->queuedata;
157 struct virtblk_req *vbr = req->special; 160 struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
158 unsigned long flags; 161 unsigned long flags;
159 unsigned int num; 162 unsigned int num;
160 const bool last = (req->cmd_flags & REQ_END) != 0; 163 const bool last = (req->cmd_flags & REQ_END) != 0;
@@ -480,33 +483,27 @@ static const struct device_attribute dev_attr_cache_type_rw =
480 __ATTR(cache_type, S_IRUGO|S_IWUSR, 483 __ATTR(cache_type, S_IRUGO|S_IWUSR,
481 virtblk_cache_type_show, virtblk_cache_type_store); 484 virtblk_cache_type_show, virtblk_cache_type_store);
482 485
483static struct blk_mq_ops virtio_mq_ops = { 486static int virtblk_init_request(void *data, struct request *rq,
484 .queue_rq = virtio_queue_rq, 487 unsigned int hctx_idx, unsigned int request_idx,
485 .map_queue = blk_mq_map_queue, 488 unsigned int numa_node)
486 .alloc_hctx = blk_mq_alloc_single_hw_queue,
487 .free_hctx = blk_mq_free_single_hw_queue,
488 .complete = virtblk_request_done,
489};
490
491static struct blk_mq_reg virtio_mq_reg = {
492 .ops = &virtio_mq_ops,
493 .nr_hw_queues = 1,
494 .queue_depth = 0, /* Set in virtblk_probe */
495 .numa_node = NUMA_NO_NODE,
496 .flags = BLK_MQ_F_SHOULD_MERGE,
497};
498module_param_named(queue_depth, virtio_mq_reg.queue_depth, uint, 0444);
499
500static int virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx,
501 struct request *rq, unsigned int nr)
502{ 489{
503 struct virtio_blk *vblk = data; 490 struct virtio_blk *vblk = data;
504 struct virtblk_req *vbr = rq->special; 491 struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
505 492
506 sg_init_table(vbr->sg, vblk->sg_elems); 493 sg_init_table(vbr->sg, vblk->sg_elems);
507 return 0; 494 return 0;
508} 495}
509 496
497static struct blk_mq_ops virtio_mq_ops = {
498 .queue_rq = virtio_queue_rq,
499 .map_queue = blk_mq_map_queue,
500 .complete = virtblk_request_done,
501 .init_request = virtblk_init_request,
502};
503
504static unsigned int virtblk_queue_depth;
505module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
506
510static int virtblk_probe(struct virtio_device *vdev) 507static int virtblk_probe(struct virtio_device *vdev)
511{ 508{
512 struct virtio_blk *vblk; 509 struct virtio_blk *vblk;
@@ -561,24 +558,34 @@ static int virtblk_probe(struct virtio_device *vdev)
561 } 558 }
562 559
563 /* Default queue sizing is to fill the ring. */ 560 /* Default queue sizing is to fill the ring. */
564 if (!virtio_mq_reg.queue_depth) { 561 if (!virtblk_queue_depth) {
565 virtio_mq_reg.queue_depth = vblk->vq->num_free; 562 virtblk_queue_depth = vblk->vq->num_free;
566 /* ... but without indirect descs, we use 2 descs per req */ 563 /* ... but without indirect descs, we use 2 descs per req */
567 if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) 564 if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
568 virtio_mq_reg.queue_depth /= 2; 565 virtblk_queue_depth /= 2;
569 } 566 }
570 virtio_mq_reg.cmd_size = 567
568 memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
569 vblk->tag_set.ops = &virtio_mq_ops;
570 vblk->tag_set.nr_hw_queues = 1;
571 vblk->tag_set.queue_depth = virtblk_queue_depth;
572 vblk->tag_set.numa_node = NUMA_NO_NODE;
573 vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
574 vblk->tag_set.cmd_size =
571 sizeof(struct virtblk_req) + 575 sizeof(struct virtblk_req) +
572 sizeof(struct scatterlist) * sg_elems; 576 sizeof(struct scatterlist) * sg_elems;
577 vblk->tag_set.driver_data = vblk;
573 578
574 q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk); 579 err = blk_mq_alloc_tag_set(&vblk->tag_set);
580 if (err)
581 goto out_put_disk;
582
583 q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set);
575 if (!q) { 584 if (!q) {
576 err = -ENOMEM; 585 err = -ENOMEM;
577 goto out_put_disk; 586 goto out_free_tags;
578 } 587 }
579 588
580 blk_mq_init_commands(q, virtblk_init_vbr, vblk);
581
582 q->queuedata = vblk; 589 q->queuedata = vblk;
583 590
584 virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); 591 virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
@@ -679,6 +686,8 @@ static int virtblk_probe(struct virtio_device *vdev)
679out_del_disk: 686out_del_disk:
680 del_gendisk(vblk->disk); 687 del_gendisk(vblk->disk);
681 blk_cleanup_queue(vblk->disk->queue); 688 blk_cleanup_queue(vblk->disk->queue);
689out_free_tags:
690 blk_mq_free_tag_set(&vblk->tag_set);
682out_put_disk: 691out_put_disk:
683 put_disk(vblk->disk); 692 put_disk(vblk->disk);
684out_free_vq: 693out_free_vq:
@@ -705,6 +714,8 @@ static void virtblk_remove(struct virtio_device *vdev)
705 del_gendisk(vblk->disk); 714 del_gendisk(vblk->disk);
706 blk_cleanup_queue(vblk->disk->queue); 715 blk_cleanup_queue(vblk->disk->queue);
707 716
717 blk_mq_free_tag_set(&vblk->tag_set);
718
708 /* Stop all the virtqueues. */ 719 /* Stop all the virtqueues. */
709 vdev->config->reset(vdev); 720 vdev->config->reset(vdev);
710 721
@@ -749,7 +760,7 @@ static int virtblk_restore(struct virtio_device *vdev)
749 vblk->config_enable = true; 760 vblk->config_enable = true;
750 ret = init_vq(vdev->priv); 761 ret = init_vq(vdev->priv);
751 if (!ret) 762 if (!ret)
752 blk_mq_start_stopped_hw_queues(vblk->disk->queue); 763 blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
753 764
754 return ret; 765 return ret;
755} 766}
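
The virtio-blk conversion shows the per-request side of the new interface: driver data lives in the area blk-mq allocates directly behind struct request (sized by tag_set.cmd_size) and is reached with blk_mq_rq_to_pdu() instead of req->special, while one-time setup moves from blk_mq_init_commands() into the .init_request callback. A minimal sketch, where struct mydrv_cmd and MYDRV_SEGS are hypothetical:

/*
 * Minimal sketch; struct mydrv_cmd and MYDRV_SEGS are hypothetical, and
 * tag_set.cmd_size is assumed to be at least sizeof(struct mydrv_cmd) so
 * that blk_mq_rq_to_pdu() points at valid per-request space.
 */
#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>

#define MYDRV_SEGS	8

struct mydrv_cmd {
	struct scatterlist sg[MYDRV_SEGS];
};

static int mydrv_init_request(void *data, struct request *rq,
			      unsigned int hctx_idx, unsigned int request_idx,
			      unsigned int numa_node)
{
	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);	/* was: rq->special */

	sg_init_table(cmd->sg, MYDRV_SEGS);		/* one-time setup */
	return 0;
}

static void mydrv_complete(struct request *rq)
{
	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

	/* ... read completion status out of cmd ... */
	(void)cmd;
	blk_mq_end_io(rq, 0);
}

The other recurring change in this hunk, blk_mq_start_stopped_hw_queues(q, true), is the same call as before with an explicit flag asking for the restart to run asynchronously.
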
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index efe1b4761735..283a30e88287 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -612,10 +612,10 @@ static void do_blkif_request(struct request_queue *rq)
612 } 612 }
613 613
614 pr_debug("do_blk_req %p: cmd %p, sec %lx, " 614 pr_debug("do_blk_req %p: cmd %p, sec %lx, "
615 "(%u/%u) buffer:%p [%s]\n", 615 "(%u/%u) [%s]\n",
616 req, req->cmd, (unsigned long)blk_rq_pos(req), 616 req, req->cmd, (unsigned long)blk_rq_pos(req),
617 blk_rq_cur_sectors(req), blk_rq_sectors(req), 617 blk_rq_cur_sectors(req), blk_rq_sectors(req),
618 req->buffer, rq_data_dir(req) ? "write" : "read"); 618 rq_data_dir(req) ? "write" : "read");
619 619
620 if (blkif_queue_request(req)) { 620 if (blkif_queue_request(req)) {
621 blk_requeue_request(rq, req); 621 blk_requeue_request(rq, req);
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 1393b8871a28..ab3ea62e5dfc 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -661,7 +661,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
661 rq_data_dir(req)); 661 rq_data_dir(req));
662 662
663 ace->req = req; 663 ace->req = req;
664 ace->data_ptr = req->buffer; 664 ace->data_ptr = bio_data(req->bio);
665 ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR; 665 ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR;
666 ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF); 666 ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF);
667 667
@@ -733,7 +733,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
733 * blk_rq_sectors(ace->req), 733 * blk_rq_sectors(ace->req),
734 * blk_rq_cur_sectors(ace->req)); 734 * blk_rq_cur_sectors(ace->req));
735 */ 735 */
736 ace->data_ptr = ace->req->buffer; 736 ace->data_ptr = bio_data(ace->req->bio);
737 ace->data_count = blk_rq_cur_sectors(ace->req) * 16; 737 ace->data_count = blk_rq_cur_sectors(ace->req) * 16;
738 ace_fsm_yieldirq(ace); 738 ace_fsm_yieldirq(ace);
739 break; 739 break;
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 27de5046708a..968f9e52effa 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -87,13 +87,15 @@ static void do_z2_request(struct request_queue *q)
87 while (len) { 87 while (len) {
88 unsigned long addr = start & Z2RAM_CHUNKMASK; 88 unsigned long addr = start & Z2RAM_CHUNKMASK;
89 unsigned long size = Z2RAM_CHUNKSIZE - addr; 89 unsigned long size = Z2RAM_CHUNKSIZE - addr;
90 void *buffer = bio_data(req->bio);
91
90 if (len < size) 92 if (len < size)
91 size = len; 93 size = len;
92 addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ]; 94 addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ];
93 if (rq_data_dir(req) == READ) 95 if (rq_data_dir(req) == READ)
94 memcpy(req->buffer, (char *)addr, size); 96 memcpy(buffer, (char *)addr, size);
95 else 97 else
96 memcpy((char *)addr, req->buffer, size); 98 memcpy((char *)addr, buffer, size);
97 start += size; 99 start += size;
98 len -= size; 100 len -= size;
99 } 101 }
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 51e75ad96422..584bc3126403 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -602,7 +602,7 @@ static void gdrom_readdisk_dma(struct work_struct *work)
602 spin_unlock(&gdrom_lock); 602 spin_unlock(&gdrom_lock);
603 block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET; 603 block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET;
604 block_cnt = blk_rq_sectors(req)/GD_TO_BLK; 604 block_cnt = blk_rq_sectors(req)/GD_TO_BLK;
605 __raw_writel(virt_to_phys(req->buffer), GDROM_DMA_STARTADDR_REG); 605 __raw_writel(virt_to_phys(bio_data(req->bio)), GDROM_DMA_STARTADDR_REG);
606 __raw_writel(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG); 606 __raw_writel(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG);
607 __raw_writel(1, GDROM_DMA_DIRECTION_REG); 607 __raw_writel(1, GDROM_DMA_DIRECTION_REG);
608 __raw_writel(1, GDROM_DMA_ENABLE_REG); 608 __raw_writel(1, GDROM_DMA_ENABLE_REG);
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 102c50d38902..06cea7ff3a7c 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -902,6 +902,7 @@ void add_disk_randomness(struct gendisk *disk)
902 add_timer_randomness(disk->random, 0x100 + disk_devt(disk)); 902 add_timer_randomness(disk->random, 0x100 + disk_devt(disk));
903 trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool)); 903 trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool));
904} 904}
905EXPORT_SYMBOL_GPL(add_disk_randomness);
905#endif 906#endif
906 907
907/********************************************************************* 908/*********************************************************************
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 16f69be820c7..ee880382e3bc 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -188,10 +188,9 @@ static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
188 188
189 ledtrig_ide_activity(); 189 ledtrig_ide_activity();
190 190
191 pr_debug("%s: %sing: block=%llu, sectors=%u, buffer=0x%08lx\n", 191 pr_debug("%s: %sing: block=%llu, sectors=%u\n",
192 drive->name, rq_data_dir(rq) == READ ? "read" : "writ", 192 drive->name, rq_data_dir(rq) == READ ? "read" : "writ",
193 (unsigned long long)block, blk_rq_sectors(rq), 193 (unsigned long long)block, blk_rq_sectors(rq));
194 (unsigned long)rq->buffer);
195 194
196 if (hwif->rw_disk) 195 if (hwif->rw_disk)
197 hwif->rw_disk(drive, rq); 196 hwif->rw_disk(drive, rq);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 455e64916498..6a71bc7c9133 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1544,7 +1544,6 @@ static int setup_clone(struct request *clone, struct request *rq,
1544 clone->cmd = rq->cmd; 1544 clone->cmd = rq->cmd;
1545 clone->cmd_len = rq->cmd_len; 1545 clone->cmd_len = rq->cmd_len;
1546 clone->sense = rq->sense; 1546 clone->sense = rq->sense;
1547 clone->buffer = rq->buffer;
1548 clone->end_io = end_clone_request; 1547 clone->end_io = end_clone_request;
1549 clone->end_io_data = tio; 1548 clone->end_io_data = tio;
1550 1549
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 0b2ccb68c0d0..4dbfaee9aa95 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -82,8 +82,7 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
82 82
83 block = blk_rq_pos(req) << 9 >> tr->blkshift; 83 block = blk_rq_pos(req) << 9 >> tr->blkshift;
84 nsect = blk_rq_cur_bytes(req) >> tr->blkshift; 84 nsect = blk_rq_cur_bytes(req) >> tr->blkshift;
85 85 buf = bio_data(req->bio);
86 buf = req->buffer;
87 86
88 if (req->cmd_type != REQ_TYPE_FS) 87 if (req->cmd_type != REQ_TYPE_FS)
89 return -EIO; 88 return -EIO;
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index 8d659e6a1b4c..20a667c95da4 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -253,7 +253,7 @@ static int do_ubiblock_request(struct ubiblock *dev, struct request *req)
253 * flash access anyway. 253 * flash access anyway.
254 */ 254 */
255 mutex_lock(&dev->dev_mutex); 255 mutex_lock(&dev->dev_mutex);
256 ret = ubiblock_read(dev, req->buffer, sec, len); 256 ret = ubiblock_read(dev, bio_data(req->bio), sec, len);
257 mutex_unlock(&dev->dev_mutex); 257 mutex_unlock(&dev->dev_mutex);
258 258
259 return ret; 259 return ret;
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index 4ccb5d869389..a40ee1e37486 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -207,7 +207,7 @@ static void jsfd_do_request(struct request_queue *q)
207 goto end; 207 goto end;
208 } 208 }
209 209
210 jsfd_read(req->buffer, jdp->dbase + offset, len); 210 jsfd_read(bio_data(req->bio), jdp->dbase + offset, len);
211 err = 0; 211 err = 0;
212 end: 212 end:
213 if (!__blk_end_request_cur(req, err)) 213 if (!__blk_end_request_cur(req, err))
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9db097a28a74..a0c95cac91f0 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -140,7 +140,7 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy)
140 cmd->result = 0; 140 cmd->result = 0;
141 spin_lock_irqsave(q->queue_lock, flags); 141 spin_lock_irqsave(q->queue_lock, flags);
142 blk_requeue_request(q, cmd->request); 142 blk_requeue_request(q, cmd->request);
143 kblockd_schedule_work(q, &device->requeue_work); 143 kblockd_schedule_work(&device->requeue_work);
144 spin_unlock_irqrestore(q->queue_lock, flags); 144 spin_unlock_irqrestore(q->queue_lock, flags);
145} 145}
146 146
@@ -1019,8 +1019,6 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
1019 return BLKPREP_DEFER; 1019 return BLKPREP_DEFER;
1020 } 1020 }
1021 1021
1022 req->buffer = NULL;
1023
1024 /* 1022 /*
1025 * Next, walk the list, and fill in the addresses and sizes of 1023 * Next, walk the list, and fill in the addresses and sizes of
1026 * each segment. 1024 * each segment.
@@ -1158,7 +1156,6 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
1158 BUG_ON(blk_rq_bytes(req)); 1156 BUG_ON(blk_rq_bytes(req));
1159 1157
1160 memset(&cmd->sdb, 0, sizeof(cmd->sdb)); 1158 memset(&cmd->sdb, 0, sizeof(cmd->sdb));
1161 req->buffer = NULL;
1162 } 1159 }
1163 1160
1164 cmd->cmd_len = req->cmd_len; 1161 cmd->cmd_len = req->cmd_len;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index efcbcd182863..96af195224f2 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -737,16 +737,14 @@ static int sd_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq)
737 goto out; 737 goto out;
738 } 738 }
739 739
740 rq->completion_data = page;
740 blk_add_request_payload(rq, page, len); 741 blk_add_request_payload(rq, page, len);
741 ret = scsi_setup_blk_pc_cmnd(sdp, rq); 742 ret = scsi_setup_blk_pc_cmnd(sdp, rq);
742 rq->buffer = page_address(page);
743 rq->__data_len = nr_bytes; 743 rq->__data_len = nr_bytes;
744 744
745out: 745out:
746 if (ret != BLKPREP_OK) { 746 if (ret != BLKPREP_OK)
747 __free_page(page); 747 __free_page(page);
748 rq->buffer = NULL;
749 }
750 return ret; 748 return ret;
751} 749}
752 750
@@ -842,10 +840,9 @@ static void sd_unprep_fn(struct request_queue *q, struct request *rq)
842{ 840{
843 struct scsi_cmnd *SCpnt = rq->special; 841 struct scsi_cmnd *SCpnt = rq->special;
844 842
845 if (rq->cmd_flags & REQ_DISCARD) { 843 if (rq->cmd_flags & REQ_DISCARD)
846 free_page((unsigned long)rq->buffer); 844 __free_page(rq->completion_data);
847 rq->buffer = NULL; 845
848 }
849 if (SCpnt->cmnd != rq->cmd) { 846 if (SCpnt->cmnd != rq->cmd) {
850 mempool_free(SCpnt->cmnd, sd_cdb_pool); 847 mempool_free(SCpnt->cmnd, sd_cdb_pool);
851 SCpnt->cmnd = NULL; 848 SCpnt->cmnd = NULL;
diff --git a/fs/Makefile b/fs/Makefile
index f9cb9876e466..4030cbfbc9af 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,14 +14,13 @@ obj-y := open.o read_write.o file_table.o super.o \
14 stack.o fs_struct.o statfs.o 14 stack.o fs_struct.o statfs.o
15 15
16ifeq ($(CONFIG_BLOCK),y) 16ifeq ($(CONFIG_BLOCK),y)
17obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o 17obj-y += buffer.o block_dev.o direct-io.o mpage.o
18else 18else
19obj-y += no-block.o 19obj-y += no-block.o
20endif 20endif
21 21
22obj-$(CONFIG_PROC_FS) += proc_namespace.o 22obj-$(CONFIG_PROC_FS) += proc_namespace.o
23 23
24obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
25obj-y += notify/ 24obj-y += notify/
26obj-$(CONFIG_EPOLL) += eventpoll.o 25obj-$(CONFIG_EPOLL) += eventpoll.o
27obj-$(CONFIG_ANON_INODES) += anon_inodes.o 26obj-$(CONFIG_ANON_INODES) += anon_inodes.o
diff --git a/include/linux/bio.h b/include/linux/bio.h
index bba550826921..5a645769f020 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -333,7 +333,7 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors,
333 333
334extern struct bio_set *bioset_create(unsigned int, unsigned int); 334extern struct bio_set *bioset_create(unsigned int, unsigned int);
335extern void bioset_free(struct bio_set *); 335extern void bioset_free(struct bio_set *);
336extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); 336extern mempool_t *biovec_create_pool(int pool_entries);
337 337
338extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); 338extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
339extern void bio_put(struct bio *); 339extern void bio_put(struct bio *);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0120451545d8..c15128833100 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -8,7 +8,13 @@ struct blk_mq_tags;
8struct blk_mq_cpu_notifier { 8struct blk_mq_cpu_notifier {
9 struct list_head list; 9 struct list_head list;
10 void *data; 10 void *data;
11 void (*notify)(void *data, unsigned long action, unsigned int cpu); 11 int (*notify)(void *data, unsigned long action, unsigned int cpu);
12};
13
14struct blk_mq_ctxmap {
15 unsigned int map_size;
16 unsigned int bits_per_word;
17 struct blk_align_bitmap *map;
12}; 18};
13 19
14struct blk_mq_hw_ctx { 20struct blk_mq_hw_ctx {
@@ -18,7 +24,11 @@ struct blk_mq_hw_ctx {
18 } ____cacheline_aligned_in_smp; 24 } ____cacheline_aligned_in_smp;
19 25
20 unsigned long state; /* BLK_MQ_S_* flags */ 26 unsigned long state; /* BLK_MQ_S_* flags */
21 struct delayed_work delayed_work; 27 struct delayed_work run_work;
28 struct delayed_work delay_work;
29 cpumask_var_t cpumask;
30 int next_cpu;
31 int next_cpu_batch;
22 32
23 unsigned long flags; /* BLK_MQ_F_* flags */ 33 unsigned long flags; /* BLK_MQ_F_* flags */
24 34
@@ -27,13 +37,13 @@ struct blk_mq_hw_ctx {
27 37
28 void *driver_data; 38 void *driver_data;
29 39
40 struct blk_mq_ctxmap ctx_map;
41
30 unsigned int nr_ctx; 42 unsigned int nr_ctx;
31 struct blk_mq_ctx **ctxs; 43 struct blk_mq_ctx **ctxs;
32 unsigned int nr_ctx_map;
33 unsigned long *ctx_map;
34 44
35 struct request **rqs; 45 unsigned int wait_index;
36 struct list_head page_list; 46
37 struct blk_mq_tags *tags; 47 struct blk_mq_tags *tags;
38 48
39 unsigned long queued; 49 unsigned long queued;
@@ -41,31 +51,40 @@ struct blk_mq_hw_ctx {
41#define BLK_MQ_MAX_DISPATCH_ORDER 10 51#define BLK_MQ_MAX_DISPATCH_ORDER 10
42 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; 52 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
43 53
44 unsigned int queue_depth;
45 unsigned int numa_node; 54 unsigned int numa_node;
46 unsigned int cmd_size; /* per-request extra data */ 55 unsigned int cmd_size; /* per-request extra data */
47 56
57 atomic_t nr_active;
58
48 struct blk_mq_cpu_notifier cpu_notifier; 59 struct blk_mq_cpu_notifier cpu_notifier;
49 struct kobject kobj; 60 struct kobject kobj;
50}; 61};
51 62
52struct blk_mq_reg { 63struct blk_mq_tag_set {
53 struct blk_mq_ops *ops; 64 struct blk_mq_ops *ops;
54 unsigned int nr_hw_queues; 65 unsigned int nr_hw_queues;
55 unsigned int queue_depth; 66 unsigned int queue_depth; /* max hw supported */
56 unsigned int reserved_tags; 67 unsigned int reserved_tags;
57 unsigned int cmd_size; /* per-request extra data */ 68 unsigned int cmd_size; /* per-request extra data */
58 int numa_node; 69 int numa_node;
59 unsigned int timeout; 70 unsigned int timeout;
60 unsigned int flags; /* BLK_MQ_F_* */ 71 unsigned int flags; /* BLK_MQ_F_* */
72 void *driver_data;
73
74 struct blk_mq_tags **tags;
75
76 struct mutex tag_list_lock;
77 struct list_head tag_list;
61}; 78};
62 79
63typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); 80typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
64typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); 81typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
65typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int);
66typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
67typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); 82typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
68typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); 83typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
84typedef int (init_request_fn)(void *, struct request *, unsigned int,
85 unsigned int, unsigned int);
86typedef void (exit_request_fn)(void *, struct request *, unsigned int,
87 unsigned int);
69 88
70struct blk_mq_ops { 89struct blk_mq_ops {
71 /* 90 /*
@@ -86,18 +105,20 @@ struct blk_mq_ops {
86 softirq_done_fn *complete; 105 softirq_done_fn *complete;
87 106
88 /* 107 /*
89 * Override for hctx allocations (should probably go)
90 */
91 alloc_hctx_fn *alloc_hctx;
92 free_hctx_fn *free_hctx;
93
94 /*
95 * Called when the block layer side of a hardware queue has been 108 * Called when the block layer side of a hardware queue has been
96 * set up, allowing the driver to allocate/init matching structures. 109 * set up, allowing the driver to allocate/init matching structures.
97 * Ditto for exit/teardown. 110 * Ditto for exit/teardown.
98 */ 111 */
99 init_hctx_fn *init_hctx; 112 init_hctx_fn *init_hctx;
100 exit_hctx_fn *exit_hctx; 113 exit_hctx_fn *exit_hctx;
114
115 /*
116 * Called for every command allocated by the block layer to allow
117 * the driver to set up driver specific data.
118 * Ditto for exit/teardown.
119 */
120 init_request_fn *init_request;
121 exit_request_fn *exit_request;
101}; 122};
102 123
103enum { 124enum {
@@ -107,18 +128,24 @@ enum {
107 128
108 BLK_MQ_F_SHOULD_MERGE = 1 << 0, 129 BLK_MQ_F_SHOULD_MERGE = 1 << 0,
109 BLK_MQ_F_SHOULD_SORT = 1 << 1, 130 BLK_MQ_F_SHOULD_SORT = 1 << 1,
110 BLK_MQ_F_SHOULD_IPI = 1 << 2, 131 BLK_MQ_F_TAG_SHARED = 1 << 2,
132 BLK_MQ_F_SG_MERGE = 1 << 3,
133 BLK_MQ_F_SYSFS_UP = 1 << 4,
111 134
112 BLK_MQ_S_STOPPED = 0, 135 BLK_MQ_S_STOPPED = 0,
136 BLK_MQ_S_TAG_ACTIVE = 1,
113 137
114 BLK_MQ_MAX_DEPTH = 2048, 138 BLK_MQ_MAX_DEPTH = 2048,
139
140 BLK_MQ_CPU_WORK_BATCH = 8,
115}; 141};
116 142
117struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *); 143struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
118int blk_mq_register_disk(struct gendisk *); 144int blk_mq_register_disk(struct gendisk *);
119void blk_mq_unregister_disk(struct gendisk *); 145void blk_mq_unregister_disk(struct gendisk *);
120int blk_mq_init_commands(struct request_queue *, int (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data); 146
121void blk_mq_free_commands(struct request_queue *, void (*free)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data); 147int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
148void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
122 149
123void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); 150void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
124 151
@@ -126,28 +153,28 @@ void blk_mq_insert_request(struct request *, bool, bool, bool);
126void blk_mq_run_queues(struct request_queue *q, bool async); 153void blk_mq_run_queues(struct request_queue *q, bool async);
127void blk_mq_free_request(struct request *rq); 154void blk_mq_free_request(struct request *rq);
128bool blk_mq_can_queue(struct blk_mq_hw_ctx *); 155bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
129struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp); 156struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
130struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); 157 gfp_t gfp, bool reserved);
131struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag); 158struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag);
132 159
133struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); 160struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
134struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int); 161struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
135void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
136 162
137bool blk_mq_end_io_partial(struct request *rq, int error, 163void blk_mq_end_io(struct request *rq, int error);
138 unsigned int nr_bytes); 164void __blk_mq_end_io(struct request *rq, int error);
139static inline void blk_mq_end_io(struct request *rq, int error)
140{
141 bool done = !blk_mq_end_io_partial(rq, error, blk_rq_bytes(rq));
142 BUG_ON(!done);
143}
144 165
166void blk_mq_requeue_request(struct request *rq);
167void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
168void blk_mq_kick_requeue_list(struct request_queue *q);
145void blk_mq_complete_request(struct request *rq); 169void blk_mq_complete_request(struct request *rq);
146 170
147void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); 171void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
148void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); 172void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
149void blk_mq_stop_hw_queues(struct request_queue *q); 173void blk_mq_stop_hw_queues(struct request_queue *q);
150void blk_mq_start_stopped_hw_queues(struct request_queue *q); 174void blk_mq_start_hw_queues(struct request_queue *q);
175void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
176void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
177void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
151 178
152/* 179/*
153 * Driver command data is immediately after the request. So subtract request 180 * Driver command data is immediately after the request. So subtract request
@@ -162,12 +189,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
162 return (void *) rq + sizeof(*rq); 189 return (void *) rq + sizeof(*rq);
163} 190}
164 191
165static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx,
166 unsigned int tag)
167{
168 return hctx->rqs[tag];
169}
170
171#define queue_for_each_hw_ctx(q, hctx, i) \ 192#define queue_for_each_hw_ctx(q, hctx, i) \
172 for ((i) = 0; (i) < (q)->nr_hw_queues && \ 193 for ((i) = 0; (i) < (q)->nr_hw_queues && \
173 ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) 194 ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
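
Taken together, the header changes describe the new driver-facing lifecycle: fill in a struct blk_mq_tag_set, allocate it once with blk_mq_alloc_tag_set(), build the queue from it with blk_mq_init_queue(), and release queue and set in reverse order on teardown; the tag_list/tag_set_list fields are what let several queues reference one set. A hedged skeleton follows; every mydrv_* name is hypothetical and the three ops callbacks are assumed to be defined elsewhere.

/*
 * Hedged skeleton of the setup/teardown flow; every mydrv_* symbol is
 * hypothetical and the ops callbacks are assumed to exist elsewhere.
 */
#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/errno.h>
#include <linux/numa.h>
#include <linux/string.h>

static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq);
static void mydrv_complete(struct request *rq);
static int mydrv_init_request(void *data, struct request *rq,
			      unsigned int hctx_idx, unsigned int request_idx,
			      unsigned int numa_node);

static struct blk_mq_ops mydrv_mq_ops = {
	.queue_rq	= mydrv_queue_rq,
	.map_queue	= blk_mq_map_queue,
	.complete	= mydrv_complete,
	.init_request	= mydrv_init_request,	/* replaces blk_mq_init_commands() */
};

struct mydrv {
	struct blk_mq_tag_set	tag_set;
	struct request_queue	*queue;
};

static int mydrv_setup_queue(struct mydrv *d)
{
	int err;

	memset(&d->tag_set, 0, sizeof(d->tag_set));
	d->tag_set.ops		= &mydrv_mq_ops;
	d->tag_set.nr_hw_queues	= 1;
	d->tag_set.queue_depth	= 64;			/* assumed hw limit */
	d->tag_set.numa_node	= NUMA_NO_NODE;
	d->tag_set.cmd_size	= 0;			/* plus any per-request pdu */
	d->tag_set.flags	= BLK_MQ_F_SHOULD_MERGE;
	d->tag_set.driver_data	= d;

	err = blk_mq_alloc_tag_set(&d->tag_set);	/* tags now live in the set */
	if (err)
		return err;

	d->queue = blk_mq_init_queue(&d->tag_set);	/* was: blk_mq_init_queue(&reg, data) */
	if (!d->queue) {
		blk_mq_free_tag_set(&d->tag_set);
		return -ENOMEM;
	}
	d->queue->queuedata = d;
	return 0;
}

static void mydrv_teardown_queue(struct mydrv *d)
{
	blk_cleanup_queue(d->queue);
	blk_mq_free_tag_set(&d->tag_set);
}

The virtio-blk hunks above follow exactly this shape, including the out_free_tags unwind when blk_mq_init_queue() fails.
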
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index aa0eaa2d0bd8..d8e4cea23a25 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -190,6 +190,7 @@ enum rq_flag_bits {
190 __REQ_PM, /* runtime pm request */ 190 __REQ_PM, /* runtime pm request */
191 __REQ_END, /* last of chain of requests */ 191 __REQ_END, /* last of chain of requests */
192 __REQ_HASHED, /* on IO scheduler merge hash */ 192 __REQ_HASHED, /* on IO scheduler merge hash */
193 __REQ_MQ_INFLIGHT, /* track inflight for MQ */
193 __REQ_NR_BITS, /* stops here */ 194 __REQ_NR_BITS, /* stops here */
194}; 195};
195 196
@@ -243,5 +244,6 @@ enum rq_flag_bits {
243#define REQ_PM (1ULL << __REQ_PM) 244#define REQ_PM (1ULL << __REQ_PM)
244#define REQ_END (1ULL << __REQ_END) 245#define REQ_END (1ULL << __REQ_END)
245#define REQ_HASHED (1ULL << __REQ_HASHED) 246#define REQ_HASHED (1ULL << __REQ_HASHED)
247#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT)
246 248
247#endif /* __LINUX_BLK_TYPES_H */ 249#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0d84981ee03f..695b9fd41efe 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -90,15 +90,15 @@ enum rq_cmd_type_bits {
90#define BLK_MAX_CDB 16 90#define BLK_MAX_CDB 16
91 91
92/* 92/*
93 * try to put the fields that are referenced together in the same cacheline. 93 * Try to put the fields that are referenced together in the same cacheline.
94 * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init() 94 *
95 * as well! 95 * If you modify this structure, make sure to update blk_rq_init() and
96 * especially blk_mq_rq_ctx_init() to take care of the added fields.
96 */ 97 */
97struct request { 98struct request {
98 struct list_head queuelist; 99 struct list_head queuelist;
99 union { 100 union {
100 struct call_single_data csd; 101 struct call_single_data csd;
101 struct work_struct mq_flush_work;
102 unsigned long fifo_time; 102 unsigned long fifo_time;
103 }; 103 };
104 104
@@ -178,7 +178,6 @@ struct request {
178 unsigned short ioprio; 178 unsigned short ioprio;
179 179
180 void *special; /* opaque pointer available for LLD use */ 180 void *special; /* opaque pointer available for LLD use */
181 char *buffer; /* kaddr of the current segment if available */
182 181
183 int tag; 182 int tag;
184 int errors; 183 int errors;
@@ -463,6 +462,10 @@ struct request_queue {
463 struct request *flush_rq; 462 struct request *flush_rq;
464 spinlock_t mq_flush_lock; 463 spinlock_t mq_flush_lock;
465 464
465 struct list_head requeue_list;
466 spinlock_t requeue_lock;
467 struct work_struct requeue_work;
468
466 struct mutex sysfs_lock; 469 struct mutex sysfs_lock;
467 470
468 int bypass_depth; 471 int bypass_depth;
@@ -481,6 +484,9 @@ struct request_queue {
481 wait_queue_head_t mq_freeze_wq; 484 wait_queue_head_t mq_freeze_wq;
482 struct percpu_counter mq_usage_counter; 485 struct percpu_counter mq_usage_counter;
483 struct list_head all_q_node; 486 struct list_head all_q_node;
487
488 struct blk_mq_tag_set *tag_set;
489 struct list_head tag_set_list;
484}; 490};
485 491
486#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ 492#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
@@ -504,6 +510,7 @@ struct request_queue {
504#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ 510#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */
505#define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ 511#define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */
506#define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ 512#define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */
513#define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/
507 514
508#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 515#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
509 (1 << QUEUE_FLAG_STACKABLE) | \ 516 (1 << QUEUE_FLAG_STACKABLE) | \
@@ -937,6 +944,7 @@ extern struct request *blk_fetch_request(struct request_queue *q);
937 */ 944 */
938extern bool blk_update_request(struct request *rq, int error, 945extern bool blk_update_request(struct request *rq, int error,
939 unsigned int nr_bytes); 946 unsigned int nr_bytes);
947extern void blk_finish_request(struct request *rq, int error);
940extern bool blk_end_request(struct request *rq, int error, 948extern bool blk_end_request(struct request *rq, int error,
941 unsigned int nr_bytes); 949 unsigned int nr_bytes);
942extern void blk_end_request_all(struct request *rq, int error); 950extern void blk_end_request_all(struct request *rq, int error);
@@ -1053,7 +1061,6 @@ static inline void blk_post_runtime_resume(struct request_queue *q, int err) {}
1053 * schedule() where blk_schedule_flush_plug() is called. 1061 * schedule() where blk_schedule_flush_plug() is called.
1054 */ 1062 */
1055struct blk_plug { 1063struct blk_plug {
1056 unsigned long magic; /* detect uninitialized use-cases */
1057 struct list_head list; /* requests */ 1064 struct list_head list; /* requests */
1058 struct list_head mq_list; /* blk-mq requests */ 1065 struct list_head mq_list; /* blk-mq requests */
1059 struct list_head cb_list; /* md requires an unplug callback */ 1066 struct list_head cb_list; /* md requires an unplug callback */
@@ -1102,7 +1109,8 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
1102/* 1109/*
1103 * tag stuff 1110 * tag stuff
1104 */ 1111 */
1105#define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED) 1112#define blk_rq_tagged(rq) \
1113 ((rq)->mq_ctx || ((rq)->cmd_flags & REQ_QUEUED))
1106extern int blk_queue_start_tag(struct request_queue *, struct request *); 1114extern int blk_queue_start_tag(struct request_queue *, struct request *);
1107extern struct request *blk_queue_find_tag(struct request_queue *, int); 1115extern struct request *blk_queue_find_tag(struct request_queue *, int);
1108extern void blk_queue_end_tag(struct request_queue *, struct request *); 1116extern void blk_queue_end_tag(struct request_queue *, struct request *);
@@ -1370,8 +1378,9 @@ static inline void put_dev_sector(Sector p)
1370} 1378}
1371 1379
1372struct work_struct; 1380struct work_struct;
1373int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); 1381int kblockd_schedule_work(struct work_struct *work);
1374int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay); 1382int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
1383int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
1375 1384
1376#ifdef CONFIG_BLK_CGROUP 1385#ifdef CONFIG_BLK_CGROUP
1377/* 1386/*
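
Besides dropping request->buffer and the blk_plug 'magic' field, the blkdev.h hunk reworks the kblockd helpers: kblockd_schedule_work() and kblockd_schedule_delayed_work() lose their request_queue argument, and a CPU-pinned kblockd_schedule_delayed_work_on() variant is added (blk_rq_tagged() also learns to treat blk-mq requests, which carry mq_ctx, as tagged). A minimal sketch of the new call shapes, with hypothetical work items:

/*
 * Minimal sketch of the new kblockd call shapes; the handler and the two
 * work items are hypothetical driver state.
 */
#include <linux/blkdev.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void mydrv_work_fn(struct work_struct *work)
{
	/* requeue/poll logic would live here */
}

static DECLARE_WORK(mydrv_requeue_work, mydrv_work_fn);
static DECLARE_DELAYED_WORK(mydrv_poll_work, mydrv_work_fn);

static void mydrv_kick(void)
{
	/* The request_queue argument is gone from both existing helpers. */
	kblockd_schedule_work(&mydrv_requeue_work);
	kblockd_schedule_delayed_work(&mydrv_poll_work, msecs_to_jiffies(10));
}
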
diff --git a/mm/Makefile b/mm/Makefile
index b484452dac57..0173940407f6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,7 +30,6 @@ endif
30 30
31obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 31obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
32 32
33obj-$(CONFIG_BOUNCE) += bounce.o
34obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o 33obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
35obj-$(CONFIG_FRONTSWAP) += frontswap.o 34obj-$(CONFIG_FRONTSWAP) += frontswap.o
36obj-$(CONFIG_ZSWAP) += zswap.o 35obj-$(CONFIG_ZSWAP) += zswap.o