Diffstat (limited to 'block')
-rw-r--r--  block/bio-integrity.c    4
-rw-r--r--  block/bio.c             77
-rw-r--r--  block/blk-cgroup.c      92
-rw-r--r--  block/blk-cgroup.h      40
-rw-r--r--  block/blk-core.c       143
-rw-r--r--  block/blk-exec.c        10
-rw-r--r--  block/blk-merge.c        3
-rw-r--r--  block/blk-mq-cpumap.c    2
-rw-r--r--  block/blk-mq-tag.c      38
-rw-r--r--  block/blk-mq-tag.h       1
-rw-r--r--  block/blk-mq.c         228
-rw-r--r--  block/blk-sysfs.c        2
-rw-r--r--  block/blk.h              5
-rw-r--r--  block/bounce.c           5
-rw-r--r--  block/cfq-iosched.c    125
-rw-r--r--  block/elevator.c         8
-rw-r--r--  block/genhd.c           13
-rw-r--r--  block/ioctl.c           37
18 files changed, 534 insertions, 299 deletions
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 5cbd5d9ea61d..0436c21db7f2 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -361,7 +361,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
361 361
362 /* Restore original bio completion handler */ 362 /* Restore original bio completion handler */
363 bio->bi_end_io = bip->bip_end_io; 363 bio->bi_end_io = bip->bip_end_io;
364 bio_endio_nodec(bio, error); 364 bio_endio(bio, error);
365} 365}
366 366
367/** 367/**
@@ -388,7 +388,7 @@ void bio_integrity_endio(struct bio *bio, int error)
388 */ 388 */
389 if (error) { 389 if (error) {
390 bio->bi_end_io = bip->bip_end_io; 390 bio->bi_end_io = bip->bip_end_io;
391 bio_endio_nodec(bio, error); 391 bio_endio(bio, error);
392 392
393 return; 393 return;
394 } 394 }
diff --git a/block/bio.c b/block/bio.c
index f66a4eae16ee..259197d97de1 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -270,8 +270,8 @@ void bio_init(struct bio *bio)
270{ 270{
271 memset(bio, 0, sizeof(*bio)); 271 memset(bio, 0, sizeof(*bio));
272 bio->bi_flags = 1 << BIO_UPTODATE; 272 bio->bi_flags = 1 << BIO_UPTODATE;
273 atomic_set(&bio->bi_remaining, 1); 273 atomic_set(&bio->__bi_remaining, 1);
274 atomic_set(&bio->bi_cnt, 1); 274 atomic_set(&bio->__bi_cnt, 1);
275} 275}
276EXPORT_SYMBOL(bio_init); 276EXPORT_SYMBOL(bio_init);
277 277
@@ -292,8 +292,8 @@ void bio_reset(struct bio *bio)
292 __bio_free(bio); 292 __bio_free(bio);
293 293
294 memset(bio, 0, BIO_RESET_BYTES); 294 memset(bio, 0, BIO_RESET_BYTES);
295 bio->bi_flags = flags|(1 << BIO_UPTODATE); 295 bio->bi_flags = flags | (1 << BIO_UPTODATE);
296 atomic_set(&bio->bi_remaining, 1); 296 atomic_set(&bio->__bi_remaining, 1);
297} 297}
298EXPORT_SYMBOL(bio_reset); 298EXPORT_SYMBOL(bio_reset);
299 299
@@ -303,6 +303,17 @@ static void bio_chain_endio(struct bio *bio, int error)
303 bio_put(bio); 303 bio_put(bio);
304} 304}
305 305
306/*
307 * Increment chain count for the bio. Make sure the CHAIN flag update
308 * is visible before the raised count.
309 */
310static inline void bio_inc_remaining(struct bio *bio)
311{
312 bio->bi_flags |= (1 << BIO_CHAIN);
313 smp_mb__before_atomic();
314 atomic_inc(&bio->__bi_remaining);
315}
316
306/** 317/**
307 * bio_chain - chain bio completions 318 * bio_chain - chain bio completions
308 * @bio: the target bio 319 * @bio: the target bio
@@ -320,7 +331,7 @@ void bio_chain(struct bio *bio, struct bio *parent)
320 331
321 bio->bi_private = parent; 332 bio->bi_private = parent;
322 bio->bi_end_io = bio_chain_endio; 333 bio->bi_end_io = bio_chain_endio;
323 atomic_inc(&parent->bi_remaining); 334 bio_inc_remaining(parent);
324} 335}
325EXPORT_SYMBOL(bio_chain); 336EXPORT_SYMBOL(bio_chain);
326 337
@@ -524,13 +535,17 @@ EXPORT_SYMBOL(zero_fill_bio);
524 **/ 535 **/
525void bio_put(struct bio *bio) 536void bio_put(struct bio *bio)
526{ 537{
527 BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); 538 if (!bio_flagged(bio, BIO_REFFED))
528
529 /*
530 * last put frees it
531 */
532 if (atomic_dec_and_test(&bio->bi_cnt))
533 bio_free(bio); 539 bio_free(bio);
540 else {
541 BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
542
543 /*
544 * last put frees it
545 */
546 if (atomic_dec_and_test(&bio->__bi_cnt))
547 bio_free(bio);
548 }
534} 549}
535EXPORT_SYMBOL(bio_put); 550EXPORT_SYMBOL(bio_put);
536 551
@@ -1741,6 +1756,25 @@ void bio_flush_dcache_pages(struct bio *bi)
1741EXPORT_SYMBOL(bio_flush_dcache_pages); 1756EXPORT_SYMBOL(bio_flush_dcache_pages);
1742#endif 1757#endif
1743 1758
1759static inline bool bio_remaining_done(struct bio *bio)
1760{
1761 /*
1762 * If we're not chaining, then ->__bi_remaining is always 1 and
1763 * we always end io on the first invocation.
1764 */
1765 if (!bio_flagged(bio, BIO_CHAIN))
1766 return true;
1767
1768 BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
1769
1770 if (atomic_dec_and_test(&bio->__bi_remaining)) {
1771 clear_bit(BIO_CHAIN, &bio->bi_flags);
1772 return true;
1773 }
1774
1775 return false;
1776}
1777
1744/** 1778/**
1745 * bio_endio - end I/O on a bio 1779 * bio_endio - end I/O on a bio
1746 * @bio: bio 1780 * @bio: bio
@@ -1758,15 +1792,13 @@ EXPORT_SYMBOL(bio_flush_dcache_pages);
1758void bio_endio(struct bio *bio, int error) 1792void bio_endio(struct bio *bio, int error)
1759{ 1793{
1760 while (bio) { 1794 while (bio) {
1761 BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
1762
1763 if (error) 1795 if (error)
1764 clear_bit(BIO_UPTODATE, &bio->bi_flags); 1796 clear_bit(BIO_UPTODATE, &bio->bi_flags);
1765 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1797 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1766 error = -EIO; 1798 error = -EIO;
1767 1799
1768 if (!atomic_dec_and_test(&bio->bi_remaining)) 1800 if (unlikely(!bio_remaining_done(bio)))
1769 return; 1801 break;
1770 1802
1771 /* 1803 /*
1772 * Need to have a real endio function for chained bios, 1804 * Need to have a real endio function for chained bios,
@@ -1790,21 +1822,6 @@ void bio_endio(struct bio *bio, int error)
1790EXPORT_SYMBOL(bio_endio); 1822EXPORT_SYMBOL(bio_endio);
1791 1823
1792/** 1824/**
1793 * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
1794 * @bio: bio
1795 * @error: error, if any
1796 *
1797 * For code that has saved and restored bi_end_io; thing hard before using this
1798 * function, probably you should've cloned the entire bio.
1799 **/
1800void bio_endio_nodec(struct bio *bio, int error)
1801{
1802 atomic_inc(&bio->bi_remaining);
1803 bio_endio(bio, error);
1804}
1805EXPORT_SYMBOL(bio_endio_nodec);
1806
1807/**
1808 * bio_split - split a bio 1825 * bio_split - split a bio
1809 * @bio: bio to split 1826 * @bio: bio to split
1810 * @sectors: number of sectors to split from the front of @bio 1827 * @sectors: number of sectors to split from the front of @bio
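
The bio.c hunks above replace the old always-on bi_remaining decrement with a BIO_CHAIN-gated counter: bio_inc_remaining() raises __bi_remaining when a child is chained, and bio_endio() only pays for the atomic decrement when the flag is set. Below is a minimal userspace model of that counting scheme, using C11 atomics and simplified stand-in types rather than the kernel definitions, to illustrate why a chained parent completes only on the last endio call:

/*
 * Illustrative userspace model (not kernel code) of the __bi_remaining /
 * BIO_CHAIN scheme: plain bios complete on the first endio call, chained
 * bios only once every chained completion has arrived.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define BIO_CHAIN 1u

struct model_bio {
	unsigned int flags;
	atomic_int remaining;	/* stands in for __bi_remaining */
};

static void model_inc_remaining(struct model_bio *bio)
{
	bio->flags |= BIO_CHAIN;	/* mark before raising the count */
	atomic_fetch_add(&bio->remaining, 1);
}

static bool model_remaining_done(struct model_bio *bio)
{
	if (!(bio->flags & BIO_CHAIN))
		return true;		/* unchained: done on first call */
	if (atomic_fetch_sub(&bio->remaining, 1) == 1) {
		bio->flags &= ~BIO_CHAIN;
		return true;
	}
	return false;
}

int main(void)
{
	struct model_bio parent = { .flags = 0 };

	atomic_init(&parent.remaining, 1);
	model_inc_remaining(&parent);	/* one chained child */

	printf("after child endio:  %s\n",
	       model_remaining_done(&parent) ? "done" : "pending");
	printf("after parent endio: %s\n",
	       model_remaining_done(&parent) ? "done" : "pending");
	return 0;
}

Running this prints "pending" then "done", mirroring how the parent in the patch is finished only after every chained child has ended, while unchained bios avoid the atomic entirely.
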
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ac817b750db..6e43fa355e71 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -9,6 +9,10 @@
9 * 9 *
10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> 10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11 * Nauman Rafique <nauman@google.com> 11 * Nauman Rafique <nauman@google.com>
12 *
13 * For policy-specific per-blkcg data:
14 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
15 * Arianna Avanzini <avanzini.arianna@gmail.com>
12 */ 16 */
13#include <linux/ioprio.h> 17#include <linux/ioprio.h>
14#include <linux/kdev_t.h> 18#include <linux/kdev_t.h>
@@ -26,8 +30,7 @@
26 30
27static DEFINE_MUTEX(blkcg_pol_mutex); 31static DEFINE_MUTEX(blkcg_pol_mutex);
28 32
29struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, 33struct blkcg blkcg_root;
30 .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
31EXPORT_SYMBOL_GPL(blkcg_root); 34EXPORT_SYMBOL_GPL(blkcg_root);
32 35
33static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 36static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
@@ -823,6 +826,8 @@ static struct cgroup_subsys_state *
823blkcg_css_alloc(struct cgroup_subsys_state *parent_css) 826blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
824{ 827{
825 struct blkcg *blkcg; 828 struct blkcg *blkcg;
829 struct cgroup_subsys_state *ret;
830 int i;
826 831
827 if (!parent_css) { 832 if (!parent_css) {
828 blkcg = &blkcg_root; 833 blkcg = &blkcg_root;
@@ -830,17 +835,49 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
830 } 835 }
831 836
832 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); 837 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
833 if (!blkcg) 838 if (!blkcg) {
834 return ERR_PTR(-ENOMEM); 839 ret = ERR_PTR(-ENOMEM);
840 goto free_blkcg;
841 }
842
843 for (i = 0; i < BLKCG_MAX_POLS ; i++) {
844 struct blkcg_policy *pol = blkcg_policy[i];
845 struct blkcg_policy_data *cpd;
846
847 /*
848 * If the policy hasn't been attached yet, wait for it
849 * to be attached before doing anything else. Otherwise,
850 * check if the policy requires any specific per-cgroup
851 * data: if it does, allocate and initialize it.
852 */
853 if (!pol || !pol->cpd_size)
854 continue;
855
856 BUG_ON(blkcg->pd[i]);
857 cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
858 if (!cpd) {
859 ret = ERR_PTR(-ENOMEM);
860 goto free_pd_blkcg;
861 }
862 blkcg->pd[i] = cpd;
863 cpd->plid = i;
864 pol->cpd_init_fn(blkcg);
865 }
835 866
836 blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
837 blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
838done: 867done:
839 spin_lock_init(&blkcg->lock); 868 spin_lock_init(&blkcg->lock);
840 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); 869 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
841 INIT_HLIST_HEAD(&blkcg->blkg_list); 870 INIT_HLIST_HEAD(&blkcg->blkg_list);
842 871
843 return &blkcg->css; 872 return &blkcg->css;
873
874free_pd_blkcg:
875 for (i--; i >= 0; i--)
876 kfree(blkcg->pd[i]);
877
878free_blkcg:
879 kfree(blkcg);
880 return ret;
844} 881}
845 882
846/** 883/**
@@ -958,8 +995,10 @@ int blkcg_activate_policy(struct request_queue *q,
958 const struct blkcg_policy *pol) 995 const struct blkcg_policy *pol)
959{ 996{
960 LIST_HEAD(pds); 997 LIST_HEAD(pds);
998 LIST_HEAD(cpds);
961 struct blkcg_gq *blkg, *new_blkg; 999 struct blkcg_gq *blkg, *new_blkg;
962 struct blkg_policy_data *pd, *n; 1000 struct blkg_policy_data *pd, *nd;
1001 struct blkcg_policy_data *cpd, *cnd;
963 int cnt = 0, ret; 1002 int cnt = 0, ret;
964 bool preloaded; 1003 bool preloaded;
965 1004
@@ -1003,7 +1042,10 @@ int blkcg_activate_policy(struct request_queue *q,
1003 1042
1004 spin_unlock_irq(q->queue_lock); 1043 spin_unlock_irq(q->queue_lock);
1005 1044
1006 /* allocate policy_data for all existing blkgs */ 1045 /*
1046 * Allocate per-blkg and per-blkcg policy data
1047 * for all existing blkgs.
1048 */
1007 while (cnt--) { 1049 while (cnt--) {
1008 pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); 1050 pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
1009 if (!pd) { 1051 if (!pd) {
@@ -1011,26 +1053,50 @@ int blkcg_activate_policy(struct request_queue *q,
1011 goto out_free; 1053 goto out_free;
1012 } 1054 }
1013 list_add_tail(&pd->alloc_node, &pds); 1055 list_add_tail(&pd->alloc_node, &pds);
1056
1057 if (!pol->cpd_size)
1058 continue;
1059 cpd = kzalloc_node(pol->cpd_size, GFP_KERNEL, q->node);
1060 if (!cpd) {
1061 ret = -ENOMEM;
1062 goto out_free;
1063 }
1064 list_add_tail(&cpd->alloc_node, &cpds);
1014 } 1065 }
1015 1066
1016 /* 1067 /*
1017 * Install the allocated pds. With @q bypassing, no new blkg 1068 * Install the allocated pds and cpds. With @q bypassing, no new blkg
1018 * should have been created while the queue lock was dropped. 1069 * should have been created while the queue lock was dropped.
1019 */ 1070 */
1020 spin_lock_irq(q->queue_lock); 1071 spin_lock_irq(q->queue_lock);
1021 1072
1022 list_for_each_entry(blkg, &q->blkg_list, q_node) { 1073 list_for_each_entry(blkg, &q->blkg_list, q_node) {
1023 if (WARN_ON(list_empty(&pds))) { 1074 if (WARN_ON(list_empty(&pds)) ||
1075 WARN_ON(pol->cpd_size && list_empty(&cpds))) {
1024 /* umm... this shouldn't happen, just abort */ 1076 /* umm... this shouldn't happen, just abort */
1025 ret = -ENOMEM; 1077 ret = -ENOMEM;
1026 goto out_unlock; 1078 goto out_unlock;
1027 } 1079 }
1080 cpd = list_first_entry(&cpds, struct blkcg_policy_data,
1081 alloc_node);
1082 list_del_init(&cpd->alloc_node);
1028 pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); 1083 pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
1029 list_del_init(&pd->alloc_node); 1084 list_del_init(&pd->alloc_node);
1030 1085
1031 /* grab blkcg lock too while installing @pd on @blkg */ 1086 /* grab blkcg lock too while installing @pd on @blkg */
1032 spin_lock(&blkg->blkcg->lock); 1087 spin_lock(&blkg->blkcg->lock);
1033 1088
1089 if (!pol->cpd_size)
1090 goto no_cpd;
1091 if (!blkg->blkcg->pd[pol->plid]) {
1092 /* Per-policy per-blkcg data */
1093 blkg->blkcg->pd[pol->plid] = cpd;
1094 cpd->plid = pol->plid;
1095 pol->cpd_init_fn(blkg->blkcg);
1096 } else { /* must free it as it has already been extracted */
1097 kfree(cpd);
1098 }
1099no_cpd:
1034 blkg->pd[pol->plid] = pd; 1100 blkg->pd[pol->plid] = pd;
1035 pd->blkg = blkg; 1101 pd->blkg = blkg;
1036 pd->plid = pol->plid; 1102 pd->plid = pol->plid;
@@ -1045,8 +1111,10 @@ out_unlock:
1045 spin_unlock_irq(q->queue_lock); 1111 spin_unlock_irq(q->queue_lock);
1046out_free: 1112out_free:
1047 blk_queue_bypass_end(q); 1113 blk_queue_bypass_end(q);
1048 list_for_each_entry_safe(pd, n, &pds, alloc_node) 1114 list_for_each_entry_safe(pd, nd, &pds, alloc_node)
1049 kfree(pd); 1115 kfree(pd);
1116 list_for_each_entry_safe(cpd, cnd, &cpds, alloc_node)
1117 kfree(cpd);
1050 return ret; 1118 return ret;
1051} 1119}
1052EXPORT_SYMBOL_GPL(blkcg_activate_policy); 1120EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1087,6 +1155,8 @@ void blkcg_deactivate_policy(struct request_queue *q,
1087 1155
1088 kfree(blkg->pd[pol->plid]); 1156 kfree(blkg->pd[pol->plid]);
1089 blkg->pd[pol->plid] = NULL; 1157 blkg->pd[pol->plid] = NULL;
1158 kfree(blkg->blkcg->pd[pol->plid]);
1159 blkg->blkcg->pd[pol->plid] = NULL;
1090 1160
1091 spin_unlock(&blkg->blkcg->lock); 1161 spin_unlock(&blkg->blkcg->lock);
1092 } 1162 }
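
The blk-cgroup.c changes above teach the cgroup side to allocate policy-specific per-blkcg data for every registered policy that declares a cpd_size, call the policy's cpd_init_fn, and unwind the partial allocations on failure. A rough userspace sketch of that allocate/init/unwind pattern follows; the names are generic stand-ins and plain calloc/free replace the kernel allocators:

/* Toy model (not kernel code) of per-group policy-data allocation. */
#include <stdio.h>
#include <stdlib.h>

#define MAX_POLS 4

struct policy {
	const char *name;
	size_t cpd_size;			/* 0: no per-group data */
	void (*cpd_init)(void *cpd);
};

static int alloc_group_data(const struct policy *pols, void *pd[MAX_POLS])
{
	int i;

	for (i = 0; i < MAX_POLS; i++) {
		if (!pols[i].name || !pols[i].cpd_size)
			continue;		/* policy absent or needs nothing */
		pd[i] = calloc(1, pols[i].cpd_size);
		if (!pd[i])
			goto unwind;
		if (pols[i].cpd_init)
			pols[i].cpd_init(pd[i]);
	}
	return 0;

unwind:		/* mirrors free_pd_blkcg: free what was already allocated */
	for (i--; i >= 0; i--) {
		free(pd[i]);
		pd[i] = NULL;
	}
	return -1;
}

static void weight_init(void *cpd)
{
	*(unsigned int *)cpd = 500;	/* e.g. a default weight */
}

int main(void)
{
	struct policy pols[MAX_POLS] = {
		{ .name = "weights", .cpd_size = sizeof(unsigned int),
		  .cpd_init = weight_init },
	};
	void *pd[MAX_POLS] = { NULL };
	int i;

	if (!alloc_group_data(pols, pd))
		printf("weights policy default: %u\n", *(unsigned int *)pd[0]);
	for (i = 0; i < MAX_POLS; i++)
		free(pd[i]);
	return 0;
}

The backward-walking unwind is the same shape as the free_pd_blkcg label added in blkcg_css_alloc() above.
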
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index c567865b5f1d..74296a78bba1 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -23,11 +23,6 @@
23/* Max limits for throttle policy */ 23/* Max limits for throttle policy */
24#define THROTL_IOPS_MAX UINT_MAX 24#define THROTL_IOPS_MAX UINT_MAX
25 25
26/* CFQ specific, out here for blkcg->cfq_weight */
27#define CFQ_WEIGHT_MIN 10
28#define CFQ_WEIGHT_MAX 1000
29#define CFQ_WEIGHT_DEFAULT 500
30
31#ifdef CONFIG_BLK_CGROUP 26#ifdef CONFIG_BLK_CGROUP
32 27
33enum blkg_rwstat_type { 28enum blkg_rwstat_type {
@@ -50,9 +45,7 @@ struct blkcg {
50 struct blkcg_gq *blkg_hint; 45 struct blkcg_gq *blkg_hint;
51 struct hlist_head blkg_list; 46 struct hlist_head blkg_list;
52 47
53 /* TODO: per-policy storage in blkcg */ 48 struct blkcg_policy_data *pd[BLKCG_MAX_POLS];
54 unsigned int cfq_weight; /* belongs to cfq */
55 unsigned int cfq_leaf_weight;
56}; 49};
57 50
58struct blkg_stat { 51struct blkg_stat {
@@ -87,6 +80,24 @@ struct blkg_policy_data {
87 struct list_head alloc_node; 80 struct list_head alloc_node;
88}; 81};
89 82
83/*
84 * Policies that need to keep per-blkcg data which is independent
85 * from any request_queue associated to it must specify its size
86 * with the cpd_size field of the blkcg_policy structure and
87 * embed a blkcg_policy_data in it. blkcg core allocates
88 * policy-specific per-blkcg structures lazily the first time
89 * they are actually needed, so it handles them together with
90 * blkgs. cpd_init() is invoked to let each policy handle
91 * per-blkcg data.
92 */
93struct blkcg_policy_data {
94 /* the policy id this per-policy data belongs to */
95 int plid;
96
97 /* used during policy activation */
98 struct list_head alloc_node;
99};
100
90/* association between a blk cgroup and a request queue */ 101/* association between a blk cgroup and a request queue */
91struct blkcg_gq { 102struct blkcg_gq {
92 /* Pointer to the associated request_queue */ 103 /* Pointer to the associated request_queue */
@@ -112,6 +123,7 @@ struct blkcg_gq {
112 struct rcu_head rcu_head; 123 struct rcu_head rcu_head;
113}; 124};
114 125
126typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
115typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); 127typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
116typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); 128typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
117typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); 129typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
@@ -122,10 +134,13 @@ struct blkcg_policy {
122 int plid; 134 int plid;
123 /* policy specific private data size */ 135 /* policy specific private data size */
124 size_t pd_size; 136 size_t pd_size;
137 /* policy specific per-blkcg data size */
138 size_t cpd_size;
125 /* cgroup files for the policy */ 139 /* cgroup files for the policy */
126 struct cftype *cftypes; 140 struct cftype *cftypes;
127 141
128 /* operations */ 142 /* operations */
143 blkcg_pol_init_cpd_fn *cpd_init_fn;
129 blkcg_pol_init_pd_fn *pd_init_fn; 144 blkcg_pol_init_pd_fn *pd_init_fn;
130 blkcg_pol_online_pd_fn *pd_online_fn; 145 blkcg_pol_online_pd_fn *pd_online_fn;
131 blkcg_pol_offline_pd_fn *pd_offline_fn; 146 blkcg_pol_offline_pd_fn *pd_offline_fn;
@@ -218,6 +233,12 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
218 return blkg ? blkg->pd[pol->plid] : NULL; 233 return blkg ? blkg->pd[pol->plid] : NULL;
219} 234}
220 235
236static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
237 struct blkcg_policy *pol)
238{
239 return blkcg ? blkcg->pd[pol->plid] : NULL;
240}
241
221/** 242/**
222 * pdata_to_blkg - get blkg associated with policy private data 243 * pdata_to_blkg - get blkg associated with policy private data
223 * @pd: policy private data of interest 244 * @pd: policy private data of interest
@@ -564,6 +585,9 @@ struct blkcg;
564struct blkg_policy_data { 585struct blkg_policy_data {
565}; 586};
566 587
588struct blkcg_policy_data {
589};
590
567struct blkcg_gq { 591struct blkcg_gq {
568}; 592};
569 593
diff --git a/block/blk-core.c b/block/blk-core.c
index fd154b94447a..f6ab750060fe 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(blk_rq_init);
117static void req_bio_endio(struct request *rq, struct bio *bio, 117static void req_bio_endio(struct request *rq, struct bio *bio,
118 unsigned int nbytes, int error) 118 unsigned int nbytes, int error)
119{ 119{
120 if (error) 120 if (error && !(rq->cmd_flags & REQ_CLONE))
121 clear_bit(BIO_UPTODATE, &bio->bi_flags); 121 clear_bit(BIO_UPTODATE, &bio->bi_flags);
122 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 122 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
123 error = -EIO; 123 error = -EIO;
@@ -128,7 +128,8 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
128 bio_advance(bio, nbytes); 128 bio_advance(bio, nbytes);
129 129
130 /* don't actually finish bio if it's part of flush sequence */ 130 /* don't actually finish bio if it's part of flush sequence */
131 if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) 131 if (bio->bi_iter.bi_size == 0 &&
132 !(rq->cmd_flags & (REQ_FLUSH_SEQ|REQ_CLONE)))
132 bio_endio(bio, error); 133 bio_endio(bio, error);
133} 134}
134 135
@@ -285,6 +286,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q)
285 q->request_fn(q); 286 q->request_fn(q);
286 q->request_fn_active--; 287 q->request_fn_active--;
287} 288}
289EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);
288 290
289/** 291/**
290 * __blk_run_queue - run a single device queue 292 * __blk_run_queue - run a single device queue
@@ -552,6 +554,8 @@ void blk_cleanup_queue(struct request_queue *q)
552 q->queue_lock = &q->__queue_lock; 554 q->queue_lock = &q->__queue_lock;
553 spin_unlock_irq(lock); 555 spin_unlock_irq(lock);
554 556
557 bdi_destroy(&q->backing_dev_info);
558
555 /* @q is and will stay empty, shutdown and put */ 559 /* @q is and will stay empty, shutdown and put */
556 blk_put_queue(q); 560 blk_put_queue(q);
557} 561}
@@ -732,6 +736,8 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
732} 736}
733EXPORT_SYMBOL(blk_init_queue_node); 737EXPORT_SYMBOL(blk_init_queue_node);
734 738
739static void blk_queue_bio(struct request_queue *q, struct bio *bio);
740
735struct request_queue * 741struct request_queue *
736blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, 742blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
737 spinlock_t *lock) 743 spinlock_t *lock)
@@ -1521,7 +1527,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1521 * Caller must ensure !blk_queue_nomerges(q) beforehand. 1527 * Caller must ensure !blk_queue_nomerges(q) beforehand.
1522 */ 1528 */
1523bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 1529bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1524 unsigned int *request_count) 1530 unsigned int *request_count,
1531 struct request **same_queue_rq)
1525{ 1532{
1526 struct blk_plug *plug; 1533 struct blk_plug *plug;
1527 struct request *rq; 1534 struct request *rq;
@@ -1541,8 +1548,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1541 list_for_each_entry_reverse(rq, plug_list, queuelist) { 1548 list_for_each_entry_reverse(rq, plug_list, queuelist) {
1542 int el_ret; 1549 int el_ret;
1543 1550
1544 if (rq->q == q) 1551 if (rq->q == q) {
1545 (*request_count)++; 1552 (*request_count)++;
1553 /*
1554 * Only blk-mq multiple hardware queues case checks the
1555 * rq in the same queue, there should be only one such
1556 * rq in a queue
1557 **/
1558 if (same_queue_rq)
1559 *same_queue_rq = rq;
1560 }
1546 1561
1547 if (rq->q != q || !blk_rq_merge_ok(rq, bio)) 1562 if (rq->q != q || !blk_rq_merge_ok(rq, bio))
1548 continue; 1563 continue;
@@ -1576,7 +1591,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
1576 blk_rq_bio_prep(req->q, req, bio); 1591 blk_rq_bio_prep(req->q, req, bio);
1577} 1592}
1578 1593
1579void blk_queue_bio(struct request_queue *q, struct bio *bio) 1594static void blk_queue_bio(struct request_queue *q, struct bio *bio)
1580{ 1595{
1581 const bool sync = !!(bio->bi_rw & REQ_SYNC); 1596 const bool sync = !!(bio->bi_rw & REQ_SYNC);
1582 struct blk_plug *plug; 1597 struct blk_plug *plug;
@@ -1607,7 +1622,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
1607 * any locks. 1622 * any locks.
1608 */ 1623 */
1609 if (!blk_queue_nomerges(q) && 1624 if (!blk_queue_nomerges(q) &&
1610 blk_attempt_plug_merge(q, bio, &request_count)) 1625 blk_attempt_plug_merge(q, bio, &request_count, NULL))
1611 return; 1626 return;
1612 1627
1613 spin_lock_irq(q->queue_lock); 1628 spin_lock_irq(q->queue_lock);
@@ -1684,7 +1699,6 @@ out_unlock:
1684 spin_unlock_irq(q->queue_lock); 1699 spin_unlock_irq(q->queue_lock);
1685 } 1700 }
1686} 1701}
1687EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */
1688 1702
1689/* 1703/*
1690 * If bio->bi_dev is a partition, remap the location 1704 * If bio->bi_dev is a partition, remap the location
@@ -1715,8 +1729,6 @@ static void handle_bad_sector(struct bio *bio)
1715 bio->bi_rw, 1729 bio->bi_rw,
1716 (unsigned long long)bio_end_sector(bio), 1730 (unsigned long long)bio_end_sector(bio),
1717 (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); 1731 (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
1718
1719 set_bit(BIO_EOF, &bio->bi_flags);
1720} 1732}
1721 1733
1722#ifdef CONFIG_FAIL_MAKE_REQUEST 1734#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -2901,95 +2913,22 @@ int blk_lld_busy(struct request_queue *q)
2901} 2913}
2902EXPORT_SYMBOL_GPL(blk_lld_busy); 2914EXPORT_SYMBOL_GPL(blk_lld_busy);
2903 2915
2904/** 2916void blk_rq_prep_clone(struct request *dst, struct request *src)
2905 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
2906 * @rq: the clone request to be cleaned up
2907 *
2908 * Description:
2909 * Free all bios in @rq for a cloned request.
2910 */
2911void blk_rq_unprep_clone(struct request *rq)
2912{
2913 struct bio *bio;
2914
2915 while ((bio = rq->bio) != NULL) {
2916 rq->bio = bio->bi_next;
2917
2918 bio_put(bio);
2919 }
2920}
2921EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
2922
2923/*
2924 * Copy attributes of the original request to the clone request.
2925 * The actual data parts (e.g. ->cmd, ->sense) are not copied.
2926 */
2927static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2928{ 2917{
2929 dst->cpu = src->cpu; 2918 dst->cpu = src->cpu;
2930 dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; 2919 dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK);
2920 dst->cmd_flags |= REQ_NOMERGE | REQ_CLONE;
2931 dst->cmd_type = src->cmd_type; 2921 dst->cmd_type = src->cmd_type;
2932 dst->__sector = blk_rq_pos(src); 2922 dst->__sector = blk_rq_pos(src);
2933 dst->__data_len = blk_rq_bytes(src); 2923 dst->__data_len = blk_rq_bytes(src);
2934 dst->nr_phys_segments = src->nr_phys_segments; 2924 dst->nr_phys_segments = src->nr_phys_segments;
2935 dst->ioprio = src->ioprio; 2925 dst->ioprio = src->ioprio;
2936 dst->extra_len = src->extra_len; 2926 dst->extra_len = src->extra_len;
2937} 2927 dst->bio = src->bio;
2938 2928 dst->biotail = src->biotail;
2939/** 2929 dst->cmd = src->cmd;
2940 * blk_rq_prep_clone - Helper function to setup clone request 2930 dst->cmd_len = src->cmd_len;
2941 * @rq: the request to be setup 2931 dst->sense = src->sense;
2942 * @rq_src: original request to be cloned
2943 * @bs: bio_set that bios for clone are allocated from
2944 * @gfp_mask: memory allocation mask for bio
2945 * @bio_ctr: setup function to be called for each clone bio.
2946 * Returns %0 for success, non %0 for failure.
2947 * @data: private data to be passed to @bio_ctr
2948 *
2949 * Description:
2950 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
2951 * The actual data parts of @rq_src (e.g. ->cmd, ->sense)
2952 * are not copied, and copying such parts is the caller's responsibility.
2953 * Also, pages which the original bios are pointing to are not copied
2954 * and the cloned bios just point same pages.
2955 * So cloned bios must be completed before original bios, which means
2956 * the caller must complete @rq before @rq_src.
2957 */
2958int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
2959 struct bio_set *bs, gfp_t gfp_mask,
2960 int (*bio_ctr)(struct bio *, struct bio *, void *),
2961 void *data)
2962{
2963 struct bio *bio, *bio_src;
2964
2965 if (!bs)
2966 bs = fs_bio_set;
2967
2968 __rq_for_each_bio(bio_src, rq_src) {
2969 bio = bio_clone_fast(bio_src, gfp_mask, bs);
2970 if (!bio)
2971 goto free_and_out;
2972
2973 if (bio_ctr && bio_ctr(bio, bio_src, data))
2974 goto free_and_out;
2975
2976 if (rq->bio) {
2977 rq->biotail->bi_next = bio;
2978 rq->biotail = bio;
2979 } else
2980 rq->bio = rq->biotail = bio;
2981 }
2982
2983 __blk_rq_prep_clone(rq, rq_src);
2984
2985 return 0;
2986
2987free_and_out:
2988 if (bio)
2989 bio_put(bio);
2990 blk_rq_unprep_clone(rq);
2991
2992 return -ENOMEM;
2993} 2932}
2994EXPORT_SYMBOL_GPL(blk_rq_prep_clone); 2933EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
2995 2934
@@ -3031,21 +2970,20 @@ void blk_start_plug(struct blk_plug *plug)
3031{ 2970{
3032 struct task_struct *tsk = current; 2971 struct task_struct *tsk = current;
3033 2972
2973 /*
2974 * If this is a nested plug, don't actually assign it.
2975 */
2976 if (tsk->plug)
2977 return;
2978
3034 INIT_LIST_HEAD(&plug->list); 2979 INIT_LIST_HEAD(&plug->list);
3035 INIT_LIST_HEAD(&plug->mq_list); 2980 INIT_LIST_HEAD(&plug->mq_list);
3036 INIT_LIST_HEAD(&plug->cb_list); 2981 INIT_LIST_HEAD(&plug->cb_list);
3037
3038 /* 2982 /*
3039 * If this is a nested plug, don't actually assign it. It will be 2983 * Store ordering should not be needed here, since a potential
3040 * flushed on its own. 2984 * preempt will imply a full memory barrier
3041 */ 2985 */
3042 if (!tsk->plug) { 2986 tsk->plug = plug;
3043 /*
3044 * Store ordering should not be needed here, since a potential
3045 * preempt will imply a full memory barrier
3046 */
3047 tsk->plug = plug;
3048 }
3049} 2987}
3050EXPORT_SYMBOL(blk_start_plug); 2988EXPORT_SYMBOL(blk_start_plug);
3051 2989
@@ -3192,10 +3130,11 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3192 3130
3193void blk_finish_plug(struct blk_plug *plug) 3131void blk_finish_plug(struct blk_plug *plug)
3194{ 3132{
3133 if (plug != current->plug)
3134 return;
3195 blk_flush_plug_list(plug, false); 3135 blk_flush_plug_list(plug, false);
3196 3136
3197 if (plug == current->plug) 3137 current->plug = NULL;
3198 current->plug = NULL;
3199} 3138}
3200EXPORT_SYMBOL(blk_finish_plug); 3139EXPORT_SYMBOL(blk_finish_plug);
3201 3140
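
The blk_start_plug()/blk_finish_plug() rework above moves the nesting check to the front: a nested start no longer initializes anything, and a finish that does not match current->plug is ignored, so only the outermost plug is ever installed and flushed. A toy model of that pairing logic, with an ordinary global standing in for current->plug and no request flushing, is sketched below:

/* Minimal model (not kernel code) of nested plug start/finish pairing. */
#include <stdio.h>

struct plug { int id; };
static struct plug *current_plug;	/* stands in for current->plug */

static void start_plug(struct plug *p)
{
	if (current_plug)		/* nested: leave the outer plug alone */
		return;
	current_plug = p;
}

static void finish_plug(struct plug *p)
{
	if (p != current_plug)		/* not the installed (outer) plug */
		return;
	/* a real finish would flush the plugged requests here */
	current_plug = NULL;
}

int main(void)
{
	struct plug outer = { 1 }, inner = { 2 };

	start_plug(&outer);
	start_plug(&inner);		/* ignored */
	finish_plug(&inner);		/* ignored */
	printf("installed after inner finish: %d\n",
	       current_plug ? current_plug->id : 0);	/* still 1 */
	finish_plug(&outer);
	printf("installed after outer finish: %d\n",
	       current_plug ? current_plug->id : 0);	/* 0 */
	return 0;
}
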
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 9924725fa50d..3fec8a29d0fa 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -53,7 +53,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
53 rq_end_io_fn *done) 53 rq_end_io_fn *done)
54{ 54{
55 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 55 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
56 bool is_pm_resume;
57 56
58 WARN_ON(irqs_disabled()); 57 WARN_ON(irqs_disabled());
59 WARN_ON(rq->cmd_type == REQ_TYPE_FS); 58 WARN_ON(rq->cmd_type == REQ_TYPE_FS);
@@ -70,12 +69,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
70 return; 69 return;
71 } 70 }
72 71
73 /*
74 * need to check this before __blk_run_queue(), because rq can
75 * be freed before that returns.
76 */
77 is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME;
78
79 spin_lock_irq(q->queue_lock); 72 spin_lock_irq(q->queue_lock);
80 73
81 if (unlikely(blk_queue_dying(q))) { 74 if (unlikely(blk_queue_dying(q))) {
@@ -88,9 +81,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
88 81
89 __elv_add_request(q, rq, where); 82 __elv_add_request(q, rq, where);
90 __blk_run_queue(q); 83 __blk_run_queue(q);
91 /* the queue is stopped so it won't be run */
92 if (is_pm_resume)
93 __blk_run_queue_uncond(q);
94 spin_unlock_irq(q->queue_lock); 84 spin_unlock_irq(q->queue_lock);
95} 85}
96EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 86EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index fd3fee81c23c..30a0d9f89017 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -589,7 +589,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
589 !blk_write_same_mergeable(rq->bio, bio)) 589 !blk_write_same_mergeable(rq->bio, bio))
590 return false; 590 return false;
591 591
592 if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) { 592 /* Only check gaps if the bio carries data */
593 if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && bio_has_data(bio)) {
593 struct bio_vec *bprev; 594 struct bio_vec *bprev;
594 595
595 bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1]; 596 bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1];
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 5f13f4d0bcce..1e28ddb656b8 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -24,7 +24,7 @@ static int get_first_sibling(unsigned int cpu)
24{ 24{
25 unsigned int ret; 25 unsigned int ret;
26 26
27 ret = cpumask_first(topology_thread_cpumask(cpu)); 27 ret = cpumask_first(topology_sibling_cpumask(cpu));
28 if (ret < nr_cpu_ids) 28 if (ret < nr_cpu_ids)
29 return ret; 29 return ret;
30 30
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index be3290cc0644..9b6e28830b82 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -438,6 +438,39 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx,
438 } 438 }
439} 439}
440 440
441static void bt_tags_for_each(struct blk_mq_tags *tags,
442 struct blk_mq_bitmap_tags *bt, unsigned int off,
443 busy_tag_iter_fn *fn, void *data, bool reserved)
444{
445 struct request *rq;
446 int bit, i;
447
448 if (!tags->rqs)
449 return;
450 for (i = 0; i < bt->map_nr; i++) {
451 struct blk_align_bitmap *bm = &bt->map[i];
452
453 for (bit = find_first_bit(&bm->word, bm->depth);
454 bit < bm->depth;
455 bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
456 rq = blk_mq_tag_to_rq(tags, off + bit);
457 fn(rq, data, reserved);
458 }
459
460 off += (1 << bt->bits_per_word);
461 }
462}
463
464void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
465 void *priv)
466{
467 if (tags->nr_reserved_tags)
468 bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
469 bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
470 false);
471}
472EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
473
441void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, 474void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
442 void *priv) 475 void *priv)
443{ 476{
@@ -580,6 +613,11 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
580 if (!tags) 613 if (!tags)
581 return NULL; 614 return NULL;
582 615
616 if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) {
617 kfree(tags);
618 return NULL;
619 }
620
583 tags->nr_tags = total_tags; 621 tags->nr_tags = total_tags;
584 tags->nr_reserved_tags = reserved_tags; 622 tags->nr_reserved_tags = reserved_tags;
585 623
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 90767b370308..75893a34237d 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -44,6 +44,7 @@ struct blk_mq_tags {
44 struct list_head page_list; 44 struct list_head page_list;
45 45
46 int alloc_policy; 46 int alloc_policy;
47 cpumask_var_t cpumask;
47}; 48};
48 49
49 50
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ade8a2d1b0aa..f53779692c77 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
89 return -EBUSY; 89 return -EBUSY;
90 90
91 ret = wait_event_interruptible(q->mq_freeze_wq, 91 ret = wait_event_interruptible(q->mq_freeze_wq,
92 !q->mq_freeze_depth || blk_queue_dying(q)); 92 !atomic_read(&q->mq_freeze_depth) ||
93 blk_queue_dying(q));
93 if (blk_queue_dying(q)) 94 if (blk_queue_dying(q))
94 return -ENODEV; 95 return -ENODEV;
95 if (ret) 96 if (ret)
@@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
112 113
113void blk_mq_freeze_queue_start(struct request_queue *q) 114void blk_mq_freeze_queue_start(struct request_queue *q)
114{ 115{
115 bool freeze; 116 int freeze_depth;
116 117
117 spin_lock_irq(q->queue_lock); 118 freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
118 freeze = !q->mq_freeze_depth++; 119 if (freeze_depth == 1) {
119 spin_unlock_irq(q->queue_lock);
120
121 if (freeze) {
122 percpu_ref_kill(&q->mq_usage_counter); 120 percpu_ref_kill(&q->mq_usage_counter);
123 blk_mq_run_hw_queues(q, false); 121 blk_mq_run_hw_queues(q, false);
124 } 122 }
@@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
143 141
144void blk_mq_unfreeze_queue(struct request_queue *q) 142void blk_mq_unfreeze_queue(struct request_queue *q)
145{ 143{
146 bool wake; 144 int freeze_depth;
147 145
148 spin_lock_irq(q->queue_lock); 146 freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
149 wake = !--q->mq_freeze_depth; 147 WARN_ON_ONCE(freeze_depth < 0);
150 WARN_ON_ONCE(q->mq_freeze_depth < 0); 148 if (!freeze_depth) {
151 spin_unlock_irq(q->queue_lock);
152 if (wake) {
153 percpu_ref_reinit(&q->mq_usage_counter); 149 percpu_ref_reinit(&q->mq_usage_counter);
154 wake_up_all(&q->mq_freeze_wq); 150 wake_up_all(&q->mq_freeze_wq);
155 } 151 }
@@ -677,8 +673,11 @@ static void blk_mq_rq_timer(unsigned long priv)
677 data.next = blk_rq_timeout(round_jiffies_up(data.next)); 673 data.next = blk_rq_timeout(round_jiffies_up(data.next));
678 mod_timer(&q->timeout, data.next); 674 mod_timer(&q->timeout, data.next);
679 } else { 675 } else {
680 queue_for_each_hw_ctx(q, hctx, i) 676 queue_for_each_hw_ctx(q, hctx, i) {
681 blk_mq_tag_idle(hctx); 677 /* the hctx may be unmapped, so check it here */
678 if (blk_mq_hw_queue_mapped(hctx))
679 blk_mq_tag_idle(hctx);
680 }
682 } 681 }
683} 682}
684 683
@@ -855,6 +854,16 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
855 spin_lock(&hctx->lock); 854 spin_lock(&hctx->lock);
856 list_splice(&rq_list, &hctx->dispatch); 855 list_splice(&rq_list, &hctx->dispatch);
857 spin_unlock(&hctx->lock); 856 spin_unlock(&hctx->lock);
857 /*
858 * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
859 * it's possible the queue is stopped and restarted again
860 * before this. Queue restart will dispatch requests. And since
861 * requests in rq_list aren't added into hctx->dispatch yet,
862 * the requests in rq_list might get lost.
863 *
864 * blk_mq_run_hw_queue() already checks the STOPPED bit
865 **/
866 blk_mq_run_hw_queue(hctx, true);
858 } 867 }
859} 868}
860 869
@@ -1224,6 +1233,38 @@ static struct request *blk_mq_map_request(struct request_queue *q,
1224 return rq; 1233 return rq;
1225} 1234}
1226 1235
1236static int blk_mq_direct_issue_request(struct request *rq)
1237{
1238 int ret;
1239 struct request_queue *q = rq->q;
1240 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q,
1241 rq->mq_ctx->cpu);
1242 struct blk_mq_queue_data bd = {
1243 .rq = rq,
1244 .list = NULL,
1245 .last = 1
1246 };
1247
1248 /*
1249 * For OK queue, we are done. For error, kill it. Any other
1250 * error (busy), just add it to our list as we previously
1251 * would have done
1252 */
1253 ret = q->mq_ops->queue_rq(hctx, &bd);
1254 if (ret == BLK_MQ_RQ_QUEUE_OK)
1255 return 0;
1256 else {
1257 __blk_mq_requeue_request(rq);
1258
1259 if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1260 rq->errors = -EIO;
1261 blk_mq_end_request(rq, rq->errors);
1262 return 0;
1263 }
1264 return -1;
1265 }
1266}
1267
1227/* 1268/*
1228 * Multiple hardware queue variant. This will not use per-process plugs, 1269 * Multiple hardware queue variant. This will not use per-process plugs,
1229 * but will attempt to bypass the hctx queueing if we can go straight to 1270 * but will attempt to bypass the hctx queueing if we can go straight to
@@ -1235,6 +1276,9 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1235 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 1276 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1236 struct blk_map_ctx data; 1277 struct blk_map_ctx data;
1237 struct request *rq; 1278 struct request *rq;
1279 unsigned int request_count = 0;
1280 struct blk_plug *plug;
1281 struct request *same_queue_rq = NULL;
1238 1282
1239 blk_queue_bounce(q, &bio); 1283 blk_queue_bounce(q, &bio);
1240 1284
@@ -1243,6 +1287,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1243 return; 1287 return;
1244 } 1288 }
1245 1289
1290 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1291 blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1292 return;
1293
1246 rq = blk_mq_map_request(q, bio, &data); 1294 rq = blk_mq_map_request(q, bio, &data);
1247 if (unlikely(!rq)) 1295 if (unlikely(!rq))
1248 return; 1296 return;
@@ -1253,38 +1301,42 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1253 goto run_queue; 1301 goto run_queue;
1254 } 1302 }
1255 1303
1304 plug = current->plug;
1256 /* 1305 /*
1257 * If the driver supports defer issued based on 'last', then 1306 * If the driver supports defer issued based on 'last', then
1258 * queue it up like normal since we can potentially save some 1307 * queue it up like normal since we can potentially save some
1259 * CPU this way. 1308 * CPU this way.
1260 */ 1309 */
1261 if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { 1310 if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
1262 struct blk_mq_queue_data bd = { 1311 !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
1263 .rq = rq, 1312 struct request *old_rq = NULL;
1264 .list = NULL,
1265 .last = 1
1266 };
1267 int ret;
1268 1313
1269 blk_mq_bio_to_request(rq, bio); 1314 blk_mq_bio_to_request(rq, bio);
1270 1315
1271 /* 1316 /*
1272 * For OK queue, we are done. For error, kill it. Any other 1317 * we do limited pluging. If bio can be merged, do merge.
1273 * error (busy), just add it to our list as we previously 1318 * Otherwise the existing request in the plug list will be
1274 * would have done 1319 * issued. So the plug list will have one request at most
1275 */ 1320 */
1276 ret = q->mq_ops->queue_rq(data.hctx, &bd); 1321 if (plug) {
1277 if (ret == BLK_MQ_RQ_QUEUE_OK) 1322 /*
1278 goto done; 1323 * The plug list might get flushed before this. If that
1279 else { 1324 * happens, same_queue_rq is invalid and plug list is empty
1280 __blk_mq_requeue_request(rq); 1325 **/
1281 1326 if (same_queue_rq && !list_empty(&plug->mq_list)) {
1282 if (ret == BLK_MQ_RQ_QUEUE_ERROR) { 1327 old_rq = same_queue_rq;
1283 rq->errors = -EIO; 1328 list_del_init(&old_rq->queuelist);
1284 blk_mq_end_request(rq, rq->errors);
1285 goto done;
1286 } 1329 }
1287 } 1330 list_add_tail(&rq->queuelist, &plug->mq_list);
1331 } else /* is_sync */
1332 old_rq = rq;
1333 blk_mq_put_ctx(data.ctx);
1334 if (!old_rq)
1335 return;
1336 if (!blk_mq_direct_issue_request(old_rq))
1337 return;
1338 blk_mq_insert_request(old_rq, false, true, true);
1339 return;
1288 } 1340 }
1289 1341
1290 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1342 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1297,7 +1349,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1297run_queue: 1349run_queue:
1298 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); 1350 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1299 } 1351 }
1300done:
1301 blk_mq_put_ctx(data.ctx); 1352 blk_mq_put_ctx(data.ctx);
1302} 1353}
1303 1354
@@ -1309,16 +1360,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1309{ 1360{
1310 const int is_sync = rw_is_sync(bio->bi_rw); 1361 const int is_sync = rw_is_sync(bio->bi_rw);
1311 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 1362 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1312 unsigned int use_plug, request_count = 0; 1363 struct blk_plug *plug;
1364 unsigned int request_count = 0;
1313 struct blk_map_ctx data; 1365 struct blk_map_ctx data;
1314 struct request *rq; 1366 struct request *rq;
1315 1367
1316 /*
1317 * If we have multiple hardware queues, just go directly to
1318 * one of those for sync IO.
1319 */
1320 use_plug = !is_flush_fua && !is_sync;
1321
1322 blk_queue_bounce(q, &bio); 1368 blk_queue_bounce(q, &bio);
1323 1369
1324 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1370 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
@@ -1326,8 +1372,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1326 return; 1372 return;
1327 } 1373 }
1328 1374
1329 if (use_plug && !blk_queue_nomerges(q) && 1375 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1330 blk_attempt_plug_merge(q, bio, &request_count)) 1376 blk_attempt_plug_merge(q, bio, &request_count, NULL))
1331 return; 1377 return;
1332 1378
1333 rq = blk_mq_map_request(q, bio, &data); 1379 rq = blk_mq_map_request(q, bio, &data);
@@ -1345,21 +1391,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1345 * utilize that to temporarily store requests until the task is 1391 * utilize that to temporarily store requests until the task is
1346 * either done or scheduled away. 1392 * either done or scheduled away.
1347 */ 1393 */
1348 if (use_plug) { 1394 plug = current->plug;
1349 struct blk_plug *plug = current->plug; 1395 if (plug) {
1350 1396 blk_mq_bio_to_request(rq, bio);
1351 if (plug) { 1397 if (list_empty(&plug->mq_list))
1352 blk_mq_bio_to_request(rq, bio); 1398 trace_block_plug(q);
1353 if (list_empty(&plug->mq_list)) 1399 else if (request_count >= BLK_MAX_REQUEST_COUNT) {
1354 trace_block_plug(q); 1400 blk_flush_plug_list(plug, false);
1355 else if (request_count >= BLK_MAX_REQUEST_COUNT) { 1401 trace_block_plug(q);
1356 blk_flush_plug_list(plug, false);
1357 trace_block_plug(q);
1358 }
1359 list_add_tail(&rq->queuelist, &plug->mq_list);
1360 blk_mq_put_ctx(data.ctx);
1361 return;
1362 } 1402 }
1403 list_add_tail(&rq->queuelist, &plug->mq_list);
1404 blk_mq_put_ctx(data.ctx);
1405 return;
1363 } 1406 }
1364 1407
1365 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1408 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1495,7 +1538,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1495 i++; 1538 i++;
1496 } 1539 }
1497 } 1540 }
1498
1499 return tags; 1541 return tags;
1500 1542
1501fail: 1543fail:
@@ -1571,22 +1613,6 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
1571 return NOTIFY_OK; 1613 return NOTIFY_OK;
1572} 1614}
1573 1615
1574static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
1575{
1576 struct request_queue *q = hctx->queue;
1577 struct blk_mq_tag_set *set = q->tag_set;
1578
1579 if (set->tags[hctx->queue_num])
1580 return NOTIFY_OK;
1581
1582 set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
1583 if (!set->tags[hctx->queue_num])
1584 return NOTIFY_STOP;
1585
1586 hctx->tags = set->tags[hctx->queue_num];
1587 return NOTIFY_OK;
1588}
1589
1590static int blk_mq_hctx_notify(void *data, unsigned long action, 1616static int blk_mq_hctx_notify(void *data, unsigned long action,
1591 unsigned int cpu) 1617 unsigned int cpu)
1592{ 1618{
@@ -1594,12 +1620,16 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
1594 1620
1595 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) 1621 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1596 return blk_mq_hctx_cpu_offline(hctx, cpu); 1622 return blk_mq_hctx_cpu_offline(hctx, cpu);
1597 else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 1623
1598 return blk_mq_hctx_cpu_online(hctx, cpu); 1624 /*
1625 * In case of CPU online, tags may be reallocated
1626 * in blk_mq_map_swqueue() after mapping is updated.
1627 */
1599 1628
1600 return NOTIFY_OK; 1629 return NOTIFY_OK;
1601} 1630}
1602 1631
1632/* hctx->ctxs will be freed in queue's release handler */
1603static void blk_mq_exit_hctx(struct request_queue *q, 1633static void blk_mq_exit_hctx(struct request_queue *q,
1604 struct blk_mq_tag_set *set, 1634 struct blk_mq_tag_set *set,
1605 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 1635 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
@@ -1618,7 +1648,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
1618 1648
1619 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1649 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1620 blk_free_flush_queue(hctx->fq); 1650 blk_free_flush_queue(hctx->fq);
1621 kfree(hctx->ctxs);
1622 blk_mq_free_bitmap(&hctx->ctx_map); 1651 blk_mq_free_bitmap(&hctx->ctx_map);
1623} 1652}
1624 1653
@@ -1775,6 +1804,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
1775 unsigned int i; 1804 unsigned int i;
1776 struct blk_mq_hw_ctx *hctx; 1805 struct blk_mq_hw_ctx *hctx;
1777 struct blk_mq_ctx *ctx; 1806 struct blk_mq_ctx *ctx;
1807 struct blk_mq_tag_set *set = q->tag_set;
1778 1808
1779 queue_for_each_hw_ctx(q, hctx, i) { 1809 queue_for_each_hw_ctx(q, hctx, i) {
1780 cpumask_clear(hctx->cpumask); 1810 cpumask_clear(hctx->cpumask);
@@ -1791,6 +1821,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
1791 1821
1792 hctx = q->mq_ops->map_queue(q, i); 1822 hctx = q->mq_ops->map_queue(q, i);
1793 cpumask_set_cpu(i, hctx->cpumask); 1823 cpumask_set_cpu(i, hctx->cpumask);
1824 cpumask_set_cpu(i, hctx->tags->cpumask);
1794 ctx->index_hw = hctx->nr_ctx; 1825 ctx->index_hw = hctx->nr_ctx;
1795 hctx->ctxs[hctx->nr_ctx++] = ctx; 1826 hctx->ctxs[hctx->nr_ctx++] = ctx;
1796 } 1827 }
@@ -1803,16 +1834,20 @@ static void blk_mq_map_swqueue(struct request_queue *q)
1803 * disable it and free the request entries. 1834 * disable it and free the request entries.
1804 */ 1835 */
1805 if (!hctx->nr_ctx) { 1836 if (!hctx->nr_ctx) {
1806 struct blk_mq_tag_set *set = q->tag_set;
1807
1808 if (set->tags[i]) { 1837 if (set->tags[i]) {
1809 blk_mq_free_rq_map(set, set->tags[i], i); 1838 blk_mq_free_rq_map(set, set->tags[i], i);
1810 set->tags[i] = NULL; 1839 set->tags[i] = NULL;
1811 hctx->tags = NULL;
1812 } 1840 }
1841 hctx->tags = NULL;
1813 continue; 1842 continue;
1814 } 1843 }
1815 1844
1845 /* unmapped hw queue can be remapped after CPU topo changed */
1846 if (!set->tags[i])
1847 set->tags[i] = blk_mq_init_rq_map(set, i);
1848 hctx->tags = set->tags[i];
1849 WARN_ON(!hctx->tags);
1850
1816 /* 1851 /*
1817 * Set the map size to the number of mapped software queues. 1852 * Set the map size to the number of mapped software queues.
1818 * This is more accurate and more efficient than looping 1853 * This is more accurate and more efficient than looping
@@ -1886,8 +1921,12 @@ void blk_mq_release(struct request_queue *q)
1886 unsigned int i; 1921 unsigned int i;
1887 1922
1888 /* hctx kobj stays in hctx */ 1923 /* hctx kobj stays in hctx */
1889 queue_for_each_hw_ctx(q, hctx, i) 1924 queue_for_each_hw_ctx(q, hctx, i) {
1925 if (!hctx)
1926 continue;
1927 kfree(hctx->ctxs);
1890 kfree(hctx); 1928 kfree(hctx);
1929 }
1891 1930
1892 kfree(q->queue_hw_ctx); 1931 kfree(q->queue_hw_ctx);
1893 1932
@@ -2047,7 +2086,7 @@ void blk_mq_free_queue(struct request_queue *q)
2047/* Basically redo blk_mq_init_queue with queue frozen */ 2086/* Basically redo blk_mq_init_queue with queue frozen */
2048static void blk_mq_queue_reinit(struct request_queue *q) 2087static void blk_mq_queue_reinit(struct request_queue *q)
2049{ 2088{
2050 WARN_ON_ONCE(!q->mq_freeze_depth); 2089 WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2051 2090
2052 blk_mq_sysfs_unregister(q); 2091 blk_mq_sysfs_unregister(q);
2053 2092
@@ -2090,9 +2129,16 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
2090 */ 2129 */
2091 list_for_each_entry(q, &all_q_list, all_q_node) 2130 list_for_each_entry(q, &all_q_list, all_q_node)
2092 blk_mq_freeze_queue_start(q); 2131 blk_mq_freeze_queue_start(q);
2093 list_for_each_entry(q, &all_q_list, all_q_node) 2132 list_for_each_entry(q, &all_q_list, all_q_node) {
2094 blk_mq_freeze_queue_wait(q); 2133 blk_mq_freeze_queue_wait(q);
2095 2134
2135 /*
2136 * timeout handler can't touch hw queue during the
2137 * reinitialization
2138 */
2139 del_timer_sync(&q->timeout);
2140 }
2141
2096 list_for_each_entry(q, &all_q_list, all_q_node) 2142 list_for_each_entry(q, &all_q_list, all_q_node)
2097 blk_mq_queue_reinit(q); 2143 blk_mq_queue_reinit(q);
2098 2144
@@ -2157,6 +2203,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2157 return 0; 2203 return 0;
2158} 2204}
2159 2205
2206struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
2207{
2208 return tags->cpumask;
2209}
2210EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
2211
2160/* 2212/*
2161 * Alloc a tag set to be associated with one or more request queues. 2213 * Alloc a tag set to be associated with one or more request queues.
2162 * May fail with EINVAL for various error conditions. May adjust the 2214 * May fail with EINVAL for various error conditions. May adjust the
@@ -2218,8 +2270,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2218 int i; 2270 int i;
2219 2271
2220 for (i = 0; i < set->nr_hw_queues; i++) { 2272 for (i = 0; i < set->nr_hw_queues; i++) {
2221 if (set->tags[i]) 2273 if (set->tags[i]) {
2222 blk_mq_free_rq_map(set, set->tags[i], i); 2274 blk_mq_free_rq_map(set, set->tags[i], i);
2275 free_cpumask_var(set->tags[i]->cpumask);
2276 }
2223 } 2277 }
2224 2278
2225 kfree(set->tags); 2279 kfree(set->tags);
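
The freeze/unfreeze hunks above drop the queue_lock around mq_freeze_depth and turn it into an atomic counter, so only the 0 -> 1 and 1 -> 0 transitions do real work (killing or reinitializing the percpu usage counter). A small C11-atomics sketch of that reference-count style, with printf placeholders for the real work, is shown below:

/* Sketch (not kernel code) of lockless freeze-depth counting. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int freeze_depth;

static void freeze_start(void)
{
	if (atomic_fetch_add(&freeze_depth, 1) + 1 == 1)
		printf("first freezer: kill usage counter, run queues\n");
}

static void unfreeze(void)
{
	int depth = atomic_fetch_sub(&freeze_depth, 1) - 1;

	if (depth < 0)
		printf("WARN: unbalanced unfreeze\n");
	else if (depth == 0)
		printf("last unfreeze: reinit usage counter, wake waiters\n");
}

int main(void)
{
	freeze_start();
	freeze_start();		/* nested freeze: no extra work */
	unfreeze();
	unfreeze();		/* last one wakes the waiters */
	return 0;
}

Because the counter itself needs no lock, blk_mq_queue_reinit() above can simply assert that the depth is non-zero with atomic_read() instead of taking queue_lock.
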
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index faaf36ade7eb..2b8fd302f677 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -522,8 +522,6 @@ static void blk_release_queue(struct kobject *kobj)
522 522
523 blk_trace_shutdown(q); 523 blk_trace_shutdown(q);
524 524
525 bdi_destroy(&q->backing_dev_info);
526
527 ida_simple_remove(&blk_queue_ida, q->id); 525 ida_simple_remove(&blk_queue_ida, q->id);
528 call_rcu(&q->rcu_head, blk_free_queue_rcu); 526 call_rcu(&q->rcu_head, blk_free_queue_rcu);
529} 527}
diff --git a/block/blk.h b/block/blk.h
index 43b036185712..026d9594142b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -78,7 +78,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
78bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 78bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
79 struct bio *bio); 79 struct bio *bio);
80bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 80bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
81 unsigned int *request_count); 81 unsigned int *request_count,
82 struct request **same_queue_rq);
82 83
83void blk_account_io_start(struct request *req, bool new_io); 84void blk_account_io_start(struct request *req, bool new_io);
84void blk_account_io_completion(struct request *req, unsigned int bytes); 85void blk_account_io_completion(struct request *req, unsigned int bytes);
@@ -193,8 +194,6 @@ int blk_try_merge(struct request *rq, struct bio *bio);
193 194
194void blk_queue_congestion_threshold(struct request_queue *q); 195void blk_queue_congestion_threshold(struct request_queue *q);
195 196
196void __blk_run_queue_uncond(struct request_queue *q);
197
198int blk_dev_init(void); 197int blk_dev_init(void);
199 198
200 199
diff --git a/block/bounce.c b/block/bounce.c
index ab21ba203d5c..3ab0bce1c947 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -128,9 +128,6 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
128 struct bio_vec *bvec, *org_vec; 128 struct bio_vec *bvec, *org_vec;
129 int i; 129 int i;
130 130
131 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
132 set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
133
134 /* 131 /*
135 * free up bounce indirect pages used 132 * free up bounce indirect pages used
136 */ 133 */
@@ -221,8 +218,8 @@ bounce:
221 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) 218 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
222 continue; 219 continue;
223 220
224 inc_zone_page_state(to->bv_page, NR_BOUNCE);
225 to->bv_page = mempool_alloc(pool, q->bounce_gfp); 221 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
222 inc_zone_page_state(to->bv_page, NR_BOUNCE);
226 223
227 if (rw == WRITE) { 224 if (rw == WRITE) {
228 char *vto, *vfrom; 225 char *vto, *vfrom;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5da8e6e9ab4b..d8ad45ccd8fa 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -67,6 +67,11 @@ static struct kmem_cache *cfq_pool;
67#define sample_valid(samples) ((samples) > 80) 67#define sample_valid(samples) ((samples) > 80)
68#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 68#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
69 69
70/* blkio-related constants */
71#define CFQ_WEIGHT_MIN 10
72#define CFQ_WEIGHT_MAX 1000
73#define CFQ_WEIGHT_DEFAULT 500
74
70struct cfq_ttime { 75struct cfq_ttime {
71 unsigned long last_end_request; 76 unsigned long last_end_request;
72 77
@@ -212,6 +217,15 @@ struct cfqg_stats {
212#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 217#endif /* CONFIG_CFQ_GROUP_IOSCHED */
213}; 218};
214 219
220/* Per-cgroup data */
221struct cfq_group_data {
222 /* must be the first member */
223 struct blkcg_policy_data pd;
224
225 unsigned int weight;
226 unsigned int leaf_weight;
227};
228
215/* This is per cgroup per device grouping structure */ 229/* This is per cgroup per device grouping structure */
216struct cfq_group { 230struct cfq_group {
217 /* must be the first member */ 231 /* must be the first member */
@@ -446,16 +460,6 @@ CFQ_CFQQ_FNS(deep);
446CFQ_CFQQ_FNS(wait_busy); 460CFQ_CFQQ_FNS(wait_busy);
447#undef CFQ_CFQQ_FNS 461#undef CFQ_CFQQ_FNS
448 462
449static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
450{
451 return pd ? container_of(pd, struct cfq_group, pd) : NULL;
452}
453
454static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
455{
456 return pd_to_blkg(&cfqg->pd);
457}
458
459#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) 463#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
460 464
461/* cfqg stats flags */ 465/* cfqg stats flags */
@@ -600,6 +604,22 @@ static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
600 604
601#ifdef CONFIG_CFQ_GROUP_IOSCHED 605#ifdef CONFIG_CFQ_GROUP_IOSCHED
602 606
607static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
608{
609 return pd ? container_of(pd, struct cfq_group, pd) : NULL;
610}
611
612static struct cfq_group_data
613*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
614{
615 return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
616}
617
618static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
619{
620 return pd_to_blkg(&cfqg->pd);
621}
622
603static struct blkcg_policy blkcg_policy_cfq; 623static struct blkcg_policy blkcg_policy_cfq;
604 624
605static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) 625static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
@@ -607,6 +627,11 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
607 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); 627 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
608} 628}
609 629
630static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
631{
632 return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
633}
634
610static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) 635static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
611{ 636{
612 struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; 637 struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
@@ -1544,13 +1569,28 @@ static void cfqg_stats_init(struct cfqg_stats *stats)
1544#endif 1569#endif
1545} 1570}
1546 1571
1572static void cfq_cpd_init(const struct blkcg *blkcg)
1573{
1574 struct cfq_group_data *cgd =
1575 cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
1576
1577 if (blkcg == &blkcg_root) {
1578 cgd->weight = 2 * CFQ_WEIGHT_DEFAULT;
1579 cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
1580 } else {
1581 cgd->weight = CFQ_WEIGHT_DEFAULT;
1582 cgd->leaf_weight = CFQ_WEIGHT_DEFAULT;
1583 }
1584}
1585
1547static void cfq_pd_init(struct blkcg_gq *blkg) 1586static void cfq_pd_init(struct blkcg_gq *blkg)
1548{ 1587{
1549 struct cfq_group *cfqg = blkg_to_cfqg(blkg); 1588 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1589 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);
1550 1590
1551 cfq_init_cfqg_base(cfqg); 1591 cfq_init_cfqg_base(cfqg);
1552 cfqg->weight = blkg->blkcg->cfq_weight; 1592 cfqg->weight = cgd->weight;
1553 cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight; 1593 cfqg->leaf_weight = cgd->leaf_weight;
1554 cfqg_stats_init(&cfqg->stats); 1594 cfqg_stats_init(&cfqg->stats);
1555 cfqg_stats_init(&cfqg->dead_stats); 1595 cfqg_stats_init(&cfqg->dead_stats);
1556} 1596}
@@ -1673,13 +1713,27 @@ static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
1673 1713
1674static int cfq_print_weight(struct seq_file *sf, void *v) 1714static int cfq_print_weight(struct seq_file *sf, void *v)
1675{ 1715{
1676 seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight); 1716 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1717 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1718 unsigned int val = 0;
1719
1720 if (cgd)
1721 val = cgd->weight;
1722
1723 seq_printf(sf, "%u\n", val);
1677 return 0; 1724 return 0;
1678} 1725}
1679 1726
1680static int cfq_print_leaf_weight(struct seq_file *sf, void *v) 1727static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
1681{ 1728{
1682 seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight); 1729 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1730 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1731 unsigned int val = 0;
1732
1733 if (cgd)
1734 val = cgd->leaf_weight;
1735
1736 seq_printf(sf, "%u\n", val);
1683 return 0; 1737 return 0;
1684} 1738}
1685 1739
@@ -1690,6 +1744,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
1690 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1744 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1691 struct blkg_conf_ctx ctx; 1745 struct blkg_conf_ctx ctx;
1692 struct cfq_group *cfqg; 1746 struct cfq_group *cfqg;
1747 struct cfq_group_data *cfqgd;
1693 int ret; 1748 int ret;
1694 1749
1695 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); 1750 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
@@ -1698,17 +1753,22 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
1698 1753
1699 ret = -EINVAL; 1754 ret = -EINVAL;
1700 cfqg = blkg_to_cfqg(ctx.blkg); 1755 cfqg = blkg_to_cfqg(ctx.blkg);
1756 cfqgd = blkcg_to_cfqgd(blkcg);
1757 if (!cfqg || !cfqgd)
1758 goto err;
1759
1701 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { 1760 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
1702 if (!is_leaf_weight) { 1761 if (!is_leaf_weight) {
1703 cfqg->dev_weight = ctx.v; 1762 cfqg->dev_weight = ctx.v;
1704 cfqg->new_weight = ctx.v ?: blkcg->cfq_weight; 1763 cfqg->new_weight = ctx.v ?: cfqgd->weight;
1705 } else { 1764 } else {
1706 cfqg->dev_leaf_weight = ctx.v; 1765 cfqg->dev_leaf_weight = ctx.v;
1707 cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight; 1766 cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight;
1708 } 1767 }
1709 ret = 0; 1768 ret = 0;
1710 } 1769 }
1711 1770
1771err:
1712 blkg_conf_finish(&ctx); 1772 blkg_conf_finish(&ctx);
1713 return ret ?: nbytes; 1773 return ret ?: nbytes;
1714} 1774}
@@ -1730,16 +1790,23 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1730{ 1790{
1731 struct blkcg *blkcg = css_to_blkcg(css); 1791 struct blkcg *blkcg = css_to_blkcg(css);
1732 struct blkcg_gq *blkg; 1792 struct blkcg_gq *blkg;
1793 struct cfq_group_data *cfqgd;
1794 int ret = 0;
1733 1795
1734 if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) 1796 if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
1735 return -EINVAL; 1797 return -EINVAL;
1736 1798
1737 spin_lock_irq(&blkcg->lock); 1799 spin_lock_irq(&blkcg->lock);
1800 cfqgd = blkcg_to_cfqgd(blkcg);
1801 if (!cfqgd) {
1802 ret = -EINVAL;
1803 goto out;
1804 }
1738 1805
1739 if (!is_leaf_weight) 1806 if (!is_leaf_weight)
1740 blkcg->cfq_weight = val; 1807 cfqgd->weight = val;
1741 else 1808 else
1742 blkcg->cfq_leaf_weight = val; 1809 cfqgd->leaf_weight = val;
1743 1810
1744 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { 1811 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
1745 struct cfq_group *cfqg = blkg_to_cfqg(blkg); 1812 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
@@ -1749,15 +1816,16 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1749 1816
1750 if (!is_leaf_weight) { 1817 if (!is_leaf_weight) {
1751 if (!cfqg->dev_weight) 1818 if (!cfqg->dev_weight)
1752 cfqg->new_weight = blkcg->cfq_weight; 1819 cfqg->new_weight = cfqgd->weight;
1753 } else { 1820 } else {
1754 if (!cfqg->dev_leaf_weight) 1821 if (!cfqg->dev_leaf_weight)
1755 cfqg->new_leaf_weight = blkcg->cfq_leaf_weight; 1822 cfqg->new_leaf_weight = cfqgd->leaf_weight;
1756 } 1823 }
1757 } 1824 }
1758 1825
1826out:
1759 spin_unlock_irq(&blkcg->lock); 1827 spin_unlock_irq(&blkcg->lock);
1760 return 0; 1828 return ret;
1761} 1829}
1762 1830
1763static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, 1831static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -4477,6 +4545,18 @@ out_free:
4477 return ret; 4545 return ret;
4478} 4546}
4479 4547
4548static void cfq_registered_queue(struct request_queue *q)
4549{
4550 struct elevator_queue *e = q->elevator;
4551 struct cfq_data *cfqd = e->elevator_data;
4552
4553 /*
4554 * Default to IOPS mode with no idling for SSDs
4555 */
4556 if (blk_queue_nonrot(q))
4557 cfqd->cfq_slice_idle = 0;
4558}
4559
4480/* 4560/*
4481 * sysfs parts below --> 4561 * sysfs parts below -->
4482 */ 4562 */
@@ -4592,6 +4672,7 @@ static struct elevator_type iosched_cfq = {
4592 .elevator_may_queue_fn = cfq_may_queue, 4672 .elevator_may_queue_fn = cfq_may_queue,
4593 .elevator_init_fn = cfq_init_queue, 4673 .elevator_init_fn = cfq_init_queue,
4594 .elevator_exit_fn = cfq_exit_queue, 4674 .elevator_exit_fn = cfq_exit_queue,
4675 .elevator_registered_fn = cfq_registered_queue,
4595 }, 4676 },
4596 .icq_size = sizeof(struct cfq_io_cq), 4677 .icq_size = sizeof(struct cfq_io_cq),
4597 .icq_align = __alignof__(struct cfq_io_cq), 4678 .icq_align = __alignof__(struct cfq_io_cq),
@@ -4603,8 +4684,10 @@ static struct elevator_type iosched_cfq = {
4603#ifdef CONFIG_CFQ_GROUP_IOSCHED 4684#ifdef CONFIG_CFQ_GROUP_IOSCHED
4604static struct blkcg_policy blkcg_policy_cfq = { 4685static struct blkcg_policy blkcg_policy_cfq = {
4605 .pd_size = sizeof(struct cfq_group), 4686 .pd_size = sizeof(struct cfq_group),
4687 .cpd_size = sizeof(struct cfq_group_data),
4606 .cftypes = cfq_blkcg_files, 4688 .cftypes = cfq_blkcg_files,
4607 4689
4690 .cpd_init_fn = cfq_cpd_init,
4608 .pd_init_fn = cfq_pd_init, 4691 .pd_init_fn = cfq_pd_init,
4609 .pd_offline_fn = cfq_pd_offline, 4692 .pd_offline_fn = cfq_pd_offline,
4610 .pd_reset_stats_fn = cfq_pd_reset_stats, 4693 .pd_reset_stats_fn = cfq_pd_reset_stats,
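
Taken together, the cfq-iosched.c hunks move CFQ's per-cgroup weight and leaf_weight out of struct blkcg and into a policy-private struct cfq_group_data, which the blkcg core allocates via .cpd_size and initializes via .cpd_init_fn; the cpd_to_cfqgd()/blkcg_to_cfqgd() helpers then recover that wrapper with container_of(). Below is a stand-alone sketch of the embed-and-recover pattern using pared-down stand-ins for the kernel types, not the real definitions:

#include <stddef.h>
#include <stdio.h>

/* User-space stand-in for the kernel's container_of() macro. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct blkcg_policy_data {		/* generic per-cgroup policy data */
	int plid;
};

struct cfq_group_data {			/* policy-specific wrapper */
	struct blkcg_policy_data pd;	/* must be the first member */
	unsigned int weight;
	unsigned int leaf_weight;
};

static struct cfq_group_data *cpd_to_cfqgd(struct blkcg_policy_data *cpd)
{
	return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
}

int main(void)
{
	struct cfq_group_data cgd = { .weight = 500, .leaf_weight = 500 };
	struct blkcg_policy_data *cpd = &cgd.pd;

	/* Recover the wrapper from the embedded generic struct. */
	printf("weight=%u leaf_weight=%u\n",
	       cpd_to_cfqgd(cpd)->weight, cpd_to_cfqgd(cpd)->leaf_weight);
	return 0;
}
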
diff --git a/block/elevator.c b/block/elevator.c
index 59794d0d38e3..942579d04128 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -157,7 +157,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
157 157
158 eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node); 158 eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
159 if (unlikely(!eq)) 159 if (unlikely(!eq))
160 goto err; 160 return NULL;
161 161
162 eq->type = e; 162 eq->type = e;
163 kobject_init(&eq->kobj, &elv_ktype); 163 kobject_init(&eq->kobj, &elv_ktype);
@@ -165,10 +165,6 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
165 hash_init(eq->hash); 165 hash_init(eq->hash);
166 166
167 return eq; 167 return eq;
168err:
169 kfree(eq);
170 elevator_put(e);
171 return NULL;
172} 168}
173EXPORT_SYMBOL(elevator_alloc); 169EXPORT_SYMBOL(elevator_alloc);
174 170
@@ -810,6 +806,8 @@ int elv_register_queue(struct request_queue *q)
810 } 806 }
811 kobject_uevent(&e->kobj, KOBJ_ADD); 807 kobject_uevent(&e->kobj, KOBJ_ADD);
812 e->registered = 1; 808 e->registered = 1;
809 if (e->type->ops.elevator_registered_fn)
810 e->type->ops.elevator_registered_fn(q);
813 } 811 }
814 return error; 812 return error;
815} 813}
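
The elevator.c hunks simplify the allocation-failure path in elevator_alloc() and make elv_register_queue() invoke the new, optional elevator_registered_fn hook, which is what lets CFQ switch a freshly registered non-rotational queue into IOPS mode. A compilable user-space sketch of that optional-callback pattern, with purely illustrative types and names:

#include <stdio.h>

struct mock_queue {
	int nonrot;			/* 1 if the device is non-rotational */
};

struct elv_ops_sketch {
	void (*elevator_registered_fn)(struct mock_queue *q);
};

/* Stand-in for cfq_registered_queue(): disable idling on SSDs. */
static void cfq_registered_sketch(struct mock_queue *q)
{
	if (q->nonrot)
		printf("non-rotational queue: idling disabled (IOPS mode)\n");
}

/* Stand-in for elv_register_queue(): the hook is optional. */
static void register_elevator_sketch(struct mock_queue *q,
				     const struct elv_ops_sketch *ops)
{
	/* ... kobject/sysfs registration would happen here ... */
	if (ops->elevator_registered_fn)
		ops->elevator_registered_fn(q);
}

int main(void)
{
	struct mock_queue q = { .nonrot = 1 };
	struct elv_ops_sketch ops = {
		.elevator_registered_fn = cfq_registered_sketch,
	};

	register_elevator_sketch(&q, &ops);
	return 0;
}
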
diff --git a/block/genhd.c b/block/genhd.c
index 0a536dc05f3b..ea982eadaf63 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -422,9 +422,9 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
422 /* allocate ext devt */ 422 /* allocate ext devt */
423 idr_preload(GFP_KERNEL); 423 idr_preload(GFP_KERNEL);
424 424
425 spin_lock(&ext_devt_lock); 425 spin_lock_bh(&ext_devt_lock);
426 idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT); 426 idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT);
427 spin_unlock(&ext_devt_lock); 427 spin_unlock_bh(&ext_devt_lock);
428 428
429 idr_preload_end(); 429 idr_preload_end();
430 if (idx < 0) 430 if (idx < 0)
@@ -449,9 +449,9 @@ void blk_free_devt(dev_t devt)
449 return; 449 return;
450 450
451 if (MAJOR(devt) == BLOCK_EXT_MAJOR) { 451 if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
452 spin_lock(&ext_devt_lock); 452 spin_lock_bh(&ext_devt_lock);
453 idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); 453 idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
454 spin_unlock(&ext_devt_lock); 454 spin_unlock_bh(&ext_devt_lock);
455 } 455 }
456} 456}
457 457
@@ -653,7 +653,6 @@ void del_gendisk(struct gendisk *disk)
653 disk->flags &= ~GENHD_FL_UP; 653 disk->flags &= ~GENHD_FL_UP;
654 654
655 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); 655 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
656 bdi_unregister(&disk->queue->backing_dev_info);
657 blk_unregister_queue(disk); 656 blk_unregister_queue(disk);
658 blk_unregister_region(disk_devt(disk), disk->minors); 657 blk_unregister_region(disk_devt(disk), disk->minors);
659 658
@@ -691,13 +690,13 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
691 } else { 690 } else {
692 struct hd_struct *part; 691 struct hd_struct *part;
693 692
694 spin_lock(&ext_devt_lock); 693 spin_lock_bh(&ext_devt_lock);
695 part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); 694 part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
696 if (part && get_disk(part_to_disk(part))) { 695 if (part && get_disk(part_to_disk(part))) {
697 *partno = part->partno; 696 *partno = part->partno;
698 disk = part_to_disk(part); 697 disk = part_to_disk(part);
699 } 698 }
700 spin_unlock(&ext_devt_lock); 699 spin_unlock_bh(&ext_devt_lock);
701 } 700 }
702 701
703 return disk; 702 return disk;
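
The genhd.c hunks switch every ext_devt_lock acquisition to the _bh spinlock variants, presumably because the devt IDR can also be reached from softirq context; disabling local bottom halves while the lock is held keeps a softirq on the same CPU from spinning forever against a process-context holder. A kernel-style fragment of that pattern around an IDR, illustrative only and not a buildable module on its own:

static DEFINE_SPINLOCK(example_devt_lock);
static DEFINE_IDR(example_devt_idr);

/*
 * Safe to call from process context even if lookups also happen in
 * softirq context: local bottom halves stay disabled while the lock
 * is held, so a softirq cannot preempt the holder and deadlock.
 */
static void *example_devt_lookup(unsigned long id)
{
	void *part;

	spin_lock_bh(&example_devt_lock);
	part = idr_find(&example_devt_idr, id);
	spin_unlock_bh(&example_devt_lock);

	return part;
}
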
diff --git a/block/ioctl.c b/block/ioctl.c
index 7d8befde2aca..8061eba42887 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -150,21 +150,48 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
150 } 150 }
151} 151}
152 152
153static int blkdev_reread_part(struct block_device *bdev) 153/*
154	 * This is an exported API for block drivers; it does not	 154	 * This is an exported API for block drivers; it does not
155	 * acquire bd_mutex. Use it only when the caller already	 155	 * acquire bd_mutex. Use it only when the caller already
156	 * holds bd_mutex.	 156	 * holds bd_mutex.
157 */
158int __blkdev_reread_part(struct block_device *bdev)
154{ 159{
155 struct gendisk *disk = bdev->bd_disk; 160 struct gendisk *disk = bdev->bd_disk;
156 int res;
157 161
158 if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) 162 if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains)
159 return -EINVAL; 163 return -EINVAL;
160 if (!capable(CAP_SYS_ADMIN)) 164 if (!capable(CAP_SYS_ADMIN))
161 return -EACCES; 165 return -EACCES;
162 if (!mutex_trylock(&bdev->bd_mutex)) 166
163 return -EBUSY; 167 lockdep_assert_held(&bdev->bd_mutex);
164 res = rescan_partitions(disk, bdev); 168
169 return rescan_partitions(disk, bdev);
170}
171EXPORT_SYMBOL(__blkdev_reread_part);
172
173/*
174	 * This is an exported API for block drivers; it acquires	 174	 * This is an exported API for block drivers; it acquires
175	 * bd_mutex itself. If bd_mutex is already held in the current	 175	 * bd_mutex itself. If bd_mutex is already held in the current
176	 * context, call __blkdev_reread_part() instead.	 176	 * context, call __blkdev_reread_part() instead.
177	 *	 177	 *
178	 * Make sure no lock held by the current context is also needed	 178	 * Make sure no lock held by the current context is also needed
179	 * in the open()/close() handlers or in the I/O path, otherwise	 179	 * in the open()/close() handlers or in the I/O path, otherwise
180	 * an ABBA deadlock is possible:	 180	 * an ABBA deadlock is possible:
181	 * - bd_mutex is held before calling the block driver's	 181	 * - bd_mutex is held before calling the block driver's
182	 *   open/close handlers	 182	 *   open/close handlers
183 */
184int blkdev_reread_part(struct block_device *bdev)
185{
186 int res;
187
188 mutex_lock(&bdev->bd_mutex);
189 res = __blkdev_reread_part(bdev);
165 mutex_unlock(&bdev->bd_mutex); 190 mutex_unlock(&bdev->bd_mutex);
191
166 return res; 192 return res;
167} 193}
194EXPORT_SYMBOL(blkdev_reread_part);
168 195
169static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, 196static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
170 uint64_t len, int secure) 197 uint64_t len, int secure)
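
blkdev_reread_part() is split above into a wrapper that takes bd_mutex and a __blkdev_reread_part() core that requires the caller to already hold it (documented with lockdep_assert_held()), so drivers that hold bd_mutex can rescan partitions without the old mutex_trylock() dance. A small user-space sketch of that locked/unlocked API pairing, with purely illustrative names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dev_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Core helper: the caller must already hold dev_mutex (the kernel
 * version documents and checks this with lockdep_assert_held()).
 */
static int __reread_part_sketch(void)
{
	printf("rescanning partition table\n");
	return 0;
}

/*
 * Convenience wrapper: takes dev_mutex itself, so it must not be
 * called from a context that already holds it.
 */
static int reread_part_sketch(void)
{
	int res;

	pthread_mutex_lock(&dev_mutex);
	res = __reread_part_sketch();
	pthread_mutex_unlock(&dev_mutex);

	return res;
}

int main(void)
{
	return reread_part_sketch();
}
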