15 files changed, 416 insertions, 290 deletions
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index d8147b336c35..6518a55273e7 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -38,6 +38,13 @@ read or write requests. Note that the total allocated number may be twice
 this amount, since it applies only to reads or writes (not the accumulated
 sum).
+To avoid priority inversion through request starvation, a request
+queue maintains a separate request pool for each cgroup when
+CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such
+per-block-cgroup request pool.  IOW, if there are N block cgroups,
+each request queue may have up to N request pools, each independently
+regulated by nr_requests.
 read_ahead_kb (RW)
 ------------------
 Maximum number of kilobytes to read-ahead for filesystems on this block
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e7dee617358e..f3b44a65fc7a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -31,27 +31,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
-struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
-{
-        return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
-                            struct blkcg, css);
-}
-EXPORT_SYMBOL_GPL(cgroup_to_blkcg);
-static struct blkcg *task_blkcg(struct task_struct *tsk)
-{
-        return container_of(task_subsys_state(tsk, blkio_subsys_id),
-                            struct blkcg, css);
-}
-struct blkcg *bio_blkcg(struct bio *bio)
-{
-        if (bio && bio->bi_css)
-                return container_of(bio->bi_css, struct blkcg, css);
-        return task_blkcg(current);
-}
-EXPORT_SYMBOL_GPL(bio_blkcg);
 static bool blkcg_policy_enabled(struct request_queue *q,
                                 const struct blkcg_policy *pol)
 {
@@ -84,6 +63,7 @@ static void blkg_free(struct blkcg_gq *blkg)
                kfree(pd);
        }
+        blk_exit_rl(&blkg->rl);
        kfree(blkg);
 }
@@ -91,16 +71,18 @@ static void blkg_free(struct blkcg_gq *blkg)
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
+ * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg assocating @blkcg and @q.
 */
-static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
+static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
+                                   gfp_t gfp_mask)
 {
        struct blkcg_gq *blkg;
        int i;
        /* alloc and init base part */
-        blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
+        blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
        if (!blkg)
                return NULL;
@@ -109,6 +91,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
        blkg->blkcg = blkcg;
        blkg->refcnt = 1;
+        /* root blkg uses @q->root_rl, init rl only for !root blkgs */
+        if (blkcg != &blkcg_root) {
+                if (blk_init_rl(&blkg->rl, q, gfp_mask))
+                        goto err_free;
+                blkg->rl.blkg = blkg;
+        }
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkg_policy_data *pd;
@@ -117,11 +106,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
                        continue;
                /* alloc per-policy data and attach it to blkg */
-                pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node);
+                pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
-                if (!pd) {
+                if (!pd)
-                        blkg_free(blkg);
+                        goto err_free;
-                        return NULL;
-                }
                blkg->pd[i] = pd;
                pd->blkg = blkg;
@@ -132,6 +119,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
        }
        return blkg;
+err_free:
+        blkg_free(blkg);
+        return NULL;
 }
 static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
@@ -175,9 +166,13 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blkg_lookup);
+/*
+ * If @new_blkg is %NULL, this function tries to allocate a new one as
+ * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
+ */
 static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
-                                             struct request_queue *q)
+                                             struct request_queue *q,
-        __releases(q->queue_lock) __acquires(q->queue_lock)
+                                             struct blkcg_gq *new_blkg)
 {
        struct blkcg_gq *blkg;
        int ret;
@@ -189,24 +184,26 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
        blkg = __blkg_lookup(blkcg, q);
        if (blkg) {
                rcu_assign_pointer(blkcg->blkg_hint, blkg);
-                return blkg;
+                goto out_free;
        }
        /* blkg holds a reference to blkcg */
-        if (!css_tryget(&blkcg->css))
+        if (!css_tryget(&blkcg->css)) {
-                return ERR_PTR(-EINVAL);
+                blkg = ERR_PTR(-EINVAL);
+                goto out_free;
+        }
        /* allocate */
-        ret = -ENOMEM;
+        if (!new_blkg) {
-        blkg = blkg_alloc(blkcg, q);
+                new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
-        if (unlikely(!blkg))
+                if (unlikely(!new_blkg)) {
-                goto err_put;
+                        blkg = ERR_PTR(-ENOMEM);
+                        goto out_put;
+                }
+        }
+        blkg = new_blkg;
        /* insert */
-        ret = radix_tree_preload(GFP_ATOMIC);
-        if (ret)
-                goto err_free;
        spin_lock(&blkcg->lock);
        ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
        if (likely(!ret)) {
@@ -215,15 +212,15 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
        }
        spin_unlock(&blkcg->lock);
-        radix_tree_preload_end();
        if (!ret)
                return blkg;
-err_free:
-        blkg_free(blkg);
+        blkg = ERR_PTR(ret);
-err_put:
+out_put:
        css_put(&blkcg->css);
-        return ERR_PTR(ret);
+out_free:
+        blkg_free(new_blkg);
+        return blkg;
 }
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
@@ -235,7 +232,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
         */
        if (unlikely(blk_queue_bypass(q)))
                return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
-        return __blkg_lookup_create(blkcg, q);
+        return __blkg_lookup_create(blkcg, q, NULL);
 }
 EXPORT_SYMBOL_GPL(blkg_lookup_create);
@@ -313,6 +310,38 @@ void __blkg_release(struct blkcg_gq *blkg)
 }
 EXPORT_SYMBOL_GPL(__blkg_release);
+/*
+ * The next function used by blk_queue_for_each_rl().  It's a bit tricky
+ * because the root blkg uses @q->root_rl instead of its own rl.
+ *
+ * @rl is the request_list the iteration is currently at and @q is the
+ * request_queue being walked.  Returns the request_list embedded in the
+ * next non-root blkg on @q->blkg_list, or %NULL once the walk arrives
+ * back at the list head.
+ */
+struct request_list *__blk_queue_next_rl(struct request_list *rl,
+                                         struct request_queue *q)
+{
+        struct list_head *ent;
+        struct blkcg_gq *blkg;
+        /*
+         * Determine the current blkg list_head.  The first entry is
+         * root_rl which is off @q->blkg_list and mapped to the head.
+         */
+        if (rl == &q->root_rl) {
+                ent = &q->blkg_list;
+        } else {
+                blkg = container_of(rl, struct blkcg_gq, rl);
+                ent = &blkg->q_node;
+        }
+        /* walk to the next list_head, skip root blkcg */
+        ent = ent->next;
+        if (ent == &q->root_blkg->q_node)
+                ent = ent->next;
+        if (ent == &q->blkg_list)
+                return NULL;    /* back at the list head: iteration is over */
+        blkg = container_of(ent, struct blkcg_gq, q_node);
+        return &blkg->rl;
+}
 static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
                             u64 val)
 {
@@ -734,24 +763,36 @@ int blkcg_activate_policy(struct request_queue *q,
        struct blkcg_gq *blkg;
        struct blkg_policy_data *pd, *n;
        int cnt = 0, ret;
+        bool preloaded;
        if (blkcg_policy_enabled(q, pol))
                return 0;
+        /* preallocations for root blkg */
+        blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
+        if (!blkg)
+                return -ENOMEM;
+        preloaded = !radix_tree_preload(GFP_KERNEL);
        blk_queue_bypass_start(q);
        /* make sure the root blkg exists and count the existing blkgs */
        spin_lock_irq(q->queue_lock);
        rcu_read_lock();
-        blkg = __blkg_lookup_create(&blkcg_root, q);
+        blkg = __blkg_lookup_create(&blkcg_root, q, blkg);
        rcu_read_unlock();
+        if (preloaded)
+                radix_tree_preload_end();
        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
                goto out_unlock;
        }
        q->root_blkg = blkg;
+        q->root_rl.blkg = blkg;
        list_for_each_entry(blkg, &q->blkg_list, q_node)
                cnt++;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 8ac457ce7783..24597309e23d 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -17,6 +17,7 @@
 #include <linux/u64_stats_sync.h>
 #include <linux/seq_file.h>
 #include <linux/radix-tree.h>
+#include <linux/blkdev.h>
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX         UINT_MAX
@@ -93,6 +94,8 @@ struct blkcg_gq {
        struct list_head                q_node;
        struct hlist_node               blkcg_node;
        struct blkcg                    *blkcg;
+        /* request allocation list for this blkcg-q pair */
+        struct request_list             rl;
        /* reference count */
        int                             refcnt;
@@ -120,8 +123,6 @@ struct blkcg_policy {
 extern struct blkcg blkcg_root;
-struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup);
-struct blkcg *bio_blkcg(struct bio *bio);
 struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                                    struct request_queue *q);
@@ -160,6 +161,25 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 void blkg_conf_finish(struct blkg_conf_ctx *ctx);
+static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
+{
+        return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
+                            struct blkcg, css);
+}
+static inline struct blkcg *task_blkcg(struct task_struct *tsk)
+{
+        return container_of(task_subsys_state(tsk, blkio_subsys_id),
+                            struct blkcg, css);
+}
+static inline struct blkcg *bio_blkcg(struct bio *bio)
+{
+        if (bio && bio->bi_css)         /* @bio has a css attached, use it */
+                return container_of(bio->bi_css, struct blkcg, css);
+        return task_blkcg(current);     /* else use the current task's blkcg */
+}
 /**
 * blkg_to_pdata - get policy private data
 * @blkg: blkg of interest
@@ -234,6 +254,95 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 }
 /**
+ * blk_get_rl - get request_list to use
+ * @q: request_queue of interest
+ * @bio: bio which will be attached to the allocated request (may be %NULL)
+ *
+ * The caller wants to allocate a request from @q to use for @bio.  Find
+ * the request_list to use and obtain a reference on it.  Should be called
+ * under queue_lock.  This function is guaranteed to return non-%NULL
+ * request_list.
+ *
+ * When the blkcg resolved from @bio is the root blkcg, or when
+ * blkg_lookup_create() fails, @q->root_rl is returned and no blkg
+ * reference is taken.  Otherwise the blkg embedding the returned
+ * request_list is pinned with blkg_get().
+ */
+static inline struct request_list *blk_get_rl(struct request_queue *q,
+                                              struct bio *bio)
+{
+        struct blkcg *blkcg;
+        struct blkcg_gq *blkg;
+        rcu_read_lock();
+        blkcg = bio_blkcg(bio);
+        /* bypass blkg lookup and use @q->root_rl directly for root */
+        if (blkcg == &blkcg_root)
+                goto root_rl;
+        /*
+         * Try to use blkg->rl.  blkg lookup may fail under memory pressure
+         * or if either the blkcg or queue is going away.  Fall back to
+         * root_rl in such cases.
+         */
+        blkg = blkg_lookup_create(blkcg, q);
+        if (unlikely(IS_ERR(blkg)))
+                goto root_rl;
+        blkg_get(blkg);
+        rcu_read_unlock();
+        return &blkg->rl;
+root_rl:
+        rcu_read_unlock();
+        return &q->root_rl;
+}
+/**
+ * blk_put_rl - put request_list
+ * @rl: request_list to put
+ *
+ * Put the reference acquired by blk_get_rl().  Should be called under
+ * queue_lock.
+ *
+ * Nothing is dropped when @rl has no blkg set or when its blkg belongs to
+ * the root blkcg.
+ */
+static inline void blk_put_rl(struct request_list *rl)
+{
+        /* root_rl may not have blkg set */
+        if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+                blkg_put(rl->blkg);
+}
+/**
+ * blk_rq_set_rl - associate a request with a request_list
+ * @rq: request of interest
+ * @rl: target request_list
+ *
+ * Associate @rq with @rl so that accounting and freeing can know the
+ * request_list @rq came from.
+ */
+static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
+{
+        rq->rl = rl;
+}
+/**
+ * blk_rq_rl - return the request_list a request came from
+ * @rq: request of interest
+ *
+ * Return the request_list @rq is allocated from.
+ */
+static inline struct request_list *blk_rq_rl(struct request *rq)
+{
+        return rq->rl;
+}
+struct request_list *__blk_queue_next_rl(struct request_list *rl,
+                                         struct request_queue *q);
+/**
+ * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
+ * @rl: request_list pointer used as the iteration cursor
+ * @q: request_queue whose request_lists are walked
+ *
+ * Starts at @q->root_rl and advances with __blk_queue_next_rl() until it
+ * returns %NULL.  Should be used under queue_lock.
+ */
+#define blk_queue_for_each_rl(rl, q)    \
+        for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
+/**
 * blkg_stat_add - add a value to a blkg_stat
 * @stat: target blkg_stat
 * @val: value to add
@@ -351,6 +460,7 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 #else   /* CONFIG_BLK_CGROUP */
 struct cgroup;
+struct blkcg;
 struct blkg_policy_data {
 };
@@ -361,8 +471,6 @@ struct blkcg_gq {
 struct blkcg_policy {
 };
-static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
-static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
 static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
 static inline void blkcg_drain_queue(struct request_queue *q) { }
@@ -374,6 +482,9 @@ static inline int blkcg_activate_policy(struct request_queue *q,
 static inline void blkcg_deactivate_policy(struct request_queue *q,
                                           const struct blkcg_policy *pol) { }
+static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
+static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
 static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
                                                  struct blkcg_policy *pol) { return NULL; }
 static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
@@ -381,5 +492,14 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
+static inline struct request_list *blk_get_rl(struct request_queue *q,
+                                              struct bio *bio) { return &q->root_rl; }
+static inline void blk_put_rl(struct request_list *rl) { }
+static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
+static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
+#define blk_queue_for_each_rl(rl, q)    \
+        for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
 #endif  /* CONFIG_BLK_CGROUP */
 #endif  /* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 93eb3e4f88ce..dd134d834d58 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -387,7 +387,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
                if (!list_empty(&q->queue_head) && q->request_fn)
                        __blk_run_queue(q);
-                drain |= q->rq.elvpriv;
+                drain |= q->nr_rqs_elvpriv;
                /*
                 * Unfortunately, requests are queued at and tracked from
@@ -397,7 +397,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
                if (drain_all) {
                        drain |= !list_empty(&q->queue_head);
                        for (i = 0; i < 2; i++) {
-                                drain |= q->rq.count[i];
+                                drain |= q->nr_rqs[i];
                                drain |= q->in_flight[i];
                                drain |= !list_empty(&q->flush_queue[i]);
                        }
@@ -416,9 +416,14 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
         * left with hung waiters. We need to wake up those waiters.
         */
        if (q->request_fn) {
+                struct request_list *rl;
                spin_lock_irq(q->queue_lock);
-                for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++)
-                        wake_up_all(&q->rq.wait[i]);
+                blk_queue_for_each_rl(rl, q)
+                        for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
+                                wake_up_all(&rl->wait[i]);
                spin_unlock_irq(q->queue_lock);
        }
 }
@@ -517,28 +522,33 @@ void blk_cleanup_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_cleanup_queue);
-static int blk_init_free_list(struct request_queue *q)
+int blk_init_rl(struct request_list *rl, struct request_queue *q,
+                gfp_t gfp_mask)
 {
-        struct request_list *rl = &q->rq;
        if (unlikely(rl->rq_pool))
                return 0;
+        rl->q = q;
        rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
        rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
-        rl->elvpriv = 0;
        init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
        init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
        rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
-                                mempool_free_slab, request_cachep, q->node);
+                                          mempool_free_slab, request_cachep,
+                                          gfp_mask, q->node);
        if (!rl->rq_pool)
                return -ENOMEM;
        return 0;
 }
+void blk_exit_rl(struct request_list *rl)
+{
+        if (rl->rq_pool)        /* @rl may never have been set up by blk_init_rl() */
+                mempool_destroy(rl->rq_pool);
+}
 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
 {
        return blk_alloc_queue_node(gfp_mask, -1);
@@ -680,7 +690,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
        if (!q)
                return NULL;
-        if (blk_init_free_list(q))
+        if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
                return NULL;
        q->request_fn           = rfn;
@@ -722,15 +732,15 @@ bool blk_get_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_get_queue);
-static inline void blk_free_request(struct request_queue *q, struct request *rq)
+static inline void blk_free_request(struct request_list *rl, struct request *rq)
 {
        if (rq->cmd_flags & REQ_ELVPRIV) {
-                elv_put_request(q, rq);
+                elv_put_request(rl->q, rq);
                if (rq->elv.icq)
                        put_io_context(rq->elv.icq->ioc);
        }
-        mempool_free(rq, q->rq.rq_pool);
+        mempool_free(rq, rl->rq_pool);
 }
 /*
@@ -767,18 +777,23 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
        ioc->last_waited = jiffies;
 }
-static void __freed_request(struct request_queue *q, int sync)
+static void __freed_request(struct request_list *rl, int sync)
 {
-        struct request_list *rl = &q->rq;
+        struct request_queue *q = rl->q;
-        if (rl->count[sync] < queue_congestion_off_threshold(q))
+        /*
+         * bdi isn't aware of blkcg yet.  As all async IOs end up in the
+         * root blkcg anyway, just use the root blkcg state.
+         */
+        if (rl == &q->root_rl &&
+            rl->count[sync] < queue_congestion_off_threshold(q))
                blk_clear_queue_congested(q, sync);
        if (rl->count[sync] + 1 <= q->nr_requests) {
                if (waitqueue_active(&rl->wait[sync]))
                        wake_up(&rl->wait[sync]);
-                blk_clear_queue_full(q, sync);
+                blk_clear_rl_full(rl, sync);
        }
 }
@@ -786,19 +801,20 @@ static void __freed_request(struct request_queue *q, int sync)
 * A request has just been released.  Account for it, update the full and
 * congestion status, wake up any waiters.   Called under q->queue_lock.
 */
-static void freed_request(struct request_queue *q, unsigned int flags)
+static void freed_request(struct request_list *rl, unsigned int flags)
 {
-        struct request_list *rl = &q->rq;
+        struct request_queue *q = rl->q;
        int sync = rw_is_sync(flags);
+        q->nr_rqs[sync]--;
        rl->count[sync]--;
        if (flags & REQ_ELVPRIV)
-                rl->elvpriv--;
+                q->nr_rqs_elvpriv--;
-        __freed_request(q, sync);
+        __freed_request(rl, sync);
        if (unlikely(rl->starved[sync ^ 1]))
-                __freed_request(q, sync ^ 1);
+                __freed_request(rl, sync ^ 1);
 }
 /*
@@ -837,8 +853,8 @@ static struct io_context *rq_ioc(struct bio *bio)
 }
 /**
- * get_request - get a free request
+ * __get_request - get a free request
- * @q: request_queue to allocate request from
+ * @rl: request list to allocate from
 * @rw_flags: RW and SYNC flags
 * @bio: bio to allocate request for (can be %NULL)
 * @gfp_mask: allocation mask
@@ -850,20 +866,16 @@ static struct io_context *rq_ioc(struct bio *bio)
 * Returns %NULL on failure, with @q->queue_lock held.
 * Returns !%NULL on success, with @q->queue_lock *not held*.
 */
-static struct request *get_request(struct request_queue *q, int rw_flags,
+static struct request *__get_request(struct request_list *rl, int rw_flags,
-                                   struct bio *bio, gfp_t gfp_mask)
+                                     struct bio *bio, gfp_t gfp_mask)
 {
+        struct request_queue *q = rl->q;
        struct request *rq;
-        struct request_list *rl = &q->rq;
+        struct elevator_type *et = q->elevator->type;
-        struct elevator_type *et;
+        struct io_context *ioc = rq_ioc(bio);
-        struct io_context *ioc;
        struct io_cq *icq = NULL;
        const bool is_sync = rw_is_sync(rw_flags) != 0;
-        bool retried = false;
        int may_queue;
-retry:
-        et = q->elevator->type;
-        ioc = rq_ioc(bio);
        if (unlikely(blk_queue_dead(q)))
                return NULL;
@@ -875,28 +887,14 @@ retry:
        if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
                if (rl->count[is_sync]+1 >= q->nr_requests) {
                        /*
-                         * We want ioc to record batching state.  If it's
-                         * not already there, creating a new one requires
-                         * dropping queue_lock, which in turn requires
-                         * retesting conditions to avoid queue hang.
-                         */
-                        if (!ioc && !retried) {
-                                spin_unlock_irq(q->queue_lock);
-                                create_io_context(gfp_mask, q->node);
-                                spin_lock_irq(q->queue_lock);
-                                retried = true;
-                                goto retry;
-                        }
-                        /*
                         * The queue will fill after this allocation, so set
                         * it as full, and mark this process as "batching".
                         * This process will be allowed to complete a batch of
                         * requests, others will be blocked.
                         */
-                        if (!blk_queue_full(q, is_sync)) {
+                        if (!blk_rl_full(rl, is_sync)) {
                                ioc_set_batching(q, ioc);
-                                blk_set_queue_full(q, is_sync);
+                                blk_set_rl_full(rl, is_sync);
                        } else {
                                if (may_queue != ELV_MQUEUE_MUST
                                                && !ioc_batching(q, ioc)) {
@@ -909,7 +907,12 @@ retry:
                                }
                        }
                }
-                blk_set_queue_congested(q, is_sync);
+                /*
+                 * bdi isn't aware of blkcg yet.  As all async IOs end up
+                 * in the root blkcg anyway, just use the root blkcg state.
+                 */
+                if (rl == &q->root_rl)
+                        blk_set_queue_congested(q, is_sync);
        }
        /*
@@ -920,6 +923,7 @@ retry:
        if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
                return NULL;
+        q->nr_rqs[is_sync]++;
        rl->count[is_sync]++;
        rl->starved[is_sync] = 0;
@@ -935,7 +939,7 @@ retry:
         */
        if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
                rw_flags |= REQ_ELVPRIV;
-                rl->elvpriv++;
+                q->nr_rqs_elvpriv++;
                if (et->icq_cache && ioc)
                        icq = ioc_lookup_icq(ioc, q);
        }
@@ -945,22 +949,19 @@ retry:
        spin_unlock_irq(q->queue_lock);
        /* allocate and init request */
-        rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
+        rq = mempool_alloc(rl->rq_pool, gfp_mask);
        if (!rq)
                goto fail_alloc;
        blk_rq_init(q, rq);
+        blk_rq_set_rl(rq, rl);
        rq->cmd_flags = rw_flags | REQ_ALLOCED;
        /* init elvpriv */
        if (rw_flags & REQ_ELVPRIV) {
                if (unlikely(et->icq_cache && !icq)) {
-                        create_io_context(gfp_mask, q->node);
+                        if (ioc)
-                        ioc = rq_ioc(bio);
+                                icq = ioc_create_icq(ioc, q, gfp_mask);
-                        if (!ioc)
-                                goto fail_elvpriv;
-                        icq = ioc_create_icq(ioc, q, gfp_mask);
                        if (!icq)
                                goto fail_elvpriv;
                }
@@ -1000,7 +1001,7 @@ fail_elvpriv:
        rq->elv.icq = NULL;
        spin_lock_irq(q->queue_lock);
-        rl->elvpriv--;
+        q->nr_rqs_elvpriv--;
        spin_unlock_irq(q->queue_lock);
        goto out;
@@ -1013,7 +1014,7 @@ fail_alloc:
         * queue, but this is pretty rare.
         */
        spin_lock_irq(q->queue_lock);
-        freed_request(q, rw_flags);
+        freed_request(rl, rw_flags);
        /*
         * in the very unlikely event that allocation failed and no
@@ -1029,56 +1030,58 @@ rq_starved:
 }
 /**
- * get_request_wait - get a free request with retry
+ * get_request - get a free request
 * @q: request_queue to allocate request from
 * @rw_flags: RW and SYNC flags
 * @bio: bio to allocate request for (can be %NULL)
+ * @gfp_mask: allocation mask
 *
- * Get a free request from @q.  This function keeps retrying under memory
+ * Get a free request from @q.  If %__GFP_WAIT is set in @gfp_mask, this
- * pressure and fails iff @q is dead.
+ * function keeps retrying under memory pressure and fails iff @q is dead.
 *
 * Must be callled with @q->queue_lock held and,
 * Returns %NULL on failure, with @q->queue_lock held.
 * Returns !%NULL on success, with @q->queue_lock *not held*.
 */
-static struct request *get_request_wait(struct request_queue *q, int rw_flags,
+static struct request *get_request(struct request_queue *q, int rw_flags,
-                                        struct bio *bio)
+                                   struct bio *bio, gfp_t gfp_mask)
 {
        const bool is_sync = rw_is_sync(rw_flags) != 0;
+        DEFINE_WAIT(wait);
+        struct request_list *rl;
        struct request *rq;
-        rq = get_request(q, rw_flags, bio, GFP_NOIO);
+        rl = blk_get_rl(q, bio);        /* transferred to @rq on success */
-        while (!rq) {
+retry:
-                DEFINE_WAIT(wait);
+        rq = __get_request(rl, rw_flags, bio, gfp_mask);
-                struct request_list *rl = &q->rq;
+        if (rq)
+                return rq;
-                if (unlikely(blk_queue_dead(q)))
-                        return NULL;
-                prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
+        if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q))) {
-                                TASK_UNINTERRUPTIBLE);
+                blk_put_rl(rl);
+                return NULL;
+        }
-                trace_block_sleeprq(q, bio, rw_flags & 1);
+        /* wait on @rl and retry */
+        prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
+                                  TASK_UNINTERRUPTIBLE);
-                spin_unlock_irq(q->queue_lock);
+        trace_block_sleeprq(q, bio, rw_flags & 1);
-                io_schedule();
-                /*
+        spin_unlock_irq(q->queue_lock);
-                 * After sleeping, we become a "batching" process and
+        io_schedule();
-                 * will be able to allocate at least one request, and
-                 * up to a big batch of them for a small period time.
-                 * See ioc_batching, ioc_set_batching
-                 */
-                create_io_context(GFP_NOIO, q->node);
-                ioc_set_batching(q, current->io_context);
-                spin_lock_irq(q->queue_lock);
+        /*
-                finish_wait(&rl->wait[is_sync], &wait);
+         * After sleeping, we become a "batching" process and will be able
+         * to allocate at least one request, and up to a big batch of them
+         * for a small period of time.  See ioc_batching, ioc_set_batching
+         */
+        ioc_set_batching(q, current->io_context);
-                rq = get_request(q, rw_flags, bio, GFP_NOIO);
+        spin_lock_irq(q->queue_lock);
-        };
+        finish_wait(&rl->wait[is_sync], &wait);
-        return rq;
+        goto retry;
 }
 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
@@ -1087,11 +1090,11 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
        BUG_ON(rw != READ && rw != WRITE);
+        /* create ioc upfront */
+        create_io_context(gfp_mask, q->node);
        spin_lock_irq(q->queue_lock);
-        if (gfp_mask & __GFP_WAIT)
+        rq = get_request(q, rw, NULL, gfp_mask);
-                rq = get_request_wait(q, rw, NULL);
-        else
-                rq = get_request(q, rw, NULL, gfp_mask);
        if (!rq)
                spin_unlock_irq(q->queue_lock);
        /* q->queue_lock is unlocked at this point */
@@ -1248,12 +1251,14 @@ void __blk_put_request(struct request_queue *q, struct request *req)
         */
        if (req->cmd_flags & REQ_ALLOCED) {
                unsigned int flags = req->cmd_flags;
+                struct request_list *rl = blk_rq_rl(req);
                BUG_ON(!list_empty(&req->queuelist));
                BUG_ON(!hlist_unhashed(&req->hash));
-                blk_free_request(q, req);
+                blk_free_request(rl, req);
-                freed_request(q, flags);
+                freed_request(rl, flags);
+                blk_put_rl(rl);
        }
 }
 EXPORT_SYMBOL_GPL(__blk_put_request);
@@ -1481,7 +1486,7 @@ get_rq:
         * Grab a free request. This is might sleep but can not fail.
         * Returns with the queue unlocked.
         */
-        req = get_request_wait(q, rw_flags, bio);
+        req = get_request(q, rw_flags, bio, GFP_NOIO);
        if (unlikely(!req)) {
                bio_endio(bio, -ENODEV);        /* @q is dead */
                goto out_unlock;
@@ -1702,6 +1707,14 @@ generic_make_request_checks(struct bio *bio)
                goto end_io;
        }
+        /*
+         * Various block parts want %current->io_context and lazy ioc
+         * allocation ends up trading a lot of pain for a small amount of
+         * memory.  Just allocate it upfront.  This may fail, and the
+         * block layer knows how to live with it.
+         */
+        create_io_context(GFP_ATOMIC, q->node);
        if (blk_throtl_bio(q, bio))
                return false;   /* throttled, will be resubmitted later */
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index aa41b47c22d2..9628b291f960 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -40,7 +40,7 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
 static ssize_t
 queue_requests_store(struct request_queue *q, const char *page, size_t count)
 {
-        struct request_list *rl = &q->rq;
+        struct request_list *rl;
        unsigned long nr;
        int ret;
@@ -55,6 +55,9 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
        q->nr_requests = nr;
        blk_queue_congestion_threshold(q);
+        /* congestion isn't cgroup aware and follows root blkcg for now */
+        rl = &q->root_rl;
        if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
                blk_set_queue_congested(q, BLK_RW_SYNC);
        else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
@@ -65,19 +68,22 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
        else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
                blk_clear_queue_congested(q, BLK_RW_ASYNC);
-        if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
+        blk_queue_for_each_rl(rl, q) {
-                blk_set_queue_full(q, BLK_RW_SYNC);
+                if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
-        } else {
+                        blk_set_rl_full(rl, BLK_RW_SYNC);
-                blk_clear_queue_full(q, BLK_RW_SYNC);
+                } else {
-                wake_up(&rl->wait[BLK_RW_SYNC]);
+                        blk_clear_rl_full(rl, BLK_RW_SYNC);
+                        wake_up(&rl->wait[BLK_RW_SYNC]);
+                }
+                if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
+                        blk_set_rl_full(rl, BLK_RW_ASYNC);
+                } else {
+                        blk_clear_rl_full(rl, BLK_RW_ASYNC);
+                        wake_up(&rl->wait[BLK_RW_ASYNC]);
+                }
        }
-        if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
-                blk_set_queue_full(q, BLK_RW_ASYNC);
-        } else {
-                blk_clear_queue_full(q, BLK_RW_ASYNC);
-                wake_up(&rl->wait[BLK_RW_ASYNC]);
-        }
        spin_unlock_irq(q->queue_lock);
        return ret;
 }
@@ -476,7 +482,6 @@ static void blk_release_queue(struct kobject *kobj)
 {
        struct request_queue *q =
                container_of(kobj, struct request_queue, kobj);
-        struct request_list *rl = &q->rq;
        blk_sync_queue(q);
@@ -489,8 +494,7 @@ static void blk_release_queue(struct kobject *kobj)
                elevator_exit(q->elevator);
        }
-        if (rl->rq_pool)
+        blk_exit_rl(&q->root_rl);
-                mempool_destroy(rl->rq_pool);
        if (q->queue_tags)
                __blk_queue_free_tags(q);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5b0659512047..e287c19908c8 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1123,9 +1123,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
                goto out;
        }
-        /* bio_associate_current() needs ioc, try creating */
-        create_io_context(GFP_ATOMIC, q->node);
        /*
         * A throtl_grp pointer retrieved under rcu can be used to access
         * basic fields like stats and io rates. If a group has no rules,
diff --git a/block/blk.h b/block/blk.h
index 85f6ae42f7d3..a134231fd22a 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -18,6 +18,9 @@ static inline void __blk_get_queue(struct request_queue *q)
        kobject_get(&q->kobj);
 }
+int blk_init_rl(struct request_list *rl, struct request_queue *q,
+                gfp_t gfp_mask);
+void blk_exit_rl(struct request_list *rl);
 void init_request_from_bio(struct request *req, struct bio *bio);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
                        struct bio *bio);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 7ad49c88f6b1..deee61fbb741 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -243,56 +243,3 @@ int bsg_setup_queue(struct device *dev, struct request_queue *q,
        return 0;
 }
 EXPORT_SYMBOL_GPL(bsg_setup_queue);
-/**
- * bsg_remove_queue - Deletes the bsg dev from the q
- * @q:  the request_queue that is to be torn down.
- *
- * Notes:
- *   Before unregistering the queue empty any requests that are blocked
- */
-void bsg_remove_queue(struct request_queue *q)
-{
-        struct request *req; /* block request */
-        int counts; /* totals for request_list count and starved */
-        if (!q)
-                return;
-        /* Stop taking in new requests */
-        spin_lock_irq(q->queue_lock);
-        blk_stop_queue(q);
-        /* drain all requests in the queue */
-        while (1) {
-                /* need the lock to fetch a request
-                 * this may fetch the same reqeust as the previous pass
-                 */
-                req = blk_fetch_request(q);
-                /* save requests in use and starved */
-                counts = q->rq.count[0] + q->rq.count[1] +
-                         q->rq.starved[0] + q->rq.starved[1];
-                spin_unlock_irq(q->queue_lock);
-                /* any requests still outstanding? */
-                if (counts == 0)
-                        break;
-                /* This may be the same req as the previous iteration,
-                 * always send the blk_end_request_all after a prefetch.
-                 * It is not okay to not end the request because the
-                 * prefetch started the request.
-                 */
-                if (req) {
-                        /* return -ENXIO to indicate that this queue is
-                         * going away
-                         */
-                        req->errors = -ENXIO;
-                        blk_end_request_all(req, -ENXIO);
-                }
-                msleep(200); /* allow bsg to possibly finish */
-                spin_lock_irq(q->queue_lock);
-        }
-        bsg_unregister_queue(q);
-}
-EXPORT_SYMBOL_GPL(bsg_remove_queue);
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 553f43a90953..8d4afc83e05f 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -191,6 +191,7 @@ static int print_unex = 1;
 #include <linux/mutex.h>
 #include <linux/io.h>
 #include <linux/uaccess.h>
+#include <linux/async.h>
 /*
 * PS/2 floppies have much slower step rates than regular floppies.
@@ -4123,7 +4124,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data)
        return get_disk(disks[drive]);
 }
-static int __init floppy_init(void)
+static int __init do_floppy_init(void)
 {
        int i, unit, drive;
        int err, dr;
@@ -4338,6 +4339,24 @@ out_put_disk:
        return err;
 }
+#ifndef MODULE
+static __init void floppy_async_init(void *data, async_cookie_t cookie)
+{
+        do_floppy_init();       /* result is dropped: this callback returns void */
+}
+#endif /* !MODULE */
+static int __init floppy_init(void)
+{
+#ifdef MODULE
+        return do_floppy_init();
+#else
+        /*
+         * Do not hold up the bootup by the floppy initialization: hand
+         * the work to async_schedule() and report success right away.
+         * On this path the result of do_floppy_init() is not propagated
+         * to the caller, which always sees 0.
+         */
+        async_schedule(floppy_async_init, NULL);
+        return 0;
+#endif /* MODULE */
+}
 static const struct io_region {
        int offset;
        int size;
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 579760420d53..a9617ad05f33 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -4130,45 +4130,7 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
 static void
 fc_bsg_remove(struct request_queue *q)
 {
-        struct request *req; /* block request */
-        int counts; /* totals for request_list count and starved */
        if (q) {
-                /* Stop taking in new requests */
-                spin_lock_irq(q->queue_lock);
-                blk_stop_queue(q);
-                /* drain all requests in the queue */
-                while (1) {
-                        /* need the lock to fetch a request
-                         * this may fetch the same reqeust as the previous pass
-                         */
-                        req = blk_fetch_request(q);
-                        /* save requests in use and starved */
-                        counts = q->rq.count[0] + q->rq.count[1] +
-                                q->rq.starved[0] + q->rq.starved[1];
-                        spin_unlock_irq(q->queue_lock);
-                        /* any requests still outstanding? */
-                        if (counts == 0)
-                                break;
-                        /* This may be the same req as the previous iteration,
-                         * always send the blk_end_request_all after a prefetch.
-                         * It is not okay to not end the request because the
-                         * prefetch started the request.
-                         */
-                        if (req) {
-                                /* return -ENXIO to indicate that this queue is
-                                 * going away
-                                 */
-                                req->errors = -ENXIO;
-                                blk_end_request_all(req, -ENXIO);
-                        }
-                        msleep(200); /* allow bsg to possibly finish */
-                        spin_lock_irq(q->queue_lock);
-                }
                bsg_unregister_queue(q);
                blk_cleanup_queue(q);
        }
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 1cf640e575da..c737a16b0a1d 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -575,7 +575,7 @@ static int iscsi_remove_host(struct transport_container *tc,
        struct iscsi_cls_host *ihost = shost->shost_data;
        if (ihost->bsg_q) {
-                bsg_remove_queue(ihost->bsg_q);
+                bsg_unregister_queue(ihost->bsg_q);
                blk_cleanup_queue(ihost->bsg_q);
        }
        return 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 07954b05b86c..3816ce8a08fc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -46,16 +46,23 @@ struct blkcg_gq;
 struct request;
 typedef void (rq_end_io_fn)(struct request *, int);
+#define BLK_RL_SYNCFULL         (1U << 0)
+#define BLK_RL_ASYNCFULL        (1U << 1)
 struct request_list {
+        struct request_queue    *q;     /* the queue this rl belongs to */
+#ifdef CONFIG_BLK_CGROUP
+        struct blkcg_gq         *blkg;  /* blkg this request pool belongs to */
+#endif
        /*
         * count[], starved[], and wait[] are indexed by
         * BLK_RW_SYNC/BLK_RW_ASYNC
         */
-        int count[2];
+        int                     count[2];
-        int starved[2];
+        int                     starved[2];
-        int elvpriv;
+        mempool_t               *rq_pool;
-        mempool_t *rq_pool;
+        wait_queue_head_t       wait[2];
-        wait_queue_head_t wait[2];
+        unsigned int            flags;
 };
 /*
@@ -138,6 +145,7 @@ struct request {
        struct hd_struct *part;
        unsigned long start_time;
 #ifdef CONFIG_BLK_CGROUP
+        struct request_list *rl;                /* rl this rq is alloced from */
        unsigned long long start_time_ns;
        unsigned long long io_start_time_ns;    /* when passed to hardware */
 #endif
@@ -282,11 +290,16 @@ struct request_queue {
        struct list_head        queue_head;
        struct request          *last_merge;
        struct elevator_queue   *elevator;
+        int                     nr_rqs[2];      /* # allocated [a]sync rqs */
+        int                     nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
        /*
-         * the queue request freelist, one for reads and one for writes
+         * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
+         * is used, root blkg allocates from @q->root_rl and all other
+         * blkgs from their own blkg->rl.  Which one to use should be
+         * determined using bio_request_list().
         */
-        struct request_list     rq;
+        struct request_list     root_rl;
        request_fn_proc         *request_fn;
        make_request_fn         *make_request_fn;
@@ -561,27 +574,25 @@ static inline bool rq_is_sync(struct request *rq)
        return rw_is_sync(rq->cmd_flags);
 }
-static inline int blk_queue_full(struct request_queue *q, int sync)
+static inline bool blk_rl_full(struct request_list *rl, bool sync)
 {
-        if (sync)
+        unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
-                return test_bit(QUEUE_FLAG_SYNCFULL, &q->queue_flags);
-        return test_bit(QUEUE_FLAG_ASYNCFULL, &q->queue_flags);
+        return rl->flags & flag;
 }
-static inline void blk_set_queue_full(struct request_queue *q, int sync)
+static inline void blk_set_rl_full(struct request_list *rl, bool sync)
 {
-        if (sync)
+        unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
-                queue_flag_set(QUEUE_FLAG_SYNCFULL, q);
-        else
+        rl->flags |= flag;
-                queue_flag_set(QUEUE_FLAG_ASYNCFULL, q);
 }
-static inline void blk_clear_queue_full(struct request_queue *q, int sync)
+static inline void blk_clear_rl_full(struct request_list *rl, bool sync)
 {
-        if (sync)
+        unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
-                queue_flag_clear(QUEUE_FLAG_SYNCFULL, q);
-        else
+        rl->flags &= ~flag;
-                queue_flag_clear(QUEUE_FLAG_ASYNCFULL, q);
 }
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index f55ab8cdc106..4d0fb3df2f4a 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -67,7 +67,6 @@ void bsg_job_done(struct bsg_job *job, int result,
 int bsg_setup_queue(struct device *dev, struct request_queue *q, char *name,
                    bsg_job_fn *job_fn, int dd_job_size);
 void bsg_request_fn(struct request_queue *q);
-void bsg_remove_queue(struct request_queue *q);
 void bsg_goose_queue(struct request_queue *q);
 #endif
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 7c08052e3321..39ed62ab5b8a 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -26,7 +26,8 @@ typedef struct mempool_s {
 extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
                        mempool_free_t *free_fn, void *pool_data);
 extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
-                        mempool_free_t *free_fn, void *pool_data, int nid);
+                        mempool_free_t *free_fn, void *pool_data,
+                        gfp_t gfp_mask, int nid);
 extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask);
 extern void mempool_destroy(mempool_t *pool);
diff --git a/mm/mempool.c b/mm/mempool.c
index d9049811f352..54990476c049 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -63,19 +63,21 @@ EXPORT_SYMBOL(mempool_destroy);
 mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
                                mempool_free_t *free_fn, void *pool_data)
 {
-        return  mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1);
+        return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
+                                   GFP_KERNEL, NUMA_NO_NODE);
 }
 EXPORT_SYMBOL(mempool_create);
 mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
-                        mempool_free_t *free_fn, void *pool_data, int node_id)
+                               mempool_free_t *free_fn, void *pool_data,
+                               gfp_t gfp_mask, int node_id)
 {
        mempool_t *pool;
-        pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id);
+        pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
        if (!pool)
                return NULL;
        pool->elements = kmalloc_node(min_nr * sizeof(void *),
-                                        GFP_KERNEL, node_id);
+                                      gfp_mask, node_id);
        if (!pool->elements) {
                kfree(pool);
                return NULL;
@@ -93,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
        while (pool->curr_nr < pool->min_nr) {
                void *element;
-                element = pool->alloc(GFP_KERNEL, pool->pool_data);
+                element = pool->alloc(gfp_mask, pool->pool_data);
                if (unlikely(!element)) {
                        mempool_destroy(pool);
                        return NULL;