author      Tejun Heo <tj@kernel.org>       2012-06-26 18:05:44 -0400
committer   Jens Axboe <axboe@kernel.dk>    2012-06-26 18:42:49 -0400
commit      a051661ca6d134c18599498b185b667859d4339b (patch)
tree        9d840030874aed9b97a58051bf9568455126e8e8
parent      5b788ce3e2acac9bf109743b1281d77347cf2101 (diff)
blkcg: implement per-blkg request allocation
Currently, request_queue has one request_list to allocate requests from regardless of the blkcg of the IO being issued. When the unified request pool is used up, cfq proportional IO limits become meaningless - whoever grabs the next request being freed wins the race regardless of the configured weights.

This can be easily demonstrated by creating a blkio cgroup w/ very low weight, putting a program which can issue a lot of random direct IOs there and running a sequential IO from a different cgroup. As soon as the request pool is used up, the sequential IO bandwidth crashes.

This patch implements per-blkg request_list. Each blkg has its own request_list and any IO allocates its request from the matching blkg, making blkcgs completely isolated in terms of request allocation.

* Root blkcg uses the request_list embedded in each request_queue, which was renamed to @q->root_rl from @q->rq. While making blkcg rl handling a bit hairier, this enables avoiding most overhead for root blkcg.

* Queue fullness is properly per request_list but bdi isn't blkcg aware yet, so congestion state currently just follows the root blkcg. As writeback isn't aware of blkcg yet, this works okay for async congestion but readahead may get the wrong signals. It's better than blkcg completely collapsing with a shared request_list but needs to be improved with future changes.

* After this change, each block cgroup gets a full request pool, making resource consumption of each cgroup higher. This makes allowing non-root users to create cgroups less desirable; however, note that allowing non-root users to directly manage cgroups is already severely broken regardless of this patch - each block cgroup consumes kernel memory and skews IO weight (IO weights are not hierarchical).

v2: queue-sysfs.txt updated and patch description updated as suggested by Vivek.

v3: blk_get_rl() wasn't checking the error return from blkg_lookup_create() and may cause an oops on lookup failure. Fix it by falling back to root_rl on blkg lookup failures. This problem was spotted by Rakesh Iyer <rni@google.com>.

v4: Updated to accommodate 458f27a982 "block: Avoid missed wakeup in request waitqueue". blk_drain_queue() now wakes up waiters on all blkg->rl on the target queue.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
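The selection rule described above - root blkcg allocates from @q->root_rl, every other blkcg from its own blkg->rl, and any blkg lookup failure falls back to root_rl so allocation can always proceed - is what the blk_get_rl()/blk_put_rl() helpers added to block/blk-cgroup.h below implement. The stand-alone toy model that follows is only an illustration of that rule, not kernel code; every identifier in it (struct queue, struct blkg, get_rl) is a made-up stand-in for the real structures:

/*
 * Toy model of the per-blkg request_list selection: root (or a failed
 * lookup) uses q->root_rl, everything else uses its own blkg->rl.
 */
#include <stdio.h>
#include <stdbool.h>

struct request_list { const char *owner; int count; };

struct blkg {                           /* stand-in for struct blkcg_gq */
        struct request_list rl;
        bool is_root;
};

struct queue {                          /* stand-in for struct request_queue */
        struct request_list root_rl;
};

/* mirrors blk_get_rl()'s fallback: root blkcg or lookup failure -> root_rl */
static struct request_list *get_rl(struct queue *q, struct blkg *blkg)
{
        if (!blkg || blkg->is_root)
                return &q->root_rl;
        return &blkg->rl;
}

int main(void)
{
        struct queue q = { .root_rl = { "root", 0 } };
        struct blkg slow = { .rl = { "slow-cgroup", 0 }, .is_root = false };

        get_rl(&q, &slow)->count++;     /* IO issued from a non-root cgroup */
        get_rl(&q, NULL)->count++;      /* blkg lookup failed -> root_rl */

        printf("root_rl=%d slow.rl=%d\n", q.root_rl.count, slow.rl.count);
        return 0;
}

The only point of the sketch is that filling up one pool never touches the counters of another, which is the isolation property the patch is after.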
-rw-r--r--  Documentation/block/queue-sysfs.txt      7
-rw-r--r--  block/blk-cgroup.c                       51
-rw-r--r--  block/blk-cgroup.h                      102
-rw-r--r--  block/blk-core.c                         42
-rw-r--r--  block/blk-sysfs.c                        32
-rw-r--r--  include/linux/blkdev.h                   12
6 files changed, 216 insertions, 30 deletions
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index d8147b336c35..6518a55273e7 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -38,6 +38,13 @@ read or write requests. Note that the total allocated number may be twice
 this amount, since it applies only to reads or writes (not the accumulated
 sum).
 
+To avoid priority inversion through request starvation, a request
+queue maintains a separate request pool per each cgroup when
+CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such
+per-block-cgroup request pool.  IOW, if there are N block cgroups,
+each request queue may have upto N request pools, each independently
+regulated by nr_requests.
+
 read_ahead_kb (RW)
 ------------------
 Maximum number of kilobytes to read-ahead for filesystems on this block
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 63b31ebae6e2..f3b44a65fc7a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -63,6 +63,7 @@ static void blkg_free(struct blkcg_gq *blkg)
                 kfree(pd);
         }
 
+        blk_exit_rl(&blkg->rl);
         kfree(blkg);
 }
 
@@ -90,6 +91,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
         blkg->blkcg = blkcg;
         blkg->refcnt = 1;
 
+        /* root blkg uses @q->root_rl, init rl only for !root blkgs */
+        if (blkcg != &blkcg_root) {
+                if (blk_init_rl(&blkg->rl, q, gfp_mask))
+                        goto err_free;
+                blkg->rl.blkg = blkg;
+        }
+
         for (i = 0; i < BLKCG_MAX_POLS; i++) {
                 struct blkcg_policy *pol = blkcg_policy[i];
                 struct blkg_policy_data *pd;
@@ -99,10 +107,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 
                 /* alloc per-policy data and attach it to blkg */
                 pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
-                if (!pd) {
-                        blkg_free(blkg);
-                        return NULL;
-                }
+                if (!pd)
+                        goto err_free;
 
                 blkg->pd[i] = pd;
                 pd->blkg = blkg;
@@ -113,6 +119,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
         }
 
         return blkg;
+
+err_free:
+        blkg_free(blkg);
+        return NULL;
 }
 
 static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
@@ -300,6 +310,38 @@ void __blkg_release(struct blkcg_gq *blkg)
 }
 EXPORT_SYMBOL_GPL(__blkg_release);
 
+/*
+ * The next function used by blk_queue_for_each_rl().  It's a bit tricky
+ * because the root blkg uses @q->root_rl instead of its own rl.
+ */
+struct request_list *__blk_queue_next_rl(struct request_list *rl,
+                                         struct request_queue *q)
+{
+        struct list_head *ent;
+        struct blkcg_gq *blkg;
+
+        /*
+         * Determine the current blkg list_head.  The first entry is
+         * root_rl which is off @q->blkg_list and mapped to the head.
+         */
+        if (rl == &q->root_rl) {
+                ent = &q->blkg_list;
+        } else {
+                blkg = container_of(rl, struct blkcg_gq, rl);
+                ent = &blkg->q_node;
+        }
+
+        /* walk to the next list_head, skip root blkcg */
+        ent = ent->next;
+        if (ent == &q->root_blkg->q_node)
+                ent = ent->next;
+        if (ent == &q->blkg_list)
+                return NULL;
+
+        blkg = container_of(ent, struct blkcg_gq, q_node);
+        return &blkg->rl;
+}
+
 static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
                              u64 val)
 {
@@ -750,6 +792,7 @@ int blkcg_activate_policy(struct request_queue *q,
                 goto out_unlock;
         }
         q->root_blkg = blkg;
+        q->root_rl.blkg = blkg;
 
         list_for_each_entry(blkg, &q->blkg_list, q_node)
                 cnt++;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index e74cce1fbac9..24597309e23d 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -17,6 +17,7 @@
 #include <linux/u64_stats_sync.h>
 #include <linux/seq_file.h>
 #include <linux/radix-tree.h>
+#include <linux/blkdev.h>
 
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX         UINT_MAX
@@ -93,6 +94,8 @@ struct blkcg_gq {
         struct list_head                q_node;
         struct hlist_node               blkcg_node;
         struct blkcg                    *blkcg;
+        /* request allocation list for this blkcg-q pair */
+        struct request_list             rl;
         /* reference count */
         int                             refcnt;
 
@@ -251,6 +254,95 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 }
 
 /**
+ * blk_get_rl - get request_list to use
+ * @q: request_queue of interest
+ * @bio: bio which will be attached to the allocated request (may be %NULL)
+ *
+ * The caller wants to allocate a request from @q to use for @bio.  Find
+ * the request_list to use and obtain a reference on it.  Should be called
+ * under queue_lock.  This function is guaranteed to return non-%NULL
+ * request_list.
+ */
+static inline struct request_list *blk_get_rl(struct request_queue *q,
+                                              struct bio *bio)
+{
+        struct blkcg *blkcg;
+        struct blkcg_gq *blkg;
+
+        rcu_read_lock();
+
+        blkcg = bio_blkcg(bio);
+
+        /* bypass blkg lookup and use @q->root_rl directly for root */
+        if (blkcg == &blkcg_root)
+                goto root_rl;
+
+        /*
+         * Try to use blkg->rl.  blkg lookup may fail under memory pressure
+         * or if either the blkcg or queue is going away.  Fall back to
+         * root_rl in such cases.
+         */
+        blkg = blkg_lookup_create(blkcg, q);
+        if (unlikely(IS_ERR(blkg)))
+                goto root_rl;
+
+        blkg_get(blkg);
+        rcu_read_unlock();
+        return &blkg->rl;
+root_rl:
+        rcu_read_unlock();
+        return &q->root_rl;
+}
+
+/**
+ * blk_put_rl - put request_list
+ * @rl: request_list to put
+ *
+ * Put the reference acquired by blk_get_rl().  Should be called under
+ * queue_lock.
+ */
+static inline void blk_put_rl(struct request_list *rl)
+{
+        /* root_rl may not have blkg set */
+        if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+                blkg_put(rl->blkg);
+}
+
+/**
+ * blk_rq_set_rl - associate a request with a request_list
+ * @rq: request of interest
+ * @rl: target request_list
+ *
+ * Associate @rq with @rl so that accounting and freeing can know the
+ * request_list @rq came from.
+ */
+static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
+{
+        rq->rl = rl;
+}
+
+/**
+ * blk_rq_rl - return the request_list a request came from
+ * @rq: request of interest
+ *
+ * Return the request_list @rq is allocated from.
+ */
+static inline struct request_list *blk_rq_rl(struct request *rq)
+{
+        return rq->rl;
+}
+
+struct request_list *__blk_queue_next_rl(struct request_list *rl,
+                                         struct request_queue *q);
+/**
+ * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
+ *
+ * Should be used under queue_lock.
+ */
+#define blk_queue_for_each_rl(rl, q)    \
+        for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
+
+/**
  * blkg_stat_add - add a value to a blkg_stat
  * @stat: target blkg_stat
  * @val: value to add
@@ -392,6 +484,7 @@ static inline void blkcg_deactivate_policy(struct request_queue *q,
 
 static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
 static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
+
 static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
                                                   struct blkcg_policy *pol) { return NULL; }
 static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
@@ -399,5 +492,14 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
 
+static inline struct request_list *blk_get_rl(struct request_queue *q,
+                                              struct bio *bio) { return &q->root_rl; }
+static inline void blk_put_rl(struct request_list *rl) { }
+static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
+static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
+
+#define blk_queue_for_each_rl(rl, q) \
+        for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
+
 #endif  /* CONFIG_BLK_CGROUP */
 #endif  /* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index f392a2edf462..dd134d834d58 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -416,9 +416,14 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
          * left with hung waiters.  We need to wake up those waiters.
          */
         if (q->request_fn) {
+                struct request_list *rl;
+
                 spin_lock_irq(q->queue_lock);
-                for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++)
-                        wake_up_all(&q->rq.wait[i]);
+
+                blk_queue_for_each_rl(rl, q)
+                        for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
+                                wake_up_all(&rl->wait[i]);
+
                 spin_unlock_irq(q->queue_lock);
         }
 }
@@ -685,7 +690,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
         if (!q)
                 return NULL;
 
-        if (blk_init_rl(&q->rq, q, GFP_KERNEL))
+        if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
                 return NULL;
 
         q->request_fn           = rfn;
@@ -776,7 +781,12 @@ static void __freed_request(struct request_list *rl, int sync)
 {
         struct request_queue *q = rl->q;
 
-        if (rl->count[sync] < queue_congestion_off_threshold(q))
+        /*
+         * bdi isn't aware of blkcg yet.  As all async IOs end up root
+         * blkcg anyway, just use root blkcg state.
+         */
+        if (rl == &q->root_rl &&
+            rl->count[sync] < queue_congestion_off_threshold(q))
                 blk_clear_queue_congested(q, sync);
 
         if (rl->count[sync] + 1 <= q->nr_requests) {
@@ -897,7 +907,12 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
                                 }
                         }
                 }
-                blk_set_queue_congested(q, is_sync);
+                /*
+                 * bdi isn't aware of blkcg yet.  As all async IOs end up
+                 * root blkcg anyway, just use root blkcg state.
+                 */
+                if (rl == &q->root_rl)
+                        blk_set_queue_congested(q, is_sync);
         }
 
         /*
@@ -939,6 +954,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
                 goto fail_alloc;
 
         blk_rq_init(q, rq);
+        blk_rq_set_rl(rq, rl);
         rq->cmd_flags = rw_flags | REQ_ALLOCED;
 
         /* init elvpriv */
@@ -1032,15 +1048,19 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 {
         const bool is_sync = rw_is_sync(rw_flags) != 0;
         DEFINE_WAIT(wait);
-        struct request_list *rl = &q->rq;
+        struct request_list *rl;
         struct request *rq;
+
+        rl = blk_get_rl(q, bio);        /* transferred to @rq on success */
 retry:
-        rq = __get_request(&q->rq, rw_flags, bio, gfp_mask);
+        rq = __get_request(rl, rw_flags, bio, gfp_mask);
         if (rq)
                 return rq;
 
-        if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q)))
+        if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q))) {
+                blk_put_rl(rl);
                 return NULL;
+        }
 
         /* wait on @rl and retry */
         prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
@@ -1231,12 +1251,14 @@ void __blk_put_request(struct request_queue *q, struct request *req)
          */
         if (req->cmd_flags & REQ_ALLOCED) {
                 unsigned int flags = req->cmd_flags;
+                struct request_list *rl = blk_rq_rl(req);
 
                 BUG_ON(!list_empty(&req->queuelist));
                 BUG_ON(!hlist_unhashed(&req->hash));
 
-                blk_free_request(&q->rq, req);
-                freed_request(&q->rq, flags);
+                blk_free_request(rl, req);
+                freed_request(rl, flags);
+                blk_put_rl(rl);
         }
 }
 EXPORT_SYMBOL_GPL(__blk_put_request);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 234ce7c082fa..9628b291f960 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -40,7 +40,7 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
 static ssize_t
 queue_requests_store(struct request_queue *q, const char *page, size_t count)
 {
-        struct request_list *rl = &q->rq;
+        struct request_list *rl;
         unsigned long nr;
         int ret;
 
@@ -55,6 +55,9 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
         q->nr_requests = nr;
         blk_queue_congestion_threshold(q);
 
+        /* congestion isn't cgroup aware and follows root blkcg for now */
+        rl = &q->root_rl;
+
         if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
                 blk_set_queue_congested(q, BLK_RW_SYNC);
         else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
@@ -65,19 +68,22 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
         else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
                 blk_clear_queue_congested(q, BLK_RW_ASYNC);
 
-        if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
-                blk_set_rl_full(rl, BLK_RW_SYNC);
-        } else {
-                blk_clear_rl_full(rl, BLK_RW_SYNC);
-                wake_up(&rl->wait[BLK_RW_SYNC]);
+        blk_queue_for_each_rl(rl, q) {
+                if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
+                        blk_set_rl_full(rl, BLK_RW_SYNC);
+                } else {
+                        blk_clear_rl_full(rl, BLK_RW_SYNC);
+                        wake_up(&rl->wait[BLK_RW_SYNC]);
+                }
+
+                if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
+                        blk_set_rl_full(rl, BLK_RW_ASYNC);
+                } else {
+                        blk_clear_rl_full(rl, BLK_RW_ASYNC);
+                        wake_up(&rl->wait[BLK_RW_ASYNC]);
+                }
         }
 
-        if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
-                blk_set_rl_full(rl, BLK_RW_ASYNC);
-        } else {
-                blk_clear_rl_full(rl, BLK_RW_ASYNC);
-                wake_up(&rl->wait[BLK_RW_ASYNC]);
-        }
         spin_unlock_irq(q->queue_lock);
         return ret;
 }
@@ -488,7 +494,7 @@ static void blk_release_queue(struct kobject *kobj)
                 elevator_exit(q->elevator);
         }
 
-        blk_exit_rl(&q->rq);
+        blk_exit_rl(&q->root_rl);
 
         if (q->queue_tags)
                 __blk_queue_free_tags(q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f2385ee7c7b2..3816ce8a08fc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -51,7 +51,9 @@ typedef void (rq_end_io_fn)(struct request *, int);
 
 struct request_list {
         struct request_queue    *q;     /* the queue this rl belongs to */
-
+#ifdef CONFIG_BLK_CGROUP
+        struct blkcg_gq         *blkg;  /* blkg this request pool belongs to */
+#endif
         /*
          * count[], starved[], and wait[] are indexed by
          * BLK_RW_SYNC/BLK_RW_ASYNC
@@ -143,6 +145,7 @@ struct request {
         struct hd_struct *part;
         unsigned long start_time;
 #ifdef CONFIG_BLK_CGROUP
+        struct request_list *rl;                /* rl this rq is alloced from */
         unsigned long long start_time_ns;
         unsigned long long io_start_time_ns;    /* when passed to hardware */
 #endif
@@ -291,9 +294,12 @@ struct request_queue {
         int                     nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
 
         /*
-         * the queue request freelist, one for reads and one for writes
+         * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
+         * is used, root blkg allocates from @q->root_rl and all other
+         * blkgs from their own blkg->rl.  Which one to use should be
+         * determined using bio_request_list().
          */
-        struct request_list     rq;
+        struct request_list     root_rl;
 
         request_fn_proc         *request_fn;
         make_request_fn         *make_request_fn;