blkcg: implement per-blkg request allocation

Currently, request_queue has one request_list to allocate requests from regardless of blkcg of the IO being issued. When the unified request pool is used up, cfq proportional IO limits become meaningless - whoever grabs the next request being freed wins the race regardless of the configured weights. This can be easily demonstrated by creating a blkio cgroup w/ very low weight, putting a program which can issue a lot of random direct IOs there and running a sequential IO from a different cgroup. As soon as the request pool is used up, the sequential IO bandwidth crashes. This patch implements per-blkg request_list. Each blkg has its own request_list and any IO allocates its request from the matching blkg making blkcgs completely isolated in terms of request allocation. * Root blkcg uses the request_list embedded in each request_queue, which was renamed to @q->root_rl from @q->rq. While making blkcg rl handling a bit hairier, this enables avoiding most overhead for root blkcg. * Queue fullness is properly per request_list but bdi isn't blkcg aware yet, so congestion state currently just follows the root blkcg. As writeback isn't aware of blkcg yet, this works okay for async congestion but readahead may get the wrong signals. It's better than blkcg completely collapsing with shared request_list but needs to be improved with future changes. * After this change, each block cgroup gets a full request pool making resource consumption of each cgroup higher. This makes allowing non-root users to create cgroups less desirable; however, note that allowing non-root users to directly manage cgroups is already severely broken regardless of this patch - each block cgroup consumes kernel memory and skews IO weight (IO weights are not hierarchical). v2: queue-sysfs.txt updated and patch description updated as suggested by Vivek. v3: blk_get_rl() wasn't checking error return from blkg_lookup_create() and may cause oops on lookup failure. 
Fix it by falling back to root_rl on blkg lookup failures. This problem was spotted by Rakesh Iyer <rni@google.com>. v4: Updated to accommodate 458f27a982 "block: Avoid missed wakeup in request waitqueue". blk_drain_queue() now wakes up waiters on all blkg->rl on the target queue. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Vivek Goyal <vgoyal@redhat.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
author: Tejun Heo <tj@kernel.org> 2012-06-26 18:05:44 -0400
committer: Jens Axboe <axboe@kernel.dk> 2012-06-26 18:42:49 -0400
commit: a051661ca6d134c18599498b185b667859d4339b (patch)
tree: 9d840030874aed9b97a58051bf9568455126e8e8 /block/blk-cgroup.c
parent: 5b788ce3e2acac9bf109743b1281d77347cf2101 (diff)
1 files changed, 47 insertions, 4 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 63b31ebae6e2..f3b44a65fc7a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -63,6 +63,7 @@ static void blkg_free(struct blkcg_gq *blkg)
                kfree(pd);
        }
+        blk_exit_rl(&blkg->rl);
        kfree(blkg);
 }
@@ -90,6 +91,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
        blkg->blkcg = blkcg;
        blkg->refcnt = 1;
+        /* root blkg uses @q->root_rl, init rl only for !root blkgs */
+        if (blkcg != &blkcg_root) {
+                if (blk_init_rl(&blkg->rl, q, gfp_mask))
+                        goto err_free;
+                blkg->rl.blkg = blkg;
+        }
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkg_policy_data *pd;
@@ -99,10 +107,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
                /* alloc per-policy data and attach it to blkg */
                pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
-                if (!pd) {
+                if (!pd)
-                        blkg_free(blkg);
+                        goto err_free;
-                        return NULL;
-                }
                blkg->pd[i] = pd;
                pd->blkg = blkg;
@@ -113,6 +119,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
        }
        return blkg;
+err_free:
+        blkg_free(blkg);
+        return NULL;
 }
 static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
@@ -300,6 +310,38 @@ void __blkg_release(struct blkcg_gq *blkg)
 }
 EXPORT_SYMBOL_GPL(__blkg_release);
+/*
+ * The next function used by blk_queue_for_each_rl().  It's a bit tricky
+ * because the root blkg uses @q->root_rl instead of its own rl.
+ */
+struct request_list *__blk_queue_next_rl(struct request_list *rl,
+                                         struct request_queue *q)
+{
+        struct list_head *ent;
+        struct blkcg_gq *blkg;
+        /*
+         * Determine the current blkg list_head.  The first entry is
+         * root_rl which is off @q->blkg_list and mapped to the head.
+         */
+        if (rl == &q->root_rl) {
+                ent = &q->blkg_list;
+        } else {
+                blkg = container_of(rl, struct blkcg_gq, rl);
+                ent = &blkg->q_node;
+        }
+        /* walk to the next list_head, skip root blkcg */
+        ent = ent->next;
+        if (ent == &q->root_blkg->q_node)
+                ent = ent->next;
+        if (ent == &q->blkg_list)
+                return NULL;
+        blkg = container_of(ent, struct blkcg_gq, q_node);
+        return &blkg->rl;
+}
 static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
                             u64 val)
 {
@@ -750,6 +792,7 @@ int blkcg_activate_policy(struct request_queue *q,
                goto out_unlock;
        }
        q->root_blkg = blkg;
+        q->root_rl.blkg = blkg;
        list_for_each_entry(blkg, &q->blkg_list, q_node)
                cnt++;
author	Tejun Heo <tj@kernel.org>	2012-06-26 18:05:44 -0400
committer	Jens Axboe <axboe@kernel.dk>	2012-06-26 18:42:49 -0400
commit	a051661ca6d134c18599498b185b667859d4339b (patch)
tree	9d840030874aed9b97a58051bf9568455126e8e8 /block/blk-cgroup.c
parent	5b788ce3e2acac9bf109743b1281d77347cf2101 (diff)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 63b31ebae6e2..f3b44a65fc7a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c
@@ -63,6 +63,7 @@ static void blkg_free(struct blkcg_gq *blkg)
63	kfree(pd);	63	kfree(pd);
64	}	64	}
65		65
		66	blk_exit_rl(&blkg->rl);
66	kfree(blkg);	67	kfree(blkg);
67	}	68	}
68		69
@@ -90,6 +91,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
90	blkg->blkcg = blkcg;	91	blkg->blkcg = blkcg;
91	blkg->refcnt = 1;	92	blkg->refcnt = 1;
92		93
		94	/* root blkg uses @q->root_rl, init rl only for !root blkgs */
		95	if (blkcg != &blkcg_root) {
		96	if (blk_init_rl(&blkg->rl, q, gfp_mask))
		97	goto err_free;
		98	blkg->rl.blkg = blkg;
		99	}
		100
93	for (i = 0; i < BLKCG_MAX_POLS; i++) {	101	for (i = 0; i < BLKCG_MAX_POLS; i++) {
94	struct blkcg_policy *pol = blkcg_policy[i];	102	struct blkcg_policy *pol = blkcg_policy[i];
95	struct blkg_policy_data *pd;	103	struct blkg_policy_data *pd;
@@ -99,10 +107,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
99		107
100	/* alloc per-policy data and attach it to blkg */	108	/* alloc per-policy data and attach it to blkg */
101	pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);	109	pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
102	if (!pd) {	110	if (!pd)
103	blkg_free(blkg);	111	goto err_free;
104	return NULL;
105	}
106		112
107	blkg->pd[i] = pd;	113	blkg->pd[i] = pd;
108	pd->blkg = blkg;	114	pd->blkg = blkg;
@@ -113,6 +119,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
113	}	119	}
114		120
115	return blkg;	121	return blkg;
		122
		123	err_free:
		124	blkg_free(blkg);
		125	return NULL;
116	}	126	}
117		127
118	static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,	128	static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
@@ -300,6 +310,38 @@ void __blkg_release(struct blkcg_gq *blkg)
300	}	310	}
301	EXPORT_SYMBOL_GPL(__blkg_release);	311	EXPORT_SYMBOL_GPL(__blkg_release);
302		312
		313	/*
		314	* The next function used by blk_queue_for_each_rl(). It's a bit tricky
		315	* because the root blkg uses @q->root_rl instead of its own rl.
		316	*/
		317	struct request_list *__blk_queue_next_rl(struct request_list *rl,
		318	struct request_queue *q)
		319	{
		320	struct list_head *ent;
		321	struct blkcg_gq *blkg;
		322
		323	/*
		324	* Determine the current blkg list_head. The first entry is
		325	* root_rl which is off @q->blkg_list and mapped to the head.
		326	*/
		327	if (rl == &q->root_rl) {
		328	ent = &q->blkg_list;
		329	} else {
		330	blkg = container_of(rl, struct blkcg_gq, rl);
		331	ent = &blkg->q_node;
		332	}
		333
		334	/* walk to the next list_head, skip root blkcg */
		335	ent = ent->next;
		336	if (ent == &q->root_blkg->q_node)
		337	ent = ent->next;
		338	if (ent == &q->blkg_list)
		339	return NULL;
		340
		341	blkg = container_of(ent, struct blkcg_gq, q_node);
		342	return &blkg->rl;
		343	}
		344
303	static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,	345	static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
304	u64 val)	346	u64 val)
305	{	347	{
@@ -750,6 +792,7 @@ int blkcg_activate_policy(struct request_queue *q,
750	goto out_unlock;	792	goto out_unlock;
751	}	793	}
752	q->root_blkg = blkg;	794	q->root_blkg = blkg;
		795	q->root_rl.blkg = blkg;
753		796
754	list_for_each_entry(blkg, &q->blkg_list, q_node)	797	list_for_each_entry(blkg, &q->blkg_list, q_node)
755	cnt++;	798	cnt++;