diff options
author | Tejun Heo <tj@kernel.org> | 2012-06-26 18:05:44 -0400 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2012-06-26 18:42:49 -0400 |
commit | a051661ca6d134c18599498b185b667859d4339b (patch) | |
tree | 9d840030874aed9b97a58051bf9568455126e8e8 /block/blk-cgroup.c | |
parent | 5b788ce3e2acac9bf109743b1281d77347cf2101 (diff) |
blkcg: implement per-blkg request allocation
Currently, request_queue has one request_list to allocate requests
from regardless of blkcg of the IO being issued. When the unified
request pool is used up, cfq proportional IO limits become meaningless
- whoever grabs the next request being freed wins the race regardless
of the configured weights.
This can be easily demonstrated by creating a blkio cgroup w/ very low
weight, putting a program which can issue a lot of random direct IOs there
and running a sequential IO from a different cgroup. As soon as the
request pool is used up, the sequential IO bandwidth crashes.
This patch implements per-blkg request_list. Each blkg has its own
request_list and any IO allocates its request from the matching blkg
making blkcgs completely isolated in terms of request allocation.
* Root blkcg uses the request_list embedded in each request_queue,
which was renamed to @q->root_rl from @q->rq. While making blkcg rl
handling a bit hairier, this enables avoiding most overhead for root
blkcg.
* Queue fullness is properly per request_list but bdi isn't blkcg
aware yet, so congestion state currently just follows the root
blkcg. As writeback isn't aware of blkcg yet, this works okay for
async congestion but readahead may get the wrong signals. It's
better than blkcg completely collapsing with shared request_list but
needs to be improved with future changes.
* After this change, each block cgroup gets a full request pool making
resource consumption of each cgroup higher. This makes allowing
non-root users to create cgroups less desirable; however, note that
allowing non-root users to directly manage cgroups is already
severely broken regardless of this patch - each block cgroup
consumes kernel memory and skews IO weight (IO weights are not
hierarchical).
v2: queue-sysfs.txt updated and patch description updated as suggested
by Vivek.
v3: blk_get_rl() wasn't checking error return from
blkg_lookup_create() and may cause oops on lookup failure. Fix it
by falling back to root_rl on blkg lookup failures. This problem
was spotted by Rakesh Iyer <rni@google.com>.
v4: Updated to accommodate 458f27a982 "block: Avoid missed wakeup in
request waitqueue". blk_drain_queue() now wakes up waiters on all
blkg->rl on the target queue.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'block/blk-cgroup.c')
-rw-r--r-- | block/blk-cgroup.c | 51 |
1 files changed, 47 insertions, 4 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 63b31ebae6e2..f3b44a65fc7a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c | |||
@@ -63,6 +63,7 @@ static void blkg_free(struct blkcg_gq *blkg) | |||
63 | kfree(pd); | 63 | kfree(pd); |
64 | } | 64 | } |
65 | 65 | ||
66 | blk_exit_rl(&blkg->rl); | ||
66 | kfree(blkg); | 67 | kfree(blkg); |
67 | } | 68 | } |
68 | 69 | ||
@@ -90,6 +91,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, | |||
90 | blkg->blkcg = blkcg; | 91 | blkg->blkcg = blkcg; |
91 | blkg->refcnt = 1; | 92 | blkg->refcnt = 1; |
92 | 93 | ||
94 | /* root blkg uses @q->root_rl, init rl only for !root blkgs */ | ||
95 | if (blkcg != &blkcg_root) { | ||
96 | if (blk_init_rl(&blkg->rl, q, gfp_mask)) | ||
97 | goto err_free; | ||
98 | blkg->rl.blkg = blkg; | ||
99 | } | ||
100 | |||
93 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | 101 | for (i = 0; i < BLKCG_MAX_POLS; i++) { |
94 | struct blkcg_policy *pol = blkcg_policy[i]; | 102 | struct blkcg_policy *pol = blkcg_policy[i]; |
95 | struct blkg_policy_data *pd; | 103 | struct blkg_policy_data *pd; |
@@ -99,10 +107,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, | |||
99 | 107 | ||
100 | /* alloc per-policy data and attach it to blkg */ | 108 | /* alloc per-policy data and attach it to blkg */ |
101 | pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); | 109 | pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); |
102 | if (!pd) { | 110 | if (!pd) |
103 | blkg_free(blkg); | 111 | goto err_free; |
104 | return NULL; | ||
105 | } | ||
106 | 112 | ||
107 | blkg->pd[i] = pd; | 113 | blkg->pd[i] = pd; |
108 | pd->blkg = blkg; | 114 | pd->blkg = blkg; |
@@ -113,6 +119,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, | |||
113 | } | 119 | } |
114 | 120 | ||
115 | return blkg; | 121 | return blkg; |
122 | |||
123 | err_free: | ||
124 | blkg_free(blkg); | ||
125 | return NULL; | ||
116 | } | 126 | } |
117 | 127 | ||
118 | static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, | 128 | static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, |
@@ -300,6 +310,38 @@ void __blkg_release(struct blkcg_gq *blkg) | |||
300 | } | 310 | } |
301 | EXPORT_SYMBOL_GPL(__blkg_release); | 311 | EXPORT_SYMBOL_GPL(__blkg_release); |
302 | 312 | ||
313 | /* | ||
314 | * The next function used by blk_queue_for_each_rl(). It's a bit tricky | ||
315 | * because the root blkg uses @q->root_rl instead of its own rl. | ||
316 | */ | ||
317 | struct request_list *__blk_queue_next_rl(struct request_list *rl, | ||
318 | struct request_queue *q) | ||
319 | { | ||
320 | struct list_head *ent; | ||
321 | struct blkcg_gq *blkg; | ||
322 | |||
323 | /* | ||
324 | * Determine the current blkg list_head. The first entry is | ||
325 | * root_rl which is off @q->blkg_list and mapped to the head. | ||
326 | */ | ||
327 | if (rl == &q->root_rl) { | ||
328 | ent = &q->blkg_list; | ||
329 | } else { | ||
330 | blkg = container_of(rl, struct blkcg_gq, rl); | ||
331 | ent = &blkg->q_node; | ||
332 | } | ||
333 | |||
334 | /* walk to the next list_head, skip root blkcg */ | ||
335 | ent = ent->next; | ||
336 | if (ent == &q->root_blkg->q_node) | ||
337 | ent = ent->next; | ||
338 | if (ent == &q->blkg_list) | ||
339 | return NULL; | ||
340 | |||
341 | blkg = container_of(ent, struct blkcg_gq, q_node); | ||
342 | return &blkg->rl; | ||
343 | } | ||
344 | |||
303 | static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, | 345 | static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, |
304 | u64 val) | 346 | u64 val) |
305 | { | 347 | { |
@@ -750,6 +792,7 @@ int blkcg_activate_policy(struct request_queue *q, | |||
750 | goto out_unlock; | 792 | goto out_unlock; |
751 | } | 793 | } |
752 | q->root_blkg = blkg; | 794 | q->root_blkg = blkg; |
795 | q->root_rl.blkg = blkg; | ||
753 | 796 | ||
754 | list_for_each_entry(blkg, &q->blkg_list, q_node) | 797 | list_for_each_entry(blkg, &q->blkg_list, q_node) |
755 | cnt++; | 798 | cnt++; |