aboutsummaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2012-06-26 18:05:44 -0400
committerJens Axboe <axboe@kernel.dk>2012-06-26 18:42:49 -0400
commita051661ca6d134c18599498b185b667859d4339b (patch)
tree9d840030874aed9b97a58051bf9568455126e8e8 /block
parent5b788ce3e2acac9bf109743b1281d77347cf2101 (diff)
blkcg: implement per-blkg request allocation
Currently, request_queue has one request_list to allocate requests from regardless of blkcg of the IO being issued. When the unified request pool is used up, cfq proportional IO limits become meaningless - whoever grabs the next request being freed wins the race regardless of the configured weights. This can be easily demonstrated by creating a blkio cgroup w/ very low weight, put a program which can issue a lot of random direct IOs there and running a sequential IO from a different cgroup. As soon as the request pool is used up, the sequential IO bandwidth crashes. This patch implements per-blkg request_list. Each blkg has its own request_list and any IO allocates its request from the matching blkg making blkcgs completely isolated in terms of request allocation. * Root blkcg uses the request_list embedded in each request_queue, which was renamed to @q->root_rl from @q->rq. While making blkcg rl handling a bit harier, this enables avoiding most overhead for root blkcg. * Queue fullness is properly per request_list but bdi isn't blkcg aware yet, so congestion state currently just follows the root blkcg. As writeback isn't aware of blkcg yet, this works okay for async congestion but readahead may get the wrong signals. It's better than blkcg completely collapsing with shared request_list but needs to be improved with future changes. * After this change, each block cgroup gets a full request pool making resource consumption of each cgroup higher. This makes allowing non-root users to create cgroups less desirable; however, note that allowing non-root users to directly manage cgroups is already severely broken regardless of this patch - each block cgroup consumes kernel memory and skews IO weight (IO weights are not hierarchical). v2: queue-sysfs.txt updated and patch description udpated as suggested by Vivek. v3: blk_get_rl() wasn't checking error return from blkg_lookup_create() and may cause oops on lookup failure. Fix it by falling back to root_rl on blkg lookup failures. This problem was spotted by Rakesh Iyer <rni@google.com>. v4: Updated to accomodate 458f27a982 "block: Avoid missed wakeup in request waitqueue". blk_drain_queue() now wakes up waiters on all blkg->rl on the target queue. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Vivek Goyal <vgoyal@redhat.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'block')
-rw-r--r--block/blk-cgroup.c51
-rw-r--r--block/blk-cgroup.h102
-rw-r--r--block/blk-core.c42
-rw-r--r--block/blk-sysfs.c32
4 files changed, 200 insertions, 27 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 63b31ebae6e2..f3b44a65fc7a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -63,6 +63,7 @@ static void blkg_free(struct blkcg_gq *blkg)
63 kfree(pd); 63 kfree(pd);
64 } 64 }
65 65
66 blk_exit_rl(&blkg->rl);
66 kfree(blkg); 67 kfree(blkg);
67} 68}
68 69
@@ -90,6 +91,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
90 blkg->blkcg = blkcg; 91 blkg->blkcg = blkcg;
91 blkg->refcnt = 1; 92 blkg->refcnt = 1;
92 93
94 /* root blkg uses @q->root_rl, init rl only for !root blkgs */
95 if (blkcg != &blkcg_root) {
96 if (blk_init_rl(&blkg->rl, q, gfp_mask))
97 goto err_free;
98 blkg->rl.blkg = blkg;
99 }
100
93 for (i = 0; i < BLKCG_MAX_POLS; i++) { 101 for (i = 0; i < BLKCG_MAX_POLS; i++) {
94 struct blkcg_policy *pol = blkcg_policy[i]; 102 struct blkcg_policy *pol = blkcg_policy[i];
95 struct blkg_policy_data *pd; 103 struct blkg_policy_data *pd;
@@ -99,10 +107,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
99 107
100 /* alloc per-policy data and attach it to blkg */ 108 /* alloc per-policy data and attach it to blkg */
101 pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); 109 pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
102 if (!pd) { 110 if (!pd)
103 blkg_free(blkg); 111 goto err_free;
104 return NULL;
105 }
106 112
107 blkg->pd[i] = pd; 113 blkg->pd[i] = pd;
108 pd->blkg = blkg; 114 pd->blkg = blkg;
@@ -113,6 +119,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
113 } 119 }
114 120
115 return blkg; 121 return blkg;
122
123err_free:
124 blkg_free(blkg);
125 return NULL;
116} 126}
117 127
118static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, 128static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
@@ -300,6 +310,38 @@ void __blkg_release(struct blkcg_gq *blkg)
300} 310}
301EXPORT_SYMBOL_GPL(__blkg_release); 311EXPORT_SYMBOL_GPL(__blkg_release);
302 312
313/*
314 * The next function used by blk_queue_for_each_rl(). It's a bit tricky
315 * because the root blkg uses @q->root_rl instead of its own rl.
316 */
317struct request_list *__blk_queue_next_rl(struct request_list *rl,
318 struct request_queue *q)
319{
320 struct list_head *ent;
321 struct blkcg_gq *blkg;
322
323 /*
324 * Determine the current blkg list_head. The first entry is
325 * root_rl which is off @q->blkg_list and mapped to the head.
326 */
327 if (rl == &q->root_rl) {
328 ent = &q->blkg_list;
329 } else {
330 blkg = container_of(rl, struct blkcg_gq, rl);
331 ent = &blkg->q_node;
332 }
333
334 /* walk to the next list_head, skip root blkcg */
335 ent = ent->next;
336 if (ent == &q->root_blkg->q_node)
337 ent = ent->next;
338 if (ent == &q->blkg_list)
339 return NULL;
340
341 blkg = container_of(ent, struct blkcg_gq, q_node);
342 return &blkg->rl;
343}
344
303static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, 345static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
304 u64 val) 346 u64 val)
305{ 347{
@@ -750,6 +792,7 @@ int blkcg_activate_policy(struct request_queue *q,
750 goto out_unlock; 792 goto out_unlock;
751 } 793 }
752 q->root_blkg = blkg; 794 q->root_blkg = blkg;
795 q->root_rl.blkg = blkg;
753 796
754 list_for_each_entry(blkg, &q->blkg_list, q_node) 797 list_for_each_entry(blkg, &q->blkg_list, q_node)
755 cnt++; 798 cnt++;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index e74cce1fbac9..24597309e23d 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -17,6 +17,7 @@
17#include <linux/u64_stats_sync.h> 17#include <linux/u64_stats_sync.h>
18#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <linux/radix-tree.h> 19#include <linux/radix-tree.h>
20#include <linux/blkdev.h>
20 21
21/* Max limits for throttle policy */ 22/* Max limits for throttle policy */
22#define THROTL_IOPS_MAX UINT_MAX 23#define THROTL_IOPS_MAX UINT_MAX
@@ -93,6 +94,8 @@ struct blkcg_gq {
93 struct list_head q_node; 94 struct list_head q_node;
94 struct hlist_node blkcg_node; 95 struct hlist_node blkcg_node;
95 struct blkcg *blkcg; 96 struct blkcg *blkcg;
97 /* request allocation list for this blkcg-q pair */
98 struct request_list rl;
96 /* reference count */ 99 /* reference count */
97 int refcnt; 100 int refcnt;
98 101
@@ -251,6 +254,95 @@ static inline void blkg_put(struct blkcg_gq *blkg)
251} 254}
252 255
253/** 256/**
257 * blk_get_rl - get request_list to use
258 * @q: request_queue of interest
259 * @bio: bio which will be attached to the allocated request (may be %NULL)
260 *
261 * The caller wants to allocate a request from @q to use for @bio. Find
262 * the request_list to use and obtain a reference on it. Should be called
263 * under queue_lock. This function is guaranteed to return non-%NULL
264 * request_list.
265 */
266static inline struct request_list *blk_get_rl(struct request_queue *q,
267 struct bio *bio)
268{
269 struct blkcg *blkcg;
270 struct blkcg_gq *blkg;
271
272 rcu_read_lock();
273
274 blkcg = bio_blkcg(bio);
275
276 /* bypass blkg lookup and use @q->root_rl directly for root */
277 if (blkcg == &blkcg_root)
278 goto root_rl;
279
280 /*
281 * Try to use blkg->rl. blkg lookup may fail under memory pressure
282 * or if either the blkcg or queue is going away. Fall back to
283 * root_rl in such cases.
284 */
285 blkg = blkg_lookup_create(blkcg, q);
286 if (unlikely(IS_ERR(blkg)))
287 goto root_rl;
288
289 blkg_get(blkg);
290 rcu_read_unlock();
291 return &blkg->rl;
292root_rl:
293 rcu_read_unlock();
294 return &q->root_rl;
295}
296
297/**
298 * blk_put_rl - put request_list
299 * @rl: request_list to put
300 *
301 * Put the reference acquired by blk_get_rl(). Should be called under
302 * queue_lock.
303 */
304static inline void blk_put_rl(struct request_list *rl)
305{
306 /* root_rl may not have blkg set */
307 if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
308 blkg_put(rl->blkg);
309}
310
311/**
312 * blk_rq_set_rl - associate a request with a request_list
313 * @rq: request of interest
314 * @rl: target request_list
315 *
316 * Associate @rq with @rl so that accounting and freeing can know the
317 * request_list @rq came from.
318 */
319static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
320{
321 rq->rl = rl;
322}
323
324/**
325 * blk_rq_rl - return the request_list a request came from
326 * @rq: request of interest
327 *
328 * Return the request_list @rq is allocated from.
329 */
330static inline struct request_list *blk_rq_rl(struct request *rq)
331{
332 return rq->rl;
333}
334
335struct request_list *__blk_queue_next_rl(struct request_list *rl,
336 struct request_queue *q);
337/**
338 * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
339 *
340 * Should be used under queue_lock.
341 */
342#define blk_queue_for_each_rl(rl, q) \
343 for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
344
345/**
254 * blkg_stat_add - add a value to a blkg_stat 346 * blkg_stat_add - add a value to a blkg_stat
255 * @stat: target blkg_stat 347 * @stat: target blkg_stat
256 * @val: value to add 348 * @val: value to add
@@ -392,6 +484,7 @@ static inline void blkcg_deactivate_policy(struct request_queue *q,
392 484
393static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } 485static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
394static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } 486static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
487
395static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, 488static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
396 struct blkcg_policy *pol) { return NULL; } 489 struct blkcg_policy *pol) { return NULL; }
397static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } 490static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
@@ -399,5 +492,14 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
399static inline void blkg_get(struct blkcg_gq *blkg) { } 492static inline void blkg_get(struct blkcg_gq *blkg) { }
400static inline void blkg_put(struct blkcg_gq *blkg) { } 493static inline void blkg_put(struct blkcg_gq *blkg) { }
401 494
495static inline struct request_list *blk_get_rl(struct request_queue *q,
496 struct bio *bio) { return &q->root_rl; }
497static inline void blk_put_rl(struct request_list *rl) { }
498static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
499static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
500
501#define blk_queue_for_each_rl(rl, q) \
502 for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
503
402#endif /* CONFIG_BLK_CGROUP */ 504#endif /* CONFIG_BLK_CGROUP */
403#endif /* _BLK_CGROUP_H */ 505#endif /* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index f392a2edf462..dd134d834d58 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -416,9 +416,14 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
416 * left with hung waiters. We need to wake up those waiters. 416 * left with hung waiters. We need to wake up those waiters.
417 */ 417 */
418 if (q->request_fn) { 418 if (q->request_fn) {
419 struct request_list *rl;
420
419 spin_lock_irq(q->queue_lock); 421 spin_lock_irq(q->queue_lock);
420 for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++) 422
421 wake_up_all(&q->rq.wait[i]); 423 blk_queue_for_each_rl(rl, q)
424 for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
425 wake_up_all(&rl->wait[i]);
426
422 spin_unlock_irq(q->queue_lock); 427 spin_unlock_irq(q->queue_lock);
423 } 428 }
424} 429}
@@ -685,7 +690,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
685 if (!q) 690 if (!q)
686 return NULL; 691 return NULL;
687 692
688 if (blk_init_rl(&q->rq, q, GFP_KERNEL)) 693 if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
689 return NULL; 694 return NULL;
690 695
691 q->request_fn = rfn; 696 q->request_fn = rfn;
@@ -776,7 +781,12 @@ static void __freed_request(struct request_list *rl, int sync)
776{ 781{
777 struct request_queue *q = rl->q; 782 struct request_queue *q = rl->q;
778 783
779 if (rl->count[sync] < queue_congestion_off_threshold(q)) 784 /*
785 * bdi isn't aware of blkcg yet. As all async IOs end up root
786 * blkcg anyway, just use root blkcg state.
787 */
788 if (rl == &q->root_rl &&
789 rl->count[sync] < queue_congestion_off_threshold(q))
780 blk_clear_queue_congested(q, sync); 790 blk_clear_queue_congested(q, sync);
781 791
782 if (rl->count[sync] + 1 <= q->nr_requests) { 792 if (rl->count[sync] + 1 <= q->nr_requests) {
@@ -897,7 +907,12 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
897 } 907 }
898 } 908 }
899 } 909 }
900 blk_set_queue_congested(q, is_sync); 910 /*
911 * bdi isn't aware of blkcg yet. As all async IOs end up
912 * root blkcg anyway, just use root blkcg state.
913 */
914 if (rl == &q->root_rl)
915 blk_set_queue_congested(q, is_sync);
901 } 916 }
902 917
903 /* 918 /*
@@ -939,6 +954,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
939 goto fail_alloc; 954 goto fail_alloc;
940 955
941 blk_rq_init(q, rq); 956 blk_rq_init(q, rq);
957 blk_rq_set_rl(rq, rl);
942 rq->cmd_flags = rw_flags | REQ_ALLOCED; 958 rq->cmd_flags = rw_flags | REQ_ALLOCED;
943 959
944 /* init elvpriv */ 960 /* init elvpriv */
@@ -1032,15 +1048,19 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
1032{ 1048{
1033 const bool is_sync = rw_is_sync(rw_flags) != 0; 1049 const bool is_sync = rw_is_sync(rw_flags) != 0;
1034 DEFINE_WAIT(wait); 1050 DEFINE_WAIT(wait);
1035 struct request_list *rl = &q->rq; 1051 struct request_list *rl;
1036 struct request *rq; 1052 struct request *rq;
1053
1054 rl = blk_get_rl(q, bio); /* transferred to @rq on success */
1037retry: 1055retry:
1038 rq = __get_request(&q->rq, rw_flags, bio, gfp_mask); 1056 rq = __get_request(rl, rw_flags, bio, gfp_mask);
1039 if (rq) 1057 if (rq)
1040 return rq; 1058 return rq;
1041 1059
1042 if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q))) 1060 if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q))) {
1061 blk_put_rl(rl);
1043 return NULL; 1062 return NULL;
1063 }
1044 1064
1045 /* wait on @rl and retry */ 1065 /* wait on @rl and retry */
1046 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, 1066 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
@@ -1231,12 +1251,14 @@ void __blk_put_request(struct request_queue *q, struct request *req)
1231 */ 1251 */
1232 if (req->cmd_flags & REQ_ALLOCED) { 1252 if (req->cmd_flags & REQ_ALLOCED) {
1233 unsigned int flags = req->cmd_flags; 1253 unsigned int flags = req->cmd_flags;
1254 struct request_list *rl = blk_rq_rl(req);
1234 1255
1235 BUG_ON(!list_empty(&req->queuelist)); 1256 BUG_ON(!list_empty(&req->queuelist));
1236 BUG_ON(!hlist_unhashed(&req->hash)); 1257 BUG_ON(!hlist_unhashed(&req->hash));
1237 1258
1238 blk_free_request(&q->rq, req); 1259 blk_free_request(rl, req);
1239 freed_request(&q->rq, flags); 1260 freed_request(rl, flags);
1261 blk_put_rl(rl);
1240 } 1262 }
1241} 1263}
1242EXPORT_SYMBOL_GPL(__blk_put_request); 1264EXPORT_SYMBOL_GPL(__blk_put_request);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 234ce7c082fa..9628b291f960 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -40,7 +40,7 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
40static ssize_t 40static ssize_t
41queue_requests_store(struct request_queue *q, const char *page, size_t count) 41queue_requests_store(struct request_queue *q, const char *page, size_t count)
42{ 42{
43 struct request_list *rl = &q->rq; 43 struct request_list *rl;
44 unsigned long nr; 44 unsigned long nr;
45 int ret; 45 int ret;
46 46
@@ -55,6 +55,9 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
55 q->nr_requests = nr; 55 q->nr_requests = nr;
56 blk_queue_congestion_threshold(q); 56 blk_queue_congestion_threshold(q);
57 57
58 /* congestion isn't cgroup aware and follows root blkcg for now */
59 rl = &q->root_rl;
60
58 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) 61 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
59 blk_set_queue_congested(q, BLK_RW_SYNC); 62 blk_set_queue_congested(q, BLK_RW_SYNC);
60 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) 63 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
@@ -65,19 +68,22 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
65 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) 68 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
66 blk_clear_queue_congested(q, BLK_RW_ASYNC); 69 blk_clear_queue_congested(q, BLK_RW_ASYNC);
67 70
68 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { 71 blk_queue_for_each_rl(rl, q) {
69 blk_set_rl_full(rl, BLK_RW_SYNC); 72 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
70 } else { 73 blk_set_rl_full(rl, BLK_RW_SYNC);
71 blk_clear_rl_full(rl, BLK_RW_SYNC); 74 } else {
72 wake_up(&rl->wait[BLK_RW_SYNC]); 75 blk_clear_rl_full(rl, BLK_RW_SYNC);
76 wake_up(&rl->wait[BLK_RW_SYNC]);
77 }
78
79 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
80 blk_set_rl_full(rl, BLK_RW_ASYNC);
81 } else {
82 blk_clear_rl_full(rl, BLK_RW_ASYNC);
83 wake_up(&rl->wait[BLK_RW_ASYNC]);
84 }
73 } 85 }
74 86
75 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
76 blk_set_rl_full(rl, BLK_RW_ASYNC);
77 } else {
78 blk_clear_rl_full(rl, BLK_RW_ASYNC);
79 wake_up(&rl->wait[BLK_RW_ASYNC]);
80 }
81 spin_unlock_irq(q->queue_lock); 87 spin_unlock_irq(q->queue_lock);
82 return ret; 88 return ret;
83} 89}
@@ -488,7 +494,7 @@ static void blk_release_queue(struct kobject *kobj)
488 elevator_exit(q->elevator); 494 elevator_exit(q->elevator);
489 } 495 }
490 496
491 blk_exit_rl(&q->rq); 497 blk_exit_rl(&q->root_rl);
492 498
493 if (q->queue_tags) 499 if (q->queue_tags)
494 __blk_queue_free_tags(q); 500 __blk_queue_free_tags(q);