author     Linus Torvalds <torvalds@linux-foundation.org>   2013-07-11 16:03:24 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-07-11 16:03:24 -0400
commit     36805aaea5ae3cf1bb32f1643e0a800bb69f0d5b (patch)
tree       5565132549a0733772b3a2ac6b5cda516ea8cdce
parent     6d2fa9e141ea56a571ec842fd4f3a86bea44a203 (diff)
parent     d50235b7bc3ee0a0427984d763ea7534149531b4 (diff)
Merge branch 'for-3.11/core' of git://git.kernel.dk/linux-block
Pull core block IO updates from Jens Axboe:
 "Here are the core IO block bits for 3.11. It contains:

   - A tweak to the reserved tag logic from Jan, for weirdo devices with
     just 3 free tags. But for those it improves things substantially
     for random writes.

   - Periodic writeback fix from Jan. Marked for stable as well.

   - Fix for a race condition in IO scheduler switching from Jianpeng.

   - The hierarchical blk-cgroup support from Tejun. This is the grunt
     of the series.

   - blk-throttle fix from Vivek.

  Just a note that I'm in the middle of a relocation, whole family is
  flying out tomorrow. Hence I will be awal the remainder of this week,
  but back at work again on Monday the 15th.

  CC'ing Tejun, since any potential "surprises" will most likely be from
  the blk-cgroup work. But it's been brewing for a while and sitting in
  my tree and linux-next for a long time, so should be solid."

* 'for-3.11/core' of git://git.kernel.dk/linux-block: (36 commits)
  elevator: Fix a race in elevator switching
  block: Reserve only one queue tag for sync IO if only 3 tags are available
  writeback: Fix periodic writeback after fs mount
  blk-throttle: implement proper hierarchy support
  blk-throttle: implement throtl_grp->has_rules[]
  blk-throttle: Account for child group's start time in parent while bio climbs up
  blk-throttle: add throtl_qnode for dispatch fairness
  blk-throttle: make throtl_pending_timer_fn() ready for hierarchy
  blk-throttle: make tg_dispatch_one_bio() ready for hierarchy
  blk-throttle: make blk_throtl_bio() ready for hierarchy
  blk-throttle: make blk_throtl_drain() ready for hierarchy
  blk-throttle: dispatch from throtl_pending_timer_fn()
  blk-throttle: implement dispatch looping
  blk-throttle: separate out throtl_service_queue->pending_timer from throtl_data->dispatch_work
  blk-throttle: set REQ_THROTTLED from throtl_charge_bio() and gate stats update with it
  blk-throttle: implement sq_to_tg(), sq_to_td() and throtl_log()
  blk-throttle: add throtl_service_queue->parent_sq
  blk-throttle: generalize update_disptime optimization in blk_throtl_bio()
  blk-throttle: dispatch to throtl_data->service_queue.bio_lists[]
  blk-throttle: move bio_lists[] and friends to throtl_service_queue
  ...
-rw-r--r--   Documentation/cgroups/blkio-controller.txt     29
-rw-r--r--   block/blk-cgroup.c                             105
-rw-r--r--   block/blk-cgroup.h                              38
-rw-r--r--   block/blk-tag.c                                 11
-rw-r--r--   block/blk-throttle.c                          1064
-rw-r--r--   block/cfq-iosched.c                             17
-rw-r--r--   block/deadline-iosched.c                        16
-rw-r--r--   block/elevator.c                                25
-rw-r--r--   block/noop-iosched.c                            17
-rw-r--r--   fs/block_dev.c                                   9
-rw-r--r--   include/linux/cgroup.h                           2
-rw-r--r--   include/linux/elevator.h                         6
12 files changed, 905 insertions, 434 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index da272c8f44e7..cd556b914786 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -94,11 +94,13 @@ Throttling/Upper Limit policy
94 94
95Hierarchical Cgroups 95Hierarchical Cgroups
96==================== 96====================
97- Currently only CFQ supports hierarchical groups. For throttling,
98 cgroup interface does allow creation of hierarchical cgroups and
99 internally it treats them as flat hierarchy.
100 97
101 If somebody created a hierarchy like as follows. 98Both CFQ and throttling implement hierarchy support; however,
99throttling's hierarchy support is enabled iff "sane_behavior" is
100enabled from cgroup side, which currently is a development option and
101not publicly available.
102
103If somebody created a hierarchy like as follows.
102 104
103 root 105 root
104 / \ 106 / \
@@ -106,21 +108,20 @@ Hierarchical Cgroups
106 | 108 |
107 test3 109 test3
108 110
109 CFQ will handle the hierarchy correctly but and throttling will 111CFQ by default and throttling with "sane_behavior" will handle the
110 practically treat all groups at same level. For details on CFQ 112hierarchy correctly. For details on CFQ hierarchy support, refer to
111 hierarchy support, refer to Documentation/block/cfq-iosched.txt. 113Documentation/block/cfq-iosched.txt. For throttling, all limits apply
112 Throttling will treat the hierarchy as if it looks like the 114to the whole subtree while all statistics are local to the IOs
113 following. 115directly generated by tasks in that cgroup.
116
117Throttling without "sane_behavior" enabled from cgroup side will
118practically treat all groups at same level as if it looks like the
119following.
114 120
115 pivot 121 pivot
116 / / \ \ 122 / / \ \
117 root test1 test2 test3 123 root test1 test2 test3
118 124
119 Nesting cgroups, while allowed, isn't officially supported and blkio
120 genereates warning when cgroups nest. Once throttling implements
121 hierarchy support, hierarchy will be supported and the warning will
122 be removed.
123
124Various user visible config options 125Various user visible config options
125=================================== 126===================================
126CONFIG_BLK_CGROUP 127CONFIG_BLK_CGROUP
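
The subtree semantics spelled out in the documentation hunk above can be pictured with a small standalone model. Everything below is illustrative only (the struct and function names are invented); the actual implementation in block/blk-throttle.c further down achieves the same effect by queueing throttled bios and dispatching them upward level by level rather than by walking ancestors on every bio.

/*
 * Conceptual model only, not kernel code.  With "sane_behavior", I/O issued
 * in a child group must fit within the budget of that group and of every
 * ancestor, so bandwidth consumed by descendants counts against each
 * ancestor's limit.  The exported statistics, by contrast, stay local to
 * the group whose tasks issued the I/O.
 */
struct model_tg {
	struct model_tg *parent;
	unsigned long long bps_limit;		/* ~0ULL means "no limit" */
	unsigned long long bytes_this_window;
};

/* May @bytes be dispatched now without exceeding any limit on the path? */
static int model_subtree_may_dispatch(struct model_tg *tg,
				      unsigned long long bytes)
{
	for (; tg; tg = tg->parent)
		if (tg->bps_limit != ~0ULL &&
		    tg->bytes_this_window + bytes > tg->bps_limit)
			return 0;
	return 1;
}

/* Charging against the limits propagates all the way to the root. */
static void model_subtree_charge(struct model_tg *tg, unsigned long long bytes)
{
	for (; tg; tg = tg->parent)
		tg->bytes_this_window += bytes;
}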
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e8918ffaf96d..290792a13e3c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -32,26 +32,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
32 32
33static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 33static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
34 34
35static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
36 struct request_queue *q, bool update_hint);
37
38/**
39 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
40 * @d_blkg: loop cursor pointing to the current descendant
41 * @pos_cgrp: used for iteration
42 * @p_blkg: target blkg to walk descendants of
43 *
44 * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
45 * read locked. If called under either blkcg or queue lock, the iteration
46 * is guaranteed to include all and only online blkgs. The caller may
47 * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
48 * subtree.
49 */
50#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
51 cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
52 if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
53 (p_blkg)->q, false)))
54
55static bool blkcg_policy_enabled(struct request_queue *q, 35static bool blkcg_policy_enabled(struct request_queue *q,
56 const struct blkcg_policy *pol) 36 const struct blkcg_policy *pol)
57{ 37{
@@ -71,18 +51,8 @@ static void blkg_free(struct blkcg_gq *blkg)
71 if (!blkg) 51 if (!blkg)
72 return; 52 return;
73 53
74 for (i = 0; i < BLKCG_MAX_POLS; i++) { 54 for (i = 0; i < BLKCG_MAX_POLS; i++)
75 struct blkcg_policy *pol = blkcg_policy[i]; 55 kfree(blkg->pd[i]);
76 struct blkg_policy_data *pd = blkg->pd[i];
77
78 if (!pd)
79 continue;
80
81 if (pol && pol->pd_exit_fn)
82 pol->pd_exit_fn(blkg);
83
84 kfree(pd);
85 }
86 56
87 blk_exit_rl(&blkg->rl); 57 blk_exit_rl(&blkg->rl);
88 kfree(blkg); 58 kfree(blkg);
@@ -134,10 +104,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
134 blkg->pd[i] = pd; 104 blkg->pd[i] = pd;
135 pd->blkg = blkg; 105 pd->blkg = blkg;
136 pd->plid = i; 106 pd->plid = i;
137
138 /* invoke per-policy init */
139 if (pol->pd_init_fn)
140 pol->pd_init_fn(blkg);
141 } 107 }
142 108
143 return blkg; 109 return blkg;
@@ -158,8 +124,8 @@ err_free:
158 * @q's bypass state. If @update_hint is %true, the caller should be 124 * @q's bypass state. If @update_hint is %true, the caller should be
159 * holding @q->queue_lock and lookup hint is updated on success. 125 * holding @q->queue_lock and lookup hint is updated on success.
160 */ 126 */
161static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, 127struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
162 struct request_queue *q, bool update_hint) 128 bool update_hint)
163{ 129{
164 struct blkcg_gq *blkg; 130 struct blkcg_gq *blkg;
165 131
@@ -234,16 +200,25 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
234 } 200 }
235 blkg = new_blkg; 201 blkg = new_blkg;
236 202
237 /* link parent and insert */ 203 /* link parent */
238 if (blkcg_parent(blkcg)) { 204 if (blkcg_parent(blkcg)) {
239 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); 205 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
240 if (WARN_ON_ONCE(!blkg->parent)) { 206 if (WARN_ON_ONCE(!blkg->parent)) {
241 blkg = ERR_PTR(-EINVAL); 207 ret = -EINVAL;
242 goto err_put_css; 208 goto err_put_css;
243 } 209 }
244 blkg_get(blkg->parent); 210 blkg_get(blkg->parent);
245 } 211 }
246 212
213 /* invoke per-policy init */
214 for (i = 0; i < BLKCG_MAX_POLS; i++) {
215 struct blkcg_policy *pol = blkcg_policy[i];
216
217 if (blkg->pd[i] && pol->pd_init_fn)
218 pol->pd_init_fn(blkg);
219 }
220
221 /* insert */
247 spin_lock(&blkcg->lock); 222 spin_lock(&blkcg->lock);
248 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); 223 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
249 if (likely(!ret)) { 224 if (likely(!ret)) {
@@ -394,30 +369,38 @@ static void blkg_destroy_all(struct request_queue *q)
394 q->root_rl.blkg = NULL; 369 q->root_rl.blkg = NULL;
395} 370}
396 371
397static void blkg_rcu_free(struct rcu_head *rcu_head) 372/*
373 * A group is RCU protected, but having an rcu lock does not mean that one
374 * can access all the fields of blkg and assume these are valid. For
375 * example, don't try to follow throtl_data and request queue links.
376 *
377 * Having a reference to blkg under an rcu allows accesses to only values
378 * local to groups like group stats and group rate limits.
379 */
380void __blkg_release_rcu(struct rcu_head *rcu_head)
398{ 381{
399 blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); 382 struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
400} 383 int i;
384
385 /* tell policies that this one is being freed */
386 for (i = 0; i < BLKCG_MAX_POLS; i++) {
387 struct blkcg_policy *pol = blkcg_policy[i];
388
389 if (blkg->pd[i] && pol->pd_exit_fn)
390 pol->pd_exit_fn(blkg);
391 }
401 392
402void __blkg_release(struct blkcg_gq *blkg)
403{
404 /* release the blkcg and parent blkg refs this blkg has been holding */ 393 /* release the blkcg and parent blkg refs this blkg has been holding */
405 css_put(&blkg->blkcg->css); 394 css_put(&blkg->blkcg->css);
406 if (blkg->parent) 395 if (blkg->parent) {
396 spin_lock_irq(blkg->q->queue_lock);
407 blkg_put(blkg->parent); 397 blkg_put(blkg->parent);
398 spin_unlock_irq(blkg->q->queue_lock);
399 }
408 400
409 /* 401 blkg_free(blkg);
410 * A group is freed in rcu manner. But having an rcu lock does not
411 * mean that one can access all the fields of blkg and assume these
412 * are valid. For example, don't try to follow throtl_data and
413 * request queue links.
414 *
415 * Having a reference to blkg under an rcu allows acess to only
416 * values local to groups like group stats and group rate limits
417 */
418 call_rcu(&blkg->rcu_head, blkg_rcu_free);
419} 402}
420EXPORT_SYMBOL_GPL(__blkg_release); 403EXPORT_SYMBOL_GPL(__blkg_release_rcu);
421 404
422/* 405/*
423 * The next function used by blk_queue_for_each_rl(). It's a bit tricky 406 * The next function used by blk_queue_for_each_rl(). It's a bit tricky
@@ -928,14 +911,6 @@ struct cgroup_subsys blkio_subsys = {
928 .subsys_id = blkio_subsys_id, 911 .subsys_id = blkio_subsys_id,
929 .base_cftypes = blkcg_files, 912 .base_cftypes = blkcg_files,
930 .module = THIS_MODULE, 913 .module = THIS_MODULE,
931
932 /*
933 * blkio subsystem is utterly broken in terms of hierarchy support.
934 * It treats all cgroups equally regardless of where they're
935 * located in the hierarchy - all cgroups are treated as if they're
936 * right below the root. Fix it and remove the following.
937 */
938 .broken_hierarchy = true,
939}; 914};
940EXPORT_SYMBOL_GPL(blkio_subsys); 915EXPORT_SYMBOL_GPL(blkio_subsys);
941 916
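
The blk-cgroup.c hunks above reorder initialization and teardown: per-policy init now runs only after blkg->parent has been linked, and per-policy exit has moved into the RCU release path. A heavily simplified sketch of the resulting creation order (locking and the radix-tree insert omitted; the real function is blkg_create() above):

/*
 * Sketch only.  pd_init_fn() now runs after blkg->parent is set, so a
 * policy's init -- e.g. throtl_pd_init() further down -- can look at the
 * parent blkg when wiring up its own hierarchy.
 */
static struct blkcg_gq *example_blkg_create(struct blkcg *blkcg,
					    struct request_queue *q,
					    struct blkcg_gq *new_blkg)
{
	struct blkcg_gq *blkg = new_blkg;	/* pd[i] already allocated */
	int i;

	if (blkcg_parent(blkcg)) {
		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
		if (!blkg->parent)
			return ERR_PTR(-EINVAL); /* real code WARNs and bails */
		blkg_get(blkg->parent);		/* 1. link parent first */
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++)	/* 2. then per-policy init */
		if (blkg->pd[i] && blkcg_policy[i]->pd_init_fn)
			blkcg_policy[i]->pd_init_fn(blkg);

	/* 3. finally insert into blkcg->blkg_tree and q->blkg_list */
	return blkg;
}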
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 4e595ee8c915..8056c03a3382 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -266,7 +266,7 @@ static inline void blkg_get(struct blkcg_gq *blkg)
266 blkg->refcnt++; 266 blkg->refcnt++;
267} 267}
268 268
269void __blkg_release(struct blkcg_gq *blkg); 269void __blkg_release_rcu(struct rcu_head *rcu);
270 270
271/** 271/**
272 * blkg_put - put a blkg reference 272 * blkg_put - put a blkg reference
@@ -279,9 +279,43 @@ static inline void blkg_put(struct blkcg_gq *blkg)
279 lockdep_assert_held(blkg->q->queue_lock); 279 lockdep_assert_held(blkg->q->queue_lock);
280 WARN_ON_ONCE(blkg->refcnt <= 0); 280 WARN_ON_ONCE(blkg->refcnt <= 0);
281 if (!--blkg->refcnt) 281 if (!--blkg->refcnt)
282 __blkg_release(blkg); 282 call_rcu(&blkg->rcu_head, __blkg_release_rcu);
283} 283}
284 284
285struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
286 bool update_hint);
287
288/**
289 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
290 * @d_blkg: loop cursor pointing to the current descendant
291 * @pos_cgrp: used for iteration
292 * @p_blkg: target blkg to walk descendants of
293 *
294 * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
295 * read locked. If called under either blkcg or queue lock, the iteration
296 * is guaranteed to include all and only online blkgs. The caller may
297 * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
298 * subtree.
299 */
300#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
301 cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
302 if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
303 (p_blkg)->q, false)))
304
305/**
306 * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
307 * @d_blkg: loop cursor pointing to the current descendant
308 * @pos_cgrp: used for iteration
309 * @p_blkg: target blkg to walk descendants of
310 *
311 * Similar to blkg_for_each_descendant_pre() but performs post-order
312 * traversal instead. Synchronization rules are the same.
313 */
314#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg) \
315 cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
316 if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
317 (p_blkg)->q, false)))
318
285/** 319/**
286 * blk_get_rl - get request_list to use 320 * blk_get_rl - get request_list to use
287 * @q: request_queue of interest 321 * @q: request_queue of interest
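
With the descendant iterators now exposed in this header, any blkcg policy can walk a blkg's subtree. A hypothetical caller (not part of this series) might look like the sketch below; it assumes only what the macro's own comment states, namely that the walk runs under rcu_read_lock():

/* Illustrative only: count the online descendant blkgs of @blkg. */
static unsigned int example_count_descendants(struct blkcg_gq *blkg)
{
	struct blkcg_gq *d_blkg;
	struct cgroup *pos_cgrp;
	unsigned int n = 0;

	rcu_read_lock();
	blkg_for_each_descendant_pre(d_blkg, pos_cgrp, blkg)
		n++;	/* pre-order: parents are visited before children */
	rcu_read_unlock();

	return n;
}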
diff --git a/block/blk-tag.c b/block/blk-tag.c
index cc345e1d8d4e..3f33d8672268 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -348,9 +348,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
348 */ 348 */
349 max_depth = bqt->max_depth; 349 max_depth = bqt->max_depth;
350 if (!rq_is_sync(rq) && max_depth > 1) { 350 if (!rq_is_sync(rq) && max_depth > 1) {
351 max_depth -= 2; 351 switch (max_depth) {
352 if (!max_depth) 352 case 2:
353 max_depth = 1; 353 max_depth = 1;
354 break;
355 case 3:
356 max_depth = 2;
357 break;
358 default:
359 max_depth -= 2;
360 }
354 if (q->in_flight[BLK_RW_ASYNC] > max_depth) 361 if (q->in_flight[BLK_RW_ASYNC] > max_depth)
355 return 1; 362 return 1;
356 } 363 }
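
Restated outside the driver, the new tag reservation rule looks like the sketch below (illustrative helper, not from the patch). With only 3 tags, async I/O may now use 2 of them instead of 1, which is the improvement for "weirdo devices with just 3 free tags" mentioned in the pull request; one tag is still held back for sync I/O.

/* Effective async tag depth for a queue with @max_depth tags (sketch). */
static unsigned int example_async_tag_depth(unsigned int max_depth)
{
	switch (max_depth) {
	case 0:
	case 1:
		return max_depth;	/* too shallow to reserve anything */
	case 2:
		return 1;		/* 1 async + 1 reserved for sync */
	case 3:
		return 2;		/* 2 async + 1 reserved for sync */
	default:
		return max_depth - 2;	/* keep 2 in reserve, as before */
	}
}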
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 31146225f3d0..08a32dfd3844 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -25,18 +25,61 @@ static struct blkcg_policy blkcg_policy_throtl;
25 25
26/* A workqueue to queue throttle related work */ 26/* A workqueue to queue throttle related work */
27static struct workqueue_struct *kthrotld_workqueue; 27static struct workqueue_struct *kthrotld_workqueue;
28static void throtl_schedule_delayed_work(struct throtl_data *td, 28
29 unsigned long delay); 29/*
30 30 * To implement hierarchical throttling, throtl_grps form a tree and bios
31struct throtl_rb_root { 31 * are dispatched upwards level by level until they reach the top and get
32 struct rb_root rb; 32 * issued. When dispatching bios from the children and local group at each
33 struct rb_node *left; 33 * level, if the bios are dispatched into a single bio_list, there's a risk
34 unsigned int count; 34 * of a local or child group which can queue many bios at once filling up
35 unsigned long min_disptime; 35 * the list starving others.
36 *
37 * To avoid such starvation, dispatched bios are queued separately
38 * according to where they came from. When they are again dispatched to
39 * the parent, they're popped in round-robin order so that no single source
40 * hogs the dispatch window.
41 *
42 * throtl_qnode is used to keep the queued bios separated by their sources.
43 * Bios are queued to throtl_qnode which in turn is queued to
44 * throtl_service_queue and then dispatched in round-robin order.
45 *
46 * It's also used to track the reference counts on blkg's. A qnode always
47 * belongs to a throtl_grp and gets queued on itself or the parent, so
48 * incrementing the reference of the associated throtl_grp when a qnode is
49 * queued and decrementing when dequeued is enough to keep the whole blkg
50 * tree pinned while bios are in flight.
51 */
52struct throtl_qnode {
53 struct list_head node; /* service_queue->queued[] */
54 struct bio_list bios; /* queued bios */
55 struct throtl_grp *tg; /* tg this qnode belongs to */
36}; 56};
37 57
38#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ 58struct throtl_service_queue {
39 .count = 0, .min_disptime = 0} 59 struct throtl_service_queue *parent_sq; /* the parent service_queue */
60
61 /*
62 * Bios queued directly to this service_queue or dispatched from
63 * children throtl_grp's.
64 */
65 struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
66 unsigned int nr_queued[2]; /* number of queued bios */
67
68 /*
69 * RB tree of active children throtl_grp's, which are sorted by
70 * their ->disptime.
71 */
72 struct rb_root pending_tree; /* RB tree of active tgs */
73 struct rb_node *first_pending; /* first node in the tree */
74 unsigned int nr_pending; /* # queued in the tree */
75 unsigned long first_pending_disptime; /* disptime of the first tg */
76 struct timer_list pending_timer; /* fires on first_pending_disptime */
77};
78
79enum tg_state_flags {
80 THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
81 THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
82};
40 83
41#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 84#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
42 85
@@ -52,9 +95,26 @@ struct throtl_grp {
52 /* must be the first member */ 95 /* must be the first member */
53 struct blkg_policy_data pd; 96 struct blkg_policy_data pd;
54 97
55 /* active throtl group service_tree member */ 98 /* active throtl group service_queue member */
56 struct rb_node rb_node; 99 struct rb_node rb_node;
57 100
101 /* throtl_data this group belongs to */
102 struct throtl_data *td;
103
104 /* this group's service queue */
105 struct throtl_service_queue service_queue;
106
107 /*
108 * qnode_on_self is used when bios are directly queued to this
109 * throtl_grp so that local bios compete fairly with bios
110 * dispatched from children. qnode_on_parent is used when bios are
111 * dispatched from this throtl_grp into its parent and will compete
112 * with the sibling qnode_on_parents and the parent's
113 * qnode_on_self.
114 */
115 struct throtl_qnode qnode_on_self[2];
116 struct throtl_qnode qnode_on_parent[2];
117
58 /* 118 /*
59 * Dispatch time in jiffies. This is the estimated time when group 119 * Dispatch time in jiffies. This is the estimated time when group
60 * will unthrottle and is ready to dispatch more bio. It is used as 120 * will unthrottle and is ready to dispatch more bio. It is used as
@@ -64,11 +124,8 @@ struct throtl_grp {
64 124
65 unsigned int flags; 125 unsigned int flags;
66 126
67 /* Two lists for READ and WRITE */ 127 /* are there any throtl rules between this group and td? */
68 struct bio_list bio_lists[2]; 128 bool has_rules[2];
69
70 /* Number of queued bios on READ and WRITE lists */
71 unsigned int nr_queued[2];
72 129
73 /* bytes per second rate limits */ 130 /* bytes per second rate limits */
74 uint64_t bps[2]; 131 uint64_t bps[2];
@@ -85,9 +142,6 @@ struct throtl_grp {
85 unsigned long slice_start[2]; 142 unsigned long slice_start[2];
86 unsigned long slice_end[2]; 143 unsigned long slice_end[2];
87 144
88 /* Some throttle limits got updated for the group */
89 int limits_changed;
90
91 /* Per cpu stats pointer */ 145 /* Per cpu stats pointer */
92 struct tg_stats_cpu __percpu *stats_cpu; 146 struct tg_stats_cpu __percpu *stats_cpu;
93 147
@@ -98,7 +152,7 @@ struct throtl_grp {
98struct throtl_data 152struct throtl_data
99{ 153{
100 /* service tree for active throtl groups */ 154 /* service tree for active throtl groups */
101 struct throtl_rb_root tg_service_tree; 155 struct throtl_service_queue service_queue;
102 156
103 struct request_queue *queue; 157 struct request_queue *queue;
104 158
@@ -111,9 +165,7 @@ struct throtl_data
111 unsigned int nr_undestroyed_grps; 165 unsigned int nr_undestroyed_grps;
112 166
113 /* Work for dispatching throttled bios */ 167 /* Work for dispatching throttled bios */
114 struct delayed_work throtl_work; 168 struct work_struct dispatch_work;
115
116 int limits_changed;
117}; 169};
118 170
119/* list and work item to allocate percpu group stats */ 171/* list and work item to allocate percpu group stats */
@@ -123,6 +175,8 @@ static LIST_HEAD(tg_stats_alloc_list);
123static void tg_stats_alloc_fn(struct work_struct *); 175static void tg_stats_alloc_fn(struct work_struct *);
124static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); 176static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
125 177
178static void throtl_pending_timer_fn(unsigned long arg);
179
126static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) 180static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
127{ 181{
128 return pd ? container_of(pd, struct throtl_grp, pd) : NULL; 182 return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
@@ -143,41 +197,65 @@ static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
143 return blkg_to_tg(td->queue->root_blkg); 197 return blkg_to_tg(td->queue->root_blkg);
144} 198}
145 199
146enum tg_state_flags { 200/**
147 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ 201 * sq_to_tg - return the throl_grp the specified service queue belongs to
148}; 202 * @sq: the throtl_service_queue of interest
149 203 *
150#define THROTL_TG_FNS(name) \ 204 * Return the throtl_grp @sq belongs to. If @sq is the top-level one
151static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ 205 * embedded in throtl_data, %NULL is returned.
152{ \ 206 */
153 (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ 207static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
154} \ 208{
155static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ 209 if (sq && sq->parent_sq)
156{ \ 210 return container_of(sq, struct throtl_grp, service_queue);
157 (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ 211 else
158} \ 212 return NULL;
159static inline int throtl_tg_##name(const struct throtl_grp *tg) \
160{ \
161 return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
162} 213}
163 214
164THROTL_TG_FNS(on_rr); 215/**
216 * sq_to_td - return throtl_data the specified service queue belongs to
217 * @sq: the throtl_service_queue of interest
218 *
219 * A service_queue can be embeded in either a throtl_grp or throtl_data.
220 * Determine the associated throtl_data accordingly and return it.
221 */
222static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
223{
224 struct throtl_grp *tg = sq_to_tg(sq);
165 225
166#define throtl_log_tg(td, tg, fmt, args...) do { \ 226 if (tg)
167 char __pbuf[128]; \ 227 return tg->td;
228 else
229 return container_of(sq, struct throtl_data, service_queue);
230}
231
232/**
233 * throtl_log - log debug message via blktrace
234 * @sq: the service_queue being reported
235 * @fmt: printf format string
236 * @args: printf args
237 *
238 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
239 * throtl_grp; otherwise, just "throtl".
240 *
241 * TODO: this should be made a function and name formatting should happen
242 * after testing whether blktrace is enabled.
243 */
244#define throtl_log(sq, fmt, args...) do { \
245 struct throtl_grp *__tg = sq_to_tg((sq)); \
246 struct throtl_data *__td = sq_to_td((sq)); \
247 \
248 (void)__td; \
249 if ((__tg)) { \
250 char __pbuf[128]; \
168 \ 251 \
169 blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \ 252 blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \
170 blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \ 253 blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
254 } else { \
255 blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
256 } \
171} while (0) 257} while (0)
172 258
173#define throtl_log(td, fmt, args...) \
174 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
175
176static inline unsigned int total_nr_queued(struct throtl_data *td)
177{
178 return td->nr_queued[0] + td->nr_queued[1];
179}
180
181/* 259/*
182 * Worker for allocating per cpu stat for tgs. This is scheduled on the 260 * Worker for allocating per cpu stat for tgs. This is scheduled on the
183 * system_wq once there are some groups on the alloc_list waiting for 261 * system_wq once there are some groups on the alloc_list waiting for
@@ -215,15 +293,141 @@ alloc_stats:
215 goto alloc_stats; 293 goto alloc_stats;
216} 294}
217 295
296static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
297{
298 INIT_LIST_HEAD(&qn->node);
299 bio_list_init(&qn->bios);
300 qn->tg = tg;
301}
302
303/**
304 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
305 * @bio: bio being added
306 * @qn: qnode to add bio to
307 * @queued: the service_queue->queued[] list @qn belongs to
308 *
309 * Add @bio to @qn and put @qn on @queued if it's not already on.
310 * @qn->tg's reference count is bumped when @qn is activated. See the
311 * comment on top of throtl_qnode definition for details.
312 */
313static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
314 struct list_head *queued)
315{
316 bio_list_add(&qn->bios, bio);
317 if (list_empty(&qn->node)) {
318 list_add_tail(&qn->node, queued);
319 blkg_get(tg_to_blkg(qn->tg));
320 }
321}
322
323/**
324 * throtl_peek_queued - peek the first bio on a qnode list
325 * @queued: the qnode list to peek
326 */
327static struct bio *throtl_peek_queued(struct list_head *queued)
328{
329 struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
330 struct bio *bio;
331
332 if (list_empty(queued))
333 return NULL;
334
335 bio = bio_list_peek(&qn->bios);
336 WARN_ON_ONCE(!bio);
337 return bio;
338}
339
340/**
341 * throtl_pop_queued - pop the first bio form a qnode list
342 * @queued: the qnode list to pop a bio from
343 * @tg_to_put: optional out argument for throtl_grp to put
344 *
345 * Pop the first bio from the qnode list @queued. After popping, the first
346 * qnode is removed from @queued if empty or moved to the end of @queued so
347 * that the popping order is round-robin.
348 *
349 * When the first qnode is removed, its associated throtl_grp should be put
350 * too. If @tg_to_put is NULL, this function automatically puts it;
351 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
352 * responsible for putting it.
353 */
354static struct bio *throtl_pop_queued(struct list_head *queued,
355 struct throtl_grp **tg_to_put)
356{
357 struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
358 struct bio *bio;
359
360 if (list_empty(queued))
361 return NULL;
362
363 bio = bio_list_pop(&qn->bios);
364 WARN_ON_ONCE(!bio);
365
366 if (bio_list_empty(&qn->bios)) {
367 list_del_init(&qn->node);
368 if (tg_to_put)
369 *tg_to_put = qn->tg;
370 else
371 blkg_put(tg_to_blkg(qn->tg));
372 } else {
373 list_move_tail(&qn->node, queued);
374 }
375
376 return bio;
377}
378
379/* init a service_queue, assumes the caller zeroed it */
380static void throtl_service_queue_init(struct throtl_service_queue *sq,
381 struct throtl_service_queue *parent_sq)
382{
383 INIT_LIST_HEAD(&sq->queued[0]);
384 INIT_LIST_HEAD(&sq->queued[1]);
385 sq->pending_tree = RB_ROOT;
386 sq->parent_sq = parent_sq;
387 setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
388 (unsigned long)sq);
389}
390
391static void throtl_service_queue_exit(struct throtl_service_queue *sq)
392{
393 del_timer_sync(&sq->pending_timer);
394}
395
218static void throtl_pd_init(struct blkcg_gq *blkg) 396static void throtl_pd_init(struct blkcg_gq *blkg)
219{ 397{
220 struct throtl_grp *tg = blkg_to_tg(blkg); 398 struct throtl_grp *tg = blkg_to_tg(blkg);
399 struct throtl_data *td = blkg->q->td;
400 struct throtl_service_queue *parent_sq;
221 unsigned long flags; 401 unsigned long flags;
402 int rw;
403
404 /*
405 * If sane_hierarchy is enabled, we switch to properly hierarchical
406 * behavior where limits on a given throtl_grp are applied to the
407 * whole subtree rather than just the group itself. e.g. If 16M
408 * read_bps limit is set on the root group, the whole system can't
409 * exceed 16M for the device.
410 *
411 * If sane_hierarchy is not enabled, the broken flat hierarchy
412 * behavior is retained where all throtl_grps are treated as if
413 * they're all separate root groups right below throtl_data.
414 * Limits of a group don't interact with limits of other groups
415 * regardless of the position of the group in the hierarchy.
416 */
417 parent_sq = &td->service_queue;
418
419 if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent)
420 parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
421
422 throtl_service_queue_init(&tg->service_queue, parent_sq);
423
424 for (rw = READ; rw <= WRITE; rw++) {
425 throtl_qnode_init(&tg->qnode_on_self[rw], tg);
426 throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
427 }
222 428
223 RB_CLEAR_NODE(&tg->rb_node); 429 RB_CLEAR_NODE(&tg->rb_node);
224 bio_list_init(&tg->bio_lists[0]); 430 tg->td = td;
225 bio_list_init(&tg->bio_lists[1]);
226 tg->limits_changed = false;
227 431
228 tg->bps[READ] = -1; 432 tg->bps[READ] = -1;
229 tg->bps[WRITE] = -1; 433 tg->bps[WRITE] = -1;
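
To see the fairness property the qnode helpers above provide, consider two children queueing onto the same parent service_queue. The function below is illustrative and not part of the patch; it assumes the queue lock is held, since throtl_pop_queued() may drop a blkg reference.

/* Bios queued from two sources pop in round-robin order: a1, b1, a2. */
static void example_qnode_round_robin(struct throtl_grp *child_a,
				      struct throtl_grp *child_b,
				      struct throtl_service_queue *parent_sq,
				      struct bio *a1, struct bio *a2,
				      struct bio *b1)
{
	struct bio *bio;

	throtl_qnode_add_bio(a1, &child_a->qnode_on_parent[READ],
			     &parent_sq->queued[READ]);
	throtl_qnode_add_bio(a2, &child_a->qnode_on_parent[READ],
			     &parent_sq->queued[READ]);
	throtl_qnode_add_bio(b1, &child_b->qnode_on_parent[READ],
			     &parent_sq->queued[READ]);

	/* child_a's qnode is rotated to the tail after the first pop */
	bio = throtl_pop_queued(&parent_sq->queued[READ], NULL);	/* a1 */
	bio = throtl_pop_queued(&parent_sq->queued[READ], NULL);	/* b1 */
	bio = throtl_pop_queued(&parent_sq->queued[READ], NULL);	/* a2 */
	(void)bio;
}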
@@ -241,6 +445,30 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
241 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 445 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
242} 446}
243 447
448/*
449 * Set has_rules[] if @tg or any of its parents have limits configured.
450 * This doesn't require walking up to the top of the hierarchy as the
451 * parent's has_rules[] is guaranteed to be correct.
452 */
453static void tg_update_has_rules(struct throtl_grp *tg)
454{
455 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
456 int rw;
457
458 for (rw = READ; rw <= WRITE; rw++)
459 tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
460 (tg->bps[rw] != -1 || tg->iops[rw] != -1);
461}
462
463static void throtl_pd_online(struct blkcg_gq *blkg)
464{
465 /*
466 * We don't want new groups to escape the limits of its ancestors.
467 * Update has_rules[] after a new group is brought online.
468 */
469 tg_update_has_rules(blkg_to_tg(blkg));
470}
471
244static void throtl_pd_exit(struct blkcg_gq *blkg) 472static void throtl_pd_exit(struct blkcg_gq *blkg)
245{ 473{
246 struct throtl_grp *tg = blkg_to_tg(blkg); 474 struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -251,6 +479,8 @@ static void throtl_pd_exit(struct blkcg_gq *blkg)
251 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 479 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
252 480
253 free_percpu(tg->stats_cpu); 481 free_percpu(tg->stats_cpu);
482
483 throtl_service_queue_exit(&tg->service_queue);
254} 484}
255 485
256static void throtl_pd_reset_stats(struct blkcg_gq *blkg) 486static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
@@ -309,17 +539,18 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
309 return tg; 539 return tg;
310} 540}
311 541
312static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) 542static struct throtl_grp *
543throtl_rb_first(struct throtl_service_queue *parent_sq)
313{ 544{
314 /* Service tree is empty */ 545 /* Service tree is empty */
315 if (!root->count) 546 if (!parent_sq->nr_pending)
316 return NULL; 547 return NULL;
317 548
318 if (!root->left) 549 if (!parent_sq->first_pending)
319 root->left = rb_first(&root->rb); 550 parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
320 551
321 if (root->left) 552 if (parent_sq->first_pending)
322 return rb_entry_tg(root->left); 553 return rb_entry_tg(parent_sq->first_pending);
323 554
324 return NULL; 555 return NULL;
325} 556}
@@ -330,29 +561,30 @@ static void rb_erase_init(struct rb_node *n, struct rb_root *root)
330 RB_CLEAR_NODE(n); 561 RB_CLEAR_NODE(n);
331} 562}
332 563
333static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) 564static void throtl_rb_erase(struct rb_node *n,
565 struct throtl_service_queue *parent_sq)
334{ 566{
335 if (root->left == n) 567 if (parent_sq->first_pending == n)
336 root->left = NULL; 568 parent_sq->first_pending = NULL;
337 rb_erase_init(n, &root->rb); 569 rb_erase_init(n, &parent_sq->pending_tree);
338 --root->count; 570 --parent_sq->nr_pending;
339} 571}
340 572
341static void update_min_dispatch_time(struct throtl_rb_root *st) 573static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
342{ 574{
343 struct throtl_grp *tg; 575 struct throtl_grp *tg;
344 576
345 tg = throtl_rb_first(st); 577 tg = throtl_rb_first(parent_sq);
346 if (!tg) 578 if (!tg)
347 return; 579 return;
348 580
349 st->min_disptime = tg->disptime; 581 parent_sq->first_pending_disptime = tg->disptime;
350} 582}
351 583
352static void 584static void tg_service_queue_add(struct throtl_grp *tg)
353tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
354{ 585{
355 struct rb_node **node = &st->rb.rb_node; 586 struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
587 struct rb_node **node = &parent_sq->pending_tree.rb_node;
356 struct rb_node *parent = NULL; 588 struct rb_node *parent = NULL;
357 struct throtl_grp *__tg; 589 struct throtl_grp *__tg;
358 unsigned long key = tg->disptime; 590 unsigned long key = tg->disptime;
@@ -371,89 +603,135 @@ tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
371 } 603 }
372 604
373 if (left) 605 if (left)
374 st->left = &tg->rb_node; 606 parent_sq->first_pending = &tg->rb_node;
375 607
376 rb_link_node(&tg->rb_node, parent, node); 608 rb_link_node(&tg->rb_node, parent, node);
377 rb_insert_color(&tg->rb_node, &st->rb); 609 rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
378} 610}
379 611
380static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) 612static void __throtl_enqueue_tg(struct throtl_grp *tg)
381{ 613{
382 struct throtl_rb_root *st = &td->tg_service_tree; 614 tg_service_queue_add(tg);
615 tg->flags |= THROTL_TG_PENDING;
616 tg->service_queue.parent_sq->nr_pending++;
617}
383 618
384 tg_service_tree_add(st, tg); 619static void throtl_enqueue_tg(struct throtl_grp *tg)
385 throtl_mark_tg_on_rr(tg); 620{
386 st->count++; 621 if (!(tg->flags & THROTL_TG_PENDING))
622 __throtl_enqueue_tg(tg);
387} 623}
388 624
389static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) 625static void __throtl_dequeue_tg(struct throtl_grp *tg)
390{ 626{
391 if (!throtl_tg_on_rr(tg)) 627 throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
392 __throtl_enqueue_tg(td, tg); 628 tg->flags &= ~THROTL_TG_PENDING;
393} 629}
394 630
395static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) 631static void throtl_dequeue_tg(struct throtl_grp *tg)
396{ 632{
397 throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); 633 if (tg->flags & THROTL_TG_PENDING)
398 throtl_clear_tg_on_rr(tg); 634 __throtl_dequeue_tg(tg);
399} 635}
400 636
401static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) 637/* Call with queue lock held */
638static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
639 unsigned long expires)
402{ 640{
403 if (throtl_tg_on_rr(tg)) 641 mod_timer(&sq->pending_timer, expires);
404 __throtl_dequeue_tg(td, tg); 642 throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
643 expires - jiffies, jiffies);
405} 644}
406 645
407static void throtl_schedule_next_dispatch(struct throtl_data *td) 646/**
647 * throtl_schedule_next_dispatch - schedule the next dispatch cycle
648 * @sq: the service_queue to schedule dispatch for
649 * @force: force scheduling
650 *
651 * Arm @sq->pending_timer so that the next dispatch cycle starts on the
652 * dispatch time of the first pending child. Returns %true if either timer
653 * is armed or there's no pending child left. %false if the current
654 * dispatch window is still open and the caller should continue
655 * dispatching.
656 *
657 * If @force is %true, the dispatch timer is always scheduled and this
658 * function is guaranteed to return %true. This is to be used when the
659 * caller can't dispatch itself and needs to invoke pending_timer
660 * unconditionally. Note that forced scheduling is likely to induce short
661 * delay before dispatch starts even if @sq->first_pending_disptime is not
662 * in the future and thus shouldn't be used in hot paths.
663 */
664static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
665 bool force)
408{ 666{
409 struct throtl_rb_root *st = &td->tg_service_tree; 667 /* any pending children left? */
668 if (!sq->nr_pending)
669 return true;
410 670
411 /* 671 update_min_dispatch_time(sq);
412 * If there are more bios pending, schedule more work.
413 */
414 if (!total_nr_queued(td))
415 return;
416 672
417 BUG_ON(!st->count); 673 /* is the next dispatch time in the future? */
674 if (force || time_after(sq->first_pending_disptime, jiffies)) {
675 throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
676 return true;
677 }
418 678
419 update_min_dispatch_time(st); 679 /* tell the caller to continue dispatching */
680 return false;
681}
420 682
421 if (time_before_eq(st->min_disptime, jiffies)) 683static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
422 throtl_schedule_delayed_work(td, 0); 684 bool rw, unsigned long start)
423 else 685{
424 throtl_schedule_delayed_work(td, (st->min_disptime - jiffies)); 686 tg->bytes_disp[rw] = 0;
687 tg->io_disp[rw] = 0;
688
689 /*
690 * Previous slice has expired. We must have trimmed it after last
691 * bio dispatch. That means since start of last slice, we never used
692 * that bandwidth. Do try to make use of that bandwidth while giving
693 * credit.
694 */
695 if (time_after_eq(start, tg->slice_start[rw]))
696 tg->slice_start[rw] = start;
697
698 tg->slice_end[rw] = jiffies + throtl_slice;
699 throtl_log(&tg->service_queue,
700 "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
701 rw == READ ? 'R' : 'W', tg->slice_start[rw],
702 tg->slice_end[rw], jiffies);
425} 703}
426 704
427static inline void 705static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
428throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
429{ 706{
430 tg->bytes_disp[rw] = 0; 707 tg->bytes_disp[rw] = 0;
431 tg->io_disp[rw] = 0; 708 tg->io_disp[rw] = 0;
432 tg->slice_start[rw] = jiffies; 709 tg->slice_start[rw] = jiffies;
433 tg->slice_end[rw] = jiffies + throtl_slice; 710 tg->slice_end[rw] = jiffies + throtl_slice;
434 throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", 711 throtl_log(&tg->service_queue,
435 rw == READ ? 'R' : 'W', tg->slice_start[rw], 712 "[%c] new slice start=%lu end=%lu jiffies=%lu",
436 tg->slice_end[rw], jiffies); 713 rw == READ ? 'R' : 'W', tg->slice_start[rw],
714 tg->slice_end[rw], jiffies);
437} 715}
438 716
439static inline void throtl_set_slice_end(struct throtl_data *td, 717static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
440 struct throtl_grp *tg, bool rw, unsigned long jiffy_end) 718 unsigned long jiffy_end)
441{ 719{
442 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 720 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
443} 721}
444 722
445static inline void throtl_extend_slice(struct throtl_data *td, 723static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
446 struct throtl_grp *tg, bool rw, unsigned long jiffy_end) 724 unsigned long jiffy_end)
447{ 725{
448 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 726 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
449 throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", 727 throtl_log(&tg->service_queue,
450 rw == READ ? 'R' : 'W', tg->slice_start[rw], 728 "[%c] extend slice start=%lu end=%lu jiffies=%lu",
451 tg->slice_end[rw], jiffies); 729 rw == READ ? 'R' : 'W', tg->slice_start[rw],
730 tg->slice_end[rw], jiffies);
452} 731}
453 732
454/* Determine if previously allocated or extended slice is complete or not */ 733/* Determine if previously allocated or extended slice is complete or not */
455static bool 734static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
456throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
457{ 735{
458 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 736 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
459 return 0; 737 return 0;
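
The contract of throtl_schedule_next_dispatch() above implies a particular caller pattern: keep dispatching while it returns false, stop once it returns true. The sketch below shows that pattern in isolation; the real loop is throtl_pending_timer_fn() further down in this diff, which additionally propagates dispatch to the parent service_queue.

/* Sketch of a dispatch loop; assumes q->queue_lock is held on entry. */
static void example_dispatch_loop(struct throtl_service_queue *sq,
				  struct request_queue *q)
{
	while (true) {
		throtl_select_dispatch(sq);	/* move ready bios upward */

		if (throtl_schedule_next_dispatch(sq, false))
			break;			/* timer armed or nothing pending */

		/* window still open: give the lock a breather and retry */
		spin_unlock_irq(q->queue_lock);
		cpu_relax();
		spin_lock_irq(q->queue_lock);
	}
}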
@@ -462,8 +740,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
462} 740}
463 741
464/* Trim the used slices and adjust slice start accordingly */ 742/* Trim the used slices and adjust slice start accordingly */
465static inline void 743static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
466throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
467{ 744{
468 unsigned long nr_slices, time_elapsed, io_trim; 745 unsigned long nr_slices, time_elapsed, io_trim;
469 u64 bytes_trim, tmp; 746 u64 bytes_trim, tmp;
@@ -475,7 +752,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
475 * renewed. Don't try to trim the slice if slice is used. A new 752 * renewed. Don't try to trim the slice if slice is used. A new
476 * slice will start when appropriate. 753 * slice will start when appropriate.
477 */ 754 */
478 if (throtl_slice_used(td, tg, rw)) 755 if (throtl_slice_used(tg, rw))
479 return; 756 return;
480 757
481 /* 758 /*
@@ -486,7 +763,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
486 * is bad because it does not allow new slice to start. 763 * is bad because it does not allow new slice to start.
487 */ 764 */
488 765
489 throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice); 766 throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
490 767
491 time_elapsed = jiffies - tg->slice_start[rw]; 768 time_elapsed = jiffies - tg->slice_start[rw];
492 769
@@ -515,14 +792,14 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
515 792
516 tg->slice_start[rw] += nr_slices * throtl_slice; 793 tg->slice_start[rw] += nr_slices * throtl_slice;
517 794
518 throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" 795 throtl_log(&tg->service_queue,
519 " start=%lu end=%lu jiffies=%lu", 796 "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
520 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, 797 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
521 tg->slice_start[rw], tg->slice_end[rw], jiffies); 798 tg->slice_start[rw], tg->slice_end[rw], jiffies);
522} 799}
523 800
524static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, 801static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
525 struct bio *bio, unsigned long *wait) 802 unsigned long *wait)
526{ 803{
527 bool rw = bio_data_dir(bio); 804 bool rw = bio_data_dir(bio);
528 unsigned int io_allowed; 805 unsigned int io_allowed;
@@ -571,8 +848,8 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
571 return 0; 848 return 0;
572} 849}
573 850
574static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, 851static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
575 struct bio *bio, unsigned long *wait) 852 unsigned long *wait)
576{ 853{
577 bool rw = bio_data_dir(bio); 854 bool rw = bio_data_dir(bio);
578 u64 bytes_allowed, extra_bytes, tmp; 855 u64 bytes_allowed, extra_bytes, tmp;
@@ -613,18 +890,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
613 return 0; 890 return 0;
614} 891}
615 892
616static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
617 if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
618 return 1;
619 return 0;
620}
621
622/* 893/*
623 * Returns whether one can dispatch a bio or not. Also returns approx number 894 * Returns whether one can dispatch a bio or not. Also returns approx number
624 * of jiffies to wait before this bio is with-in IO rate and can be dispatched 895 * of jiffies to wait before this bio is with-in IO rate and can be dispatched
625 */ 896 */
626static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, 897static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
627 struct bio *bio, unsigned long *wait) 898 unsigned long *wait)
628{ 899{
629 bool rw = bio_data_dir(bio); 900 bool rw = bio_data_dir(bio);
630 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; 901 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
@@ -635,7 +906,8 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
635 * this function with a different bio if there are other bios 906 * this function with a different bio if there are other bios
636 * queued. 907 * queued.
637 */ 908 */
638 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); 909 BUG_ON(tg->service_queue.nr_queued[rw] &&
910 bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
639 911
640 /* If tg->bps = -1, then BW is unlimited */ 912 /* If tg->bps = -1, then BW is unlimited */
641 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { 913 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
@@ -649,15 +921,15 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
649 * existing slice to make sure it is at least throtl_slice interval 921 * existing slice to make sure it is at least throtl_slice interval
650 * long since now. 922 * long since now.
651 */ 923 */
652 if (throtl_slice_used(td, tg, rw)) 924 if (throtl_slice_used(tg, rw))
653 throtl_start_new_slice(td, tg, rw); 925 throtl_start_new_slice(tg, rw);
654 else { 926 else {
655 if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) 927 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
656 throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); 928 throtl_extend_slice(tg, rw, jiffies + throtl_slice);
657 } 929 }
658 930
659 if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) 931 if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
660 && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { 932 tg_with_in_iops_limit(tg, bio, &iops_wait)) {
661 if (wait) 933 if (wait)
662 *wait = 0; 934 *wait = 0;
663 return 1; 935 return 1;
@@ -669,7 +941,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
669 *wait = max_wait; 941 *wait = max_wait;
670 942
671 if (time_before(tg->slice_end[rw], jiffies + max_wait)) 943 if (time_before(tg->slice_end[rw], jiffies + max_wait))
672 throtl_extend_slice(td, tg, rw, jiffies + max_wait); 944 throtl_extend_slice(tg, rw, jiffies + max_wait);
673 945
674 return 0; 946 return 0;
675} 947}
@@ -708,65 +980,136 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
708 tg->bytes_disp[rw] += bio->bi_size; 980 tg->bytes_disp[rw] += bio->bi_size;
709 tg->io_disp[rw]++; 981 tg->io_disp[rw]++;
710 982
711 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); 983 /*
984 * REQ_THROTTLED is used to prevent the same bio to be throttled
985 * more than once as a throttled bio will go through blk-throtl the
986 * second time when it eventually gets issued. Set it when a bio
987 * is being charged to a tg.
988 *
989 * Dispatch stats aren't recursive and each @bio should only be
990 * accounted by the @tg it was originally associated with. Let's
991 * update the stats when setting REQ_THROTTLED for the first time
992 * which is guaranteed to be for the @bio's original tg.
993 */
994 if (!(bio->bi_rw & REQ_THROTTLED)) {
995 bio->bi_rw |= REQ_THROTTLED;
996 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size,
997 bio->bi_rw);
998 }
712} 999}
713 1000
714static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, 1001/**
715 struct bio *bio) 1002 * throtl_add_bio_tg - add a bio to the specified throtl_grp
1003 * @bio: bio to add
1004 * @qn: qnode to use
1005 * @tg: the target throtl_grp
1006 *
1007 * Add @bio to @tg's service_queue using @qn. If @qn is not specified,
1008 * tg->qnode_on_self[] is used.
1009 */
1010static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
1011 struct throtl_grp *tg)
716{ 1012{
1013 struct throtl_service_queue *sq = &tg->service_queue;
717 bool rw = bio_data_dir(bio); 1014 bool rw = bio_data_dir(bio);
718 1015
719 bio_list_add(&tg->bio_lists[rw], bio); 1016 if (!qn)
720 /* Take a bio reference on tg */ 1017 qn = &tg->qnode_on_self[rw];
721 blkg_get(tg_to_blkg(tg)); 1018
722 tg->nr_queued[rw]++; 1019 /*
723 td->nr_queued[rw]++; 1020 * If @tg doesn't currently have any bios queued in the same
724 throtl_enqueue_tg(td, tg); 1021 * direction, queueing @bio can change when @tg should be
1022 * dispatched. Mark that @tg was empty. This is automatically
1023 * cleaered on the next tg_update_disptime().
1024 */
1025 if (!sq->nr_queued[rw])
1026 tg->flags |= THROTL_TG_WAS_EMPTY;
1027
1028 throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
1029
1030 sq->nr_queued[rw]++;
1031 throtl_enqueue_tg(tg);
725} 1032}
726 1033
727static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) 1034static void tg_update_disptime(struct throtl_grp *tg)
728{ 1035{
1036 struct throtl_service_queue *sq = &tg->service_queue;
729 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; 1037 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
730 struct bio *bio; 1038 struct bio *bio;
731 1039
732 if ((bio = bio_list_peek(&tg->bio_lists[READ]))) 1040 if ((bio = throtl_peek_queued(&sq->queued[READ])))
733 tg_may_dispatch(td, tg, bio, &read_wait); 1041 tg_may_dispatch(tg, bio, &read_wait);
734 1042
735 if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) 1043 if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
736 tg_may_dispatch(td, tg, bio, &write_wait); 1044 tg_may_dispatch(tg, bio, &write_wait);
737 1045
738 min_wait = min(read_wait, write_wait); 1046 min_wait = min(read_wait, write_wait);
739 disptime = jiffies + min_wait; 1047 disptime = jiffies + min_wait;
740 1048
741 /* Update dispatch time */ 1049 /* Update dispatch time */
742 throtl_dequeue_tg(td, tg); 1050 throtl_dequeue_tg(tg);
743 tg->disptime = disptime; 1051 tg->disptime = disptime;
744 throtl_enqueue_tg(td, tg); 1052 throtl_enqueue_tg(tg);
1053
1054 /* see throtl_add_bio_tg() */
1055 tg->flags &= ~THROTL_TG_WAS_EMPTY;
745} 1056}
746 1057
747static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, 1058static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
748 bool rw, struct bio_list *bl) 1059 struct throtl_grp *parent_tg, bool rw)
749{ 1060{
750 struct bio *bio; 1061 if (throtl_slice_used(parent_tg, rw)) {
1062 throtl_start_new_slice_with_credit(parent_tg, rw,
1063 child_tg->slice_start[rw]);
1064 }
1065
1066}
751 1067
752 bio = bio_list_pop(&tg->bio_lists[rw]); 1068static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
753 tg->nr_queued[rw]--; 1069{
754 /* Drop bio reference on blkg */ 1070 struct throtl_service_queue *sq = &tg->service_queue;
755 blkg_put(tg_to_blkg(tg)); 1071 struct throtl_service_queue *parent_sq = sq->parent_sq;
1072 struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
1073 struct throtl_grp *tg_to_put = NULL;
1074 struct bio *bio;
756 1075
757 BUG_ON(td->nr_queued[rw] <= 0); 1076 /*
758 td->nr_queued[rw]--; 1077 * @bio is being transferred from @tg to @parent_sq. Popping a bio
1078 * from @tg may put its reference and @parent_sq might end up
1079 * getting released prematurely. Remember the tg to put and put it
1080 * after @bio is transferred to @parent_sq.
1081 */
1082 bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
1083 sq->nr_queued[rw]--;
759 1084
760 throtl_charge_bio(tg, bio); 1085 throtl_charge_bio(tg, bio);
761 bio_list_add(bl, bio);
762 bio->bi_rw |= REQ_THROTTLED;
763 1086
764 throtl_trim_slice(td, tg, rw); 1087 /*
1088 * If our parent is another tg, we just need to transfer @bio to
1089 * the parent using throtl_add_bio_tg(). If our parent is
1090 * @td->service_queue, @bio is ready to be issued. Put it on its
1091 * bio_lists[] and decrease total number queued. The caller is
1092 * responsible for issuing these bios.
1093 */
1094 if (parent_tg) {
1095 throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
1096 start_parent_slice_with_credit(tg, parent_tg, rw);
1097 } else {
1098 throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
1099 &parent_sq->queued[rw]);
1100 BUG_ON(tg->td->nr_queued[rw] <= 0);
1101 tg->td->nr_queued[rw]--;
1102 }
1103
1104 throtl_trim_slice(tg, rw);
1105
1106 if (tg_to_put)
1107 blkg_put(tg_to_blkg(tg_to_put));
765} 1108}
766 1109
767static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, 1110static int throtl_dispatch_tg(struct throtl_grp *tg)
768 struct bio_list *bl)
769{ 1111{
1112 struct throtl_service_queue *sq = &tg->service_queue;
770 unsigned int nr_reads = 0, nr_writes = 0; 1113 unsigned int nr_reads = 0, nr_writes = 0;
771 unsigned int max_nr_reads = throtl_grp_quantum*3/4; 1114 unsigned int max_nr_reads = throtl_grp_quantum*3/4;
772 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; 1115 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
@@ -774,20 +1117,20 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
774 1117
775 /* Try to dispatch 75% READS and 25% WRITES */ 1118 /* Try to dispatch 75% READS and 25% WRITES */
776 1119
777 while ((bio = bio_list_peek(&tg->bio_lists[READ])) 1120 while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
778 && tg_may_dispatch(td, tg, bio, NULL)) { 1121 tg_may_dispatch(tg, bio, NULL)) {
779 1122
780 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); 1123 tg_dispatch_one_bio(tg, bio_data_dir(bio));
781 nr_reads++; 1124 nr_reads++;
782 1125
783 if (nr_reads >= max_nr_reads) 1126 if (nr_reads >= max_nr_reads)
784 break; 1127 break;
785 } 1128 }
786 1129
787 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) 1130 while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
788 && tg_may_dispatch(td, tg, bio, NULL)) { 1131 tg_may_dispatch(tg, bio, NULL)) {
789 1132
790 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); 1133 tg_dispatch_one_bio(tg, bio_data_dir(bio));
791 nr_writes++; 1134 nr_writes++;
792 1135
793 if (nr_writes >= max_nr_writes) 1136 if (nr_writes >= max_nr_writes)
@@ -797,14 +1140,13 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
797 return nr_reads + nr_writes; 1140 return nr_reads + nr_writes;
798} 1141}
799 1142
800static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) 1143static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
801{ 1144{
802 unsigned int nr_disp = 0; 1145 unsigned int nr_disp = 0;
803 struct throtl_grp *tg;
804 struct throtl_rb_root *st = &td->tg_service_tree;
805 1146
806 while (1) { 1147 while (1) {
807 tg = throtl_rb_first(st); 1148 struct throtl_grp *tg = throtl_rb_first(parent_sq);
1149 struct throtl_service_queue *sq = &tg->service_queue;
808 1150
809 if (!tg) 1151 if (!tg)
810 break; 1152 break;
@@ -812,14 +1154,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
812 if (time_before(jiffies, tg->disptime)) 1154 if (time_before(jiffies, tg->disptime))
813 break; 1155 break;
814 1156
815 throtl_dequeue_tg(td, tg); 1157 throtl_dequeue_tg(tg);
816 1158
817 nr_disp += throtl_dispatch_tg(td, tg, bl); 1159 nr_disp += throtl_dispatch_tg(tg);
818 1160
819 if (tg->nr_queued[0] || tg->nr_queued[1]) { 1161 if (sq->nr_queued[0] || sq->nr_queued[1])
820 tg_update_disptime(td, tg); 1162 tg_update_disptime(tg);
821 throtl_enqueue_tg(td, tg);
822 }
823 1163
824 if (nr_disp >= throtl_quantum) 1164 if (nr_disp >= throtl_quantum)
825 break; 1165 break;
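throtl_select_dispatch() now works against a parent service queue: it repeatedly takes the group with the earliest dispatch time, dispatches a bounded batch from it, updates its dispatch time if it still has queued bios so it will be serviced again, and stops once the overall quantum is spent. A simplified user-space model of that loop (an array ordered by deadline stands in for the rbtree; names and quanta are illustrative):

#include <stdio.h>

struct group { unsigned long disptime; int queued; };

static int dispatch_group(struct group *g, int budget)
{
    int n = g->queued < budget ? g->queued : budget;

    g->queued -= n;
    return n;
}

int main(void)
{
    /* stand-in for the service tree: groups ordered by dispatch deadline */
    struct group groups[] = { { 10, 5 }, { 20, 40 }, { 30, 3 } };
    unsigned long now = 25;
    int quantum = 32, per_group = 8, nr_disp = 0;  /* assumed quanta */

    for (int i = 0; i < 3 && nr_disp < quantum; i++) {
        if (groups[i].disptime > now)
            break;  /* earliest remaining group is not due yet */
        nr_disp += dispatch_group(&groups[i], per_group);
    }
    printf("dispatched %d bios this round\n", nr_disp);  /* 13 */
    return 0;
}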
@@ -828,111 +1168,111 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
828 return nr_disp; 1168 return nr_disp;
829} 1169}
830 1170
831static void throtl_process_limit_change(struct throtl_data *td) 1171/**
1172 * throtl_pending_timer_fn - timer function for service_queue->pending_timer
1173 * @arg: the throtl_service_queue being serviced
1174 *
 1175 * This timer is armed when a child throtl_grp with active bio's becomes
1176 * pending and queued on the service_queue's pending_tree and expires when
1177 * the first child throtl_grp should be dispatched. This function
1178 * dispatches bio's from the children throtl_grps to the parent
1179 * service_queue.
1180 *
1181 * If the parent's parent is another throtl_grp, dispatching is propagated
1182 * by either arming its pending_timer or repeating dispatch directly. If
1183 * the top-level service_tree is reached, throtl_data->dispatch_work is
1184 * kicked so that the ready bio's are issued.
1185 */
1186static void throtl_pending_timer_fn(unsigned long arg)
832{ 1187{
1188 struct throtl_service_queue *sq = (void *)arg;
1189 struct throtl_grp *tg = sq_to_tg(sq);
1190 struct throtl_data *td = sq_to_td(sq);
833 struct request_queue *q = td->queue; 1191 struct request_queue *q = td->queue;
834 struct blkcg_gq *blkg, *n; 1192 struct throtl_service_queue *parent_sq;
835 1193 bool dispatched;
836 if (!td->limits_changed) 1194 int ret;
837 return;
838
839 xchg(&td->limits_changed, false);
840
841 throtl_log(td, "limits changed");
842
843 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
844 struct throtl_grp *tg = blkg_to_tg(blkg);
845 1195
846 if (!tg->limits_changed) 1196 spin_lock_irq(q->queue_lock);
847 continue; 1197again:
1198 parent_sq = sq->parent_sq;
1199 dispatched = false;
1200
1201 while (true) {
1202 throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
1203 sq->nr_queued[READ] + sq->nr_queued[WRITE],
1204 sq->nr_queued[READ], sq->nr_queued[WRITE]);
1205
1206 ret = throtl_select_dispatch(sq);
1207 if (ret) {
1208 throtl_log(sq, "bios disp=%u", ret);
1209 dispatched = true;
1210 }
848 1211
849 if (!xchg(&tg->limits_changed, false)) 1212 if (throtl_schedule_next_dispatch(sq, false))
850 continue; 1213 break;
851 1214
852 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" 1215 /* this dispatch window is still open, relax and repeat */
853 " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE], 1216 spin_unlock_irq(q->queue_lock);
854 tg->iops[READ], tg->iops[WRITE]); 1217 cpu_relax();
1218 spin_lock_irq(q->queue_lock);
1219 }
855 1220
856 /* 1221 if (!dispatched)
857 * Restart the slices for both READ and WRITES. It 1222 goto out_unlock;
858 * might happen that a group's limit are dropped
859 * suddenly and we don't want to account recently
860 * dispatched IO with new low rate
861 */
862 throtl_start_new_slice(td, tg, 0);
863 throtl_start_new_slice(td, tg, 1);
864 1223
865 if (throtl_tg_on_rr(tg)) 1224 if (parent_sq) {
866 tg_update_disptime(td, tg); 1225 /* @parent_sq is another throtl_grp, propagate dispatch */
1226 if (tg->flags & THROTL_TG_WAS_EMPTY) {
1227 tg_update_disptime(tg);
1228 if (!throtl_schedule_next_dispatch(parent_sq, false)) {
1229 /* window is already open, repeat dispatching */
1230 sq = parent_sq;
1231 tg = sq_to_tg(sq);
1232 goto again;
1233 }
1234 }
1235 } else {
 1236 /* reached the top-level, queue issuing */
1237 queue_work(kthrotld_workqueue, &td->dispatch_work);
867 } 1238 }
1239out_unlock:
1240 spin_unlock_irq(q->queue_lock);
868} 1241}
869 1242
870/* Dispatch throttled bios. Should be called without queue lock held. */ 1243/**
871static int throtl_dispatch(struct request_queue *q) 1244 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
1245 * @work: work item being executed
1246 *
1247 * This function is queued for execution when bio's reach the bio_lists[]
1248 * of throtl_data->service_queue. Those bio's are ready and issued by this
1249 * function.
1250 */
1251void blk_throtl_dispatch_work_fn(struct work_struct *work)
872{ 1252{
873 struct throtl_data *td = q->td; 1253 struct throtl_data *td = container_of(work, struct throtl_data,
874 unsigned int nr_disp = 0; 1254 dispatch_work);
1255 struct throtl_service_queue *td_sq = &td->service_queue;
1256 struct request_queue *q = td->queue;
875 struct bio_list bio_list_on_stack; 1257 struct bio_list bio_list_on_stack;
876 struct bio *bio; 1258 struct bio *bio;
877 struct blk_plug plug; 1259 struct blk_plug plug;
878 1260 int rw;
879 spin_lock_irq(q->queue_lock);
880
881 throtl_process_limit_change(td);
882
883 if (!total_nr_queued(td))
884 goto out;
885 1261
886 bio_list_init(&bio_list_on_stack); 1262 bio_list_init(&bio_list_on_stack);
887 1263
888 throtl_log(td, "dispatch nr_queued=%u read=%u write=%u", 1264 spin_lock_irq(q->queue_lock);
889 total_nr_queued(td), td->nr_queued[READ], 1265 for (rw = READ; rw <= WRITE; rw++)
890 td->nr_queued[WRITE]); 1266 while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
891 1267 bio_list_add(&bio_list_on_stack, bio);
892 nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
893
894 if (nr_disp)
895 throtl_log(td, "bios disp=%u", nr_disp);
896
897 throtl_schedule_next_dispatch(td);
898out:
899 spin_unlock_irq(q->queue_lock); 1268 spin_unlock_irq(q->queue_lock);
900 1269
901 /* 1270 if (!bio_list_empty(&bio_list_on_stack)) {
902 * If we dispatched some requests, unplug the queue to make sure
903 * immediate dispatch
904 */
905 if (nr_disp) {
906 blk_start_plug(&plug); 1271 blk_start_plug(&plug);
907 while((bio = bio_list_pop(&bio_list_on_stack))) 1272 while((bio = bio_list_pop(&bio_list_on_stack)))
908 generic_make_request(bio); 1273 generic_make_request(bio);
909 blk_finish_plug(&plug); 1274 blk_finish_plug(&plug);
910 } 1275 }
911 return nr_disp;
912}
913
914void blk_throtl_work(struct work_struct *work)
915{
916 struct throtl_data *td = container_of(work, struct throtl_data,
917 throtl_work.work);
918 struct request_queue *q = td->queue;
919
920 throtl_dispatch(q);
921}
922
923/* Call with queue lock held */
924static void
925throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
926{
927
928 struct delayed_work *dwork = &td->throtl_work;
929
930 /* schedule work if limits changed even if no bio is queued */
931 if (total_nr_queued(td) || td->limits_changed) {
932 mod_delayed_work(kthrotld_workqueue, dwork, delay);
933 throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
934 delay, jiffies);
935 }
936} 1276}
937 1277
938static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, 1278static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
@@ -1007,7 +1347,9 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
1007 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1347 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1008 struct blkg_conf_ctx ctx; 1348 struct blkg_conf_ctx ctx;
1009 struct throtl_grp *tg; 1349 struct throtl_grp *tg;
1010 struct throtl_data *td; 1350 struct throtl_service_queue *sq;
1351 struct blkcg_gq *blkg;
1352 struct cgroup *pos_cgrp;
1011 int ret; 1353 int ret;
1012 1354
1013 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); 1355 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
@@ -1015,7 +1357,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
1015 return ret; 1357 return ret;
1016 1358
1017 tg = blkg_to_tg(ctx.blkg); 1359 tg = blkg_to_tg(ctx.blkg);
1018 td = ctx.blkg->q->td; 1360 sq = &tg->service_queue;
1019 1361
1020 if (!ctx.v) 1362 if (!ctx.v)
1021 ctx.v = -1; 1363 ctx.v = -1;
@@ -1025,10 +1367,37 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
1025 else 1367 else
1026 *(unsigned int *)((void *)tg + cft->private) = ctx.v; 1368 *(unsigned int *)((void *)tg + cft->private) = ctx.v;
1027 1369
1028 /* XXX: we don't need the following deferred processing */ 1370 throtl_log(&tg->service_queue,
1029 xchg(&tg->limits_changed, true); 1371 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
1030 xchg(&td->limits_changed, true); 1372 tg->bps[READ], tg->bps[WRITE],
1031 throtl_schedule_delayed_work(td, 0); 1373 tg->iops[READ], tg->iops[WRITE]);
1374
1375 /*
1376 * Update has_rules[] flags for the updated tg's subtree. A tg is
1377 * considered to have rules if either the tg itself or any of its
1378 * ancestors has rules. This identifies groups without any
1379 * restrictions in the whole hierarchy and allows them to bypass
1380 * blk-throttle.
1381 */
1382 tg_update_has_rules(tg);
1383 blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg)
1384 tg_update_has_rules(blkg_to_tg(blkg));
1385
1386 /*
1387 * We're already holding queue_lock and know @tg is valid. Let's
1388 * apply the new config directly.
1389 *
1390 * Restart the slices for both READ and WRITES. It might happen
 1391 * that a group's limits are dropped suddenly and we don't want to
1392 * account recently dispatched IO with new low rate.
1393 */
1394 throtl_start_new_slice(tg, 0);
1395 throtl_start_new_slice(tg, 1);
1396
1397 if (tg->flags & THROTL_TG_PENDING) {
1398 tg_update_disptime(tg);
1399 throtl_schedule_next_dispatch(sq->parent_sq, true);
1400 }
1032 1401
1033 blkg_conf_finish(&ctx); 1402 blkg_conf_finish(&ctx);
1034 return 0; 1403 return 0;
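The has_rules[] update above walks the configured group and its descendants, so a group counts as having rules whenever it or any ancestor sets a limit; blk_throtl_bio() can then let whole unlimited subtrees bypass blk-throttle with a single flag test. A small user-space sketch of that rule propagation (field names are illustrative):

#include <stdbool.h>
#include <stdio.h>

struct tg { struct tg *parent; bool own_limit; bool has_rules; };

static void update_has_rules(struct tg *tg)
{
    tg->has_rules = tg->own_limit ||
                    (tg->parent && tg->parent->has_rules);
}

int main(void)
{
    struct tg root = { NULL, true, false };
    struct tg child = { &root, false, false };

    /* pre-order: parents before children, as in the descendant walk */
    update_has_rules(&root);
    update_has_rules(&child);
    printf("child has_rules=%d\n", child.has_rules);  /* 1 */
    return 0;
}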
@@ -1092,7 +1461,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
1092{ 1461{
1093 struct throtl_data *td = q->td; 1462 struct throtl_data *td = q->td;
1094 1463
1095 cancel_delayed_work_sync(&td->throtl_work); 1464 cancel_work_sync(&td->dispatch_work);
1096} 1465}
1097 1466
1098static struct blkcg_policy blkcg_policy_throtl = { 1467static struct blkcg_policy blkcg_policy_throtl = {
@@ -1100,6 +1469,7 @@ static struct blkcg_policy blkcg_policy_throtl = {
1100 .cftypes = throtl_files, 1469 .cftypes = throtl_files,
1101 1470
1102 .pd_init_fn = throtl_pd_init, 1471 .pd_init_fn = throtl_pd_init,
1472 .pd_online_fn = throtl_pd_online,
1103 .pd_exit_fn = throtl_pd_exit, 1473 .pd_exit_fn = throtl_pd_exit,
1104 .pd_reset_stats_fn = throtl_pd_reset_stats, 1474 .pd_reset_stats_fn = throtl_pd_reset_stats,
1105}; 1475};
@@ -1107,15 +1477,16 @@ static struct blkcg_policy blkcg_policy_throtl = {
1107bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 1477bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1108{ 1478{
1109 struct throtl_data *td = q->td; 1479 struct throtl_data *td = q->td;
1480 struct throtl_qnode *qn = NULL;
1110 struct throtl_grp *tg; 1481 struct throtl_grp *tg;
1111 bool rw = bio_data_dir(bio), update_disptime = true; 1482 struct throtl_service_queue *sq;
1483 bool rw = bio_data_dir(bio);
1112 struct blkcg *blkcg; 1484 struct blkcg *blkcg;
1113 bool throttled = false; 1485 bool throttled = false;
1114 1486
1115 if (bio->bi_rw & REQ_THROTTLED) { 1487 /* see throtl_charge_bio() */
1116 bio->bi_rw &= ~REQ_THROTTLED; 1488 if (bio->bi_rw & REQ_THROTTLED)
1117 goto out; 1489 goto out;
1118 }
1119 1490
1120 /* 1491 /*
1121 * A throtl_grp pointer retrieved under rcu can be used to access 1492 * A throtl_grp pointer retrieved under rcu can be used to access
@@ -1126,7 +1497,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1126 blkcg = bio_blkcg(bio); 1497 blkcg = bio_blkcg(bio);
1127 tg = throtl_lookup_tg(td, blkcg); 1498 tg = throtl_lookup_tg(td, blkcg);
1128 if (tg) { 1499 if (tg) {
1129 if (tg_no_rule_group(tg, rw)) { 1500 if (!tg->has_rules[rw]) {
1130 throtl_update_dispatch_stats(tg_to_blkg(tg), 1501 throtl_update_dispatch_stats(tg_to_blkg(tg),
1131 bio->bi_size, bio->bi_rw); 1502 bio->bi_size, bio->bi_rw);
1132 goto out_unlock_rcu; 1503 goto out_unlock_rcu;
@@ -1142,18 +1513,18 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1142 if (unlikely(!tg)) 1513 if (unlikely(!tg))
1143 goto out_unlock; 1514 goto out_unlock;
1144 1515
1145 if (tg->nr_queued[rw]) { 1516 sq = &tg->service_queue;
1146 /*
1147 * There is already another bio queued in same dir. No
1148 * need to update dispatch time.
1149 */
1150 update_disptime = false;
1151 goto queue_bio;
1152 1517
1153 } 1518 while (true) {
1519 /* throtl is FIFO - if bios are already queued, should queue */
1520 if (sq->nr_queued[rw])
1521 break;
1522
1523 /* if above limits, break to queue */
1524 if (!tg_may_dispatch(tg, bio, NULL))
1525 break;
1154 1526
1155 /* Bio is with-in rate limit of group */ 1527 /* within limits, let's charge and dispatch directly */
1156 if (tg_may_dispatch(td, tg, bio, NULL)) {
1157 throtl_charge_bio(tg, bio); 1528 throtl_charge_bio(tg, bio);
1158 1529
1159 /* 1530 /*
@@ -1167,25 +1538,41 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1167 * 1538 *
1168 * So keep on trimming slice even if bio is not queued. 1539 * So keep on trimming slice even if bio is not queued.
1169 */ 1540 */
1170 throtl_trim_slice(td, tg, rw); 1541 throtl_trim_slice(tg, rw);
1171 goto out_unlock; 1542
1543 /*
1544 * @bio passed through this layer without being throttled.
1545 * Climb up the ladder. If we''re already at the top, it
1546 * can be executed directly.
1547 */
1548 qn = &tg->qnode_on_parent[rw];
1549 sq = sq->parent_sq;
1550 tg = sq_to_tg(sq);
1551 if (!tg)
1552 goto out_unlock;
1172 } 1553 }
1173 1554
1174queue_bio: 1555 /* out-of-limit, queue to @tg */
1175 throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu" 1556 throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
1176 " iodisp=%u iops=%u queued=%d/%d", 1557 rw == READ ? 'R' : 'W',
1177 rw == READ ? 'R' : 'W', 1558 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
1178 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], 1559 tg->io_disp[rw], tg->iops[rw],
1179 tg->io_disp[rw], tg->iops[rw], 1560 sq->nr_queued[READ], sq->nr_queued[WRITE]);
1180 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1181 1561
1182 bio_associate_current(bio); 1562 bio_associate_current(bio);
1183 throtl_add_bio_tg(q->td, tg, bio); 1563 tg->td->nr_queued[rw]++;
1564 throtl_add_bio_tg(bio, qn, tg);
1184 throttled = true; 1565 throttled = true;
1185 1566
1186 if (update_disptime) { 1567 /*
1187 tg_update_disptime(td, tg); 1568 * Update @tg's dispatch time and force schedule dispatch if @tg
1188 throtl_schedule_next_dispatch(td); 1569 * was empty before @bio. The forced scheduling isn't likely to
1570 * cause undue delay as @bio is likely to be dispatched directly if
1571 * its @tg's disptime is not in the future.
1572 */
1573 if (tg->flags & THROTL_TG_WAS_EMPTY) {
1574 tg_update_disptime(tg);
1575 throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
1189 } 1576 }
1190 1577
1191out_unlock: 1578out_unlock:
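The new submission loop in blk_throtl_bio() is the heart of the hierarchy support: at each level the bio is queued if the group already has bios waiting (to keep FIFO order) or is over its limit; otherwise it is charged against that level and the walk climbs to the parent, and a bio that clears every level is issued directly. A condensed user-space model of that walk (illustrative names, a simple byte budget instead of real rate limits):

#include <stdio.h>

struct tg { struct tg *parent; int nr_queued; long budget; };

/* returns the level that throttled the bio, or NULL if it passed through */
static struct tg *submit(struct tg *tg, long size)
{
    while (tg) {
        if (tg->nr_queued)        /* FIFO: must queue behind others */
            break;
        if (size > tg->budget)    /* over limit: queue at this level */
            break;
        tg->budget -= size;       /* charge and climb to the parent */
        tg = tg->parent;
    }
    if (tg)
        tg->nr_queued++;
    return tg;
}

int main(void)
{
    struct tg parent = { NULL, 0, 100 }, child = { &parent, 0, 10 };

    printf("first  bio: %s\n",
           submit(&child, 8) ? "throttled" : "issued directly");
    printf("second bio: %s\n",
           submit(&child, 8) ? "throttled" : "issued directly");
    return 0;
}

The first bio fits within both budgets and is issued; the second exhausts the child's budget and is queued there, which is also why a group that was empty before the bio gets its dispatch time updated and a dispatch scheduled.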
@@ -1193,9 +1580,38 @@ out_unlock:
1193out_unlock_rcu: 1580out_unlock_rcu:
1194 rcu_read_unlock(); 1581 rcu_read_unlock();
1195out: 1582out:
1583 /*
1584 * As multiple blk-throtls may stack in the same issue path, we
1585 * don't want bios to leave with the flag set. Clear the flag if
1586 * being issued.
1587 */
1588 if (!throttled)
1589 bio->bi_rw &= ~REQ_THROTTLED;
1196 return throttled; 1590 return throttled;
1197} 1591}
1198 1592
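blk_throtl_bio() above also relies on the new REQ_THROTTLED life cycle: throtl_charge_bio() sets the flag so a bio re-entering blk-throttle on a stacked device is not charged twice, and the exit path clears it again whenever the bio leaves this layer unthrottled. A tiny plain-C model of that flag handling (the bit value is illustrative, not the real REQ_THROTTLED):

#include <stdio.h>

#define REQ_THROTTLED   (1u << 0)   /* illustrative bit, not the real value */

static void submit_through_throttle(unsigned int *bi_rw)
{
    if (*bi_rw & REQ_THROTTLED)     /* already charged at a lower layer */
        return;
    *bi_rw |= REQ_THROTTLED;        /* throtl_charge_bio() marks the bio */
    /* assume the bio stays within every limit and is not queued ... */
    *bi_rw &= ~REQ_THROTTLED;       /* cleared when it leaves unthrottled */
}

int main(void)
{
    unsigned int bi_rw = 0;

    submit_through_throttle(&bi_rw);
    printf("flags after issue: %#x\n", bi_rw);  /* 0 */
    return 0;
}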
1593/*
1594 * Dispatch all bios from all children tg's queued on @parent_sq. On
1595 * return, @parent_sq is guaranteed to not have any active children tg's
1596 * and all bios from previously active tg's are on @parent_sq->bio_lists[].
1597 */
1598static void tg_drain_bios(struct throtl_service_queue *parent_sq)
1599{
1600 struct throtl_grp *tg;
1601
1602 while ((tg = throtl_rb_first(parent_sq))) {
1603 struct throtl_service_queue *sq = &tg->service_queue;
1604 struct bio *bio;
1605
1606 throtl_dequeue_tg(tg);
1607
1608 while ((bio = throtl_peek_queued(&sq->queued[READ])))
1609 tg_dispatch_one_bio(tg, bio_data_dir(bio));
1610 while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
1611 tg_dispatch_one_bio(tg, bio_data_dir(bio));
1612 }
1613}
1614
1199/** 1615/**
1200 * blk_throtl_drain - drain throttled bios 1616 * blk_throtl_drain - drain throttled bios
1201 * @q: request_queue to drain throttled bios for 1617 * @q: request_queue to drain throttled bios for
@@ -1206,27 +1622,36 @@ void blk_throtl_drain(struct request_queue *q)
1206 __releases(q->queue_lock) __acquires(q->queue_lock) 1622 __releases(q->queue_lock) __acquires(q->queue_lock)
1207{ 1623{
1208 struct throtl_data *td = q->td; 1624 struct throtl_data *td = q->td;
1209 struct throtl_rb_root *st = &td->tg_service_tree; 1625 struct blkcg_gq *blkg;
1210 struct throtl_grp *tg; 1626 struct cgroup *pos_cgrp;
1211 struct bio_list bl;
1212 struct bio *bio; 1627 struct bio *bio;
1628 int rw;
1213 1629
1214 queue_lockdep_assert_held(q); 1630 queue_lockdep_assert_held(q);
1631 rcu_read_lock();
1632
1633 /*
1634 * Drain each tg while doing post-order walk on the blkg tree, so
1635 * that all bios are propagated to td->service_queue. It'd be
1636 * better to walk service_queue tree directly but blkg walk is
1637 * easier.
1638 */
1639 blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg)
1640 tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
1215 1641
1216 bio_list_init(&bl); 1642 tg_drain_bios(&td_root_tg(td)->service_queue);
1217 1643
1218 while ((tg = throtl_rb_first(st))) { 1644 /* finally, transfer bios from top-level tg's into the td */
1219 throtl_dequeue_tg(td, tg); 1645 tg_drain_bios(&td->service_queue);
1220 1646
1221 while ((bio = bio_list_peek(&tg->bio_lists[READ]))) 1647 rcu_read_unlock();
1222 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1223 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
1224 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1225 }
1226 spin_unlock_irq(q->queue_lock); 1648 spin_unlock_irq(q->queue_lock);
1227 1649
1228 while ((bio = bio_list_pop(&bl))) 1650 /* all bios now should be in td->service_queue, issue them */
1229 generic_make_request(bio); 1651 for (rw = READ; rw <= WRITE; rw++)
1652 while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
1653 NULL)))
1654 generic_make_request(bio);
1230 1655
1231 spin_lock_irq(q->queue_lock); 1656 spin_lock_irq(q->queue_lock);
1232} 1657}
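blk_throtl_drain() now drains the whole hierarchy: tg_drain_bios() empties one group into its parent, the post-order walk over the blkg tree guarantees children are emptied before their parents, and everything therefore accumulates in td->service_queue, from where it is issued with the queue lock dropped. A small user-space model of the post-order drain (illustrative names):

#include <stdio.h>

struct node { struct node *parent; int queued; };

static void drain(struct node *n)
{
    if (n->parent) {
        n->parent->queued += n->queued;  /* push everything one level up */
        n->queued = 0;
    }
}

int main(void)
{
    struct node top = { NULL, 0 };
    struct node a = { &top, 2 }, b = { &top, 3 };
    struct node a1 = { &a, 1 };

    /* post-order: leaves before their parents */
    drain(&a1);
    drain(&a);
    drain(&b);
    printf("issuing %d bios from the top level\n", top.queued);  /* 6 */
    return 0;
}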
@@ -1240,9 +1665,8 @@ int blk_throtl_init(struct request_queue *q)
1240 if (!td) 1665 if (!td)
1241 return -ENOMEM; 1666 return -ENOMEM;
1242 1667
1243 td->tg_service_tree = THROTL_RB_ROOT; 1668 INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
1244 td->limits_changed = false; 1669 throtl_service_queue_init(&td->service_queue, NULL);
1245 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1246 1670
1247 q->td = td; 1671 q->td = td;
1248 td->queue = q; 1672 td->queue = q;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d5cd3131c57a..d5bbdcfd0dab 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4347,18 +4347,28 @@ static void cfq_exit_queue(struct elevator_queue *e)
4347 kfree(cfqd); 4347 kfree(cfqd);
4348} 4348}
4349 4349
4350static int cfq_init_queue(struct request_queue *q) 4350static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
4351{ 4351{
4352 struct cfq_data *cfqd; 4352 struct cfq_data *cfqd;
4353 struct blkcg_gq *blkg __maybe_unused; 4353 struct blkcg_gq *blkg __maybe_unused;
4354 int i, ret; 4354 int i, ret;
4355 struct elevator_queue *eq;
4356
4357 eq = elevator_alloc(q, e);
4358 if (!eq)
4359 return -ENOMEM;
4355 4360
4356 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 4361 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
4357 if (!cfqd) 4362 if (!cfqd) {
4363 kobject_put(&eq->kobj);
4358 return -ENOMEM; 4364 return -ENOMEM;
4365 }
4366 eq->elevator_data = cfqd;
4359 4367
4360 cfqd->queue = q; 4368 cfqd->queue = q;
4361 q->elevator->elevator_data = cfqd; 4369 spin_lock_irq(q->queue_lock);
4370 q->elevator = eq;
4371 spin_unlock_irq(q->queue_lock);
4362 4372
4363 /* Init root service tree */ 4373 /* Init root service tree */
4364 cfqd->grp_service_tree = CFQ_RB_ROOT; 4374 cfqd->grp_service_tree = CFQ_RB_ROOT;
@@ -4433,6 +4443,7 @@ static int cfq_init_queue(struct request_queue *q)
4433 4443
4434out_free: 4444out_free:
4435 kfree(cfqd); 4445 kfree(cfqd);
4446 kobject_put(&eq->kobj);
4436 return ret; 4447 return ret;
4437} 4448}
4438 4449
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index ba19a3afab79..20614a332362 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -337,13 +337,21 @@ static void deadline_exit_queue(struct elevator_queue *e)
337/* 337/*
338 * initialize elevator private data (deadline_data). 338 * initialize elevator private data (deadline_data).
339 */ 339 */
340static int deadline_init_queue(struct request_queue *q) 340static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
341{ 341{
342 struct deadline_data *dd; 342 struct deadline_data *dd;
343 struct elevator_queue *eq;
344
345 eq = elevator_alloc(q, e);
346 if (!eq)
347 return -ENOMEM;
343 348
344 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); 349 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
345 if (!dd) 350 if (!dd) {
351 kobject_put(&eq->kobj);
346 return -ENOMEM; 352 return -ENOMEM;
353 }
354 eq->elevator_data = dd;
347 355
348 INIT_LIST_HEAD(&dd->fifo_list[READ]); 356 INIT_LIST_HEAD(&dd->fifo_list[READ]);
349 INIT_LIST_HEAD(&dd->fifo_list[WRITE]); 357 INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
@@ -355,7 +363,9 @@ static int deadline_init_queue(struct request_queue *q)
355 dd->front_merges = 1; 363 dd->front_merges = 1;
356 dd->fifo_batch = fifo_batch; 364 dd->fifo_batch = fifo_batch;
357 365
358 q->elevator->elevator_data = dd; 366 spin_lock_irq(q->queue_lock);
367 q->elevator = eq;
368 spin_unlock_irq(q->queue_lock);
359 return 0; 369 return 0;
360} 370}
361 371
diff --git a/block/elevator.c b/block/elevator.c
index eba5b04c29b1..668394d18588 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -150,7 +150,7 @@ void __init load_default_elevator_module(void)
150 150
151static struct kobj_type elv_ktype; 151static struct kobj_type elv_ktype;
152 152
153static struct elevator_queue *elevator_alloc(struct request_queue *q, 153struct elevator_queue *elevator_alloc(struct request_queue *q,
154 struct elevator_type *e) 154 struct elevator_type *e)
155{ 155{
156 struct elevator_queue *eq; 156 struct elevator_queue *eq;
@@ -170,6 +170,7 @@ err:
170 elevator_put(e); 170 elevator_put(e);
171 return NULL; 171 return NULL;
172} 172}
173EXPORT_SYMBOL(elevator_alloc);
173 174
174static void elevator_release(struct kobject *kobj) 175static void elevator_release(struct kobject *kobj)
175{ 176{
@@ -221,16 +222,7 @@ int elevator_init(struct request_queue *q, char *name)
221 } 222 }
222 } 223 }
223 224
224 q->elevator = elevator_alloc(q, e); 225 err = e->ops.elevator_init_fn(q, e);
225 if (!q->elevator)
226 return -ENOMEM;
227
228 err = e->ops.elevator_init_fn(q);
229 if (err) {
230 kobject_put(&q->elevator->kobj);
231 return err;
232 }
233
234 return 0; 226 return 0;
235} 227}
236EXPORT_SYMBOL(elevator_init); 228EXPORT_SYMBOL(elevator_init);
@@ -935,16 +927,9 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
935 spin_unlock_irq(q->queue_lock); 927 spin_unlock_irq(q->queue_lock);
936 928
937 /* allocate, init and register new elevator */ 929 /* allocate, init and register new elevator */
938 err = -ENOMEM; 930 err = new_e->ops.elevator_init_fn(q, new_e);
939 q->elevator = elevator_alloc(q, new_e); 931 if (err)
940 if (!q->elevator)
941 goto fail_init;
942
943 err = new_e->ops.elevator_init_fn(q);
944 if (err) {
945 kobject_put(&q->elevator->kobj);
946 goto fail_init; 932 goto fail_init;
947 }
948 933
949 if (registered) { 934 if (registered) {
950 err = elv_register_queue(q); 935 err = elv_register_queue(q);
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 5d1bf70e33d5..3de89d4690f3 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -59,16 +59,27 @@ noop_latter_request(struct request_queue *q, struct request *rq)
59 return list_entry(rq->queuelist.next, struct request, queuelist); 59 return list_entry(rq->queuelist.next, struct request, queuelist);
60} 60}
61 61
62static int noop_init_queue(struct request_queue *q) 62static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
63{ 63{
64 struct noop_data *nd; 64 struct noop_data *nd;
65 struct elevator_queue *eq;
66
67 eq = elevator_alloc(q, e);
68 if (!eq)
69 return -ENOMEM;
65 70
66 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); 71 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
67 if (!nd) 72 if (!nd) {
73 kobject_put(&eq->kobj);
68 return -ENOMEM; 74 return -ENOMEM;
75 }
76 eq->elevator_data = nd;
69 77
70 INIT_LIST_HEAD(&nd->queue); 78 INIT_LIST_HEAD(&nd->queue);
71 q->elevator->elevator_data = nd; 79
80 spin_lock_irq(q->queue_lock);
81 q->elevator = eq;
82 spin_unlock_irq(q->queue_lock);
72 return 0; 83 return 0;
73} 84}
74 85
diff --git a/fs/block_dev.c b/fs/block_dev.c
index bb43ce081d6e..c7bda5cd3da7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -58,17 +58,24 @@ static void bdev_inode_switch_bdi(struct inode *inode,
58 struct backing_dev_info *dst) 58 struct backing_dev_info *dst)
59{ 59{
60 struct backing_dev_info *old = inode->i_data.backing_dev_info; 60 struct backing_dev_info *old = inode->i_data.backing_dev_info;
61 bool wakeup_bdi = false;
61 62
62 if (unlikely(dst == old)) /* deadlock avoidance */ 63 if (unlikely(dst == old)) /* deadlock avoidance */
63 return; 64 return;
64 bdi_lock_two(&old->wb, &dst->wb); 65 bdi_lock_two(&old->wb, &dst->wb);
65 spin_lock(&inode->i_lock); 66 spin_lock(&inode->i_lock);
66 inode->i_data.backing_dev_info = dst; 67 inode->i_data.backing_dev_info = dst;
67 if (inode->i_state & I_DIRTY) 68 if (inode->i_state & I_DIRTY) {
69 if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
70 wakeup_bdi = true;
68 list_move(&inode->i_wb_list, &dst->wb.b_dirty); 71 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
72 }
69 spin_unlock(&inode->i_lock); 73 spin_unlock(&inode->i_lock);
70 spin_unlock(&old->wb.list_lock); 74 spin_unlock(&old->wb.list_lock);
71 spin_unlock(&dst->wb.list_lock); 75 spin_unlock(&dst->wb.list_lock);
76
77 if (wakeup_bdi)
78 bdi_wakeup_thread_delayed(dst);
72} 79}
73 80
74/* Kill _all_ buffers and pagecache , dirty or not.. */ 81/* Kill _all_ buffers and pagecache , dirty or not.. */
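The fs/block_dev.c change above wakes the destination bdi's flusher when a dirty inode is moved onto it, but only if the destination is writeback-capable and has no dirty IO already queued; otherwise the inode could sit on the new bdi's dirty list with nothing scheduled to write it back. The decision reduces to the following check (a plain-C restatement, not kernel code):

#include <stdbool.h>
#include <stdio.h>

static bool should_wake(bool inode_dirty, bool dst_can_writeback,
                        bool dst_has_dirty_io)
{
    return inode_dirty && dst_can_writeback && !dst_has_dirty_io;
}

int main(void)
{
    /* dirty inode moved to an idle, writeback-capable bdi: wake it */
    printf("%d\n", should_wake(true, true, false));  /* 1 */
    /* destination already has dirty IO queued: no extra wakeup needed */
    printf("%d\n", should_wake(true, true, true));   /* 0 */
    return 0;
}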
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index fd097ecfcd97..297462b9f41a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -278,6 +278,8 @@ enum {
278 * 278 *
279 * - memcg: use_hierarchy is on by default and the cgroup file for 279 * - memcg: use_hierarchy is on by default and the cgroup file for
280 * the flag is not created. 280 * the flag is not created.
281 *
282 * - blkcg: blk-throttle becomes properly hierarchical.
281 */ 283 */
282 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), 284 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0),
283 285
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index acd0312d46fb..306dd8cd0b6f 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -7,6 +7,7 @@
7#ifdef CONFIG_BLOCK 7#ifdef CONFIG_BLOCK
8 8
9struct io_cq; 9struct io_cq;
10struct elevator_type;
10 11
11typedef int (elevator_merge_fn) (struct request_queue *, struct request **, 12typedef int (elevator_merge_fn) (struct request_queue *, struct request **,
12 struct bio *); 13 struct bio *);
@@ -35,7 +36,8 @@ typedef void (elevator_put_req_fn) (struct request *);
35typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *); 36typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
36typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *); 37typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
37 38
38typedef int (elevator_init_fn) (struct request_queue *); 39typedef int (elevator_init_fn) (struct request_queue *,
40 struct elevator_type *e);
39typedef void (elevator_exit_fn) (struct elevator_queue *); 41typedef void (elevator_exit_fn) (struct elevator_queue *);
40 42
41struct elevator_ops 43struct elevator_ops
@@ -155,6 +157,8 @@ extern int elevator_init(struct request_queue *, char *);
155extern void elevator_exit(struct elevator_queue *); 157extern void elevator_exit(struct elevator_queue *);
156extern int elevator_change(struct request_queue *, const char *); 158extern int elevator_change(struct request_queue *, const char *);
157extern bool elv_rq_merge_ok(struct request *, struct bio *); 159extern bool elv_rq_merge_ok(struct request *, struct bio *);
160extern struct elevator_queue *elevator_alloc(struct request_queue *,
161 struct elevator_type *);
158 162
159/* 163/*
160 * Helper functions. 164 * Helper functions.