path: root/block
author	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-11 16:03:24 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-11 16:03:24 -0400
commit	36805aaea5ae3cf1bb32f1643e0a800bb69f0d5b (patch)
tree	5565132549a0733772b3a2ac6b5cda516ea8cdce /block
parent	6d2fa9e141ea56a571ec842fd4f3a86bea44a203 (diff)
parent	d50235b7bc3ee0a0427984d763ea7534149531b4 (diff)
Merge branch 'for-3.11/core' of git://git.kernel.dk/linux-block
Pull core block IO updates from Jens Axboe:
 "Here are the core IO block bits for 3.11.  It contains:

   - A tweak to the reserved tag logic from Jan, for weirdo devices
     with just 3 free tags.  But for those it improves things
     substantially for random writes.

   - Periodic writeback fix from Jan.  Marked for stable as well.

   - Fix for a race condition in IO scheduler switching from Jianpeng.

   - The hierarchical blk-cgroup support from Tejun.  This is the
     grunt of the series.

   - blk-throttle fix from Vivek.

  Just a note that I'm in the middle of a relocation, whole family is
  flying out tomorrow.  Hence I will be awol the remainder of this
  week, but back at work again on Monday the 15th.

  CC'ing Tejun, since any potential "surprises" will most likely be
  from the blk-cgroup work.  But it's been brewing for a while and
  sitting in my tree and linux-next for a long time, so should be
  solid."

* 'for-3.11/core' of git://git.kernel.dk/linux-block: (36 commits)
  elevator: Fix a race in elevator switching
  block: Reserve only one queue tag for sync IO if only 3 tags are available
  writeback: Fix periodic writeback after fs mount
  blk-throttle: implement proper hierarchy support
  blk-throttle: implement throtl_grp->has_rules[]
  blk-throttle: Account for child group's start time in parent while bio climbs up
  blk-throttle: add throtl_qnode for dispatch fairness
  blk-throttle: make throtl_pending_timer_fn() ready for hierarchy
  blk-throttle: make tg_dispatch_one_bio() ready for hierarchy
  blk-throttle: make blk_throtl_bio() ready for hierarchy
  blk-throttle: make blk_throtl_drain() ready for hierarchy
  blk-throttle: dispatch from throtl_pending_timer_fn()
  blk-throttle: implement dispatch looping
  blk-throttle: separate out throtl_service_queue->pending_timer from throtl_data->dispatch_work
  blk-throttle: set REQ_THROTTLED from throtl_charge_bio() and gate stats update with it
  blk-throttle: implement sq_to_tg(), sq_to_td() and throtl_log()
  blk-throttle: add throtl_service_queue->parent_sq
  blk-throttle: generalize update_disptime optimization in blk_throtl_bio()
  blk-throttle: dispatch to throtl_data->service_queue.bio_lists[]
  blk-throttle: move bio_lists[] and friends to throtl_service_queue
  ...
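The grunt of the series is the switch from flat to hierarchical throttling: as the blk-throttle comments in the diff below describe, bios queued on a child group are no longer issued directly but climb the service-queue tree level by level until they reach the top-level service queue embedded in throtl_data, where the dispatch work item issues them. A minimal standalone C sketch of that upward walk follows; struct sq, dispatch_upwards and the printed names are invented for illustration and are not the kernel API.

/*
 * Conceptual model only: a bio queued on a child service queue is
 * handed to its parent, level by level, until the top-level queue
 * (no parent) is reached and the bio is ready to be issued.
 */
#include <stdio.h>

struct sq {
	const char *name;
	struct sq *parent;		/* NULL for the top-level service queue */
};

static void dispatch_upwards(struct sq *sq, const char *bio)
{
	while (sq->parent) {		/* hand the bio to the parent's queue */
		printf("%s: pass %s up to %s\n", sq->name, bio, sq->parent->name);
		sq = sq->parent;
	}
	printf("%s: issue %s\n", sq->name, bio);	/* top level reached */
}

int main(void)
{
	struct sq top    = { "td->service_queue", NULL };
	struct sq parent = { "grp-A",             &top };
	struct sq child  = { "grp-A/B",           &parent };

	dispatch_upwards(&child, "bio0");
	return 0;
}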
Diffstat (limited to 'block')
-rw-r--r--   block/blk-cgroup.c        105
-rw-r--r--   block/blk-cgroup.h         38
-rw-r--r--   block/blk-tag.c            11
-rw-r--r--   block/blk-throttle.c     1064
-rw-r--r--   block/cfq-iosched.c        17
-rw-r--r--   block/deadline-iosched.c   16
-rw-r--r--   block/elevator.c           25
-rw-r--r--   block/noop-iosched.c       17
8 files changed, 875 insertions, 418 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e8918ffaf96d..290792a13e3c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -32,26 +32,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
32 32
33static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 33static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
34 34
35static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
36 struct request_queue *q, bool update_hint);
37
38/**
39 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
40 * @d_blkg: loop cursor pointing to the current descendant
41 * @pos_cgrp: used for iteration
42 * @p_blkg: target blkg to walk descendants of
43 *
44 * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
45 * read locked. If called under either blkcg or queue lock, the iteration
46 * is guaranteed to include all and only online blkgs. The caller may
47 * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
48 * subtree.
49 */
50#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
51 cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
52 if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
53 (p_blkg)->q, false)))
54
55static bool blkcg_policy_enabled(struct request_queue *q, 35static bool blkcg_policy_enabled(struct request_queue *q,
56 const struct blkcg_policy *pol) 36 const struct blkcg_policy *pol)
57{ 37{
@@ -71,18 +51,8 @@ static void blkg_free(struct blkcg_gq *blkg)
71 if (!blkg) 51 if (!blkg)
72 return; 52 return;
73 53
74 for (i = 0; i < BLKCG_MAX_POLS; i++) { 54 for (i = 0; i < BLKCG_MAX_POLS; i++)
75 struct blkcg_policy *pol = blkcg_policy[i]; 55 kfree(blkg->pd[i]);
76 struct blkg_policy_data *pd = blkg->pd[i];
77
78 if (!pd)
79 continue;
80
81 if (pol && pol->pd_exit_fn)
82 pol->pd_exit_fn(blkg);
83
84 kfree(pd);
85 }
86 56
87 blk_exit_rl(&blkg->rl); 57 blk_exit_rl(&blkg->rl);
88 kfree(blkg); 58 kfree(blkg);
@@ -134,10 +104,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
134 blkg->pd[i] = pd; 104 blkg->pd[i] = pd;
135 pd->blkg = blkg; 105 pd->blkg = blkg;
136 pd->plid = i; 106 pd->plid = i;
137
138 /* invoke per-policy init */
139 if (pol->pd_init_fn)
140 pol->pd_init_fn(blkg);
141 } 107 }
142 108
143 return blkg; 109 return blkg;
@@ -158,8 +124,8 @@ err_free:
158 * @q's bypass state. If @update_hint is %true, the caller should be 124 * @q's bypass state. If @update_hint is %true, the caller should be
159 * holding @q->queue_lock and lookup hint is updated on success. 125 * holding @q->queue_lock and lookup hint is updated on success.
160 */ 126 */
161static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, 127struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
162 struct request_queue *q, bool update_hint) 128 bool update_hint)
163{ 129{
164 struct blkcg_gq *blkg; 130 struct blkcg_gq *blkg;
165 131
@@ -234,16 +200,25 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
234 } 200 }
235 blkg = new_blkg; 201 blkg = new_blkg;
236 202
237 /* link parent and insert */ 203 /* link parent */
238 if (blkcg_parent(blkcg)) { 204 if (blkcg_parent(blkcg)) {
239 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); 205 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
240 if (WARN_ON_ONCE(!blkg->parent)) { 206 if (WARN_ON_ONCE(!blkg->parent)) {
241 blkg = ERR_PTR(-EINVAL); 207 ret = -EINVAL;
242 goto err_put_css; 208 goto err_put_css;
243 } 209 }
244 blkg_get(blkg->parent); 210 blkg_get(blkg->parent);
245 } 211 }
246 212
213 /* invoke per-policy init */
214 for (i = 0; i < BLKCG_MAX_POLS; i++) {
215 struct blkcg_policy *pol = blkcg_policy[i];
216
217 if (blkg->pd[i] && pol->pd_init_fn)
218 pol->pd_init_fn(blkg);
219 }
220
221 /* insert */
247 spin_lock(&blkcg->lock); 222 spin_lock(&blkcg->lock);
248 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); 223 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
249 if (likely(!ret)) { 224 if (likely(!ret)) {
@@ -394,30 +369,38 @@ static void blkg_destroy_all(struct request_queue *q)
394 q->root_rl.blkg = NULL; 369 q->root_rl.blkg = NULL;
395} 370}
396 371
397static void blkg_rcu_free(struct rcu_head *rcu_head) 372/*
373 * A group is RCU protected, but having an rcu lock does not mean that one
374 * can access all the fields of blkg and assume these are valid. For
375 * example, don't try to follow throtl_data and request queue links.
376 *
377 * Having a reference to blkg under an rcu allows accesses to only values
378 * local to groups like group stats and group rate limits.
379 */
380void __blkg_release_rcu(struct rcu_head *rcu_head)
398{ 381{
399 blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); 382 struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
400} 383 int i;
384
385 /* tell policies that this one is being freed */
386 for (i = 0; i < BLKCG_MAX_POLS; i++) {
387 struct blkcg_policy *pol = blkcg_policy[i];
388
389 if (blkg->pd[i] && pol->pd_exit_fn)
390 pol->pd_exit_fn(blkg);
391 }
401 392
402void __blkg_release(struct blkcg_gq *blkg)
403{
404 /* release the blkcg and parent blkg refs this blkg has been holding */ 393 /* release the blkcg and parent blkg refs this blkg has been holding */
405 css_put(&blkg->blkcg->css); 394 css_put(&blkg->blkcg->css);
406 if (blkg->parent) 395 if (blkg->parent) {
396 spin_lock_irq(blkg->q->queue_lock);
407 blkg_put(blkg->parent); 397 blkg_put(blkg->parent);
398 spin_unlock_irq(blkg->q->queue_lock);
399 }
408 400
409 /* 401 blkg_free(blkg);
410 * A group is freed in rcu manner. But having an rcu lock does not
411 * mean that one can access all the fields of blkg and assume these
412 * are valid. For example, don't try to follow throtl_data and
413 * request queue links.
414 *
415 * Having a reference to blkg under an rcu allows acess to only
416 * values local to groups like group stats and group rate limits
417 */
418 call_rcu(&blkg->rcu_head, blkg_rcu_free);
419} 402}
420EXPORT_SYMBOL_GPL(__blkg_release); 403EXPORT_SYMBOL_GPL(__blkg_release_rcu);
421 404
422/* 405/*
423 * The next function used by blk_queue_for_each_rl(). It's a bit tricky 406 * The next function used by blk_queue_for_each_rl(). It's a bit tricky
@@ -928,14 +911,6 @@ struct cgroup_subsys blkio_subsys = {
928 .subsys_id = blkio_subsys_id, 911 .subsys_id = blkio_subsys_id,
929 .base_cftypes = blkcg_files, 912 .base_cftypes = blkcg_files,
930 .module = THIS_MODULE, 913 .module = THIS_MODULE,
931
932 /*
933 * blkio subsystem is utterly broken in terms of hierarchy support.
934 * It treats all cgroups equally regardless of where they're
935 * located in the hierarchy - all cgroups are treated as if they're
936 * right below the root. Fix it and remove the following.
937 */
938 .broken_hierarchy = true,
939}; 914};
940EXPORT_SYMBOL_GPL(blkio_subsys); 915EXPORT_SYMBOL_GPL(blkio_subsys);
941 916
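The blk-cgroup.c hunks above reorder the per-policy callbacks around the blkg lifetime: pd_init_fn() is now invoked from blkg_create() only after blkg->parent has been linked, and pd_exit_fn() moves from blkg_free() into the RCU release path just before the group is freed. The following standalone C sketch only models that ordering; struct grp, struct policy, create_grp and release_grp are invented names with no relation to the real kernel interfaces.

#include <stdio.h>
#include <stdlib.h>

struct grp;

struct policy {
	void (*init)(struct grp *);	/* runs after the parent is linked */
	void (*exit)(struct grp *);	/* runs right before the group is freed */
};

struct grp {
	struct grp *parent;
	const struct policy *pol;
};

static void pol_init(struct grp *g)
{
	/* safe: create_grp() links the parent before calling us */
	printf("init, parent %s\n", g->parent ? "linked" : "none");
}

static void pol_exit(struct grp *g)
{
	(void)g;
	printf("exit before free\n");
}

static const struct policy pol = { pol_init, pol_exit };

static struct grp *create_grp(struct grp *parent)
{
	struct grp *g = calloc(1, sizeof(*g));

	g->parent = parent;		/* link the parent first ... */
	g->pol = &pol;
	g->pol->init(g);		/* ... then run per-policy init */
	return g;
}

static void release_grp(struct grp *g)
{
	g->pol->exit(g);		/* per-policy exit ... */
	free(g);			/* ... then free the group */
}

int main(void)
{
	struct grp *root = create_grp(NULL);
	struct grp *child = create_grp(root);

	release_grp(child);
	release_grp(root);
	return 0;
}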
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 4e595ee8c915..8056c03a3382 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -266,7 +266,7 @@ static inline void blkg_get(struct blkcg_gq *blkg)
266 blkg->refcnt++; 266 blkg->refcnt++;
267} 267}
268 268
269void __blkg_release(struct blkcg_gq *blkg); 269void __blkg_release_rcu(struct rcu_head *rcu);
270 270
271/** 271/**
272 * blkg_put - put a blkg reference 272 * blkg_put - put a blkg reference
@@ -279,9 +279,43 @@ static inline void blkg_put(struct blkcg_gq *blkg)
279 lockdep_assert_held(blkg->q->queue_lock); 279 lockdep_assert_held(blkg->q->queue_lock);
280 WARN_ON_ONCE(blkg->refcnt <= 0); 280 WARN_ON_ONCE(blkg->refcnt <= 0);
281 if (!--blkg->refcnt) 281 if (!--blkg->refcnt)
282 __blkg_release(blkg); 282 call_rcu(&blkg->rcu_head, __blkg_release_rcu);
283} 283}
284 284
285struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
286 bool update_hint);
287
288/**
289 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
290 * @d_blkg: loop cursor pointing to the current descendant
291 * @pos_cgrp: used for iteration
292 * @p_blkg: target blkg to walk descendants of
293 *
294 * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
295 * read locked. If called under either blkcg or queue lock, the iteration
296 * is guaranteed to include all and only online blkgs. The caller may
297 * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
298 * subtree.
299 */
300#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
301 cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
302 if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
303 (p_blkg)->q, false)))
304
305/**
306 * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
307 * @d_blkg: loop cursor pointing to the current descendant
308 * @pos_cgrp: used for iteration
309 * @p_blkg: target blkg to walk descendants of
310 *
311 * Similar to blkg_for_each_descendant_pre() but performs post-order
312 * traversal instead. Synchronization rules are the same.
313 */
314#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg) \
315 cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
316 if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
317 (p_blkg)->q, false)))
318
285/** 319/**
286 * blk_get_rl - get request_list to use 320 * blk_get_rl - get request_list to use
287 * @q: request_queue of interest 321 * @q: request_queue of interest
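blk-cgroup.h now exports __blkg_lookup(), hosts blkg_for_each_descendant_pre() and adds a post-order variant with the same synchronization rules. For readers unfamiliar with the two orders, here is a small standalone C sketch: pre-order visits a group before any of its descendants, post-order visits all descendants first. struct node, walk_pre and walk_post are invented here and have nothing to do with the cgroup iterators themselves.

#include <stdio.h>

/* tiny static tree: root -> {a, b}, a -> {a1} */
struct node {
	const char *name;
	struct node *child;	/* first child */
	struct node *sibling;	/* next sibling */
};

static void walk_pre(struct node *n)
{
	for (; n; n = n->sibling) {
		printf("pre:  %s\n", n->name);	/* parent before children */
		walk_pre(n->child);
	}
}

static void walk_post(struct node *n)
{
	for (; n; n = n->sibling) {
		walk_post(n->child);
		printf("post: %s\n", n->name);	/* children before parent */
	}
}

int main(void)
{
	struct node a1   = { "root/a/a1", NULL, NULL };
	struct node b    = { "root/b",    NULL, NULL };
	struct node a    = { "root/a",    &a1,  &b   };
	struct node root = { "root",      &a,   NULL };

	walk_pre(&root);
	walk_post(&root);
	return 0;
}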
diff --git a/block/blk-tag.c b/block/blk-tag.c
index cc345e1d8d4e..3f33d8672268 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -348,9 +348,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
348 */ 348 */
349 max_depth = bqt->max_depth; 349 max_depth = bqt->max_depth;
350 if (!rq_is_sync(rq) && max_depth > 1) { 350 if (!rq_is_sync(rq) && max_depth > 1) {
351 max_depth -= 2; 351 switch (max_depth) {
352 if (!max_depth) 352 case 2:
353 max_depth = 1; 353 max_depth = 1;
354 break;
355 case 3:
356 max_depth = 2;
357 break;
358 default:
359 max_depth -= 2;
360 }
354 if (q->in_flight[BLK_RW_ASYNC] > max_depth) 361 if (q->in_flight[BLK_RW_ASYNC] > max_depth)
355 return 1; 362 return 1;
356 } 363 }
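The blk-tag.c hunk above is the depth-3 tweak called out in the pull message: with only three tags, the old "max_depth -= 2" left a single tag for async IO, while the new switch reserves just one tag for sync and lets async use two. A standalone comparison of the two behaviours follows; old_async_depth and new_async_depth are names made up for this sketch.

#include <stdio.h>

/* behaviour of the old code: always keep two tags for sync IO */
static unsigned int old_async_depth(unsigned int max_depth)
{
	max_depth -= 2;
	if (!max_depth)
		max_depth = 1;
	return max_depth;
}

/* behaviour of the new code: with 3 tags, keep only one for sync IO */
static unsigned int new_async_depth(unsigned int max_depth)
{
	switch (max_depth) {
	case 2:
		return 1;
	case 3:
		return 2;
	default:
		return max_depth - 2;
	}
}

int main(void)
{
	unsigned int depth;

	for (depth = 2; depth <= 5; depth++)
		printf("max_depth=%u old=%u new=%u\n", depth,
		       old_async_depth(depth), new_async_depth(depth));
	return 0;	/* only max_depth=3 differs: async gets 2 tags instead of 1 */
}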
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 31146225f3d0..08a32dfd3844 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -25,18 +25,61 @@ static struct blkcg_policy blkcg_policy_throtl;
25 25
26/* A workqueue to queue throttle related work */ 26/* A workqueue to queue throttle related work */
27static struct workqueue_struct *kthrotld_workqueue; 27static struct workqueue_struct *kthrotld_workqueue;
28static void throtl_schedule_delayed_work(struct throtl_data *td, 28
29 unsigned long delay); 29/*
30 30 * To implement hierarchical throttling, throtl_grps form a tree and bios
31struct throtl_rb_root { 31 * are dispatched upwards level by level until they reach the top and get
32 struct rb_root rb; 32 * issued. When dispatching bios from the children and local group at each
33 struct rb_node *left; 33 * level, if the bios are dispatched into a single bio_list, there's a risk
34 unsigned int count; 34 * of a local or child group which can queue many bios at once filling up
35 unsigned long min_disptime; 35 * the list starving others.
36 *
37 * To avoid such starvation, dispatched bios are queued separately
38 * according to where they came from. When they are again dispatched to
39 * the parent, they're popped in round-robin order so that no single source
40 * hogs the dispatch window.
41 *
42 * throtl_qnode is used to keep the queued bios separated by their sources.
43 * Bios are queued to throtl_qnode which in turn is queued to
44 * throtl_service_queue and then dispatched in round-robin order.
45 *
46 * It's also used to track the reference counts on blkg's. A qnode always
47 * belongs to a throtl_grp and gets queued on itself or the parent, so
48 * incrementing the reference of the associated throtl_grp when a qnode is
49 * queued and decrementing when dequeued is enough to keep the whole blkg
50 * tree pinned while bios are in flight.
51 */
52struct throtl_qnode {
53 struct list_head node; /* service_queue->queued[] */
54 struct bio_list bios; /* queued bios */
55 struct throtl_grp *tg; /* tg this qnode belongs to */
36}; 56};
37 57
38#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ 58struct throtl_service_queue {
39 .count = 0, .min_disptime = 0} 59 struct throtl_service_queue *parent_sq; /* the parent service_queue */
60
61 /*
62 * Bios queued directly to this service_queue or dispatched from
63 * children throtl_grp's.
64 */
65 struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
66 unsigned int nr_queued[2]; /* number of queued bios */
67
68 /*
69 * RB tree of active children throtl_grp's, which are sorted by
70 * their ->disptime.
71 */
72 struct rb_root pending_tree; /* RB tree of active tgs */
73 struct rb_node *first_pending; /* first node in the tree */
74 unsigned int nr_pending; /* # queued in the tree */
75 unsigned long first_pending_disptime; /* disptime of the first tg */
76 struct timer_list pending_timer; /* fires on first_pending_disptime */
77};
78
79enum tg_state_flags {
80 THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
81 THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
82};
40 83
41#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 84#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
42 85
@@ -52,9 +95,26 @@ struct throtl_grp {
52 /* must be the first member */ 95 /* must be the first member */
53 struct blkg_policy_data pd; 96 struct blkg_policy_data pd;
54 97
55 /* active throtl group service_tree member */ 98 /* active throtl group service_queue member */
56 struct rb_node rb_node; 99 struct rb_node rb_node;
57 100
101 /* throtl_data this group belongs to */
102 struct throtl_data *td;
103
104 /* this group's service queue */
105 struct throtl_service_queue service_queue;
106
107 /*
108 * qnode_on_self is used when bios are directly queued to this
109 * throtl_grp so that local bios compete fairly with bios
110 * dispatched from children. qnode_on_parent is used when bios are
111 * dispatched from this throtl_grp into its parent and will compete
112 * with the sibling qnode_on_parents and the parent's
113 * qnode_on_self.
114 */
115 struct throtl_qnode qnode_on_self[2];
116 struct throtl_qnode qnode_on_parent[2];
117
58 /* 118 /*
59 * Dispatch time in jiffies. This is the estimated time when group 119 * Dispatch time in jiffies. This is the estimated time when group
60 * will unthrottle and is ready to dispatch more bio. It is used as 120 * will unthrottle and is ready to dispatch more bio. It is used as
@@ -64,11 +124,8 @@ struct throtl_grp {
64 124
65 unsigned int flags; 125 unsigned int flags;
66 126
67 /* Two lists for READ and WRITE */ 127 /* are there any throtl rules between this group and td? */
68 struct bio_list bio_lists[2]; 128 bool has_rules[2];
69
70 /* Number of queued bios on READ and WRITE lists */
71 unsigned int nr_queued[2];
72 129
73 /* bytes per second rate limits */ 130 /* bytes per second rate limits */
74 uint64_t bps[2]; 131 uint64_t bps[2];
@@ -85,9 +142,6 @@ struct throtl_grp {
85 unsigned long slice_start[2]; 142 unsigned long slice_start[2];
86 unsigned long slice_end[2]; 143 unsigned long slice_end[2];
87 144
88 /* Some throttle limits got updated for the group */
89 int limits_changed;
90
91 /* Per cpu stats pointer */ 145 /* Per cpu stats pointer */
92 struct tg_stats_cpu __percpu *stats_cpu; 146 struct tg_stats_cpu __percpu *stats_cpu;
93 147
@@ -98,7 +152,7 @@ struct throtl_grp {
98struct throtl_data 152struct throtl_data
99{ 153{
100 /* service tree for active throtl groups */ 154 /* service tree for active throtl groups */
101 struct throtl_rb_root tg_service_tree; 155 struct throtl_service_queue service_queue;
102 156
103 struct request_queue *queue; 157 struct request_queue *queue;
104 158
@@ -111,9 +165,7 @@ struct throtl_data
111 unsigned int nr_undestroyed_grps; 165 unsigned int nr_undestroyed_grps;
112 166
113 /* Work for dispatching throttled bios */ 167 /* Work for dispatching throttled bios */
114 struct delayed_work throtl_work; 168 struct work_struct dispatch_work;
115
116 int limits_changed;
117}; 169};
118 170
119/* list and work item to allocate percpu group stats */ 171/* list and work item to allocate percpu group stats */
@@ -123,6 +175,8 @@ static LIST_HEAD(tg_stats_alloc_list);
123static void tg_stats_alloc_fn(struct work_struct *); 175static void tg_stats_alloc_fn(struct work_struct *);
124static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); 176static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
125 177
178static void throtl_pending_timer_fn(unsigned long arg);
179
126static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) 180static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
127{ 181{
128 return pd ? container_of(pd, struct throtl_grp, pd) : NULL; 182 return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
@@ -143,41 +197,65 @@ static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
143 return blkg_to_tg(td->queue->root_blkg); 197 return blkg_to_tg(td->queue->root_blkg);
144} 198}
145 199
146enum tg_state_flags { 200/**
147 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ 201 * sq_to_tg - return the throl_grp the specified service queue belongs to
148}; 202 * @sq: the throtl_service_queue of interest
149 203 *
150#define THROTL_TG_FNS(name) \ 204 * Return the throtl_grp @sq belongs to. If @sq is the top-level one
151static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ 205 * embedded in throtl_data, %NULL is returned.
152{ \ 206 */
153 (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ 207static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
154} \ 208{
155static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ 209 if (sq && sq->parent_sq)
156{ \ 210 return container_of(sq, struct throtl_grp, service_queue);
157 (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ 211 else
158} \ 212 return NULL;
159static inline int throtl_tg_##name(const struct throtl_grp *tg) \
160{ \
161 return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
162} 213}
163 214
164THROTL_TG_FNS(on_rr); 215/**
216 * sq_to_td - return throtl_data the specified service queue belongs to
217 * @sq: the throtl_service_queue of interest
218 *
219 * A service_queue can be embeded in either a throtl_grp or throtl_data.
220 * Determine the associated throtl_data accordingly and return it.
221 */
222static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
223{
224 struct throtl_grp *tg = sq_to_tg(sq);
165 225
166#define throtl_log_tg(td, tg, fmt, args...) do { \ 226 if (tg)
167 char __pbuf[128]; \ 227 return tg->td;
228 else
229 return container_of(sq, struct throtl_data, service_queue);
230}
231
232/**
233 * throtl_log - log debug message via blktrace
234 * @sq: the service_queue being reported
235 * @fmt: printf format string
236 * @args: printf args
237 *
238 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
239 * throtl_grp; otherwise, just "throtl".
240 *
241 * TODO: this should be made a function and name formatting should happen
242 * after testing whether blktrace is enabled.
243 */
244#define throtl_log(sq, fmt, args...) do { \
245 struct throtl_grp *__tg = sq_to_tg((sq)); \
246 struct throtl_data *__td = sq_to_td((sq)); \
247 \
248 (void)__td; \
249 if ((__tg)) { \
250 char __pbuf[128]; \
168 \ 251 \
169 blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \ 252 blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \
170 blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \ 253 blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
254 } else { \
255 blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
256 } \
171} while (0) 257} while (0)
172 258
173#define throtl_log(td, fmt, args...) \
174 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
175
176static inline unsigned int total_nr_queued(struct throtl_data *td)
177{
178 return td->nr_queued[0] + td->nr_queued[1];
179}
180
181/* 259/*
182 * Worker for allocating per cpu stat for tgs. This is scheduled on the 260 * Worker for allocating per cpu stat for tgs. This is scheduled on the
183 * system_wq once there are some groups on the alloc_list waiting for 261 * system_wq once there are some groups on the alloc_list waiting for
@@ -215,15 +293,141 @@ alloc_stats:
215 goto alloc_stats; 293 goto alloc_stats;
216} 294}
217 295
296static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
297{
298 INIT_LIST_HEAD(&qn->node);
299 bio_list_init(&qn->bios);
300 qn->tg = tg;
301}
302
303/**
304 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
305 * @bio: bio being added
306 * @qn: qnode to add bio to
307 * @queued: the service_queue->queued[] list @qn belongs to
308 *
309 * Add @bio to @qn and put @qn on @queued if it's not already on.
310 * @qn->tg's reference count is bumped when @qn is activated. See the
311 * comment on top of throtl_qnode definition for details.
312 */
313static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
314 struct list_head *queued)
315{
316 bio_list_add(&qn->bios, bio);
317 if (list_empty(&qn->node)) {
318 list_add_tail(&qn->node, queued);
319 blkg_get(tg_to_blkg(qn->tg));
320 }
321}
322
323/**
324 * throtl_peek_queued - peek the first bio on a qnode list
325 * @queued: the qnode list to peek
326 */
327static struct bio *throtl_peek_queued(struct list_head *queued)
328{
329 struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
330 struct bio *bio;
331
332 if (list_empty(queued))
333 return NULL;
334
335 bio = bio_list_peek(&qn->bios);
336 WARN_ON_ONCE(!bio);
337 return bio;
338}
339
340/**
341 * throtl_pop_queued - pop the first bio form a qnode list
342 * @queued: the qnode list to pop a bio from
343 * @tg_to_put: optional out argument for throtl_grp to put
344 *
345 * Pop the first bio from the qnode list @queued. After popping, the first
346 * qnode is removed from @queued if empty or moved to the end of @queued so
347 * that the popping order is round-robin.
348 *
349 * When the first qnode is removed, its associated throtl_grp should be put
350 * too. If @tg_to_put is NULL, this function automatically puts it;
351 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
352 * responsible for putting it.
353 */
354static struct bio *throtl_pop_queued(struct list_head *queued,
355 struct throtl_grp **tg_to_put)
356{
357 struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
358 struct bio *bio;
359
360 if (list_empty(queued))
361 return NULL;
362
363 bio = bio_list_pop(&qn->bios);
364 WARN_ON_ONCE(!bio);
365
366 if (bio_list_empty(&qn->bios)) {
367 list_del_init(&qn->node);
368 if (tg_to_put)
369 *tg_to_put = qn->tg;
370 else
371 blkg_put(tg_to_blkg(qn->tg));
372 } else {
373 list_move_tail(&qn->node, queued);
374 }
375
376 return bio;
377}
378
379/* init a service_queue, assumes the caller zeroed it */
380static void throtl_service_queue_init(struct throtl_service_queue *sq,
381 struct throtl_service_queue *parent_sq)
382{
383 INIT_LIST_HEAD(&sq->queued[0]);
384 INIT_LIST_HEAD(&sq->queued[1]);
385 sq->pending_tree = RB_ROOT;
386 sq->parent_sq = parent_sq;
387 setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
388 (unsigned long)sq);
389}
390
391static void throtl_service_queue_exit(struct throtl_service_queue *sq)
392{
393 del_timer_sync(&sq->pending_timer);
394}
395
218static void throtl_pd_init(struct blkcg_gq *blkg) 396static void throtl_pd_init(struct blkcg_gq *blkg)
219{ 397{
220 struct throtl_grp *tg = blkg_to_tg(blkg); 398 struct throtl_grp *tg = blkg_to_tg(blkg);
399 struct throtl_data *td = blkg->q->td;
400 struct throtl_service_queue *parent_sq;
221 unsigned long flags; 401 unsigned long flags;
402 int rw;
403
404 /*
405 * If sane_hierarchy is enabled, we switch to properly hierarchical
406 * behavior where limits on a given throtl_grp are applied to the
407 * whole subtree rather than just the group itself. e.g. If 16M
408 * read_bps limit is set on the root group, the whole system can't
409 * exceed 16M for the device.
410 *
411 * If sane_hierarchy is not enabled, the broken flat hierarchy
412 * behavior is retained where all throtl_grps are treated as if
413 * they're all separate root groups right below throtl_data.
414 * Limits of a group don't interact with limits of other groups
415 * regardless of the position of the group in the hierarchy.
416 */
417 parent_sq = &td->service_queue;
418
419 if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent)
420 parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
421
422 throtl_service_queue_init(&tg->service_queue, parent_sq);
423
424 for (rw = READ; rw <= WRITE; rw++) {
425 throtl_qnode_init(&tg->qnode_on_self[rw], tg);
426 throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
427 }
222 428
223 RB_CLEAR_NODE(&tg->rb_node); 429 RB_CLEAR_NODE(&tg->rb_node);
224 bio_list_init(&tg->bio_lists[0]); 430 tg->td = td;
225 bio_list_init(&tg->bio_lists[1]);
226 tg->limits_changed = false;
227 431
228 tg->bps[READ] = -1; 432 tg->bps[READ] = -1;
229 tg->bps[WRITE] = -1; 433 tg->bps[WRITE] = -1;
@@ -241,6 +445,30 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
241 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 445 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
242} 446}
243 447
448/*
449 * Set has_rules[] if @tg or any of its parents have limits configured.
450 * This doesn't require walking up to the top of the hierarchy as the
451 * parent's has_rules[] is guaranteed to be correct.
452 */
453static void tg_update_has_rules(struct throtl_grp *tg)
454{
455 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
456 int rw;
457
458 for (rw = READ; rw <= WRITE; rw++)
459 tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
460 (tg->bps[rw] != -1 || tg->iops[rw] != -1);
461}
462
463static void throtl_pd_online(struct blkcg_gq *blkg)
464{
465 /*
466 * We don't want new groups to escape the limits of its ancestors.
467 * Update has_rules[] after a new group is brought online.
468 */
469 tg_update_has_rules(blkg_to_tg(blkg));
470}
471
244static void throtl_pd_exit(struct blkcg_gq *blkg) 472static void throtl_pd_exit(struct blkcg_gq *blkg)
245{ 473{
246 struct throtl_grp *tg = blkg_to_tg(blkg); 474 struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -251,6 +479,8 @@ static void throtl_pd_exit(struct blkcg_gq *blkg)
251 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 479 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
252 480
253 free_percpu(tg->stats_cpu); 481 free_percpu(tg->stats_cpu);
482
483 throtl_service_queue_exit(&tg->service_queue);
254} 484}
255 485
256static void throtl_pd_reset_stats(struct blkcg_gq *blkg) 486static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
@@ -309,17 +539,18 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
309 return tg; 539 return tg;
310} 540}
311 541
312static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) 542static struct throtl_grp *
543throtl_rb_first(struct throtl_service_queue *parent_sq)
313{ 544{
314 /* Service tree is empty */ 545 /* Service tree is empty */
315 if (!root->count) 546 if (!parent_sq->nr_pending)
316 return NULL; 547 return NULL;
317 548
318 if (!root->left) 549 if (!parent_sq->first_pending)
319 root->left = rb_first(&root->rb); 550 parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
320 551
321 if (root->left) 552 if (parent_sq->first_pending)
322 return rb_entry_tg(root->left); 553 return rb_entry_tg(parent_sq->first_pending);
323 554
324 return NULL; 555 return NULL;
325} 556}
@@ -330,29 +561,30 @@ static void rb_erase_init(struct rb_node *n, struct rb_root *root)
330 RB_CLEAR_NODE(n); 561 RB_CLEAR_NODE(n);
331} 562}
332 563
333static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) 564static void throtl_rb_erase(struct rb_node *n,
565 struct throtl_service_queue *parent_sq)
334{ 566{
335 if (root->left == n) 567 if (parent_sq->first_pending == n)
336 root->left = NULL; 568 parent_sq->first_pending = NULL;
337 rb_erase_init(n, &root->rb); 569 rb_erase_init(n, &parent_sq->pending_tree);
338 --root->count; 570 --parent_sq->nr_pending;
339} 571}
340 572
341static void update_min_dispatch_time(struct throtl_rb_root *st) 573static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
342{ 574{
343 struct throtl_grp *tg; 575 struct throtl_grp *tg;
344 576
345 tg = throtl_rb_first(st); 577 tg = throtl_rb_first(parent_sq);
346 if (!tg) 578 if (!tg)
347 return; 579 return;
348 580
349 st->min_disptime = tg->disptime; 581 parent_sq->first_pending_disptime = tg->disptime;
350} 582}
351 583
352static void 584static void tg_service_queue_add(struct throtl_grp *tg)
353tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
354{ 585{
355 struct rb_node **node = &st->rb.rb_node; 586 struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
587 struct rb_node **node = &parent_sq->pending_tree.rb_node;
356 struct rb_node *parent = NULL; 588 struct rb_node *parent = NULL;
357 struct throtl_grp *__tg; 589 struct throtl_grp *__tg;
358 unsigned long key = tg->disptime; 590 unsigned long key = tg->disptime;
@@ -371,89 +603,135 @@ tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
371 } 603 }
372 604
373 if (left) 605 if (left)
374 st->left = &tg->rb_node; 606 parent_sq->first_pending = &tg->rb_node;
375 607
376 rb_link_node(&tg->rb_node, parent, node); 608 rb_link_node(&tg->rb_node, parent, node);
377 rb_insert_color(&tg->rb_node, &st->rb); 609 rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
378} 610}
379 611
380static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) 612static void __throtl_enqueue_tg(struct throtl_grp *tg)
381{ 613{
382 struct throtl_rb_root *st = &td->tg_service_tree; 614 tg_service_queue_add(tg);
615 tg->flags |= THROTL_TG_PENDING;
616 tg->service_queue.parent_sq->nr_pending++;
617}
383 618
384 tg_service_tree_add(st, tg); 619static void throtl_enqueue_tg(struct throtl_grp *tg)
385 throtl_mark_tg_on_rr(tg); 620{
386 st->count++; 621 if (!(tg->flags & THROTL_TG_PENDING))
622 __throtl_enqueue_tg(tg);
387} 623}
388 624
389static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) 625static void __throtl_dequeue_tg(struct throtl_grp *tg)
390{ 626{
391 if (!throtl_tg_on_rr(tg)) 627 throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
392 __throtl_enqueue_tg(td, tg); 628 tg->flags &= ~THROTL_TG_PENDING;
393} 629}
394 630
395static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) 631static void throtl_dequeue_tg(struct throtl_grp *tg)
396{ 632{
397 throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); 633 if (tg->flags & THROTL_TG_PENDING)
398 throtl_clear_tg_on_rr(tg); 634 __throtl_dequeue_tg(tg);
399} 635}
400 636
401static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) 637/* Call with queue lock held */
638static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
639 unsigned long expires)
402{ 640{
403 if (throtl_tg_on_rr(tg)) 641 mod_timer(&sq->pending_timer, expires);
404 __throtl_dequeue_tg(td, tg); 642 throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
643 expires - jiffies, jiffies);
405} 644}
406 645
407static void throtl_schedule_next_dispatch(struct throtl_data *td) 646/**
647 * throtl_schedule_next_dispatch - schedule the next dispatch cycle
648 * @sq: the service_queue to schedule dispatch for
649 * @force: force scheduling
650 *
651 * Arm @sq->pending_timer so that the next dispatch cycle starts on the
652 * dispatch time of the first pending child. Returns %true if either timer
653 * is armed or there's no pending child left. %false if the current
654 * dispatch window is still open and the caller should continue
655 * dispatching.
656 *
657 * If @force is %true, the dispatch timer is always scheduled and this
658 * function is guaranteed to return %true. This is to be used when the
659 * caller can't dispatch itself and needs to invoke pending_timer
660 * unconditionally. Note that forced scheduling is likely to induce short
661 * delay before dispatch starts even if @sq->first_pending_disptime is not
662 * in the future and thus shouldn't be used in hot paths.
663 */
664static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
665 bool force)
408{ 666{
409 struct throtl_rb_root *st = &td->tg_service_tree; 667 /* any pending children left? */
668 if (!sq->nr_pending)
669 return true;
410 670
411 /* 671 update_min_dispatch_time(sq);
412 * If there are more bios pending, schedule more work.
413 */
414 if (!total_nr_queued(td))
415 return;
416 672
417 BUG_ON(!st->count); 673 /* is the next dispatch time in the future? */
674 if (force || time_after(sq->first_pending_disptime, jiffies)) {
675 throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
676 return true;
677 }
418 678
419 update_min_dispatch_time(st); 679 /* tell the caller to continue dispatching */
680 return false;
681}
420 682
421 if (time_before_eq(st->min_disptime, jiffies)) 683static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
422 throtl_schedule_delayed_work(td, 0); 684 bool rw, unsigned long start)
423 else 685{
424 throtl_schedule_delayed_work(td, (st->min_disptime - jiffies)); 686 tg->bytes_disp[rw] = 0;
687 tg->io_disp[rw] = 0;
688
689 /*
690 * Previous slice has expired. We must have trimmed it after last
691 * bio dispatch. That means since start of last slice, we never used
692 * that bandwidth. Do try to make use of that bandwidth while giving
693 * credit.
694 */
695 if (time_after_eq(start, tg->slice_start[rw]))
696 tg->slice_start[rw] = start;
697
698 tg->slice_end[rw] = jiffies + throtl_slice;
699 throtl_log(&tg->service_queue,
700 "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
701 rw == READ ? 'R' : 'W', tg->slice_start[rw],
702 tg->slice_end[rw], jiffies);
425} 703}
426 704
427static inline void 705static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
428throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
429{ 706{
430 tg->bytes_disp[rw] = 0; 707 tg->bytes_disp[rw] = 0;
431 tg->io_disp[rw] = 0; 708 tg->io_disp[rw] = 0;
432 tg->slice_start[rw] = jiffies; 709 tg->slice_start[rw] = jiffies;
433 tg->slice_end[rw] = jiffies + throtl_slice; 710 tg->slice_end[rw] = jiffies + throtl_slice;
434 throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", 711 throtl_log(&tg->service_queue,
435 rw == READ ? 'R' : 'W', tg->slice_start[rw], 712 "[%c] new slice start=%lu end=%lu jiffies=%lu",
436 tg->slice_end[rw], jiffies); 713 rw == READ ? 'R' : 'W', tg->slice_start[rw],
714 tg->slice_end[rw], jiffies);
437} 715}
438 716
439static inline void throtl_set_slice_end(struct throtl_data *td, 717static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
440 struct throtl_grp *tg, bool rw, unsigned long jiffy_end) 718 unsigned long jiffy_end)
441{ 719{
442 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 720 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
443} 721}
444 722
445static inline void throtl_extend_slice(struct throtl_data *td, 723static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
446 struct throtl_grp *tg, bool rw, unsigned long jiffy_end) 724 unsigned long jiffy_end)
447{ 725{
448 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 726 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
449 throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", 727 throtl_log(&tg->service_queue,
450 rw == READ ? 'R' : 'W', tg->slice_start[rw], 728 "[%c] extend slice start=%lu end=%lu jiffies=%lu",
451 tg->slice_end[rw], jiffies); 729 rw == READ ? 'R' : 'W', tg->slice_start[rw],
730 tg->slice_end[rw], jiffies);
452} 731}
453 732
454/* Determine if previously allocated or extended slice is complete or not */ 733/* Determine if previously allocated or extended slice is complete or not */
455static bool 734static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
456throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
457{ 735{
458 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 736 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
459 return 0; 737 return 0;
@@ -462,8 +740,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
462} 740}
463 741
464/* Trim the used slices and adjust slice start accordingly */ 742/* Trim the used slices and adjust slice start accordingly */
465static inline void 743static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
466throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
467{ 744{
468 unsigned long nr_slices, time_elapsed, io_trim; 745 unsigned long nr_slices, time_elapsed, io_trim;
469 u64 bytes_trim, tmp; 746 u64 bytes_trim, tmp;
@@ -475,7 +752,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
475 * renewed. Don't try to trim the slice if slice is used. A new 752 * renewed. Don't try to trim the slice if slice is used. A new
476 * slice will start when appropriate. 753 * slice will start when appropriate.
477 */ 754 */
478 if (throtl_slice_used(td, tg, rw)) 755 if (throtl_slice_used(tg, rw))
479 return; 756 return;
480 757
481 /* 758 /*
@@ -486,7 +763,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
486 * is bad because it does not allow new slice to start. 763 * is bad because it does not allow new slice to start.
487 */ 764 */
488 765
489 throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice); 766 throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
490 767
491 time_elapsed = jiffies - tg->slice_start[rw]; 768 time_elapsed = jiffies - tg->slice_start[rw];
492 769
@@ -515,14 +792,14 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
515 792
516 tg->slice_start[rw] += nr_slices * throtl_slice; 793 tg->slice_start[rw] += nr_slices * throtl_slice;
517 794
518 throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" 795 throtl_log(&tg->service_queue,
519 " start=%lu end=%lu jiffies=%lu", 796 "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
520 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, 797 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
521 tg->slice_start[rw], tg->slice_end[rw], jiffies); 798 tg->slice_start[rw], tg->slice_end[rw], jiffies);
522} 799}
523 800
524static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, 801static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
525 struct bio *bio, unsigned long *wait) 802 unsigned long *wait)
526{ 803{
527 bool rw = bio_data_dir(bio); 804 bool rw = bio_data_dir(bio);
528 unsigned int io_allowed; 805 unsigned int io_allowed;
@@ -571,8 +848,8 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
571 return 0; 848 return 0;
572} 849}
573 850
574static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, 851static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
575 struct bio *bio, unsigned long *wait) 852 unsigned long *wait)
576{ 853{
577 bool rw = bio_data_dir(bio); 854 bool rw = bio_data_dir(bio);
578 u64 bytes_allowed, extra_bytes, tmp; 855 u64 bytes_allowed, extra_bytes, tmp;
@@ -613,18 +890,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
613 return 0; 890 return 0;
614} 891}
615 892
616static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
617 if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
618 return 1;
619 return 0;
620}
621
622/* 893/*
623 * Returns whether one can dispatch a bio or not. Also returns approx number 894 * Returns whether one can dispatch a bio or not. Also returns approx number
624 * of jiffies to wait before this bio is with-in IO rate and can be dispatched 895 * of jiffies to wait before this bio is with-in IO rate and can be dispatched
625 */ 896 */
626static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, 897static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
627 struct bio *bio, unsigned long *wait) 898 unsigned long *wait)
628{ 899{
629 bool rw = bio_data_dir(bio); 900 bool rw = bio_data_dir(bio);
630 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; 901 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
@@ -635,7 +906,8 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
635 * this function with a different bio if there are other bios 906 * this function with a different bio if there are other bios
636 * queued. 907 * queued.
637 */ 908 */
638 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); 909 BUG_ON(tg->service_queue.nr_queued[rw] &&
910 bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
639 911
640 /* If tg->bps = -1, then BW is unlimited */ 912 /* If tg->bps = -1, then BW is unlimited */
641 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { 913 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
@@ -649,15 +921,15 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
649 * existing slice to make sure it is at least throtl_slice interval 921 * existing slice to make sure it is at least throtl_slice interval
650 * long since now. 922 * long since now.
651 */ 923 */
652 if (throtl_slice_used(td, tg, rw)) 924 if (throtl_slice_used(tg, rw))
653 throtl_start_new_slice(td, tg, rw); 925 throtl_start_new_slice(tg, rw);
654 else { 926 else {
655 if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) 927 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
656 throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); 928 throtl_extend_slice(tg, rw, jiffies + throtl_slice);
657 } 929 }
658 930
659 if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) 931 if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
660 && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { 932 tg_with_in_iops_limit(tg, bio, &iops_wait)) {
661 if (wait) 933 if (wait)
662 *wait = 0; 934 *wait = 0;
663 return 1; 935 return 1;
@@ -669,7 +941,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
669 *wait = max_wait; 941 *wait = max_wait;
670 942
671 if (time_before(tg->slice_end[rw], jiffies + max_wait)) 943 if (time_before(tg->slice_end[rw], jiffies + max_wait))
672 throtl_extend_slice(td, tg, rw, jiffies + max_wait); 944 throtl_extend_slice(tg, rw, jiffies + max_wait);
673 945
674 return 0; 946 return 0;
675} 947}
@@ -708,65 +980,136 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
708 tg->bytes_disp[rw] += bio->bi_size; 980 tg->bytes_disp[rw] += bio->bi_size;
709 tg->io_disp[rw]++; 981 tg->io_disp[rw]++;
710 982
711 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); 983 /*
984 * REQ_THROTTLED is used to prevent the same bio to be throttled
985 * more than once as a throttled bio will go through blk-throtl the
986 * second time when it eventually gets issued. Set it when a bio
987 * is being charged to a tg.
988 *
989 * Dispatch stats aren't recursive and each @bio should only be
990 * accounted by the @tg it was originally associated with. Let's
991 * update the stats when setting REQ_THROTTLED for the first time
992 * which is guaranteed to be for the @bio's original tg.
993 */
994 if (!(bio->bi_rw & REQ_THROTTLED)) {
995 bio->bi_rw |= REQ_THROTTLED;
996 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size,
997 bio->bi_rw);
998 }
712} 999}
713 1000
714static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, 1001/**
715 struct bio *bio) 1002 * throtl_add_bio_tg - add a bio to the specified throtl_grp
1003 * @bio: bio to add
1004 * @qn: qnode to use
1005 * @tg: the target throtl_grp
1006 *
1007 * Add @bio to @tg's service_queue using @qn. If @qn is not specified,
1008 * tg->qnode_on_self[] is used.
1009 */
1010static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
1011 struct throtl_grp *tg)
716{ 1012{
1013 struct throtl_service_queue *sq = &tg->service_queue;
717 bool rw = bio_data_dir(bio); 1014 bool rw = bio_data_dir(bio);
718 1015
719 bio_list_add(&tg->bio_lists[rw], bio); 1016 if (!qn)
720 /* Take a bio reference on tg */ 1017 qn = &tg->qnode_on_self[rw];
721 blkg_get(tg_to_blkg(tg)); 1018
722 tg->nr_queued[rw]++; 1019 /*
723 td->nr_queued[rw]++; 1020 * If @tg doesn't currently have any bios queued in the same
724 throtl_enqueue_tg(td, tg); 1021 * direction, queueing @bio can change when @tg should be
1022 * dispatched. Mark that @tg was empty. This is automatically
1023 * cleaered on the next tg_update_disptime().
1024 */
1025 if (!sq->nr_queued[rw])
1026 tg->flags |= THROTL_TG_WAS_EMPTY;
1027
1028 throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
1029
1030 sq->nr_queued[rw]++;
1031 throtl_enqueue_tg(tg);
725} 1032}
726 1033
727static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) 1034static void tg_update_disptime(struct throtl_grp *tg)
728{ 1035{
1036 struct throtl_service_queue *sq = &tg->service_queue;
729 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; 1037 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
730 struct bio *bio; 1038 struct bio *bio;
731 1039
732 if ((bio = bio_list_peek(&tg->bio_lists[READ]))) 1040 if ((bio = throtl_peek_queued(&sq->queued[READ])))
733 tg_may_dispatch(td, tg, bio, &read_wait); 1041 tg_may_dispatch(tg, bio, &read_wait);
734 1042
735 if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) 1043 if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
736 tg_may_dispatch(td, tg, bio, &write_wait); 1044 tg_may_dispatch(tg, bio, &write_wait);
737 1045
738 min_wait = min(read_wait, write_wait); 1046 min_wait = min(read_wait, write_wait);
739 disptime = jiffies + min_wait; 1047 disptime = jiffies + min_wait;
740 1048
741 /* Update dispatch time */ 1049 /* Update dispatch time */
742 throtl_dequeue_tg(td, tg); 1050 throtl_dequeue_tg(tg);
743 tg->disptime = disptime; 1051 tg->disptime = disptime;
744 throtl_enqueue_tg(td, tg); 1052 throtl_enqueue_tg(tg);
1053
1054 /* see throtl_add_bio_tg() */
1055 tg->flags &= ~THROTL_TG_WAS_EMPTY;
745} 1056}
746 1057
747static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, 1058static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
748 bool rw, struct bio_list *bl) 1059 struct throtl_grp *parent_tg, bool rw)
749{ 1060{
750 struct bio *bio; 1061 if (throtl_slice_used(parent_tg, rw)) {
1062 throtl_start_new_slice_with_credit(parent_tg, rw,
1063 child_tg->slice_start[rw]);
1064 }
1065
1066}
751 1067
752 bio = bio_list_pop(&tg->bio_lists[rw]); 1068static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
753 tg->nr_queued[rw]--; 1069{
754 /* Drop bio reference on blkg */ 1070 struct throtl_service_queue *sq = &tg->service_queue;
755 blkg_put(tg_to_blkg(tg)); 1071 struct throtl_service_queue *parent_sq = sq->parent_sq;
1072 struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
1073 struct throtl_grp *tg_to_put = NULL;
1074 struct bio *bio;
756 1075
757 BUG_ON(td->nr_queued[rw] <= 0); 1076 /*
758 td->nr_queued[rw]--; 1077 * @bio is being transferred from @tg to @parent_sq. Popping a bio
1078 * from @tg may put its reference and @parent_sq might end up
1079 * getting released prematurely. Remember the tg to put and put it
1080 * after @bio is transferred to @parent_sq.
1081 */
1082 bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
1083 sq->nr_queued[rw]--;
759 1084
760 throtl_charge_bio(tg, bio); 1085 throtl_charge_bio(tg, bio);
761 bio_list_add(bl, bio);
762 bio->bi_rw |= REQ_THROTTLED;
763 1086
764 throtl_trim_slice(td, tg, rw); 1087 /*
1088 * If our parent is another tg, we just need to transfer @bio to
1089 * the parent using throtl_add_bio_tg(). If our parent is
1090 * @td->service_queue, @bio is ready to be issued. Put it on its
1091 * bio_lists[] and decrease total number queued. The caller is
1092 * responsible for issuing these bios.
1093 */
1094 if (parent_tg) {
1095 throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
1096 start_parent_slice_with_credit(tg, parent_tg, rw);
1097 } else {
1098 throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
1099 &parent_sq->queued[rw]);
1100 BUG_ON(tg->td->nr_queued[rw] <= 0);
1101 tg->td->nr_queued[rw]--;
1102 }
1103
1104 throtl_trim_slice(tg, rw);
1105
1106 if (tg_to_put)
1107 blkg_put(tg_to_blkg(tg_to_put));
765} 1108}
766 1109
767static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, 1110static int throtl_dispatch_tg(struct throtl_grp *tg)
768 struct bio_list *bl)
769{ 1111{
1112 struct throtl_service_queue *sq = &tg->service_queue;
770 unsigned int nr_reads = 0, nr_writes = 0; 1113 unsigned int nr_reads = 0, nr_writes = 0;
771 unsigned int max_nr_reads = throtl_grp_quantum*3/4; 1114 unsigned int max_nr_reads = throtl_grp_quantum*3/4;
772 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; 1115 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
@@ -774,20 +1117,20 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
774 1117
775 /* Try to dispatch 75% READS and 25% WRITES */ 1118 /* Try to dispatch 75% READS and 25% WRITES */
776 1119
777 while ((bio = bio_list_peek(&tg->bio_lists[READ])) 1120 while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
778 && tg_may_dispatch(td, tg, bio, NULL)) { 1121 tg_may_dispatch(tg, bio, NULL)) {
779 1122
780 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); 1123 tg_dispatch_one_bio(tg, bio_data_dir(bio));
781 nr_reads++; 1124 nr_reads++;
782 1125
783 if (nr_reads >= max_nr_reads) 1126 if (nr_reads >= max_nr_reads)
784 break; 1127 break;
785 } 1128 }
786 1129
787 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) 1130 while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
788 && tg_may_dispatch(td, tg, bio, NULL)) { 1131 tg_may_dispatch(tg, bio, NULL)) {
789 1132
790 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); 1133 tg_dispatch_one_bio(tg, bio_data_dir(bio));
791 nr_writes++; 1134 nr_writes++;
792 1135
793 if (nr_writes >= max_nr_writes) 1136 if (nr_writes >= max_nr_writes)
@@ -797,14 +1140,13 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
797 return nr_reads + nr_writes; 1140 return nr_reads + nr_writes;
798} 1141}
799 1142
800static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) 1143static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
801{ 1144{
802 unsigned int nr_disp = 0; 1145 unsigned int nr_disp = 0;
803 struct throtl_grp *tg;
804 struct throtl_rb_root *st = &td->tg_service_tree;
805 1146
806 while (1) { 1147 while (1) {
807 tg = throtl_rb_first(st); 1148 struct throtl_grp *tg = throtl_rb_first(parent_sq);
1149 struct throtl_service_queue *sq = &tg->service_queue;
808 1150
809 if (!tg) 1151 if (!tg)
810 break; 1152 break;
@@ -812,14 +1154,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
812 if (time_before(jiffies, tg->disptime)) 1154 if (time_before(jiffies, tg->disptime))
813 break; 1155 break;
814 1156
815 throtl_dequeue_tg(td, tg); 1157 throtl_dequeue_tg(tg);
816 1158
817 nr_disp += throtl_dispatch_tg(td, tg, bl); 1159 nr_disp += throtl_dispatch_tg(tg);
818 1160
819 if (tg->nr_queued[0] || tg->nr_queued[1]) { 1161 if (sq->nr_queued[0] || sq->nr_queued[1])
820 tg_update_disptime(td, tg); 1162 tg_update_disptime(tg);
821 throtl_enqueue_tg(td, tg);
822 }
823 1163
824 if (nr_disp >= throtl_quantum) 1164 if (nr_disp >= throtl_quantum)
825 break; 1165 break;
@@ -828,111 +1168,111 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
828 return nr_disp; 1168 return nr_disp;
829} 1169}
830 1170
831static void throtl_process_limit_change(struct throtl_data *td) 1171/**
1172 * throtl_pending_timer_fn - timer function for service_queue->pending_timer
1173 * @arg: the throtl_service_queue being serviced
1174 *
1175 * This timer is armed when a child throtl_grp with active bio's become
1176 * pending and queued on the service_queue's pending_tree and expires when
1177 * the first child throtl_grp should be dispatched. This function
1178 * dispatches bio's from the children throtl_grps to the parent
1179 * service_queue.
1180 *
1181 * If the parent's parent is another throtl_grp, dispatching is propagated
1182 * by either arming its pending_timer or repeating dispatch directly. If
1183 * the top-level service_tree is reached, throtl_data->dispatch_work is
1184 * kicked so that the ready bio's are issued.
1185 */
1186static void throtl_pending_timer_fn(unsigned long arg)
832{ 1187{
1188 struct throtl_service_queue *sq = (void *)arg;
1189 struct throtl_grp *tg = sq_to_tg(sq);
1190 struct throtl_data *td = sq_to_td(sq);
833 struct request_queue *q = td->queue; 1191 struct request_queue *q = td->queue;
834 struct blkcg_gq *blkg, *n; 1192 struct throtl_service_queue *parent_sq;
835 1193 bool dispatched;
836 if (!td->limits_changed) 1194 int ret;
837 return;
838
839 xchg(&td->limits_changed, false);
840
841 throtl_log(td, "limits changed");
842
843 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
844 struct throtl_grp *tg = blkg_to_tg(blkg);
845 1195
846 if (!tg->limits_changed) 1196 spin_lock_irq(q->queue_lock);
847 continue; 1197again:
1198 parent_sq = sq->parent_sq;
1199 dispatched = false;
1200
1201 while (true) {
1202 throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
1203 sq->nr_queued[READ] + sq->nr_queued[WRITE],
1204 sq->nr_queued[READ], sq->nr_queued[WRITE]);
1205
1206 ret = throtl_select_dispatch(sq);
1207 if (ret) {
1208 throtl_log(sq, "bios disp=%u", ret);
1209 dispatched = true;
1210 }
848 1211
849 if (!xchg(&tg->limits_changed, false)) 1212 if (throtl_schedule_next_dispatch(sq, false))
850 continue; 1213 break;
851 1214
852 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" 1215 /* this dispatch window is still open, relax and repeat */
853 " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE], 1216 spin_unlock_irq(q->queue_lock);
854 tg->iops[READ], tg->iops[WRITE]); 1217 cpu_relax();
1218 spin_lock_irq(q->queue_lock);
1219 }
855 1220
856 /* 1221 if (!dispatched)
857 * Restart the slices for both READ and WRITES. It 1222 goto out_unlock;
858 * might happen that a group's limit are dropped
859 * suddenly and we don't want to account recently
860 * dispatched IO with new low rate
861 */
862 throtl_start_new_slice(td, tg, 0);
863 throtl_start_new_slice(td, tg, 1);
864 1223
865 if (throtl_tg_on_rr(tg)) 1224 if (parent_sq) {
866 tg_update_disptime(td, tg); 1225 /* @parent_sq is another throtl_grp, propagate dispatch */
1226 if (tg->flags & THROTL_TG_WAS_EMPTY) {
1227 tg_update_disptime(tg);
1228 if (!throtl_schedule_next_dispatch(parent_sq, false)) {
1229 /* window is already open, repeat dispatching */
1230 sq = parent_sq;
1231 tg = sq_to_tg(sq);
1232 goto again;
1233 }
1234 }
1235 } else {
 1236 /* reached the top-level, queue issuing */
1237 queue_work(kthrotld_workqueue, &td->dispatch_work);
867 } 1238 }
1239out_unlock:
1240 spin_unlock_irq(q->queue_lock);
868} 1241}
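
As a rough illustration of the propagation the function comment above describes, here is a stand-alone user-space C sketch (level, propagate_dispatch and issue are made-up names, not kernel code): queued bios are moved up one level at a time and handed to the issue path only once the top-level queue is reached.

#include <stdio.h>

struct level {
        struct level *parent;   /* NULL for the top-level service queue */
        int nr_queued;          /* bios waiting at this level */
        const char *name;
};

static void issue(int nr)
{
        printf("issuing %d bios to the block layer\n", nr);
}

static void propagate_dispatch(struct level *lv)
{
        while (lv->parent) {
                /* dispatch from this level into the parent's queue */
                lv->parent->nr_queued += lv->nr_queued;
                printf("%s: moved %d bios up to %s\n",
                       lv->name, lv->nr_queued, lv->parent->name);
                lv->nr_queued = 0;
                lv = lv->parent;        /* repeat the dispatch at the parent */
        }
        /* reached the top level: kick the issue path */
        issue(lv->nr_queued);
        lv->nr_queued = 0;
}

int main(void)
{
        struct level root  = { NULL,  0, "td->service_queue" };
        struct level mid   = { &root, 0, "parent tg" };
        struct level child = { &mid,  3, "child tg" };

        propagate_dispatch(&child);
        return 0;
}

The real timer function above does not loop unconditionally like this toy: it stops and re-arms once throtl_schedule_next_dispatch() reports that the next dispatch window lies in the future.
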
869 1242
870/* Dispatch throttled bios. Should be called without queue lock held. */ 1243/**
871static int throtl_dispatch(struct request_queue *q) 1244 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
1245 * @work: work item being executed
1246 *
1247 * This function is queued for execution when bio's reach the bio_lists[]
1248 * of throtl_data->service_queue. Those bio's are ready and issued by this
1249 * function.
1250 */
1251void blk_throtl_dispatch_work_fn(struct work_struct *work)
872{ 1252{
873 struct throtl_data *td = q->td; 1253 struct throtl_data *td = container_of(work, struct throtl_data,
874 unsigned int nr_disp = 0; 1254 dispatch_work);
1255 struct throtl_service_queue *td_sq = &td->service_queue;
1256 struct request_queue *q = td->queue;
875 struct bio_list bio_list_on_stack; 1257 struct bio_list bio_list_on_stack;
876 struct bio *bio; 1258 struct bio *bio;
877 struct blk_plug plug; 1259 struct blk_plug plug;
878 1260 int rw;
879 spin_lock_irq(q->queue_lock);
880
881 throtl_process_limit_change(td);
882
883 if (!total_nr_queued(td))
884 goto out;
885 1261
886 bio_list_init(&bio_list_on_stack); 1262 bio_list_init(&bio_list_on_stack);
887 1263
888 throtl_log(td, "dispatch nr_queued=%u read=%u write=%u", 1264 spin_lock_irq(q->queue_lock);
889 total_nr_queued(td), td->nr_queued[READ], 1265 for (rw = READ; rw <= WRITE; rw++)
890 td->nr_queued[WRITE]); 1266 while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
891 1267 bio_list_add(&bio_list_on_stack, bio);
892 nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
893
894 if (nr_disp)
895 throtl_log(td, "bios disp=%u", nr_disp);
896
897 throtl_schedule_next_dispatch(td);
898out:
899 spin_unlock_irq(q->queue_lock); 1268 spin_unlock_irq(q->queue_lock);
900 1269
901 /* 1270 if (!bio_list_empty(&bio_list_on_stack)) {
902 * If we dispatched some requests, unplug the queue to make sure
903 * immediate dispatch
904 */
905 if (nr_disp) {
906 blk_start_plug(&plug); 1271 blk_start_plug(&plug);
907 while((bio = bio_list_pop(&bio_list_on_stack))) 1272 while((bio = bio_list_pop(&bio_list_on_stack)))
908 generic_make_request(bio); 1273 generic_make_request(bio);
909 blk_finish_plug(&plug); 1274 blk_finish_plug(&plug);
910 } 1275 }
911 return nr_disp;
912}
913
914void blk_throtl_work(struct work_struct *work)
915{
916 struct throtl_data *td = container_of(work, struct throtl_data,
917 throtl_work.work);
918 struct request_queue *q = td->queue;
919
920 throtl_dispatch(q);
921}
922
923/* Call with queue lock held */
924static void
925throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
926{
927
928 struct delayed_work *dwork = &td->throtl_work;
929
930 /* schedule work if limits changed even if no bio is queued */
931 if (total_nr_queued(td) || td->limits_changed) {
932 mod_delayed_work(kthrotld_workqueue, dwork, delay);
933 throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
934 delay, jiffies);
935 }
936} 1276}
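
The work function above follows a common "splice under the lock, issue outside it" shape. Below is a rough user-space C model of just that shape (node, shared_head and dispatch_work are illustrative stand-ins, not kernel symbols): everything on the shared queue is taken while holding the lock, and the per-bio work happens only after the lock has been dropped.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *shared_head;        /* stands in for the shared queue */

static void issue(struct node *n)
{
        printf("issuing bio %d\n", n->id); /* generic_make_request() analogue */
        free(n);
}

static void dispatch_work(void)
{
        struct node *local;

        pthread_mutex_lock(&lock);
        local = shared_head;            /* splice everything onto the stack */
        shared_head = NULL;
        pthread_mutex_unlock(&lock);

        while (local) {                 /* no lock held while issuing */
                struct node *n = local;
                local = n->next;
                issue(n);
        }
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct node *n = malloc(sizeof(*n));
                n->id = i;
                pthread_mutex_lock(&lock);
                n->next = shared_head;  /* LIFO purely for brevity */
                shared_head = n;
                pthread_mutex_unlock(&lock);
        }
        dispatch_work();
        return 0;
}
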
937 1277
938static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, 1278static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
@@ -1007,7 +1347,9 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
1007 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1347 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1008 struct blkg_conf_ctx ctx; 1348 struct blkg_conf_ctx ctx;
1009 struct throtl_grp *tg; 1349 struct throtl_grp *tg;
1010 struct throtl_data *td; 1350 struct throtl_service_queue *sq;
1351 struct blkcg_gq *blkg;
1352 struct cgroup *pos_cgrp;
1011 int ret; 1353 int ret;
1012 1354
1013 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); 1355 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
@@ -1015,7 +1357,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
1015 return ret; 1357 return ret;
1016 1358
1017 tg = blkg_to_tg(ctx.blkg); 1359 tg = blkg_to_tg(ctx.blkg);
1018 td = ctx.blkg->q->td; 1360 sq = &tg->service_queue;
1019 1361
1020 if (!ctx.v) 1362 if (!ctx.v)
1021 ctx.v = -1; 1363 ctx.v = -1;
@@ -1025,10 +1367,37 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
1025 else 1367 else
1026 *(unsigned int *)((void *)tg + cft->private) = ctx.v; 1368 *(unsigned int *)((void *)tg + cft->private) = ctx.v;
1027 1369
1028 /* XXX: we don't need the following deferred processing */ 1370 throtl_log(&tg->service_queue,
1029 xchg(&tg->limits_changed, true); 1371 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
1030 xchg(&td->limits_changed, true); 1372 tg->bps[READ], tg->bps[WRITE],
1031 throtl_schedule_delayed_work(td, 0); 1373 tg->iops[READ], tg->iops[WRITE]);
1374
1375 /*
1376 * Update has_rules[] flags for the updated tg's subtree. A tg is
1377 * considered to have rules if either the tg itself or any of its
1378 * ancestors has rules. This identifies groups without any
1379 * restrictions in the whole hierarchy and allows them to bypass
1380 * blk-throttle.
1381 */
1382 tg_update_has_rules(tg);
1383 blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg)
1384 tg_update_has_rules(blkg_to_tg(blkg));
1385
1386 /*
1387 * We're already holding queue_lock and know @tg is valid. Let's
1388 * apply the new config directly.
1389 *
1390 * Restart the slices for both READ and WRITES. It might happen
 1391 * that a group's limits are dropped suddenly and we don't want to
1392 * account recently dispatched IO with new low rate.
1393 */
1394 throtl_start_new_slice(tg, 0);
1395 throtl_start_new_slice(tg, 1);
1396
1397 if (tg->flags & THROTL_TG_PENDING) {
1398 tg_update_disptime(tg);
1399 throtl_schedule_next_dispatch(sq->parent_sq, true);
1400 }
1032 1401
1033 blkg_conf_finish(&ctx); 1402 blkg_conf_finish(&ctx);
1034 return 0; 1403 return 0;
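
The has_rules[] update described in the comment inside the hunk above can be pictured with a small stand-alone C sketch (group, own_limit and update_has_rules are made-up names): a group counts as rule-bound if it or any ancestor has a limit, and the flag is refreshed top-down over the changed subtree, mirroring the pre-order descendant walk.

#include <stdbool.h>
#include <stdio.h>

#define MAX_CHILDREN 4

struct group {
        struct group *parent;
        struct group *children[MAX_CHILDREN];
        int nr_children;
        bool own_limit;         /* a bps/iops limit is set on this group */
        bool has_rules;         /* this group or an ancestor is limited */
};

static void update_has_rules(struct group *g)
{
        g->has_rules = g->own_limit ||
                       (g->parent && g->parent->has_rules);

        /* pre-order: the parent is settled before its descendants */
        for (int i = 0; i < g->nr_children; i++)
                update_has_rules(g->children[i]);
}

int main(void)
{
        struct group root = { 0 };
        struct group a = { .parent = &root };
        struct group b = { .parent = &a };

        root.children[root.nr_children++] = &a;
        a.children[a.nr_children++] = &b;

        a.own_limit = true;     /* a limit is configured on the middle group */
        update_has_rules(&a);   /* refresh the updated subtree only */

        printf("a: %d, b: %d, root: %d\n",
               a.has_rules, b.has_rules, root.has_rules);  /* 1, 1, 0 */
        return 0;
}
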
@@ -1092,7 +1461,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
1092{ 1461{
1093 struct throtl_data *td = q->td; 1462 struct throtl_data *td = q->td;
1094 1463
1095 cancel_delayed_work_sync(&td->throtl_work); 1464 cancel_work_sync(&td->dispatch_work);
1096} 1465}
1097 1466
1098static struct blkcg_policy blkcg_policy_throtl = { 1467static struct blkcg_policy blkcg_policy_throtl = {
@@ -1100,6 +1469,7 @@ static struct blkcg_policy blkcg_policy_throtl = {
1100 .cftypes = throtl_files, 1469 .cftypes = throtl_files,
1101 1470
1102 .pd_init_fn = throtl_pd_init, 1471 .pd_init_fn = throtl_pd_init,
1472 .pd_online_fn = throtl_pd_online,
1103 .pd_exit_fn = throtl_pd_exit, 1473 .pd_exit_fn = throtl_pd_exit,
1104 .pd_reset_stats_fn = throtl_pd_reset_stats, 1474 .pd_reset_stats_fn = throtl_pd_reset_stats,
1105}; 1475};
@@ -1107,15 +1477,16 @@ static struct blkcg_policy blkcg_policy_throtl = {
1107bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 1477bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1108{ 1478{
1109 struct throtl_data *td = q->td; 1479 struct throtl_data *td = q->td;
1480 struct throtl_qnode *qn = NULL;
1110 struct throtl_grp *tg; 1481 struct throtl_grp *tg;
1111 bool rw = bio_data_dir(bio), update_disptime = true; 1482 struct throtl_service_queue *sq;
1483 bool rw = bio_data_dir(bio);
1112 struct blkcg *blkcg; 1484 struct blkcg *blkcg;
1113 bool throttled = false; 1485 bool throttled = false;
1114 1486
1115 if (bio->bi_rw & REQ_THROTTLED) { 1487 /* see throtl_charge_bio() */
1116 bio->bi_rw &= ~REQ_THROTTLED; 1488 if (bio->bi_rw & REQ_THROTTLED)
1117 goto out; 1489 goto out;
1118 }
1119 1490
1120 /* 1491 /*
1121 * A throtl_grp pointer retrieved under rcu can be used to access 1492 * A throtl_grp pointer retrieved under rcu can be used to access
@@ -1126,7 +1497,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1126 blkcg = bio_blkcg(bio); 1497 blkcg = bio_blkcg(bio);
1127 tg = throtl_lookup_tg(td, blkcg); 1498 tg = throtl_lookup_tg(td, blkcg);
1128 if (tg) { 1499 if (tg) {
1129 if (tg_no_rule_group(tg, rw)) { 1500 if (!tg->has_rules[rw]) {
1130 throtl_update_dispatch_stats(tg_to_blkg(tg), 1501 throtl_update_dispatch_stats(tg_to_blkg(tg),
1131 bio->bi_size, bio->bi_rw); 1502 bio->bi_size, bio->bi_rw);
1132 goto out_unlock_rcu; 1503 goto out_unlock_rcu;
@@ -1142,18 +1513,18 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1142 if (unlikely(!tg)) 1513 if (unlikely(!tg))
1143 goto out_unlock; 1514 goto out_unlock;
1144 1515
1145 if (tg->nr_queued[rw]) { 1516 sq = &tg->service_queue;
1146 /*
1147 * There is already another bio queued in same dir. No
1148 * need to update dispatch time.
1149 */
1150 update_disptime = false;
1151 goto queue_bio;
1152 1517
1153 } 1518 while (true) {
1519 /* throtl is FIFO - if bios are already queued, should queue */
1520 if (sq->nr_queued[rw])
1521 break;
1522
1523 /* if above limits, break to queue */
1524 if (!tg_may_dispatch(tg, bio, NULL))
1525 break;
1154 1526
1155 /* Bio is with-in rate limit of group */ 1527 /* within limits, let's charge and dispatch directly */
1156 if (tg_may_dispatch(td, tg, bio, NULL)) {
1157 throtl_charge_bio(tg, bio); 1528 throtl_charge_bio(tg, bio);
1158 1529
1159 /* 1530 /*
@@ -1167,25 +1538,41 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1167 * 1538 *
1168 * So keep on trimming slice even if bio is not queued. 1539 * So keep on trimming slice even if bio is not queued.
1169 */ 1540 */
1170 throtl_trim_slice(td, tg, rw); 1541 throtl_trim_slice(tg, rw);
1171 goto out_unlock; 1542
1543 /*
1544 * @bio passed through this layer without being throttled.
 1545 * Climb up the ladder. If we're already at the top, it
1546 * can be executed directly.
1547 */
1548 qn = &tg->qnode_on_parent[rw];
1549 sq = sq->parent_sq;
1550 tg = sq_to_tg(sq);
1551 if (!tg)
1552 goto out_unlock;
1172 } 1553 }
1173 1554
1174queue_bio: 1555 /* out-of-limit, queue to @tg */
1175 throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu" 1556 throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
1176 " iodisp=%u iops=%u queued=%d/%d", 1557 rw == READ ? 'R' : 'W',
1177 rw == READ ? 'R' : 'W', 1558 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
1178 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], 1559 tg->io_disp[rw], tg->iops[rw],
1179 tg->io_disp[rw], tg->iops[rw], 1560 sq->nr_queued[READ], sq->nr_queued[WRITE]);
1180 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1181 1561
1182 bio_associate_current(bio); 1562 bio_associate_current(bio);
1183 throtl_add_bio_tg(q->td, tg, bio); 1563 tg->td->nr_queued[rw]++;
1564 throtl_add_bio_tg(bio, qn, tg);
1184 throttled = true; 1565 throttled = true;
1185 1566
1186 if (update_disptime) { 1567 /*
1187 tg_update_disptime(td, tg); 1568 * Update @tg's dispatch time and force schedule dispatch if @tg
1188 throtl_schedule_next_dispatch(td); 1569 * was empty before @bio. The forced scheduling isn't likely to
1570 * cause undue delay as @bio is likely to be dispatched directly if
1571 * its @tg's disptime is not in the future.
1572 */
1573 if (tg->flags & THROTL_TG_WAS_EMPTY) {
1574 tg_update_disptime(tg);
1575 throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
1189 } 1576 }
1190 1577
1191out_unlock: 1578out_unlock:
@@ -1193,9 +1580,38 @@ out_unlock:
1193out_unlock_rcu: 1580out_unlock_rcu:
1194 rcu_read_unlock(); 1581 rcu_read_unlock();
1195out: 1582out:
1583 /*
1584 * As multiple blk-throtls may stack in the same issue path, we
1585 * don't want bios to leave with the flag set. Clear the flag if
1586 * being issued.
1587 */
1588 if (!throttled)
1589 bio->bi_rw &= ~REQ_THROTTLED;
1196 return throttled; 1590 return throttled;
1197} 1591}
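
The while (true) loop added to blk_throtl_bio() above climbs the hierarchy one level at a time. A simplified user-space C take on that idea (assumed names; a plain byte budget stands in for the time slice): the bio is charged at every level it clears and is queued at the first level that is over its limit or already has bios waiting; clearing every level means direct issue.

#include <stdio.h>

struct tg {
        struct tg *parent;      /* NULL above the top-level group */
        const char *name;
        int nr_queued;
        long budget;            /* remaining bytes in the current slice */
};

/* returns the group the bio was queued on, or NULL if it passed through */
static struct tg *throttle_bio(struct tg *tg, long bio_size)
{
        while (tg) {
                if (tg->nr_queued || tg->budget < bio_size) {
                        tg->nr_queued++;        /* out of limit: queue here */
                        return tg;
                }
                tg->budget -= bio_size;         /* charge and climb up */
                tg = tg->parent;
        }
        return NULL;                            /* cleared every level */
}

int main(void)
{
        struct tg parent = { NULL, "parent", 0, 4096 };
        struct tg child  = { &parent, "child", 0, 1024 };
        struct tg *where = throttle_bio(&child, 2048);

        if (where)
                printf("bio queued on %s\n", where->name);
        else
                printf("bio issued directly\n");
        return 0;
}

Queueing at the first saturated level matches the two break conditions in the loop above: bios already waiting in the same direction, or tg_may_dispatch() saying no.
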
1198 1592
1593/*
1594 * Dispatch all bios from all children tg's queued on @parent_sq. On
1595 * return, @parent_sq is guaranteed to not have any active children tg's
1596 * and all bios from previously active tg's are on @parent_sq->bio_lists[].
1597 */
1598static void tg_drain_bios(struct throtl_service_queue *parent_sq)
1599{
1600 struct throtl_grp *tg;
1601
1602 while ((tg = throtl_rb_first(parent_sq))) {
1603 struct throtl_service_queue *sq = &tg->service_queue;
1604 struct bio *bio;
1605
1606 throtl_dequeue_tg(tg);
1607
1608 while ((bio = throtl_peek_queued(&sq->queued[READ])))
1609 tg_dispatch_one_bio(tg, bio_data_dir(bio));
1610 while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
1611 tg_dispatch_one_bio(tg, bio_data_dir(bio));
1612 }
1613}
1614
1199/** 1615/**
1200 * blk_throtl_drain - drain throttled bios 1616 * blk_throtl_drain - drain throttled bios
1201 * @q: request_queue to drain throttled bios for 1617 * @q: request_queue to drain throttled bios for
@@ -1206,27 +1622,36 @@ void blk_throtl_drain(struct request_queue *q)
1206 __releases(q->queue_lock) __acquires(q->queue_lock) 1622 __releases(q->queue_lock) __acquires(q->queue_lock)
1207{ 1623{
1208 struct throtl_data *td = q->td; 1624 struct throtl_data *td = q->td;
1209 struct throtl_rb_root *st = &td->tg_service_tree; 1625 struct blkcg_gq *blkg;
1210 struct throtl_grp *tg; 1626 struct cgroup *pos_cgrp;
1211 struct bio_list bl;
1212 struct bio *bio; 1627 struct bio *bio;
1628 int rw;
1213 1629
1214 queue_lockdep_assert_held(q); 1630 queue_lockdep_assert_held(q);
1631 rcu_read_lock();
1632
1633 /*
1634 * Drain each tg while doing post-order walk on the blkg tree, so
1635 * that all bios are propagated to td->service_queue. It'd be
1636 * better to walk service_queue tree directly but blkg walk is
1637 * easier.
1638 */
1639 blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg)
1640 tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
1215 1641
1216 bio_list_init(&bl); 1642 tg_drain_bios(&td_root_tg(td)->service_queue);
1217 1643
1218 while ((tg = throtl_rb_first(st))) { 1644 /* finally, transfer bios from top-level tg's into the td */
1219 throtl_dequeue_tg(td, tg); 1645 tg_drain_bios(&td->service_queue);
1220 1646
1221 while ((bio = bio_list_peek(&tg->bio_lists[READ]))) 1647 rcu_read_unlock();
1222 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1223 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
1224 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1225 }
1226 spin_unlock_irq(q->queue_lock); 1648 spin_unlock_irq(q->queue_lock);
1227 1649
1228 while ((bio = bio_list_pop(&bl))) 1650 /* all bios now should be in td->service_queue, issue them */
1229 generic_make_request(bio); 1651 for (rw = READ; rw <= WRITE; rw++)
1652 while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
1653 NULL)))
1654 generic_make_request(bio);
1230 1655
1231 spin_lock_irq(q->queue_lock); 1656 spin_lock_irq(q->queue_lock);
1232} 1657}
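
Why the drain above walks the tree post-order can be seen with a toy C model (node and drain are stand-ins, not the blkg iterator): draining every child into its parent before visiting the parent itself leaves all queued bios at the root queue, ready to be issued.

#include <stdio.h>

#define MAX_CHILDREN 4

struct node {
        struct node *parent;
        struct node *children[MAX_CHILDREN];
        int nr_children;
        int nr_queued;
        const char *name;
};

static void drain(struct node *n)
{
        for (int i = 0; i < n->nr_children; i++)
                drain(n->children[i]);          /* children first (post-order) */

        if (n->parent) {
                n->parent->nr_queued += n->nr_queued;
                n->nr_queued = 0;
        }
}

int main(void)
{
        struct node root = { .name = "td->service_queue" };
        struct node tg1  = { .parent = &root, .nr_queued = 2, .name = "tg1" };
        struct node tg2  = { .parent = &tg1,  .nr_queued = 5, .name = "tg2" };

        root.children[root.nr_children++] = &tg1;
        tg1.children[tg1.nr_children++]   = &tg2;

        drain(&root);
        printf("%d bios now at %s\n", root.nr_queued, root.name);  /* 7 */
        return 0;
}
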
@@ -1240,9 +1665,8 @@ int blk_throtl_init(struct request_queue *q)
1240 if (!td) 1665 if (!td)
1241 return -ENOMEM; 1666 return -ENOMEM;
1242 1667
1243 td->tg_service_tree = THROTL_RB_ROOT; 1668 INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
1244 td->limits_changed = false; 1669 throtl_service_queue_init(&td->service_queue, NULL);
1245 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1246 1670
1247 q->td = td; 1671 q->td = td;
1248 td->queue = q; 1672 td->queue = q;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d5cd3131c57a..d5bbdcfd0dab 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4347,18 +4347,28 @@ static void cfq_exit_queue(struct elevator_queue *e)
4347 kfree(cfqd); 4347 kfree(cfqd);
4348} 4348}
4349 4349
4350static int cfq_init_queue(struct request_queue *q) 4350static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
4351{ 4351{
4352 struct cfq_data *cfqd; 4352 struct cfq_data *cfqd;
4353 struct blkcg_gq *blkg __maybe_unused; 4353 struct blkcg_gq *blkg __maybe_unused;
4354 int i, ret; 4354 int i, ret;
4355 struct elevator_queue *eq;
4356
4357 eq = elevator_alloc(q, e);
4358 if (!eq)
4359 return -ENOMEM;
4355 4360
4356 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 4361 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
4357 if (!cfqd) 4362 if (!cfqd) {
4363 kobject_put(&eq->kobj);
4358 return -ENOMEM; 4364 return -ENOMEM;
4365 }
4366 eq->elevator_data = cfqd;
4359 4367
4360 cfqd->queue = q; 4368 cfqd->queue = q;
4361 q->elevator->elevator_data = cfqd; 4369 spin_lock_irq(q->queue_lock);
4370 q->elevator = eq;
4371 spin_unlock_irq(q->queue_lock);
4362 4372
4363 /* Init root service tree */ 4373 /* Init root service tree */
4364 cfqd->grp_service_tree = CFQ_RB_ROOT; 4374 cfqd->grp_service_tree = CFQ_RB_ROOT;
@@ -4433,6 +4443,7 @@ static int cfq_init_queue(struct request_queue *q)
4433 4443
4434out_free: 4444out_free:
4435 kfree(cfqd); 4445 kfree(cfqd);
4446 kobject_put(&eq->kobj);
4436 return ret; 4447 return ret;
4437} 4448}
4438 4449
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index ba19a3afab79..20614a332362 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -337,13 +337,21 @@ static void deadline_exit_queue(struct elevator_queue *e)
337/* 337/*
338 * initialize elevator private data (deadline_data). 338 * initialize elevator private data (deadline_data).
339 */ 339 */
340static int deadline_init_queue(struct request_queue *q) 340static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
341{ 341{
342 struct deadline_data *dd; 342 struct deadline_data *dd;
343 struct elevator_queue *eq;
344
345 eq = elevator_alloc(q, e);
346 if (!eq)
347 return -ENOMEM;
343 348
344 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); 349 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
345 if (!dd) 350 if (!dd) {
351 kobject_put(&eq->kobj);
346 return -ENOMEM; 352 return -ENOMEM;
353 }
354 eq->elevator_data = dd;
347 355
348 INIT_LIST_HEAD(&dd->fifo_list[READ]); 356 INIT_LIST_HEAD(&dd->fifo_list[READ]);
349 INIT_LIST_HEAD(&dd->fifo_list[WRITE]); 357 INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
@@ -355,7 +363,9 @@ static int deadline_init_queue(struct request_queue *q)
355 dd->front_merges = 1; 363 dd->front_merges = 1;
356 dd->fifo_batch = fifo_batch; 364 dd->fifo_batch = fifo_batch;
357 365
358 q->elevator->elevator_data = dd; 366 spin_lock_irq(q->queue_lock);
367 q->elevator = eq;
368 spin_unlock_irq(q->queue_lock);
359 return 0; 369 return 0;
360} 370}
361 371
diff --git a/block/elevator.c b/block/elevator.c
index eba5b04c29b1..668394d18588 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -150,7 +150,7 @@ void __init load_default_elevator_module(void)
150 150
151static struct kobj_type elv_ktype; 151static struct kobj_type elv_ktype;
152 152
153static struct elevator_queue *elevator_alloc(struct request_queue *q, 153struct elevator_queue *elevator_alloc(struct request_queue *q,
154 struct elevator_type *e) 154 struct elevator_type *e)
155{ 155{
156 struct elevator_queue *eq; 156 struct elevator_queue *eq;
@@ -170,6 +170,7 @@ err:
170 elevator_put(e); 170 elevator_put(e);
171 return NULL; 171 return NULL;
172} 172}
173EXPORT_SYMBOL(elevator_alloc);
173 174
174static void elevator_release(struct kobject *kobj) 175static void elevator_release(struct kobject *kobj)
175{ 176{
@@ -221,16 +222,7 @@ int elevator_init(struct request_queue *q, char *name)
221 } 222 }
222 } 223 }
223 224
224 q->elevator = elevator_alloc(q, e); 225 err = e->ops.elevator_init_fn(q, e);
225 if (!q->elevator)
226 return -ENOMEM;
227
228 err = e->ops.elevator_init_fn(q);
229 if (err) {
230 kobject_put(&q->elevator->kobj);
231 return err;
232 }
233
234 return 0; 226 return 0;
235} 227}
236EXPORT_SYMBOL(elevator_init); 228EXPORT_SYMBOL(elevator_init);
@@ -935,16 +927,9 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
935 spin_unlock_irq(q->queue_lock); 927 spin_unlock_irq(q->queue_lock);
936 928
937 /* allocate, init and register new elevator */ 929 /* allocate, init and register new elevator */
938 err = -ENOMEM; 930 err = new_e->ops.elevator_init_fn(q, new_e);
939 q->elevator = elevator_alloc(q, new_e); 931 if (err)
940 if (!q->elevator)
941 goto fail_init;
942
943 err = new_e->ops.elevator_init_fn(q);
944 if (err) {
945 kobject_put(&q->elevator->kobj);
946 goto fail_init; 932 goto fail_init;
947 }
948 933
949 if (registered) { 934 if (registered) {
950 err = elv_register_queue(q); 935 err = elv_register_queue(q);
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 5d1bf70e33d5..3de89d4690f3 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -59,16 +59,27 @@ noop_latter_request(struct request_queue *q, struct request *rq)
59 return list_entry(rq->queuelist.next, struct request, queuelist); 59 return list_entry(rq->queuelist.next, struct request, queuelist);
60} 60}
61 61
62static int noop_init_queue(struct request_queue *q) 62static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
63{ 63{
64 struct noop_data *nd; 64 struct noop_data *nd;
65 struct elevator_queue *eq;
66
67 eq = elevator_alloc(q, e);
68 if (!eq)
69 return -ENOMEM;
65 70
66 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); 71 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
67 if (!nd) 72 if (!nd) {
73 kobject_put(&eq->kobj);
68 return -ENOMEM; 74 return -ENOMEM;
75 }
76 eq->elevator_data = nd;
69 77
70 INIT_LIST_HEAD(&nd->queue); 78 INIT_LIST_HEAD(&nd->queue);
71 q->elevator->elevator_data = nd; 79
80 spin_lock_irq(q->queue_lock);
81 q->elevator = eq;
82 spin_unlock_irq(q->queue_lock);
72 return 0; 83 return 0;
73} 84}
74 85
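
The three schedulers above (cfq, deadline, noop) now share the same init shape: allocate the elevator container first, attach the private data, drop the container on the error path, and publish q->elevator under the queue lock only once the object is complete. A compressed user-space C sketch of that ownership pattern (elevator, queue and sched_init_queue are stand-in names, not the kernel API):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct elevator { void *private; int refcount; };
struct queue { pthread_mutex_t lock; struct elevator *elevator; };

static struct elevator *elevator_alloc(void)
{
        struct elevator *eq = calloc(1, sizeof(*eq));

        if (eq)
                eq->refcount = 1;
        return eq;
}

static void elevator_put(struct elevator *eq)
{
        if (--eq->refcount == 0)        /* kobject_put() analogue */
                free(eq);
}

static int sched_init_queue(struct queue *q)
{
        struct elevator *eq = elevator_alloc();
        void *data;

        if (!eq)
                return -1;

        data = calloc(1, 64);           /* scheduler-private data */
        if (!data) {
                elevator_put(eq);       /* error path drops the container */
                return -1;
        }
        eq->private = data;

        pthread_mutex_lock(&q->lock);   /* publish only when fully set up */
        q->elevator = eq;
        pthread_mutex_unlock(&q->lock);
        return 0;
}

int main(void)
{
        struct queue q = { PTHREAD_MUTEX_INITIALIZER, NULL };

        printf("init %s\n", sched_init_queue(&q) ? "failed" : "ok");
        if (q.elevator) {
                free(q.elevator->private);
                elevator_put(q.elevator);
        }
        return 0;
}

In the diff above the same shape appears in all three schedulers: elevator_alloc() first, private data next, kobject_put() on the error path, and q->elevator assigned under queue_lock.
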