author		Jens Axboe <axboe@kernel.dk>	2013-01-11 13:53:53 -0500
committer	Jens Axboe <axboe@kernel.dk>	2013-01-11 13:53:53 -0500
commit		ac9a19745196388ae5d828c0be7a1d6e472101f3 (patch)
tree		49c47e1a07241653deb4a4b4e7a91626f586ad05
parent		422765c2638924da10ff363b5eed77924911bdc7 (diff)
parent		43114018cb0b253fd03c4ff4d42bcdc43389ac1c (diff)
Merge branch 'blkcg-cfq-hierarchy' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup into for-3.9/core
Tejun writes:

 Hello, Jens.

 Please consider pulling from the following branch to receive cfq blkcg
 hierarchy support.  The branch is based on top of v3.8-rc2.

  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git blkcg-cfq-hierarchy

 The patchset was reviewed in the following thread.

  http://thread.gmane.org/gmane.linux.kernel.cgroups/5571
-rw-r--r--	Documentation/block/cfq-iosched.txt		|  58
-rw-r--r--	Documentation/cgroups/blkio-controller.txt	|  35
-rw-r--r--	block/blk-cgroup.c				| 277
-rw-r--r--	block/blk-cgroup.h				|  68
-rw-r--r--	block/blk-sysfs.c				|   9
-rw-r--r--	block/cfq-iosched.c				| 627
-rw-r--r--	include/linux/blkdev.h				|   2
7 files changed, 902 insertions(+), 174 deletions(-)
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
index d89b4fe724d7..a5eb7d19a65d 100644
--- a/Documentation/block/cfq-iosched.txt
+++ b/Documentation/block/cfq-iosched.txt
@@ -102,6 +102,64 @@ processing of request. Therefore, increasing the value can improve the
 performance although this can cause the latency of some I/O to increase due
 to more number of requests.
 
+CFQ Group scheduling
+====================
+
+CFQ supports blkio cgroup and has "blkio." prefixed files in each
+blkio cgroup directory. It is weight-based and there are four knobs
+for configuration - weight[_device] and leaf_weight[_device].
+Internal cgroup nodes (the ones with children) can also have tasks in
+them, so the former two configure how much proportion the cgroup as a
+whole is entitled to at its parent's level while the latter two
+configure how much proportion the tasks in the cgroup have compared to
+its direct children.
+
+Another way to think about it is assuming that each internal node has
+an implicit leaf child node which hosts all the tasks whose weight is
+configured by leaf_weight[_device]. Let's assume a blkio hierarchy
+composed of five cgroups - root, A, B, AA and AB - with the following
+weights where the names represent the hierarchy.
+
+        weight leaf_weight
+ root :  125    125
+ A    :  500    750
+ B    :  250    500
+ AA   :  500    500
+ AB   : 1000    500
+
+root never has a parent, so its weight is meaningless. For backward
+compatibility, weight is always kept in sync with leaf_weight. B, AA
+and AB have no children and thus their tasks have no child cgroups to
+compete with. They always get 100% of what their cgroup won at the
+parent level. Considering only the weights which matter, the hierarchy
+looks like the following.
+
+            root
+         /   |    \
+        A    B    leaf
+       500  250   125
+     /  |  \
+    AA  AB  leaf
+   500 1000  750
+
+If all cgroups have active IOs and are competing with each other, disk
+time will be distributed like the following.
+
+Distribution below root. The total active weight at this level is
+A:500 + B:250 + root-leaf:125 = 875.
+
+ root-leaf :   125 /  875      =~ 14%
+ A         :   500 /  875      =~ 57%
+ B(-leaf)  :   250 /  875      =~ 28%
+
+A has children and further distributes its 57% among the children and
+the implicit leaf node. The total active weight at this level is
+AA:500 + AB:1000 + A-leaf:750 = 2250.
+
+ A-leaf    : ( 750 / 2250) * A =~ 19%
+ AA(-leaf) : ( 500 / 2250) * A =~ 12%
+ AB(-leaf) : (1000 / 2250) * A =~ 25%
+
 CFQ IOPS Mode for group scheduling
 ===================================
 Basic CFQ design is to provide priority based time slices. Higher priority
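
The distribution worked out above can be reproduced with a short standalone
calculation. The sketch below is plain userspace C for illustration only (it
is not part of the patch); the weights are the ones from the example
hierarchy in cfq-iosched.txt.

  /*
   * Illustration only (not part of the patch): compute the disk-time
   * shares for the example hierarchy above using the implicit-leaf
   * model.
   */
  #include <stdio.h>

  int main(void)
  {
  	/* level below root: A, B and root's implicit leaf */
  	double below_root = 500 + 250 + 125;		/* = 875 */
  	double a          = 500 / below_root;		/* =~ 0.571 */
  	double b          = 250 / below_root;		/* =~ 0.286 */
  	double root_leaf  = 125 / below_root;		/* =~ 0.143 */

  	/* level below A: AA, AB and A's implicit leaf */
  	double below_a = 500 + 1000 + 750;		/* = 2250 */
  	double aa      = a *  500 / below_a;		/* =~ 0.127 */
  	double ab      = a * 1000 / below_a;		/* =~ 0.254 */
  	double a_leaf  = a *  750 / below_a;		/* =~ 0.190 */

  	printf("root-leaf %.1f%%  A-leaf %.1f%%  AA %.1f%%  AB %.1f%%  B %.1f%%\n",
  	       100 * root_leaf, 100 * a_leaf, 100 * aa, 100 * ab, 100 * b);
  	return 0;
  }

The documentation quotes the same numbers rounded down to whole percents.
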
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index b4b1fb3a83f0..1b70843c574e 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -94,13 +94,11 @@ Throttling/Upper Limit policy
 
 Hierarchical Cgroups
 ====================
-- Currently none of the IO control policy supports hierarchical groups. But
-  cgroup interface does allow creation of hierarchical cgroups and internally
-  IO policies treat them as flat hierarchy.
+- Currently only CFQ supports hierarchical groups. For throttling,
+  the cgroup interface does allow creation of hierarchical cgroups but
+  internally treats them as a flat hierarchy.
 
-  So this patch will allow creation of cgroup hierarchcy but at the backend
-  everything will be treated as flat. So if somebody created a hierarchy like
-  as follows.
+  If somebody creates a hierarchy as follows:
 
 			root
 			/  \
@@ -108,16 +106,20 @@ Hierarchical Cgroups
 			|
 		     test3
 
-	CFQ and throttling will practically treat all groups at same level.
+	CFQ will handle the hierarchy correctly but throttling will
+	practically treat all groups at the same level. For details on CFQ
+	hierarchy support, refer to Documentation/block/cfq-iosched.txt.
+	Throttling will treat the hierarchy as if it looks like the
+	following.
 
 				pivot
 			     /  /   \  \
 			root  test1 test2  test3
 
-	Down the line we can implement hierarchical accounting/control support
-	and also introduce a new cgroup file "use_hierarchy" which will control
-	whether cgroup hierarchy is viewed as flat or hierarchical by the policy..
-	This is how memory controller also has implemented the things.
+	Nesting cgroups, while allowed, isn't officially supported and blkio
+	generates a warning when cgroups nest. Once throttling implements
+	hierarchy support, nested hierarchies will be fully supported and the
+	warning will be removed.
 
 Various user visible config options
 ===================================
@@ -172,6 +174,12 @@ Proportional weight policy files
 	  dev     weight
 	  8:16    300
 
+- blkio.leaf_weight[_device]
+	- Equivalents of blkio.weight[_device] for the purpose of
+	  deciding how much weight the tasks in the given cgroup have while
+	  competing with the cgroup's child cgroups. For details,
+	  please refer to Documentation/block/cfq-iosched.txt.
+
 - blkio.time
 	- disk time allocated to cgroup per device in milliseconds. First
 	  two fields specify the major and minor number of the device and
@@ -279,6 +287,11 @@ Proportional weight policy files
 	  and minor number of the device and third field specifies the number
 	  of times a group was dequeued from a particular device.
 
+- blkio.*_recursive
+	- Recursive version of various stats. These files show the
+	  same information as their non-recursive counterparts but
+	  include stats from all the descendant cgroups.
+
 Throttling/Upper limit policy files
 -----------------------------------
 - blkio.throttle.read_bps_device
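
The new blkio.leaf_weight knob documented above is configured the same way as
blkio.weight, by writing to the cgroup file. The snippet below is a hedged
illustration only: the mount point /sys/fs/cgroup/blkio and the group name
"grp_a" are assumptions for the example, not something defined by this patch.

  /*
   * Illustration only: write blkio.weight and the new blkio.leaf_weight
   * for one cgroup.  Mount point and group name are assumed values.
   */
  #include <stdio.h>

  static int write_knob(const char *path, const char *val)
  {
  	FILE *f = fopen(path, "w");

  	if (!f) {
  		perror(path);
  		return -1;
  	}
  	fputs(val, f);
  	return fclose(f);
  }

  int main(void)
  {
  	/* weight of grp_a against its sibling cgroups */
  	write_knob("/sys/fs/cgroup/blkio/grp_a/blkio.weight", "500");
  	/* weight of grp_a's own tasks against grp_a's child cgroups */
  	write_knob("/sys/fs/cgroup/blkio/grp_a/blkio.leaf_weight", "750");
  	return 0;
  }
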
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b8858fb0cafa..87ea95d1f533 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -26,11 +26,32 @@
26 26
27static DEFINE_MUTEX(blkcg_pol_mutex); 27static DEFINE_MUTEX(blkcg_pol_mutex);
28 28
29struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT }; 29struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
30 .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
30EXPORT_SYMBOL_GPL(blkcg_root); 31EXPORT_SYMBOL_GPL(blkcg_root);
31 32
32static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 33static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
33 34
35static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
36 struct request_queue *q, bool update_hint);
37
38/**
39 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
40 * @d_blkg: loop cursor pointing to the current descendant
41 * @pos_cgrp: used for iteration
42 * @p_blkg: target blkg to walk descendants of
43 *
 44 * Walk @d_blkg through the descendants of @p_blkg. Must be used with RCU
45 * read locked. If called under either blkcg or queue lock, the iteration
46 * is guaranteed to include all and only online blkgs. The caller may
47 * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
48 * subtree.
49 */
50#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
51 cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
52 if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
53 (p_blkg)->q, false)))
54
34static bool blkcg_policy_enabled(struct request_queue *q, 55static bool blkcg_policy_enabled(struct request_queue *q,
35 const struct blkcg_policy *pol) 56 const struct blkcg_policy *pol)
36{ 57{
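
A hypothetical user of the new iterator could look like the sketch below;
count_online_descendants() is invented for illustration and simply mirrors
how blkg_stat_recursive_sum() further down in this patch uses the macro.

  /*
   * Hypothetical example, not part of the patch: count the online
   * descendant blkgs of @blkg.  The queue lock keeps the online test
   * stable, and the iterator itself requires the RCU read lock.
   */
  static int count_online_descendants(struct blkcg_gq *blkg)
  {
  	struct blkcg_gq *pos_blkg;
  	struct cgroup *pos_cgrp;
  	int nr = 0;

  	lockdep_assert_held(blkg->q->queue_lock);

  	rcu_read_lock();
  	blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, blkg)
  		if (pos_blkg->online)
  			nr++;
  	rcu_read_unlock();

  	return nr;
  }
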
@@ -112,9 +133,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
112 133
113 blkg->pd[i] = pd; 134 blkg->pd[i] = pd;
114 pd->blkg = blkg; 135 pd->blkg = blkg;
136 pd->plid = i;
115 137
116 /* invoke per-policy init */ 138 /* invoke per-policy init */
117 if (blkcg_policy_enabled(blkg->q, pol)) 139 if (pol->pd_init_fn)
118 pol->pd_init_fn(blkg); 140 pol->pd_init_fn(blkg);
119 } 141 }
120 142
@@ -125,8 +147,19 @@ err_free:
125 return NULL; 147 return NULL;
126} 148}
127 149
150/**
151 * __blkg_lookup - internal version of blkg_lookup()
152 * @blkcg: blkcg of interest
153 * @q: request_queue of interest
154 * @update_hint: whether to update lookup hint with the result or not
155 *
 156 * This is the internal version and shouldn't be used by policy
157 * implementations. Looks up blkgs for the @blkcg - @q pair regardless of
158 * @q's bypass state. If @update_hint is %true, the caller should be
159 * holding @q->queue_lock and lookup hint is updated on success.
160 */
128static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, 161static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
129 struct request_queue *q) 162 struct request_queue *q, bool update_hint)
130{ 163{
131 struct blkcg_gq *blkg; 164 struct blkcg_gq *blkg;
132 165
@@ -135,14 +168,19 @@ static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
135 return blkg; 168 return blkg;
136 169
137 /* 170 /*
138 * Hint didn't match. Look up from the radix tree. Note that we 171 * Hint didn't match. Look up from the radix tree. Note that the
139 * may not be holding queue_lock and thus are not sure whether 172 * hint can only be updated under queue_lock as otherwise @blkg
140 * @blkg from blkg_tree has already been removed or not, so we 173 * could have already been removed from blkg_tree. The caller is
141 * can't update hint to the lookup result. Leave it to the caller. 174 * responsible for grabbing queue_lock if @update_hint.
142 */ 175 */
143 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); 176 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
144 if (blkg && blkg->q == q) 177 if (blkg && blkg->q == q) {
178 if (update_hint) {
179 lockdep_assert_held(q->queue_lock);
180 rcu_assign_pointer(blkcg->blkg_hint, blkg);
181 }
145 return blkg; 182 return blkg;
183 }
146 184
147 return NULL; 185 return NULL;
148} 186}
@@ -162,7 +200,7 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
162 200
163 if (unlikely(blk_queue_bypass(q))) 201 if (unlikely(blk_queue_bypass(q)))
164 return NULL; 202 return NULL;
165 return __blkg_lookup(blkcg, q); 203 return __blkg_lookup(blkcg, q, false);
166} 204}
167EXPORT_SYMBOL_GPL(blkg_lookup); 205EXPORT_SYMBOL_GPL(blkg_lookup);
168 206
@@ -170,75 +208,129 @@ EXPORT_SYMBOL_GPL(blkg_lookup);
170 * If @new_blkg is %NULL, this function tries to allocate a new one as 208 * If @new_blkg is %NULL, this function tries to allocate a new one as
171 * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. 209 * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return.
172 */ 210 */
173static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, 211static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
174 struct request_queue *q, 212 struct request_queue *q,
175 struct blkcg_gq *new_blkg) 213 struct blkcg_gq *new_blkg)
176{ 214{
177 struct blkcg_gq *blkg; 215 struct blkcg_gq *blkg;
178 int ret; 216 int i, ret;
179 217
180 WARN_ON_ONCE(!rcu_read_lock_held()); 218 WARN_ON_ONCE(!rcu_read_lock_held());
181 lockdep_assert_held(q->queue_lock); 219 lockdep_assert_held(q->queue_lock);
182 220
183 /* lookup and update hint on success, see __blkg_lookup() for details */
184 blkg = __blkg_lookup(blkcg, q);
185 if (blkg) {
186 rcu_assign_pointer(blkcg->blkg_hint, blkg);
187 goto out_free;
188 }
189
190 /* blkg holds a reference to blkcg */ 221 /* blkg holds a reference to blkcg */
191 if (!css_tryget(&blkcg->css)) { 222 if (!css_tryget(&blkcg->css)) {
192 blkg = ERR_PTR(-EINVAL); 223 ret = -EINVAL;
193 goto out_free; 224 goto err_free_blkg;
194 } 225 }
195 226
196 /* allocate */ 227 /* allocate */
197 if (!new_blkg) { 228 if (!new_blkg) {
198 new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); 229 new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
199 if (unlikely(!new_blkg)) { 230 if (unlikely(!new_blkg)) {
200 blkg = ERR_PTR(-ENOMEM); 231 ret = -ENOMEM;
201 goto out_put; 232 goto err_put_css;
202 } 233 }
203 } 234 }
204 blkg = new_blkg; 235 blkg = new_blkg;
205 236
206 /* insert */ 237 /* link parent and insert */
238 if (blkcg_parent(blkcg)) {
239 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
240 if (WARN_ON_ONCE(!blkg->parent)) {
241 blkg = ERR_PTR(-EINVAL);
242 goto err_put_css;
243 }
244 blkg_get(blkg->parent);
245 }
246
207 spin_lock(&blkcg->lock); 247 spin_lock(&blkcg->lock);
208 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); 248 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
209 if (likely(!ret)) { 249 if (likely(!ret)) {
210 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 250 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
211 list_add(&blkg->q_node, &q->blkg_list); 251 list_add(&blkg->q_node, &q->blkg_list);
252
253 for (i = 0; i < BLKCG_MAX_POLS; i++) {
254 struct blkcg_policy *pol = blkcg_policy[i];
255
256 if (blkg->pd[i] && pol->pd_online_fn)
257 pol->pd_online_fn(blkg);
258 }
212 } 259 }
260 blkg->online = true;
213 spin_unlock(&blkcg->lock); 261 spin_unlock(&blkcg->lock);
214 262
215 if (!ret) 263 if (!ret)
216 return blkg; 264 return blkg;
217 265
 218 blkg = ERR_PTR(ret); 266 /* @blkg failed to be fully initialized, use the usual release path */
219out_put: 267 blkg_put(blkg);
268 return ERR_PTR(ret);
269
270err_put_css:
220 css_put(&blkcg->css); 271 css_put(&blkcg->css);
221out_free: 272err_free_blkg:
222 blkg_free(new_blkg); 273 blkg_free(new_blkg);
223 return blkg; 274 return ERR_PTR(ret);
224} 275}
225 276
277/**
278 * blkg_lookup_create - lookup blkg, try to create one if not there
279 * @blkcg: blkcg of interest
280 * @q: request_queue of interest
281 *
282 * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to
283 * create one. blkg creation is performed recursively from blkcg_root such
284 * that all non-root blkg's have access to the parent blkg. This function
285 * should be called under RCU read lock and @q->queue_lock.
286 *
287 * Returns pointer to the looked up or created blkg on success, ERR_PTR()
288 * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not
289 * dead and bypassing, returns ERR_PTR(-EBUSY).
290 */
226struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 291struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
227 struct request_queue *q) 292 struct request_queue *q)
228{ 293{
294 struct blkcg_gq *blkg;
295
296 WARN_ON_ONCE(!rcu_read_lock_held());
297 lockdep_assert_held(q->queue_lock);
298
229 /* 299 /*
230 * This could be the first entry point of blkcg implementation and 300 * This could be the first entry point of blkcg implementation and
231 * we shouldn't allow anything to go through for a bypassing queue. 301 * we shouldn't allow anything to go through for a bypassing queue.
232 */ 302 */
233 if (unlikely(blk_queue_bypass(q))) 303 if (unlikely(blk_queue_bypass(q)))
234 return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); 304 return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
235 return __blkg_lookup_create(blkcg, q, NULL); 305
306 blkg = __blkg_lookup(blkcg, q, true);
307 if (blkg)
308 return blkg;
309
310 /*
311 * Create blkgs walking down from blkcg_root to @blkcg, so that all
312 * non-root blkgs have access to their parents.
313 */
314 while (true) {
315 struct blkcg *pos = blkcg;
316 struct blkcg *parent = blkcg_parent(blkcg);
317
318 while (parent && !__blkg_lookup(parent, q, false)) {
319 pos = parent;
320 parent = blkcg_parent(parent);
321 }
322
323 blkg = blkg_create(pos, q, NULL);
324 if (pos == blkcg || IS_ERR(blkg))
325 return blkg;
326 }
236} 327}
237EXPORT_SYMBOL_GPL(blkg_lookup_create); 328EXPORT_SYMBOL_GPL(blkg_lookup_create);
238 329
239static void blkg_destroy(struct blkcg_gq *blkg) 330static void blkg_destroy(struct blkcg_gq *blkg)
240{ 331{
241 struct blkcg *blkcg = blkg->blkcg; 332 struct blkcg *blkcg = blkg->blkcg;
333 int i;
242 334
243 lockdep_assert_held(blkg->q->queue_lock); 335 lockdep_assert_held(blkg->q->queue_lock);
244 lockdep_assert_held(&blkcg->lock); 336 lockdep_assert_held(&blkcg->lock);
@@ -247,6 +339,14 @@ static void blkg_destroy(struct blkcg_gq *blkg)
247 WARN_ON_ONCE(list_empty(&blkg->q_node)); 339 WARN_ON_ONCE(list_empty(&blkg->q_node));
248 WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); 340 WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
249 341
342 for (i = 0; i < BLKCG_MAX_POLS; i++) {
343 struct blkcg_policy *pol = blkcg_policy[i];
344
345 if (blkg->pd[i] && pol->pd_offline_fn)
346 pol->pd_offline_fn(blkg);
347 }
348 blkg->online = false;
349
250 radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); 350 radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
251 list_del_init(&blkg->q_node); 351 list_del_init(&blkg->q_node);
252 hlist_del_init_rcu(&blkg->blkcg_node); 352 hlist_del_init_rcu(&blkg->blkcg_node);
@@ -301,8 +401,10 @@ static void blkg_rcu_free(struct rcu_head *rcu_head)
301 401
302void __blkg_release(struct blkcg_gq *blkg) 402void __blkg_release(struct blkcg_gq *blkg)
303{ 403{
304 /* release the extra blkcg reference this blkg has been holding */ 404 /* release the blkcg and parent blkg refs this blkg has been holding */
305 css_put(&blkg->blkcg->css); 405 css_put(&blkg->blkcg->css);
406 if (blkg->parent)
407 blkg_put(blkg->parent);
306 408
307 /* 409 /*
308 * A group is freed in rcu manner. But having an rcu lock does not 410 * A group is freed in rcu manner. But having an rcu lock does not
@@ -402,8 +504,9 @@ static const char *blkg_dev_name(struct blkcg_gq *blkg)
402 * 504 *
403 * This function invokes @prfill on each blkg of @blkcg if pd for the 505 * This function invokes @prfill on each blkg of @blkcg if pd for the
404 * policy specified by @pol exists. @prfill is invoked with @sf, the 506 * policy specified by @pol exists. @prfill is invoked with @sf, the
405 * policy data and @data. If @show_total is %true, the sum of the return 507 * policy data and @data and the matching queue lock held. If @show_total
406 * values from @prfill is printed with "Total" label at the end. 508 * is %true, the sum of the return values from @prfill is printed with
509 * "Total" label at the end.
407 * 510 *
408 * This is to be used to construct print functions for 511 * This is to be used to construct print functions for
409 * cftype->read_seq_string method. 512 * cftype->read_seq_string method.
@@ -418,11 +521,14 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
418 struct hlist_node *n; 521 struct hlist_node *n;
419 u64 total = 0; 522 u64 total = 0;
420 523
421 spin_lock_irq(&blkcg->lock); 524 rcu_read_lock();
422 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) 525 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
526 spin_lock_irq(blkg->q->queue_lock);
423 if (blkcg_policy_enabled(blkg->q, pol)) 527 if (blkcg_policy_enabled(blkg->q, pol))
424 total += prfill(sf, blkg->pd[pol->plid], data); 528 total += prfill(sf, blkg->pd[pol->plid], data);
425 spin_unlock_irq(&blkcg->lock); 529 spin_unlock_irq(blkg->q->queue_lock);
530 }
531 rcu_read_unlock();
426 532
427 if (show_total) 533 if (show_total)
428 seq_printf(sf, "Total %llu\n", (unsigned long long)total); 534 seq_printf(sf, "Total %llu\n", (unsigned long long)total);
@@ -481,6 +587,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
481 seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); 587 seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
482 return v; 588 return v;
483} 589}
590EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
484 591
485/** 592/**
486 * blkg_prfill_stat - prfill callback for blkg_stat 593 * blkg_prfill_stat - prfill callback for blkg_stat
@@ -514,6 +621,82 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
514EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); 621EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
515 622
516/** 623/**
624 * blkg_stat_recursive_sum - collect hierarchical blkg_stat
625 * @pd: policy private data of interest
626 * @off: offset to the blkg_stat in @pd
627 *
628 * Collect the blkg_stat specified by @off from @pd and all its online
629 * descendants and return the sum. The caller must be holding the queue
630 * lock for online tests.
631 */
632u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
633{
634 struct blkcg_policy *pol = blkcg_policy[pd->plid];
635 struct blkcg_gq *pos_blkg;
636 struct cgroup *pos_cgrp;
637 u64 sum;
638
639 lockdep_assert_held(pd->blkg->q->queue_lock);
640
641 sum = blkg_stat_read((void *)pd + off);
642
643 rcu_read_lock();
644 blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
645 struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
646 struct blkg_stat *stat = (void *)pos_pd + off;
647
648 if (pos_blkg->online)
649 sum += blkg_stat_read(stat);
650 }
651 rcu_read_unlock();
652
653 return sum;
654}
655EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
656
657/**
658 * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
659 * @pd: policy private data of interest
660 * @off: offset to the blkg_stat in @pd
661 *
662 * Collect the blkg_rwstat specified by @off from @pd and all its online
663 * descendants and return the sum. The caller must be holding the queue
664 * lock for online tests.
665 */
666struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
667 int off)
668{
669 struct blkcg_policy *pol = blkcg_policy[pd->plid];
670 struct blkcg_gq *pos_blkg;
671 struct cgroup *pos_cgrp;
672 struct blkg_rwstat sum;
673 int i;
674
675 lockdep_assert_held(pd->blkg->q->queue_lock);
676
677 sum = blkg_rwstat_read((void *)pd + off);
678
679 rcu_read_lock();
680 blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
681 struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
682 struct blkg_rwstat *rwstat = (void *)pos_pd + off;
683 struct blkg_rwstat tmp;
684
685 if (!pos_blkg->online)
686 continue;
687
688 tmp = blkg_rwstat_read(rwstat);
689
690 for (i = 0; i < BLKG_RWSTAT_NR; i++)
691 sum.cnt[i] += tmp.cnt[i];
692 }
693 rcu_read_unlock();
694
695 return sum;
696}
697EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
698
699/**
517 * blkg_conf_prep - parse and prepare for per-blkg config update 700 * blkg_conf_prep - parse and prepare for per-blkg config update
518 * @blkcg: target block cgroup 701 * @blkcg: target block cgroup
519 * @pol: target policy 702 * @pol: target policy
@@ -658,6 +841,7 @@ static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup)
658 return ERR_PTR(-ENOMEM); 841 return ERR_PTR(-ENOMEM);
659 842
660 blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; 843 blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
844 blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
661 blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ 845 blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
662done: 846done:
663 spin_lock_init(&blkcg->lock); 847 spin_lock_init(&blkcg->lock);
@@ -777,7 +961,7 @@ int blkcg_activate_policy(struct request_queue *q,
777 const struct blkcg_policy *pol) 961 const struct blkcg_policy *pol)
778{ 962{
779 LIST_HEAD(pds); 963 LIST_HEAD(pds);
780 struct blkcg_gq *blkg; 964 struct blkcg_gq *blkg, *new_blkg;
781 struct blkg_policy_data *pd, *n; 965 struct blkg_policy_data *pd, *n;
782 int cnt = 0, ret; 966 int cnt = 0, ret;
783 bool preloaded; 967 bool preloaded;
@@ -786,19 +970,27 @@ int blkcg_activate_policy(struct request_queue *q,
786 return 0; 970 return 0;
787 971
788 /* preallocations for root blkg */ 972 /* preallocations for root blkg */
789 blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); 973 new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
790 if (!blkg) 974 if (!new_blkg)
791 return -ENOMEM; 975 return -ENOMEM;
792 976
793 preloaded = !radix_tree_preload(GFP_KERNEL); 977 preloaded = !radix_tree_preload(GFP_KERNEL);
794 978
795 blk_queue_bypass_start(q); 979 blk_queue_bypass_start(q);
796 980
797 /* make sure the root blkg exists and count the existing blkgs */ 981 /*
982 * Make sure the root blkg exists and count the existing blkgs. As
983 * @q is bypassing at this point, blkg_lookup_create() can't be
984 * used. Open code it.
985 */
798 spin_lock_irq(q->queue_lock); 986 spin_lock_irq(q->queue_lock);
799 987
800 rcu_read_lock(); 988 rcu_read_lock();
801 blkg = __blkg_lookup_create(&blkcg_root, q, blkg); 989 blkg = __blkg_lookup(&blkcg_root, q, false);
990 if (blkg)
991 blkg_free(new_blkg);
992 else
993 blkg = blkg_create(&blkcg_root, q, new_blkg);
802 rcu_read_unlock(); 994 rcu_read_unlock();
803 995
804 if (preloaded) 996 if (preloaded)
@@ -846,6 +1038,7 @@ int blkcg_activate_policy(struct request_queue *q,
846 1038
847 blkg->pd[pol->plid] = pd; 1039 blkg->pd[pol->plid] = pd;
848 pd->blkg = blkg; 1040 pd->blkg = blkg;
1041 pd->plid = pol->plid;
849 pol->pd_init_fn(blkg); 1042 pol->pd_init_fn(blkg);
850 1043
851 spin_unlock(&blkg->blkcg->lock); 1044 spin_unlock(&blkg->blkcg->lock);
@@ -892,6 +1085,8 @@ void blkcg_deactivate_policy(struct request_queue *q,
892 /* grab blkcg lock too while removing @pd from @blkg */ 1085 /* grab blkcg lock too while removing @pd from @blkg */
893 spin_lock(&blkg->blkcg->lock); 1086 spin_lock(&blkg->blkcg->lock);
894 1087
1088 if (pol->pd_offline_fn)
1089 pol->pd_offline_fn(blkg);
895 if (pol->pd_exit_fn) 1090 if (pol->pd_exit_fn)
896 pol->pd_exit_fn(blkg); 1091 pol->pd_exit_fn(blkg);
897 1092
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 24597309e23d..f2b292925ccd 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -54,6 +54,7 @@ struct blkcg {
54 54
55 /* TODO: per-policy storage in blkcg */ 55 /* TODO: per-policy storage in blkcg */
56 unsigned int cfq_weight; /* belongs to cfq */ 56 unsigned int cfq_weight; /* belongs to cfq */
57 unsigned int cfq_leaf_weight;
57}; 58};
58 59
59struct blkg_stat { 60struct blkg_stat {
@@ -80,8 +81,9 @@ struct blkg_rwstat {
80 * beginning and pd_size can't be smaller than pd. 81 * beginning and pd_size can't be smaller than pd.
81 */ 82 */
82struct blkg_policy_data { 83struct blkg_policy_data {
83 /* the blkg this per-policy data belongs to */ 84 /* the blkg and policy id this per-policy data belongs to */
84 struct blkcg_gq *blkg; 85 struct blkcg_gq *blkg;
86 int plid;
85 87
86 /* used during policy activation */ 88 /* used during policy activation */
87 struct list_head alloc_node; 89 struct list_head alloc_node;
@@ -94,17 +96,27 @@ struct blkcg_gq {
94 struct list_head q_node; 96 struct list_head q_node;
95 struct hlist_node blkcg_node; 97 struct hlist_node blkcg_node;
96 struct blkcg *blkcg; 98 struct blkcg *blkcg;
99
100 /* all non-root blkcg_gq's are guaranteed to have access to parent */
101 struct blkcg_gq *parent;
102
97 /* request allocation list for this blkcg-q pair */ 103 /* request allocation list for this blkcg-q pair */
98 struct request_list rl; 104 struct request_list rl;
105
99 /* reference count */ 106 /* reference count */
100 int refcnt; 107 int refcnt;
101 108
109 /* is this blkg online? protected by both blkcg and q locks */
110 bool online;
111
102 struct blkg_policy_data *pd[BLKCG_MAX_POLS]; 112 struct blkg_policy_data *pd[BLKCG_MAX_POLS];
103 113
104 struct rcu_head rcu_head; 114 struct rcu_head rcu_head;
105}; 115};
106 116
107typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); 117typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
118typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
119typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
108typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); 120typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
109typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); 121typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
110 122
@@ -117,6 +129,8 @@ struct blkcg_policy {
117 129
118 /* operations */ 130 /* operations */
119 blkcg_pol_init_pd_fn *pd_init_fn; 131 blkcg_pol_init_pd_fn *pd_init_fn;
132 blkcg_pol_online_pd_fn *pd_online_fn;
133 blkcg_pol_offline_pd_fn *pd_offline_fn;
120 blkcg_pol_exit_pd_fn *pd_exit_fn; 134 blkcg_pol_exit_pd_fn *pd_exit_fn;
121 blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; 135 blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
122}; 136};
@@ -150,6 +164,10 @@ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
150u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, 164u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
151 int off); 165 int off);
152 166
167u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
168struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
169 int off);
170
153struct blkg_conf_ctx { 171struct blkg_conf_ctx {
154 struct gendisk *disk; 172 struct gendisk *disk;
155 struct blkcg_gq *blkg; 173 struct blkcg_gq *blkg;
@@ -181,6 +199,19 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
181} 199}
182 200
183/** 201/**
202 * blkcg_parent - get the parent of a blkcg
203 * @blkcg: blkcg of interest
204 *
205 * Return the parent blkcg of @blkcg. Can be called anytime.
206 */
207static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
208{
209 struct cgroup *pcg = blkcg->css.cgroup->parent;
210
211 return pcg ? cgroup_to_blkcg(pcg) : NULL;
212}
213
214/**
184 * blkg_to_pdata - get policy private data 215 * blkg_to_pdata - get policy private data
185 * @blkg: blkg of interest 216 * @blkg: blkg of interest
186 * @pol: policy of interest 217 * @pol: policy of interest
@@ -387,6 +418,18 @@ static inline void blkg_stat_reset(struct blkg_stat *stat)
387} 418}
388 419
389/** 420/**
421 * blkg_stat_merge - merge a blkg_stat into another
422 * @to: the destination blkg_stat
423 * @from: the source
424 *
425 * Add @from's count to @to.
426 */
427static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
428{
429 blkg_stat_add(to, blkg_stat_read(from));
430}
431
432/**
390 * blkg_rwstat_add - add a value to a blkg_rwstat 433 * blkg_rwstat_add - add a value to a blkg_rwstat
391 * @rwstat: target blkg_rwstat 434 * @rwstat: target blkg_rwstat
392 * @rw: mask of REQ_{WRITE|SYNC} 435 * @rw: mask of REQ_{WRITE|SYNC}
@@ -434,14 +477,14 @@ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
434} 477}
435 478
436/** 479/**
437 * blkg_rwstat_sum - read the total count of a blkg_rwstat 480 * blkg_rwstat_total - read the total count of a blkg_rwstat
438 * @rwstat: blkg_rwstat to read 481 * @rwstat: blkg_rwstat to read
439 * 482 *
440 * Return the total count of @rwstat regardless of the IO direction. This 483 * Return the total count of @rwstat regardless of the IO direction. This
441 * function can be called without synchronization and takes care of u64 484 * function can be called without synchronization and takes care of u64
442 * atomicity. 485 * atomicity.
443 */ 486 */
444static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat) 487static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
445{ 488{
446 struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); 489 struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
447 490
@@ -457,6 +500,25 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
457 memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); 500 memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
458} 501}
459 502
503/**
504 * blkg_rwstat_merge - merge a blkg_rwstat into another
505 * @to: the destination blkg_rwstat
506 * @from: the source
507 *
508 * Add @from's counts to @to.
509 */
510static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
511 struct blkg_rwstat *from)
512{
513 struct blkg_rwstat v = blkg_rwstat_read(from);
514 int i;
515
516 u64_stats_update_begin(&to->syncp);
517 for (i = 0; i < BLKG_RWSTAT_NR; i++)
518 to->cnt[i] += v.cnt[i];
519 u64_stats_update_end(&to->syncp);
520}
521
460#else /* CONFIG_BLK_CGROUP */ 522#else /* CONFIG_BLK_CGROUP */
461 523
462struct cgroup; 524struct cgroup;
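
The new blkcg_parent() helper added to blk-cgroup.h above makes ancestor
walks trivial; a hypothetical use (not part of the patch) is sketched below.

  /*
   * Hypothetical example, not part of the patch: depth of @blkcg in the
   * hierarchy, computed by walking blkcg_parent() until the root.
   */
  static int blkcg_depth(struct blkcg *blkcg)
  {
  	int depth = 0;

  	while ((blkcg = blkcg_parent(blkcg)))
  		depth++;

  	return depth;
  }
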
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 788147797a79..6206a934eb8c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -497,6 +497,13 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
497 return res; 497 return res;
498} 498}
499 499
500static void blk_free_queue_rcu(struct rcu_head *rcu_head)
501{
502 struct request_queue *q = container_of(rcu_head, struct request_queue,
503 rcu_head);
504 kmem_cache_free(blk_requestq_cachep, q);
505}
506
500/** 507/**
501 * blk_release_queue: - release a &struct request_queue when it is no longer needed 508 * blk_release_queue: - release a &struct request_queue when it is no longer needed
502 * @kobj: the kobj belonging to the request queue to be released 509 * @kobj: the kobj belonging to the request queue to be released
@@ -538,7 +545,7 @@ static void blk_release_queue(struct kobject *kobj)
538 bdi_destroy(&q->backing_dev_info); 545 bdi_destroy(&q->backing_dev_info);
539 546
540 ida_simple_remove(&blk_queue_ida, q->id); 547 ida_simple_remove(&blk_queue_ida, q->id);
541 kmem_cache_free(blk_requestq_cachep, q); 548 call_rcu(&q->rcu_head, blk_free_queue_rcu);
542} 549}
543 550
544static const struct sysfs_ops queue_sysfs_ops = { 551static const struct sysfs_ops queue_sysfs_ops = {
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e62e9205b80a..b66365b6ba77 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -85,7 +85,6 @@ struct cfq_rb_root {
85 struct rb_root rb; 85 struct rb_root rb;
86 struct rb_node *left; 86 struct rb_node *left;
87 unsigned count; 87 unsigned count;
88 unsigned total_weight;
89 u64 min_vdisktime; 88 u64 min_vdisktime;
90 struct cfq_ttime ttime; 89 struct cfq_ttime ttime;
91}; 90};
@@ -155,7 +154,7 @@ struct cfq_queue {
155 * First index in the service_trees. 154 * First index in the service_trees.
156 * IDLE is handled separately, so it has negative index 155 * IDLE is handled separately, so it has negative index
157 */ 156 */
158enum wl_prio_t { 157enum wl_class_t {
159 BE_WORKLOAD = 0, 158 BE_WORKLOAD = 0,
160 RT_WORKLOAD = 1, 159 RT_WORKLOAD = 1,
161 IDLE_WORKLOAD = 2, 160 IDLE_WORKLOAD = 2,
@@ -223,10 +222,45 @@ struct cfq_group {
223 222
224 /* group service_tree key */ 223 /* group service_tree key */
225 u64 vdisktime; 224 u64 vdisktime;
225
226 /*
227 * The number of active cfqgs and sum of their weights under this
228 * cfqg. This covers this cfqg's leaf_weight and all children's
229 * weights, but does not cover weights of further descendants.
230 *
231 * If a cfqg is on the service tree, it's active. An active cfqg
232 * also activates its parent and contributes to the children_weight
233 * of the parent.
234 */
235 int nr_active;
236 unsigned int children_weight;
237
238 /*
239 * vfraction is the fraction of vdisktime that the tasks in this
240 * cfqg are entitled to. This is determined by compounding the
241 * ratios walking up from this cfqg to the root.
242 *
243 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
244 * vfractions on a service tree is approximately 1. The sum may
245 * deviate a bit due to rounding errors and fluctuations caused by
246 * cfqgs entering and leaving the service tree.
247 */
248 unsigned int vfraction;
249
250 /*
251 * There are two weights - (internal) weight is the weight of this
 252 * cfqg against the sibling cfqgs. leaf_weight is the weight of
253 * this cfqg against the child cfqgs. For the root cfqg, both
254 * weights are kept in sync for backward compatibility.
255 */
226 unsigned int weight; 256 unsigned int weight;
227 unsigned int new_weight; 257 unsigned int new_weight;
228 unsigned int dev_weight; 258 unsigned int dev_weight;
229 259
260 unsigned int leaf_weight;
261 unsigned int new_leaf_weight;
262 unsigned int dev_leaf_weight;
263
230 /* number of cfqq currently on this group */ 264 /* number of cfqq currently on this group */
231 int nr_cfqq; 265 int nr_cfqq;
232 266
@@ -248,14 +282,15 @@ struct cfq_group {
248 struct cfq_rb_root service_trees[2][3]; 282 struct cfq_rb_root service_trees[2][3];
249 struct cfq_rb_root service_tree_idle; 283 struct cfq_rb_root service_tree_idle;
250 284
251 unsigned long saved_workload_slice; 285 unsigned long saved_wl_slice;
252 enum wl_type_t saved_workload; 286 enum wl_type_t saved_wl_type;
253 enum wl_prio_t saved_serving_prio; 287 enum wl_class_t saved_wl_class;
254 288
255 /* number of requests that are on the dispatch list or inside driver */ 289 /* number of requests that are on the dispatch list or inside driver */
256 int dispatched; 290 int dispatched;
257 struct cfq_ttime ttime; 291 struct cfq_ttime ttime;
258 struct cfqg_stats stats; 292 struct cfqg_stats stats; /* stats for this cfqg */
293 struct cfqg_stats dead_stats; /* stats pushed from dead children */
259}; 294};
260 295
261struct cfq_io_cq { 296struct cfq_io_cq {
@@ -280,8 +315,8 @@ struct cfq_data {
280 /* 315 /*
281 * The priority currently being served 316 * The priority currently being served
282 */ 317 */
283 enum wl_prio_t serving_prio; 318 enum wl_class_t serving_wl_class;
284 enum wl_type_t serving_type; 319 enum wl_type_t serving_wl_type;
285 unsigned long workload_expires; 320 unsigned long workload_expires;
286 struct cfq_group *serving_group; 321 struct cfq_group *serving_group;
287 322
@@ -353,17 +388,17 @@ struct cfq_data {
353 388
354static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 389static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
355 390
356static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, 391static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
357 enum wl_prio_t prio, 392 enum wl_class_t class,
358 enum wl_type_t type) 393 enum wl_type_t type)
359{ 394{
360 if (!cfqg) 395 if (!cfqg)
361 return NULL; 396 return NULL;
362 397
363 if (prio == IDLE_WORKLOAD) 398 if (class == IDLE_WORKLOAD)
364 return &cfqg->service_tree_idle; 399 return &cfqg->service_tree_idle;
365 400
366 return &cfqg->service_trees[prio][type]; 401 return &cfqg->service_trees[class][type];
367} 402}
368 403
369enum cfqq_state_flags { 404enum cfqq_state_flags {
@@ -502,7 +537,7 @@ static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
502{ 537{
503 struct cfqg_stats *stats = &cfqg->stats; 538 struct cfqg_stats *stats = &cfqg->stats;
504 539
505 if (blkg_rwstat_sum(&stats->queued)) 540 if (blkg_rwstat_total(&stats->queued))
506 return; 541 return;
507 542
508 /* 543 /*
@@ -546,7 +581,7 @@ static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
546 struct cfqg_stats *stats = &cfqg->stats; 581 struct cfqg_stats *stats = &cfqg->stats;
547 582
548 blkg_stat_add(&stats->avg_queue_size_sum, 583 blkg_stat_add(&stats->avg_queue_size_sum,
549 blkg_rwstat_sum(&stats->queued)); 584 blkg_rwstat_total(&stats->queued));
550 blkg_stat_add(&stats->avg_queue_size_samples, 1); 585 blkg_stat_add(&stats->avg_queue_size_samples, 1);
551 cfqg_stats_update_group_wait_time(stats); 586 cfqg_stats_update_group_wait_time(stats);
552} 587}
@@ -572,6 +607,13 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
572 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); 607 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
573} 608}
574 609
610static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
611{
612 struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
613
614 return pblkg ? blkg_to_cfqg(pblkg) : NULL;
615}
616
575static inline void cfqg_get(struct cfq_group *cfqg) 617static inline void cfqg_get(struct cfq_group *cfqg)
576{ 618{
577 return blkg_get(cfqg_to_blkg(cfqg)); 619 return blkg_get(cfqg_to_blkg(cfqg));
@@ -586,8 +628,9 @@ static inline void cfqg_put(struct cfq_group *cfqg)
586 char __pbuf[128]; \ 628 char __pbuf[128]; \
587 \ 629 \
588 blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ 630 blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \
589 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 631 blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \
590 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 632 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
633 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
591 __pbuf, ##args); \ 634 __pbuf, ##args); \
592} while (0) 635} while (0)
593 636
@@ -646,11 +689,9 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
646 io_start_time - start_time); 689 io_start_time - start_time);
647} 690}
648 691
649static void cfq_pd_reset_stats(struct blkcg_gq *blkg) 692/* @stats = 0 */
693static void cfqg_stats_reset(struct cfqg_stats *stats)
650{ 694{
651 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
652 struct cfqg_stats *stats = &cfqg->stats;
653
654 /* queued stats shouldn't be cleared */ 695 /* queued stats shouldn't be cleared */
655 blkg_rwstat_reset(&stats->service_bytes); 696 blkg_rwstat_reset(&stats->service_bytes);
656 blkg_rwstat_reset(&stats->serviced); 697 blkg_rwstat_reset(&stats->serviced);
@@ -669,13 +710,58 @@ static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
669#endif 710#endif
670} 711}
671 712
713/* @to += @from */
714static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from)
715{
716 /* queued stats shouldn't be cleared */
717 blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
718 blkg_rwstat_merge(&to->serviced, &from->serviced);
719 blkg_rwstat_merge(&to->merged, &from->merged);
720 blkg_rwstat_merge(&to->service_time, &from->service_time);
721 blkg_rwstat_merge(&to->wait_time, &from->wait_time);
 722 blkg_stat_merge(&to->time, &from->time);
723#ifdef CONFIG_DEBUG_BLK_CGROUP
724 blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
725 blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
726 blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
727 blkg_stat_merge(&to->dequeue, &from->dequeue);
728 blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
729 blkg_stat_merge(&to->idle_time, &from->idle_time);
730 blkg_stat_merge(&to->empty_time, &from->empty_time);
731#endif
732}
733
734/*
735 * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors'
736 * recursive stats can still account for the amount used by this cfqg after
737 * it's gone.
738 */
739static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
740{
741 struct cfq_group *parent = cfqg_parent(cfqg);
742
743 lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
744
745 if (unlikely(!parent))
746 return;
747
748 cfqg_stats_merge(&parent->dead_stats, &cfqg->stats);
749 cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
750 cfqg_stats_reset(&cfqg->stats);
751 cfqg_stats_reset(&cfqg->dead_stats);
752}
753
672#else /* CONFIG_CFQ_GROUP_IOSCHED */ 754#else /* CONFIG_CFQ_GROUP_IOSCHED */
673 755
756static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
674static inline void cfqg_get(struct cfq_group *cfqg) { } 757static inline void cfqg_get(struct cfq_group *cfqg) { }
675static inline void cfqg_put(struct cfq_group *cfqg) { } 758static inline void cfqg_put(struct cfq_group *cfqg) { }
676 759
677#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 760#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
678 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 761 blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
762 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
763 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
764 ##args)
679#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) 765#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
680 766
681static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, 767static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
@@ -732,7 +818,7 @@ static inline bool iops_mode(struct cfq_data *cfqd)
732 return false; 818 return false;
733} 819}
734 820
735static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) 821static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
736{ 822{
737 if (cfq_class_idle(cfqq)) 823 if (cfq_class_idle(cfqq))
738 return IDLE_WORKLOAD; 824 return IDLE_WORKLOAD;
@@ -751,23 +837,23 @@ static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
751 return SYNC_WORKLOAD; 837 return SYNC_WORKLOAD;
752} 838}
753 839
754static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, 840static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
755 struct cfq_data *cfqd, 841 struct cfq_data *cfqd,
756 struct cfq_group *cfqg) 842 struct cfq_group *cfqg)
757{ 843{
758 if (wl == IDLE_WORKLOAD) 844 if (wl_class == IDLE_WORKLOAD)
759 return cfqg->service_tree_idle.count; 845 return cfqg->service_tree_idle.count;
760 846
761 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count 847 return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
762 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count 848 cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
763 + cfqg->service_trees[wl][SYNC_WORKLOAD].count; 849 cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
764} 850}
765 851
766static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, 852static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
767 struct cfq_group *cfqg) 853 struct cfq_group *cfqg)
768{ 854{
769 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count 855 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
770 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; 856 cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
771} 857}
772 858
773static void cfq_dispatch_insert(struct request_queue *, struct request *); 859static void cfq_dispatch_insert(struct request_queue *, struct request *);
@@ -847,13 +933,27 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
847 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 933 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
848} 934}
849 935
850static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) 936/**
937 * cfqg_scale_charge - scale disk time charge according to cfqg weight
938 * @charge: disk time being charged
939 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
940 *
941 * Scale @charge according to @vfraction, which is in range (0, 1]. The
942 * scaling is inversely proportional.
943 *
944 * scaled = charge / vfraction
945 *
946 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
947 */
948static inline u64 cfqg_scale_charge(unsigned long charge,
949 unsigned int vfraction)
851{ 950{
852 u64 d = delta << CFQ_SERVICE_SHIFT; 951 u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */
853 952
854 d = d * CFQ_WEIGHT_DEFAULT; 953 /* charge / vfraction */
855 do_div(d, cfqg->weight); 954 c <<= CFQ_SERVICE_SHIFT;
856 return d; 955 do_div(c, vfraction);
956 return c;
857} 957}
858 958
859static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) 959static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
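
To make the fixed-point scaling in cfqg_scale_charge() above concrete, here is
a standalone worked example. SHIFT is an assumed stand-in for
CFQ_SERVICE_SHIFT, which is defined elsewhere in cfq-iosched.c; its exact
value only sets the fixed-point precision.

  /*
   * Worked example for the scaling above, userspace C for clarity.
   * SHIFT stands in for CFQ_SERVICE_SHIFT (assumed 12 here).
   */
  #include <stdint.h>
  #include <stdio.h>

  #define SHIFT	12

  int main(void)
  {
  	uint64_t charge = 100;				/* disk time used */
  	unsigned int vfraction = (1u << SHIFT) / 4;	/* cfqg owns 1/4 of the device */

  	uint64_t c = charge << SHIFT;	/* make it fixed point */
  	c <<= SHIFT;			/* charge / vfraction; vfraction is fixed point too */
  	c /= vfraction;

  	/* 100 / (1/4) = 400: vdisktime advances 4x as fast as disk time used */
  	printf("%llu\n", (unsigned long long)(c >> SHIFT));
  	return 0;
  }
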
@@ -909,9 +1009,7 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
909static inline unsigned 1009static inline unsigned
910cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) 1010cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
911{ 1011{
912 struct cfq_rb_root *st = &cfqd->grp_service_tree; 1012 return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
913
914 return cfqd->cfq_target_latency * cfqg->weight / st->total_weight;
915} 1013}
916 1014
917static inline unsigned 1015static inline unsigned
@@ -1178,20 +1276,61 @@ static void
1178cfq_update_group_weight(struct cfq_group *cfqg) 1276cfq_update_group_weight(struct cfq_group *cfqg)
1179{ 1277{
1180 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); 1278 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1279
1181 if (cfqg->new_weight) { 1280 if (cfqg->new_weight) {
1182 cfqg->weight = cfqg->new_weight; 1281 cfqg->weight = cfqg->new_weight;
1183 cfqg->new_weight = 0; 1282 cfqg->new_weight = 0;
1184 } 1283 }
1284
1285 if (cfqg->new_leaf_weight) {
1286 cfqg->leaf_weight = cfqg->new_leaf_weight;
1287 cfqg->new_leaf_weight = 0;
1288 }
1185} 1289}
1186 1290
1187static void 1291static void
1188cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 1292cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1189{ 1293{
1294 unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */
1295 struct cfq_group *pos = cfqg;
1296 struct cfq_group *parent;
1297 bool propagate;
1298
1299 /* add to the service tree */
1190 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); 1300 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1191 1301
1192 cfq_update_group_weight(cfqg); 1302 cfq_update_group_weight(cfqg);
1193 __cfq_group_service_tree_add(st, cfqg); 1303 __cfq_group_service_tree_add(st, cfqg);
1194 st->total_weight += cfqg->weight; 1304
1305 /*
1306 * Activate @cfqg and calculate the portion of vfraction @cfqg is
1307 * entitled to. vfraction is calculated by walking the tree
1308 * towards the root calculating the fraction it has at each level.
1309 * The compounded ratio is how much vfraction @cfqg owns.
1310 *
1311 * Start with the proportion tasks in this cfqg has against active
1312 * children cfqgs - its leaf_weight against children_weight.
1313 */
1314 propagate = !pos->nr_active++;
1315 pos->children_weight += pos->leaf_weight;
1316 vfr = vfr * pos->leaf_weight / pos->children_weight;
1317
1318 /*
1319 * Compound ->weight walking up the tree. Both activation and
1320 * vfraction calculation are done in the same loop. Propagation
1321 * stops once an already activated node is met. vfraction
1322 * calculation should always continue to the root.
1323 */
1324 while ((parent = cfqg_parent(pos))) {
1325 if (propagate) {
1326 propagate = !parent->nr_active++;
1327 parent->children_weight += pos->weight;
1328 }
1329 vfr = vfr * pos->weight / parent->children_weight;
1330 pos = parent;
1331 }
1332
1333 cfqg->vfraction = max_t(unsigned, vfr, 1);
1195} 1334}
1196 1335
1197static void 1336static void
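
Tying the loop above back to the cfq-iosched.txt example: with every group
active, AB's vfraction is the product of its share at each level of the tree.
The sketch below is illustration only, not kernel code; SHIFT again stands in
for CFQ_SERVICE_SHIFT.

  /*
   * Illustration only: the vfraction AB ends up with in the doc example
   * (root, A, B, AA, AB), compounding leaf/children ratios the same way
   * cfq_group_service_tree_add() does.  All groups assumed active.
   */
  #include <stdio.h>

  #define SHIFT	12

  int main(void)
  {
  	unsigned int vfr = 1 << SHIFT;			/* start with 1.0 */

  	/* AB's leaf_weight vs AB's children_weight (only its implicit leaf) */
  	vfr = vfr * 500 / 500;
  	/* AB's weight vs A's children_weight (AA + AB + A-leaf) */
  	vfr = vfr * 1000 / (500 + 1000 + 750);
  	/* A's weight vs root's children_weight (A + B + root-leaf) */
  	vfr = vfr * 500 / (500 + 250 + 125);

  	/* (1000/2250) * (500/875) =~ 0.25 - the ~25% AB gets in the doc example */
  	printf("%.3f\n", (double)vfr / (1 << SHIFT));
  	return 0;
  }
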
@@ -1222,7 +1361,32 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1222static void 1361static void
1223cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) 1362cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1224{ 1363{
1225 st->total_weight -= cfqg->weight; 1364 struct cfq_group *pos = cfqg;
1365 bool propagate;
1366
1367 /*
1368 * Undo activation from cfq_group_service_tree_add(). Deactivate
1369 * @cfqg and propagate deactivation upwards.
1370 */
1371 propagate = !--pos->nr_active;
1372 pos->children_weight -= pos->leaf_weight;
1373
1374 while (propagate) {
1375 struct cfq_group *parent = cfqg_parent(pos);
1376
1377 /* @pos has 0 nr_active at this point */
1378 WARN_ON_ONCE(pos->children_weight);
1379 pos->vfraction = 0;
1380
1381 if (!parent)
1382 break;
1383
1384 propagate = !--parent->nr_active;
1385 parent->children_weight -= pos->weight;
1386 pos = parent;
1387 }
1388
1389 /* remove from the service tree */
1226 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 1390 if (!RB_EMPTY_NODE(&cfqg->rb_node))
1227 cfq_rb_erase(&cfqg->rb_node, st); 1391 cfq_rb_erase(&cfqg->rb_node, st);
1228} 1392}
@@ -1241,7 +1405,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1241 1405
1242 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 1406 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1243 cfq_group_service_tree_del(st, cfqg); 1407 cfq_group_service_tree_del(st, cfqg);
1244 cfqg->saved_workload_slice = 0; 1408 cfqg->saved_wl_slice = 0;
1245 cfqg_stats_update_dequeue(cfqg); 1409 cfqg_stats_update_dequeue(cfqg);
1246} 1410}
1247 1411
@@ -1284,6 +1448,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1284 unsigned int used_sl, charge, unaccounted_sl = 0; 1448 unsigned int used_sl, charge, unaccounted_sl = 0;
1285 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 1449 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1286 - cfqg->service_tree_idle.count; 1450 - cfqg->service_tree_idle.count;
1451 unsigned int vfr;
1287 1452
1288 BUG_ON(nr_sync < 0); 1453 BUG_ON(nr_sync < 0);
1289 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); 1454 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
@@ -1293,20 +1458,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1293 else if (!cfq_cfqq_sync(cfqq) && !nr_sync) 1458 else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1294 charge = cfqq->allocated_slice; 1459 charge = cfqq->allocated_slice;
1295 1460
1296 /* Can't update vdisktime while group is on service tree */ 1461 /*
1462 * Can't update vdisktime while on service tree and cfqg->vfraction
1463 * is valid only while on it. Cache vfr, leave the service tree,
1464 * update vdisktime and go back on. The re-addition to the tree
1465 * will also update the weights as necessary.
1466 */
1467 vfr = cfqg->vfraction;
1297 cfq_group_service_tree_del(st, cfqg); 1468 cfq_group_service_tree_del(st, cfqg);
1298 cfqg->vdisktime += cfq_scale_slice(charge, cfqg); 1469 cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
1299 /* If a new weight was requested, update now, off tree */
1300 cfq_group_service_tree_add(st, cfqg); 1470 cfq_group_service_tree_add(st, cfqg);
1301 1471
1302 /* This group is being expired. Save the context */ 1472 /* This group is being expired. Save the context */
1303 if (time_after(cfqd->workload_expires, jiffies)) { 1473 if (time_after(cfqd->workload_expires, jiffies)) {
1304 cfqg->saved_workload_slice = cfqd->workload_expires 1474 cfqg->saved_wl_slice = cfqd->workload_expires
1305 - jiffies; 1475 - jiffies;
1306 cfqg->saved_workload = cfqd->serving_type; 1476 cfqg->saved_wl_type = cfqd->serving_wl_type;
1307 cfqg->saved_serving_prio = cfqd->serving_prio; 1477 cfqg->saved_wl_class = cfqd->serving_wl_class;
1308 } else 1478 } else
1309 cfqg->saved_workload_slice = 0; 1479 cfqg->saved_wl_slice = 0;
1310 1480
1311 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 1481 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1312 st->min_vdisktime); 1482 st->min_vdisktime);
@@ -1344,6 +1514,52 @@ static void cfq_pd_init(struct blkcg_gq *blkg)
1344 1514
1345 cfq_init_cfqg_base(cfqg); 1515 cfq_init_cfqg_base(cfqg);
1346 cfqg->weight = blkg->blkcg->cfq_weight; 1516 cfqg->weight = blkg->blkcg->cfq_weight;
1517 cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
1518}
1519
1520static void cfq_pd_offline(struct blkcg_gq *blkg)
1521{
1522 /*
1523 * @blkg is going offline and will be ignored by
1524 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
1525 * that they don't get lost. If IOs complete after this point, the
1526 * stats for them will be lost. Oh well...
1527 */
1528 cfqg_stats_xfer_dead(blkg_to_cfqg(blkg));
1529}
1530
1531/* offset delta from cfqg->stats to cfqg->dead_stats */
1532static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
1533 offsetof(struct cfq_group, stats);
1534
1535/* to be used by recursive prfill, sums live and dead stats recursively */
1536static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
1537{
1538 u64 sum = 0;
1539
1540 sum += blkg_stat_recursive_sum(pd, off);
1541 sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
1542 return sum;
1543}
1544
1545/* to be used by recursive prfill, sums live and dead rwstats recursively */
1546static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
1547 int off)
1548{
1549 struct blkg_rwstat a, b;
1550
1551 a = blkg_rwstat_recursive_sum(pd, off);
1552 b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
1553 blkg_rwstat_merge(&a, &b);
1554 return a;
1555}
1556
1557static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
1558{
1559 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1560
1561 cfqg_stats_reset(&cfqg->stats);
1562 cfqg_stats_reset(&cfqg->dead_stats);
1347} 1563}
1348 1564
1349/* 1565/*
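
The dead_stats_off_delta constant above lets a single recursive prfill helper sum a live counter and its counterpart in dead_stats: the offset of a field within cfqg->stats plus the delta is the offset of the same field within cfqg->dead_stats. A small standalone illustration of the pattern (the struct and field names here are made up for the example):

  #include <stddef.h>
  #include <stdint.h>
  #include <stdio.h>

  struct stats { uint64_t time; uint64_t sectors; };

  struct group {
          struct stats stats;       /* live counters */
          struct stats dead_stats;  /* counters inherited from dead children */
  };

  static const int dead_off_delta =
          offsetof(struct group, dead_stats) - offsetof(struct group, stats);

  /* Sum the live and dead copies of the counter at offset @off within stats. */
  static uint64_t sum_stat(struct group *grp, int off)
  {
          uint64_t live = *(uint64_t *)((char *)grp + off);
          uint64_t dead = *(uint64_t *)((char *)grp + off + dead_off_delta);
          return live + dead;
  }

  int main(void)
  {
          struct group g = { .stats = { .time = 10 }, .dead_stats = { .time = 5 } };

          printf("time = %llu\n", (unsigned long long)
                 sum_stat(&g, offsetof(struct group, stats.time)));  /* 15 */
          return 0;
  }
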
@@ -1400,6 +1616,26 @@ static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
1400 return 0; 1616 return 0;
1401} 1617}
1402 1618
1619static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1620 struct blkg_policy_data *pd, int off)
1621{
1622 struct cfq_group *cfqg = pd_to_cfqg(pd);
1623
1624 if (!cfqg->dev_leaf_weight)
1625 return 0;
1626 return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1627}
1628
1629static int cfqg_print_leaf_weight_device(struct cgroup *cgrp,
1630 struct cftype *cft,
1631 struct seq_file *sf)
1632{
1633 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
1634 cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0,
1635 false);
1636 return 0;
1637}
1638
1403static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, 1639static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
1404 struct seq_file *sf) 1640 struct seq_file *sf)
1405{ 1641{
@@ -1407,8 +1643,16 @@ static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
1407 return 0; 1643 return 0;
1408} 1644}
1409 1645
1410static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, 1646static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft,
1411 const char *buf) 1647 struct seq_file *sf)
1648{
1649 seq_printf(sf, "%u\n",
1650 cgroup_to_blkcg(cgrp)->cfq_leaf_weight);
1651 return 0;
1652}
1653
1654static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1655 const char *buf, bool is_leaf_weight)
1412{ 1656{
1413 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1657 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1414 struct blkg_conf_ctx ctx; 1658 struct blkg_conf_ctx ctx;
@@ -1422,8 +1666,13 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1422 ret = -EINVAL; 1666 ret = -EINVAL;
1423 cfqg = blkg_to_cfqg(ctx.blkg); 1667 cfqg = blkg_to_cfqg(ctx.blkg);
1424 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { 1668 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
1425 cfqg->dev_weight = ctx.v; 1669 if (!is_leaf_weight) {
1426 cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight; 1670 cfqg->dev_weight = ctx.v;
1671 cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
1672 } else {
1673 cfqg->dev_leaf_weight = ctx.v;
1674 cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
1675 }
1427 ret = 0; 1676 ret = 0;
1428 } 1677 }
1429 1678
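
As with the existing per-device weight, writing 0 clears the override and the group falls back to the cgroup-wide value; ctx.v ?: blkcg->cfq_weight is the GNU C shorthand for ctx.v ? ctx.v : blkcg->cfq_weight. A tiny, hypothetical illustration of that fallback:

  #include <stdio.h>

  #define CGROUP_WEIGHT 500   /* assumed cgroup-wide default */

  /* A per-device weight of 0 means "no override": use the cgroup weight. */
  static unsigned int effective_weight(unsigned int dev_weight)
  {
          return dev_weight ? dev_weight : CGROUP_WEIGHT;
  }

  int main(void)
  {
          printf("%u\n", effective_weight(800)); /* device override wins: 800 */
          printf("%u\n", effective_weight(0));   /* cleared: falls back to 500 */
          return 0;
  }

The leaf variants mirror this exactly, just against dev_leaf_weight/new_leaf_weight and cfq_leaf_weight.
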
@@ -1431,7 +1680,20 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1431 return ret; 1680 return ret;
1432} 1681}
1433 1682
1434static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) 1683static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1684 const char *buf)
1685{
1686 return __cfqg_set_weight_device(cgrp, cft, buf, false);
1687}
1688
1689static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft,
1690 const char *buf)
1691{
1692 return __cfqg_set_weight_device(cgrp, cft, buf, true);
1693}
1694
1695static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
1696 bool is_leaf_weight)
1435{ 1697{
1436 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1698 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1437 struct blkcg_gq *blkg; 1699 struct blkcg_gq *blkg;
@@ -1441,19 +1703,41 @@ static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1441 return -EINVAL; 1703 return -EINVAL;
1442 1704
1443 spin_lock_irq(&blkcg->lock); 1705 spin_lock_irq(&blkcg->lock);
1444 blkcg->cfq_weight = (unsigned int)val; 1706
1707 if (!is_leaf_weight)
1708 blkcg->cfq_weight = val;
1709 else
1710 blkcg->cfq_leaf_weight = val;
1445 1711
1446 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 1712 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1447 struct cfq_group *cfqg = blkg_to_cfqg(blkg); 1713 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1448 1714
1449 if (cfqg && !cfqg->dev_weight) 1715 if (!cfqg)
1450 cfqg->new_weight = blkcg->cfq_weight; 1716 continue;
1717
1718 if (!is_leaf_weight) {
1719 if (!cfqg->dev_weight)
1720 cfqg->new_weight = blkcg->cfq_weight;
1721 } else {
1722 if (!cfqg->dev_leaf_weight)
1723 cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
1724 }
1451 } 1725 }
1452 1726
1453 spin_unlock_irq(&blkcg->lock); 1727 spin_unlock_irq(&blkcg->lock);
1454 return 0; 1728 return 0;
1455} 1729}
1456 1730
1731static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1732{
1733 return __cfq_set_weight(cgrp, cft, val, false);
1734}
1735
1736static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1737{
1738 return __cfq_set_weight(cgrp, cft, val, true);
1739}
1740
1457static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, 1741static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
1458 struct seq_file *sf) 1742 struct seq_file *sf)
1459{ 1743{
@@ -1474,6 +1758,42 @@ static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
1474 return 0; 1758 return 0;
1475} 1759}
1476 1760
1761static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
1762 struct blkg_policy_data *pd, int off)
1763{
1764 u64 sum = cfqg_stat_pd_recursive_sum(pd, off);
1765
1766 return __blkg_prfill_u64(sf, pd, sum);
1767}
1768
1769static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
1770 struct blkg_policy_data *pd, int off)
1771{
1772 struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off);
1773
1774 return __blkg_prfill_rwstat(sf, pd, &sum);
1775}
1776
1777static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft,
1778 struct seq_file *sf)
1779{
1780 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1781
1782 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive,
1783 &blkcg_policy_cfq, cft->private, false);
1784 return 0;
1785}
1786
1787static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft,
1788 struct seq_file *sf)
1789{
1790 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1791
1792 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive,
1793 &blkcg_policy_cfq, cft->private, true);
1794 return 0;
1795}
1796
1477#ifdef CONFIG_DEBUG_BLK_CGROUP 1797#ifdef CONFIG_DEBUG_BLK_CGROUP
1478static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, 1798static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1479 struct blkg_policy_data *pd, int off) 1799 struct blkg_policy_data *pd, int off)
@@ -1503,17 +1823,49 @@ static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
1503#endif /* CONFIG_DEBUG_BLK_CGROUP */ 1823#endif /* CONFIG_DEBUG_BLK_CGROUP */
1504 1824
1505static struct cftype cfq_blkcg_files[] = { 1825static struct cftype cfq_blkcg_files[] = {
1826 /* on root, weight is mapped to leaf_weight */
1506 { 1827 {
1507 .name = "weight_device", 1828 .name = "weight_device",
1829 .flags = CFTYPE_ONLY_ON_ROOT,
1830 .read_seq_string = cfqg_print_leaf_weight_device,
1831 .write_string = cfqg_set_leaf_weight_device,
1832 .max_write_len = 256,
1833 },
1834 {
1835 .name = "weight",
1836 .flags = CFTYPE_ONLY_ON_ROOT,
1837 .read_seq_string = cfq_print_leaf_weight,
1838 .write_u64 = cfq_set_leaf_weight,
1839 },
1840
1841 /* no such mapping necessary for !roots */
1842 {
1843 .name = "weight_device",
1844 .flags = CFTYPE_NOT_ON_ROOT,
1508 .read_seq_string = cfqg_print_weight_device, 1845 .read_seq_string = cfqg_print_weight_device,
1509 .write_string = cfqg_set_weight_device, 1846 .write_string = cfqg_set_weight_device,
1510 .max_write_len = 256, 1847 .max_write_len = 256,
1511 }, 1848 },
1512 { 1849 {
1513 .name = "weight", 1850 .name = "weight",
1851 .flags = CFTYPE_NOT_ON_ROOT,
1514 .read_seq_string = cfq_print_weight, 1852 .read_seq_string = cfq_print_weight,
1515 .write_u64 = cfq_set_weight, 1853 .write_u64 = cfq_set_weight,
1516 }, 1854 },
1855
1856 {
1857 .name = "leaf_weight_device",
1858 .read_seq_string = cfqg_print_leaf_weight_device,
1859 .write_string = cfqg_set_leaf_weight_device,
1860 .max_write_len = 256,
1861 },
1862 {
1863 .name = "leaf_weight",
1864 .read_seq_string = cfq_print_leaf_weight,
1865 .write_u64 = cfq_set_leaf_weight,
1866 },
1867
1868 /* statistics, covers only the tasks in the cfqg */
1517 { 1869 {
1518 .name = "time", 1870 .name = "time",
1519 .private = offsetof(struct cfq_group, stats.time), 1871 .private = offsetof(struct cfq_group, stats.time),

@@ -1554,6 +1906,48 @@ static struct cftype cfq_blkcg_files[] = {
1554 .private = offsetof(struct cfq_group, stats.queued), 1906 .private = offsetof(struct cfq_group, stats.queued),
1555 .read_seq_string = cfqg_print_rwstat, 1907 .read_seq_string = cfqg_print_rwstat,
1556 }, 1908 },
1909
 1910 /* the same statistics which cover the cfqg and its descendants */
1911 {
1912 .name = "time_recursive",
1913 .private = offsetof(struct cfq_group, stats.time),
1914 .read_seq_string = cfqg_print_stat_recursive,
1915 },
1916 {
1917 .name = "sectors_recursive",
1918 .private = offsetof(struct cfq_group, stats.sectors),
1919 .read_seq_string = cfqg_print_stat_recursive,
1920 },
1921 {
1922 .name = "io_service_bytes_recursive",
1923 .private = offsetof(struct cfq_group, stats.service_bytes),
1924 .read_seq_string = cfqg_print_rwstat_recursive,
1925 },
1926 {
1927 .name = "io_serviced_recursive",
1928 .private = offsetof(struct cfq_group, stats.serviced),
1929 .read_seq_string = cfqg_print_rwstat_recursive,
1930 },
1931 {
1932 .name = "io_service_time_recursive",
1933 .private = offsetof(struct cfq_group, stats.service_time),
1934 .read_seq_string = cfqg_print_rwstat_recursive,
1935 },
1936 {
1937 .name = "io_wait_time_recursive",
1938 .private = offsetof(struct cfq_group, stats.wait_time),
1939 .read_seq_string = cfqg_print_rwstat_recursive,
1940 },
1941 {
1942 .name = "io_merged_recursive",
1943 .private = offsetof(struct cfq_group, stats.merged),
1944 .read_seq_string = cfqg_print_rwstat_recursive,
1945 },
1946 {
1947 .name = "io_queued_recursive",
1948 .private = offsetof(struct cfq_group, stats.queued),
1949 .read_seq_string = cfqg_print_rwstat_recursive,
1950 },
1557#ifdef CONFIG_DEBUG_BLK_CGROUP 1951#ifdef CONFIG_DEBUG_BLK_CGROUP
1558 { 1952 {
1559 .name = "avg_queue_size", 1953 .name = "avg_queue_size",
@@ -1612,15 +2006,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1612 struct rb_node **p, *parent; 2006 struct rb_node **p, *parent;
1613 struct cfq_queue *__cfqq; 2007 struct cfq_queue *__cfqq;
1614 unsigned long rb_key; 2008 unsigned long rb_key;
1615 struct cfq_rb_root *service_tree; 2009 struct cfq_rb_root *st;
1616 int left; 2010 int left;
1617 int new_cfqq = 1; 2011 int new_cfqq = 1;
1618 2012
1619 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 2013 st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
1620 cfqq_type(cfqq));
1621 if (cfq_class_idle(cfqq)) { 2014 if (cfq_class_idle(cfqq)) {
1622 rb_key = CFQ_IDLE_DELAY; 2015 rb_key = CFQ_IDLE_DELAY;
1623 parent = rb_last(&service_tree->rb); 2016 parent = rb_last(&st->rb);
1624 if (parent && parent != &cfqq->rb_node) { 2017 if (parent && parent != &cfqq->rb_node) {
1625 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 2018 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1626 rb_key += __cfqq->rb_key; 2019 rb_key += __cfqq->rb_key;
@@ -1638,7 +2031,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1638 cfqq->slice_resid = 0; 2031 cfqq->slice_resid = 0;
1639 } else { 2032 } else {
1640 rb_key = -HZ; 2033 rb_key = -HZ;
1641 __cfqq = cfq_rb_first(service_tree); 2034 __cfqq = cfq_rb_first(st);
1642 rb_key += __cfqq ? __cfqq->rb_key : jiffies; 2035 rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1643 } 2036 }
1644 2037
@@ -1647,8 +2040,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1647 /* 2040 /*
1648 * same position, nothing more to do 2041 * same position, nothing more to do
1649 */ 2042 */
1650 if (rb_key == cfqq->rb_key && 2043 if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
1651 cfqq->service_tree == service_tree)
1652 return; 2044 return;
1653 2045
1654 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 2046 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
@@ -1657,11 +2049,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1657 2049
1658 left = 1; 2050 left = 1;
1659 parent = NULL; 2051 parent = NULL;
1660 cfqq->service_tree = service_tree; 2052 cfqq->service_tree = st;
1661 p = &service_tree->rb.rb_node; 2053 p = &st->rb.rb_node;
1662 while (*p) { 2054 while (*p) {
1663 struct rb_node **n;
1664
1665 parent = *p; 2055 parent = *p;
1666 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 2056 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1667 2057
@@ -1669,22 +2059,20 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1669 * sort by key, that represents service time. 2059 * sort by key, that represents service time.
1670 */ 2060 */
1671 if (time_before(rb_key, __cfqq->rb_key)) 2061 if (time_before(rb_key, __cfqq->rb_key))
1672 n = &(*p)->rb_left; 2062 p = &parent->rb_left;
1673 else { 2063 else {
1674 n = &(*p)->rb_right; 2064 p = &parent->rb_right;
1675 left = 0; 2065 left = 0;
1676 } 2066 }
1677
1678 p = n;
1679 } 2067 }
1680 2068
1681 if (left) 2069 if (left)
1682 service_tree->left = &cfqq->rb_node; 2070 st->left = &cfqq->rb_node;
1683 2071
1684 cfqq->rb_key = rb_key; 2072 cfqq->rb_key = rb_key;
1685 rb_link_node(&cfqq->rb_node, parent, p); 2073 rb_link_node(&cfqq->rb_node, parent, p);
1686 rb_insert_color(&cfqq->rb_node, &service_tree->rb); 2074 rb_insert_color(&cfqq->rb_node, &st->rb);
1687 service_tree->count++; 2075 st->count++;
1688 if (add_front || !new_cfqq) 2076 if (add_front || !new_cfqq)
1689 return; 2077 return;
1690 cfq_group_notify_queue_add(cfqd, cfqq->cfqg); 2078 cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
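
The simplified loop above walks down via parent->rb_left/rb_right directly instead of through the temporary n pointer; the left flag still records whether the descent only ever went left, in which case the new queue becomes the service tree's cached leftmost node (what cfq_rb_first() hands back cheaply). A minimal sketch of that leftmost-caching idea on a plain, unbalanced binary search tree, not the kernel rbtree API:

  #include <stdio.h>

  struct node {
          long key;
          struct node *left, *right;
  };

  struct tree {
          struct node *root;
          struct node *leftmost;   /* cached minimum, like cfq_rb_root->left */
  };

  static void insert(struct tree *t, struct node *n)
  {
          struct node **p = &t->root, *parent = NULL;
          int went_left_only = 1;

          while (*p) {
                  parent = *p;
                  if (n->key < parent->key) {
                          p = &parent->left;
                  } else {
                          p = &parent->right;
                          went_left_only = 0;
                  }
          }
          *p = n;
          if (went_left_only)              /* new node holds the smallest key */
                  t->leftmost = n;
  }

  int main(void)
  {
          struct tree t = { 0 };
          struct node a = { .key = 5 }, b = { .key = 2 }, c = { .key = 9 };

          insert(&t, &a);
          insert(&t, &b);
          insert(&t, &c);
          printf("min key = %ld\n", t.leftmost->key);   /* 2 */
          return 0;
  }
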
@@ -2030,8 +2418,8 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
2030 struct cfq_queue *cfqq) 2418 struct cfq_queue *cfqq)
2031{ 2419{
2032 if (cfqq) { 2420 if (cfqq) {
2033 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", 2421 cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
2034 cfqd->serving_prio, cfqd->serving_type); 2422 cfqd->serving_wl_class, cfqd->serving_wl_type);
2035 cfqg_stats_update_avg_queue_size(cfqq->cfqg); 2423 cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2036 cfqq->slice_start = 0; 2424 cfqq->slice_start = 0;
2037 cfqq->dispatch_start = jiffies; 2425 cfqq->dispatch_start = jiffies;
@@ -2117,19 +2505,18 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2117 */ 2505 */
2118static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 2506static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2119{ 2507{
2120 struct cfq_rb_root *service_tree = 2508 struct cfq_rb_root *st = st_for(cfqd->serving_group,
2121 service_tree_for(cfqd->serving_group, cfqd->serving_prio, 2509 cfqd->serving_wl_class, cfqd->serving_wl_type);
2122 cfqd->serving_type);
2123 2510
2124 if (!cfqd->rq_queued) 2511 if (!cfqd->rq_queued)
2125 return NULL; 2512 return NULL;
2126 2513
2127 /* There is nothing to dispatch */ 2514 /* There is nothing to dispatch */
2128 if (!service_tree) 2515 if (!st)
2129 return NULL; 2516 return NULL;
2130 if (RB_EMPTY_ROOT(&service_tree->rb)) 2517 if (RB_EMPTY_ROOT(&st->rb))
2131 return NULL; 2518 return NULL;
2132 return cfq_rb_first(service_tree); 2519 return cfq_rb_first(st);
2133} 2520}
2134 2521
2135static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) 2522static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
@@ -2285,17 +2672,17 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2285 2672
2286static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2673static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2287{ 2674{
2288 enum wl_prio_t prio = cfqq_prio(cfqq); 2675 enum wl_class_t wl_class = cfqq_class(cfqq);
2289 struct cfq_rb_root *service_tree = cfqq->service_tree; 2676 struct cfq_rb_root *st = cfqq->service_tree;
2290 2677
2291 BUG_ON(!service_tree); 2678 BUG_ON(!st);
2292 BUG_ON(!service_tree->count); 2679 BUG_ON(!st->count);
2293 2680
2294 if (!cfqd->cfq_slice_idle) 2681 if (!cfqd->cfq_slice_idle)
2295 return false; 2682 return false;
2296 2683
2297 /* We never do for idle class queues. */ 2684 /* We never do for idle class queues. */
2298 if (prio == IDLE_WORKLOAD) 2685 if (wl_class == IDLE_WORKLOAD)
2299 return false; 2686 return false;
2300 2687
2301 /* We do for queues that were marked with idle window flag. */ 2688 /* We do for queues that were marked with idle window flag. */
@@ -2307,11 +2694,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2307 * Otherwise, we do only if they are the last ones 2694 * Otherwise, we do only if they are the last ones
2308 * in their service tree. 2695 * in their service tree.
2309 */ 2696 */
2310 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) && 2697 if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
2311 !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false)) 2698 !cfq_io_thinktime_big(cfqd, &st->ttime, false))
2312 return true; 2699 return true;
2313 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", 2700 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
2314 service_tree->count);
2315 return false; 2701 return false;
2316} 2702}
2317 2703
@@ -2494,8 +2880,8 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2494 } 2880 }
2495} 2881}
2496 2882
2497static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, 2883static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
2498 struct cfq_group *cfqg, enum wl_prio_t prio) 2884 struct cfq_group *cfqg, enum wl_class_t wl_class)
2499{ 2885{
2500 struct cfq_queue *queue; 2886 struct cfq_queue *queue;
2501 int i; 2887 int i;
@@ -2505,7 +2891,7 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
2505 2891
2506 for (i = 0; i <= SYNC_WORKLOAD; ++i) { 2892 for (i = 0; i <= SYNC_WORKLOAD; ++i) {
2507 /* select the one with lowest rb_key */ 2893 /* select the one with lowest rb_key */
2508 queue = cfq_rb_first(service_tree_for(cfqg, prio, i)); 2894 queue = cfq_rb_first(st_for(cfqg, wl_class, i));
2509 if (queue && 2895 if (queue &&
2510 (!key_valid || time_before(queue->rb_key, lowest_key))) { 2896 (!key_valid || time_before(queue->rb_key, lowest_key))) {
2511 lowest_key = queue->rb_key; 2897 lowest_key = queue->rb_key;
@@ -2517,26 +2903,27 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
2517 return cur_best; 2903 return cur_best;
2518} 2904}
2519 2905
2520static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) 2906static void
2907choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
2521{ 2908{
2522 unsigned slice; 2909 unsigned slice;
2523 unsigned count; 2910 unsigned count;
2524 struct cfq_rb_root *st; 2911 struct cfq_rb_root *st;
2525 unsigned group_slice; 2912 unsigned group_slice;
2526 enum wl_prio_t original_prio = cfqd->serving_prio; 2913 enum wl_class_t original_class = cfqd->serving_wl_class;
2527 2914
2528 /* Choose next priority. RT > BE > IDLE */ 2915 /* Choose next priority. RT > BE > IDLE */
2529 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) 2916 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2530 cfqd->serving_prio = RT_WORKLOAD; 2917 cfqd->serving_wl_class = RT_WORKLOAD;
2531 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) 2918 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2532 cfqd->serving_prio = BE_WORKLOAD; 2919 cfqd->serving_wl_class = BE_WORKLOAD;
2533 else { 2920 else {
2534 cfqd->serving_prio = IDLE_WORKLOAD; 2921 cfqd->serving_wl_class = IDLE_WORKLOAD;
2535 cfqd->workload_expires = jiffies + 1; 2922 cfqd->workload_expires = jiffies + 1;
2536 return; 2923 return;
2537 } 2924 }
2538 2925
2539 if (original_prio != cfqd->serving_prio) 2926 if (original_class != cfqd->serving_wl_class)
2540 goto new_workload; 2927 goto new_workload;
2541 2928
2542 /* 2929 /*
@@ -2544,7 +2931,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2544 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload 2931 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2545 * expiration time 2932 * expiration time
2546 */ 2933 */
2547 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); 2934 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
2548 count = st->count; 2935 count = st->count;
2549 2936
2550 /* 2937 /*
@@ -2555,9 +2942,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2555 2942
2556new_workload: 2943new_workload:
2557 /* otherwise select new workload type */ 2944 /* otherwise select new workload type */
2558 cfqd->serving_type = 2945 cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
2559 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); 2946 cfqd->serving_wl_class);
2560 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); 2947 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
2561 count = st->count; 2948 count = st->count;
2562 2949
2563 /* 2950 /*
@@ -2568,10 +2955,11 @@ new_workload:
2568 group_slice = cfq_group_slice(cfqd, cfqg); 2955 group_slice = cfq_group_slice(cfqd, cfqg);
2569 2956
2570 slice = group_slice * count / 2957 slice = group_slice * count /
2571 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], 2958 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
2572 cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); 2959 cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
2960 cfqg));
2573 2961
2574 if (cfqd->serving_type == ASYNC_WORKLOAD) { 2962 if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
2575 unsigned int tmp; 2963 unsigned int tmp;
2576 2964
2577 /* 2965 /*
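
To make the slice computation above concrete: the group's slice is split across workload types in proportion to the number of queues on the chosen class/type tree, dividing by whichever is larger of the running busy-queue average and the current busy count. A worked example with assumed numbers:

  #include <stdio.h>

  /* slice = group_slice * count / max(busy_queues_avg, busy_queues) */
  int main(void)
  {
          unsigned group_slice = 100;  /* assumed jiffies the group may run   */
          unsigned count       = 2;    /* queues on the chosen class/type st  */
          unsigned avg         = 5;    /* busy_queues_avg for that class      */
          unsigned busy        = 4;    /* currently busy queues of the class  */
          unsigned denom       = avg > busy ? avg : busy;

          printf("slice = %u\n", group_slice * count / denom);  /* 100*2/5 = 40 */
          return 0;
  }
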
@@ -2617,14 +3005,14 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
2617 cfqd->serving_group = cfqg; 3005 cfqd->serving_group = cfqg;
2618 3006
2619 /* Restore the workload type data */ 3007 /* Restore the workload type data */
2620 if (cfqg->saved_workload_slice) { 3008 if (cfqg->saved_wl_slice) {
2621 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; 3009 cfqd->workload_expires = jiffies + cfqg->saved_wl_slice;
2622 cfqd->serving_type = cfqg->saved_workload; 3010 cfqd->serving_wl_type = cfqg->saved_wl_type;
2623 cfqd->serving_prio = cfqg->saved_serving_prio; 3011 cfqd->serving_wl_class = cfqg->saved_wl_class;
2624 } else 3012 } else
2625 cfqd->workload_expires = jiffies - 1; 3013 cfqd->workload_expires = jiffies - 1;
2626 3014
2627 choose_service_tree(cfqd, cfqg); 3015 choose_wl_class_and_type(cfqd, cfqg);
2628} 3016}
2629 3017
2630/* 3018/*
@@ -3403,7 +3791,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3403 return true; 3791 return true;
3404 3792
3405 /* Allow preemption only if we are idling on sync-noidle tree */ 3793 /* Allow preemption only if we are idling on sync-noidle tree */
3406 if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && 3794 if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
3407 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && 3795 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3408 new_cfqq->service_tree->count == 2 && 3796 new_cfqq->service_tree->count == 2 &&
3409 RB_EMPTY_ROOT(&cfqq->sort_list)) 3797 RB_EMPTY_ROOT(&cfqq->sort_list))
@@ -3455,7 +3843,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3455 * doesn't happen 3843 * doesn't happen
3456 */ 3844 */
3457 if (old_type != cfqq_type(cfqq)) 3845 if (old_type != cfqq_type(cfqq))
3458 cfqq->cfqg->saved_workload_slice = 0; 3846 cfqq->cfqg->saved_wl_slice = 0;
3459 3847
3460 /* 3848 /*
3461 * Put the new queue at the front of the current list, 3849
@@ -3637,16 +4025,17 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3637 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 4025 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3638 4026
3639 if (sync) { 4027 if (sync) {
3640 struct cfq_rb_root *service_tree; 4028 struct cfq_rb_root *st;
3641 4029
3642 RQ_CIC(rq)->ttime.last_end_request = now; 4030 RQ_CIC(rq)->ttime.last_end_request = now;
3643 4031
3644 if (cfq_cfqq_on_rr(cfqq)) 4032 if (cfq_cfqq_on_rr(cfqq))
3645 service_tree = cfqq->service_tree; 4033 st = cfqq->service_tree;
3646 else 4034 else
3647 service_tree = service_tree_for(cfqq->cfqg, 4035 st = st_for(cfqq->cfqg, cfqq_class(cfqq),
3648 cfqq_prio(cfqq), cfqq_type(cfqq)); 4036 cfqq_type(cfqq));
3649 service_tree->ttime.last_end_request = now; 4037
4038 st->ttime.last_end_request = now;
3650 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) 4039 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3651 cfqd->last_delayed_sync = now; 4040 cfqd->last_delayed_sync = now;
3652 } 4041 }
@@ -3993,6 +4382,7 @@ static int cfq_init_queue(struct request_queue *q)
3993 cfq_init_cfqg_base(cfqd->root_group); 4382 cfq_init_cfqg_base(cfqd->root_group);
3994#endif 4383#endif
3995 cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; 4384 cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
4385 cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
3996 4386
3997 /* 4387 /*
3998 * Not strictly needed (since RB_ROOT just clears the node and we 4388 * Not strictly needed (since RB_ROOT just clears the node and we
@@ -4177,6 +4567,7 @@ static struct blkcg_policy blkcg_policy_cfq = {
4177 .cftypes = cfq_blkcg_files, 4567 .cftypes = cfq_blkcg_files,
4178 4568
4179 .pd_init_fn = cfq_pd_init, 4569 .pd_init_fn = cfq_pd_init,
4570 .pd_offline_fn = cfq_pd_offline,
4180 .pd_reset_stats_fn = cfq_pd_reset_stats, 4571 .pd_reset_stats_fn = cfq_pd_reset_stats,
4181}; 4572};
4182#endif 4573#endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index dbe74279f3d6..78feda9bbae2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -19,6 +19,7 @@
19#include <linux/gfp.h> 19#include <linux/gfp.h>
20#include <linux/bsg.h> 20#include <linux/bsg.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/rcupdate.h>
22 23
23#include <asm/scatterlist.h> 24#include <asm/scatterlist.h>
24 25
@@ -437,6 +438,7 @@ struct request_queue {
437 /* Throttle data */ 438 /* Throttle data */
438 struct throtl_data *td; 439 struct throtl_data *td;
439#endif 440#endif
441 struct rcu_head rcu_head;
440}; 442};
441 443
442#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ 444#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
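
The rcu_head added to struct request_queue is the usual hook for deferring the final free of the queue until an RCU grace period has elapsed, so blkcg lookups that dereference a queue under rcu_read_lock() cannot race with its teardown; the blk-sysfs.c side of that change is part of this merge but not shown here. A rough kernel-style sketch of the pattern, with the function names assumed rather than taken from the patch:

  /* Illustrative sketch of the call_rcu() pattern, not the patch itself. */
  static void blk_free_queue_rcu(struct rcu_head *rcu_head)
  {
          struct request_queue *q =
                  container_of(rcu_head, struct request_queue, rcu_head);

          /* final free, assuming the queue came from the request_queue slab */
          kmem_cache_free(blk_requestq_cachep, q);
  }

  static void release_queue_deferred(struct request_queue *q)
  {
          /* readers under rcu_read_lock() may still see @q for a grace period */
          call_rcu(&q->rcu_head, blk_free_queue_rcu);
  }
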