author     Linus Torvalds <torvalds@linux-foundation.org>  2015-09-10 21:56:14 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2015-09-10 21:56:14 -0400
commit     b0a1ea51bda4c2bcdde460221e1772f3a4f8c44f (patch)
tree       9684c11b72718cd7e96e5eb93298690269ecf447
parent     33e247c7e58d335d70ecb84fd869091e2e4b8dcb (diff)
parent     69d7fde5909b614114343974cfc52cb8ff30b544 (diff)
Merge branch 'for-4.3/blkcg' of git://git.kernel.dk/linux-block
Pull blk-cg updates from Jens Axboe:
 "A bit later in the cycle, but this has been in the block tree for a
  while.  This is basically four patchsets from Tejun that improve our
  buffered cgroup writeback.  It was dependent on the other cgroup
  changes, but those went in earlier in this cycle.

  Series 1 is a set of 5 patches with cgroup writeback updates:

   - bdi_writeback iteration fix which could lead to some wb's being
     skipped or repeated during e.g. sync under memory pressure.

   - Simplification of the wb work wait mechanism.

   - Writeback tracepoints updated to report the cgroup.

  Series 2 is a set of updates for the CFQ cgroup writeback handling:

     cfq has always charged all async IOs to the root cgroup.  It
     didn't have much choice, as writeback didn't know about cgroups
     and there was no way to tell whom to blame for a given writeback
     IO.  Writeback finally grew support for cgroups and now tags each
     writeback IO with the appropriate cgroup to charge it against.

     This patchset updates cfq so that it follows the blkcg each bio
     is tagged with.  Async cfq_queues are now shared across
     cfq_group, which is per-cgroup, instead of the per-request_queue
     cfq_data.  This makes all IOs follow the weight-based IO resource
     distribution implemented by cfq.

   - Switched from GFP_ATOMIC to GFP_NOWAIT as suggested by Jeff.

   - Other misc review points addressed, acks added and rebased.

  Series 3 is the blkcg policy cleanup patches.  This patchset
  contains assorted cleanups for the blkcg_policy methods and
  blk[c]g_policy_data handling:

   - alloc/free added for blkg_policy_data; exit dropped.

   - alloc/free added for blkcg_policy_data.

   - blk-throttle's async percpu allocation is replaced with direct
     allocation.

   - all methods now take blk[c]g_policy_data instead of blkcg_gq or
     blkcg.

  And finally, series 4 is a set of patches cleaning up the blkcg
  stats handling.  blkcg's stats have always been somewhat of a mess
  and this patchset tries to improve the situation a bit:

   - blkcg entry point and blkg creation are consolidated, which is an
     improvement in itself and helps collect common stats on bio
     issue.

   - per-blkg stats are now accounted on bio issue rather than request
     completion, so that bio-based and request-based drivers can
     behave the same way.  The issue was spotted by Vivek.

   - cfq-iosched implements custom recursive stats and blk-throttle
     implements custom per-cpu stats.  This patchset makes blkcg core
     support both by default.

   - cfq-iosched and blk-throttle keep track of the same stats
     multiple times.  Unify them"

* 'for-4.3/blkcg' of git://git.kernel.dk/linux-block: (45 commits)
  blkcg: use CGROUP_WEIGHT_* scale for io.weight on the unified hierarchy
  blkcg: s/CFQ_WEIGHT_*/CFQ_WEIGHT_LEGACY_*/
  blkcg: implement interface for the unified hierarchy
  blkcg: misc preparations for unified hierarchy interface
  blkcg: separate out tg_conf_updated() from tg_set_conf()
  blkcg: move body parsing from blkg_conf_prep() to its callers
  blkcg: mark existing cftypes as legacy
  blkcg: rename subsystem name from blkio to io
  blkcg: refine error codes returned during blkcg configuration
  blkcg: remove unnecessary NULL checks from __cfqg_set_weight_device()
  blkcg: reduce stack usage of blkg_rwstat_recursive_sum()
  blkcg: remove cfqg_stats->sectors
  blkcg: move io_service_bytes and io_serviced stats into blkcg_gq
  blkcg: make blkg_[rw]stat_recursive_sum() to be able to index into blkcg_gq
  blkcg: make blkcg_[rw]stat per-cpu
  blkcg: add blkg_[rw]stat->aux_cnt and replace cfq_group->dead_stats with it
  blkcg: consolidate blkg creation in blkcg_bio_issue_check()
  blk-throttle: improve queue bypass handling
  blkcg: move root blkg lookup optimization from throtl_lookup_tg() to __blkg_lookup()
  blkcg: inline [__]blkg_lookup()
  ...
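For orientation, here is a minimal user-space sketch of the new unified-hierarchy
"io" interface (io.weight, io.max, io.stat) whose formats are documented in the
unified-hierarchy.txt hunks below.  It is not part of the series itself; the
cgroup path /sys/fs/cgroup/test and the device number 8:0 are illustrative
assumptions and need cgroup2 mounted with the io controller enabled.

/*
 * Sketch only: set a proportional weight and an absolute limit on a
 * non-root cgroup, then dump the per-device stats.  Paths and device
 * numbers are assumptions, not values mandated by the series.
 */
#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	/* cgroup interface files expect the whole setting in one write */
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	char line[256];
	FILE *f;

	/* io.weight (cfq-iosched): 1..10000, default 100 */
	write_knob("/sys/fs/cgroup/test/io.weight", "8:0 500");

	/* io.max (blk-throttle): omitted keys are left unlimited ("max") */
	write_knob("/sys/fs/cgroup/test/io.max", "8:0 rbps=2097152 wiops=max");

	/* io.stat: "$MAJ:$MIN rbytes=... wbytes=... rios=... wios=..." */
	f = fopen("/sys/fs/cgroup/test/io.stat", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}
	return 0;
}

The diffstat and full diff of the merge follow.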
-rw-r--r--  Documentation/cgroups/blkio-controller.txt   |  24
-rw-r--r--  Documentation/cgroups/unified-hierarchy.txt  |  61
-rw-r--r--  block/bio.c                                  |   2
-rw-r--r--  block/blk-cgroup.c                           | 524
-rw-r--r--  block/blk-core.c                             |   4
-rw-r--r--  block/blk-throttle.c                         | 505
-rw-r--r--  block/blk.h                                  |   5
-rw-r--r--  block/cfq-iosched.c                          | 651
-rw-r--r--  fs/fs-writeback.c                            | 139
-rw-r--r--  fs/kernfs/dir.c                              |  23
-rw-r--r--  include/linux/backing-dev.h                  |  26
-rw-r--r--  include/linux/blk-cgroup.h                   | 340
-rw-r--r--  include/linux/cgroup_subsys.h                |   2
-rw-r--r--  include/linux/kernfs.h                       |   4
-rw-r--r--  include/trace/events/writeback.h             | 180
-rw-r--r--  mm/backing-dev.c                             |   4
-rw-r--r--  mm/page-writeback.c                          |   6
17 files changed, 1422 insertions, 1078 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 68b6a6a470b0..12686bec37b9 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -201,7 +201,7 @@ Proportional weight policy files
201 specifies the number of bytes. 201 specifies the number of bytes.
202 202
203- blkio.io_serviced 203- blkio.io_serviced
204 - Number of IOs completed to/from the disk by the group. These 204 - Number of IOs (bio) issued to the disk by the group. These
205 are further divided by the type of operation - read or write, sync 205 are further divided by the type of operation - read or write, sync
206 or async. First two fields specify the major and minor number of the 206 or async. First two fields specify the major and minor number of the
207 device, third field specifies the operation type and the fourth field 207 device, third field specifies the operation type and the fourth field
@@ -327,18 +327,11 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
327 subjected to both the constraints. 327 subjected to both the constraints.
328 328
329- blkio.throttle.io_serviced 329- blkio.throttle.io_serviced
330 - Number of IOs (bio) completed to/from the disk by the group (as 330 - Number of IOs (bio) issued to the disk by the group. These
331 seen by throttling policy). These are further divided by the type 331 are further divided by the type of operation - read or write, sync
332 of operation - read or write, sync or async. First two fields specify 332 or async. First two fields specify the major and minor number of the
333 the major and minor number of the device, third field specifies the 333 device, third field specifies the operation type and the fourth field
334 operation type and the fourth field specifies the number of IOs. 334 specifies the number of IOs.
335
336 blkio.io_serviced does accounting as seen by CFQ and counts are in
337 number of requests (struct request). On the other hand,
338 blkio.throttle.io_serviced counts number of IO in terms of number
339 of bios as seen by throttling policy. These bios can later be
340 merged by elevator and total number of requests completed can be
341 lesser.
342 335
343- blkio.throttle.io_service_bytes 336- blkio.throttle.io_service_bytes
344 - Number of bytes transferred to/from the disk by the group. These 337 - Number of bytes transferred to/from the disk by the group. These
@@ -347,11 +340,6 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
347 device, third field specifies the operation type and the fourth field 340 device, third field specifies the operation type and the fourth field
348 specifies the number of bytes. 341 specifies the number of bytes.
349 342
350 These numbers should roughly be same as blkio.io_service_bytes as
351 updated by CFQ. The difference between two is that
352 blkio.io_service_bytes will not be updated if CFQ is not operating
353 on request queue.
354
355Common files among various policies 343Common files among various policies
356----------------------------------- 344-----------------------------------
357- blkio.reset_stats 345- blkio.reset_stats
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 1ee9caf29e57..e0975c2cf03d 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -27,7 +27,7 @@ CONTENTS
27 5-3-1. Format 27 5-3-1. Format
28 5-3-2. Control Knobs 28 5-3-2. Control Knobs
29 5-4. Per-Controller Changes 29 5-4. Per-Controller Changes
30 5-4-1. blkio 30 5-4-1. io
31 5-4-2. cpuset 31 5-4-2. cpuset
32 5-4-3. memory 32 5-4-3. memory
336. Planned Changes 336. Planned Changes
@@ -203,7 +203,7 @@ other issues. The mapping from nice level to weight isn't obvious or
203universal, and there are various other knobs which simply aren't 203universal, and there are various other knobs which simply aren't
204available for tasks. 204available for tasks.
205 205
206The blkio controller implicitly creates a hidden leaf node for each 206The io controller implicitly creates a hidden leaf node for each
207cgroup to host the tasks. The hidden leaf has its own copies of all 207cgroup to host the tasks. The hidden leaf has its own copies of all
208the knobs with "leaf_" prefixed. While this allows equivalent control 208the knobs with "leaf_" prefixed. While this allows equivalent control
209over internal tasks, it's with serious drawbacks. It always adds an 209over internal tasks, it's with serious drawbacks. It always adds an
@@ -438,9 +438,62 @@ may be specified in any order and not all pairs have to be specified.
438 438
4395-4. Per-Controller Changes 4395-4. Per-Controller Changes
440 440
4415-4-1. blkio 4415-4-1. io
442 442
443- blk-throttle becomes properly hierarchical. 443- blkio is renamed to io. The interface is overhauled anyway. The
444 new name is more in line with the other two major controllers, cpu
445 and memory, and better suited given that it may be used for cgroup
446 writeback without involving block layer.
447
448- Everything including stat is always hierarchical making separate
449 recursive stat files pointless and, as no internal node can have
450 tasks, leaf weights are meaningless. The operation model is
451 simplified and the interface is overhauled accordingly.
452
453 io.stat
454
455 The stat file. The reported stats are from the point where
456 bio's are issued to request_queue. The stats are counted
457 independent of which policies are enabled. Each line in the
458 file follows the following format. More fields may later be
459 added at the end.
460
461 $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wrios=$WIOS
462
463 io.weight
464
465 The weight setting, currently only available and effective if
466 cfq-iosched is in use for the target device. The weight is
467 between 1 and 10000 and defaults to 100. The first line
468 always contains the default weight in the following format to
469 use when per-device setting is missing.
470
471 default $WEIGHT
472
473 Subsequent lines list per-device weights of the following
474 format.
475
476 $MAJ:$MIN $WEIGHT
477
478 Writing "$WEIGHT" or "default $WEIGHT" changes the default
479 setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight
480 while "$MAJ:$MIN default" clears it.
481
482 This file is available only on non-root cgroups.
483
484 io.max
485
486 The maximum bandwidth and/or iops setting, only available if
487 blk-throttle is enabled. The file is of the following format.
488
489 $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS
490
491 ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
492 read/write IOs per second. "max" indicates no limit. Writing
493 to the file follows the same format but the individual
494 settings may be ommitted or specified in any order.
495
496 This file is available only on non-root cgroups.
444 497
445 498
4465-4-2. cpuset 4995-4-2. cpuset
diff --git a/block/bio.c b/block/bio.c
index 515b5434fe2d..ad3f276d74bc 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1990,7 +1990,7 @@ int bio_associate_current(struct bio *bio)
1990 1990
1991 get_io_context_active(ioc); 1991 get_io_context_active(ioc);
1992 bio->bi_ioc = ioc; 1992 bio->bi_ioc = ioc;
1993 bio->bi_css = task_get_css(current, blkio_cgrp_id); 1993 bio->bi_css = task_get_css(current, io_cgrp_id);
1994 return 0; 1994 return 0;
1995} 1995}
1996EXPORT_SYMBOL_GPL(bio_associate_current); 1996EXPORT_SYMBOL_GPL(bio_associate_current);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d6283b3f5db5..ac8370cb2515 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -24,6 +24,7 @@
24#include <linux/genhd.h> 24#include <linux/genhd.h>
25#include <linux/delay.h> 25#include <linux/delay.h>
26#include <linux/atomic.h> 26#include <linux/atomic.h>
27#include <linux/ctype.h>
27#include <linux/blk-cgroup.h> 28#include <linux/blk-cgroup.h>
28#include "blk.h" 29#include "blk.h"
29 30
@@ -68,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg)
68 return; 69 return;
69 70
70 for (i = 0; i < BLKCG_MAX_POLS; i++) 71 for (i = 0; i < BLKCG_MAX_POLS; i++)
71 kfree(blkg->pd[i]); 72 if (blkg->pd[i])
73 blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
72 74
73 blk_exit_rl(&blkg->rl); 75 if (blkg->blkcg != &blkcg_root)
76 blk_exit_rl(&blkg->rl);
77
78 blkg_rwstat_exit(&blkg->stat_ios);
79 blkg_rwstat_exit(&blkg->stat_bytes);
74 kfree(blkg); 80 kfree(blkg);
75} 81}
76 82
@@ -93,6 +99,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
93 if (!blkg) 99 if (!blkg)
94 return NULL; 100 return NULL;
95 101
102 if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
103 blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
104 goto err_free;
105
96 blkg->q = q; 106 blkg->q = q;
97 INIT_LIST_HEAD(&blkg->q_node); 107 INIT_LIST_HEAD(&blkg->q_node);
98 blkg->blkcg = blkcg; 108 blkg->blkcg = blkcg;
@@ -113,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
113 continue; 123 continue;
114 124
115 /* alloc per-policy data and attach it to blkg */ 125 /* alloc per-policy data and attach it to blkg */
116 pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); 126 pd = pol->pd_alloc_fn(gfp_mask, q->node);
117 if (!pd) 127 if (!pd)
118 goto err_free; 128 goto err_free;
119 129
@@ -129,26 +139,11 @@ err_free:
129 return NULL; 139 return NULL;
130} 140}
131 141
132/** 142struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
133 * __blkg_lookup - internal version of blkg_lookup() 143 struct request_queue *q, bool update_hint)
134 * @blkcg: blkcg of interest
135 * @q: request_queue of interest
136 * @update_hint: whether to update lookup hint with the result or not
137 *
138 * This is internal version and shouldn't be used by policy
139 * implementations. Looks up blkgs for the @blkcg - @q pair regardless of
140 * @q's bypass state. If @update_hint is %true, the caller should be
141 * holding @q->queue_lock and lookup hint is updated on success.
142 */
143struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
144 bool update_hint)
145{ 144{
146 struct blkcg_gq *blkg; 145 struct blkcg_gq *blkg;
147 146
148 blkg = rcu_dereference(blkcg->blkg_hint);
149 if (blkg && blkg->q == q)
150 return blkg;
151
152 /* 147 /*
153 * Hint didn't match. Look up from the radix tree. Note that the 148 * Hint didn't match. Look up from the radix tree. Note that the
154 * hint can only be updated under queue_lock as otherwise @blkg 149 * hint can only be updated under queue_lock as otherwise @blkg
@@ -166,29 +161,11 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
166 161
167 return NULL; 162 return NULL;
168} 163}
169 164EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
170/**
171 * blkg_lookup - lookup blkg for the specified blkcg - q pair
172 * @blkcg: blkcg of interest
173 * @q: request_queue of interest
174 *
175 * Lookup blkg for the @blkcg - @q pair. This function should be called
176 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
177 * - see blk_queue_bypass_start() for details.
178 */
179struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
180{
181 WARN_ON_ONCE(!rcu_read_lock_held());
182
183 if (unlikely(blk_queue_bypass(q)))
184 return NULL;
185 return __blkg_lookup(blkcg, q, false);
186}
187EXPORT_SYMBOL_GPL(blkg_lookup);
188 165
189/* 166/*
190 * If @new_blkg is %NULL, this function tries to allocate a new one as 167 * If @new_blkg is %NULL, this function tries to allocate a new one as
191 * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. 168 * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return.
192 */ 169 */
193static struct blkcg_gq *blkg_create(struct blkcg *blkcg, 170static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
194 struct request_queue *q, 171 struct request_queue *q,
@@ -203,12 +180,12 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
203 180
204 /* blkg holds a reference to blkcg */ 181 /* blkg holds a reference to blkcg */
205 if (!css_tryget_online(&blkcg->css)) { 182 if (!css_tryget_online(&blkcg->css)) {
206 ret = -EINVAL; 183 ret = -ENODEV;
207 goto err_free_blkg; 184 goto err_free_blkg;
208 } 185 }
209 186
210 wb_congested = wb_congested_get_create(&q->backing_dev_info, 187 wb_congested = wb_congested_get_create(&q->backing_dev_info,
211 blkcg->css.id, GFP_ATOMIC); 188 blkcg->css.id, GFP_NOWAIT);
212 if (!wb_congested) { 189 if (!wb_congested) {
213 ret = -ENOMEM; 190 ret = -ENOMEM;
214 goto err_put_css; 191 goto err_put_css;
@@ -216,7 +193,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
216 193
217 /* allocate */ 194 /* allocate */
218 if (!new_blkg) { 195 if (!new_blkg) {
219 new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); 196 new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT);
220 if (unlikely(!new_blkg)) { 197 if (unlikely(!new_blkg)) {
221 ret = -ENOMEM; 198 ret = -ENOMEM;
222 goto err_put_congested; 199 goto err_put_congested;
@@ -229,7 +206,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
229 if (blkcg_parent(blkcg)) { 206 if (blkcg_parent(blkcg)) {
230 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); 207 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
231 if (WARN_ON_ONCE(!blkg->parent)) { 208 if (WARN_ON_ONCE(!blkg->parent)) {
232 ret = -EINVAL; 209 ret = -ENODEV;
233 goto err_put_congested; 210 goto err_put_congested;
234 } 211 }
235 blkg_get(blkg->parent); 212 blkg_get(blkg->parent);
@@ -240,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
240 struct blkcg_policy *pol = blkcg_policy[i]; 217 struct blkcg_policy *pol = blkcg_policy[i];
241 218
242 if (blkg->pd[i] && pol->pd_init_fn) 219 if (blkg->pd[i] && pol->pd_init_fn)
243 pol->pd_init_fn(blkg); 220 pol->pd_init_fn(blkg->pd[i]);
244 } 221 }
245 222
246 /* insert */ 223 /* insert */
@@ -254,7 +231,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
254 struct blkcg_policy *pol = blkcg_policy[i]; 231 struct blkcg_policy *pol = blkcg_policy[i];
255 232
256 if (blkg->pd[i] && pol->pd_online_fn) 233 if (blkg->pd[i] && pol->pd_online_fn)
257 pol->pd_online_fn(blkg); 234 pol->pd_online_fn(blkg->pd[i]);
258 } 235 }
259 } 236 }
260 blkg->online = true; 237 blkg->online = true;
@@ -303,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
303 * we shouldn't allow anything to go through for a bypassing queue. 280 * we shouldn't allow anything to go through for a bypassing queue.
304 */ 281 */
305 if (unlikely(blk_queue_bypass(q))) 282 if (unlikely(blk_queue_bypass(q)))
306 return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); 283 return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
307 284
308 blkg = __blkg_lookup(blkcg, q, true); 285 blkg = __blkg_lookup(blkcg, q, true);
309 if (blkg) 286 if (blkg)
@@ -327,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
327 return blkg; 304 return blkg;
328 } 305 }
329} 306}
330EXPORT_SYMBOL_GPL(blkg_lookup_create);
331 307
332static void blkg_destroy(struct blkcg_gq *blkg) 308static void blkg_destroy(struct blkcg_gq *blkg)
333{ 309{
334 struct blkcg *blkcg = blkg->blkcg; 310 struct blkcg *blkcg = blkg->blkcg;
311 struct blkcg_gq *parent = blkg->parent;
335 int i; 312 int i;
336 313
337 lockdep_assert_held(blkg->q->queue_lock); 314 lockdep_assert_held(blkg->q->queue_lock);
@@ -345,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg)
345 struct blkcg_policy *pol = blkcg_policy[i]; 322 struct blkcg_policy *pol = blkcg_policy[i];
346 323
347 if (blkg->pd[i] && pol->pd_offline_fn) 324 if (blkg->pd[i] && pol->pd_offline_fn)
348 pol->pd_offline_fn(blkg); 325 pol->pd_offline_fn(blkg->pd[i]);
326 }
327
328 if (parent) {
329 blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
330 blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
349 } 331 }
332
350 blkg->online = false; 333 blkg->online = false;
351 334
352 radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); 335 radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
@@ -400,15 +383,6 @@ static void blkg_destroy_all(struct request_queue *q)
400void __blkg_release_rcu(struct rcu_head *rcu_head) 383void __blkg_release_rcu(struct rcu_head *rcu_head)
401{ 384{
402 struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); 385 struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
403 int i;
404
405 /* tell policies that this one is being freed */
406 for (i = 0; i < BLKCG_MAX_POLS; i++) {
407 struct blkcg_policy *pol = blkcg_policy[i];
408
409 if (blkg->pd[i] && pol->pd_exit_fn)
410 pol->pd_exit_fn(blkg);
411 }
412 386
413 /* release the blkcg and parent blkg refs this blkg has been holding */ 387 /* release the blkcg and parent blkg refs this blkg has been holding */
414 css_put(&blkg->blkcg->css); 388 css_put(&blkg->blkcg->css);
@@ -472,12 +446,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
472 * anyway. If you get hit by a race, retry. 446 * anyway. If you get hit by a race, retry.
473 */ 447 */
474 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { 448 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
449 blkg_rwstat_reset(&blkg->stat_bytes);
450 blkg_rwstat_reset(&blkg->stat_ios);
451
475 for (i = 0; i < BLKCG_MAX_POLS; i++) { 452 for (i = 0; i < BLKCG_MAX_POLS; i++) {
476 struct blkcg_policy *pol = blkcg_policy[i]; 453 struct blkcg_policy *pol = blkcg_policy[i];
477 454
478 if (blkcg_policy_enabled(blkg->q, pol) && 455 if (blkg->pd[i] && pol->pd_reset_stats_fn)
479 pol->pd_reset_stats_fn) 456 pol->pd_reset_stats_fn(blkg->pd[i]);
480 pol->pd_reset_stats_fn(blkg);
481 } 457 }
482 } 458 }
483 459
@@ -486,13 +462,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
486 return 0; 462 return 0;
487} 463}
488 464
489static const char *blkg_dev_name(struct blkcg_gq *blkg) 465const char *blkg_dev_name(struct blkcg_gq *blkg)
490{ 466{
491 /* some drivers (floppy) instantiate a queue w/o disk registered */ 467 /* some drivers (floppy) instantiate a queue w/o disk registered */
492 if (blkg->q->backing_dev_info.dev) 468 if (blkg->q->backing_dev_info.dev)
493 return dev_name(blkg->q->backing_dev_info.dev); 469 return dev_name(blkg->q->backing_dev_info.dev);
494 return NULL; 470 return NULL;
495} 471}
472EXPORT_SYMBOL_GPL(blkg_dev_name);
496 473
497/** 474/**
498 * blkcg_print_blkgs - helper for printing per-blkg data 475 * blkcg_print_blkgs - helper for printing per-blkg data
@@ -581,9 +558,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
581 558
582 for (i = 0; i < BLKG_RWSTAT_NR; i++) 559 for (i = 0; i < BLKG_RWSTAT_NR; i++)
583 seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], 560 seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
584 (unsigned long long)rwstat->cnt[i]); 561 (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
585 562
586 v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; 563 v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
564 atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
587 seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); 565 seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
588 return v; 566 return v;
589} 567}
@@ -620,31 +598,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
620} 598}
621EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); 599EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
622 600
601static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
602 struct blkg_policy_data *pd, int off)
603{
604 struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
605
606 return __blkg_prfill_rwstat(sf, pd, &rwstat);
607}
608
609/**
610 * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
611 * @sf: seq_file to print to
612 * @v: unused
613 *
614 * To be used as cftype->seq_show to print blkg->stat_bytes.
615 * cftype->private must be set to the blkcg_policy.
616 */
617int blkg_print_stat_bytes(struct seq_file *sf, void *v)
618{
619 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
620 blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
621 offsetof(struct blkcg_gq, stat_bytes), true);
622 return 0;
623}
624EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
625
626/**
627 * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios
628 * @sf: seq_file to print to
629 * @v: unused
630 *
631 * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private
632 * must be set to the blkcg_policy.
633 */
634int blkg_print_stat_ios(struct seq_file *sf, void *v)
635{
636 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
637 blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
638 offsetof(struct blkcg_gq, stat_ios), true);
639 return 0;
640}
641EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
642
643static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
644 struct blkg_policy_data *pd,
645 int off)
646{
647 struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
648 NULL, off);
649 return __blkg_prfill_rwstat(sf, pd, &rwstat);
650}
651
652/**
653 * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
654 * @sf: seq_file to print to
655 * @v: unused
656 */
657int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
658{
659 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
660 blkg_prfill_rwstat_field_recursive,
661 (void *)seq_cft(sf)->private,
662 offsetof(struct blkcg_gq, stat_bytes), true);
663 return 0;
664}
665EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
666
667/**
668 * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
669 * @sf: seq_file to print to
670 * @v: unused
671 */
672int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
673{
674 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
675 blkg_prfill_rwstat_field_recursive,
676 (void *)seq_cft(sf)->private,
677 offsetof(struct blkcg_gq, stat_ios), true);
678 return 0;
679}
680EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
681
623/** 682/**
624 * blkg_stat_recursive_sum - collect hierarchical blkg_stat 683 * blkg_stat_recursive_sum - collect hierarchical blkg_stat
625 * @pd: policy private data of interest 684 * @blkg: blkg of interest
626 * @off: offset to the blkg_stat in @pd 685 * @pol: blkcg_policy which contains the blkg_stat
686 * @off: offset to the blkg_stat in blkg_policy_data or @blkg
687 *
688 * Collect the blkg_stat specified by @blkg, @pol and @off and all its
689 * online descendants and their aux counts. The caller must be holding the
690 * queue lock for online tests.
627 * 691 *
628 * Collect the blkg_stat specified by @off from @pd and all its online 692 * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
629 * descendants and return the sum. The caller must be holding the queue 693 * at @off bytes into @blkg's blkg_policy_data of the policy.
630 * lock for online tests.
631 */ 694 */
632u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) 695u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
696 struct blkcg_policy *pol, int off)
633{ 697{
634 struct blkcg_policy *pol = blkcg_policy[pd->plid];
635 struct blkcg_gq *pos_blkg; 698 struct blkcg_gq *pos_blkg;
636 struct cgroup_subsys_state *pos_css; 699 struct cgroup_subsys_state *pos_css;
637 u64 sum = 0; 700 u64 sum = 0;
638 701
639 lockdep_assert_held(pd->blkg->q->queue_lock); 702 lockdep_assert_held(blkg->q->queue_lock);
640 703
641 rcu_read_lock(); 704 rcu_read_lock();
642 blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { 705 blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
643 struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); 706 struct blkg_stat *stat;
644 struct blkg_stat *stat = (void *)pos_pd + off; 707
708 if (!pos_blkg->online)
709 continue;
710
711 if (pol)
712 stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
713 else
714 stat = (void *)blkg + off;
645 715
646 if (pos_blkg->online) 716 sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
647 sum += blkg_stat_read(stat);
648 } 717 }
649 rcu_read_unlock(); 718 rcu_read_unlock();
650 719
@@ -654,37 +723,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
654 723
655/** 724/**
656 * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat 725 * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
657 * @pd: policy private data of interest 726 * @blkg: blkg of interest
658 * @off: offset to the blkg_stat in @pd 727 * @pol: blkcg_policy which contains the blkg_rwstat
728 * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
729 *
730 * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
731 * online descendants and their aux counts. The caller must be holding the
732 * queue lock for online tests.
659 * 733 *
660 * Collect the blkg_rwstat specified by @off from @pd and all its online 734 * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
661 * descendants and return the sum. The caller must be holding the queue 735 * is at @off bytes into @blkg's blkg_policy_data of the policy.
662 * lock for online tests.
663 */ 736 */
664struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, 737struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
665 int off) 738 struct blkcg_policy *pol, int off)
666{ 739{
667 struct blkcg_policy *pol = blkcg_policy[pd->plid];
668 struct blkcg_gq *pos_blkg; 740 struct blkcg_gq *pos_blkg;
669 struct cgroup_subsys_state *pos_css; 741 struct cgroup_subsys_state *pos_css;
670 struct blkg_rwstat sum = { }; 742 struct blkg_rwstat sum = { };
671 int i; 743 int i;
672 744
673 lockdep_assert_held(pd->blkg->q->queue_lock); 745 lockdep_assert_held(blkg->q->queue_lock);
674 746
675 rcu_read_lock(); 747 rcu_read_lock();
676 blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { 748 blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
677 struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); 749 struct blkg_rwstat *rwstat;
678 struct blkg_rwstat *rwstat = (void *)pos_pd + off;
679 struct blkg_rwstat tmp;
680 750
681 if (!pos_blkg->online) 751 if (!pos_blkg->online)
682 continue; 752 continue;
683 753
684 tmp = blkg_rwstat_read(rwstat); 754 if (pol)
755 rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
756 else
757 rwstat = (void *)pos_blkg + off;
685 758
686 for (i = 0; i < BLKG_RWSTAT_NR; i++) 759 for (i = 0; i < BLKG_RWSTAT_NR; i++)
687 sum.cnt[i] += tmp.cnt[i]; 760 atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
761 percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
762 &sum.aux_cnt[i]);
688 } 763 }
689 rcu_read_unlock(); 764 rcu_read_unlock();
690 765
@@ -700,29 +775,34 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
700 * @ctx: blkg_conf_ctx to be filled 775 * @ctx: blkg_conf_ctx to be filled
701 * 776 *
702 * Parse per-blkg config update from @input and initialize @ctx with the 777 * Parse per-blkg config update from @input and initialize @ctx with the
703 * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new 778 * result. @ctx->blkg points to the blkg to be updated and @ctx->body the
704 * value. This function returns with RCU read lock and queue lock held and 779 * part of @input following MAJ:MIN. This function returns with RCU read
705 * must be paired with blkg_conf_finish(). 780 * lock and queue lock held and must be paired with blkg_conf_finish().
706 */ 781 */
707int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, 782int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
708 const char *input, struct blkg_conf_ctx *ctx) 783 char *input, struct blkg_conf_ctx *ctx)
709 __acquires(rcu) __acquires(disk->queue->queue_lock) 784 __acquires(rcu) __acquires(disk->queue->queue_lock)
710{ 785{
711 struct gendisk *disk; 786 struct gendisk *disk;
712 struct blkcg_gq *blkg; 787 struct blkcg_gq *blkg;
713 unsigned int major, minor; 788 unsigned int major, minor;
714 unsigned long long v; 789 int key_len, part, ret;
715 int part, ret; 790 char *body;
716 791
717 if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) 792 if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
718 return -EINVAL; 793 return -EINVAL;
719 794
795 body = input + key_len;
796 if (!isspace(*body))
797 return -EINVAL;
798 body = skip_spaces(body);
799
720 disk = get_gendisk(MKDEV(major, minor), &part); 800 disk = get_gendisk(MKDEV(major, minor), &part);
721 if (!disk) 801 if (!disk)
722 return -EINVAL; 802 return -ENODEV;
723 if (part) { 803 if (part) {
724 put_disk(disk); 804 put_disk(disk);
725 return -EINVAL; 805 return -ENODEV;
726 } 806 }
727 807
728 rcu_read_lock(); 808 rcu_read_lock();
@@ -731,7 +811,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
731 if (blkcg_policy_enabled(disk->queue, pol)) 811 if (blkcg_policy_enabled(disk->queue, pol))
732 blkg = blkg_lookup_create(blkcg, disk->queue); 812 blkg = blkg_lookup_create(blkcg, disk->queue);
733 else 813 else
734 blkg = ERR_PTR(-EINVAL); 814 blkg = ERR_PTR(-EOPNOTSUPP);
735 815
736 if (IS_ERR(blkg)) { 816 if (IS_ERR(blkg)) {
737 ret = PTR_ERR(blkg); 817 ret = PTR_ERR(blkg);
@@ -753,7 +833,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
753 833
754 ctx->disk = disk; 834 ctx->disk = disk;
755 ctx->blkg = blkg; 835 ctx->blkg = blkg;
756 ctx->v = v; 836 ctx->body = body;
757 return 0; 837 return 0;
758} 838}
759EXPORT_SYMBOL_GPL(blkg_conf_prep); 839EXPORT_SYMBOL_GPL(blkg_conf_prep);
@@ -774,8 +854,55 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
774} 854}
775EXPORT_SYMBOL_GPL(blkg_conf_finish); 855EXPORT_SYMBOL_GPL(blkg_conf_finish);
776 856
857static int blkcg_print_stat(struct seq_file *sf, void *v)
858{
859 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
860 struct blkcg_gq *blkg;
861
862 rcu_read_lock();
863
864 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
865 const char *dname;
866 struct blkg_rwstat rwstat;
867 u64 rbytes, wbytes, rios, wios;
868
869 dname = blkg_dev_name(blkg);
870 if (!dname)
871 continue;
872
873 spin_lock_irq(blkg->q->queue_lock);
874
875 rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
876 offsetof(struct blkcg_gq, stat_bytes));
877 rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
878 wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
879
880 rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
881 offsetof(struct blkcg_gq, stat_ios));
882 rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
883 wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
884
885 spin_unlock_irq(blkg->q->queue_lock);
886
887 if (rbytes || wbytes || rios || wios)
888 seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
889 dname, rbytes, wbytes, rios, wios);
890 }
891
892 rcu_read_unlock();
893 return 0;
894}
895
777struct cftype blkcg_files[] = { 896struct cftype blkcg_files[] = {
778 { 897 {
898 .name = "stat",
899 .seq_show = blkcg_print_stat,
900 },
901 { } /* terminate */
902};
903
904struct cftype blkcg_legacy_files[] = {
905 {
779 .name = "reset_stats", 906 .name = "reset_stats",
780 .write_u64 = blkcg_reset_stats, 907 .write_u64 = blkcg_reset_stats,
781 }, 908 },
@@ -822,18 +949,19 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
822static void blkcg_css_free(struct cgroup_subsys_state *css) 949static void blkcg_css_free(struct cgroup_subsys_state *css)
823{ 950{
824 struct blkcg *blkcg = css_to_blkcg(css); 951 struct blkcg *blkcg = css_to_blkcg(css);
952 int i;
825 953
826 mutex_lock(&blkcg_pol_mutex); 954 mutex_lock(&blkcg_pol_mutex);
955
827 list_del(&blkcg->all_blkcgs_node); 956 list_del(&blkcg->all_blkcgs_node);
828 mutex_unlock(&blkcg_pol_mutex);
829 957
830 if (blkcg != &blkcg_root) { 958 for (i = 0; i < BLKCG_MAX_POLS; i++)
831 int i; 959 if (blkcg->cpd[i])
960 blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
832 961
833 for (i = 0; i < BLKCG_MAX_POLS; i++) 962 mutex_unlock(&blkcg_pol_mutex);
834 kfree(blkcg->pd[i]); 963
835 kfree(blkcg); 964 kfree(blkcg);
836 }
837} 965}
838 966
839static struct cgroup_subsys_state * 967static struct cgroup_subsys_state *
@@ -847,13 +975,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
847 975
848 if (!parent_css) { 976 if (!parent_css) {
849 blkcg = &blkcg_root; 977 blkcg = &blkcg_root;
850 goto done; 978 } else {
851 } 979 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
852 980 if (!blkcg) {
853 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); 981 ret = ERR_PTR(-ENOMEM);
854 if (!blkcg) { 982 goto free_blkcg;
855 ret = ERR_PTR(-ENOMEM); 983 }
856 goto free_blkcg;
857 } 984 }
858 985
859 for (i = 0; i < BLKCG_MAX_POLS ; i++) { 986 for (i = 0; i < BLKCG_MAX_POLS ; i++) {
@@ -866,23 +993,23 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
866 * check if the policy requires any specific per-cgroup 993 * check if the policy requires any specific per-cgroup
867 * data: if it does, allocate and initialize it. 994 * data: if it does, allocate and initialize it.
868 */ 995 */
869 if (!pol || !pol->cpd_size) 996 if (!pol || !pol->cpd_alloc_fn)
870 continue; 997 continue;
871 998
872 BUG_ON(blkcg->pd[i]); 999 cpd = pol->cpd_alloc_fn(GFP_KERNEL);
873 cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
874 if (!cpd) { 1000 if (!cpd) {
875 ret = ERR_PTR(-ENOMEM); 1001 ret = ERR_PTR(-ENOMEM);
876 goto free_pd_blkcg; 1002 goto free_pd_blkcg;
877 } 1003 }
878 blkcg->pd[i] = cpd; 1004 blkcg->cpd[i] = cpd;
1005 cpd->blkcg = blkcg;
879 cpd->plid = i; 1006 cpd->plid = i;
880 pol->cpd_init_fn(blkcg); 1007 if (pol->cpd_init_fn)
1008 pol->cpd_init_fn(cpd);
881 } 1009 }
882 1010
883done:
884 spin_lock_init(&blkcg->lock); 1011 spin_lock_init(&blkcg->lock);
885 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); 1012 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
886 INIT_HLIST_HEAD(&blkcg->blkg_list); 1013 INIT_HLIST_HEAD(&blkcg->blkg_list);
887#ifdef CONFIG_CGROUP_WRITEBACK 1014#ifdef CONFIG_CGROUP_WRITEBACK
888 INIT_LIST_HEAD(&blkcg->cgwb_list); 1015 INIT_LIST_HEAD(&blkcg->cgwb_list);
@@ -894,7 +1021,8 @@ done:
894 1021
895free_pd_blkcg: 1022free_pd_blkcg:
896 for (i--; i >= 0; i--) 1023 for (i--; i >= 0; i--)
897 kfree(blkcg->pd[i]); 1024 if (blkcg->cpd[i])
1025 blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
898free_blkcg: 1026free_blkcg:
899 kfree(blkcg); 1027 kfree(blkcg);
900 mutex_unlock(&blkcg_pol_mutex); 1028 mutex_unlock(&blkcg_pol_mutex);
@@ -938,7 +1066,7 @@ int blkcg_init_queue(struct request_queue *q)
938 radix_tree_preload_end(); 1066 radix_tree_preload_end();
939 1067
940 if (IS_ERR(blkg)) { 1068 if (IS_ERR(blkg)) {
941 kfree(new_blkg); 1069 blkg_free(new_blkg);
942 return PTR_ERR(blkg); 1070 return PTR_ERR(blkg);
943 } 1071 }
944 1072
@@ -1015,12 +1143,35 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
1015 return ret; 1143 return ret;
1016} 1144}
1017 1145
1018struct cgroup_subsys blkio_cgrp_subsys = { 1146static void blkcg_bind(struct cgroup_subsys_state *root_css)
1147{
1148 int i;
1149
1150 mutex_lock(&blkcg_pol_mutex);
1151
1152 for (i = 0; i < BLKCG_MAX_POLS; i++) {
1153 struct blkcg_policy *pol = blkcg_policy[i];
1154 struct blkcg *blkcg;
1155
1156 if (!pol || !pol->cpd_bind_fn)
1157 continue;
1158
1159 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
1160 if (blkcg->cpd[pol->plid])
1161 pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
1162 }
1163 mutex_unlock(&blkcg_pol_mutex);
1164}
1165
1166struct cgroup_subsys io_cgrp_subsys = {
1019 .css_alloc = blkcg_css_alloc, 1167 .css_alloc = blkcg_css_alloc,
1020 .css_offline = blkcg_css_offline, 1168 .css_offline = blkcg_css_offline,
1021 .css_free = blkcg_css_free, 1169 .css_free = blkcg_css_free,
1022 .can_attach = blkcg_can_attach, 1170 .can_attach = blkcg_can_attach,
1023 .legacy_cftypes = blkcg_files, 1171 .bind = blkcg_bind,
1172 .dfl_cftypes = blkcg_files,
1173 .legacy_cftypes = blkcg_legacy_files,
1174 .legacy_name = "blkio",
1024#ifdef CONFIG_MEMCG 1175#ifdef CONFIG_MEMCG
1025 /* 1176 /*
1026 * This ensures that, if available, memcg is automatically enabled 1177 * This ensures that, if available, memcg is automatically enabled
@@ -1030,7 +1181,7 @@ struct cgroup_subsys blkio_cgrp_subsys = {
1030 .depends_on = 1 << memory_cgrp_id, 1181 .depends_on = 1 << memory_cgrp_id,
1031#endif 1182#endif
1032}; 1183};
1033EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); 1184EXPORT_SYMBOL_GPL(io_cgrp_subsys);
1034 1185
1035/** 1186/**
1036 * blkcg_activate_policy - activate a blkcg policy on a request_queue 1187 * blkcg_activate_policy - activate a blkcg policy on a request_queue
@@ -1051,65 +1202,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
1051int blkcg_activate_policy(struct request_queue *q, 1202int blkcg_activate_policy(struct request_queue *q,
1052 const struct blkcg_policy *pol) 1203 const struct blkcg_policy *pol)
1053{ 1204{
1054 LIST_HEAD(pds); 1205 struct blkg_policy_data *pd_prealloc = NULL;
1055 struct blkcg_gq *blkg; 1206 struct blkcg_gq *blkg;
1056 struct blkg_policy_data *pd, *nd; 1207 int ret;
1057 int cnt = 0, ret;
1058 1208
1059 if (blkcg_policy_enabled(q, pol)) 1209 if (blkcg_policy_enabled(q, pol))
1060 return 0; 1210 return 0;
1061 1211
1062 /* count and allocate policy_data for all existing blkgs */
1063 blk_queue_bypass_start(q); 1212 blk_queue_bypass_start(q);
1064 spin_lock_irq(q->queue_lock); 1213pd_prealloc:
1065 list_for_each_entry(blkg, &q->blkg_list, q_node) 1214 if (!pd_prealloc) {
1066 cnt++; 1215 pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
1067 spin_unlock_irq(q->queue_lock); 1216 if (!pd_prealloc) {
1068
1069 /* allocate per-blkg policy data for all existing blkgs */
1070 while (cnt--) {
1071 pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
1072 if (!pd) {
1073 ret = -ENOMEM; 1217 ret = -ENOMEM;
1074 goto out_free; 1218 goto out_bypass_end;
1075 } 1219 }
1076 list_add_tail(&pd->alloc_node, &pds);
1077 } 1220 }
1078 1221
1079 /*
1080 * Install the allocated pds and cpds. With @q bypassing, no new blkg
1081 * should have been created while the queue lock was dropped.
1082 */
1083 spin_lock_irq(q->queue_lock); 1222 spin_lock_irq(q->queue_lock);
1084 1223
1085 list_for_each_entry(blkg, &q->blkg_list, q_node) { 1224 list_for_each_entry(blkg, &q->blkg_list, q_node) {
1086 if (WARN_ON(list_empty(&pds))) { 1225 struct blkg_policy_data *pd;
1087 /* umm... this shouldn't happen, just abort */
1088 ret = -ENOMEM;
1089 goto out_unlock;
1090 }
1091 pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
1092 list_del_init(&pd->alloc_node);
1093 1226
1094 /* grab blkcg lock too while installing @pd on @blkg */ 1227 if (blkg->pd[pol->plid])
1095 spin_lock(&blkg->blkcg->lock); 1228 continue;
1229
1230 pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node);
1231 if (!pd)
1232 swap(pd, pd_prealloc);
1233 if (!pd) {
1234 spin_unlock_irq(q->queue_lock);
1235 goto pd_prealloc;
1236 }
1096 1237
1097 blkg->pd[pol->plid] = pd; 1238 blkg->pd[pol->plid] = pd;
1098 pd->blkg = blkg; 1239 pd->blkg = blkg;
1099 pd->plid = pol->plid; 1240 pd->plid = pol->plid;
1100 pol->pd_init_fn(blkg); 1241 if (pol->pd_init_fn)
1101 1242 pol->pd_init_fn(pd);
1102 spin_unlock(&blkg->blkcg->lock);
1103 } 1243 }
1104 1244
1105 __set_bit(pol->plid, q->blkcg_pols); 1245 __set_bit(pol->plid, q->blkcg_pols);
1106 ret = 0; 1246 ret = 0;
1107out_unlock: 1247
1108 spin_unlock_irq(q->queue_lock); 1248 spin_unlock_irq(q->queue_lock);
1109out_free: 1249out_bypass_end:
1110 blk_queue_bypass_end(q); 1250 blk_queue_bypass_end(q);
1111 list_for_each_entry_safe(pd, nd, &pds, alloc_node) 1251 if (pd_prealloc)
1112 kfree(pd); 1252 pol->pd_free_fn(pd_prealloc);
1113 return ret; 1253 return ret;
1114} 1254}
1115EXPORT_SYMBOL_GPL(blkcg_activate_policy); 1255EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1139,13 +1279,12 @@ void blkcg_deactivate_policy(struct request_queue *q,
1139 /* grab blkcg lock too while removing @pd from @blkg */ 1279 /* grab blkcg lock too while removing @pd from @blkg */
1140 spin_lock(&blkg->blkcg->lock); 1280 spin_lock(&blkg->blkcg->lock);
1141 1281
1142 if (pol->pd_offline_fn) 1282 if (blkg->pd[pol->plid]) {
1143 pol->pd_offline_fn(blkg); 1283 if (pol->pd_offline_fn)
1144 if (pol->pd_exit_fn) 1284 pol->pd_offline_fn(blkg->pd[pol->plid]);
1145 pol->pd_exit_fn(blkg); 1285 pol->pd_free_fn(blkg->pd[pol->plid]);
1146 1286 blkg->pd[pol->plid] = NULL;
1147 kfree(blkg->pd[pol->plid]); 1287 }
1148 blkg->pd[pol->plid] = NULL;
1149 1288
1150 spin_unlock(&blkg->blkcg->lock); 1289 spin_unlock(&blkg->blkcg->lock);
1151 } 1290 }
@@ -1167,9 +1306,6 @@ int blkcg_policy_register(struct blkcg_policy *pol)
1167 struct blkcg *blkcg; 1306 struct blkcg *blkcg;
1168 int i, ret; 1307 int i, ret;
1169 1308
1170 if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
1171 return -EINVAL;
1172
1173 mutex_lock(&blkcg_pol_register_mutex); 1309 mutex_lock(&blkcg_pol_register_mutex);
1174 mutex_lock(&blkcg_pol_mutex); 1310 mutex_lock(&blkcg_pol_mutex);
1175 1311
@@ -1186,36 +1322,42 @@ int blkcg_policy_register(struct blkcg_policy *pol)
1186 blkcg_policy[pol->plid] = pol; 1322 blkcg_policy[pol->plid] = pol;
1187 1323
1188 /* allocate and install cpd's */ 1324 /* allocate and install cpd's */
1189 if (pol->cpd_size) { 1325 if (pol->cpd_alloc_fn) {
1190 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { 1326 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1191 struct blkcg_policy_data *cpd; 1327 struct blkcg_policy_data *cpd;
1192 1328
1193 cpd = kzalloc(pol->cpd_size, GFP_KERNEL); 1329 cpd = pol->cpd_alloc_fn(GFP_KERNEL);
1194 if (!cpd) { 1330 if (!cpd) {
1195 mutex_unlock(&blkcg_pol_mutex); 1331 mutex_unlock(&blkcg_pol_mutex);
1196 goto err_free_cpds; 1332 goto err_free_cpds;
1197 } 1333 }
1198 1334
1199 blkcg->pd[pol->plid] = cpd; 1335 blkcg->cpd[pol->plid] = cpd;
1336 cpd->blkcg = blkcg;
1200 cpd->plid = pol->plid; 1337 cpd->plid = pol->plid;
1201 pol->cpd_init_fn(blkcg); 1338 pol->cpd_init_fn(cpd);
1202 } 1339 }
1203 } 1340 }
1204 1341
1205 mutex_unlock(&blkcg_pol_mutex); 1342 mutex_unlock(&blkcg_pol_mutex);
1206 1343
1207 /* everything is in place, add intf files for the new policy */ 1344 /* everything is in place, add intf files for the new policy */
1208 if (pol->cftypes) 1345 if (pol->dfl_cftypes)
1209 WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys, 1346 WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
1210 pol->cftypes)); 1347 pol->dfl_cftypes));
1348 if (pol->legacy_cftypes)
1349 WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
1350 pol->legacy_cftypes));
1211 mutex_unlock(&blkcg_pol_register_mutex); 1351 mutex_unlock(&blkcg_pol_register_mutex);
1212 return 0; 1352 return 0;
1213 1353
1214err_free_cpds: 1354err_free_cpds:
1215 if (pol->cpd_size) { 1355 if (pol->cpd_alloc_fn) {
1216 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { 1356 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1217 kfree(blkcg->pd[pol->plid]); 1357 if (blkcg->cpd[pol->plid]) {
1218 blkcg->pd[pol->plid] = NULL; 1358 pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1359 blkcg->cpd[pol->plid] = NULL;
1360 }
1219 } 1361 }
1220 } 1362 }
1221 blkcg_policy[pol->plid] = NULL; 1363 blkcg_policy[pol->plid] = NULL;
@@ -1242,16 +1384,20 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
1242 goto out_unlock; 1384 goto out_unlock;
1243 1385
1244 /* kill the intf files first */ 1386 /* kill the intf files first */
1245 if (pol->cftypes) 1387 if (pol->dfl_cftypes)
1246 cgroup_rm_cftypes(pol->cftypes); 1388 cgroup_rm_cftypes(pol->dfl_cftypes);
1389 if (pol->legacy_cftypes)
1390 cgroup_rm_cftypes(pol->legacy_cftypes);
1247 1391
1248 /* remove cpds and unregister */ 1392 /* remove cpds and unregister */
1249 mutex_lock(&blkcg_pol_mutex); 1393 mutex_lock(&blkcg_pol_mutex);
1250 1394
1251 if (pol->cpd_size) { 1395 if (pol->cpd_alloc_fn) {
1252 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { 1396 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1253 kfree(blkcg->pd[pol->plid]); 1397 if (blkcg->cpd[pol->plid]) {
1254 blkcg->pd[pol->plid] = NULL; 1398 pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1399 blkcg->cpd[pol->plid] = NULL;
1400 }
1255 } 1401 }
1256 } 1402 }
1257 blkcg_policy[pol->plid] = NULL; 1403 blkcg_policy[pol->plid] = NULL;
diff --git a/block/blk-core.c b/block/blk-core.c
index 60912e983f16..2eb722d48773 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1888,8 +1888,8 @@ generic_make_request_checks(struct bio *bio)
1888 */ 1888 */
1889 create_io_context(GFP_ATOMIC, q->node); 1889 create_io_context(GFP_ATOMIC, q->node);
1890 1890
1891 if (blk_throtl_bio(q, bio)) 1891 if (!blkcg_bio_issue_check(q, bio))
1892 return false; /* throttled, will be resubmitted later */ 1892 return false;
1893 1893
1894 trace_block_bio_queue(q, bio); 1894 trace_block_bio_queue(q, bio);
1895 return true; 1895 return true;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b23193518ac7..c75a2636dd40 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -83,14 +83,6 @@ enum tg_state_flags {
83 83
84#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 84#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
85 85
86/* Per-cpu group stats */
87struct tg_stats_cpu {
88 /* total bytes transferred */
89 struct blkg_rwstat service_bytes;
90 /* total IOs serviced, post merge */
91 struct blkg_rwstat serviced;
92};
93
94struct throtl_grp { 86struct throtl_grp {
95 /* must be the first member */ 87 /* must be the first member */
96 struct blkg_policy_data pd; 88 struct blkg_policy_data pd;
@@ -141,12 +133,6 @@ struct throtl_grp {
141 /* When did we start a new slice */ 133 /* When did we start a new slice */
142 unsigned long slice_start[2]; 134 unsigned long slice_start[2];
143 unsigned long slice_end[2]; 135 unsigned long slice_end[2];
144
145 /* Per cpu stats pointer */
146 struct tg_stats_cpu __percpu *stats_cpu;
147
148 /* List of tgs waiting for per cpu stats memory to be allocated */
149 struct list_head stats_alloc_node;
150}; 136};
151 137
152struct throtl_data 138struct throtl_data
@@ -168,13 +154,6 @@ struct throtl_data
168 struct work_struct dispatch_work; 154 struct work_struct dispatch_work;
169}; 155};
170 156
171/* list and work item to allocate percpu group stats */
172static DEFINE_SPINLOCK(tg_stats_alloc_lock);
173static LIST_HEAD(tg_stats_alloc_list);
174
175static void tg_stats_alloc_fn(struct work_struct *);
176static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
177
178static void throtl_pending_timer_fn(unsigned long arg); 157static void throtl_pending_timer_fn(unsigned long arg);
179 158
180static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) 159static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
@@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
192 return pd_to_blkg(&tg->pd); 171 return pd_to_blkg(&tg->pd);
193} 172}
194 173
195static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
196{
197 return blkg_to_tg(td->queue->root_blkg);
198}
199
200/** 174/**
201 * sq_to_tg - return the throl_grp the specified service queue belongs to 175 * sq_to_tg - return the throl_grp the specified service queue belongs to
202 * @sq: the throtl_service_queue of interest 176 * @sq: the throtl_service_queue of interest
@@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
256 } \ 230 } \
257} while (0) 231} while (0)
258 232
259static void tg_stats_init(struct tg_stats_cpu *tg_stats)
260{
261 blkg_rwstat_init(&tg_stats->service_bytes);
262 blkg_rwstat_init(&tg_stats->serviced);
263}
264
265/*
266 * Worker for allocating per cpu stat for tgs. This is scheduled on the
267 * system_wq once there are some groups on the alloc_list waiting for
268 * allocation.
269 */
270static void tg_stats_alloc_fn(struct work_struct *work)
271{
272 static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */
273 struct delayed_work *dwork = to_delayed_work(work);
274 bool empty = false;
275
276alloc_stats:
277 if (!stats_cpu) {
278 int cpu;
279
280 stats_cpu = alloc_percpu(struct tg_stats_cpu);
281 if (!stats_cpu) {
282 /* allocation failed, try again after some time */
283 schedule_delayed_work(dwork, msecs_to_jiffies(10));
284 return;
285 }
286 for_each_possible_cpu(cpu)
287 tg_stats_init(per_cpu_ptr(stats_cpu, cpu));
288 }
289
290 spin_lock_irq(&tg_stats_alloc_lock);
291
292 if (!list_empty(&tg_stats_alloc_list)) {
293 struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
294 struct throtl_grp,
295 stats_alloc_node);
296 swap(tg->stats_cpu, stats_cpu);
297 list_del_init(&tg->stats_alloc_node);
298 }
299
300 empty = list_empty(&tg_stats_alloc_list);
301 spin_unlock_irq(&tg_stats_alloc_lock);
302 if (!empty)
303 goto alloc_stats;
304}
305
306static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) 233static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
307{ 234{
308 INIT_LIST_HEAD(&qn->node); 235 INIT_LIST_HEAD(&qn->node);
@@ -387,29 +314,46 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
387} 314}
388 315
389/* init a service_queue, assumes the caller zeroed it */ 316/* init a service_queue, assumes the caller zeroed it */
390static void throtl_service_queue_init(struct throtl_service_queue *sq, 317static void throtl_service_queue_init(struct throtl_service_queue *sq)
391 struct throtl_service_queue *parent_sq)
392{ 318{
393 INIT_LIST_HEAD(&sq->queued[0]); 319 INIT_LIST_HEAD(&sq->queued[0]);
394 INIT_LIST_HEAD(&sq->queued[1]); 320 INIT_LIST_HEAD(&sq->queued[1]);
395 sq->pending_tree = RB_ROOT; 321 sq->pending_tree = RB_ROOT;
396 sq->parent_sq = parent_sq;
397 setup_timer(&sq->pending_timer, throtl_pending_timer_fn, 322 setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
398 (unsigned long)sq); 323 (unsigned long)sq);
399} 324}
400 325
401static void throtl_service_queue_exit(struct throtl_service_queue *sq) 326static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
402{ 327{
403 del_timer_sync(&sq->pending_timer); 328 struct throtl_grp *tg;
329 int rw;
330
331 tg = kzalloc_node(sizeof(*tg), gfp, node);
332 if (!tg)
333 return NULL;
334
335 throtl_service_queue_init(&tg->service_queue);
336
337 for (rw = READ; rw <= WRITE; rw++) {
338 throtl_qnode_init(&tg->qnode_on_self[rw], tg);
339 throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
340 }
341
342 RB_CLEAR_NODE(&tg->rb_node);
343 tg->bps[READ] = -1;
344 tg->bps[WRITE] = -1;
345 tg->iops[READ] = -1;
346 tg->iops[WRITE] = -1;
347
348 return &tg->pd;
404} 349}
405 350
406static void throtl_pd_init(struct blkcg_gq *blkg) 351static void throtl_pd_init(struct blkg_policy_data *pd)
407{ 352{
408 struct throtl_grp *tg = blkg_to_tg(blkg); 353 struct throtl_grp *tg = pd_to_tg(pd);
354 struct blkcg_gq *blkg = tg_to_blkg(tg);
409 struct throtl_data *td = blkg->q->td; 355 struct throtl_data *td = blkg->q->td;
410 struct throtl_service_queue *parent_sq; 356 struct throtl_service_queue *sq = &tg->service_queue;
411 unsigned long flags;
412 int rw;
413 357
414 /* 358 /*
415 * If on the default hierarchy, we switch to properly hierarchical 359 * If on the default hierarchy, we switch to properly hierarchical
@@ -424,35 +368,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
424 * Limits of a group don't interact with limits of other groups 368 * Limits of a group don't interact with limits of other groups
425 * regardless of the position of the group in the hierarchy. 369 * regardless of the position of the group in the hierarchy.
426 */ 370 */
427 parent_sq = &td->service_queue; 371 sq->parent_sq = &td->service_queue;
428
429 if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) 372 if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
430 parent_sq = &blkg_to_tg(blkg->parent)->service_queue; 373 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
431
432 throtl_service_queue_init(&tg->service_queue, parent_sq);
433
434 for (rw = READ; rw <= WRITE; rw++) {
435 throtl_qnode_init(&tg->qnode_on_self[rw], tg);
436 throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
437 }
438
439 RB_CLEAR_NODE(&tg->rb_node);
440 tg->td = td; 374 tg->td = td;
441
442 tg->bps[READ] = -1;
443 tg->bps[WRITE] = -1;
444 tg->iops[READ] = -1;
445 tg->iops[WRITE] = -1;
446
447 /*
448 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
449 * but percpu allocator can't be called from IO path. Queue tg on
450 * tg_stats_alloc_list and allocate from work item.
451 */
452 spin_lock_irqsave(&tg_stats_alloc_lock, flags);
453 list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
454 schedule_delayed_work(&tg_stats_alloc_work, 0);
455 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
456} 375}
457 376
458/* 377/*
@@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg)
470 (tg->bps[rw] != -1 || tg->iops[rw] != -1); 389 (tg->bps[rw] != -1 || tg->iops[rw] != -1);
471} 390}
472 391
473static void throtl_pd_online(struct blkcg_gq *blkg) 392static void throtl_pd_online(struct blkg_policy_data *pd)
474{ 393{
475 /* 394 /*
476 * We don't want new groups to escape the limits of its ancestors. 395 * We don't want new groups to escape the limits of its ancestors.
477 * Update has_rules[] after a new group is brought online. 396 * Update has_rules[] after a new group is brought online.
478 */ 397 */
479 tg_update_has_rules(blkg_to_tg(blkg)); 398 tg_update_has_rules(pd_to_tg(pd));
480}
481
482static void throtl_pd_exit(struct blkcg_gq *blkg)
483{
484 struct throtl_grp *tg = blkg_to_tg(blkg);
485 unsigned long flags;
486
487 spin_lock_irqsave(&tg_stats_alloc_lock, flags);
488 list_del_init(&tg->stats_alloc_node);
489 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
490
491 free_percpu(tg->stats_cpu);
492
493 throtl_service_queue_exit(&tg->service_queue);
494}
495
496static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
497{
498 struct throtl_grp *tg = blkg_to_tg(blkg);
499 int cpu;
500
501 if (tg->stats_cpu == NULL)
502 return;
503
504 for_each_possible_cpu(cpu) {
505 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
506
507 blkg_rwstat_reset(&sc->service_bytes);
508 blkg_rwstat_reset(&sc->serviced);
509 }
510}
511
512static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
513 struct blkcg *blkcg)
514{
515 /*
516 * This is the common case when there are no blkcgs. Avoid lookup
517 * in this case
518 */
519 if (blkcg == &blkcg_root)
520 return td_root_tg(td);
521
522 return blkg_to_tg(blkg_lookup(blkcg, td->queue));
523} 399}
524 400
525static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, 401static void throtl_pd_free(struct blkg_policy_data *pd)
526 struct blkcg *blkcg)
527{ 402{
528 struct request_queue *q = td->queue; 403 struct throtl_grp *tg = pd_to_tg(pd);
529 struct throtl_grp *tg = NULL;
530
531 /*
532 * This is the common case when there are no blkcgs. Avoid lookup
533 * in this case
534 */
535 if (blkcg == &blkcg_root) {
536 tg = td_root_tg(td);
537 } else {
538 struct blkcg_gq *blkg;
539
540 blkg = blkg_lookup_create(blkcg, q);
541
542 /* if %NULL and @q is alive, fall back to root_tg */
543 if (!IS_ERR(blkg))
544 tg = blkg_to_tg(blkg);
545 else if (!blk_queue_dying(q))
546 tg = td_root_tg(td);
547 }
548 404
549 return tg; 405 del_timer_sync(&tg->service_queue.pending_timer);
406 kfree(tg);
550} 407}
551 408
552static struct throtl_grp * 409static struct throtl_grp *
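The throtl_pd_alloc()/throtl_pd_free() pair above shows the new blkcg policy-data lifecycle: rather than having the core allocate an opaque pd_size blob and only call init/exit hooks, each policy now allocates and frees its own per-group structure with the blkg_policy_data embedded in it, and can fail cleanly instead of deferring per-cpu allocation to a work item the way the removed tg_stats_alloc machinery did. A minimal sketch of the pattern for a hypothetical "foo" policy (the names and the some_limit field are illustrative, not part of this patch):

struct foo_grp {
	struct blkg_policy_data pd;	/* embedded policy data */
	u64 some_limit;
};

static inline struct foo_grp *pd_to_foo(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct foo_grp, pd) : NULL;
}

static struct blkg_policy_data *foo_pd_alloc(gfp_t gfp, int node)
{
	/* may be called from the IO path, so honor @gfp and allocate on @node */
	struct foo_grp *fg = kzalloc_node(sizeof(*fg), gfp, node);

	return fg ? &fg->pd : NULL;
}

static void foo_pd_free(struct blkg_policy_data *pd)
{
	kfree(pd_to_foo(pd));
}

The hooks are wired up through .pd_alloc_fn and .pd_free_fn in struct blkcg_policy, as blkcg_policy_throtl does further down.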
@@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
956 return 0; 813 return 0;
957} 814}
958 815
959static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
960 int rw)
961{
962 struct throtl_grp *tg = blkg_to_tg(blkg);
963 struct tg_stats_cpu *stats_cpu;
964 unsigned long flags;
965
966 /* If per cpu stats are not allocated yet, don't do any accounting. */
967 if (tg->stats_cpu == NULL)
968 return;
969
970 /*
971 * Disabling interrupts to provide mutual exclusion between two
972 * writes on same cpu. It probably is not needed for 64bit. Not
973 * optimizing that case yet.
974 */
975 local_irq_save(flags);
976
977 stats_cpu = this_cpu_ptr(tg->stats_cpu);
978
979 blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
980 blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
981
982 local_irq_restore(flags);
983}
984
985static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) 816static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
986{ 817{
987 bool rw = bio_data_dir(bio); 818 bool rw = bio_data_dir(bio);
@@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
995 * more than once as a throttled bio will go through blk-throtl the 826 * more than once as a throttled bio will go through blk-throtl the
996 * second time when it eventually gets issued. Set it when a bio 827 * second time when it eventually gets issued. Set it when a bio
997 * is being charged to a tg. 828 * is being charged to a tg.
998 *
999 * Dispatch stats aren't recursive and each @bio should only be
1000 * accounted by the @tg it was originally associated with. Let's
1001 * update the stats when setting REQ_THROTTLED for the first time
1002 * which is guaranteed to be for the @bio's original tg.
1003 */ 829 */
1004 if (!(bio->bi_rw & REQ_THROTTLED)) { 830 if (!(bio->bi_rw & REQ_THROTTLED))
1005 bio->bi_rw |= REQ_THROTTLED; 831 bio->bi_rw |= REQ_THROTTLED;
1006 throtl_update_dispatch_stats(tg_to_blkg(tg),
1007 bio->bi_iter.bi_size, bio->bi_rw);
1008 }
1009} 832}
1010 833
1011/** 834/**
@@ -1285,34 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
1285 } 1108 }
1286} 1109}
1287 1110
1288static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
1289 struct blkg_policy_data *pd, int off)
1290{
1291 struct throtl_grp *tg = pd_to_tg(pd);
1292 struct blkg_rwstat rwstat = { }, tmp;
1293 int i, cpu;
1294
1295 if (tg->stats_cpu == NULL)
1296 return 0;
1297
1298 for_each_possible_cpu(cpu) {
1299 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
1300
1301 tmp = blkg_rwstat_read((void *)sc + off);
1302 for (i = 0; i < BLKG_RWSTAT_NR; i++)
1303 rwstat.cnt[i] += tmp.cnt[i];
1304 }
1305
1306 return __blkg_prfill_rwstat(sf, pd, &rwstat);
1307}
1308
1309static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
1310{
1311 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
1312 &blkcg_policy_throtl, seq_cft(sf)->private, true);
1313 return 0;
1314}
1315
1316static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, 1111static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
1317 int off) 1112 int off)
1318{ 1113{
@@ -1349,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
1349 return 0; 1144 return 0;
1350} 1145}
1351 1146
1352static ssize_t tg_set_conf(struct kernfs_open_file *of, 1147static void tg_conf_updated(struct throtl_grp *tg)
1353 char *buf, size_t nbytes, loff_t off, bool is_u64)
1354{ 1148{
1355 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1149 struct throtl_service_queue *sq = &tg->service_queue;
1356 struct blkg_conf_ctx ctx;
1357 struct throtl_grp *tg;
1358 struct throtl_service_queue *sq;
1359 struct blkcg_gq *blkg;
1360 struct cgroup_subsys_state *pos_css; 1150 struct cgroup_subsys_state *pos_css;
1361 int ret; 1151 struct blkcg_gq *blkg;
1362
1363 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1364 if (ret)
1365 return ret;
1366
1367 tg = blkg_to_tg(ctx.blkg);
1368 sq = &tg->service_queue;
1369
1370 if (!ctx.v)
1371 ctx.v = -1;
1372
1373 if (is_u64)
1374 *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v;
1375 else
1376 *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v;
1377 1152
1378 throtl_log(&tg->service_queue, 1153 throtl_log(&tg->service_queue,
1379 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", 1154 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
@@ -1387,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
1387 * restrictions in the whole hierarchy and allows them to bypass 1162 * restrictions in the whole hierarchy and allows them to bypass
1388 * blk-throttle. 1163 * blk-throttle.
1389 */ 1164 */
1390 blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) 1165 blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg))
1391 tg_update_has_rules(blkg_to_tg(blkg)); 1166 tg_update_has_rules(blkg_to_tg(blkg));
1392 1167
1393 /* 1168 /*
@@ -1405,9 +1180,39 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
1405 tg_update_disptime(tg); 1180 tg_update_disptime(tg);
1406 throtl_schedule_next_dispatch(sq->parent_sq, true); 1181 throtl_schedule_next_dispatch(sq->parent_sq, true);
1407 } 1182 }
1183}
1184
1185static ssize_t tg_set_conf(struct kernfs_open_file *of,
1186 char *buf, size_t nbytes, loff_t off, bool is_u64)
1187{
1188 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1189 struct blkg_conf_ctx ctx;
1190 struct throtl_grp *tg;
1191 int ret;
1192 u64 v;
1408 1193
1194 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1195 if (ret)
1196 return ret;
1197
1198 ret = -EINVAL;
1199 if (sscanf(ctx.body, "%llu", &v) != 1)
1200 goto out_finish;
1201 if (!v)
1202 v = -1;
1203
1204 tg = blkg_to_tg(ctx.blkg);
1205
1206 if (is_u64)
1207 *(u64 *)((void *)tg + of_cft(of)->private) = v;
1208 else
1209 *(unsigned int *)((void *)tg + of_cft(of)->private) = v;
1210
1211 tg_conf_updated(tg);
1212 ret = 0;
1213out_finish:
1409 blkg_conf_finish(&ctx); 1214 blkg_conf_finish(&ctx);
1410 return nbytes; 1215 return ret ?: nbytes;
1411} 1216}
1412 1217
1413static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, 1218static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
@@ -1422,7 +1227,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
1422 return tg_set_conf(of, buf, nbytes, off, false); 1227 return tg_set_conf(of, buf, nbytes, off, false);
1423} 1228}
1424 1229
1425static struct cftype throtl_files[] = { 1230static struct cftype throtl_legacy_files[] = {
1426 { 1231 {
1427 .name = "throttle.read_bps_device", 1232 .name = "throttle.read_bps_device",
1428 .private = offsetof(struct throtl_grp, bps[READ]), 1233 .private = offsetof(struct throtl_grp, bps[READ]),
@@ -1449,13 +1254,124 @@ static struct cftype throtl_files[] = {
1449 }, 1254 },
1450 { 1255 {
1451 .name = "throttle.io_service_bytes", 1256 .name = "throttle.io_service_bytes",
1452 .private = offsetof(struct tg_stats_cpu, service_bytes), 1257 .private = (unsigned long)&blkcg_policy_throtl,
1453 .seq_show = tg_print_cpu_rwstat, 1258 .seq_show = blkg_print_stat_bytes,
1454 }, 1259 },
1455 { 1260 {
1456 .name = "throttle.io_serviced", 1261 .name = "throttle.io_serviced",
1457 .private = offsetof(struct tg_stats_cpu, serviced), 1262 .private = (unsigned long)&blkcg_policy_throtl,
1458 .seq_show = tg_print_cpu_rwstat, 1263 .seq_show = blkg_print_stat_ios,
1264 },
1265 { } /* terminate */
1266};
1267
1268static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
1269 int off)
1270{
1271 struct throtl_grp *tg = pd_to_tg(pd);
1272 const char *dname = blkg_dev_name(pd->blkg);
1273 char bufs[4][21] = { "max", "max", "max", "max" };
1274
1275 if (!dname)
1276 return 0;
1277 if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
1278 tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
1279 return 0;
1280
1281 if (tg->bps[READ] != -1)
1282 snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
1283 if (tg->bps[WRITE] != -1)
1284 snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
1285 if (tg->iops[READ] != -1)
1286 snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
1287 if (tg->iops[WRITE] != -1)
1288 snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
1289
1290 seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
1291 dname, bufs[0], bufs[1], bufs[2], bufs[3]);
1292 return 0;
1293}
1294
1295static int tg_print_max(struct seq_file *sf, void *v)
1296{
1297 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
1298 &blkcg_policy_throtl, seq_cft(sf)->private, false);
1299 return 0;
1300}
1301
1302static ssize_t tg_set_max(struct kernfs_open_file *of,
1303 char *buf, size_t nbytes, loff_t off)
1304{
1305 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1306 struct blkg_conf_ctx ctx;
1307 struct throtl_grp *tg;
1308 u64 v[4];
1309 int ret;
1310
1311 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1312 if (ret)
1313 return ret;
1314
1315 tg = blkg_to_tg(ctx.blkg);
1316
1317 v[0] = tg->bps[READ];
1318 v[1] = tg->bps[WRITE];
1319 v[2] = tg->iops[READ];
1320 v[3] = tg->iops[WRITE];
1321
1322 while (true) {
1323 char tok[27]; /* wiops=18446744073709551616 */
1324 char *p;
1325 u64 val = -1;
1326 int len;
1327
1328 if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
1329 break;
1330 if (tok[0] == '\0')
1331 break;
1332 ctx.body += len;
1333
1334 ret = -EINVAL;
1335 p = tok;
1336 strsep(&p, "=");
1337 if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
1338 goto out_finish;
1339
1340 ret = -ERANGE;
1341 if (!val)
1342 goto out_finish;
1343
1344 ret = -EINVAL;
1345 if (!strcmp(tok, "rbps"))
1346 v[0] = val;
1347 else if (!strcmp(tok, "wbps"))
1348 v[1] = val;
1349 else if (!strcmp(tok, "riops"))
1350 v[2] = min_t(u64, val, UINT_MAX);
1351 else if (!strcmp(tok, "wiops"))
1352 v[3] = min_t(u64, val, UINT_MAX);
1353 else
1354 goto out_finish;
1355 }
1356
1357 tg->bps[READ] = v[0];
1358 tg->bps[WRITE] = v[1];
1359 tg->iops[READ] = v[2];
1360 tg->iops[WRITE] = v[3];
1361
1362 tg_conf_updated(tg);
1363 ret = 0;
1364out_finish:
1365 blkg_conf_finish(&ctx);
1366 return ret ?: nbytes;
1367}
1368
1369static struct cftype throtl_files[] = {
1370 {
1371 .name = "max",
1372 .flags = CFTYPE_NOT_ON_ROOT,
1373 .seq_show = tg_print_max,
1374 .write = tg_set_max,
1459 }, 1375 },
1460 { } /* terminate */ 1376 { } /* terminate */
1461}; 1377};
@@ -1468,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q)
1468} 1384}
1469 1385
1470static struct blkcg_policy blkcg_policy_throtl = { 1386static struct blkcg_policy blkcg_policy_throtl = {
1471 .pd_size = sizeof(struct throtl_grp), 1387 .dfl_cftypes = throtl_files,
1472 .cftypes = throtl_files, 1388 .legacy_cftypes = throtl_legacy_files,
1473 1389
1390 .pd_alloc_fn = throtl_pd_alloc,
1474 .pd_init_fn = throtl_pd_init, 1391 .pd_init_fn = throtl_pd_init,
1475 .pd_online_fn = throtl_pd_online, 1392 .pd_online_fn = throtl_pd_online,
1476 .pd_exit_fn = throtl_pd_exit, 1393 .pd_free_fn = throtl_pd_free,
1477 .pd_reset_stats_fn = throtl_pd_reset_stats,
1478}; 1394};
1479 1395
1480bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 1396bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
1397 struct bio *bio)
1481{ 1398{
1482 struct throtl_data *td = q->td;
1483 struct throtl_qnode *qn = NULL; 1399 struct throtl_qnode *qn = NULL;
1484 struct throtl_grp *tg; 1400 struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
1485 struct throtl_service_queue *sq; 1401 struct throtl_service_queue *sq;
1486 bool rw = bio_data_dir(bio); 1402 bool rw = bio_data_dir(bio);
1487 struct blkcg *blkcg;
1488 bool throttled = false; 1403 bool throttled = false;
1489 1404
1405 WARN_ON_ONCE(!rcu_read_lock_held());
1406
1490 /* see throtl_charge_bio() */ 1407 /* see throtl_charge_bio() */
1491 if (bio->bi_rw & REQ_THROTTLED) 1408 if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw])
1492 goto out; 1409 goto out;
1493 1410
1494 /*
1495 * A throtl_grp pointer retrieved under rcu can be used to access
1496 * basic fields like stats and io rates. If a group has no rules,
1497 * just update the dispatch stats in lockless manner and return.
1498 */
1499 rcu_read_lock();
1500 blkcg = bio_blkcg(bio);
1501 tg = throtl_lookup_tg(td, blkcg);
1502 if (tg) {
1503 if (!tg->has_rules[rw]) {
1504 throtl_update_dispatch_stats(tg_to_blkg(tg),
1505 bio->bi_iter.bi_size, bio->bi_rw);
1506 goto out_unlock_rcu;
1507 }
1508 }
1509
1510 /*
1511 * Either group has not been allocated yet or it is not an unlimited
1512 * IO group
1513 */
1514 spin_lock_irq(q->queue_lock); 1411 spin_lock_irq(q->queue_lock);
1515 tg = throtl_lookup_create_tg(td, blkcg); 1412
1516 if (unlikely(!tg)) 1413 if (unlikely(blk_queue_bypass(q)))
1517 goto out_unlock; 1414 goto out_unlock;
1518 1415
1519 sq = &tg->service_queue; 1416 sq = &tg->service_queue;
@@ -1580,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1580 1477
1581out_unlock: 1478out_unlock:
1582 spin_unlock_irq(q->queue_lock); 1479 spin_unlock_irq(q->queue_lock);
1583out_unlock_rcu:
1584 rcu_read_unlock();
1585out: 1480out:
1586 /* 1481 /*
1587 * As multiple blk-throtls may stack in the same issue path, we 1482 * As multiple blk-throtls may stack in the same issue path, we
@@ -1667,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q)
1667 return -ENOMEM; 1562 return -ENOMEM;
1668 1563
1669 INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); 1564 INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
1670 throtl_service_queue_init(&td->service_queue, NULL); 1565 throtl_service_queue_init(&td->service_queue);
1671 1566
1672 q->td = td; 1567 q->td = td;
1673 td->queue = q; 1568 td->queue = q;
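tg_set_max() above implements the nested key=value syntax for the unified-hierarchy "max" file: tokens look like rbps=N, wbps=N, riops=N, wiops=N, the literal string "max" means unlimited, keys that are not mentioned keep their current value (which is why v[] is seeded from the group's existing bps/iops), and a zero limit is rejected. A stand-alone user-space sketch of the same tokenizing approach; parse_limits() and LIMIT_MAX are illustrative names, not kernel code:

#define _DEFAULT_SOURCE			/* for strsep() on glibc */
#include <stdio.h>
#include <string.h>
#include <inttypes.h>

#define LIMIT_MAX UINT64_MAX		/* "max" == unlimited, mirrors bps/iops == -1 */

/* parse "rbps=N wbps=max ..." into v[4] = { rbps, wbps, riops, wiops } */
static int parse_limits(char *body, uint64_t v[4])
{
	static const char * const keys[4] = { "rbps", "wbps", "riops", "wiops" };
	char *tok;

	while ((tok = strsep(&body, " \t\n")) != NULL) {
		char *val = tok, *key;
		uint64_t x;
		int i;

		if (*tok == '\0')
			continue;		/* skip empty tokens */
		key = strsep(&val, "=");
		if (!val)
			return -1;		/* no '=' in token */
		if (!strcmp(val, "max"))
			x = LIMIT_MAX;
		else if (sscanf(val, "%" SCNu64, &x) != 1 || !x)
			return -1;		/* not a number, or zero */

		for (i = 0; i < 4; i++)
			if (!strcmp(key, keys[i]))
				break;
		if (i == 4)
			return -1;		/* unknown key */
		v[i] = x;
	}
	return 0;
}

int main(void)
{
	uint64_t v[4] = { LIMIT_MAX, LIMIT_MAX, LIMIT_MAX, LIMIT_MAX };
	char line[] = "rbps=1048576 wiops=max";

	if (parse_limits(line, v))
		return 1;
	printf("rbps=%" PRIu64 " wbps=%" PRIu64 " riops=%" PRIu64 " wiops=%" PRIu64 "\n",
	       v[0], v[1], v[2], v[3]);
	return 0;
}

The kernel version additionally clamps riops/wiops to UINT_MAX and writes the four values back into the throtl_grp only after the whole line has parsed.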
diff --git a/block/blk.h b/block/blk.h
index 838188b35a83..98614ad37c81 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -272,15 +272,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
272 * Internal throttling interface 272 * Internal throttling interface
273 */ 273 */
274#ifdef CONFIG_BLK_DEV_THROTTLING 274#ifdef CONFIG_BLK_DEV_THROTTLING
275extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
276extern void blk_throtl_drain(struct request_queue *q); 275extern void blk_throtl_drain(struct request_queue *q);
277extern int blk_throtl_init(struct request_queue *q); 276extern int blk_throtl_init(struct request_queue *q);
278extern void blk_throtl_exit(struct request_queue *q); 277extern void blk_throtl_exit(struct request_queue *q);
279#else /* CONFIG_BLK_DEV_THROTTLING */ 278#else /* CONFIG_BLK_DEV_THROTTLING */
280static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
281{
282 return false;
283}
284static inline void blk_throtl_drain(struct request_queue *q) { } 279static inline void blk_throtl_drain(struct request_queue *q) { }
285static inline int blk_throtl_init(struct request_queue *q) { return 0; } 280static inline int blk_throtl_init(struct request_queue *q) { return 0; }
286static inline void blk_throtl_exit(struct request_queue *q) { } 281static inline void blk_throtl_exit(struct request_queue *q) { }
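With the prototype and the !CONFIG_BLK_DEV_THROTTLING stub gone from blk.h, blk_throtl_bio() is no longer called with just the queue: the new signature above takes the blkcg_gq the caller already resolved, warns if it is not running under rcu_read_lock(), and falls back to q->root_blkg when no group was found. A hypothetical caller sketch using only functions visible in this patch (submit_through_throttle() itself is illustrative; in this series the real call site is consolidated into blkcg's bio-issue path):

static bool submit_through_throttle(struct request_queue *q, struct bio *bio)
{
	struct blkcg_gq *blkg;
	bool throttled;

	rcu_read_lock();
	/* blkg may be NULL; blk_throtl_bio() then charges the root group */
	blkg = blkg_lookup(bio_blkcg(bio), q);
	throttled = blk_throtl_bio(q, blkg, bio);
	rcu_read_unlock();

	return throttled;
}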
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c62bb2e650b8..04de88463a98 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -68,9 +68,9 @@ static struct kmem_cache *cfq_pool;
68#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 68#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
69 69
70/* blkio-related constants */ 70/* blkio-related constants */
71#define CFQ_WEIGHT_MIN 10 71#define CFQ_WEIGHT_LEGACY_MIN 10
72#define CFQ_WEIGHT_MAX 1000 72#define CFQ_WEIGHT_LEGACY_DFL 500
73#define CFQ_WEIGHT_DEFAULT 500 73#define CFQ_WEIGHT_LEGACY_MAX 1000
74 74
75struct cfq_ttime { 75struct cfq_ttime {
76 unsigned long last_end_request; 76 unsigned long last_end_request;
@@ -177,10 +177,6 @@ enum wl_type_t {
177 177
178struct cfqg_stats { 178struct cfqg_stats {
179#ifdef CONFIG_CFQ_GROUP_IOSCHED 179#ifdef CONFIG_CFQ_GROUP_IOSCHED
180 /* total bytes transferred */
181 struct blkg_rwstat service_bytes;
182 /* total IOs serviced, post merge */
183 struct blkg_rwstat serviced;
184 /* number of ios merged */ 180 /* number of ios merged */
185 struct blkg_rwstat merged; 181 struct blkg_rwstat merged;
186 /* total time spent on device in ns, may not be accurate w/ queueing */ 182 /* total time spent on device in ns, may not be accurate w/ queueing */
@@ -189,8 +185,6 @@ struct cfqg_stats {
189 struct blkg_rwstat wait_time; 185 struct blkg_rwstat wait_time;
190 /* number of IOs queued up */ 186 /* number of IOs queued up */
191 struct blkg_rwstat queued; 187 struct blkg_rwstat queued;
192 /* total sectors transferred */
193 struct blkg_stat sectors;
194 /* total disk time and nr sectors dispatched by this group */ 188 /* total disk time and nr sectors dispatched by this group */
195 struct blkg_stat time; 189 struct blkg_stat time;
196#ifdef CONFIG_DEBUG_BLK_CGROUP 190#ifdef CONFIG_DEBUG_BLK_CGROUP
@@ -220,7 +214,7 @@ struct cfqg_stats {
220/* Per-cgroup data */ 214/* Per-cgroup data */
221struct cfq_group_data { 215struct cfq_group_data {
222 /* must be the first member */ 216 /* must be the first member */
223 struct blkcg_policy_data pd; 217 struct blkcg_policy_data cpd;
224 218
225 unsigned int weight; 219 unsigned int weight;
226 unsigned int leaf_weight; 220 unsigned int leaf_weight;
@@ -304,7 +298,11 @@ struct cfq_group {
304 int dispatched; 298 int dispatched;
305 struct cfq_ttime ttime; 299 struct cfq_ttime ttime;
306 struct cfqg_stats stats; /* stats for this cfqg */ 300 struct cfqg_stats stats; /* stats for this cfqg */
307 struct cfqg_stats dead_stats; /* stats pushed from dead children */ 301
302 /* async queue for each priority case */
303 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
304 struct cfq_queue *async_idle_cfqq;
305
308}; 306};
309 307
310struct cfq_io_cq { 308struct cfq_io_cq {
@@ -370,12 +368,6 @@ struct cfq_data {
370 struct cfq_queue *active_queue; 368 struct cfq_queue *active_queue;
371 struct cfq_io_cq *active_cic; 369 struct cfq_io_cq *active_cic;
372 370
373 /*
374 * async queue for each priority case
375 */
376 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
377 struct cfq_queue *async_idle_cfqq;
378
379 sector_t last_position; 371 sector_t last_position;
380 372
381 /* 373 /*
@@ -401,6 +393,7 @@ struct cfq_data {
401}; 393};
402 394
403static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 395static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
396static void cfq_put_queue(struct cfq_queue *cfqq);
404 397
405static struct cfq_rb_root *st_for(struct cfq_group *cfqg, 398static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
406 enum wl_class_t class, 399 enum wl_class_t class,
@@ -612,7 +605,7 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
612static struct cfq_group_data 605static struct cfq_group_data
613*cpd_to_cfqgd(struct blkcg_policy_data *cpd) 606*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
614{ 607{
615 return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL; 608 return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
616} 609}
617 610
618static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) 611static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
@@ -693,14 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
693 blkg_rwstat_add(&cfqg->stats.merged, rw, 1); 686 blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
694} 687}
695 688
696static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
697 uint64_t bytes, int rw)
698{
699 blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
700 blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
701 blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
702}
703
704static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, 689static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
705 uint64_t start_time, uint64_t io_start_time, int rw) 690 uint64_t start_time, uint64_t io_start_time, int rw)
706{ 691{
@@ -718,8 +703,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
718static void cfqg_stats_reset(struct cfqg_stats *stats) 703static void cfqg_stats_reset(struct cfqg_stats *stats)
719{ 704{
720 /* queued stats shouldn't be cleared */ 705 /* queued stats shouldn't be cleared */
721 blkg_rwstat_reset(&stats->service_bytes);
722 blkg_rwstat_reset(&stats->serviced);
723 blkg_rwstat_reset(&stats->merged); 706 blkg_rwstat_reset(&stats->merged);
724 blkg_rwstat_reset(&stats->service_time); 707 blkg_rwstat_reset(&stats->service_time);
725 blkg_rwstat_reset(&stats->wait_time); 708 blkg_rwstat_reset(&stats->wait_time);
@@ -736,28 +719,26 @@ static void cfqg_stats_reset(struct cfqg_stats *stats)
736} 719}
737 720
738/* @to += @from */ 721/* @to += @from */
739static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) 722static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
740{ 723{
741 /* queued stats shouldn't be cleared */ 724 /* queued stats shouldn't be cleared */
742 blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); 725 blkg_rwstat_add_aux(&to->merged, &from->merged);
743 blkg_rwstat_merge(&to->serviced, &from->serviced); 726 blkg_rwstat_add_aux(&to->service_time, &from->service_time);
744 blkg_rwstat_merge(&to->merged, &from->merged); 727 blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
745 blkg_rwstat_merge(&to->service_time, &from->service_time); 728 blkg_stat_add_aux(&from->time, &from->time);
746 blkg_rwstat_merge(&to->wait_time, &from->wait_time);
747 blkg_stat_merge(&from->time, &from->time);
748#ifdef CONFIG_DEBUG_BLK_CGROUP 729#ifdef CONFIG_DEBUG_BLK_CGROUP
749 blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); 730 blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
750 blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); 731 blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
751 blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); 732 blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
752 blkg_stat_merge(&to->dequeue, &from->dequeue); 733 blkg_stat_add_aux(&to->dequeue, &from->dequeue);
753 blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); 734 blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
754 blkg_stat_merge(&to->idle_time, &from->idle_time); 735 blkg_stat_add_aux(&to->idle_time, &from->idle_time);
755 blkg_stat_merge(&to->empty_time, &from->empty_time); 736 blkg_stat_add_aux(&to->empty_time, &from->empty_time);
756#endif 737#endif
757} 738}
758 739
759/* 740/*
760 * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' 741 * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
761 * recursive stats can still account for the amount used by this cfqg after 742 * recursive stats can still account for the amount used by this cfqg after
762 * it's gone. 743 * it's gone.
763 */ 744 */
@@ -770,10 +751,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
770 if (unlikely(!parent)) 751 if (unlikely(!parent))
771 return; 752 return;
772 753
773 cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); 754 cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
774 cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
775 cfqg_stats_reset(&cfqg->stats); 755 cfqg_stats_reset(&cfqg->stats);
776 cfqg_stats_reset(&cfqg->dead_stats);
777} 756}
778 757
779#else /* CONFIG_CFQ_GROUP_IOSCHED */ 758#else /* CONFIG_CFQ_GROUP_IOSCHED */
@@ -795,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
795 unsigned long time, unsigned long unaccounted_time) { } 774 unsigned long time, unsigned long unaccounted_time) { }
796static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } 775static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
797static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } 776static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
798static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
799 uint64_t bytes, int rw) { }
800static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, 777static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
801 uint64_t start_time, uint64_t io_start_time, int rw) { } 778 uint64_t start_time, uint64_t io_start_time, int rw) { }
802 779
@@ -883,8 +860,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
883 860
884static void cfq_dispatch_insert(struct request_queue *, struct request *); 861static void cfq_dispatch_insert(struct request_queue *, struct request *);
885static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, 862static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
886 struct cfq_io_cq *cic, struct bio *bio, 863 struct cfq_io_cq *cic, struct bio *bio);
887 gfp_t gfp_mask);
888 864
889static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) 865static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
890{ 866{
@@ -1546,130 +1522,171 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1546} 1522}
1547 1523
1548#ifdef CONFIG_CFQ_GROUP_IOSCHED 1524#ifdef CONFIG_CFQ_GROUP_IOSCHED
1549static void cfqg_stats_init(struct cfqg_stats *stats) 1525static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1526 bool on_dfl, bool reset_dev, bool is_leaf_weight);
1527
1528static void cfqg_stats_exit(struct cfqg_stats *stats)
1550{ 1529{
1551 blkg_rwstat_init(&stats->service_bytes); 1530 blkg_rwstat_exit(&stats->merged);
1552 blkg_rwstat_init(&stats->serviced); 1531 blkg_rwstat_exit(&stats->service_time);
1553 blkg_rwstat_init(&stats->merged); 1532 blkg_rwstat_exit(&stats->wait_time);
1554 blkg_rwstat_init(&stats->service_time); 1533 blkg_rwstat_exit(&stats->queued);
1555 blkg_rwstat_init(&stats->wait_time); 1534 blkg_stat_exit(&stats->time);
1556 blkg_rwstat_init(&stats->queued); 1535#ifdef CONFIG_DEBUG_BLK_CGROUP
1536 blkg_stat_exit(&stats->unaccounted_time);
1537 blkg_stat_exit(&stats->avg_queue_size_sum);
1538 blkg_stat_exit(&stats->avg_queue_size_samples);
1539 blkg_stat_exit(&stats->dequeue);
1540 blkg_stat_exit(&stats->group_wait_time);
1541 blkg_stat_exit(&stats->idle_time);
1542 blkg_stat_exit(&stats->empty_time);
1543#endif
1544}
1557 1545
1558 blkg_stat_init(&stats->sectors); 1546static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
1559 blkg_stat_init(&stats->time); 1547{
1548 if (blkg_rwstat_init(&stats->merged, gfp) ||
1549 blkg_rwstat_init(&stats->service_time, gfp) ||
1550 blkg_rwstat_init(&stats->wait_time, gfp) ||
1551 blkg_rwstat_init(&stats->queued, gfp) ||
1552 blkg_stat_init(&stats->time, gfp))
1553 goto err;
1560 1554
1561#ifdef CONFIG_DEBUG_BLK_CGROUP 1555#ifdef CONFIG_DEBUG_BLK_CGROUP
1562 blkg_stat_init(&stats->unaccounted_time); 1556 if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
1563 blkg_stat_init(&stats->avg_queue_size_sum); 1557 blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
1564 blkg_stat_init(&stats->avg_queue_size_samples); 1558 blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
1565 blkg_stat_init(&stats->dequeue); 1559 blkg_stat_init(&stats->dequeue, gfp) ||
1566 blkg_stat_init(&stats->group_wait_time); 1560 blkg_stat_init(&stats->group_wait_time, gfp) ||
1567 blkg_stat_init(&stats->idle_time); 1561 blkg_stat_init(&stats->idle_time, gfp) ||
1568 blkg_stat_init(&stats->empty_time); 1562 blkg_stat_init(&stats->empty_time, gfp))
1563 goto err;
1569#endif 1564#endif
1565 return 0;
1566err:
1567 cfqg_stats_exit(stats);
1568 return -ENOMEM;
1570} 1569}
1571 1570
1572static void cfq_cpd_init(const struct blkcg *blkcg) 1571static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
1573{ 1572{
1574 struct cfq_group_data *cgd = 1573 struct cfq_group_data *cgd;
1575 cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
1576 1574
1577 if (blkcg == &blkcg_root) { 1575 cgd = kzalloc(sizeof(*cgd), GFP_KERNEL);
1578 cgd->weight = 2 * CFQ_WEIGHT_DEFAULT; 1576 if (!cgd)
1579 cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; 1577 return NULL;
1580 } else { 1578 return &cgd->cpd;
1581 cgd->weight = CFQ_WEIGHT_DEFAULT; 1579}
1582 cgd->leaf_weight = CFQ_WEIGHT_DEFAULT; 1580
1583 } 1581static void cfq_cpd_init(struct blkcg_policy_data *cpd)
1582{
1583 struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
1584 unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ?
1585 CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1586
1587 if (cpd_to_blkcg(cpd) == &blkcg_root)
1588 weight *= 2;
1589
1590 cgd->weight = weight;
1591 cgd->leaf_weight = weight;
1584} 1592}
1585 1593
1586static void cfq_pd_init(struct blkcg_gq *blkg) 1594static void cfq_cpd_free(struct blkcg_policy_data *cpd)
1587{ 1595{
1588 struct cfq_group *cfqg = blkg_to_cfqg(blkg); 1596 kfree(cpd_to_cfqgd(cpd));
1589 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg); 1597}
1598
1599static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
1600{
1601 struct blkcg *blkcg = cpd_to_blkcg(cpd);
1602 bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup);
1603 unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1604
1605 if (blkcg == &blkcg_root)
1606 weight *= 2;
1607
1608 WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
1609 WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
1610}
1611
1612static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
1613{
1614 struct cfq_group *cfqg;
1615
1616 cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
1617 if (!cfqg)
1618 return NULL;
1590 1619
1591 cfq_init_cfqg_base(cfqg); 1620 cfq_init_cfqg_base(cfqg);
1621 if (cfqg_stats_init(&cfqg->stats, gfp)) {
1622 kfree(cfqg);
1623 return NULL;
1624 }
1625
1626 return &cfqg->pd;
1627}
1628
1629static void cfq_pd_init(struct blkg_policy_data *pd)
1630{
1631 struct cfq_group *cfqg = pd_to_cfqg(pd);
1632 struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
1633
1592 cfqg->weight = cgd->weight; 1634 cfqg->weight = cgd->weight;
1593 cfqg->leaf_weight = cgd->leaf_weight; 1635 cfqg->leaf_weight = cgd->leaf_weight;
1594 cfqg_stats_init(&cfqg->stats);
1595 cfqg_stats_init(&cfqg->dead_stats);
1596} 1636}
1597 1637
1598static void cfq_pd_offline(struct blkcg_gq *blkg) 1638static void cfq_pd_offline(struct blkg_policy_data *pd)
1599{ 1639{
1640 struct cfq_group *cfqg = pd_to_cfqg(pd);
1641 int i;
1642
1643 for (i = 0; i < IOPRIO_BE_NR; i++) {
1644 if (cfqg->async_cfqq[0][i])
1645 cfq_put_queue(cfqg->async_cfqq[0][i]);
1646 if (cfqg->async_cfqq[1][i])
1647 cfq_put_queue(cfqg->async_cfqq[1][i]);
1648 }
1649
1650 if (cfqg->async_idle_cfqq)
1651 cfq_put_queue(cfqg->async_idle_cfqq);
1652
1600 /* 1653 /*
1601 * @blkg is going offline and will be ignored by 1654 * @blkg is going offline and will be ignored by
1602 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so 1655 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
1603 * that they don't get lost. If IOs complete after this point, the 1656 * that they don't get lost. If IOs complete after this point, the
1604 * stats for them will be lost. Oh well... 1657 * stats for them will be lost. Oh well...
1605 */ 1658 */
1606 cfqg_stats_xfer_dead(blkg_to_cfqg(blkg)); 1659 cfqg_stats_xfer_dead(cfqg);
1607} 1660}
1608 1661
1609/* offset delta from cfqg->stats to cfqg->dead_stats */ 1662static void cfq_pd_free(struct blkg_policy_data *pd)
1610static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
1611 offsetof(struct cfq_group, stats);
1612
1613/* to be used by recursive prfill, sums live and dead stats recursively */
1614static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
1615{ 1663{
1616 u64 sum = 0; 1664 struct cfq_group *cfqg = pd_to_cfqg(pd);
1617
1618 sum += blkg_stat_recursive_sum(pd, off);
1619 sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
1620 return sum;
1621}
1622
1623/* to be used by recursive prfill, sums live and dead rwstats recursively */
1624static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
1625 int off)
1626{
1627 struct blkg_rwstat a, b;
1628 1665
1629 a = blkg_rwstat_recursive_sum(pd, off); 1666 cfqg_stats_exit(&cfqg->stats);
1630 b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); 1667 return kfree(cfqg);
1631 blkg_rwstat_merge(&a, &b);
1632 return a;
1633} 1668}
1634 1669
1635static void cfq_pd_reset_stats(struct blkcg_gq *blkg) 1670static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
1636{ 1671{
1637 struct cfq_group *cfqg = blkg_to_cfqg(blkg); 1672 struct cfq_group *cfqg = pd_to_cfqg(pd);
1638 1673
1639 cfqg_stats_reset(&cfqg->stats); 1674 cfqg_stats_reset(&cfqg->stats);
1640 cfqg_stats_reset(&cfqg->dead_stats);
1641} 1675}
1642 1676
1643/* 1677static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
1644 * Search for the cfq group current task belongs to. request_queue lock must 1678 struct blkcg *blkcg)
1645 * be held.
1646 */
1647static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1648 struct blkcg *blkcg)
1649{ 1679{
1650 struct request_queue *q = cfqd->queue; 1680 struct blkcg_gq *blkg;
1651 struct cfq_group *cfqg = NULL;
1652
1653 /* avoid lookup for the common case where there's no blkcg */
1654 if (blkcg == &blkcg_root) {
1655 cfqg = cfqd->root_group;
1656 } else {
1657 struct blkcg_gq *blkg;
1658
1659 blkg = blkg_lookup_create(blkcg, q);
1660 if (!IS_ERR(blkg))
1661 cfqg = blkg_to_cfqg(blkg);
1662 }
1663 1681
1664 return cfqg; 1682 blkg = blkg_lookup(blkcg, cfqd->queue);
1683 if (likely(blkg))
1684 return blkg_to_cfqg(blkg);
1685 return NULL;
1665} 1686}
1666 1687
1667static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) 1688static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1668{ 1689{
1669 /* Currently, all async queues are mapped to root group */
1670 if (!cfq_cfqq_sync(cfqq))
1671 cfqg = cfqq->cfqd->root_group;
1672
1673 cfqq->cfqg = cfqg; 1690 cfqq->cfqg = cfqg;
1674 /* cfqq reference on cfqg */ 1691 /* cfqq reference on cfqg */
1675 cfqg_get(cfqg); 1692 cfqg_get(cfqg);
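cfqg_stats_init() above can now fail, because every blkg_stat/blkg_rwstat is backed by a per-cpu counter allocated with the caller's gfp mask; on any failure it unwinds through cfqg_stats_exit(), and cfq_pd_alloc() then just frees the half-built group and returns NULL. A small stand-alone sketch of that init-or-rollback shape, under the assumption that the exit path tolerates members that were never set up (the resource names here are made up, not kernel code):

#include <stdlib.h>

struct stats {
	long *merged;
	long *service_time;
	long *wait_time;
};

static void stats_exit(struct stats *s)
{
	/* free(NULL) is a no-op, so partially initialized state is fine */
	free(s->merged);
	free(s->service_time);
	free(s->wait_time);
}

static int stats_init(struct stats *s)		/* @s must start out zeroed */
{
	if (!(s->merged = calloc(1, sizeof(long))) ||
	    !(s->service_time = calloc(1, sizeof(long))) ||
	    !(s->wait_time = calloc(1, sizeof(long)))) {
		stats_exit(s);			/* roll back; untouched members are still NULL */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct stats s = { 0 };			/* zeroed up front, like kzalloc_node() in the patch */

	if (stats_init(&s))
		return 1;
	stats_exit(&s);
	return 0;
}

This also suggests why cfq_pd_alloc() uses kzalloc_node(): with the group zeroed up front, the exit path only ever sees counters that were either fully set up or never touched.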
@@ -1739,36 +1756,48 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
1739 1756
1740static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, 1757static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
1741 char *buf, size_t nbytes, loff_t off, 1758 char *buf, size_t nbytes, loff_t off,
1742 bool is_leaf_weight) 1759 bool on_dfl, bool is_leaf_weight)
1743{ 1760{
1761 unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1762 unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1744 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1763 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1745 struct blkg_conf_ctx ctx; 1764 struct blkg_conf_ctx ctx;
1746 struct cfq_group *cfqg; 1765 struct cfq_group *cfqg;
1747 struct cfq_group_data *cfqgd; 1766 struct cfq_group_data *cfqgd;
1748 int ret; 1767 int ret;
1768 u64 v;
1749 1769
1750 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); 1770 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1751 if (ret) 1771 if (ret)
1752 return ret; 1772 return ret;
1753 1773
1754 ret = -EINVAL; 1774 if (sscanf(ctx.body, "%llu", &v) == 1) {
1775 /* require "default" on dfl */
1776 ret = -ERANGE;
1777 if (!v && on_dfl)
1778 goto out_finish;
1779 } else if (!strcmp(strim(ctx.body), "default")) {
1780 v = 0;
1781 } else {
1782 ret = -EINVAL;
1783 goto out_finish;
1784 }
1785
1755 cfqg = blkg_to_cfqg(ctx.blkg); 1786 cfqg = blkg_to_cfqg(ctx.blkg);
1756 cfqgd = blkcg_to_cfqgd(blkcg); 1787 cfqgd = blkcg_to_cfqgd(blkcg);
1757 if (!cfqg || !cfqgd)
1758 goto err;
1759 1788
1760 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { 1789 ret = -ERANGE;
1790 if (!v || (v >= min && v <= max)) {
1761 if (!is_leaf_weight) { 1791 if (!is_leaf_weight) {
1762 cfqg->dev_weight = ctx.v; 1792 cfqg->dev_weight = v;
1763 cfqg->new_weight = ctx.v ?: cfqgd->weight; 1793 cfqg->new_weight = v ?: cfqgd->weight;
1764 } else { 1794 } else {
1765 cfqg->dev_leaf_weight = ctx.v; 1795 cfqg->dev_leaf_weight = v;
1766 cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight; 1796 cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
1767 } 1797 }
1768 ret = 0; 1798 ret = 0;
1769 } 1799 }
1770 1800out_finish:
1771err:
1772 blkg_conf_finish(&ctx); 1801 blkg_conf_finish(&ctx);
1773 return ret ?: nbytes; 1802 return ret ?: nbytes;
1774} 1803}
@@ -1776,25 +1805,27 @@ err:
1776static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, 1805static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
1777 char *buf, size_t nbytes, loff_t off) 1806 char *buf, size_t nbytes, loff_t off)
1778{ 1807{
1779 return __cfqg_set_weight_device(of, buf, nbytes, off, false); 1808 return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
1780} 1809}
1781 1810
1782static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, 1811static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
1783 char *buf, size_t nbytes, loff_t off) 1812 char *buf, size_t nbytes, loff_t off)
1784{ 1813{
1785 return __cfqg_set_weight_device(of, buf, nbytes, off, true); 1814 return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
1786} 1815}
1787 1816
1788static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, 1817static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1789 u64 val, bool is_leaf_weight) 1818 bool on_dfl, bool reset_dev, bool is_leaf_weight)
1790{ 1819{
1820 unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1821 unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1791 struct blkcg *blkcg = css_to_blkcg(css); 1822 struct blkcg *blkcg = css_to_blkcg(css);
1792 struct blkcg_gq *blkg; 1823 struct blkcg_gq *blkg;
1793 struct cfq_group_data *cfqgd; 1824 struct cfq_group_data *cfqgd;
1794 int ret = 0; 1825 int ret = 0;
1795 1826
1796 if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) 1827 if (val < min || val > max)
1797 return -EINVAL; 1828 return -ERANGE;
1798 1829
1799 spin_lock_irq(&blkcg->lock); 1830 spin_lock_irq(&blkcg->lock);
1800 cfqgd = blkcg_to_cfqgd(blkcg); 1831 cfqgd = blkcg_to_cfqgd(blkcg);
@@ -1815,9 +1846,13 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1815 continue; 1846 continue;
1816 1847
1817 if (!is_leaf_weight) { 1848 if (!is_leaf_weight) {
1849 if (reset_dev)
1850 cfqg->dev_weight = 0;
1818 if (!cfqg->dev_weight) 1851 if (!cfqg->dev_weight)
1819 cfqg->new_weight = cfqgd->weight; 1852 cfqg->new_weight = cfqgd->weight;
1820 } else { 1853 } else {
1854 if (reset_dev)
1855 cfqg->dev_leaf_weight = 0;
1821 if (!cfqg->dev_leaf_weight) 1856 if (!cfqg->dev_leaf_weight)
1822 cfqg->new_leaf_weight = cfqgd->leaf_weight; 1857 cfqg->new_leaf_weight = cfqgd->leaf_weight;
1823 } 1858 }
@@ -1831,13 +1866,13 @@ out:
1831static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, 1866static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1832 u64 val) 1867 u64 val)
1833{ 1868{
1834 return __cfq_set_weight(css, cft, val, false); 1869 return __cfq_set_weight(css, val, false, false, false);
1835} 1870}
1836 1871
1837static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, 1872static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
1838 struct cftype *cft, u64 val) 1873 struct cftype *cft, u64 val)
1839{ 1874{
1840 return __cfq_set_weight(css, cft, val, true); 1875 return __cfq_set_weight(css, val, false, false, true);
1841} 1876}
1842 1877
1843static int cfqg_print_stat(struct seq_file *sf, void *v) 1878static int cfqg_print_stat(struct seq_file *sf, void *v)
@@ -1857,16 +1892,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v)
1857static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, 1892static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
1858 struct blkg_policy_data *pd, int off) 1893 struct blkg_policy_data *pd, int off)
1859{ 1894{
1860 u64 sum = cfqg_stat_pd_recursive_sum(pd, off); 1895 u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
1861 1896 &blkcg_policy_cfq, off);
1862 return __blkg_prfill_u64(sf, pd, sum); 1897 return __blkg_prfill_u64(sf, pd, sum);
1863} 1898}
1864 1899
1865static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, 1900static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
1866 struct blkg_policy_data *pd, int off) 1901 struct blkg_policy_data *pd, int off)
1867{ 1902{
1868 struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); 1903 struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
1869 1904 &blkcg_policy_cfq, off);
1870 return __blkg_prfill_rwstat(sf, pd, &sum); 1905 return __blkg_prfill_rwstat(sf, pd, &sum);
1871} 1906}
1872 1907
@@ -1886,6 +1921,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
1886 return 0; 1921 return 0;
1887} 1922}
1888 1923
1924static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
1925 int off)
1926{
1927 u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
1928
1929 return __blkg_prfill_u64(sf, pd, sum >> 9);
1930}
1931
1932static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
1933{
1934 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1935 cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
1936 return 0;
1937}
1938
1939static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
1940 struct blkg_policy_data *pd, int off)
1941{
1942 struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
1943 offsetof(struct blkcg_gq, stat_bytes));
1944 u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
1945 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
1946
1947 return __blkg_prfill_u64(sf, pd, sum >> 9);
1948}
1949
1950static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
1951{
1952 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1953 cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
1954 false);
1955 return 0;
1956}
1957
1889#ifdef CONFIG_DEBUG_BLK_CGROUP 1958#ifdef CONFIG_DEBUG_BLK_CGROUP
1890static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, 1959static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1891 struct blkg_policy_data *pd, int off) 1960 struct blkg_policy_data *pd, int off)
@@ -1912,7 +1981,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
1912} 1981}
1913#endif /* CONFIG_DEBUG_BLK_CGROUP */ 1982#endif /* CONFIG_DEBUG_BLK_CGROUP */
1914 1983
1915static struct cftype cfq_blkcg_files[] = { 1984static struct cftype cfq_blkcg_legacy_files[] = {
1916 /* on root, weight is mapped to leaf_weight */ 1985 /* on root, weight is mapped to leaf_weight */
1917 { 1986 {
1918 .name = "weight_device", 1987 .name = "weight_device",
@@ -1960,18 +2029,17 @@ static struct cftype cfq_blkcg_files[] = {
1960 }, 2029 },
1961 { 2030 {
1962 .name = "sectors", 2031 .name = "sectors",
1963 .private = offsetof(struct cfq_group, stats.sectors), 2032 .seq_show = cfqg_print_stat_sectors,
1964 .seq_show = cfqg_print_stat,
1965 }, 2033 },
1966 { 2034 {
1967 .name = "io_service_bytes", 2035 .name = "io_service_bytes",
1968 .private = offsetof(struct cfq_group, stats.service_bytes), 2036 .private = (unsigned long)&blkcg_policy_cfq,
1969 .seq_show = cfqg_print_rwstat, 2037 .seq_show = blkg_print_stat_bytes,
1970 }, 2038 },
1971 { 2039 {
1972 .name = "io_serviced", 2040 .name = "io_serviced",
1973 .private = offsetof(struct cfq_group, stats.serviced), 2041 .private = (unsigned long)&blkcg_policy_cfq,
1974 .seq_show = cfqg_print_rwstat, 2042 .seq_show = blkg_print_stat_ios,
1975 }, 2043 },
1976 { 2044 {
1977 .name = "io_service_time", 2045 .name = "io_service_time",
@@ -2002,18 +2070,17 @@ static struct cftype cfq_blkcg_files[] = {
2002 }, 2070 },
2003 { 2071 {
2004 .name = "sectors_recursive", 2072 .name = "sectors_recursive",
2005 .private = offsetof(struct cfq_group, stats.sectors), 2073 .seq_show = cfqg_print_stat_sectors_recursive,
2006 .seq_show = cfqg_print_stat_recursive,
2007 }, 2074 },
2008 { 2075 {
2009 .name = "io_service_bytes_recursive", 2076 .name = "io_service_bytes_recursive",
2010 .private = offsetof(struct cfq_group, stats.service_bytes), 2077 .private = (unsigned long)&blkcg_policy_cfq,
2011 .seq_show = cfqg_print_rwstat_recursive, 2078 .seq_show = blkg_print_stat_bytes_recursive,
2012 }, 2079 },
2013 { 2080 {
2014 .name = "io_serviced_recursive", 2081 .name = "io_serviced_recursive",
2015 .private = offsetof(struct cfq_group, stats.serviced), 2082 .private = (unsigned long)&blkcg_policy_cfq,
2016 .seq_show = cfqg_print_rwstat_recursive, 2083 .seq_show = blkg_print_stat_ios_recursive,
2017 }, 2084 },
2018 { 2085 {
2019 .name = "io_service_time_recursive", 2086 .name = "io_service_time_recursive",
@@ -2068,9 +2135,51 @@ static struct cftype cfq_blkcg_files[] = {
2068#endif /* CONFIG_DEBUG_BLK_CGROUP */ 2135#endif /* CONFIG_DEBUG_BLK_CGROUP */
2069 { } /* terminate */ 2136 { } /* terminate */
2070}; 2137};
2138
2139static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
2140{
2141 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2142 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
2143
2144 seq_printf(sf, "default %u\n", cgd->weight);
2145 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
2146 &blkcg_policy_cfq, 0, false);
2147 return 0;
2148}
2149
2150static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
2151 char *buf, size_t nbytes, loff_t off)
2152{
2153 char *endp;
2154 int ret;
2155 u64 v;
2156
2157 buf = strim(buf);
2158
2159 /* "WEIGHT" or "default WEIGHT" sets the default weight */
2160 v = simple_strtoull(buf, &endp, 0);
2161 if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
2162 ret = __cfq_set_weight(of_css(of), v, true, false, false);
2163 return ret ?: nbytes;
2164 }
2165
2166 /* "MAJ:MIN WEIGHT" */
2167 return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
2168}
2169
2170static struct cftype cfq_blkcg_files[] = {
2171 {
2172 .name = "weight",
2173 .flags = CFTYPE_NOT_ON_ROOT,
2174 .seq_show = cfq_print_weight_on_dfl,
2175 .write = cfq_set_weight_on_dfl,
2176 },
2177 { } /* terminate */
2178};
2179
2071#else /* GROUP_IOSCHED */ 2180#else /* GROUP_IOSCHED */
2072static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, 2181static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
2073 struct blkcg *blkcg) 2182 struct blkcg *blkcg)
2074{ 2183{
2075 return cfqd->root_group; 2184 return cfqd->root_group;
2076} 2185}
@@ -2873,7 +2982,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2873 2982
2874 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 2983 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2875 cfqq->nr_sectors += blk_rq_sectors(rq); 2984 cfqq->nr_sectors += blk_rq_sectors(rq);
2876 cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
2877} 2985}
2878 2986
2879/* 2987/*
@@ -3506,14 +3614,14 @@ static void cfq_exit_icq(struct io_cq *icq)
3506 struct cfq_io_cq *cic = icq_to_cic(icq); 3614 struct cfq_io_cq *cic = icq_to_cic(icq);
3507 struct cfq_data *cfqd = cic_to_cfqd(cic); 3615 struct cfq_data *cfqd = cic_to_cfqd(cic);
3508 3616
3509 if (cic->cfqq[BLK_RW_ASYNC]) { 3617 if (cic_to_cfqq(cic, false)) {
3510 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 3618 cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
3511 cic->cfqq[BLK_RW_ASYNC] = NULL; 3619 cic_set_cfqq(cic, NULL, false);
3512 } 3620 }
3513 3621
3514 if (cic->cfqq[BLK_RW_SYNC]) { 3622 if (cic_to_cfqq(cic, true)) {
3515 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); 3623 cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
3516 cic->cfqq[BLK_RW_SYNC] = NULL; 3624 cic_set_cfqq(cic, NULL, true);
3517 } 3625 }
3518} 3626}
3519 3627
@@ -3572,18 +3680,14 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3572 if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) 3680 if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3573 return; 3681 return;
3574 3682
3575 cfqq = cic->cfqq[BLK_RW_ASYNC]; 3683 cfqq = cic_to_cfqq(cic, false);
3576 if (cfqq) { 3684 if (cfqq) {
3577 struct cfq_queue *new_cfqq; 3685 cfq_put_queue(cfqq);
3578 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, 3686 cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
3579 GFP_ATOMIC); 3687 cic_set_cfqq(cic, cfqq, false);
3580 if (new_cfqq) {
3581 cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
3582 cfq_put_queue(cfqq);
3583 }
3584 } 3688 }
3585 3689
3586 cfqq = cic->cfqq[BLK_RW_SYNC]; 3690 cfqq = cic_to_cfqq(cic, true);
3587 if (cfqq) 3691 if (cfqq)
3588 cfq_mark_cfqq_prio_changed(cfqq); 3692 cfq_mark_cfqq_prio_changed(cfqq);
3589 3693
@@ -3614,7 +3718,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3614static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) 3718static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3615{ 3719{
3616 struct cfq_data *cfqd = cic_to_cfqd(cic); 3720 struct cfq_data *cfqd = cic_to_cfqd(cic);
3617 struct cfq_queue *sync_cfqq; 3721 struct cfq_queue *cfqq;
3618 uint64_t serial_nr; 3722 uint64_t serial_nr;
3619 3723
3620 rcu_read_lock(); 3724 rcu_read_lock();
@@ -3628,15 +3732,22 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3628 if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) 3732 if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
3629 return; 3733 return;
3630 3734
3631 sync_cfqq = cic_to_cfqq(cic, 1); 3735 /*
3632 if (sync_cfqq) { 3736 * Drop reference to queues. New queues will be assigned in new
3633 /* 3737 * group upon arrival of fresh requests.
3634 * Drop reference to sync queue. A new sync queue will be 3738 */
3635 * assigned in new group upon arrival of a fresh request. 3739 cfqq = cic_to_cfqq(cic, false);
3636 */ 3740 if (cfqq) {
3637 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); 3741 cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3638 cic_set_cfqq(cic, NULL, 1); 3742 cic_set_cfqq(cic, NULL, false);
3639 cfq_put_queue(sync_cfqq); 3743 cfq_put_queue(cfqq);
3744 }
3745
3746 cfqq = cic_to_cfqq(cic, true);
3747 if (cfqq) {
3748 cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3749 cic_set_cfqq(cic, NULL, true);
3750 cfq_put_queue(cfqq);
3640 } 3751 }
3641 3752
3642 cic->blkcg_serial_nr = serial_nr; 3753 cic->blkcg_serial_nr = serial_nr;
@@ -3645,81 +3756,19 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3645static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } 3756static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
3646#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 3757#endif /* CONFIG_CFQ_GROUP_IOSCHED */
3647 3758
3648static struct cfq_queue *
3649cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3650 struct bio *bio, gfp_t gfp_mask)
3651{
3652 struct blkcg *blkcg;
3653 struct cfq_queue *cfqq, *new_cfqq = NULL;
3654 struct cfq_group *cfqg;
3655
3656retry:
3657 rcu_read_lock();
3658
3659 blkcg = bio_blkcg(bio);
3660 cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
3661 if (!cfqg) {
3662 cfqq = &cfqd->oom_cfqq;
3663 goto out;
3664 }
3665
3666 cfqq = cic_to_cfqq(cic, is_sync);
3667
3668 /*
3669 * Always try a new alloc if we fell back to the OOM cfqq
3670 * originally, since it should just be a temporary situation.
3671 */
3672 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3673 cfqq = NULL;
3674 if (new_cfqq) {
3675 cfqq = new_cfqq;
3676 new_cfqq = NULL;
3677 } else if (gfp_mask & __GFP_WAIT) {
3678 rcu_read_unlock();
3679 spin_unlock_irq(cfqd->queue->queue_lock);
3680 new_cfqq = kmem_cache_alloc_node(cfq_pool,
3681 gfp_mask | __GFP_ZERO,
3682 cfqd->queue->node);
3683 spin_lock_irq(cfqd->queue->queue_lock);
3684 if (new_cfqq)
3685 goto retry;
3686 else
3687 return &cfqd->oom_cfqq;
3688 } else {
3689 cfqq = kmem_cache_alloc_node(cfq_pool,
3690 gfp_mask | __GFP_ZERO,
3691 cfqd->queue->node);
3692 }
3693
3694 if (cfqq) {
3695 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3696 cfq_init_prio_data(cfqq, cic);
3697 cfq_link_cfqq_cfqg(cfqq, cfqg);
3698 cfq_log_cfqq(cfqd, cfqq, "alloced");
3699 } else
3700 cfqq = &cfqd->oom_cfqq;
3701 }
3702out:
3703 if (new_cfqq)
3704 kmem_cache_free(cfq_pool, new_cfqq);
3705
3706 rcu_read_unlock();
3707 return cfqq;
3708}
3709
3710static struct cfq_queue ** 3759static struct cfq_queue **
3711cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) 3760cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
3712{ 3761{
3713 switch (ioprio_class) { 3762 switch (ioprio_class) {
3714 case IOPRIO_CLASS_RT: 3763 case IOPRIO_CLASS_RT:
3715 return &cfqd->async_cfqq[0][ioprio]; 3764 return &cfqg->async_cfqq[0][ioprio];
3716 case IOPRIO_CLASS_NONE: 3765 case IOPRIO_CLASS_NONE:
3717 ioprio = IOPRIO_NORM; 3766 ioprio = IOPRIO_NORM;
3718 /* fall through */ 3767 /* fall through */
3719 case IOPRIO_CLASS_BE: 3768 case IOPRIO_CLASS_BE:
3720 return &cfqd->async_cfqq[1][ioprio]; 3769 return &cfqg->async_cfqq[1][ioprio];
3721 case IOPRIO_CLASS_IDLE: 3770 case IOPRIO_CLASS_IDLE:
3722 return &cfqd->async_idle_cfqq; 3771 return &cfqg->async_idle_cfqq;
3723 default: 3772 default:
3724 BUG(); 3773 BUG();
3725 } 3774 }
@@ -3727,12 +3776,20 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
3727 3776
3728static struct cfq_queue * 3777static struct cfq_queue *
3729cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, 3778cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3730 struct bio *bio, gfp_t gfp_mask) 3779 struct bio *bio)
3731{ 3780{
3732 int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); 3781 int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3733 int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 3782 int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3734 struct cfq_queue **async_cfqq = NULL; 3783 struct cfq_queue **async_cfqq = NULL;
3735 struct cfq_queue *cfqq = NULL; 3784 struct cfq_queue *cfqq;
3785 struct cfq_group *cfqg;
3786
3787 rcu_read_lock();
3788 cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
3789 if (!cfqg) {
3790 cfqq = &cfqd->oom_cfqq;
3791 goto out;
3792 }
3736 3793
3737 if (!is_sync) { 3794 if (!is_sync) {
3738 if (!ioprio_valid(cic->ioprio)) { 3795 if (!ioprio_valid(cic->ioprio)) {
@@ -3740,22 +3797,32 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3740 ioprio = task_nice_ioprio(tsk); 3797 ioprio = task_nice_ioprio(tsk);
3741 ioprio_class = task_nice_ioclass(tsk); 3798 ioprio_class = task_nice_ioclass(tsk);
3742 } 3799 }
3743 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); 3800 async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
3744 cfqq = *async_cfqq; 3801 cfqq = *async_cfqq;
3802 if (cfqq)
3803 goto out;
3745 } 3804 }
3746 3805
3747 if (!cfqq) 3806 cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO,
3748 cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); 3807 cfqd->queue->node);
3808 if (!cfqq) {
3809 cfqq = &cfqd->oom_cfqq;
3810 goto out;
3811 }
3749 3812
3750 /* 3813 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3751 * pin the queue now that it's allocated, scheduler exit will prune it 3814 cfq_init_prio_data(cfqq, cic);
3752 */ 3815 cfq_link_cfqq_cfqg(cfqq, cfqg);
3753 if (!is_sync && !(*async_cfqq)) { 3816 cfq_log_cfqq(cfqd, cfqq, "alloced");
3817
3818 if (async_cfqq) {
3819 /* a new async queue is created, pin and remember */
3754 cfqq->ref++; 3820 cfqq->ref++;
3755 *async_cfqq = cfqq; 3821 *async_cfqq = cfqq;
3756 } 3822 }
3757 3823out:
3758 cfqq->ref++; 3824 cfqq->ref++;
3825 rcu_read_unlock();
3759 return cfqq; 3826 return cfqq;
3760} 3827}
3761 3828
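With this series the async queues hang off the per-cgroup cfq_group rather than the per-device cfq_data, which is why cfq_async_queue_prio() now takes a cfqg and cfq_get_queue() first looks the group up from the bio's blkcg. A minimal sketch of the slots being indexed, assuming the struct cfq_group members simply mirror the old per-device arrays (their real declaration is in an earlier hunk of this patch):

        /* assumed per-cgroup slot layout, shown only for illustration */
        struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];  /* [0][prio] = RT, [1][prio] = BE/NONE */
        struct cfq_queue *async_idle_cfqq;              /* one shared IOPRIO_CLASS_IDLE slot */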
@@ -4289,8 +4356,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
4289 const bool is_sync = rq_is_sync(rq); 4356 const bool is_sync = rq_is_sync(rq);
4290 struct cfq_queue *cfqq; 4357 struct cfq_queue *cfqq;
4291 4358
4292 might_sleep_if(gfp_mask & __GFP_WAIT);
4293
4294 spin_lock_irq(q->queue_lock); 4359 spin_lock_irq(q->queue_lock);
4295 4360
4296 check_ioprio_changed(cic, bio); 4361 check_ioprio_changed(cic, bio);
@@ -4298,7 +4363,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
4298new_queue: 4363new_queue:
4299 cfqq = cic_to_cfqq(cic, is_sync); 4364 cfqq = cic_to_cfqq(cic, is_sync);
4300 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 4365 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
4301 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); 4366 if (cfqq)
4367 cfq_put_queue(cfqq);
4368 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
4302 cic_set_cfqq(cic, cfqq, is_sync); 4369 cic_set_cfqq(cic, cfqq, is_sync);
4303 } else { 4370 } else {
4304 /* 4371 /*
@@ -4404,21 +4471,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
4404 cancel_work_sync(&cfqd->unplug_work); 4471 cancel_work_sync(&cfqd->unplug_work);
4405} 4472}
4406 4473
4407static void cfq_put_async_queues(struct cfq_data *cfqd)
4408{
4409 int i;
4410
4411 for (i = 0; i < IOPRIO_BE_NR; i++) {
4412 if (cfqd->async_cfqq[0][i])
4413 cfq_put_queue(cfqd->async_cfqq[0][i]);
4414 if (cfqd->async_cfqq[1][i])
4415 cfq_put_queue(cfqd->async_cfqq[1][i]);
4416 }
4417
4418 if (cfqd->async_idle_cfqq)
4419 cfq_put_queue(cfqd->async_idle_cfqq);
4420}
4421
4422static void cfq_exit_queue(struct elevator_queue *e) 4474static void cfq_exit_queue(struct elevator_queue *e)
4423{ 4475{
4424 struct cfq_data *cfqd = e->elevator_data; 4476 struct cfq_data *cfqd = e->elevator_data;
@@ -4431,8 +4483,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
4431 if (cfqd->active_queue) 4483 if (cfqd->active_queue)
4432 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 4484 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
4433 4485
4434 cfq_put_async_queues(cfqd);
4435
4436 spin_unlock_irq(q->queue_lock); 4486 spin_unlock_irq(q->queue_lock);
4437 4487
4438 cfq_shutdown_timer_wq(cfqd); 4488 cfq_shutdown_timer_wq(cfqd);
@@ -4486,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
4486 goto out_free; 4536 goto out_free;
4487 4537
4488 cfq_init_cfqg_base(cfqd->root_group); 4538 cfq_init_cfqg_base(cfqd->root_group);
4539 cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4540 cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4489#endif 4541#endif
4490 cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
4491 cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
4492 4542
4493 /* 4543 /*
4494 * Not strictly needed (since RB_ROOT just clears the node and we 4544 * Not strictly needed (since RB_ROOT just clears the node and we
@@ -4499,7 +4549,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
4499 cfqd->prio_trees[i] = RB_ROOT; 4549 cfqd->prio_trees[i] = RB_ROOT;
4500 4550
4501 /* 4551 /*
4502 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 4552 * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
4503 * Grab a permanent reference to it, so that the normal code flow 4553 * Grab a permanent reference to it, so that the normal code flow
4504 * will not attempt to free it. oom_cfqq is linked to root_group 4554 * will not attempt to free it. oom_cfqq is linked to root_group
4505 * but shouldn't hold a reference as it'll never be unlinked. Lose 4555 * but shouldn't hold a reference as it'll never be unlinked. Lose
@@ -4683,13 +4733,18 @@ static struct elevator_type iosched_cfq = {
4683 4733
4684#ifdef CONFIG_CFQ_GROUP_IOSCHED 4734#ifdef CONFIG_CFQ_GROUP_IOSCHED
4685static struct blkcg_policy blkcg_policy_cfq = { 4735static struct blkcg_policy blkcg_policy_cfq = {
4686 .pd_size = sizeof(struct cfq_group), 4736 .dfl_cftypes = cfq_blkcg_files,
4687 .cpd_size = sizeof(struct cfq_group_data), 4737 .legacy_cftypes = cfq_blkcg_legacy_files,
4688 .cftypes = cfq_blkcg_files,
4689 4738
4739 .cpd_alloc_fn = cfq_cpd_alloc,
4690 .cpd_init_fn = cfq_cpd_init, 4740 .cpd_init_fn = cfq_cpd_init,
4741 .cpd_free_fn = cfq_cpd_free,
4742 .cpd_bind_fn = cfq_cpd_bind,
4743
4744 .pd_alloc_fn = cfq_pd_alloc,
4691 .pd_init_fn = cfq_pd_init, 4745 .pd_init_fn = cfq_pd_init,
4692 .pd_offline_fn = cfq_pd_offline, 4746 .pd_offline_fn = cfq_pd_offline,
4747 .pd_free_fn = cfq_pd_free,
4693 .pd_reset_stats_fn = cfq_pd_reset_stats, 4748 .pd_reset_stats_fn = cfq_pd_reset_stats,
4694}; 4749};
4695#endif 4750#endif
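blkcg_policy now carries explicit alloc/free callbacks in place of the old pd_size/cpd_size fields. A hedged sketch of what a pd_alloc_fn/pd_free_fn pair could look like for a hypothetical policy (the "example" names are not part of this patch), embedding blkg_policy_data at the start of the private structure as the updated blk-cgroup.h comments require:

        struct example_group {
                struct blkg_policy_data pd;     /* must be the first member */
                u64 some_private_stat;
        };

        static struct blkg_policy_data *example_pd_alloc(gfp_t gfp, int node)
        {
                struct example_group *eg;

                eg = kzalloc_node(sizeof(*eg), gfp, node);
                return eg ? &eg->pd : NULL;     /* NULL tells the core the allocation failed */
        }

        static void example_pd_free(struct blkg_policy_data *pd)
        {
                kfree(container_of(pd, struct example_group, pd));
        }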
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ae0f438c2ee6..24489126f8ca 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -53,8 +53,6 @@ struct wb_writeback_work {
53 unsigned int for_background:1; 53 unsigned int for_background:1;
54 unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ 54 unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
55 unsigned int auto_free:1; /* free on completion */ 55 unsigned int auto_free:1; /* free on completion */
56 unsigned int single_wait:1;
57 unsigned int single_done:1;
58 enum wb_reason reason; /* why was writeback initiated? */ 56 enum wb_reason reason; /* why was writeback initiated? */
59 57
60 struct list_head list; /* pending work list */ 58 struct list_head list; /* pending work list */
@@ -178,14 +176,11 @@ static void wb_wakeup(struct bdi_writeback *wb)
178static void wb_queue_work(struct bdi_writeback *wb, 176static void wb_queue_work(struct bdi_writeback *wb,
179 struct wb_writeback_work *work) 177 struct wb_writeback_work *work)
180{ 178{
181 trace_writeback_queue(wb->bdi, work); 179 trace_writeback_queue(wb, work);
182 180
183 spin_lock_bh(&wb->work_lock); 181 spin_lock_bh(&wb->work_lock);
184 if (!test_bit(WB_registered, &wb->state)) { 182 if (!test_bit(WB_registered, &wb->state))
185 if (work->single_wait)
186 work->single_done = 1;
187 goto out_unlock; 183 goto out_unlock;
188 }
189 if (work->done) 184 if (work->done)
190 atomic_inc(&work->done->cnt); 185 atomic_inc(&work->done->cnt);
191 list_add_tail(&work->list, &wb->work_list); 186 list_add_tail(&work->list, &wb->work_list);
@@ -706,7 +701,7 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
706 701
707/** 702/**
708 * inode_congested - test whether an inode is congested 703 * inode_congested - test whether an inode is congested
709 * @inode: inode to test for congestion 704 * @inode: inode to test for congestion (may be NULL)
710 * @cong_bits: mask of WB_[a]sync_congested bits to test 705 * @cong_bits: mask of WB_[a]sync_congested bits to test
711 * 706 *
712 * Tests whether @inode is congested. @cong_bits is the mask of congestion 707 * Tests whether @inode is congested. @cong_bits is the mask of congestion
@@ -716,6 +711,9 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
716 * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg 711 * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
717 * associated with @inode is congested; otherwise, the root wb's congestion 712 * associated with @inode is congested; otherwise, the root wb's congestion
718 * state is used. 713 * state is used.
714 *
715 * @inode is allowed to be NULL as this function is often called on
716 * mapping->host which is NULL for the swapper space.
719 */ 717 */
720int inode_congested(struct inode *inode, int cong_bits) 718int inode_congested(struct inode *inode, int cong_bits)
721{ 719{
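Since @inode may legitimately be NULL (e.g. the swapper space), callers can pass mapping->host straight through. A usage sketch, assuming the WB_sync_congested bit name from backing-dev-defs.h:

        /* illustration only: back off from optional writeback work when congested */
        if (inode_congested(inode, 1 << WB_sync_congested))
                return;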
@@ -738,32 +736,6 @@ int inode_congested(struct inode *inode, int cong_bits)
738EXPORT_SYMBOL_GPL(inode_congested); 736EXPORT_SYMBOL_GPL(inode_congested);
739 737
740/** 738/**
741 * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
742 * @bdi: bdi the work item was issued to
743 * @work: work item to wait for
744 *
745 * Wait for the completion of @work which was issued to one of @bdi's
746 * bdi_writeback's. The caller must have set @work->single_wait before
747 * issuing it. This wait operates independently of
748 * wb_wait_for_completion() and also disables automatic freeing of @work.
749 */
750static void wb_wait_for_single_work(struct backing_dev_info *bdi,
751 struct wb_writeback_work *work)
752{
753 if (WARN_ON_ONCE(!work->single_wait))
754 return;
755
756 wait_event(bdi->wb_waitq, work->single_done);
757
758 /*
759 * Paired with smp_wmb() in wb_do_writeback() and ensures that all
760 * modifications to @work prior to assertion of ->single_done is
761 * visible to the caller once this function returns.
762 */
763 smp_rmb();
764}
765
766/**
767 * wb_split_bdi_pages - split nr_pages to write according to bandwidth 739 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
768 * @wb: target bdi_writeback to split @nr_pages to 740 * @wb: target bdi_writeback to split @nr_pages to
769 * @nr_pages: number of pages to write for the whole bdi 741 * @nr_pages: number of pages to write for the whole bdi
@@ -792,38 +764,6 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
792} 764}
793 765
794/** 766/**
795 * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
796 * @wb: target bdi_writeback
797 * @base_work: source wb_writeback_work
798 *
799 * Try to make a clone of @base_work and issue it to @wb. If cloning
800 * succeeds, %true is returned; otherwise, @base_work is issued directly
801 * and %false is returned. In the latter case, the caller is required to
802 * wait for @base_work's completion using wb_wait_for_single_work().
803 *
804 * A clone is auto-freed on completion. @base_work never is.
805 */
806static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
807 struct wb_writeback_work *base_work)
808{
809 struct wb_writeback_work *work;
810
811 work = kmalloc(sizeof(*work), GFP_ATOMIC);
812 if (work) {
813 *work = *base_work;
814 work->auto_free = 1;
815 work->single_wait = 0;
816 } else {
817 work = base_work;
818 work->auto_free = 0;
819 work->single_wait = 1;
820 }
821 work->single_done = 0;
822 wb_queue_work(wb, work);
823 return work != base_work;
824}
825
826/**
827 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi 767 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
828 * @bdi: target backing_dev_info 768 * @bdi: target backing_dev_info
829 * @base_work: wb_writeback_work to issue 769 * @base_work: wb_writeback_work to issue
@@ -838,15 +778,19 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
838 struct wb_writeback_work *base_work, 778 struct wb_writeback_work *base_work,
839 bool skip_if_busy) 779 bool skip_if_busy)
840{ 780{
841 long nr_pages = base_work->nr_pages; 781 int next_memcg_id = 0;
842 int next_blkcg_id = 0;
843 struct bdi_writeback *wb; 782 struct bdi_writeback *wb;
844 struct wb_iter iter; 783 struct wb_iter iter;
845 784
846 might_sleep(); 785 might_sleep();
847restart: 786restart:
848 rcu_read_lock(); 787 rcu_read_lock();
849 bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) { 788 bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
789 DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
790 struct wb_writeback_work fallback_work;
791 struct wb_writeback_work *work;
792 long nr_pages;
793
850 /* SYNC_ALL writes out I_DIRTY_TIME too */ 794 /* SYNC_ALL writes out I_DIRTY_TIME too */
851 if (!wb_has_dirty_io(wb) && 795 if (!wb_has_dirty_io(wb) &&
852 (base_work->sync_mode == WB_SYNC_NONE || 796 (base_work->sync_mode == WB_SYNC_NONE ||
@@ -855,13 +799,30 @@ restart:
855 if (skip_if_busy && writeback_in_progress(wb)) 799 if (skip_if_busy && writeback_in_progress(wb))
856 continue; 800 continue;
857 801
858 base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages); 802 nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
859 if (!wb_clone_and_queue_work(wb, base_work)) { 803
860 next_blkcg_id = wb->blkcg_css->id + 1; 804 work = kmalloc(sizeof(*work), GFP_ATOMIC);
861 rcu_read_unlock(); 805 if (work) {
862 wb_wait_for_single_work(bdi, base_work); 806 *work = *base_work;
863 goto restart; 807 work->nr_pages = nr_pages;
808 work->auto_free = 1;
809 wb_queue_work(wb, work);
810 continue;
864 } 811 }
812
813 /* alloc failed, execute synchronously using on-stack fallback */
814 work = &fallback_work;
815 *work = *base_work;
816 work->nr_pages = nr_pages;
817 work->auto_free = 0;
818 work->done = &fallback_work_done;
819
820 wb_queue_work(wb, work);
821
822 next_memcg_id = wb->memcg_css->id + 1;
823 rcu_read_unlock();
824 wb_wait_for_completion(bdi, &fallback_work_done);
825 goto restart;
865 } 826 }
866 rcu_read_unlock(); 827 rcu_read_unlock();
867} 828}
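With single_wait/single_done gone, a failed clone allocation is handled by issuing an on-stack work item and waiting for it through the ordinary wb_completion mechanism. The core of the pattern, pulled out of the iteration above for clarity (a sketch; nr_pages setup and the restart of the wb walk are omitted):

        DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
        struct wb_writeback_work fallback_work, *work;

        work = kmalloc(sizeof(*work), GFP_ATOMIC);
        if (work) {
                *work = *base_work;
                work->auto_free = 1;                    /* flusher frees it on completion */
                wb_queue_work(wb, work);
        } else {
                /* allocation failed: run the on-stack copy synchronously */
                work = &fallback_work;
                *work = *base_work;
                work->auto_free = 0;
                work->done = &fallback_work_done;
                wb_queue_work(wb, work);
                wb_wait_for_completion(bdi, &fallback_work_done);
        }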
@@ -902,8 +863,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
902 863
903 if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { 864 if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
904 base_work->auto_free = 0; 865 base_work->auto_free = 0;
905 base_work->single_wait = 0;
906 base_work->single_done = 0;
907 wb_queue_work(&bdi->wb, base_work); 866 wb_queue_work(&bdi->wb, base_work);
908 } 867 }
909} 868}
@@ -924,7 +883,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
924 */ 883 */
925 work = kzalloc(sizeof(*work), GFP_ATOMIC); 884 work = kzalloc(sizeof(*work), GFP_ATOMIC);
926 if (!work) { 885 if (!work) {
927 trace_writeback_nowork(wb->bdi); 886 trace_writeback_nowork(wb);
928 wb_wakeup(wb); 887 wb_wakeup(wb);
929 return; 888 return;
930 } 889 }
@@ -954,7 +913,7 @@ void wb_start_background_writeback(struct bdi_writeback *wb)
954 * We just wake up the flusher thread. It will perform background 913 * We just wake up the flusher thread. It will perform background
955 * writeback as soon as there is no other work to do. 914 * writeback as soon as there is no other work to do.
956 */ 915 */
957 trace_writeback_wake_background(wb->bdi); 916 trace_writeback_wake_background(wb);
958 wb_wakeup(wb); 917 wb_wakeup(wb);
959} 918}
960 919
@@ -1660,14 +1619,14 @@ static long wb_writeback(struct bdi_writeback *wb,
1660 } else if (work->for_background) 1619 } else if (work->for_background)
1661 oldest_jif = jiffies; 1620 oldest_jif = jiffies;
1662 1621
1663 trace_writeback_start(wb->bdi, work); 1622 trace_writeback_start(wb, work);
1664 if (list_empty(&wb->b_io)) 1623 if (list_empty(&wb->b_io))
1665 queue_io(wb, work); 1624 queue_io(wb, work);
1666 if (work->sb) 1625 if (work->sb)
1667 progress = writeback_sb_inodes(work->sb, wb, work); 1626 progress = writeback_sb_inodes(work->sb, wb, work);
1668 else 1627 else
1669 progress = __writeback_inodes_wb(wb, work); 1628 progress = __writeback_inodes_wb(wb, work);
1670 trace_writeback_written(wb->bdi, work); 1629 trace_writeback_written(wb, work);
1671 1630
1672 wb_update_bandwidth(wb, wb_start); 1631 wb_update_bandwidth(wb, wb_start);
1673 1632
@@ -1692,7 +1651,7 @@ static long wb_writeback(struct bdi_writeback *wb,
1692 * we'll just busyloop. 1651 * we'll just busyloop.
1693 */ 1652 */
1694 if (!list_empty(&wb->b_more_io)) { 1653 if (!list_empty(&wb->b_more_io)) {
1695 trace_writeback_wait(wb->bdi, work); 1654 trace_writeback_wait(wb, work);
1696 inode = wb_inode(wb->b_more_io.prev); 1655 inode = wb_inode(wb->b_more_io.prev);
1697 spin_lock(&inode->i_lock); 1656 spin_lock(&inode->i_lock);
1698 spin_unlock(&wb->list_lock); 1657 spin_unlock(&wb->list_lock);
@@ -1797,26 +1756,14 @@ static long wb_do_writeback(struct bdi_writeback *wb)
1797 set_bit(WB_writeback_running, &wb->state); 1756 set_bit(WB_writeback_running, &wb->state);
1798 while ((work = get_next_work_item(wb)) != NULL) { 1757 while ((work = get_next_work_item(wb)) != NULL) {
1799 struct wb_completion *done = work->done; 1758 struct wb_completion *done = work->done;
1800 bool need_wake_up = false;
1801 1759
1802 trace_writeback_exec(wb->bdi, work); 1760 trace_writeback_exec(wb, work);
1803 1761
1804 wrote += wb_writeback(wb, work); 1762 wrote += wb_writeback(wb, work);
1805 1763
1806 if (work->single_wait) { 1764 if (work->auto_free)
1807 WARN_ON_ONCE(work->auto_free);
1808 /* paired w/ rmb in wb_wait_for_single_work() */
1809 smp_wmb();
1810 work->single_done = 1;
1811 need_wake_up = true;
1812 } else if (work->auto_free) {
1813 kfree(work); 1765 kfree(work);
1814 }
1815
1816 if (done && atomic_dec_and_test(&done->cnt)) 1766 if (done && atomic_dec_and_test(&done->cnt))
1817 need_wake_up = true;
1818
1819 if (need_wake_up)
1820 wake_up_all(&wb->bdi->wb_waitq); 1767 wake_up_all(&wb->bdi->wb_waitq);
1821 } 1768 }
1822 1769
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 2d48d28e1640..91e004518237 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -92,6 +92,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
92} 92}
93 93
94/** 94/**
95 * kernfs_path_len - determine the length of the full path of a given node
96 * @kn: kernfs_node of interest
97 *
98 * The returned length doesn't include the space for the terminating '\0'.
99 */
100size_t kernfs_path_len(struct kernfs_node *kn)
101{
102 size_t len = 0;
103 unsigned long flags;
104
105 spin_lock_irqsave(&kernfs_rename_lock, flags);
106
107 do {
108 len += strlen(kn->name) + 1;
109 kn = kn->parent;
110 } while (kn && kn->parent);
111
112 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
113
114 return len;
115}
116
117/**
95 * kernfs_path - build full path of a given node 118 * kernfs_path - build full path of a given node
96 * @kn: kernfs_node of interest 119 * @kn: kernfs_node of interest
97 * @buf: buffer to copy @kn's name into 120 * @buf: buffer to copy @kn's name into
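kernfs_path_len() exists so callers can size a buffer before formatting the path; the writeback trace helpers added later in this series use it exactly that way. A minimal sketch:

        size_t len = kernfs_path_len(kn);       /* excludes the trailing '\0' */
        char *buf = kmalloc(len + 1, GFP_KERNEL);

        if (buf && kernfs_path(kn, buf, len + 1))
                pr_info("kernfs path: %s\n", buf);
        kfree(buf);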
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0fe9df983ab7..5a5d79ee256f 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi
286 * %current's blkcg equals the effective blkcg of its memcg. No 286 * %current's blkcg equals the effective blkcg of its memcg. No
287 * need to use the relatively expensive cgroup_get_e_css(). 287 * need to use the relatively expensive cgroup_get_e_css().
288 */ 288 */
289 if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) 289 if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id)))
290 return wb; 290 return wb;
291 return NULL; 291 return NULL;
292} 292}
@@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
402} 402}
403 403
404struct wb_iter { 404struct wb_iter {
405 int start_blkcg_id; 405 int start_memcg_id;
406 struct radix_tree_iter tree_iter; 406 struct radix_tree_iter tree_iter;
407 void **slot; 407 void **slot;
408}; 408};
@@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
414 414
415 WARN_ON_ONCE(!rcu_read_lock_held()); 415 WARN_ON_ONCE(!rcu_read_lock_held());
416 416
417 if (iter->start_blkcg_id >= 0) { 417 if (iter->start_memcg_id >= 0) {
418 iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id); 418 iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
419 iter->start_blkcg_id = -1; 419 iter->start_memcg_id = -1;
420 } else { 420 } else {
421 iter->slot = radix_tree_next_slot(iter->slot, titer, 0); 421 iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
422 } 422 }
@@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
430 430
431static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter, 431static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
432 struct backing_dev_info *bdi, 432 struct backing_dev_info *bdi,
433 int start_blkcg_id) 433 int start_memcg_id)
434{ 434{
435 iter->start_blkcg_id = start_blkcg_id; 435 iter->start_memcg_id = start_memcg_id;
436 436
437 if (start_blkcg_id) 437 if (start_memcg_id)
438 return __wb_iter_next(iter, bdi); 438 return __wb_iter_next(iter, bdi);
439 else 439 else
440 return &bdi->wb; 440 return &bdi->wb;
441} 441}
442 442
443/** 443/**
444 * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order 444 * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
445 * @wb_cur: cursor struct bdi_writeback pointer 445 * @wb_cur: cursor struct bdi_writeback pointer
446 * @bdi: bdi to walk wb's of 446 * @bdi: bdi to walk wb's of
447 * @iter: pointer to struct wb_iter to be used as iteration buffer 447 * @iter: pointer to struct wb_iter to be used as iteration buffer
448 * @start_blkcg_id: blkcg ID to start iteration from 448 * @start_memcg_id: memcg ID to start iteration from
449 * 449 *
450 * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending 450 * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
451 * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter 451 * memcg ID order starting from @start_memcg_id. @iter is struct wb_iter
452 * to be used as temp storage during iteration. rcu_read_lock() must be 452 * to be used as temp storage during iteration. rcu_read_lock() must be
453 * held throughout iteration. 453 * held throughout iteration.
454 */ 454 */
455#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ 455#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \
456 for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \ 456 for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \
457 (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi)) 457 (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
458 458
459#else /* CONFIG_CGROUP_WRITEBACK */ 459#else /* CONFIG_CGROUP_WRITEBACK */
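Apart from the blkcg-to-memcg rename of the cursor, the iterator is used as before: under rcu_read_lock(), starting from ID 0 so the root wb is visited first. A usage sketch modelled on bdi_split_work_to_wbs() above:

        struct bdi_writeback *wb;
        struct wb_iter iter;

        rcu_read_lock();
        bdi_for_each_wb(wb, bdi, &iter, 0) {
                /* inspect or queue work for each wb; must not sleep here —
                 * bdi_split_work_to_wbs() shows the restart pattern if you need to */
        }
        rcu_read_unlock();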
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index a4cd1641e9e2..0a5cc7a1109b 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -14,12 +14,15 @@
14 */ 14 */
15 15
16#include <linux/cgroup.h> 16#include <linux/cgroup.h>
17#include <linux/u64_stats_sync.h> 17#include <linux/percpu_counter.h>
18#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <linux/radix-tree.h> 19#include <linux/radix-tree.h>
20#include <linux/blkdev.h> 20#include <linux/blkdev.h>
21#include <linux/atomic.h> 21#include <linux/atomic.h>
22 22
23/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
24#define BLKG_STAT_CPU_BATCH (INT_MAX / 2)
25
23/* Max limits for throttle policy */ 26/* Max limits for throttle policy */
24#define THROTL_IOPS_MAX UINT_MAX 27#define THROTL_IOPS_MAX UINT_MAX
25 28
@@ -45,7 +48,7 @@ struct blkcg {
45 struct blkcg_gq *blkg_hint; 48 struct blkcg_gq *blkg_hint;
46 struct hlist_head blkg_list; 49 struct hlist_head blkg_list;
47 50
48 struct blkcg_policy_data *pd[BLKCG_MAX_POLS]; 51 struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
49 52
50 struct list_head all_blkcgs_node; 53 struct list_head all_blkcgs_node;
51#ifdef CONFIG_CGROUP_WRITEBACK 54#ifdef CONFIG_CGROUP_WRITEBACK
@@ -53,14 +56,19 @@ struct blkcg {
53#endif 56#endif
54}; 57};
55 58
59/*
60 * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
61 * recursive. Used to carry stats of dead children, and, for blkg_rwstat,
62 * to carry result values from read and sum operations.
63 */
56struct blkg_stat { 64struct blkg_stat {
57 struct u64_stats_sync syncp; 65 struct percpu_counter cpu_cnt;
58 uint64_t cnt; 66 atomic64_t aux_cnt;
59}; 67};
60 68
61struct blkg_rwstat { 69struct blkg_rwstat {
62 struct u64_stats_sync syncp; 70 struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR];
63 uint64_t cnt[BLKG_RWSTAT_NR]; 71 atomic64_t aux_cnt[BLKG_RWSTAT_NR];
64}; 72};
65 73
66/* 74/*
@@ -68,32 +76,28 @@ struct blkg_rwstat {
68 * request_queue (q). This is used by blkcg policies which need to track 76 * request_queue (q). This is used by blkcg policies which need to track
69 * information per blkcg - q pair. 77 * information per blkcg - q pair.
70 * 78 *
71 * There can be multiple active blkcg policies and each has its private 79 * There can be multiple active blkcg policies and each blkg:policy pair is
72 * data on each blkg, the size of which is determined by 80 * represented by a blkg_policy_data which is allocated and freed by each
73 * blkcg_policy->pd_size. blkcg core allocates and frees such areas 81 * policy's pd_alloc/free_fn() methods. A policy can allocate private data
74 * together with blkg and invokes pd_init/exit_fn() methods. 82 * area by allocating larger data structure which embeds blkg_policy_data
75 * 83 * at the beginning.
76 * Such private data must embed struct blkg_policy_data (pd) at the
77 * beginning and pd_size can't be smaller than pd.
78 */ 84 */
79struct blkg_policy_data { 85struct blkg_policy_data {
80 /* the blkg and policy id this per-policy data belongs to */ 86 /* the blkg and policy id this per-policy data belongs to */
81 struct blkcg_gq *blkg; 87 struct blkcg_gq *blkg;
82 int plid; 88 int plid;
83
84 /* used during policy activation */
85 struct list_head alloc_node;
86}; 89};
87 90
88/* 91/*
89 * Policies that need to keep per-blkcg data which is independent 92 * Policies that need to keep per-blkcg data which is independent from any
90 * from any request_queue associated to it must specify its size 93 * request_queue associated to it should implement cpd_alloc/free_fn()
91 * with the cpd_size field of the blkcg_policy structure and 94 * methods. A policy can allocate private data area by allocating larger
92 * embed a blkcg_policy_data in it. cpd_init() is invoked to let 95 * data structure which embeds blkcg_policy_data at the beginning.
93 * each policy handle per-blkcg data. 96 * cpd_init() is invoked to let each policy handle per-blkcg data.
94 */ 97 */
95struct blkcg_policy_data { 98struct blkcg_policy_data {
96 /* the policy id this per-policy data belongs to */ 99 /* the blkcg and policy id this per-policy data belongs to */
100 struct blkcg *blkcg;
97 int plid; 101 int plid;
98}; 102};
99 103
@@ -123,40 +127,50 @@ struct blkcg_gq {
123 /* is this blkg online? protected by both blkcg and q locks */ 127 /* is this blkg online? protected by both blkcg and q locks */
124 bool online; 128 bool online;
125 129
130 struct blkg_rwstat stat_bytes;
131 struct blkg_rwstat stat_ios;
132
126 struct blkg_policy_data *pd[BLKCG_MAX_POLS]; 133 struct blkg_policy_data *pd[BLKCG_MAX_POLS];
127 134
128 struct rcu_head rcu_head; 135 struct rcu_head rcu_head;
129}; 136};
130 137
131typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); 138typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
132typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); 139typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
133typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); 140typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
134typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); 141typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
135typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); 142typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
136typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); 143typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
144typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
145typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
146typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
147typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
137 148
138struct blkcg_policy { 149struct blkcg_policy {
139 int plid; 150 int plid;
140 /* policy specific private data size */
141 size_t pd_size;
142 /* policy specific per-blkcg data size */
143 size_t cpd_size;
144 /* cgroup files for the policy */ 151 /* cgroup files for the policy */
145 struct cftype *cftypes; 152 struct cftype *dfl_cftypes;
153 struct cftype *legacy_cftypes;
146 154
147 /* operations */ 155 /* operations */
156 blkcg_pol_alloc_cpd_fn *cpd_alloc_fn;
148 blkcg_pol_init_cpd_fn *cpd_init_fn; 157 blkcg_pol_init_cpd_fn *cpd_init_fn;
158 blkcg_pol_free_cpd_fn *cpd_free_fn;
159 blkcg_pol_bind_cpd_fn *cpd_bind_fn;
160
161 blkcg_pol_alloc_pd_fn *pd_alloc_fn;
149 blkcg_pol_init_pd_fn *pd_init_fn; 162 blkcg_pol_init_pd_fn *pd_init_fn;
150 blkcg_pol_online_pd_fn *pd_online_fn; 163 blkcg_pol_online_pd_fn *pd_online_fn;
151 blkcg_pol_offline_pd_fn *pd_offline_fn; 164 blkcg_pol_offline_pd_fn *pd_offline_fn;
152 blkcg_pol_exit_pd_fn *pd_exit_fn; 165 blkcg_pol_free_pd_fn *pd_free_fn;
153 blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; 166 blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
154}; 167};
155 168
156extern struct blkcg blkcg_root; 169extern struct blkcg blkcg_root;
157extern struct cgroup_subsys_state * const blkcg_root_css; 170extern struct cgroup_subsys_state * const blkcg_root_css;
158 171
159struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); 172struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
173 struct request_queue *q, bool update_hint);
160struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 174struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
161 struct request_queue *q); 175 struct request_queue *q);
162int blkcg_init_queue(struct request_queue *q); 176int blkcg_init_queue(struct request_queue *q);
@@ -171,6 +185,7 @@ int blkcg_activate_policy(struct request_queue *q,
171void blkcg_deactivate_policy(struct request_queue *q, 185void blkcg_deactivate_policy(struct request_queue *q,
172 const struct blkcg_policy *pol); 186 const struct blkcg_policy *pol);
173 187
188const char *blkg_dev_name(struct blkcg_gq *blkg);
174void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, 189void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
175 u64 (*prfill)(struct seq_file *, 190 u64 (*prfill)(struct seq_file *,
176 struct blkg_policy_data *, int), 191 struct blkg_policy_data *, int),
@@ -182,19 +197,24 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
182u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); 197u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
183u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, 198u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
184 int off); 199 int off);
200int blkg_print_stat_bytes(struct seq_file *sf, void *v);
201int blkg_print_stat_ios(struct seq_file *sf, void *v);
202int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
203int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
185 204
186u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); 205u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
187struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, 206 struct blkcg_policy *pol, int off);
188 int off); 207struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
208 struct blkcg_policy *pol, int off);
189 209
190struct blkg_conf_ctx { 210struct blkg_conf_ctx {
191 struct gendisk *disk; 211 struct gendisk *disk;
192 struct blkcg_gq *blkg; 212 struct blkcg_gq *blkg;
193 u64 v; 213 char *body;
194}; 214};
195 215
196int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, 216int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
197 const char *input, struct blkg_conf_ctx *ctx); 217 char *input, struct blkg_conf_ctx *ctx);
198void blkg_conf_finish(struct blkg_conf_ctx *ctx); 218void blkg_conf_finish(struct blkg_conf_ctx *ctx);
199 219
200 220
@@ -205,7 +225,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
205 225
206static inline struct blkcg *task_blkcg(struct task_struct *tsk) 226static inline struct blkcg *task_blkcg(struct task_struct *tsk)
207{ 227{
208 return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); 228 return css_to_blkcg(task_css(tsk, io_cgrp_id));
209} 229}
210 230
211static inline struct blkcg *bio_blkcg(struct bio *bio) 231static inline struct blkcg *bio_blkcg(struct bio *bio)
@@ -218,7 +238,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
218static inline struct cgroup_subsys_state * 238static inline struct cgroup_subsys_state *
219task_get_blkcg_css(struct task_struct *task) 239task_get_blkcg_css(struct task_struct *task)
220{ 240{
221 return task_get_css(task, blkio_cgrp_id); 241 return task_get_css(task, io_cgrp_id);
222} 242}
223 243
224/** 244/**
@@ -233,6 +253,52 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
233} 253}
234 254
235/** 255/**
256 * __blkg_lookup - internal version of blkg_lookup()
257 * @blkcg: blkcg of interest
258 * @q: request_queue of interest
259 * @update_hint: whether to update lookup hint with the result or not
260 *
261 * This is internal version and shouldn't be used by policy
262 * implementations. Looks up blkgs for the @blkcg - @q pair regardless of
263 * @q's bypass state. If @update_hint is %true, the caller should be
264 * holding @q->queue_lock and lookup hint is updated on success.
265 */
266static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
267 struct request_queue *q,
268 bool update_hint)
269{
270 struct blkcg_gq *blkg;
271
272 if (blkcg == &blkcg_root)
273 return q->root_blkg;
274
275 blkg = rcu_dereference(blkcg->blkg_hint);
276 if (blkg && blkg->q == q)
277 return blkg;
278
279 return blkg_lookup_slowpath(blkcg, q, update_hint);
280}
281
282/**
283 * blkg_lookup - lookup blkg for the specified blkcg - q pair
284 * @blkcg: blkcg of interest
285 * @q: request_queue of interest
286 *
287 * Lookup blkg for the @blkcg - @q pair. This function should be called
288 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
289 * - see blk_queue_bypass_start() for details.
290 */
291static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
292 struct request_queue *q)
293{
294 WARN_ON_ONCE(!rcu_read_lock_held());
295
296 if (unlikely(blk_queue_bypass(q)))
297 return NULL;
298 return __blkg_lookup(blkcg, q, false);
299}
300
301/**
236 * blkg_to_pdata - get policy private data 302 * blkg_to_pdata - get policy private data
237 * @blkg: blkg of interest 303 * @blkg: blkg of interest
238 * @pol: policy of interest 304 * @pol: policy of interest
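blkg_lookup() is now an inline fast path that checks the hint locally and only falls into blkg_lookup_slowpath() on a miss. Callers hold the RCU read lock and, if they need the blkg to exist, create it under the queue lock, as blkcg_bio_issue_check() does further down. A condensed sketch of that pattern:

        struct blkcg_gq *blkg;

        rcu_read_lock();
        blkg = blkg_lookup(blkcg, q);           /* NULL on miss or while q is bypassing */
        if (!blkg) {
                spin_lock_irq(q->queue_lock);
                blkg = blkg_lookup_create(blkcg, q);
                if (IS_ERR(blkg))
                        blkg = NULL;            /* fall back to q->root_blkg if needed */
                spin_unlock_irq(q->queue_lock);
        }
        rcu_read_unlock();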
@@ -248,7 +314,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
248static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, 314static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
249 struct blkcg_policy *pol) 315 struct blkcg_policy *pol)
250{ 316{
251 return blkcg ? blkcg->pd[pol->plid] : NULL; 317 return blkcg ? blkcg->cpd[pol->plid] : NULL;
252} 318}
253 319
254/** 320/**
@@ -262,6 +328,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
262 return pd ? pd->blkg : NULL; 328 return pd ? pd->blkg : NULL;
263} 329}
264 330
331static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
332{
333 return cpd ? cpd->blkcg : NULL;
334}
335
265/** 336/**
266 * blkg_path - format cgroup path of blkg 337 * blkg_path - format cgroup path of blkg
267 * @blkg: blkg of interest 338 * @blkg: blkg of interest
@@ -309,9 +380,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
309 call_rcu(&blkg->rcu_head, __blkg_release_rcu); 380 call_rcu(&blkg->rcu_head, __blkg_release_rcu);
310} 381}
311 382
312struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
313 bool update_hint);
314
315/** 383/**
316 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants 384 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
317 * @d_blkg: loop cursor pointing to the current descendant 385 * @d_blkg: loop cursor pointing to the current descendant
@@ -373,8 +441,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q,
373 * or if either the blkcg or queue is going away. Fall back to 441 * or if either the blkcg or queue is going away. Fall back to
374 * root_rl in such cases. 442 * root_rl in such cases.
375 */ 443 */
376 blkg = blkg_lookup_create(blkcg, q); 444 blkg = blkg_lookup(blkcg, q);
377 if (IS_ERR(blkg)) 445 if (unlikely(!blkg))
378 goto root_rl; 446 goto root_rl;
379 447
380 blkg_get(blkg); 448 blkg_get(blkg);
@@ -394,8 +462,7 @@ root_rl:
394 */ 462 */
395static inline void blk_put_rl(struct request_list *rl) 463static inline void blk_put_rl(struct request_list *rl)
396{ 464{
397 /* root_rl may not have blkg set */ 465 if (rl->blkg->blkcg != &blkcg_root)
398 if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
399 blkg_put(rl->blkg); 466 blkg_put(rl->blkg);
400} 467}
401 468
@@ -433,9 +500,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
433#define blk_queue_for_each_rl(rl, q) \ 500#define blk_queue_for_each_rl(rl, q) \
434 for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) 501 for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
435 502
436static inline void blkg_stat_init(struct blkg_stat *stat) 503static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
437{ 504{
438 u64_stats_init(&stat->syncp); 505 int ret;
506
507 ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
508 if (ret)
509 return ret;
510
511 atomic64_set(&stat->aux_cnt, 0);
512 return 0;
513}
514
515static inline void blkg_stat_exit(struct blkg_stat *stat)
516{
517 percpu_counter_destroy(&stat->cpu_cnt);
439} 518}
440 519
441/** 520/**
@@ -443,34 +522,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat)
443 * @stat: target blkg_stat 522 * @stat: target blkg_stat
444 * @val: value to add 523 * @val: value to add
445 * 524 *
446 * Add @val to @stat. The caller is responsible for synchronizing calls to 525 * Add @val to @stat. The caller must ensure that IRQ on the same CPU
447 * this function. 526 * don't re-enter this function for the same counter.
448 */ 527 */
449static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) 528static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
450{ 529{
451 u64_stats_update_begin(&stat->syncp); 530 __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
452 stat->cnt += val;
453 u64_stats_update_end(&stat->syncp);
454} 531}
455 532
456/** 533/**
457 * blkg_stat_read - read the current value of a blkg_stat 534 * blkg_stat_read - read the current value of a blkg_stat
458 * @stat: blkg_stat to read 535 * @stat: blkg_stat to read
459 *
460 * Read the current value of @stat. This function can be called without
461 * synchronization and takes care of u64 atomicity.
462 */ 536 */
463static inline uint64_t blkg_stat_read(struct blkg_stat *stat) 537static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
464{ 538{
465 unsigned int start; 539 return percpu_counter_sum_positive(&stat->cpu_cnt);
466 uint64_t v;
467
468 do {
469 start = u64_stats_fetch_begin_irq(&stat->syncp);
470 v = stat->cnt;
471 } while (u64_stats_fetch_retry_irq(&stat->syncp, start));
472
473 return v;
474} 540}
475 541
476/** 542/**
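Because blkg_stat is now backed by a percpu_counter, initialization can fail and a matching blkg_stat_exit() is required; reads sum the per-CPU parts. A minimal lifecycle sketch for a hypothetical counter:

        struct blkg_stat st;

        if (blkg_stat_init(&st, GFP_KERNEL))            /* percpu allocation may fail */
                return -ENOMEM;

        blkg_stat_add(&st, 4096);                       /* cheap, batched per-cpu add */
        pr_info("count=%llu\n", (unsigned long long)blkg_stat_read(&st));

        blkg_stat_exit(&st);                            /* destroys the percpu counter */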
@@ -479,24 +545,46 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
479 */ 545 */
480static inline void blkg_stat_reset(struct blkg_stat *stat) 546static inline void blkg_stat_reset(struct blkg_stat *stat)
481{ 547{
482 stat->cnt = 0; 548 percpu_counter_set(&stat->cpu_cnt, 0);
549 atomic64_set(&stat->aux_cnt, 0);
483} 550}
484 551
485/** 552/**
486 * blkg_stat_merge - merge a blkg_stat into another 553 * blkg_stat_add_aux - add a blkg_stat into another's aux count
487 * @to: the destination blkg_stat 554 * @to: the destination blkg_stat
488 * @from: the source 555 * @from: the source
489 * 556 *
490 * Add @from's count to @to. 557 * Add @from's count including the aux one to @to's aux count.
491 */ 558 */
492static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) 559static inline void blkg_stat_add_aux(struct blkg_stat *to,
560 struct blkg_stat *from)
493{ 561{
494 blkg_stat_add(to, blkg_stat_read(from)); 562 atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt),
563 &to->aux_cnt);
495} 564}
496 565
497static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) 566static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
498{ 567{
499 u64_stats_init(&rwstat->syncp); 568 int i, ret;
569
570 for (i = 0; i < BLKG_RWSTAT_NR; i++) {
571 ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
572 if (ret) {
573 while (--i >= 0)
574 percpu_counter_destroy(&rwstat->cpu_cnt[i]);
575 return ret;
576 }
577 atomic64_set(&rwstat->aux_cnt[i], 0);
578 }
579 return 0;
580}
581
582static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
583{
584 int i;
585
586 for (i = 0; i < BLKG_RWSTAT_NR; i++)
587 percpu_counter_destroy(&rwstat->cpu_cnt[i]);
500} 588}
501 589
502/** 590/**
@@ -511,39 +599,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
511static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, 599static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
512 int rw, uint64_t val) 600 int rw, uint64_t val)
513{ 601{
514 u64_stats_update_begin(&rwstat->syncp); 602 struct percpu_counter *cnt;
515 603
516 if (rw & REQ_WRITE) 604 if (rw & REQ_WRITE)
517 rwstat->cnt[BLKG_RWSTAT_WRITE] += val; 605 cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
518 else 606 else
519 rwstat->cnt[BLKG_RWSTAT_READ] += val; 607 cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
608
609 __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
610
520 if (rw & REQ_SYNC) 611 if (rw & REQ_SYNC)
521 rwstat->cnt[BLKG_RWSTAT_SYNC] += val; 612 cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
522 else 613 else
523 rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; 614 cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
524 615
525 u64_stats_update_end(&rwstat->syncp); 616 __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
526} 617}
527 618
528/** 619/**
529 * blkg_rwstat_read - read the current values of a blkg_rwstat 620 * blkg_rwstat_read - read the current values of a blkg_rwstat
530 * @rwstat: blkg_rwstat to read 621 * @rwstat: blkg_rwstat to read
531 * 622 *
532 * Read the current snapshot of @rwstat and return it as the return value. 623 * Read the current snapshot of @rwstat and return it in the aux counts.
533 * This function can be called without synchronization and takes care of
534 * u64 atomicity.
535 */ 624 */
536static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) 625static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
537{ 626{
538 unsigned int start; 627 struct blkg_rwstat result;
539 struct blkg_rwstat tmp; 628 int i;
540
541 do {
542 start = u64_stats_fetch_begin_irq(&rwstat->syncp);
543 tmp = *rwstat;
544 } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
545 629
546 return tmp; 630 for (i = 0; i < BLKG_RWSTAT_NR; i++)
631 atomic64_set(&result.aux_cnt[i],
632 percpu_counter_sum_positive(&rwstat->cpu_cnt[i]));
633 return result;
547} 634}
548 635
549/** 636/**
@@ -558,7 +645,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
558{ 645{
559 struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); 646 struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
560 647
561 return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; 648 return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
649 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
562} 650}
563 651
564/** 652/**
@@ -567,26 +655,71 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
567 */ 655 */
568static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) 656static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
569{ 657{
570 memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); 658 int i;
659
660 for (i = 0; i < BLKG_RWSTAT_NR; i++) {
661 percpu_counter_set(&rwstat->cpu_cnt[i], 0);
662 atomic64_set(&rwstat->aux_cnt[i], 0);
663 }
571} 664}
572 665
573/** 666/**
574 * blkg_rwstat_merge - merge a blkg_rwstat into another 667 * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
575 * @to: the destination blkg_rwstat 668 * @to: the destination blkg_rwstat
576 * @from: the source 669 * @from: the source
577 * 670 *
578 * Add @from's counts to @to. 671 * Add @from's count including the aux one to @to's aux count.
579 */ 672 */
580static inline void blkg_rwstat_merge(struct blkg_rwstat *to, 673static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
581 struct blkg_rwstat *from) 674 struct blkg_rwstat *from)
582{ 675{
583 struct blkg_rwstat v = blkg_rwstat_read(from); 676 struct blkg_rwstat v = blkg_rwstat_read(from);
584 int i; 677 int i;
585 678
586 u64_stats_update_begin(&to->syncp);
587 for (i = 0; i < BLKG_RWSTAT_NR; i++) 679 for (i = 0; i < BLKG_RWSTAT_NR; i++)
588 to->cnt[i] += v.cnt[i]; 680 atomic64_add(atomic64_read(&v.aux_cnt[i]) +
589 u64_stats_update_end(&to->syncp); 681 atomic64_read(&from->aux_cnt[i]),
682 &to->aux_cnt[i]);
683}
684
685#ifdef CONFIG_BLK_DEV_THROTTLING
686extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
687 struct bio *bio);
688#else
689static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
690 struct bio *bio) { return false; }
691#endif
692
693static inline bool blkcg_bio_issue_check(struct request_queue *q,
694 struct bio *bio)
695{
696 struct blkcg *blkcg;
697 struct blkcg_gq *blkg;
698 bool throtl = false;
699
700 rcu_read_lock();
701 blkcg = bio_blkcg(bio);
702
703 blkg = blkg_lookup(blkcg, q);
704 if (unlikely(!blkg)) {
705 spin_lock_irq(q->queue_lock);
706 blkg = blkg_lookup_create(blkcg, q);
707 if (IS_ERR(blkg))
708 blkg = NULL;
709 spin_unlock_irq(q->queue_lock);
710 }
711
712 throtl = blk_throtl_bio(q, blkg, bio);
713
714 if (!throtl) {
715 blkg = blkg ?: q->root_blkg;
716 blkg_rwstat_add(&blkg->stat_bytes, bio->bi_flags,
717 bio->bi_iter.bi_size);
718 blkg_rwstat_add(&blkg->stat_ios, bio->bi_flags, 1);
719 }
720
721 rcu_read_unlock();
722 return !throtl;
590} 723}
591 724
592#else /* CONFIG_BLK_CGROUP */ 725#else /* CONFIG_BLK_CGROUP */
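blkcg_bio_issue_check() consolidates blkg creation, blk-throttle, and the new per-blkg byte/IO counters at bio issue time. Presumably the submission path simply gates on its return value, roughly like this (a sketch; the actual call site is outside this hunk):

        /* illustration of a caller in the bio submission path */
        if (!blkcg_bio_issue_check(q, bio))
                return;         /* bio was held by blk-throttle; nothing to issue now */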
@@ -642,6 +775,9 @@ static inline void blk_put_rl(struct request_list *rl) { }
642static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } 775static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
643static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } 776static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
644 777
778static inline bool blkcg_bio_issue_check(struct request_queue *q,
779 struct bio *bio) { return true; }
780
645#define blk_queue_for_each_rl(rl, q) \ 781#define blk_queue_for_each_rl(rl, q) \
646 for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) 782 for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
647 783
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 1f36945fd23d..1a96fdaa33d5 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -27,7 +27,7 @@ SUBSYS(cpuacct)
27#endif 27#endif
28 28
29#if IS_ENABLED(CONFIG_BLK_CGROUP) 29#if IS_ENABLED(CONFIG_BLK_CGROUP)
30SUBSYS(blkio) 30SUBSYS(io)
31#endif 31#endif
32 32
33#if IS_ENABLED(CONFIG_MEMCG) 33#if IS_ENABLED(CONFIG_MEMCG)
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 123be25ea15a..5d4e9c4b821d 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
266} 266}
267 267
268int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); 268int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
269size_t kernfs_path_len(struct kernfs_node *kn);
269char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, 270char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
270 size_t buflen); 271 size_t buflen);
271void pr_cont_kernfs_name(struct kernfs_node *kn); 272void pr_cont_kernfs_name(struct kernfs_node *kn);
@@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
332static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) 333static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
333{ return -ENOSYS; } 334{ return -ENOSYS; }
334 335
336static inline size_t kernfs_path_len(struct kernfs_node *kn)
337{ return 0; }
338
335static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, 339static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
336 size_t buflen) 340 size_t buflen)
337{ return NULL; } 341{ return NULL; }
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index a7aa607a4c55..fff846b512e6 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -131,6 +131,66 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
131 TP_ARGS(inode, flags) 131 TP_ARGS(inode, flags)
132); 132);
133 133
134#ifdef CREATE_TRACE_POINTS
135#ifdef CONFIG_CGROUP_WRITEBACK
136
137static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
138{
139 return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1;
140}
141
142static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
143{
144 struct cgroup *cgrp = wb->memcg_css->cgroup;
145 char *path;
146
147 path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1);
148 WARN_ON_ONCE(path != buf);
149}
150
151static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
152{
153 if (wbc->wb)
154 return __trace_wb_cgroup_size(wbc->wb);
155 else
156 return 2;
157}
158
159static inline void __trace_wbc_assign_cgroup(char *buf,
160 struct writeback_control *wbc)
161{
162 if (wbc->wb)
163 __trace_wb_assign_cgroup(buf, wbc->wb);
164 else
165 strcpy(buf, "/");
166}
167
168#else /* CONFIG_CGROUP_WRITEBACK */
169
170static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
171{
172 return 2;
173}
174
175static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
176{
177 strcpy(buf, "/");
178}
179
180static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
181{
182 return 2;
183}
184
185static inline void __trace_wbc_assign_cgroup(char *buf,
186 struct writeback_control *wbc)
187{
188 strcpy(buf, "/");
189}
190
191#endif /* CONFIG_CGROUP_WRITEBACK */
192#endif /* CREATE_TRACE_POINTS */
193
134DECLARE_EVENT_CLASS(writeback_write_inode_template, 194DECLARE_EVENT_CLASS(writeback_write_inode_template,
135 195
136 TP_PROTO(struct inode *inode, struct writeback_control *wbc), 196 TP_PROTO(struct inode *inode, struct writeback_control *wbc),
@@ -141,6 +201,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
141 __array(char, name, 32) 201 __array(char, name, 32)
142 __field(unsigned long, ino) 202 __field(unsigned long, ino)
143 __field(int, sync_mode) 203 __field(int, sync_mode)
204 __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
144 ), 205 ),
145 206
146 TP_fast_assign( 207 TP_fast_assign(
@@ -148,12 +209,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
148 dev_name(inode_to_bdi(inode)->dev), 32); 209 dev_name(inode_to_bdi(inode)->dev), 32);
149 __entry->ino = inode->i_ino; 210 __entry->ino = inode->i_ino;
150 __entry->sync_mode = wbc->sync_mode; 211 __entry->sync_mode = wbc->sync_mode;
212 __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
151 ), 213 ),
152 214
153 TP_printk("bdi %s: ino=%lu sync_mode=%d", 215 TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
154 __entry->name, 216 __entry->name,
155 __entry->ino, 217 __entry->ino,
156 __entry->sync_mode 218 __entry->sync_mode,
219 __get_str(cgroup)
157 ) 220 )
158); 221);
159 222
@@ -172,8 +235,8 @@ DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,
172); 235);
173 236
174DECLARE_EVENT_CLASS(writeback_work_class, 237DECLARE_EVENT_CLASS(writeback_work_class,
175 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), 238 TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
176 TP_ARGS(bdi, work), 239 TP_ARGS(wb, work),
177 TP_STRUCT__entry( 240 TP_STRUCT__entry(
178 __array(char, name, 32) 241 __array(char, name, 32)
179 __field(long, nr_pages) 242 __field(long, nr_pages)
@@ -183,10 +246,11 @@ DECLARE_EVENT_CLASS(writeback_work_class,
183 __field(int, range_cyclic) 246 __field(int, range_cyclic)
184 __field(int, for_background) 247 __field(int, for_background)
185 __field(int, reason) 248 __field(int, reason)
249 __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
186 ), 250 ),
187 TP_fast_assign( 251 TP_fast_assign(
188 strncpy(__entry->name, 252 strncpy(__entry->name,
189 bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); 253 wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32);
190 __entry->nr_pages = work->nr_pages; 254 __entry->nr_pages = work->nr_pages;
191 __entry->sb_dev = work->sb ? work->sb->s_dev : 0; 255 __entry->sb_dev = work->sb ? work->sb->s_dev : 0;
192 __entry->sync_mode = work->sync_mode; 256 __entry->sync_mode = work->sync_mode;
@@ -194,9 +258,10 @@ DECLARE_EVENT_CLASS(writeback_work_class,
194 __entry->range_cyclic = work->range_cyclic; 258 __entry->range_cyclic = work->range_cyclic;
195 __entry->for_background = work->for_background; 259 __entry->for_background = work->for_background;
196 __entry->reason = work->reason; 260 __entry->reason = work->reason;
261 __trace_wb_assign_cgroup(__get_str(cgroup), wb);
197 ), 262 ),
198 TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " 263 TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
199 "kupdate=%d range_cyclic=%d background=%d reason=%s", 264 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
200 __entry->name, 265 __entry->name,
201 MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), 266 MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
202 __entry->nr_pages, 267 __entry->nr_pages,
@@ -204,13 +269,14 @@ DECLARE_EVENT_CLASS(writeback_work_class,
204 __entry->for_kupdate, 269 __entry->for_kupdate,
205 __entry->range_cyclic, 270 __entry->range_cyclic,
206 __entry->for_background, 271 __entry->for_background,
207 __print_symbolic(__entry->reason, WB_WORK_REASON) 272 __print_symbolic(__entry->reason, WB_WORK_REASON),
273 __get_str(cgroup)
208 ) 274 )
209); 275);
210#define DEFINE_WRITEBACK_WORK_EVENT(name) \ 276#define DEFINE_WRITEBACK_WORK_EVENT(name) \
211DEFINE_EVENT(writeback_work_class, name, \ 277DEFINE_EVENT(writeback_work_class, name, \
212 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ 278 TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
213 TP_ARGS(bdi, work)) 279 TP_ARGS(wb, work))
214DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); 280DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
215DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); 281DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
216DEFINE_WRITEBACK_WORK_EVENT(writeback_start); 282DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
@@ -230,26 +296,42 @@ TRACE_EVENT(writeback_pages_written,
 );
 
 DECLARE_EVENT_CLASS(writeback_class,
-	TP_PROTO(struct backing_dev_info *bdi),
-	TP_ARGS(bdi),
+	TP_PROTO(struct bdi_writeback *wb),
+	TP_ARGS(wb),
 	TP_STRUCT__entry(
 		__array(char, name, 32)
+		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
 	),
 	TP_fast_assign(
-		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
 	),
-	TP_printk("bdi %s",
-		  __entry->name
+	TP_printk("bdi %s: cgroup=%s",
+		  __entry->name,
+		  __get_str(cgroup)
 	)
 );
 #define DEFINE_WRITEBACK_EVENT(name) \
 DEFINE_EVENT(writeback_class, name, \
-	TP_PROTO(struct backing_dev_info *bdi), \
-	TP_ARGS(bdi))
+	TP_PROTO(struct bdi_writeback *wb), \
+	TP_ARGS(wb))
 
 DEFINE_WRITEBACK_EVENT(writeback_nowork);
 DEFINE_WRITEBACK_EVENT(writeback_wake_background);
-DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
+
+TRACE_EVENT(writeback_bdi_register,
+	TP_PROTO(struct backing_dev_info *bdi),
+	TP_ARGS(bdi),
+	TP_STRUCT__entry(
+		__array(char, name, 32)
+	),
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 32);
+	),
+	TP_printk("bdi %s",
+		__entry->name
+	)
+);
 
 DECLARE_EVENT_CLASS(wbc_class,
 	TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
@@ -265,6 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__field(int, range_cyclic)
 		__field(long, range_start)
 		__field(long, range_end)
+		__dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
 	),
 
 	TP_fast_assign(
@@ -278,11 +361,12 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->range_cyclic = wbc->range_cyclic;
 		__entry->range_start = (long)wbc->range_start;
 		__entry->range_end = (long)wbc->range_end;
+		__trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
 	),
 
 	TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
 		"bgrd=%d reclm=%d cyclic=%d "
-		"start=0x%lx end=0x%lx",
+		"start=0x%lx end=0x%lx cgroup=%s",
 		__entry->name,
 		__entry->nr_to_write,
 		__entry->pages_skipped,
@@ -292,7 +376,9 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->for_reclaim,
 		__entry->range_cyclic,
 		__entry->range_start,
-		__entry->range_end)
+		__entry->range_end,
+		__get_str(cgroup)
+	)
 )
 
 #define DEFINE_WBC_EVENT(name) \
@@ -312,6 +398,7 @@ TRACE_EVENT(writeback_queue_io,
 		__field(long, age)
 		__field(int, moved)
 		__field(int, reason)
+		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
 	),
 	TP_fast_assign(
 		unsigned long *older_than_this = work->older_than_this;
@@ -321,13 +408,15 @@ TRACE_EVENT(writeback_queue_io,
 				(jiffies - *older_than_this) * 1000 / HZ : -1;
 		__entry->moved = moved;
 		__entry->reason = work->reason;
+		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
 	),
-	TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s",
+	TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
 		__entry->name,
 		__entry->older,	/* older_than_this in jiffies */
 		__entry->age,	/* older_than_this in relative milliseconds */
 		__entry->moved,
-		__print_symbolic(__entry->reason, WB_WORK_REASON)
+		__print_symbolic(__entry->reason, WB_WORK_REASON),
+		__get_str(cgroup)
 	)
 );
 
@@ -381,11 +470,11 @@ TRACE_EVENT(global_dirty_state,
 
 TRACE_EVENT(bdi_dirty_ratelimit,
 
-	TP_PROTO(struct backing_dev_info *bdi,
+	TP_PROTO(struct bdi_writeback *wb,
 		unsigned long dirty_rate,
 		unsigned long task_ratelimit),
 
-	TP_ARGS(bdi, dirty_rate, task_ratelimit),
+	TP_ARGS(wb, dirty_rate, task_ratelimit),
 
 	TP_STRUCT__entry(
 		__array(char, bdi, 32)
@@ -395,36 +484,39 @@ TRACE_EVENT(bdi_dirty_ratelimit,
 		__field(unsigned long, dirty_ratelimit)
 		__field(unsigned long, task_ratelimit)
 		__field(unsigned long, balanced_dirty_ratelimit)
+		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
 	),
 
 	TP_fast_assign(
-		strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
-		__entry->write_bw = KBps(bdi->wb.write_bandwidth);
-		__entry->avg_write_bw = KBps(bdi->wb.avg_write_bandwidth);
+		strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
+		__entry->write_bw = KBps(wb->write_bandwidth);
+		__entry->avg_write_bw = KBps(wb->avg_write_bandwidth);
 		__entry->dirty_rate = KBps(dirty_rate);
-		__entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
+		__entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
 		__entry->task_ratelimit = KBps(task_ratelimit);
 		__entry->balanced_dirty_ratelimit =
-					KBps(bdi->wb.balanced_dirty_ratelimit);
+					KBps(wb->balanced_dirty_ratelimit);
+		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
 	),
 
 	TP_printk("bdi %s: "
 		  "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
 		  "dirty_ratelimit=%lu task_ratelimit=%lu "
-		  "balanced_dirty_ratelimit=%lu",
+		  "balanced_dirty_ratelimit=%lu cgroup=%s",
 		  __entry->bdi,
 		  __entry->write_bw,		/* write bandwidth */
 		  __entry->avg_write_bw,	/* avg write bandwidth */
 		  __entry->dirty_rate,		/* bdi dirty rate */
 		  __entry->dirty_ratelimit,	/* base ratelimit */
 		  __entry->task_ratelimit,	/* ratelimit with position control */
-		  __entry->balanced_dirty_ratelimit /* the balanced ratelimit */
+		  __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
+		  __get_str(cgroup)
 	)
 );
 
 TRACE_EVENT(balance_dirty_pages,
 
-	TP_PROTO(struct backing_dev_info *bdi,
+	TP_PROTO(struct bdi_writeback *wb,
 		unsigned long thresh,
 		unsigned long bg_thresh,
 		unsigned long dirty,
@@ -437,7 +529,7 @@ TRACE_EVENT(balance_dirty_pages,
 		long pause,
 		unsigned long start_time),
 
-	TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
+	TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
 		dirty_ratelimit, task_ratelimit,
 		dirtied, period, pause, start_time),
 
@@ -456,11 +548,12 @@ TRACE_EVENT(balance_dirty_pages,
 		__field( long, pause)
 		__field(unsigned long, period)
 		__field( long, think)
+		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
 	),
 
 	TP_fast_assign(
 		unsigned long freerun = (thresh + bg_thresh) / 2;
-		strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
+		strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
 
 		__entry->limit = global_wb_domain.dirty_limit;
 		__entry->setpoint = (global_wb_domain.dirty_limit +
@@ -478,6 +571,7 @@ TRACE_EVENT(balance_dirty_pages,
 		__entry->period = period * 1000 / HZ;
 		__entry->pause = pause * 1000 / HZ;
 		__entry->paused = (jiffies - start_time) * 1000 / HZ;
+		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
 	),
 
 
@@ -486,7 +580,7 @@ TRACE_EVENT(balance_dirty_pages,
 		  "bdi_setpoint=%lu bdi_dirty=%lu "
 		  "dirty_ratelimit=%lu task_ratelimit=%lu "
 		  "dirtied=%u dirtied_pause=%u "
-		  "paused=%lu pause=%ld period=%lu think=%ld",
+		  "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
 		  __entry->bdi,
 		  __entry->limit,
 		  __entry->setpoint,
@@ -500,7 +594,8 @@ TRACE_EVENT(balance_dirty_pages,
 		  __entry->paused,	/* ms */
 		  __entry->pause,	/* ms */
 		  __entry->period,	/* ms */
-		  __entry->think	/* ms */
+		  __entry->think,	/* ms */
+		  __get_str(cgroup)
 	)
 );
 
@@ -514,6 +609,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
 		__field(unsigned long, ino)
 		__field(unsigned long, state)
 		__field(unsigned long, dirtied_when)
+		__dynamic_array(char, cgroup,
+				__trace_wb_cgroup_size(inode_to_wb(inode)))
 	),
 
 	TP_fast_assign(
@@ -522,14 +619,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
 		__entry->ino = inode->i_ino;
 		__entry->state = inode->i_state;
 		__entry->dirtied_when = inode->dirtied_when;
+		__trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
 	),
 
-	TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu",
+	TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
 		  __entry->name,
 		  __entry->ino,
 		  show_inode_state(__entry->state),
 		  __entry->dirtied_when,
-		  (jiffies - __entry->dirtied_when) / HZ
+		  (jiffies - __entry->dirtied_when) / HZ,
+		  __get_str(cgroup)
 	)
 );
 
@@ -585,6 +684,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
 		__field(unsigned long, writeback_index)
 		__field(long, nr_to_write)
 		__field(unsigned long, wrote)
+		__dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
 	),
 
 	TP_fast_assign(
@@ -596,10 +696,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
 		__entry->writeback_index = inode->i_mapping->writeback_index;
 		__entry->nr_to_write = nr_to_write;
 		__entry->wrote = nr_to_write - wbc->nr_to_write;
+		__trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
 	),
 
 	TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
-		  "index=%lu to_write=%ld wrote=%lu",
+		  "index=%lu to_write=%ld wrote=%lu cgroup=%s",
 		  __entry->name,
 		  __entry->ino,
 		  show_inode_state(__entry->state),
@@ -607,7 +708,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
 		  (jiffies - __entry->dirtied_when) / HZ,
 		  __entry->writeback_index,
 		  __entry->nr_to_write,
-		  __entry->wrote
+		  __entry->wrote,
+		  __get_str(cgroup)
 	)
 );
 
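[Note] Every writeback.h hunk above applies the same three-step pattern: reserve a variable-length string in the trace entry with __dynamic_array(), fill it in TP_fast_assign() through the __trace_wb_assign_cgroup()/__trace_wbc_assign_cgroup() helpers (introduced earlier in this header, outside this excerpt), and emit it from TP_printk() via __get_str(). As a reference, here is a minimal sketch of that pattern; the event name writeback_cgroup_example is hypothetical and not part of the patch:

TRACE_EVENT(writeback_cgroup_example,
	TP_PROTO(struct bdi_writeback *wb),
	TP_ARGS(wb),
	TP_STRUCT__entry(
		__array(char, name, 32)
		/* dynamic string, sized from the wb's cgroup path length */
		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
	),
	TP_fast_assign(
		strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
		/* copy the cgroup path into the reserved slot */
		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
	),
	TP_printk("bdi %s: cgroup=%s",
		  __entry->name,
		  __get_str(cgroup)
	)
);

The wbc-based events follow the same shape through the __trace_wbc_*() variants, so every converted tracepoint now carries a cgroup= field that attributes the writeback activity to a cgroup when the trace is read.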
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index ee8d7fd07be3..2df8ddcb0ca0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -523,7 +523,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
 	int ret = 0;
 
 	memcg = mem_cgroup_from_css(memcg_css);
-	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
+	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
 	blkcg = css_to_blkcg(blkcg_css);
 	memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
 	blkcg_cgwb_list = &blkcg->cgwb_list;
@@ -645,7 +645,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 
 		/* see whether the blkcg association has changed */
 		blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
-					     &blkio_cgrp_subsys);
+					     &io_cgrp_subsys);
 		if (unlikely(wb->blkcg_css != blkcg_css ||
 			     !wb_tryget(wb)))
 			wb = NULL;
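[Note] Both mm/backing-dev.c hunks are mechanical fallout of the controller rename: the blkcg css that is effective for a memcg's cgroup is now resolved against the unified "io" controller rather than "blkio". Isolated as a sketch (the helper name example_wb_blkcg_css is hypothetical, and the reference handling and error paths of the real callers are omitted):

/* Resolve the blkcg css that is effective for @memcg_css's cgroup. */
static struct cgroup_subsys_state *
example_wb_blkcg_css(struct cgroup_subsys_state *memcg_css)
{
	return cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
}

As the _get_ in cgroup_get_e_css() suggests, the returned css is referenced, so the callers above stay responsible for dropping that reference once the bdi_writeback association is settled.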
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5cccc127ef81..0a931cdd4f6b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1289,7 +1289,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
 	wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
 	wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
 
-	trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
+	trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
 }
 
 static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
@@ -1683,7 +1683,7 @@ static void balance_dirty_pages(struct address_space *mapping,
			 * do a reset, as it may be a light dirtier.
			 */
			if (pause < min_pause) {
-				trace_balance_dirty_pages(bdi,
+				trace_balance_dirty_pages(wb,
							  sdtc->thresh,
							  sdtc->bg_thresh,
							  sdtc->dirty,
@@ -1712,7 +1712,7 @@ static void balance_dirty_pages(struct address_space *mapping,
		}
 
 pause:
-		trace_balance_dirty_pages(bdi,
+		trace_balance_dirty_pages(wb,
					  sdtc->thresh,
					  sdtc->bg_thresh,
					  sdtc->dirty,