 Documentation/cgroups/blkio-controller.txt  |  24
 Documentation/cgroups/unified-hierarchy.txt |  61
 block/bio.c                                 |   2
 block/blk-cgroup.c                          | 524
 block/blk-core.c                            |   4
 block/blk-throttle.c                        | 505
 block/blk.h                                 |   5
 block/cfq-iosched.c                         | 651
 fs/fs-writeback.c                           | 139
 fs/kernfs/dir.c                             |  23
 include/linux/backing-dev.h                 |  26
 include/linux/blk-cgroup.h                  | 340
 include/linux/cgroup_subsys.h               |   2
 include/linux/kernfs.h                      |   4
 include/trace/events/writeback.h            | 180
 mm/backing-dev.c                            |   4
 mm/page-writeback.c                         |   6
 17 files changed, 1422 insertions(+), 1078 deletions(-)
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 68b6a6a470b0..12686bec37b9 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -201,7 +201,7 @@ Proportional weight policy files
           specifies the number of bytes.
 
 - blkio.io_serviced
-        - Number of IOs completed to/from the disk by the group. These
+        - Number of IOs (bio) issued to the disk by the group. These
           are further divided by the type of operation - read or write, sync
           or async. First two fields specify the major and minor number of the
           device, third field specifies the operation type and the fourth field
@@ -327,18 +327,11 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
           subjected to both the constraints.
 
 - blkio.throttle.io_serviced
-        - Number of IOs (bio) completed to/from the disk by the group (as
-          seen by throttling policy). These are further divided by the type
-          of operation - read or write, sync or async. First two fields specify
-          the major and minor number of the device, third field specifies the
-          operation type and the fourth field specifies the number of IOs.
-
-          blkio.io_serviced does accounting as seen by CFQ and counts are in
-          number of requests (struct request). On the other hand,
-          blkio.throttle.io_serviced counts number of IO in terms of number
-          of bios as seen by throttling policy. These bios can later be
-          merged by elevator and total number of requests completed can be
-          lesser.
+        - Number of IOs (bio) issued to the disk by the group. These
+          are further divided by the type of operation - read or write, sync
+          or async. First two fields specify the major and minor number of the
+          device, third field specifies the operation type and the fourth field
+          specifies the number of IOs.
 
 - blkio.throttle.io_service_bytes
         - Number of bytes transferred to/from the disk by the group. These
@@ -347,11 +340,6 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
           device, third field specifies the operation type and the fourth field
           specifies the number of bytes.
 
-          These numbers should roughly be same as blkio.io_service_bytes as
-          updated by CFQ. The difference between two is that
-          blkio.io_service_bytes will not be updated if CFQ is not operating
-          on request queue.
-
 Common files among various policies
 -----------------------------------
 - blkio.reset_stats
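
For reference, the legacy throttle stat files described above keep their per-device
Read/Write/Sync/Async layout (produced by blkg_prfill_rwstat(), further down in this
series). A hypothetical read could look like the following; the cgroup path, device
number and counts are illustrative, not taken from the patch:

    $ cat /sys/fs/cgroup/blkio/grp1/blkio.throttle.io_serviced
    8:16 Read 4019
    8:16 Write 213
    8:16 Sync 3725
    8:16 Async 507
    8:16 Total 4232
    Total 4232
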
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 1ee9caf29e57..e0975c2cf03d 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -27,7 +27,7 @@ CONTENTS
     5-3-1. Format
     5-3-2. Control Knobs
   5-4. Per-Controller Changes
-    5-4-1. blkio
+    5-4-1. io
     5-4-2. cpuset
     5-4-3. memory
 6. Planned Changes
@@ -203,7 +203,7 @@ other issues. The mapping from nice level to weight isn't obvious or
 universal, and there are various other knobs which simply aren't
 available for tasks.
 
-The blkio controller implicitly creates a hidden leaf node for each
+The io controller implicitly creates a hidden leaf node for each
 cgroup to host the tasks. The hidden leaf has its own copies of all
 the knobs with "leaf_" prefixed. While this allows equivalent control
 over internal tasks, it's with serious drawbacks. It always adds an
@@ -438,9 +438,62 @@ may be specified in any order and not all pairs have to be specified.
 
 5-4. Per-Controller Changes
 
-5-4-1. blkio
+5-4-1. io
 
-- blk-throttle becomes properly hierarchical.
+- blkio is renamed to io. The interface is overhauled anyway. The
+  new name is more in line with the other two major controllers, cpu
+  and memory, and better suited given that it may be used for cgroup
+  writeback without involving block layer.
+
+- Everything including stat is always hierarchical making separate
+  recursive stat files pointless and, as no internal node can have
+  tasks, leaf weights are meaningless. The operation model is
+  simplified and the interface is overhauled accordingly.
+
+  io.stat
+
+        The stat file. The reported stats are from the point where
+        bio's are issued to request_queue. The stats are counted
+        independent of which policies are enabled. Each line in the
+        file follows the following format. More fields may later be
+        added at the end.
+
+          $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wios=$WIOS
+
+  io.weight
+
+        The weight setting, currently only available and effective if
+        cfq-iosched is in use for the target device. The weight is
+        between 1 and 10000 and defaults to 100. The first line
+        always contains the default weight in the following format to
+        use when per-device setting is missing.
+
+          default $WEIGHT
+
+        Subsequent lines list per-device weights of the following
+        format.
+
+          $MAJ:$MIN $WEIGHT
+
+        Writing "$WEIGHT" or "default $WEIGHT" changes the default
+        setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight
+        while "$MAJ:$MIN default" clears it.
+
+        This file is available only on non-root cgroups.
+
+  io.max
+
+        The maximum bandwidth and/or iops setting, only available if
+        blk-throttle is enabled. The file is of the following format.
+
+          $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS
+
+        ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
+        read/write IOs per second. "max" indicates no limit. Writing
+        to the file follows the same format but the individual
+        settings may be omitted or specified in any order.
+
+        This file is available only on non-root cgroups.
 
 
 5-4-2. cpuset
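
Putting the interface documented above together, a minimal usage sketch on the
unified hierarchy might look as follows; the mount point, cgroup name, device
number 8:16 and the limit values are assumptions for illustration only:

    $ cd /sys/fs/cgroup/parent/child
    $ echo "default 300" > io.weight                # change the default weight
    $ echo "8:16 500" > io.weight                   # per-device weight for 8:16
    $ echo "8:16 rbps=2097152 wiops=120" > io.max   # 2MB/s reads, 120 write iops
    $ echo "8:16 wiops=max" > io.max                # later, lift only the wiops limit
    $ cat io.stat
    8:16 rbytes=1459200 wbytes=314572800 rios=192 wios=353
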
diff --git a/block/bio.c b/block/bio.c
index 515b5434fe2d..ad3f276d74bc 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1990,7 +1990,7 @@ int bio_associate_current(struct bio *bio)
 
         get_io_context_active(ioc);
         bio->bi_ioc = ioc;
-        bio->bi_css = task_get_css(current, blkio_cgrp_id);
+        bio->bi_css = task_get_css(current, io_cgrp_id);
         return 0;
 }
 EXPORT_SYMBOL_GPL(bio_associate_current);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d6283b3f5db5..ac8370cb2515 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/genhd.h> | 24 | #include <linux/genhd.h> |
25 | #include <linux/delay.h> | 25 | #include <linux/delay.h> |
26 | #include <linux/atomic.h> | 26 | #include <linux/atomic.h> |
27 | #include <linux/ctype.h> | ||
27 | #include <linux/blk-cgroup.h> | 28 | #include <linux/blk-cgroup.h> |
28 | #include "blk.h" | 29 | #include "blk.h" |
29 | 30 | ||
@@ -68,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg) | |||
68 | return; | 69 | return; |
69 | 70 | ||
70 | for (i = 0; i < BLKCG_MAX_POLS; i++) | 71 | for (i = 0; i < BLKCG_MAX_POLS; i++) |
71 | kfree(blkg->pd[i]); | 72 | if (blkg->pd[i]) |
73 | blkcg_policy[i]->pd_free_fn(blkg->pd[i]); | ||
72 | 74 | ||
73 | blk_exit_rl(&blkg->rl); | 75 | if (blkg->blkcg != &blkcg_root) |
76 | blk_exit_rl(&blkg->rl); | ||
77 | |||
78 | blkg_rwstat_exit(&blkg->stat_ios); | ||
79 | blkg_rwstat_exit(&blkg->stat_bytes); | ||
74 | kfree(blkg); | 80 | kfree(blkg); |
75 | } | 81 | } |
76 | 82 | ||
@@ -93,6 +99,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, | |||
93 | if (!blkg) | 99 | if (!blkg) |
94 | return NULL; | 100 | return NULL; |
95 | 101 | ||
102 | if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || | ||
103 | blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) | ||
104 | goto err_free; | ||
105 | |||
96 | blkg->q = q; | 106 | blkg->q = q; |
97 | INIT_LIST_HEAD(&blkg->q_node); | 107 | INIT_LIST_HEAD(&blkg->q_node); |
98 | blkg->blkcg = blkcg; | 108 | blkg->blkcg = blkcg; |
@@ -113,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, | |||
113 | continue; | 123 | continue; |
114 | 124 | ||
115 | /* alloc per-policy data and attach it to blkg */ | 125 | /* alloc per-policy data and attach it to blkg */ |
116 | pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); | 126 | pd = pol->pd_alloc_fn(gfp_mask, q->node); |
117 | if (!pd) | 127 | if (!pd) |
118 | goto err_free; | 128 | goto err_free; |
119 | 129 | ||
@@ -129,26 +139,11 @@ err_free: | |||
129 | return NULL; | 139 | return NULL; |
130 | } | 140 | } |
131 | 141 | ||
132 | /** | 142 | struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, |
133 | * __blkg_lookup - internal version of blkg_lookup() | 143 | struct request_queue *q, bool update_hint) |
134 | * @blkcg: blkcg of interest | ||
135 | * @q: request_queue of interest | ||
136 | * @update_hint: whether to update lookup hint with the result or not | ||
137 | * | ||
138 | * This is internal version and shouldn't be used by policy | ||
139 | * implementations. Looks up blkgs for the @blkcg - @q pair regardless of | ||
140 | * @q's bypass state. If @update_hint is %true, the caller should be | ||
141 | * holding @q->queue_lock and lookup hint is updated on success. | ||
142 | */ | ||
143 | struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, | ||
144 | bool update_hint) | ||
145 | { | 144 | { |
146 | struct blkcg_gq *blkg; | 145 | struct blkcg_gq *blkg; |
147 | 146 | ||
148 | blkg = rcu_dereference(blkcg->blkg_hint); | ||
149 | if (blkg && blkg->q == q) | ||
150 | return blkg; | ||
151 | |||
152 | /* | 147 | /* |
153 | * Hint didn't match. Look up from the radix tree. Note that the | 148 | * Hint didn't match. Look up from the radix tree. Note that the |
154 | * hint can only be updated under queue_lock as otherwise @blkg | 149 | * hint can only be updated under queue_lock as otherwise @blkg |
@@ -166,29 +161,11 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, | |||
166 | 161 | ||
167 | return NULL; | 162 | return NULL; |
168 | } | 163 | } |
169 | 164 | EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); | |
170 | /** | ||
171 | * blkg_lookup - lookup blkg for the specified blkcg - q pair | ||
172 | * @blkcg: blkcg of interest | ||
173 | * @q: request_queue of interest | ||
174 | * | ||
175 | * Lookup blkg for the @blkcg - @q pair. This function should be called | ||
176 | * under RCU read lock and is guaranteed to return %NULL if @q is bypassing | ||
177 | * - see blk_queue_bypass_start() for details. | ||
178 | */ | ||
179 | struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) | ||
180 | { | ||
181 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
182 | |||
183 | if (unlikely(blk_queue_bypass(q))) | ||
184 | return NULL; | ||
185 | return __blkg_lookup(blkcg, q, false); | ||
186 | } | ||
187 | EXPORT_SYMBOL_GPL(blkg_lookup); | ||
188 | 165 | ||
189 | /* | 166 | /* |
190 | * If @new_blkg is %NULL, this function tries to allocate a new one as | 167 | * If @new_blkg is %NULL, this function tries to allocate a new one as |
191 | * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. | 168 | * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. |
192 | */ | 169 | */ |
193 | static struct blkcg_gq *blkg_create(struct blkcg *blkcg, | 170 | static struct blkcg_gq *blkg_create(struct blkcg *blkcg, |
194 | struct request_queue *q, | 171 | struct request_queue *q, |
@@ -203,12 +180,12 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, | |||
203 | 180 | ||
204 | /* blkg holds a reference to blkcg */ | 181 | /* blkg holds a reference to blkcg */ |
205 | if (!css_tryget_online(&blkcg->css)) { | 182 | if (!css_tryget_online(&blkcg->css)) { |
206 | ret = -EINVAL; | 183 | ret = -ENODEV; |
207 | goto err_free_blkg; | 184 | goto err_free_blkg; |
208 | } | 185 | } |
209 | 186 | ||
210 | wb_congested = wb_congested_get_create(&q->backing_dev_info, | 187 | wb_congested = wb_congested_get_create(&q->backing_dev_info, |
211 | blkcg->css.id, GFP_ATOMIC); | 188 | blkcg->css.id, GFP_NOWAIT); |
212 | if (!wb_congested) { | 189 | if (!wb_congested) { |
213 | ret = -ENOMEM; | 190 | ret = -ENOMEM; |
214 | goto err_put_css; | 191 | goto err_put_css; |
@@ -216,7 +193,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, | |||
216 | 193 | ||
217 | /* allocate */ | 194 | /* allocate */ |
218 | if (!new_blkg) { | 195 | if (!new_blkg) { |
219 | new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); | 196 | new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT); |
220 | if (unlikely(!new_blkg)) { | 197 | if (unlikely(!new_blkg)) { |
221 | ret = -ENOMEM; | 198 | ret = -ENOMEM; |
222 | goto err_put_congested; | 199 | goto err_put_congested; |
@@ -229,7 +206,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, | |||
229 | if (blkcg_parent(blkcg)) { | 206 | if (blkcg_parent(blkcg)) { |
230 | blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); | 207 | blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); |
231 | if (WARN_ON_ONCE(!blkg->parent)) { | 208 | if (WARN_ON_ONCE(!blkg->parent)) { |
232 | ret = -EINVAL; | 209 | ret = -ENODEV; |
233 | goto err_put_congested; | 210 | goto err_put_congested; |
234 | } | 211 | } |
235 | blkg_get(blkg->parent); | 212 | blkg_get(blkg->parent); |
@@ -240,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, | |||
240 | struct blkcg_policy *pol = blkcg_policy[i]; | 217 | struct blkcg_policy *pol = blkcg_policy[i]; |
241 | 218 | ||
242 | if (blkg->pd[i] && pol->pd_init_fn) | 219 | if (blkg->pd[i] && pol->pd_init_fn) |
243 | pol->pd_init_fn(blkg); | 220 | pol->pd_init_fn(blkg->pd[i]); |
244 | } | 221 | } |
245 | 222 | ||
246 | /* insert */ | 223 | /* insert */ |
@@ -254,7 +231,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, | |||
254 | struct blkcg_policy *pol = blkcg_policy[i]; | 231 | struct blkcg_policy *pol = blkcg_policy[i]; |
255 | 232 | ||
256 | if (blkg->pd[i] && pol->pd_online_fn) | 233 | if (blkg->pd[i] && pol->pd_online_fn) |
257 | pol->pd_online_fn(blkg); | 234 | pol->pd_online_fn(blkg->pd[i]); |
258 | } | 235 | } |
259 | } | 236 | } |
260 | blkg->online = true; | 237 | blkg->online = true; |
@@ -303,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, | |||
303 | * we shouldn't allow anything to go through for a bypassing queue. | 280 | * we shouldn't allow anything to go through for a bypassing queue. |
304 | */ | 281 | */ |
305 | if (unlikely(blk_queue_bypass(q))) | 282 | if (unlikely(blk_queue_bypass(q))) |
306 | return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); | 283 | return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); |
307 | 284 | ||
308 | blkg = __blkg_lookup(blkcg, q, true); | 285 | blkg = __blkg_lookup(blkcg, q, true); |
309 | if (blkg) | 286 | if (blkg) |
@@ -327,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, | |||
327 | return blkg; | 304 | return blkg; |
328 | } | 305 | } |
329 | } | 306 | } |
330 | EXPORT_SYMBOL_GPL(blkg_lookup_create); | ||
331 | 307 | ||
332 | static void blkg_destroy(struct blkcg_gq *blkg) | 308 | static void blkg_destroy(struct blkcg_gq *blkg) |
333 | { | 309 | { |
334 | struct blkcg *blkcg = blkg->blkcg; | 310 | struct blkcg *blkcg = blkg->blkcg; |
311 | struct blkcg_gq *parent = blkg->parent; | ||
335 | int i; | 312 | int i; |
336 | 313 | ||
337 | lockdep_assert_held(blkg->q->queue_lock); | 314 | lockdep_assert_held(blkg->q->queue_lock); |
@@ -345,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg) | |||
345 | struct blkcg_policy *pol = blkcg_policy[i]; | 322 | struct blkcg_policy *pol = blkcg_policy[i]; |
346 | 323 | ||
347 | if (blkg->pd[i] && pol->pd_offline_fn) | 324 | if (blkg->pd[i] && pol->pd_offline_fn) |
348 | pol->pd_offline_fn(blkg); | 325 | pol->pd_offline_fn(blkg->pd[i]); |
326 | } | ||
327 | |||
328 | if (parent) { | ||
329 | blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); | ||
330 | blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); | ||
349 | } | 331 | } |
332 | |||
350 | blkg->online = false; | 333 | blkg->online = false; |
351 | 334 | ||
352 | radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); | 335 | radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); |
@@ -400,15 +383,6 @@ static void blkg_destroy_all(struct request_queue *q) | |||
400 | void __blkg_release_rcu(struct rcu_head *rcu_head) | 383 | void __blkg_release_rcu(struct rcu_head *rcu_head) |
401 | { | 384 | { |
402 | struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); | 385 | struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); |
403 | int i; | ||
404 | |||
405 | /* tell policies that this one is being freed */ | ||
406 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | ||
407 | struct blkcg_policy *pol = blkcg_policy[i]; | ||
408 | |||
409 | if (blkg->pd[i] && pol->pd_exit_fn) | ||
410 | pol->pd_exit_fn(blkg); | ||
411 | } | ||
412 | 386 | ||
413 | /* release the blkcg and parent blkg refs this blkg has been holding */ | 387 | /* release the blkcg and parent blkg refs this blkg has been holding */ |
414 | css_put(&blkg->blkcg->css); | 388 | css_put(&blkg->blkcg->css); |
@@ -472,12 +446,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, | |||
472 | * anyway. If you get hit by a race, retry. | 446 | * anyway. If you get hit by a race, retry. |
473 | */ | 447 | */ |
474 | hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { | 448 | hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { |
449 | blkg_rwstat_reset(&blkg->stat_bytes); | ||
450 | blkg_rwstat_reset(&blkg->stat_ios); | ||
451 | |||
475 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | 452 | for (i = 0; i < BLKCG_MAX_POLS; i++) { |
476 | struct blkcg_policy *pol = blkcg_policy[i]; | 453 | struct blkcg_policy *pol = blkcg_policy[i]; |
477 | 454 | ||
478 | if (blkcg_policy_enabled(blkg->q, pol) && | 455 | if (blkg->pd[i] && pol->pd_reset_stats_fn) |
479 | pol->pd_reset_stats_fn) | 456 | pol->pd_reset_stats_fn(blkg->pd[i]); |
480 | pol->pd_reset_stats_fn(blkg); | ||
481 | } | 457 | } |
482 | } | 458 | } |
483 | 459 | ||
@@ -486,13 +462,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, | |||
486 | return 0; | 462 | return 0; |
487 | } | 463 | } |
488 | 464 | ||
489 | static const char *blkg_dev_name(struct blkcg_gq *blkg) | 465 | const char *blkg_dev_name(struct blkcg_gq *blkg) |
490 | { | 466 | { |
491 | /* some drivers (floppy) instantiate a queue w/o disk registered */ | 467 | /* some drivers (floppy) instantiate a queue w/o disk registered */ |
492 | if (blkg->q->backing_dev_info.dev) | 468 | if (blkg->q->backing_dev_info.dev) |
493 | return dev_name(blkg->q->backing_dev_info.dev); | 469 | return dev_name(blkg->q->backing_dev_info.dev); |
494 | return NULL; | 470 | return NULL; |
495 | } | 471 | } |
472 | EXPORT_SYMBOL_GPL(blkg_dev_name); | ||
496 | 473 | ||
497 | /** | 474 | /** |
498 | * blkcg_print_blkgs - helper for printing per-blkg data | 475 | * blkcg_print_blkgs - helper for printing per-blkg data |
@@ -581,9 +558,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | |||
581 | 558 | ||
582 | for (i = 0; i < BLKG_RWSTAT_NR; i++) | 559 | for (i = 0; i < BLKG_RWSTAT_NR; i++) |
583 | seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], | 560 | seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], |
584 | (unsigned long long)rwstat->cnt[i]); | 561 | (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); |
585 | 562 | ||
586 | v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; | 563 | v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + |
564 | atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); | ||
587 | seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); | 565 | seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); |
588 | return v; | 566 | return v; |
589 | } | 567 | } |
@@ -620,31 +598,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | |||
620 | } | 598 | } |
621 | EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); | 599 | EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); |
622 | 600 | ||
601 | static u64 blkg_prfill_rwstat_field(struct seq_file *sf, | ||
602 | struct blkg_policy_data *pd, int off) | ||
603 | { | ||
604 | struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off); | ||
605 | |||
606 | return __blkg_prfill_rwstat(sf, pd, &rwstat); | ||
607 | } | ||
608 | |||
609 | /** | ||
610 | * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes | ||
611 | * @sf: seq_file to print to | ||
612 | * @v: unused | ||
613 | * | ||
614 | * To be used as cftype->seq_show to print blkg->stat_bytes. | ||
615 | * cftype->private must be set to the blkcg_policy. | ||
616 | */ | ||
617 | int blkg_print_stat_bytes(struct seq_file *sf, void *v) | ||
618 | { | ||
619 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
620 | blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, | ||
621 | offsetof(struct blkcg_gq, stat_bytes), true); | ||
622 | return 0; | ||
623 | } | ||
624 | EXPORT_SYMBOL_GPL(blkg_print_stat_bytes); | ||
625 | |||
626 | /** | ||
627 | * blkg_print_stat_ios - seq_show callback for blkg->stat_ios | ||
628 | * @sf: seq_file to print to | ||
629 | * @v: unused | ||
630 | * | ||
631 | * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private | ||
632 | * must be set to the blkcg_policy. | ||
633 | */ | ||
634 | int blkg_print_stat_ios(struct seq_file *sf, void *v) | ||
635 | { | ||
636 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
637 | blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, | ||
638 | offsetof(struct blkcg_gq, stat_ios), true); | ||
639 | return 0; | ||
640 | } | ||
641 | EXPORT_SYMBOL_GPL(blkg_print_stat_ios); | ||
642 | |||
643 | static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, | ||
644 | struct blkg_policy_data *pd, | ||
645 | int off) | ||
646 | { | ||
647 | struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg, | ||
648 | NULL, off); | ||
649 | return __blkg_prfill_rwstat(sf, pd, &rwstat); | ||
650 | } | ||
651 | |||
652 | /** | ||
653 | * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes | ||
654 | * @sf: seq_file to print to | ||
655 | * @v: unused | ||
656 | */ | ||
657 | int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v) | ||
658 | { | ||
659 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
660 | blkg_prfill_rwstat_field_recursive, | ||
661 | (void *)seq_cft(sf)->private, | ||
662 | offsetof(struct blkcg_gq, stat_bytes), true); | ||
663 | return 0; | ||
664 | } | ||
665 | EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive); | ||
666 | |||
667 | /** | ||
668 | * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios | ||
669 | * @sf: seq_file to print to | ||
670 | * @v: unused | ||
671 | */ | ||
672 | int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) | ||
673 | { | ||
674 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
675 | blkg_prfill_rwstat_field_recursive, | ||
676 | (void *)seq_cft(sf)->private, | ||
677 | offsetof(struct blkcg_gq, stat_ios), true); | ||
678 | return 0; | ||
679 | } | ||
680 | EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); | ||
681 | |||
623 | /** | 682 | /** |
624 | * blkg_stat_recursive_sum - collect hierarchical blkg_stat | 683 | * blkg_stat_recursive_sum - collect hierarchical blkg_stat |
625 | * @pd: policy private data of interest | 684 | * @blkg: blkg of interest |
626 | * @off: offset to the blkg_stat in @pd | 685 | * @pol: blkcg_policy which contains the blkg_stat |
686 | * @off: offset to the blkg_stat in blkg_policy_data or @blkg | ||
687 | * | ||
688 | * Collect the blkg_stat specified by @blkg, @pol and @off and all its | ||
689 | * online descendants and their aux counts. The caller must be holding the | ||
690 | * queue lock for online tests. | ||
627 | * | 691 | * |
628 | * Collect the blkg_stat specified by @off from @pd and all its online | 692 | * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is |
629 | * descendants and return the sum. The caller must be holding the queue | 693 | * at @off bytes into @blkg's blkg_policy_data of the policy. |
630 | * lock for online tests. | ||
631 | */ | 694 | */ |
632 | u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) | 695 | u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, |
696 | struct blkcg_policy *pol, int off) | ||
633 | { | 697 | { |
634 | struct blkcg_policy *pol = blkcg_policy[pd->plid]; | ||
635 | struct blkcg_gq *pos_blkg; | 698 | struct blkcg_gq *pos_blkg; |
636 | struct cgroup_subsys_state *pos_css; | 699 | struct cgroup_subsys_state *pos_css; |
637 | u64 sum = 0; | 700 | u64 sum = 0; |
638 | 701 | ||
639 | lockdep_assert_held(pd->blkg->q->queue_lock); | 702 | lockdep_assert_held(blkg->q->queue_lock); |
640 | 703 | ||
641 | rcu_read_lock(); | 704 | rcu_read_lock(); |
642 | blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { | 705 | blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { |
643 | struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); | 706 | struct blkg_stat *stat; |
644 | struct blkg_stat *stat = (void *)pos_pd + off; | 707 | |
708 | if (!pos_blkg->online) | ||
709 | continue; | ||
710 | |||
711 | if (pol) | ||
712 | stat = (void *)blkg_to_pd(pos_blkg, pol) + off; | ||
713 | else | ||
714 | stat = (void *)blkg + off; | ||
645 | 715 | ||
646 | if (pos_blkg->online) | 716 | sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt); |
647 | sum += blkg_stat_read(stat); | ||
648 | } | 717 | } |
649 | rcu_read_unlock(); | 718 | rcu_read_unlock(); |
650 | 719 | ||
@@ -654,37 +723,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); | |||
654 | 723 | ||
655 | /** | 724 | /** |
656 | * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat | 725 | * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat |
657 | * @pd: policy private data of interest | 726 | * @blkg: blkg of interest |
658 | * @off: offset to the blkg_stat in @pd | 727 | * @pol: blkcg_policy which contains the blkg_rwstat |
728 | * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg | ||
729 | * | ||
730 | * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its | ||
731 | * online descendants and their aux counts. The caller must be holding the | ||
732 | * queue lock for online tests. | ||
659 | * | 733 | * |
660 | * Collect the blkg_rwstat specified by @off from @pd and all its online | 734 | * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it |
661 | * descendants and return the sum. The caller must be holding the queue | 735 | * is at @off bytes into @blkg's blkg_policy_data of the policy. |
662 | * lock for online tests. | ||
663 | */ | 736 | */ |
664 | struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, | 737 | struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, |
665 | int off) | 738 | struct blkcg_policy *pol, int off) |
666 | { | 739 | { |
667 | struct blkcg_policy *pol = blkcg_policy[pd->plid]; | ||
668 | struct blkcg_gq *pos_blkg; | 740 | struct blkcg_gq *pos_blkg; |
669 | struct cgroup_subsys_state *pos_css; | 741 | struct cgroup_subsys_state *pos_css; |
670 | struct blkg_rwstat sum = { }; | 742 | struct blkg_rwstat sum = { }; |
671 | int i; | 743 | int i; |
672 | 744 | ||
673 | lockdep_assert_held(pd->blkg->q->queue_lock); | 745 | lockdep_assert_held(blkg->q->queue_lock); |
674 | 746 | ||
675 | rcu_read_lock(); | 747 | rcu_read_lock(); |
676 | blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { | 748 | blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { |
677 | struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); | 749 | struct blkg_rwstat *rwstat; |
678 | struct blkg_rwstat *rwstat = (void *)pos_pd + off; | ||
679 | struct blkg_rwstat tmp; | ||
680 | 750 | ||
681 | if (!pos_blkg->online) | 751 | if (!pos_blkg->online) |
682 | continue; | 752 | continue; |
683 | 753 | ||
684 | tmp = blkg_rwstat_read(rwstat); | 754 | if (pol) |
755 | rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; | ||
756 | else | ||
757 | rwstat = (void *)pos_blkg + off; | ||
685 | 758 | ||
686 | for (i = 0; i < BLKG_RWSTAT_NR; i++) | 759 | for (i = 0; i < BLKG_RWSTAT_NR; i++) |
687 | sum.cnt[i] += tmp.cnt[i]; | 760 | atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) + |
761 | percpu_counter_sum_positive(&rwstat->cpu_cnt[i]), | ||
762 | &sum.aux_cnt[i]); | ||
688 | } | 763 | } |
689 | rcu_read_unlock(); | 764 | rcu_read_unlock(); |
690 | 765 | ||
@@ -700,29 +775,34 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); | |||
700 | * @ctx: blkg_conf_ctx to be filled | 775 | * @ctx: blkg_conf_ctx to be filled |
701 | * | 776 | * |
702 | * Parse per-blkg config update from @input and initialize @ctx with the | 777 | * Parse per-blkg config update from @input and initialize @ctx with the |
703 | * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new | 778 | * result. @ctx->blkg points to the blkg to be updated and @ctx->body the |
704 | * value. This function returns with RCU read lock and queue lock held and | 779 | * part of @input following MAJ:MIN. This function returns with RCU read |
705 | * must be paired with blkg_conf_finish(). | 780 | * lock and queue lock held and must be paired with blkg_conf_finish(). |
706 | */ | 781 | */ |
707 | int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, | 782 | int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, |
708 | const char *input, struct blkg_conf_ctx *ctx) | 783 | char *input, struct blkg_conf_ctx *ctx) |
709 | __acquires(rcu) __acquires(disk->queue->queue_lock) | 784 | __acquires(rcu) __acquires(disk->queue->queue_lock) |
710 | { | 785 | { |
711 | struct gendisk *disk; | 786 | struct gendisk *disk; |
712 | struct blkcg_gq *blkg; | 787 | struct blkcg_gq *blkg; |
713 | unsigned int major, minor; | 788 | unsigned int major, minor; |
714 | unsigned long long v; | 789 | int key_len, part, ret; |
715 | int part, ret; | 790 | char *body; |
716 | 791 | ||
717 | if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) | 792 | if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) |
718 | return -EINVAL; | 793 | return -EINVAL; |
719 | 794 | ||
795 | body = input + key_len; | ||
796 | if (!isspace(*body)) | ||
797 | return -EINVAL; | ||
798 | body = skip_spaces(body); | ||
799 | |||
720 | disk = get_gendisk(MKDEV(major, minor), &part); | 800 | disk = get_gendisk(MKDEV(major, minor), &part); |
721 | if (!disk) | 801 | if (!disk) |
722 | return -EINVAL; | 802 | return -ENODEV; |
723 | if (part) { | 803 | if (part) { |
724 | put_disk(disk); | 804 | put_disk(disk); |
725 | return -EINVAL; | 805 | return -ENODEV; |
726 | } | 806 | } |
727 | 807 | ||
728 | rcu_read_lock(); | 808 | rcu_read_lock(); |
@@ -731,7 +811,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, | |||
731 | if (blkcg_policy_enabled(disk->queue, pol)) | 811 | if (blkcg_policy_enabled(disk->queue, pol)) |
732 | blkg = blkg_lookup_create(blkcg, disk->queue); | 812 | blkg = blkg_lookup_create(blkcg, disk->queue); |
733 | else | 813 | else |
734 | blkg = ERR_PTR(-EINVAL); | 814 | blkg = ERR_PTR(-EOPNOTSUPP); |
735 | 815 | ||
736 | if (IS_ERR(blkg)) { | 816 | if (IS_ERR(blkg)) { |
737 | ret = PTR_ERR(blkg); | 817 | ret = PTR_ERR(blkg); |
@@ -753,7 +833,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, | |||
753 | 833 | ||
754 | ctx->disk = disk; | 834 | ctx->disk = disk; |
755 | ctx->blkg = blkg; | 835 | ctx->blkg = blkg; |
756 | ctx->v = v; | 836 | ctx->body = body; |
757 | return 0; | 837 | return 0; |
758 | } | 838 | } |
759 | EXPORT_SYMBOL_GPL(blkg_conf_prep); | 839 | EXPORT_SYMBOL_GPL(blkg_conf_prep); |
@@ -774,8 +854,55 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) | |||
774 | } | 854 | } |
775 | EXPORT_SYMBOL_GPL(blkg_conf_finish); | 855 | EXPORT_SYMBOL_GPL(blkg_conf_finish); |
776 | 856 | ||
857 | static int blkcg_print_stat(struct seq_file *sf, void *v) | ||
858 | { | ||
859 | struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); | ||
860 | struct blkcg_gq *blkg; | ||
861 | |||
862 | rcu_read_lock(); | ||
863 | |||
864 | hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { | ||
865 | const char *dname; | ||
866 | struct blkg_rwstat rwstat; | ||
867 | u64 rbytes, wbytes, rios, wios; | ||
868 | |||
869 | dname = blkg_dev_name(blkg); | ||
870 | if (!dname) | ||
871 | continue; | ||
872 | |||
873 | spin_lock_irq(blkg->q->queue_lock); | ||
874 | |||
875 | rwstat = blkg_rwstat_recursive_sum(blkg, NULL, | ||
876 | offsetof(struct blkcg_gq, stat_bytes)); | ||
877 | rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); | ||
878 | wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); | ||
879 | |||
880 | rwstat = blkg_rwstat_recursive_sum(blkg, NULL, | ||
881 | offsetof(struct blkcg_gq, stat_ios)); | ||
882 | rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); | ||
883 | wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); | ||
884 | |||
885 | spin_unlock_irq(blkg->q->queue_lock); | ||
886 | |||
887 | if (rbytes || wbytes || rios || wios) | ||
888 | seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", | ||
889 | dname, rbytes, wbytes, rios, wios); | ||
890 | } | ||
891 | |||
892 | rcu_read_unlock(); | ||
893 | return 0; | ||
894 | } | ||
895 | |||
777 | struct cftype blkcg_files[] = { | 896 | struct cftype blkcg_files[] = { |
778 | { | 897 | { |
898 | .name = "stat", | ||
899 | .seq_show = blkcg_print_stat, | ||
900 | }, | ||
901 | { } /* terminate */ | ||
902 | }; | ||
903 | |||
904 | struct cftype blkcg_legacy_files[] = { | ||
905 | { | ||
779 | .name = "reset_stats", | 906 | .name = "reset_stats", |
780 | .write_u64 = blkcg_reset_stats, | 907 | .write_u64 = blkcg_reset_stats, |
781 | }, | 908 | }, |
@@ -822,18 +949,19 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) | |||
822 | static void blkcg_css_free(struct cgroup_subsys_state *css) | 949 | static void blkcg_css_free(struct cgroup_subsys_state *css) |
823 | { | 950 | { |
824 | struct blkcg *blkcg = css_to_blkcg(css); | 951 | struct blkcg *blkcg = css_to_blkcg(css); |
952 | int i; | ||
825 | 953 | ||
826 | mutex_lock(&blkcg_pol_mutex); | 954 | mutex_lock(&blkcg_pol_mutex); |
955 | |||
827 | list_del(&blkcg->all_blkcgs_node); | 956 | list_del(&blkcg->all_blkcgs_node); |
828 | mutex_unlock(&blkcg_pol_mutex); | ||
829 | 957 | ||
830 | if (blkcg != &blkcg_root) { | 958 | for (i = 0; i < BLKCG_MAX_POLS; i++) |
831 | int i; | 959 | if (blkcg->cpd[i]) |
960 | blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); | ||
832 | 961 | ||
833 | for (i = 0; i < BLKCG_MAX_POLS; i++) | 962 | mutex_unlock(&blkcg_pol_mutex); |
834 | kfree(blkcg->pd[i]); | 963 | |
835 | kfree(blkcg); | 964 | kfree(blkcg); |
836 | } | ||
837 | } | 965 | } |
838 | 966 | ||
839 | static struct cgroup_subsys_state * | 967 | static struct cgroup_subsys_state * |
@@ -847,13 +975,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) | |||
847 | 975 | ||
848 | if (!parent_css) { | 976 | if (!parent_css) { |
849 | blkcg = &blkcg_root; | 977 | blkcg = &blkcg_root; |
850 | goto done; | 978 | } else { |
851 | } | 979 | blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); |
852 | 980 | if (!blkcg) { | |
853 | blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); | 981 | ret = ERR_PTR(-ENOMEM); |
854 | if (!blkcg) { | 982 | goto free_blkcg; |
855 | ret = ERR_PTR(-ENOMEM); | 983 | } |
856 | goto free_blkcg; | ||
857 | } | 984 | } |
858 | 985 | ||
859 | for (i = 0; i < BLKCG_MAX_POLS ; i++) { | 986 | for (i = 0; i < BLKCG_MAX_POLS ; i++) { |
@@ -866,23 +993,23 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) | |||
866 | * check if the policy requires any specific per-cgroup | 993 | * check if the policy requires any specific per-cgroup |
867 | * data: if it does, allocate and initialize it. | 994 | * data: if it does, allocate and initialize it. |
868 | */ | 995 | */ |
869 | if (!pol || !pol->cpd_size) | 996 | if (!pol || !pol->cpd_alloc_fn) |
870 | continue; | 997 | continue; |
871 | 998 | ||
872 | BUG_ON(blkcg->pd[i]); | 999 | cpd = pol->cpd_alloc_fn(GFP_KERNEL); |
873 | cpd = kzalloc(pol->cpd_size, GFP_KERNEL); | ||
874 | if (!cpd) { | 1000 | if (!cpd) { |
875 | ret = ERR_PTR(-ENOMEM); | 1001 | ret = ERR_PTR(-ENOMEM); |
876 | goto free_pd_blkcg; | 1002 | goto free_pd_blkcg; |
877 | } | 1003 | } |
878 | blkcg->pd[i] = cpd; | 1004 | blkcg->cpd[i] = cpd; |
1005 | cpd->blkcg = blkcg; | ||
879 | cpd->plid = i; | 1006 | cpd->plid = i; |
880 | pol->cpd_init_fn(blkcg); | 1007 | if (pol->cpd_init_fn) |
1008 | pol->cpd_init_fn(cpd); | ||
881 | } | 1009 | } |
882 | 1010 | ||
883 | done: | ||
884 | spin_lock_init(&blkcg->lock); | 1011 | spin_lock_init(&blkcg->lock); |
885 | INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); | 1012 | INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); |
886 | INIT_HLIST_HEAD(&blkcg->blkg_list); | 1013 | INIT_HLIST_HEAD(&blkcg->blkg_list); |
887 | #ifdef CONFIG_CGROUP_WRITEBACK | 1014 | #ifdef CONFIG_CGROUP_WRITEBACK |
888 | INIT_LIST_HEAD(&blkcg->cgwb_list); | 1015 | INIT_LIST_HEAD(&blkcg->cgwb_list); |
@@ -894,7 +1021,8 @@ done: | |||
894 | 1021 | ||
895 | free_pd_blkcg: | 1022 | free_pd_blkcg: |
896 | for (i--; i >= 0; i--) | 1023 | for (i--; i >= 0; i--) |
897 | kfree(blkcg->pd[i]); | 1024 | if (blkcg->cpd[i]) |
1025 | blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); | ||
898 | free_blkcg: | 1026 | free_blkcg: |
899 | kfree(blkcg); | 1027 | kfree(blkcg); |
900 | mutex_unlock(&blkcg_pol_mutex); | 1028 | mutex_unlock(&blkcg_pol_mutex); |
@@ -938,7 +1066,7 @@ int blkcg_init_queue(struct request_queue *q) | |||
938 | radix_tree_preload_end(); | 1066 | radix_tree_preload_end(); |
939 | 1067 | ||
940 | if (IS_ERR(blkg)) { | 1068 | if (IS_ERR(blkg)) { |
941 | kfree(new_blkg); | 1069 | blkg_free(new_blkg); |
942 | return PTR_ERR(blkg); | 1070 | return PTR_ERR(blkg); |
943 | } | 1071 | } |
944 | 1072 | ||
@@ -1015,12 +1143,35 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, | |||
1015 | return ret; | 1143 | return ret; |
1016 | } | 1144 | } |
1017 | 1145 | ||
1018 | struct cgroup_subsys blkio_cgrp_subsys = { | 1146 | static void blkcg_bind(struct cgroup_subsys_state *root_css) |
1147 | { | ||
1148 | int i; | ||
1149 | |||
1150 | mutex_lock(&blkcg_pol_mutex); | ||
1151 | |||
1152 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | ||
1153 | struct blkcg_policy *pol = blkcg_policy[i]; | ||
1154 | struct blkcg *blkcg; | ||
1155 | |||
1156 | if (!pol || !pol->cpd_bind_fn) | ||
1157 | continue; | ||
1158 | |||
1159 | list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) | ||
1160 | if (blkcg->cpd[pol->plid]) | ||
1161 | pol->cpd_bind_fn(blkcg->cpd[pol->plid]); | ||
1162 | } | ||
1163 | mutex_unlock(&blkcg_pol_mutex); | ||
1164 | } | ||
1165 | |||
1166 | struct cgroup_subsys io_cgrp_subsys = { | ||
1019 | .css_alloc = blkcg_css_alloc, | 1167 | .css_alloc = blkcg_css_alloc, |
1020 | .css_offline = blkcg_css_offline, | 1168 | .css_offline = blkcg_css_offline, |
1021 | .css_free = blkcg_css_free, | 1169 | .css_free = blkcg_css_free, |
1022 | .can_attach = blkcg_can_attach, | 1170 | .can_attach = blkcg_can_attach, |
1023 | .legacy_cftypes = blkcg_files, | 1171 | .bind = blkcg_bind, |
1172 | .dfl_cftypes = blkcg_files, | ||
1173 | .legacy_cftypes = blkcg_legacy_files, | ||
1174 | .legacy_name = "blkio", | ||
1024 | #ifdef CONFIG_MEMCG | 1175 | #ifdef CONFIG_MEMCG |
1025 | /* | 1176 | /* |
1026 | * This ensures that, if available, memcg is automatically enabled | 1177 | * This ensures that, if available, memcg is automatically enabled |
@@ -1030,7 +1181,7 @@ struct cgroup_subsys blkio_cgrp_subsys = { | |||
1030 | .depends_on = 1 << memory_cgrp_id, | 1181 | .depends_on = 1 << memory_cgrp_id, |
1031 | #endif | 1182 | #endif |
1032 | }; | 1183 | }; |
1033 | EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); | 1184 | EXPORT_SYMBOL_GPL(io_cgrp_subsys); |
1034 | 1185 | ||
1035 | /** | 1186 | /** |
1036 | * blkcg_activate_policy - activate a blkcg policy on a request_queue | 1187 | * blkcg_activate_policy - activate a blkcg policy on a request_queue |
@@ -1051,65 +1202,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); | |||
1051 | int blkcg_activate_policy(struct request_queue *q, | 1202 | int blkcg_activate_policy(struct request_queue *q, |
1052 | const struct blkcg_policy *pol) | 1203 | const struct blkcg_policy *pol) |
1053 | { | 1204 | { |
1054 | LIST_HEAD(pds); | 1205 | struct blkg_policy_data *pd_prealloc = NULL; |
1055 | struct blkcg_gq *blkg; | 1206 | struct blkcg_gq *blkg; |
1056 | struct blkg_policy_data *pd, *nd; | 1207 | int ret; |
1057 | int cnt = 0, ret; | ||
1058 | 1208 | ||
1059 | if (blkcg_policy_enabled(q, pol)) | 1209 | if (blkcg_policy_enabled(q, pol)) |
1060 | return 0; | 1210 | return 0; |
1061 | 1211 | ||
1062 | /* count and allocate policy_data for all existing blkgs */ | ||
1063 | blk_queue_bypass_start(q); | 1212 | blk_queue_bypass_start(q); |
1064 | spin_lock_irq(q->queue_lock); | 1213 | pd_prealloc: |
1065 | list_for_each_entry(blkg, &q->blkg_list, q_node) | 1214 | if (!pd_prealloc) { |
1066 | cnt++; | 1215 | pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); |
1067 | spin_unlock_irq(q->queue_lock); | 1216 | if (!pd_prealloc) { |
1068 | |||
1069 | /* allocate per-blkg policy data for all existing blkgs */ | ||
1070 | while (cnt--) { | ||
1071 | pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); | ||
1072 | if (!pd) { | ||
1073 | ret = -ENOMEM; | 1217 | ret = -ENOMEM; |
1074 | goto out_free; | 1218 | goto out_bypass_end; |
1075 | } | 1219 | } |
1076 | list_add_tail(&pd->alloc_node, &pds); | ||
1077 | } | 1220 | } |
1078 | 1221 | ||
1079 | /* | ||
1080 | * Install the allocated pds and cpds. With @q bypassing, no new blkg | ||
1081 | * should have been created while the queue lock was dropped. | ||
1082 | */ | ||
1083 | spin_lock_irq(q->queue_lock); | 1222 | spin_lock_irq(q->queue_lock); |
1084 | 1223 | ||
1085 | list_for_each_entry(blkg, &q->blkg_list, q_node) { | 1224 | list_for_each_entry(blkg, &q->blkg_list, q_node) { |
1086 | if (WARN_ON(list_empty(&pds))) { | 1225 | struct blkg_policy_data *pd; |
1087 | /* umm... this shouldn't happen, just abort */ | ||
1088 | ret = -ENOMEM; | ||
1089 | goto out_unlock; | ||
1090 | } | ||
1091 | pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); | ||
1092 | list_del_init(&pd->alloc_node); | ||
1093 | 1226 | ||
1094 | /* grab blkcg lock too while installing @pd on @blkg */ | 1227 | if (blkg->pd[pol->plid]) |
1095 | spin_lock(&blkg->blkcg->lock); | 1228 | continue; |
1229 | |||
1230 | pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); | ||
1231 | if (!pd) | ||
1232 | swap(pd, pd_prealloc); | ||
1233 | if (!pd) { | ||
1234 | spin_unlock_irq(q->queue_lock); | ||
1235 | goto pd_prealloc; | ||
1236 | } | ||
1096 | 1237 | ||
1097 | blkg->pd[pol->plid] = pd; | 1238 | blkg->pd[pol->plid] = pd; |
1098 | pd->blkg = blkg; | 1239 | pd->blkg = blkg; |
1099 | pd->plid = pol->plid; | 1240 | pd->plid = pol->plid; |
1100 | pol->pd_init_fn(blkg); | 1241 | if (pol->pd_init_fn) |
1101 | 1242 | pol->pd_init_fn(pd); | |
1102 | spin_unlock(&blkg->blkcg->lock); | ||
1103 | } | 1243 | } |
1104 | 1244 | ||
1105 | __set_bit(pol->plid, q->blkcg_pols); | 1245 | __set_bit(pol->plid, q->blkcg_pols); |
1106 | ret = 0; | 1246 | ret = 0; |
1107 | out_unlock: | 1247 | |
1108 | spin_unlock_irq(q->queue_lock); | 1248 | spin_unlock_irq(q->queue_lock); |
1109 | out_free: | 1249 | out_bypass_end: |
1110 | blk_queue_bypass_end(q); | 1250 | blk_queue_bypass_end(q); |
1111 | list_for_each_entry_safe(pd, nd, &pds, alloc_node) | 1251 | if (pd_prealloc) |
1112 | kfree(pd); | 1252 | pol->pd_free_fn(pd_prealloc); |
1113 | return ret; | 1253 | return ret; |
1114 | } | 1254 | } |
1115 | EXPORT_SYMBOL_GPL(blkcg_activate_policy); | 1255 | EXPORT_SYMBOL_GPL(blkcg_activate_policy); |
@@ -1139,13 +1279,12 @@ void blkcg_deactivate_policy(struct request_queue *q, | |||
1139 | /* grab blkcg lock too while removing @pd from @blkg */ | 1279 | /* grab blkcg lock too while removing @pd from @blkg */ |
1140 | spin_lock(&blkg->blkcg->lock); | 1280 | spin_lock(&blkg->blkcg->lock); |
1141 | 1281 | ||
1142 | if (pol->pd_offline_fn) | 1282 | if (blkg->pd[pol->plid]) { |
1143 | pol->pd_offline_fn(blkg); | 1283 | if (pol->pd_offline_fn) |
1144 | if (pol->pd_exit_fn) | 1284 | pol->pd_offline_fn(blkg->pd[pol->plid]); |
1145 | pol->pd_exit_fn(blkg); | 1285 | pol->pd_free_fn(blkg->pd[pol->plid]); |
1146 | 1286 | blkg->pd[pol->plid] = NULL; | |
1147 | kfree(blkg->pd[pol->plid]); | 1287 | } |
1148 | blkg->pd[pol->plid] = NULL; | ||
1149 | 1288 | ||
1150 | spin_unlock(&blkg->blkcg->lock); | 1289 | spin_unlock(&blkg->blkcg->lock); |
1151 | } | 1290 | } |
@@ -1167,9 +1306,6 @@ int blkcg_policy_register(struct blkcg_policy *pol) | |||
1167 | struct blkcg *blkcg; | 1306 | struct blkcg *blkcg; |
1168 | int i, ret; | 1307 | int i, ret; |
1169 | 1308 | ||
1170 | if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) | ||
1171 | return -EINVAL; | ||
1172 | |||
1173 | mutex_lock(&blkcg_pol_register_mutex); | 1309 | mutex_lock(&blkcg_pol_register_mutex); |
1174 | mutex_lock(&blkcg_pol_mutex); | 1310 | mutex_lock(&blkcg_pol_mutex); |
1175 | 1311 | ||
@@ -1186,36 +1322,42 @@ int blkcg_policy_register(struct blkcg_policy *pol) | |||
1186 | blkcg_policy[pol->plid] = pol; | 1322 | blkcg_policy[pol->plid] = pol; |
1187 | 1323 | ||
1188 | /* allocate and install cpd's */ | 1324 | /* allocate and install cpd's */ |
1189 | if (pol->cpd_size) { | 1325 | if (pol->cpd_alloc_fn) { |
1190 | list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { | 1326 | list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { |
1191 | struct blkcg_policy_data *cpd; | 1327 | struct blkcg_policy_data *cpd; |
1192 | 1328 | ||
1193 | cpd = kzalloc(pol->cpd_size, GFP_KERNEL); | 1329 | cpd = pol->cpd_alloc_fn(GFP_KERNEL); |
1194 | if (!cpd) { | 1330 | if (!cpd) { |
1195 | mutex_unlock(&blkcg_pol_mutex); | 1331 | mutex_unlock(&blkcg_pol_mutex); |
1196 | goto err_free_cpds; | 1332 | goto err_free_cpds; |
1197 | } | 1333 | } |
1198 | 1334 | ||
1199 | blkcg->pd[pol->plid] = cpd; | 1335 | blkcg->cpd[pol->plid] = cpd; |
1336 | cpd->blkcg = blkcg; | ||
1200 | cpd->plid = pol->plid; | 1337 | cpd->plid = pol->plid; |
1201 | pol->cpd_init_fn(blkcg); | 1338 | pol->cpd_init_fn(cpd); |
1202 | } | 1339 | } |
1203 | } | 1340 | } |
1204 | 1341 | ||
1205 | mutex_unlock(&blkcg_pol_mutex); | 1342 | mutex_unlock(&blkcg_pol_mutex); |
1206 | 1343 | ||
1207 | /* everything is in place, add intf files for the new policy */ | 1344 | /* everything is in place, add intf files for the new policy */ |
1208 | if (pol->cftypes) | 1345 | if (pol->dfl_cftypes) |
1209 | WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys, | 1346 | WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, |
1210 | pol->cftypes)); | 1347 | pol->dfl_cftypes)); |
1348 | if (pol->legacy_cftypes) | ||
1349 | WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, | ||
1350 | pol->legacy_cftypes)); | ||
1211 | mutex_unlock(&blkcg_pol_register_mutex); | 1351 | mutex_unlock(&blkcg_pol_register_mutex); |
1212 | return 0; | 1352 | return 0; |
1213 | 1353 | ||
1214 | err_free_cpds: | 1354 | err_free_cpds: |
1215 | if (pol->cpd_size) { | 1355 | if (pol->cpd_alloc_fn) { |
1216 | list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { | 1356 | list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { |
1217 | kfree(blkcg->pd[pol->plid]); | 1357 | if (blkcg->cpd[pol->plid]) { |
1218 | blkcg->pd[pol->plid] = NULL; | 1358 | pol->cpd_free_fn(blkcg->cpd[pol->plid]); |
1359 | blkcg->cpd[pol->plid] = NULL; | ||
1360 | } | ||
1219 | } | 1361 | } |
1220 | } | 1362 | } |
1221 | blkcg_policy[pol->plid] = NULL; | 1363 | blkcg_policy[pol->plid] = NULL; |
@@ -1242,16 +1384,20 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) | |||
1242 | goto out_unlock; | 1384 | goto out_unlock; |
1243 | 1385 | ||
1244 | /* kill the intf files first */ | 1386 | /* kill the intf files first */ |
1245 | if (pol->cftypes) | 1387 | if (pol->dfl_cftypes) |
1246 | cgroup_rm_cftypes(pol->cftypes); | 1388 | cgroup_rm_cftypes(pol->dfl_cftypes); |
1389 | if (pol->legacy_cftypes) | ||
1390 | cgroup_rm_cftypes(pol->legacy_cftypes); | ||
1247 | 1391 | ||
1248 | /* remove cpds and unregister */ | 1392 | /* remove cpds and unregister */ |
1249 | mutex_lock(&blkcg_pol_mutex); | 1393 | mutex_lock(&blkcg_pol_mutex); |
1250 | 1394 | ||
1251 | if (pol->cpd_size) { | 1395 | if (pol->cpd_alloc_fn) { |
1252 | list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { | 1396 | list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { |
1253 | kfree(blkcg->pd[pol->plid]); | 1397 | if (blkcg->cpd[pol->plid]) { |
1254 | blkcg->pd[pol->plid] = NULL; | 1398 | pol->cpd_free_fn(blkcg->cpd[pol->plid]); |
1399 | blkcg->cpd[pol->plid] = NULL; | ||
1400 | } | ||
1255 | } | 1401 | } |
1256 | } | 1402 | } |
1257 | blkcg_policy[pol->plid] = NULL; | 1403 | blkcg_policy[pol->plid] = NULL; |
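
One practical consequence of the blkg_conf_prep() rework above: a configuration
write is now split into a MAJ:MIN key and an opaque body that each policy parses
itself, which is what lets the v2 files accept key=value lists while the legacy
files keep their single-number bodies. Roughly, with an assumed device 8:16 and
illustrative values:

    # legacy (v1) interface: body after MAJ:MIN is one number
    $ echo "8:16 1048576" > blkio.throttle.read_bps_device
    # unified (v2) interface: body is a list of key=value pairs
    $ echo "8:16 rbps=1048576 wiops=120" > io.max
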
diff --git a/block/blk-core.c b/block/blk-core.c
index 60912e983f16..2eb722d48773 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1888,8 +1888,8 @@ generic_make_request_checks(struct bio *bio)
          */
         create_io_context(GFP_ATOMIC, q->node);
 
-        if (blk_throtl_bio(q, bio))
-                return false;   /* throttled, will be resubmitted later */
+        if (!blkcg_bio_issue_check(q, bio))
+                return false;
 
         trace_block_bio_queue(q, bio);
         return true;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b23193518ac7..c75a2636dd40 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -83,14 +83,6 @@ enum tg_state_flags { | |||
83 | 83 | ||
84 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) | 84 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) |
85 | 85 | ||
86 | /* Per-cpu group stats */ | ||
87 | struct tg_stats_cpu { | ||
88 | /* total bytes transferred */ | ||
89 | struct blkg_rwstat service_bytes; | ||
90 | /* total IOs serviced, post merge */ | ||
91 | struct blkg_rwstat serviced; | ||
92 | }; | ||
93 | |||
94 | struct throtl_grp { | 86 | struct throtl_grp { |
95 | /* must be the first member */ | 87 | /* must be the first member */ |
96 | struct blkg_policy_data pd; | 88 | struct blkg_policy_data pd; |
@@ -141,12 +133,6 @@ struct throtl_grp { | |||
141 | /* When did we start a new slice */ | 133 | /* When did we start a new slice */ |
142 | unsigned long slice_start[2]; | 134 | unsigned long slice_start[2]; |
143 | unsigned long slice_end[2]; | 135 | unsigned long slice_end[2]; |
144 | |||
145 | /* Per cpu stats pointer */ | ||
146 | struct tg_stats_cpu __percpu *stats_cpu; | ||
147 | |||
148 | /* List of tgs waiting for per cpu stats memory to be allocated */ | ||
149 | struct list_head stats_alloc_node; | ||
150 | }; | 136 | }; |
151 | 137 | ||
152 | struct throtl_data | 138 | struct throtl_data |
@@ -168,13 +154,6 @@ struct throtl_data | |||
168 | struct work_struct dispatch_work; | 154 | struct work_struct dispatch_work; |
169 | }; | 155 | }; |
170 | 156 | ||
171 | /* list and work item to allocate percpu group stats */ | ||
172 | static DEFINE_SPINLOCK(tg_stats_alloc_lock); | ||
173 | static LIST_HEAD(tg_stats_alloc_list); | ||
174 | |||
175 | static void tg_stats_alloc_fn(struct work_struct *); | ||
176 | static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); | ||
177 | |||
178 | static void throtl_pending_timer_fn(unsigned long arg); | 157 | static void throtl_pending_timer_fn(unsigned long arg); |
179 | 158 | ||
180 | static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) | 159 | static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) |
@@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) | |||
192 | return pd_to_blkg(&tg->pd); | 171 | return pd_to_blkg(&tg->pd); |
193 | } | 172 | } |
194 | 173 | ||
195 | static inline struct throtl_grp *td_root_tg(struct throtl_data *td) | ||
196 | { | ||
197 | return blkg_to_tg(td->queue->root_blkg); | ||
198 | } | ||
199 | |||
200 | /** | 174 | /** |
201 | * sq_to_tg - return the throl_grp the specified service queue belongs to | 175 | * sq_to_tg - return the throl_grp the specified service queue belongs to |
202 | * @sq: the throtl_service_queue of interest | 176 | * @sq: the throtl_service_queue of interest |
@@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) | |||
256 | } \ | 230 | } \ |
257 | } while (0) | 231 | } while (0) |
258 | 232 | ||
259 | static void tg_stats_init(struct tg_stats_cpu *tg_stats) | ||
260 | { | ||
261 | blkg_rwstat_init(&tg_stats->service_bytes); | ||
262 | blkg_rwstat_init(&tg_stats->serviced); | ||
263 | } | ||
264 | |||
265 | /* | ||
266 | * Worker for allocating per cpu stat for tgs. This is scheduled on the | ||
267 | * system_wq once there are some groups on the alloc_list waiting for | ||
268 | * allocation. | ||
269 | */ | ||
270 | static void tg_stats_alloc_fn(struct work_struct *work) | ||
271 | { | ||
272 | static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ | ||
273 | struct delayed_work *dwork = to_delayed_work(work); | ||
274 | bool empty = false; | ||
275 | |||
276 | alloc_stats: | ||
277 | if (!stats_cpu) { | ||
278 | int cpu; | ||
279 | |||
280 | stats_cpu = alloc_percpu(struct tg_stats_cpu); | ||
281 | if (!stats_cpu) { | ||
282 | /* allocation failed, try again after some time */ | ||
283 | schedule_delayed_work(dwork, msecs_to_jiffies(10)); | ||
284 | return; | ||
285 | } | ||
286 | for_each_possible_cpu(cpu) | ||
287 | tg_stats_init(per_cpu_ptr(stats_cpu, cpu)); | ||
288 | } | ||
289 | |||
290 | spin_lock_irq(&tg_stats_alloc_lock); | ||
291 | |||
292 | if (!list_empty(&tg_stats_alloc_list)) { | ||
293 | struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, | ||
294 | struct throtl_grp, | ||
295 | stats_alloc_node); | ||
296 | swap(tg->stats_cpu, stats_cpu); | ||
297 | list_del_init(&tg->stats_alloc_node); | ||
298 | } | ||
299 | |||
300 | empty = list_empty(&tg_stats_alloc_list); | ||
301 | spin_unlock_irq(&tg_stats_alloc_lock); | ||
302 | if (!empty) | ||
303 | goto alloc_stats; | ||
304 | } | ||
305 | |||
306 | static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) | 233 | static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) |
307 | { | 234 | { |
308 | INIT_LIST_HEAD(&qn->node); | 235 | INIT_LIST_HEAD(&qn->node); |
@@ -387,29 +314,46 @@ static struct bio *throtl_pop_queued(struct list_head *queued, | |||
387 | } | 314 | } |
388 | 315 | ||
389 | /* init a service_queue, assumes the caller zeroed it */ | 316 | /* init a service_queue, assumes the caller zeroed it */ |
390 | static void throtl_service_queue_init(struct throtl_service_queue *sq, | 317 | static void throtl_service_queue_init(struct throtl_service_queue *sq) |
391 | struct throtl_service_queue *parent_sq) | ||
392 | { | 318 | { |
393 | INIT_LIST_HEAD(&sq->queued[0]); | 319 | INIT_LIST_HEAD(&sq->queued[0]); |
394 | INIT_LIST_HEAD(&sq->queued[1]); | 320 | INIT_LIST_HEAD(&sq->queued[1]); |
395 | sq->pending_tree = RB_ROOT; | 321 | sq->pending_tree = RB_ROOT; |
396 | sq->parent_sq = parent_sq; | ||
397 | setup_timer(&sq->pending_timer, throtl_pending_timer_fn, | 322 | setup_timer(&sq->pending_timer, throtl_pending_timer_fn, |
398 | (unsigned long)sq); | 323 | (unsigned long)sq); |
399 | } | 324 | } |
400 | 325 | ||
401 | static void throtl_service_queue_exit(struct throtl_service_queue *sq) | 326 | static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) |
402 | { | 327 | { |
403 | del_timer_sync(&sq->pending_timer); | 328 | struct throtl_grp *tg; |
329 | int rw; | ||
330 | |||
331 | tg = kzalloc_node(sizeof(*tg), gfp, node); | ||
332 | if (!tg) | ||
333 | return NULL; | ||
334 | |||
335 | throtl_service_queue_init(&tg->service_queue); | ||
336 | |||
337 | for (rw = READ; rw <= WRITE; rw++) { | ||
338 | throtl_qnode_init(&tg->qnode_on_self[rw], tg); | ||
339 | throtl_qnode_init(&tg->qnode_on_parent[rw], tg); | ||
340 | } | ||
341 | |||
342 | RB_CLEAR_NODE(&tg->rb_node); | ||
343 | tg->bps[READ] = -1; | ||
344 | tg->bps[WRITE] = -1; | ||
345 | tg->iops[READ] = -1; | ||
346 | tg->iops[WRITE] = -1; | ||
347 | |||
348 | return &tg->pd; | ||
404 | } | 349 | } |
405 | 350 | ||
406 | static void throtl_pd_init(struct blkcg_gq *blkg) | 351 | static void throtl_pd_init(struct blkg_policy_data *pd) |
407 | { | 352 | { |
408 | struct throtl_grp *tg = blkg_to_tg(blkg); | 353 | struct throtl_grp *tg = pd_to_tg(pd); |
354 | struct blkcg_gq *blkg = tg_to_blkg(tg); | ||
409 | struct throtl_data *td = blkg->q->td; | 355 | struct throtl_data *td = blkg->q->td; |
410 | struct throtl_service_queue *parent_sq; | 356 | struct throtl_service_queue *sq = &tg->service_queue; |
411 | unsigned long flags; | ||
412 | int rw; | ||
413 | 357 | ||
414 | /* | 358 | /* |
415 | * If on the default hierarchy, we switch to properly hierarchical | 359 | * If on the default hierarchy, we switch to properly hierarchical |
@@ -424,35 +368,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg) | |||
424 | * Limits of a group don't interact with limits of other groups | 368 | * Limits of a group don't interact with limits of other groups |
425 | * regardless of the position of the group in the hierarchy. | 369 | * regardless of the position of the group in the hierarchy. |
426 | */ | 370 | */ |
427 | parent_sq = &td->service_queue; | 371 | sq->parent_sq = &td->service_queue; |
428 | |||
429 | if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) | 372 | if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) |
430 | parent_sq = &blkg_to_tg(blkg->parent)->service_queue; | 373 | sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; |
431 | |||
432 | throtl_service_queue_init(&tg->service_queue, parent_sq); | ||
433 | |||
434 | for (rw = READ; rw <= WRITE; rw++) { | ||
435 | throtl_qnode_init(&tg->qnode_on_self[rw], tg); | ||
436 | throtl_qnode_init(&tg->qnode_on_parent[rw], tg); | ||
437 | } | ||
438 | |||
439 | RB_CLEAR_NODE(&tg->rb_node); | ||
440 | tg->td = td; | 374 | tg->td = td; |
441 | |||
442 | tg->bps[READ] = -1; | ||
443 | tg->bps[WRITE] = -1; | ||
444 | tg->iops[READ] = -1; | ||
445 | tg->iops[WRITE] = -1; | ||
446 | |||
447 | /* | ||
448 | * Ugh... We need to perform per-cpu allocation for tg->stats_cpu | ||
449 | * but percpu allocator can't be called from IO path. Queue tg on | ||
450 | * tg_stats_alloc_list and allocate from work item. | ||
451 | */ | ||
452 | spin_lock_irqsave(&tg_stats_alloc_lock, flags); | ||
453 | list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); | ||
454 | schedule_delayed_work(&tg_stats_alloc_work, 0); | ||
455 | spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); | ||
456 | } | 375 | } |
457 | 376 | ||
458 | /* | 377 | /* |
@@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg) | |||
470 | (tg->bps[rw] != -1 || tg->iops[rw] != -1); | 389 | (tg->bps[rw] != -1 || tg->iops[rw] != -1); |
471 | } | 390 | } |
472 | 391 | ||
473 | static void throtl_pd_online(struct blkcg_gq *blkg) | 392 | static void throtl_pd_online(struct blkg_policy_data *pd) |
474 | { | 393 | { |
475 | /* | 394 | /* |
476 | * We don't want new groups to escape the limits of their ancestors. | 395 |
477 | * Update has_rules[] after a new group is brought online. | 396 | * Update has_rules[] after a new group is brought online. |
478 | */ | 397 | */ |
479 | tg_update_has_rules(blkg_to_tg(blkg)); | 398 | tg_update_has_rules(pd_to_tg(pd)); |
480 | } | ||
481 | |||
482 | static void throtl_pd_exit(struct blkcg_gq *blkg) | ||
483 | { | ||
484 | struct throtl_grp *tg = blkg_to_tg(blkg); | ||
485 | unsigned long flags; | ||
486 | |||
487 | spin_lock_irqsave(&tg_stats_alloc_lock, flags); | ||
488 | list_del_init(&tg->stats_alloc_node); | ||
489 | spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); | ||
490 | |||
491 | free_percpu(tg->stats_cpu); | ||
492 | |||
493 | throtl_service_queue_exit(&tg->service_queue); | ||
494 | } | ||
495 | |||
496 | static void throtl_pd_reset_stats(struct blkcg_gq *blkg) | ||
497 | { | ||
498 | struct throtl_grp *tg = blkg_to_tg(blkg); | ||
499 | int cpu; | ||
500 | |||
501 | if (tg->stats_cpu == NULL) | ||
502 | return; | ||
503 | |||
504 | for_each_possible_cpu(cpu) { | ||
505 | struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); | ||
506 | |||
507 | blkg_rwstat_reset(&sc->service_bytes); | ||
508 | blkg_rwstat_reset(&sc->serviced); | ||
509 | } | ||
510 | } | ||
511 | |||
512 | static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, | ||
513 | struct blkcg *blkcg) | ||
514 | { | ||
515 | /* | ||
516 | * This is the common case when there are no blkcgs. Avoid lookup | ||
517 | * in this case | ||
518 | */ | ||
519 | if (blkcg == &blkcg_root) | ||
520 | return td_root_tg(td); | ||
521 | |||
522 | return blkg_to_tg(blkg_lookup(blkcg, td->queue)); | ||
523 | } | 399 | } |
524 | 400 | ||
525 | static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, | 401 | static void throtl_pd_free(struct blkg_policy_data *pd) |
526 | struct blkcg *blkcg) | ||
527 | { | 402 | { |
528 | struct request_queue *q = td->queue; | 403 | struct throtl_grp *tg = pd_to_tg(pd); |
529 | struct throtl_grp *tg = NULL; | ||
530 | |||
531 | /* | ||
532 | * This is the common case when there are no blkcgs. Avoid lookup | ||
533 | * in this case | ||
534 | */ | ||
535 | if (blkcg == &blkcg_root) { | ||
536 | tg = td_root_tg(td); | ||
537 | } else { | ||
538 | struct blkcg_gq *blkg; | ||
539 | |||
540 | blkg = blkg_lookup_create(blkcg, q); | ||
541 | |||
542 | /* if %NULL and @q is alive, fall back to root_tg */ | ||
543 | if (!IS_ERR(blkg)) | ||
544 | tg = blkg_to_tg(blkg); | ||
545 | else if (!blk_queue_dying(q)) | ||
546 | tg = td_root_tg(td); | ||
547 | } | ||
548 | 404 | ||
549 | return tg; | 405 | del_timer_sync(&tg->service_queue.pending_timer); |
406 | kfree(tg); | ||
550 | } | 407 | } |
551 | 408 | ||
552 | static struct throtl_grp * | 409 | static struct throtl_grp * |
@@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, | |||
956 | return 0; | 813 | return 0; |
957 | } | 814 | } |
958 | 815 | ||
959 | static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, | ||
960 | int rw) | ||
961 | { | ||
962 | struct throtl_grp *tg = blkg_to_tg(blkg); | ||
963 | struct tg_stats_cpu *stats_cpu; | ||
964 | unsigned long flags; | ||
965 | |||
966 | /* If per cpu stats are not allocated yet, don't do any accounting. */ | ||
967 | if (tg->stats_cpu == NULL) | ||
968 | return; | ||
969 | |||
970 | /* | ||
971 | * Disabling interrupts to provide mutual exclusion between two | ||
972 | * writes on same cpu. It probably is not needed for 64bit. Not | ||
973 | * optimizing that case yet. | ||
974 | */ | ||
975 | local_irq_save(flags); | ||
976 | |||
977 | stats_cpu = this_cpu_ptr(tg->stats_cpu); | ||
978 | |||
979 | blkg_rwstat_add(&stats_cpu->serviced, rw, 1); | ||
980 | blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); | ||
981 | |||
982 | local_irq_restore(flags); | ||
983 | } | ||
984 | |||
985 | static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) | 816 | static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) |
986 | { | 817 | { |
987 | bool rw = bio_data_dir(bio); | 818 | bool rw = bio_data_dir(bio); |
@@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) | |||
995 | * more than once as a throttled bio will go through blk-throtl the | 826 | * more than once as a throttled bio will go through blk-throtl the |
996 | * second time when it eventually gets issued. Set it when a bio | 827 | * second time when it eventually gets issued. Set it when a bio |
997 | * is being charged to a tg. | 828 | * is being charged to a tg. |
998 | * | ||
999 | * Dispatch stats aren't recursive and each @bio should only be | ||
1000 | * accounted by the @tg it was originally associated with. Let's | ||
1001 | * update the stats when setting REQ_THROTTLED for the first time | ||
1002 | * which is guaranteed to be for the @bio's original tg. | ||
1003 | */ | 829 | */ |
1004 | if (!(bio->bi_rw & REQ_THROTTLED)) { | 830 | if (!(bio->bi_rw & REQ_THROTTLED)) |
1005 | bio->bi_rw |= REQ_THROTTLED; | 831 | bio->bi_rw |= REQ_THROTTLED; |
1006 | throtl_update_dispatch_stats(tg_to_blkg(tg), | ||
1007 | bio->bi_iter.bi_size, bio->bi_rw); | ||
1008 | } | ||
1009 | } | 832 | } |
1010 | 833 | ||
1011 | /** | 834 | /** |
@@ -1285,34 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) | |||
1285 | } | 1108 | } |
1286 | } | 1109 | } |
1287 | 1110 | ||
1288 | static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, | ||
1289 | struct blkg_policy_data *pd, int off) | ||
1290 | { | ||
1291 | struct throtl_grp *tg = pd_to_tg(pd); | ||
1292 | struct blkg_rwstat rwstat = { }, tmp; | ||
1293 | int i, cpu; | ||
1294 | |||
1295 | if (tg->stats_cpu == NULL) | ||
1296 | return 0; | ||
1297 | |||
1298 | for_each_possible_cpu(cpu) { | ||
1299 | struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); | ||
1300 | |||
1301 | tmp = blkg_rwstat_read((void *)sc + off); | ||
1302 | for (i = 0; i < BLKG_RWSTAT_NR; i++) | ||
1303 | rwstat.cnt[i] += tmp.cnt[i]; | ||
1304 | } | ||
1305 | |||
1306 | return __blkg_prfill_rwstat(sf, pd, &rwstat); | ||
1307 | } | ||
1308 | |||
1309 | static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) | ||
1310 | { | ||
1311 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, | ||
1312 | &blkcg_policy_throtl, seq_cft(sf)->private, true); | ||
1313 | return 0; | ||
1314 | } | ||
1315 | |||
1316 | static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, | 1111 | static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, |
1317 | int off) | 1112 | int off) |
1318 | { | 1113 | { |
@@ -1349,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) | |||
1349 | return 0; | 1144 | return 0; |
1350 | } | 1145 | } |
1351 | 1146 | ||
1352 | static ssize_t tg_set_conf(struct kernfs_open_file *of, | 1147 | static void tg_conf_updated(struct throtl_grp *tg) |
1353 | char *buf, size_t nbytes, loff_t off, bool is_u64) | ||
1354 | { | 1148 | { |
1355 | struct blkcg *blkcg = css_to_blkcg(of_css(of)); | 1149 | struct throtl_service_queue *sq = &tg->service_queue; |
1356 | struct blkg_conf_ctx ctx; | ||
1357 | struct throtl_grp *tg; | ||
1358 | struct throtl_service_queue *sq; | ||
1359 | struct blkcg_gq *blkg; | ||
1360 | struct cgroup_subsys_state *pos_css; | 1150 | struct cgroup_subsys_state *pos_css; |
1361 | int ret; | 1151 | struct blkcg_gq *blkg; |
1362 | |||
1363 | ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); | ||
1364 | if (ret) | ||
1365 | return ret; | ||
1366 | |||
1367 | tg = blkg_to_tg(ctx.blkg); | ||
1368 | sq = &tg->service_queue; | ||
1369 | |||
1370 | if (!ctx.v) | ||
1371 | ctx.v = -1; | ||
1372 | |||
1373 | if (is_u64) | ||
1374 | *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v; | ||
1375 | else | ||
1376 | *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v; | ||
1377 | 1152 | ||
1378 | throtl_log(&tg->service_queue, | 1153 | throtl_log(&tg->service_queue, |
1379 | "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", | 1154 | "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", |
@@ -1387,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, | |||
1387 | * restrictions in the whole hierarchy and allows them to bypass | 1162 | * restrictions in the whole hierarchy and allows them to bypass |
1388 | * blk-throttle. | 1163 | * blk-throttle. |
1389 | */ | 1164 | */ |
1390 | blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) | 1165 | blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) |
1391 | tg_update_has_rules(blkg_to_tg(blkg)); | 1166 | tg_update_has_rules(blkg_to_tg(blkg)); |
1392 | 1167 | ||
1393 | /* | 1168 | /* |
@@ -1405,9 +1180,39 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, | |||
1405 | tg_update_disptime(tg); | 1180 | tg_update_disptime(tg); |
1406 | throtl_schedule_next_dispatch(sq->parent_sq, true); | 1181 | throtl_schedule_next_dispatch(sq->parent_sq, true); |
1407 | } | 1182 | } |
1183 | } | ||
1184 | |||
1185 | static ssize_t tg_set_conf(struct kernfs_open_file *of, | ||
1186 | char *buf, size_t nbytes, loff_t off, bool is_u64) | ||
1187 | { | ||
1188 | struct blkcg *blkcg = css_to_blkcg(of_css(of)); | ||
1189 | struct blkg_conf_ctx ctx; | ||
1190 | struct throtl_grp *tg; | ||
1191 | int ret; | ||
1192 | u64 v; | ||
1408 | 1193 | ||
1194 | ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); | ||
1195 | if (ret) | ||
1196 | return ret; | ||
1197 | |||
1198 | ret = -EINVAL; | ||
1199 | if (sscanf(ctx.body, "%llu", &v) != 1) | ||
1200 | goto out_finish; | ||
1201 | if (!v) | ||
1202 | v = -1; | ||
1203 | |||
1204 | tg = blkg_to_tg(ctx.blkg); | ||
1205 | |||
1206 | if (is_u64) | ||
1207 | *(u64 *)((void *)tg + of_cft(of)->private) = v; | ||
1208 | else | ||
1209 | *(unsigned int *)((void *)tg + of_cft(of)->private) = v; | ||
1210 | |||
1211 | tg_conf_updated(tg); | ||
1212 | ret = 0; | ||
1213 | out_finish: | ||
1409 | blkg_conf_finish(&ctx); | 1214 | blkg_conf_finish(&ctx); |
1410 | return nbytes; | 1215 | return ret ?: nbytes; |
1411 | } | 1216 | } |
1412 | 1217 | ||
1413 | static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, | 1218 | static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, |
@@ -1422,7 +1227,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, | |||
1422 | return tg_set_conf(of, buf, nbytes, off, false); | 1227 | return tg_set_conf(of, buf, nbytes, off, false); |
1423 | } | 1228 | } |
1424 | 1229 | ||
1425 | static struct cftype throtl_files[] = { | 1230 | static struct cftype throtl_legacy_files[] = { |
1426 | { | 1231 | { |
1427 | .name = "throttle.read_bps_device", | 1232 | .name = "throttle.read_bps_device", |
1428 | .private = offsetof(struct throtl_grp, bps[READ]), | 1233 | .private = offsetof(struct throtl_grp, bps[READ]), |
@@ -1449,13 +1254,124 @@ static struct cftype throtl_files[] = { | |||
1449 | }, | 1254 | }, |
1450 | { | 1255 | { |
1451 | .name = "throttle.io_service_bytes", | 1256 | .name = "throttle.io_service_bytes", |
1452 | .private = offsetof(struct tg_stats_cpu, service_bytes), | 1257 | .private = (unsigned long)&blkcg_policy_throtl, |
1453 | .seq_show = tg_print_cpu_rwstat, | 1258 | .seq_show = blkg_print_stat_bytes, |
1454 | }, | 1259 | }, |
1455 | { | 1260 | { |
1456 | .name = "throttle.io_serviced", | 1261 | .name = "throttle.io_serviced", |
1457 | .private = offsetof(struct tg_stats_cpu, serviced), | 1262 | .private = (unsigned long)&blkcg_policy_throtl, |
1458 | .seq_show = tg_print_cpu_rwstat, | 1263 | .seq_show = blkg_print_stat_ios, |
1264 | }, | ||
1265 | { } /* terminate */ | ||
1266 | }; | ||
1267 | |||
1268 | static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd, | ||
1269 | int off) | ||
1270 | { | ||
1271 | struct throtl_grp *tg = pd_to_tg(pd); | ||
1272 | const char *dname = blkg_dev_name(pd->blkg); | ||
1273 | char bufs[4][21] = { "max", "max", "max", "max" }; | ||
1274 | |||
1275 | if (!dname) | ||
1276 | return 0; | ||
1277 | if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && | ||
1278 | tg->iops[READ] == -1 && tg->iops[WRITE] == -1) | ||
1279 | return 0; | ||
1280 | |||
1281 | if (tg->bps[READ] != -1) | ||
1282 | snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); | ||
1283 | if (tg->bps[WRITE] != -1) | ||
1284 | snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]); | ||
1285 | if (tg->iops[READ] != -1) | ||
1286 | snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); | ||
1287 | if (tg->iops[WRITE] != -1) | ||
1288 | snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); | ||
1289 | |||
1290 | seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", | ||
1291 | dname, bufs[0], bufs[1], bufs[2], bufs[3]); | ||
1292 | return 0; | ||
1293 | } | ||
1294 | |||
1295 | static int tg_print_max(struct seq_file *sf, void *v) | ||
1296 | { | ||
1297 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, | ||
1298 | &blkcg_policy_throtl, seq_cft(sf)->private, false); | ||
1299 | return 0; | ||
1300 | } | ||
1301 | |||
1302 | static ssize_t tg_set_max(struct kernfs_open_file *of, | ||
1303 | char *buf, size_t nbytes, loff_t off) | ||
1304 | { | ||
1305 | struct blkcg *blkcg = css_to_blkcg(of_css(of)); | ||
1306 | struct blkg_conf_ctx ctx; | ||
1307 | struct throtl_grp *tg; | ||
1308 | u64 v[4]; | ||
1309 | int ret; | ||
1310 | |||
1311 | ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); | ||
1312 | if (ret) | ||
1313 | return ret; | ||
1314 | |||
1315 | tg = blkg_to_tg(ctx.blkg); | ||
1316 | |||
1317 | v[0] = tg->bps[READ]; | ||
1318 | v[1] = tg->bps[WRITE]; | ||
1319 | v[2] = tg->iops[READ]; | ||
1320 | v[3] = tg->iops[WRITE]; | ||
1321 | |||
1322 | while (true) { | ||
1323 | char tok[27]; /* wiops=18446744073709551616 */ | ||
1324 | char *p; | ||
1325 | u64 val = -1; | ||
1326 | int len; | ||
1327 | |||
1328 | if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) | ||
1329 | break; | ||
1330 | if (tok[0] == '\0') | ||
1331 | break; | ||
1332 | ctx.body += len; | ||
1333 | |||
1334 | ret = -EINVAL; | ||
1335 | p = tok; | ||
1336 | strsep(&p, "="); | ||
1337 | if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) | ||
1338 | goto out_finish; | ||
1339 | |||
1340 | ret = -ERANGE; | ||
1341 | if (!val) | ||
1342 | goto out_finish; | ||
1343 | |||
1344 | ret = -EINVAL; | ||
1345 | if (!strcmp(tok, "rbps")) | ||
1346 | v[0] = val; | ||
1347 | else if (!strcmp(tok, "wbps")) | ||
1348 | v[1] = val; | ||
1349 | else if (!strcmp(tok, "riops")) | ||
1350 | v[2] = min_t(u64, val, UINT_MAX); | ||
1351 | else if (!strcmp(tok, "wiops")) | ||
1352 | v[3] = min_t(u64, val, UINT_MAX); | ||
1353 | else | ||
1354 | goto out_finish; | ||
1355 | } | ||
1356 | |||
1357 | tg->bps[READ] = v[0]; | ||
1358 | tg->bps[WRITE] = v[1]; | ||
1359 | tg->iops[READ] = v[2]; | ||
1360 | tg->iops[WRITE] = v[3]; | ||
1361 | |||
1362 | tg_conf_updated(tg); | ||
1363 | ret = 0; | ||
1364 | out_finish: | ||
1365 | blkg_conf_finish(&ctx); | ||
1366 | return ret ?: nbytes; | ||
1367 | } | ||
1368 | |||
1369 | static struct cftype throtl_files[] = { | ||
1370 | { | ||
1371 | .name = "max", | ||
1372 | .flags = CFTYPE_NOT_ON_ROOT, | ||
1373 | .seq_show = tg_print_max, | ||
1374 | .write = tg_set_max, | ||
1459 | }, | 1375 | }, |
1460 | { } /* terminate */ | 1376 | { } /* terminate */ |
1461 | }; | 1377 | }; |
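
For reference, tg_set_max() above reads the current four limits into v[], then overwrites only the tokens present in the written line; each of rbps/wbps/riops/wiops takes either a number or the literal "max" to clear that limit. A minimal user-space sketch of feeding it such a line follows; the cgroupfs mount point and the "io.max" filename are assumptions about how the new "max" cftype shows up on the default hierarchy, so adjust the path for the actual setup.

	/* Illustrative only: writes one limit line in the format parsed by tg_set_max(). */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* assumed path; the cftype is named "max" and is not created on the root cgroup */
		const char *path = "/sys/fs/cgroup/test/io.max";
		/* 8:0 is an example device; tokens not mentioned keep their current value */
		const char *line = "8:0 rbps=1048576 wiops=max\n";
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, line, strlen(line)) < 0)
			perror("write");
		close(fd);
		return 0;
	}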
@@ -1468,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q) | |||
1468 | } | 1384 | } |
1469 | 1385 | ||
1470 | static struct blkcg_policy blkcg_policy_throtl = { | 1386 | static struct blkcg_policy blkcg_policy_throtl = { |
1471 | .pd_size = sizeof(struct throtl_grp), | 1387 | .dfl_cftypes = throtl_files, |
1472 | .cftypes = throtl_files, | 1388 | .legacy_cftypes = throtl_legacy_files, |
1473 | 1389 | ||
1390 | .pd_alloc_fn = throtl_pd_alloc, | ||
1474 | .pd_init_fn = throtl_pd_init, | 1391 | .pd_init_fn = throtl_pd_init, |
1475 | .pd_online_fn = throtl_pd_online, | 1392 | .pd_online_fn = throtl_pd_online, |
1476 | .pd_exit_fn = throtl_pd_exit, | 1393 | .pd_free_fn = throtl_pd_free, |
1477 | .pd_reset_stats_fn = throtl_pd_reset_stats, | ||
1478 | }; | 1394 | }; |
1479 | 1395 | ||
1480 | bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | 1396 | bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, |
1397 | struct bio *bio) | ||
1481 | { | 1398 | { |
1482 | struct throtl_data *td = q->td; | ||
1483 | struct throtl_qnode *qn = NULL; | 1399 | struct throtl_qnode *qn = NULL; |
1484 | struct throtl_grp *tg; | 1400 | struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); |
1485 | struct throtl_service_queue *sq; | 1401 | struct throtl_service_queue *sq; |
1486 | bool rw = bio_data_dir(bio); | 1402 | bool rw = bio_data_dir(bio); |
1487 | struct blkcg *blkcg; | ||
1488 | bool throttled = false; | 1403 | bool throttled = false; |
1489 | 1404 | ||
1405 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
1406 | |||
1490 | /* see throtl_charge_bio() */ | 1407 | /* see throtl_charge_bio() */ |
1491 | if (bio->bi_rw & REQ_THROTTLED) | 1408 | if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw]) |
1492 | goto out; | 1409 | goto out; |
1493 | 1410 | ||
1494 | /* | ||
1495 | * A throtl_grp pointer retrieved under rcu can be used to access | ||
1496 | * basic fields like stats and io rates. If a group has no rules, | ||
1497 | * just update the dispatch stats in lockless manner and return. | ||
1498 | */ | ||
1499 | rcu_read_lock(); | ||
1500 | blkcg = bio_blkcg(bio); | ||
1501 | tg = throtl_lookup_tg(td, blkcg); | ||
1502 | if (tg) { | ||
1503 | if (!tg->has_rules[rw]) { | ||
1504 | throtl_update_dispatch_stats(tg_to_blkg(tg), | ||
1505 | bio->bi_iter.bi_size, bio->bi_rw); | ||
1506 | goto out_unlock_rcu; | ||
1507 | } | ||
1508 | } | ||
1509 | |||
1510 | /* | ||
1511 | * Either group has not been allocated yet or it is not an unlimited | ||
1512 | * IO group | ||
1513 | */ | ||
1514 | spin_lock_irq(q->queue_lock); | 1411 | spin_lock_irq(q->queue_lock); |
1515 | tg = throtl_lookup_create_tg(td, blkcg); | 1412 | |
1516 | if (unlikely(!tg)) | 1413 | if (unlikely(blk_queue_bypass(q))) |
1517 | goto out_unlock; | 1414 | goto out_unlock; |
1518 | 1415 | ||
1519 | sq = &tg->service_queue; | 1416 | sq = &tg->service_queue; |
@@ -1580,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | |||
1580 | 1477 | ||
1581 | out_unlock: | 1478 | out_unlock: |
1582 | spin_unlock_irq(q->queue_lock); | 1479 | spin_unlock_irq(q->queue_lock); |
1583 | out_unlock_rcu: | ||
1584 | rcu_read_unlock(); | ||
1585 | out: | 1480 | out: |
1586 | /* | 1481 | /* |
1587 | * As multiple blk-throtls may stack in the same issue path, we | 1482 | * As multiple blk-throtls may stack in the same issue path, we |
@@ -1667,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q) | |||
1667 | return -ENOMEM; | 1562 | return -ENOMEM; |
1668 | 1563 | ||
1669 | INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); | 1564 | INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); |
1670 | throtl_service_queue_init(&td->service_queue, NULL); | 1565 | throtl_service_queue_init(&td->service_queue); |
1671 | 1566 | ||
1672 | q->td = td; | 1567 | q->td = td; |
1673 | td->queue = q; | 1568 | td->queue = q; |
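
The reworked blk_throtl_bio() above no longer performs its own blkcg/blkg lookup: it warns unless rcu_read_lock() is held and falls back to q->root_blkg when the passed @blkg is NULL. The actual issue-path call site is not part of these hunks, so the sketch below only illustrates the expected caller contract; the wrapper name and surrounding details are made up for illustration.

	/* Illustrative caller sketch for the new blk_throtl_bio(q, blkg, bio) contract. */
	static bool issue_path_throttle(struct request_queue *q, struct bio *bio)
	{
		struct blkcg_gq *blkg;
		bool throttled;

		rcu_read_lock();
		/* the lookup may return NULL; blk_throtl_bio() then uses q->root_blkg */
		blkg = blkg_lookup(bio_blkcg(bio), q);
		throttled = blk_throtl_bio(q, blkg, bio);
		rcu_read_unlock();

		return throttled;
	}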
diff --git a/block/blk.h b/block/blk.h index 838188b35a83..98614ad37c81 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -272,15 +272,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) | |||
272 | * Internal throttling interface | 272 | * Internal throttling interface |
273 | */ | 273 | */ |
274 | #ifdef CONFIG_BLK_DEV_THROTTLING | 274 | #ifdef CONFIG_BLK_DEV_THROTTLING |
275 | extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); | ||
276 | extern void blk_throtl_drain(struct request_queue *q); | 275 | extern void blk_throtl_drain(struct request_queue *q); |
277 | extern int blk_throtl_init(struct request_queue *q); | 276 | extern int blk_throtl_init(struct request_queue *q); |
278 | extern void blk_throtl_exit(struct request_queue *q); | 277 | extern void blk_throtl_exit(struct request_queue *q); |
279 | #else /* CONFIG_BLK_DEV_THROTTLING */ | 278 | #else /* CONFIG_BLK_DEV_THROTTLING */ |
280 | static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | ||
281 | { | ||
282 | return false; | ||
283 | } | ||
284 | static inline void blk_throtl_drain(struct request_queue *q) { } | 279 | static inline void blk_throtl_drain(struct request_queue *q) { } |
285 | static inline int blk_throtl_init(struct request_queue *q) { return 0; } | 280 | static inline int blk_throtl_init(struct request_queue *q) { return 0; } |
286 | static inline void blk_throtl_exit(struct request_queue *q) { } | 281 | static inline void blk_throtl_exit(struct request_queue *q) { } |
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c62bb2e650b8..04de88463a98 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -68,9 +68,9 @@ static struct kmem_cache *cfq_pool; | |||
68 | #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) | 68 | #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) |
69 | 69 | ||
70 | /* blkio-related constants */ | 70 | /* blkio-related constants */ |
71 | #define CFQ_WEIGHT_MIN 10 | 71 | #define CFQ_WEIGHT_LEGACY_MIN 10 |
72 | #define CFQ_WEIGHT_MAX 1000 | 72 | #define CFQ_WEIGHT_LEGACY_DFL 500 |
73 | #define CFQ_WEIGHT_DEFAULT 500 | 73 | #define CFQ_WEIGHT_LEGACY_MAX 1000 |
74 | 74 | ||
75 | struct cfq_ttime { | 75 | struct cfq_ttime { |
76 | unsigned long last_end_request; | 76 | unsigned long last_end_request; |
@@ -177,10 +177,6 @@ enum wl_type_t { | |||
177 | 177 | ||
178 | struct cfqg_stats { | 178 | struct cfqg_stats { |
179 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 179 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
180 | /* total bytes transferred */ | ||
181 | struct blkg_rwstat service_bytes; | ||
182 | /* total IOs serviced, post merge */ | ||
183 | struct blkg_rwstat serviced; | ||
184 | /* number of ios merged */ | 180 | /* number of ios merged */ |
185 | struct blkg_rwstat merged; | 181 | struct blkg_rwstat merged; |
186 | /* total time spent on device in ns, may not be accurate w/ queueing */ | 182 | /* total time spent on device in ns, may not be accurate w/ queueing */ |
@@ -189,8 +185,6 @@ struct cfqg_stats { | |||
189 | struct blkg_rwstat wait_time; | 185 | struct blkg_rwstat wait_time; |
190 | /* number of IOs queued up */ | 186 | /* number of IOs queued up */ |
191 | struct blkg_rwstat queued; | 187 | struct blkg_rwstat queued; |
192 | /* total sectors transferred */ | ||
193 | struct blkg_stat sectors; | ||
194 | /* total disk time and nr sectors dispatched by this group */ | 188 | /* total disk time and nr sectors dispatched by this group */ |
195 | struct blkg_stat time; | 189 | struct blkg_stat time; |
196 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 190 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
@@ -220,7 +214,7 @@ struct cfqg_stats { | |||
220 | /* Per-cgroup data */ | 214 | /* Per-cgroup data */ |
221 | struct cfq_group_data { | 215 | struct cfq_group_data { |
222 | /* must be the first member */ | 216 | /* must be the first member */ |
223 | struct blkcg_policy_data pd; | 217 | struct blkcg_policy_data cpd; |
224 | 218 | ||
225 | unsigned int weight; | 219 | unsigned int weight; |
226 | unsigned int leaf_weight; | 220 | unsigned int leaf_weight; |
@@ -304,7 +298,11 @@ struct cfq_group { | |||
304 | int dispatched; | 298 | int dispatched; |
305 | struct cfq_ttime ttime; | 299 | struct cfq_ttime ttime; |
306 | struct cfqg_stats stats; /* stats for this cfqg */ | 300 | struct cfqg_stats stats; /* stats for this cfqg */ |
307 | struct cfqg_stats dead_stats; /* stats pushed from dead children */ | 301 | |
302 | /* async queue for each priority case */ | ||
303 | struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; | ||
304 | struct cfq_queue *async_idle_cfqq; | ||
305 | |||
308 | }; | 306 | }; |
309 | 307 | ||
310 | struct cfq_io_cq { | 308 | struct cfq_io_cq { |
@@ -370,12 +368,6 @@ struct cfq_data { | |||
370 | struct cfq_queue *active_queue; | 368 | struct cfq_queue *active_queue; |
371 | struct cfq_io_cq *active_cic; | 369 | struct cfq_io_cq *active_cic; |
372 | 370 | ||
373 | /* | ||
374 | * async queue for each priority case | ||
375 | */ | ||
376 | struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; | ||
377 | struct cfq_queue *async_idle_cfqq; | ||
378 | |||
379 | sector_t last_position; | 371 | sector_t last_position; |
380 | 372 | ||
381 | /* | 373 | /* |
@@ -401,6 +393,7 @@ struct cfq_data { | |||
401 | }; | 393 | }; |
402 | 394 | ||
403 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); | 395 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); |
396 | static void cfq_put_queue(struct cfq_queue *cfqq); | ||
404 | 397 | ||
405 | static struct cfq_rb_root *st_for(struct cfq_group *cfqg, | 398 | static struct cfq_rb_root *st_for(struct cfq_group *cfqg, |
406 | enum wl_class_t class, | 399 | enum wl_class_t class, |
@@ -612,7 +605,7 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) | |||
612 | static struct cfq_group_data | 605 | static struct cfq_group_data |
613 | *cpd_to_cfqgd(struct blkcg_policy_data *cpd) | 606 | *cpd_to_cfqgd(struct blkcg_policy_data *cpd) |
614 | { | 607 | { |
615 | return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL; | 608 | return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL; |
616 | } | 609 | } |
617 | 610 | ||
618 | static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) | 611 | static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) |
@@ -693,14 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) | |||
693 | blkg_rwstat_add(&cfqg->stats.merged, rw, 1); | 686 | blkg_rwstat_add(&cfqg->stats.merged, rw, 1); |
694 | } | 687 | } |
695 | 688 | ||
696 | static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, | ||
697 | uint64_t bytes, int rw) | ||
698 | { | ||
699 | blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); | ||
700 | blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); | ||
701 | blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); | ||
702 | } | ||
703 | |||
704 | static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, | 689 | static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, |
705 | uint64_t start_time, uint64_t io_start_time, int rw) | 690 | uint64_t start_time, uint64_t io_start_time, int rw) |
706 | { | 691 | { |
@@ -718,8 +703,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, | |||
718 | static void cfqg_stats_reset(struct cfqg_stats *stats) | 703 | static void cfqg_stats_reset(struct cfqg_stats *stats) |
719 | { | 704 | { |
720 | /* queued stats shouldn't be cleared */ | 705 | /* queued stats shouldn't be cleared */ |
721 | blkg_rwstat_reset(&stats->service_bytes); | ||
722 | blkg_rwstat_reset(&stats->serviced); | ||
723 | blkg_rwstat_reset(&stats->merged); | 706 | blkg_rwstat_reset(&stats->merged); |
724 | blkg_rwstat_reset(&stats->service_time); | 707 | blkg_rwstat_reset(&stats->service_time); |
725 | blkg_rwstat_reset(&stats->wait_time); | 708 | blkg_rwstat_reset(&stats->wait_time); |
@@ -736,28 +719,26 @@ static void cfqg_stats_reset(struct cfqg_stats *stats) | |||
736 | } | 719 | } |
737 | 720 | ||
738 | /* @to += @from */ | 721 | /* @to += @from */ |
739 | static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) | 722 | static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) |
740 | { | 723 | { |
741 | /* queued stats shouldn't be cleared */ | 724 | /* queued stats shouldn't be cleared */ |
742 | blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); | 725 | blkg_rwstat_add_aux(&to->merged, &from->merged); |
743 | blkg_rwstat_merge(&to->serviced, &from->serviced); | 726 | blkg_rwstat_add_aux(&to->service_time, &from->service_time); |
744 | blkg_rwstat_merge(&to->merged, &from->merged); | 727 | blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); |
745 | blkg_rwstat_merge(&to->service_time, &from->service_time); | 728 | blkg_stat_add_aux(&from->time, &from->time); |
746 | blkg_rwstat_merge(&to->wait_time, &from->wait_time); | ||
747 | blkg_stat_merge(&from->time, &from->time); | ||
748 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 729 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
749 | blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); | 730 | blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); |
750 | blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); | 731 | blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); |
751 | blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); | 732 | blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); |
752 | blkg_stat_merge(&to->dequeue, &from->dequeue); | 733 | blkg_stat_add_aux(&to->dequeue, &from->dequeue); |
753 | blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); | 734 | blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); |
754 | blkg_stat_merge(&to->idle_time, &from->idle_time); | 735 | blkg_stat_add_aux(&to->idle_time, &from->idle_time); |
755 | blkg_stat_merge(&to->empty_time, &from->empty_time); | 736 | blkg_stat_add_aux(&to->empty_time, &from->empty_time); |
756 | #endif | 737 | #endif |
757 | } | 738 | } |
758 | 739 | ||
759 | /* | 740 | /* |
760 | * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' | 741 | * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' |
761 | * recursive stats can still account for the amount used by this cfqg after | 742 | * recursive stats can still account for the amount used by this cfqg after |
762 | * it's gone. | 743 | * it's gone. |
763 | */ | 744 | */ |
@@ -770,10 +751,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) | |||
770 | if (unlikely(!parent)) | 751 | if (unlikely(!parent)) |
771 | return; | 752 | return; |
772 | 753 | ||
773 | cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); | 754 | cfqg_stats_add_aux(&parent->stats, &cfqg->stats); |
774 | cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); | ||
775 | cfqg_stats_reset(&cfqg->stats); | 755 | cfqg_stats_reset(&cfqg->stats); |
776 | cfqg_stats_reset(&cfqg->dead_stats); | ||
777 | } | 756 | } |
778 | 757 | ||
779 | #else /* CONFIG_CFQ_GROUP_IOSCHED */ | 758 | #else /* CONFIG_CFQ_GROUP_IOSCHED */ |
@@ -795,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, | |||
795 | unsigned long time, unsigned long unaccounted_time) { } | 774 | unsigned long time, unsigned long unaccounted_time) { } |
796 | static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } | 775 | static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } |
797 | static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } | 776 | static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } |
798 | static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, | ||
799 | uint64_t bytes, int rw) { } | ||
800 | static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, | 777 | static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, |
801 | uint64_t start_time, uint64_t io_start_time, int rw) { } | 778 | uint64_t start_time, uint64_t io_start_time, int rw) { } |
802 | 779 | ||
@@ -883,8 +860,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, | |||
883 | 860 | ||
884 | static void cfq_dispatch_insert(struct request_queue *, struct request *); | 861 | static void cfq_dispatch_insert(struct request_queue *, struct request *); |
885 | static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, | 862 | static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, |
886 | struct cfq_io_cq *cic, struct bio *bio, | 863 | struct cfq_io_cq *cic, struct bio *bio); |
887 | gfp_t gfp_mask); | ||
888 | 864 | ||
889 | static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) | 865 | static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) |
890 | { | 866 | { |
@@ -1546,130 +1522,171 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) | |||
1546 | } | 1522 | } |
1547 | 1523 | ||
1548 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 1524 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
1549 | static void cfqg_stats_init(struct cfqg_stats *stats) | 1525 | static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, |
1526 | bool on_dfl, bool reset_dev, bool is_leaf_weight); | ||
1527 | |||
1528 | static void cfqg_stats_exit(struct cfqg_stats *stats) | ||
1550 | { | 1529 | { |
1551 | blkg_rwstat_init(&stats->service_bytes); | 1530 | blkg_rwstat_exit(&stats->merged); |
1552 | blkg_rwstat_init(&stats->serviced); | 1531 | blkg_rwstat_exit(&stats->service_time); |
1553 | blkg_rwstat_init(&stats->merged); | 1532 | blkg_rwstat_exit(&stats->wait_time); |
1554 | blkg_rwstat_init(&stats->service_time); | 1533 | blkg_rwstat_exit(&stats->queued); |
1555 | blkg_rwstat_init(&stats->wait_time); | 1534 | blkg_stat_exit(&stats->time); |
1556 | blkg_rwstat_init(&stats->queued); | 1535 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
1536 | blkg_stat_exit(&stats->unaccounted_time); | ||
1537 | blkg_stat_exit(&stats->avg_queue_size_sum); | ||
1538 | blkg_stat_exit(&stats->avg_queue_size_samples); | ||
1539 | blkg_stat_exit(&stats->dequeue); | ||
1540 | blkg_stat_exit(&stats->group_wait_time); | ||
1541 | blkg_stat_exit(&stats->idle_time); | ||
1542 | blkg_stat_exit(&stats->empty_time); | ||
1543 | #endif | ||
1544 | } | ||
1557 | 1545 | ||
1558 | blkg_stat_init(&stats->sectors); | 1546 | static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) |
1559 | blkg_stat_init(&stats->time); | 1547 | { |
1548 | if (blkg_rwstat_init(&stats->merged, gfp) || | ||
1549 | blkg_rwstat_init(&stats->service_time, gfp) || | ||
1550 | blkg_rwstat_init(&stats->wait_time, gfp) || | ||
1551 | blkg_rwstat_init(&stats->queued, gfp) || | ||
1552 | blkg_stat_init(&stats->time, gfp)) | ||
1553 | goto err; | ||
1560 | 1554 | ||
1561 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 1555 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
1562 | blkg_stat_init(&stats->unaccounted_time); | 1556 | if (blkg_stat_init(&stats->unaccounted_time, gfp) || |
1563 | blkg_stat_init(&stats->avg_queue_size_sum); | 1557 | blkg_stat_init(&stats->avg_queue_size_sum, gfp) || |
1564 | blkg_stat_init(&stats->avg_queue_size_samples); | 1558 | blkg_stat_init(&stats->avg_queue_size_samples, gfp) || |
1565 | blkg_stat_init(&stats->dequeue); | 1559 | blkg_stat_init(&stats->dequeue, gfp) || |
1566 | blkg_stat_init(&stats->group_wait_time); | 1560 | blkg_stat_init(&stats->group_wait_time, gfp) || |
1567 | blkg_stat_init(&stats->idle_time); | 1561 | blkg_stat_init(&stats->idle_time, gfp) || |
1568 | blkg_stat_init(&stats->empty_time); | 1562 | blkg_stat_init(&stats->empty_time, gfp)) |
1563 | goto err; | ||
1569 | #endif | 1564 | #endif |
1565 | return 0; | ||
1566 | err: | ||
1567 | cfqg_stats_exit(stats); | ||
1568 | return -ENOMEM; | ||
1570 | } | 1569 | } |
1571 | 1570 | ||
1572 | static void cfq_cpd_init(const struct blkcg *blkcg) | 1571 | static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) |
1573 | { | 1572 | { |
1574 | struct cfq_group_data *cgd = | 1573 | struct cfq_group_data *cgd; |
1575 | cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]); | ||
1576 | 1574 | ||
1577 | if (blkcg == &blkcg_root) { | 1575 | cgd = kzalloc(sizeof(*cgd), GFP_KERNEL); |
1578 | cgd->weight = 2 * CFQ_WEIGHT_DEFAULT; | 1576 | if (!cgd) |
1579 | cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; | 1577 | return NULL; |
1580 | } else { | 1578 | return &cgd->cpd; |
1581 | cgd->weight = CFQ_WEIGHT_DEFAULT; | 1579 | } |
1582 | cgd->leaf_weight = CFQ_WEIGHT_DEFAULT; | 1580 | |
1583 | } | 1581 | static void cfq_cpd_init(struct blkcg_policy_data *cpd) |
1582 | { | ||
1583 | struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); | ||
1584 | unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ? | ||
1585 | CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; | ||
1586 | |||
1587 | if (cpd_to_blkcg(cpd) == &blkcg_root) | ||
1588 | weight *= 2; | ||
1589 | |||
1590 | cgd->weight = weight; | ||
1591 | cgd->leaf_weight = weight; | ||
1584 | } | 1592 | } |
1585 | 1593 | ||
1586 | static void cfq_pd_init(struct blkcg_gq *blkg) | 1594 | static void cfq_cpd_free(struct blkcg_policy_data *cpd) |
1587 | { | 1595 | { |
1588 | struct cfq_group *cfqg = blkg_to_cfqg(blkg); | 1596 | kfree(cpd_to_cfqgd(cpd)); |
1589 | struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg); | 1597 | } |
1598 | |||
1599 | static void cfq_cpd_bind(struct blkcg_policy_data *cpd) | ||
1600 | { | ||
1601 | struct blkcg *blkcg = cpd_to_blkcg(cpd); | ||
1602 | bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup); | ||
1603 | unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; | ||
1604 | |||
1605 | if (blkcg == &blkcg_root) | ||
1606 | weight *= 2; | ||
1607 | |||
1608 | WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); | ||
1609 | WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); | ||
1610 | } | ||
1611 | |||
1612 | static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) | ||
1613 | { | ||
1614 | struct cfq_group *cfqg; | ||
1615 | |||
1616 | cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); | ||
1617 | if (!cfqg) | ||
1618 | return NULL; | ||
1590 | 1619 | ||
1591 | cfq_init_cfqg_base(cfqg); | 1620 | cfq_init_cfqg_base(cfqg); |
1621 | if (cfqg_stats_init(&cfqg->stats, gfp)) { | ||
1622 | kfree(cfqg); | ||
1623 | return NULL; | ||
1624 | } | ||
1625 | |||
1626 | return &cfqg->pd; | ||
1627 | } | ||
1628 | |||
1629 | static void cfq_pd_init(struct blkg_policy_data *pd) | ||
1630 | { | ||
1631 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1632 | struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); | ||
1633 | |||
1592 | cfqg->weight = cgd->weight; | 1634 | cfqg->weight = cgd->weight; |
1593 | cfqg->leaf_weight = cgd->leaf_weight; | 1635 | cfqg->leaf_weight = cgd->leaf_weight; |
1594 | cfqg_stats_init(&cfqg->stats); | ||
1595 | cfqg_stats_init(&cfqg->dead_stats); | ||
1596 | } | 1636 | } |
1597 | 1637 | ||
1598 | static void cfq_pd_offline(struct blkcg_gq *blkg) | 1638 | static void cfq_pd_offline(struct blkg_policy_data *pd) |
1599 | { | 1639 | { |
1640 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1641 | int i; | ||
1642 | |||
1643 | for (i = 0; i < IOPRIO_BE_NR; i++) { | ||
1644 | if (cfqg->async_cfqq[0][i]) | ||
1645 | cfq_put_queue(cfqg->async_cfqq[0][i]); | ||
1646 | if (cfqg->async_cfqq[1][i]) | ||
1647 | cfq_put_queue(cfqg->async_cfqq[1][i]); | ||
1648 | } | ||
1649 | |||
1650 | if (cfqg->async_idle_cfqq) | ||
1651 | cfq_put_queue(cfqg->async_idle_cfqq); | ||
1652 | |||
1600 | /* | 1653 | /* |
1601 | * @blkg is going offline and will be ignored by | 1654 | * @blkg is going offline and will be ignored by |
1602 | * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so | 1655 | * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so |
1603 | * that they don't get lost. If IOs complete after this point, the | 1656 | * that they don't get lost. If IOs complete after this point, the |
1604 | * stats for them will be lost. Oh well... | 1657 | * stats for them will be lost. Oh well... |
1605 | */ | 1658 | */ |
1606 | cfqg_stats_xfer_dead(blkg_to_cfqg(blkg)); | 1659 | cfqg_stats_xfer_dead(cfqg); |
1607 | } | 1660 | } |
1608 | 1661 | ||
1609 | /* offset delta from cfqg->stats to cfqg->dead_stats */ | 1662 | static void cfq_pd_free(struct blkg_policy_data *pd) |
1610 | static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - | ||
1611 | offsetof(struct cfq_group, stats); | ||
1612 | |||
1613 | /* to be used by recursive prfill, sums live and dead stats recursively */ | ||
1614 | static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) | ||
1615 | { | 1663 | { |
1616 | u64 sum = 0; | 1664 | struct cfq_group *cfqg = pd_to_cfqg(pd); |
1617 | |||
1618 | sum += blkg_stat_recursive_sum(pd, off); | ||
1619 | sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); | ||
1620 | return sum; | ||
1621 | } | ||
1622 | |||
1623 | /* to be used by recursive prfill, sums live and dead rwstats recursively */ | ||
1624 | static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, | ||
1625 | int off) | ||
1626 | { | ||
1627 | struct blkg_rwstat a, b; | ||
1628 | 1665 | ||
1629 | a = blkg_rwstat_recursive_sum(pd, off); | 1666 | cfqg_stats_exit(&cfqg->stats); |
1630 | b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); | 1667 | return kfree(cfqg); |
1631 | blkg_rwstat_merge(&a, &b); | ||
1632 | return a; | ||
1633 | } | 1668 | } |
1634 | 1669 | ||
1635 | static void cfq_pd_reset_stats(struct blkcg_gq *blkg) | 1670 | static void cfq_pd_reset_stats(struct blkg_policy_data *pd) |
1636 | { | 1671 | { |
1637 | struct cfq_group *cfqg = blkg_to_cfqg(blkg); | 1672 | struct cfq_group *cfqg = pd_to_cfqg(pd); |
1638 | 1673 | ||
1639 | cfqg_stats_reset(&cfqg->stats); | 1674 | cfqg_stats_reset(&cfqg->stats); |
1640 | cfqg_stats_reset(&cfqg->dead_stats); | ||
1641 | } | 1675 | } |
1642 | 1676 | ||
1643 | /* | 1677 | static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, |
1644 | * Search for the cfq group current task belongs to. request_queue lock must | 1678 | struct blkcg *blkcg) |
1645 | * be held. | ||
1646 | */ | ||
1647 | static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, | ||
1648 | struct blkcg *blkcg) | ||
1649 | { | 1679 | { |
1650 | struct request_queue *q = cfqd->queue; | 1680 | struct blkcg_gq *blkg; |
1651 | struct cfq_group *cfqg = NULL; | ||
1652 | |||
1653 | /* avoid lookup for the common case where there's no blkcg */ | ||
1654 | if (blkcg == &blkcg_root) { | ||
1655 | cfqg = cfqd->root_group; | ||
1656 | } else { | ||
1657 | struct blkcg_gq *blkg; | ||
1658 | |||
1659 | blkg = blkg_lookup_create(blkcg, q); | ||
1660 | if (!IS_ERR(blkg)) | ||
1661 | cfqg = blkg_to_cfqg(blkg); | ||
1662 | } | ||
1663 | 1681 | ||
1664 | return cfqg; | 1682 | blkg = blkg_lookup(blkcg, cfqd->queue); |
1683 | if (likely(blkg)) | ||
1684 | return blkg_to_cfqg(blkg); | ||
1685 | return NULL; | ||
1665 | } | 1686 | } |
1666 | 1687 | ||
1667 | static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) | 1688 | static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) |
1668 | { | 1689 | { |
1669 | /* Currently, all async queues are mapped to root group */ | ||
1670 | if (!cfq_cfqq_sync(cfqq)) | ||
1671 | cfqg = cfqq->cfqd->root_group; | ||
1672 | |||
1673 | cfqq->cfqg = cfqg; | 1690 | cfqq->cfqg = cfqg; |
1674 | /* cfqq reference on cfqg */ | 1691 | /* cfqq reference on cfqg */ |
1675 | cfqg_get(cfqg); | 1692 | cfqg_get(cfqg); |
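
Taken together, the cfq hunks above move the policy onto the new per-policy-data lifecycle: allocate in pd_alloc_fn() (where the gfp/node arguments allow sleeping allocations such as the per-cpu rwstats), initialize in pd_init_fn() once the blkg is linked, drain async queues and transfer stats in pd_offline_fn(), and release in pd_free_fn(). A minimal skeleton wired up this way is sketched below, using only the callback signatures visible in these hunks; the "foo_" names and the standalone struct are illustrative, not code from this series.

	/* Illustrative skeleton of the pd_alloc/pd_init/pd_free lifecycle used above. */
	struct foo_group {
		struct blkg_policy_data pd;	/* must be the first member */
		/* ... policy-private state ... */
	};

	static struct blkg_policy_data *foo_pd_alloc(gfp_t gfp, int node)
	{
		struct foo_group *fg = kzalloc_node(sizeof(*fg), gfp, node);

		return fg ? &fg->pd : NULL;	/* may sleep; per-cpu stats could be set up here */
	}

	static void foo_pd_init(struct blkg_policy_data *pd)
	{
		/* runs with the blkg linked; fill in defaults from pd->blkg and its blkcg */
	}

	static void foo_pd_free(struct blkg_policy_data *pd)
	{
		kfree(container_of(pd, struct foo_group, pd));
	}

	static struct blkcg_policy blkcg_policy_foo = {
		.pd_alloc_fn	= foo_pd_alloc,
		.pd_init_fn	= foo_pd_init,
		.pd_free_fn	= foo_pd_free,
	};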
@@ -1739,36 +1756,48 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v) | |||
1739 | 1756 | ||
1740 | static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, | 1757 | static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, |
1741 | char *buf, size_t nbytes, loff_t off, | 1758 | char *buf, size_t nbytes, loff_t off, |
1742 | bool is_leaf_weight) | 1759 | bool on_dfl, bool is_leaf_weight) |
1743 | { | 1760 | { |
1761 | unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; | ||
1762 | unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; | ||
1744 | struct blkcg *blkcg = css_to_blkcg(of_css(of)); | 1763 | struct blkcg *blkcg = css_to_blkcg(of_css(of)); |
1745 | struct blkg_conf_ctx ctx; | 1764 | struct blkg_conf_ctx ctx; |
1746 | struct cfq_group *cfqg; | 1765 | struct cfq_group *cfqg; |
1747 | struct cfq_group_data *cfqgd; | 1766 | struct cfq_group_data *cfqgd; |
1748 | int ret; | 1767 | int ret; |
1768 | u64 v; | ||
1749 | 1769 | ||
1750 | ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); | 1770 | ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); |
1751 | if (ret) | 1771 | if (ret) |
1752 | return ret; | 1772 | return ret; |
1753 | 1773 | ||
1754 | ret = -EINVAL; | 1774 | if (sscanf(ctx.body, "%llu", &v) == 1) { |
1775 | /* require "default" on dfl */ | ||
1776 | ret = -ERANGE; | ||
1777 | if (!v && on_dfl) | ||
1778 | goto out_finish; | ||
1779 | } else if (!strcmp(strim(ctx.body), "default")) { | ||
1780 | v = 0; | ||
1781 | } else { | ||
1782 | ret = -EINVAL; | ||
1783 | goto out_finish; | ||
1784 | } | ||
1785 | |||
1755 | cfqg = blkg_to_cfqg(ctx.blkg); | 1786 | cfqg = blkg_to_cfqg(ctx.blkg); |
1756 | cfqgd = blkcg_to_cfqgd(blkcg); | 1787 | cfqgd = blkcg_to_cfqgd(blkcg); |
1757 | if (!cfqg || !cfqgd) | ||
1758 | goto err; | ||
1759 | 1788 | ||
1760 | if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { | 1789 | ret = -ERANGE; |
1790 | if (!v || (v >= min && v <= max)) { | ||
1761 | if (!is_leaf_weight) { | 1791 | if (!is_leaf_weight) { |
1762 | cfqg->dev_weight = ctx.v; | 1792 | cfqg->dev_weight = v; |
1763 | cfqg->new_weight = ctx.v ?: cfqgd->weight; | 1793 | cfqg->new_weight = v ?: cfqgd->weight; |
1764 | } else { | 1794 | } else { |
1765 | cfqg->dev_leaf_weight = ctx.v; | 1795 | cfqg->dev_leaf_weight = v; |
1766 | cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight; | 1796 | cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; |
1767 | } | 1797 | } |
1768 | ret = 0; | 1798 | ret = 0; |
1769 | } | 1799 | } |
1770 | 1800 | out_finish: | |
1771 | err: | ||
1772 | blkg_conf_finish(&ctx); | 1801 | blkg_conf_finish(&ctx); |
1773 | return ret ?: nbytes; | 1802 | return ret ?: nbytes; |
1774 | } | 1803 | } |
@@ -1776,25 +1805,27 @@ err: | |||
1776 | static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, | 1805 | static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, |
1777 | char *buf, size_t nbytes, loff_t off) | 1806 | char *buf, size_t nbytes, loff_t off) |
1778 | { | 1807 | { |
1779 | return __cfqg_set_weight_device(of, buf, nbytes, off, false); | 1808 | return __cfqg_set_weight_device(of, buf, nbytes, off, false, false); |
1780 | } | 1809 | } |
1781 | 1810 | ||
1782 | static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, | 1811 | static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, |
1783 | char *buf, size_t nbytes, loff_t off) | 1812 | char *buf, size_t nbytes, loff_t off) |
1784 | { | 1813 | { |
1785 | return __cfqg_set_weight_device(of, buf, nbytes, off, true); | 1814 | return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); |
1786 | } | 1815 | } |
1787 | 1816 | ||
1788 | static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, | 1817 | static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, |
1789 | u64 val, bool is_leaf_weight) | 1818 | bool on_dfl, bool reset_dev, bool is_leaf_weight) |
1790 | { | 1819 | { |
1820 | unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; | ||
1821 | unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; | ||
1791 | struct blkcg *blkcg = css_to_blkcg(css); | 1822 | struct blkcg *blkcg = css_to_blkcg(css); |
1792 | struct blkcg_gq *blkg; | 1823 | struct blkcg_gq *blkg; |
1793 | struct cfq_group_data *cfqgd; | 1824 | struct cfq_group_data *cfqgd; |
1794 | int ret = 0; | 1825 | int ret = 0; |
1795 | 1826 | ||
1796 | if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) | 1827 | if (val < min || val > max) |
1797 | return -EINVAL; | 1828 | return -ERANGE; |
1798 | 1829 | ||
1799 | spin_lock_irq(&blkcg->lock); | 1830 | spin_lock_irq(&blkcg->lock); |
1800 | cfqgd = blkcg_to_cfqgd(blkcg); | 1831 | cfqgd = blkcg_to_cfqgd(blkcg); |
@@ -1815,9 +1846,13 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, | |||
1815 | continue; | 1846 | continue; |
1816 | 1847 | ||
1817 | if (!is_leaf_weight) { | 1848 | if (!is_leaf_weight) { |
1849 | if (reset_dev) | ||
1850 | cfqg->dev_weight = 0; | ||
1818 | if (!cfqg->dev_weight) | 1851 | if (!cfqg->dev_weight) |
1819 | cfqg->new_weight = cfqgd->weight; | 1852 | cfqg->new_weight = cfqgd->weight; |
1820 | } else { | 1853 | } else { |
1854 | if (reset_dev) | ||
1855 | cfqg->dev_leaf_weight = 0; | ||
1821 | if (!cfqg->dev_leaf_weight) | 1856 | if (!cfqg->dev_leaf_weight) |
1822 | cfqg->new_leaf_weight = cfqgd->leaf_weight; | 1857 | cfqg->new_leaf_weight = cfqgd->leaf_weight; |
1823 | } | 1858 | } |
@@ -1831,13 +1866,13 @@ out: | |||
1831 | static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, | 1866 | static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, |
1832 | u64 val) | 1867 | u64 val) |
1833 | { | 1868 | { |
1834 | return __cfq_set_weight(css, cft, val, false); | 1869 | return __cfq_set_weight(css, val, false, false, false); |
1835 | } | 1870 | } |
1836 | 1871 | ||
1837 | static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, | 1872 | static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, |
1838 | struct cftype *cft, u64 val) | 1873 | struct cftype *cft, u64 val) |
1839 | { | 1874 | { |
1840 | return __cfq_set_weight(css, cft, val, true); | 1875 | return __cfq_set_weight(css, val, false, false, true); |
1841 | } | 1876 | } |
1842 | 1877 | ||
1843 | static int cfqg_print_stat(struct seq_file *sf, void *v) | 1878 | static int cfqg_print_stat(struct seq_file *sf, void *v) |
@@ -1857,16 +1892,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v) | |||
1857 | static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, | 1892 | static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, |
1858 | struct blkg_policy_data *pd, int off) | 1893 | struct blkg_policy_data *pd, int off) |
1859 | { | 1894 | { |
1860 | u64 sum = cfqg_stat_pd_recursive_sum(pd, off); | 1895 | u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), |
1861 | 1896 | &blkcg_policy_cfq, off); | |
1862 | return __blkg_prfill_u64(sf, pd, sum); | 1897 | return __blkg_prfill_u64(sf, pd, sum); |
1863 | } | 1898 | } |
1864 | 1899 | ||
1865 | static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, | 1900 | static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, |
1866 | struct blkg_policy_data *pd, int off) | 1901 | struct blkg_policy_data *pd, int off) |
1867 | { | 1902 | { |
1868 | struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); | 1903 | struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), |
1869 | 1904 | &blkcg_policy_cfq, off); | |
1870 | return __blkg_prfill_rwstat(sf, pd, &sum); | 1905 | return __blkg_prfill_rwstat(sf, pd, &sum); |
1871 | } | 1906 | } |
1872 | 1907 | ||
@@ -1886,6 +1921,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) | |||
1886 | return 0; | 1921 | return 0; |
1887 | } | 1922 | } |
1888 | 1923 | ||
1924 | static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, | ||
1925 | int off) | ||
1926 | { | ||
1927 | u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); | ||
1928 | |||
1929 | return __blkg_prfill_u64(sf, pd, sum >> 9); | ||
1930 | } | ||
1931 | |||
1932 | static int cfqg_print_stat_sectors(struct seq_file *sf, void *v) | ||
1933 | { | ||
1934 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1935 | cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false); | ||
1936 | return 0; | ||
1937 | } | ||
1938 | |||
1939 | static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf, | ||
1940 | struct blkg_policy_data *pd, int off) | ||
1941 | { | ||
1942 | struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, | ||
1943 | offsetof(struct blkcg_gq, stat_bytes)); | ||
1944 | u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + | ||
1945 | atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); | ||
1946 | |||
1947 | return __blkg_prfill_u64(sf, pd, sum >> 9); | ||
1948 | } | ||
1949 | |||
1950 | static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) | ||
1951 | { | ||
1952 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1953 | cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0, | ||
1954 | false); | ||
1955 | return 0; | ||
1956 | } | ||
1957 | |||
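Both new helpers derive the "sectors" numbers from the shared stat_bytes counters instead of a CFQ-private field, so the only arithmetic left is the shift by 9 that converts bytes into 512-byte sectors. A quick worked example of that conversion (illustrative only, not part of the patch):

        u64 bytes   = 1 << 20;          /* 1 MiB of completed IO */
        u64 sectors = bytes >> 9;       /* 2048 sectors of 512 bytes each */
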
1889 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 1958 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
1890 | static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, | 1959 | static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, |
1891 | struct blkg_policy_data *pd, int off) | 1960 | struct blkg_policy_data *pd, int off) |
@@ -1912,7 +1981,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) | |||
1912 | } | 1981 | } |
1913 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | 1982 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ |
1914 | 1983 | ||
1915 | static struct cftype cfq_blkcg_files[] = { | 1984 | static struct cftype cfq_blkcg_legacy_files[] = { |
1916 | /* on root, weight is mapped to leaf_weight */ | 1985 | /* on root, weight is mapped to leaf_weight */ |
1917 | { | 1986 | { |
1918 | .name = "weight_device", | 1987 | .name = "weight_device", |
@@ -1960,18 +2029,17 @@ static struct cftype cfq_blkcg_files[] = { | |||
1960 | }, | 2029 | }, |
1961 | { | 2030 | { |
1962 | .name = "sectors", | 2031 | .name = "sectors", |
1963 | .private = offsetof(struct cfq_group, stats.sectors), | 2032 | .seq_show = cfqg_print_stat_sectors, |
1964 | .seq_show = cfqg_print_stat, | ||
1965 | }, | 2033 | }, |
1966 | { | 2034 | { |
1967 | .name = "io_service_bytes", | 2035 | .name = "io_service_bytes", |
1968 | .private = offsetof(struct cfq_group, stats.service_bytes), | 2036 | .private = (unsigned long)&blkcg_policy_cfq, |
1969 | .seq_show = cfqg_print_rwstat, | 2037 | .seq_show = blkg_print_stat_bytes, |
1970 | }, | 2038 | }, |
1971 | { | 2039 | { |
1972 | .name = "io_serviced", | 2040 | .name = "io_serviced", |
1973 | .private = offsetof(struct cfq_group, stats.serviced), | 2041 | .private = (unsigned long)&blkcg_policy_cfq, |
1974 | .seq_show = cfqg_print_rwstat, | 2042 | .seq_show = blkg_print_stat_ios, |
1975 | }, | 2043 | }, |
1976 | { | 2044 | { |
1977 | .name = "io_service_time", | 2045 | .name = "io_service_time", |
@@ -2002,18 +2070,17 @@ static struct cftype cfq_blkcg_files[] = { | |||
2002 | }, | 2070 | }, |
2003 | { | 2071 | { |
2004 | .name = "sectors_recursive", | 2072 | .name = "sectors_recursive", |
2005 | .private = offsetof(struct cfq_group, stats.sectors), | 2073 | .seq_show = cfqg_print_stat_sectors_recursive, |
2006 | .seq_show = cfqg_print_stat_recursive, | ||
2007 | }, | 2074 | }, |
2008 | { | 2075 | { |
2009 | .name = "io_service_bytes_recursive", | 2076 | .name = "io_service_bytes_recursive", |
2010 | .private = offsetof(struct cfq_group, stats.service_bytes), | 2077 | .private = (unsigned long)&blkcg_policy_cfq, |
2011 | .seq_show = cfqg_print_rwstat_recursive, | 2078 | .seq_show = blkg_print_stat_bytes_recursive, |
2012 | }, | 2079 | }, |
2013 | { | 2080 | { |
2014 | .name = "io_serviced_recursive", | 2081 | .name = "io_serviced_recursive", |
2015 | .private = offsetof(struct cfq_group, stats.serviced), | 2082 | .private = (unsigned long)&blkcg_policy_cfq, |
2016 | .seq_show = cfqg_print_rwstat_recursive, | 2083 | .seq_show = blkg_print_stat_ios_recursive, |
2017 | }, | 2084 | }, |
2018 | { | 2085 | { |
2019 | .name = "io_service_time_recursive", | 2086 | .name = "io_service_time_recursive", |
@@ -2068,9 +2135,51 @@ static struct cftype cfq_blkcg_files[] = { | |||
2068 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | 2135 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ |
2069 | { } /* terminate */ | 2136 | { } /* terminate */ |
2070 | }; | 2137 | }; |
2138 | |||
2139 | static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) | ||
2140 | { | ||
2141 | struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); | ||
2142 | struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); | ||
2143 | |||
2144 | seq_printf(sf, "default %u\n", cgd->weight); | ||
2145 | blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, | ||
2146 | &blkcg_policy_cfq, 0, false); | ||
2147 | return 0; | ||
2148 | } | ||
2149 | |||
2150 | static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, | ||
2151 | char *buf, size_t nbytes, loff_t off) | ||
2152 | { | ||
2153 | char *endp; | ||
2154 | int ret; | ||
2155 | u64 v; | ||
2156 | |||
2157 | buf = strim(buf); | ||
2158 | |||
2159 | /* "WEIGHT" or "default WEIGHT" sets the default weight */ | ||
2160 | v = simple_strtoull(buf, &endp, 0); | ||
2161 | if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { | ||
2162 | ret = __cfq_set_weight(of_css(of), v, true, false, false); | ||
2163 | return ret ?: nbytes; | ||
2164 | } | ||
2165 | |||
2166 | /* "MAJ:MIN WEIGHT" */ | ||
2167 | return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); | ||
2168 | } | ||
2169 | |||
2170 | static struct cftype cfq_blkcg_files[] = { | ||
2171 | { | ||
2172 | .name = "weight", | ||
2173 | .flags = CFTYPE_NOT_ON_ROOT, | ||
2174 | .seq_show = cfq_print_weight_on_dfl, | ||
2175 | .write = cfq_set_weight_on_dfl, | ||
2176 | }, | ||
2177 | { } /* terminate */ | ||
2178 | }; | ||
2179 | |||
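For reference, cfq_set_weight_on_dfl() above accepts three input forms on the unified hierarchy's weight file: a bare weight, "default WEIGHT", and "MAJ:MIN WEIGHT". The small user-space sketch below mirrors that parsing decision to make the accepted grammar concrete; it is illustrative only and not taken from the kernel tree.

        #include <stdio.h>
        #include <stdlib.h>

        /* Mirror of the kernel's decision: default weight vs. per-device weight. */
        static void classify(const char *buf)
        {
                char *endp;
                unsigned long long v;
                unsigned int maj, mnr;

                v = strtoull(buf, &endp, 0);
                if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
                        printf("set default weight to %llu\n", v);
                        return;
                }
                if (sscanf(buf, "%u:%u %llu", &maj, &mnr, &v) == 3)
                        printf("set weight of device %u:%u to %llu\n", maj, mnr, v);
                else
                        printf("rejected: %s\n", buf);
        }

        int main(void)
        {
                classify("100");                /* bare default weight */
                classify("default 200");        /* explicit default weight */
                classify("8:16 500");           /* per-device weight */
                return 0;
        }
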
2071 | #else /* GROUP_IOSCHED */ | 2180 | #else /* GROUP_IOSCHED */ |
2072 | static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, | 2181 | static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, |
2073 | struct blkcg *blkcg) | 2182 | struct blkcg *blkcg) |
2074 | { | 2183 | { |
2075 | return cfqd->root_group; | 2184 | return cfqd->root_group; |
2076 | } | 2185 | } |
@@ -2873,7 +2982,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) | |||
2873 | 2982 | ||
2874 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; | 2983 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; |
2875 | cfqq->nr_sectors += blk_rq_sectors(rq); | 2984 | cfqq->nr_sectors += blk_rq_sectors(rq); |
2876 | cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); | ||
2877 | } | 2985 | } |
2878 | 2986 | ||
2879 | /* | 2987 | /* |
@@ -3506,14 +3614,14 @@ static void cfq_exit_icq(struct io_cq *icq) | |||
3506 | struct cfq_io_cq *cic = icq_to_cic(icq); | 3614 | struct cfq_io_cq *cic = icq_to_cic(icq); |
3507 | struct cfq_data *cfqd = cic_to_cfqd(cic); | 3615 | struct cfq_data *cfqd = cic_to_cfqd(cic); |
3508 | 3616 | ||
3509 | if (cic->cfqq[BLK_RW_ASYNC]) { | 3617 | if (cic_to_cfqq(cic, false)) { |
3510 | cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); | 3618 | cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false)); |
3511 | cic->cfqq[BLK_RW_ASYNC] = NULL; | 3619 | cic_set_cfqq(cic, NULL, false); |
3512 | } | 3620 | } |
3513 | 3621 | ||
3514 | if (cic->cfqq[BLK_RW_SYNC]) { | 3622 | if (cic_to_cfqq(cic, true)) { |
3515 | cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); | 3623 | cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true)); |
3516 | cic->cfqq[BLK_RW_SYNC] = NULL; | 3624 | cic_set_cfqq(cic, NULL, true); |
3517 | } | 3625 | } |
3518 | } | 3626 | } |
3519 | 3627 | ||
@@ -3572,18 +3680,14 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) | |||
3572 | if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) | 3680 | if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) |
3573 | return; | 3681 | return; |
3574 | 3682 | ||
3575 | cfqq = cic->cfqq[BLK_RW_ASYNC]; | 3683 | cfqq = cic_to_cfqq(cic, false); |
3576 | if (cfqq) { | 3684 | if (cfqq) { |
3577 | struct cfq_queue *new_cfqq; | 3685 | cfq_put_queue(cfqq); |
3578 | new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, | 3686 | cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio); |
3579 | GFP_ATOMIC); | 3687 | cic_set_cfqq(cic, cfqq, false); |
3580 | if (new_cfqq) { | ||
3581 | cic->cfqq[BLK_RW_ASYNC] = new_cfqq; | ||
3582 | cfq_put_queue(cfqq); | ||
3583 | } | ||
3584 | } | 3688 | } |
3585 | 3689 | ||
3586 | cfqq = cic->cfqq[BLK_RW_SYNC]; | 3690 | cfqq = cic_to_cfqq(cic, true); |
3587 | if (cfqq) | 3691 | if (cfqq) |
3588 | cfq_mark_cfqq_prio_changed(cfqq); | 3692 | cfq_mark_cfqq_prio_changed(cfqq); |
3589 | 3693 | ||
@@ -3614,7 +3718,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
3614 | static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) | 3718 | static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) |
3615 | { | 3719 | { |
3616 | struct cfq_data *cfqd = cic_to_cfqd(cic); | 3720 | struct cfq_data *cfqd = cic_to_cfqd(cic); |
3617 | struct cfq_queue *sync_cfqq; | 3721 | struct cfq_queue *cfqq; |
3618 | uint64_t serial_nr; | 3722 | uint64_t serial_nr; |
3619 | 3723 | ||
3620 | rcu_read_lock(); | 3724 | rcu_read_lock(); |
@@ -3628,15 +3732,22 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) | |||
3628 | if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) | 3732 | if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) |
3629 | return; | 3733 | return; |
3630 | 3734 | ||
3631 | sync_cfqq = cic_to_cfqq(cic, 1); | 3735 | /* |
3632 | if (sync_cfqq) { | 3736 | * Drop reference to queues. New queues will be assigned in new |
3633 | /* | 3737 | * group upon arrival of fresh requests. |
3634 | * Drop reference to sync queue. A new sync queue will be | 3738 | */ |
3635 | * assigned in new group upon arrival of a fresh request. | 3739 | cfqq = cic_to_cfqq(cic, false); |
3636 | */ | 3740 | if (cfqq) { |
3637 | cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); | 3741 | cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); |
3638 | cic_set_cfqq(cic, NULL, 1); | 3742 | cic_set_cfqq(cic, NULL, false); |
3639 | cfq_put_queue(sync_cfqq); | 3743 | cfq_put_queue(cfqq); |
3744 | } | ||
3745 | |||
3746 | cfqq = cic_to_cfqq(cic, true); | ||
3747 | if (cfqq) { | ||
3748 | cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); | ||
3749 | cic_set_cfqq(cic, NULL, true); | ||
3750 | cfq_put_queue(cfqq); | ||
3640 | } | 3751 | } |
3641 | 3752 | ||
3642 | cic->blkcg_serial_nr = serial_nr; | 3753 | cic->blkcg_serial_nr = serial_nr; |
@@ -3645,81 +3756,19 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) | |||
3645 | static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } | 3756 | static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } |
3646 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ | 3757 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ |
3647 | 3758 | ||
3648 | static struct cfq_queue * | ||
3649 | cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, | ||
3650 | struct bio *bio, gfp_t gfp_mask) | ||
3651 | { | ||
3652 | struct blkcg *blkcg; | ||
3653 | struct cfq_queue *cfqq, *new_cfqq = NULL; | ||
3654 | struct cfq_group *cfqg; | ||
3655 | |||
3656 | retry: | ||
3657 | rcu_read_lock(); | ||
3658 | |||
3659 | blkcg = bio_blkcg(bio); | ||
3660 | cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); | ||
3661 | if (!cfqg) { | ||
3662 | cfqq = &cfqd->oom_cfqq; | ||
3663 | goto out; | ||
3664 | } | ||
3665 | |||
3666 | cfqq = cic_to_cfqq(cic, is_sync); | ||
3667 | |||
3668 | /* | ||
3669 | * Always try a new alloc if we fell back to the OOM cfqq | ||
3670 | * originally, since it should just be a temporary situation. | ||
3671 | */ | ||
3672 | if (!cfqq || cfqq == &cfqd->oom_cfqq) { | ||
3673 | cfqq = NULL; | ||
3674 | if (new_cfqq) { | ||
3675 | cfqq = new_cfqq; | ||
3676 | new_cfqq = NULL; | ||
3677 | } else if (gfp_mask & __GFP_WAIT) { | ||
3678 | rcu_read_unlock(); | ||
3679 | spin_unlock_irq(cfqd->queue->queue_lock); | ||
3680 | new_cfqq = kmem_cache_alloc_node(cfq_pool, | ||
3681 | gfp_mask | __GFP_ZERO, | ||
3682 | cfqd->queue->node); | ||
3683 | spin_lock_irq(cfqd->queue->queue_lock); | ||
3684 | if (new_cfqq) | ||
3685 | goto retry; | ||
3686 | else | ||
3687 | return &cfqd->oom_cfqq; | ||
3688 | } else { | ||
3689 | cfqq = kmem_cache_alloc_node(cfq_pool, | ||
3690 | gfp_mask | __GFP_ZERO, | ||
3691 | cfqd->queue->node); | ||
3692 | } | ||
3693 | |||
3694 | if (cfqq) { | ||
3695 | cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); | ||
3696 | cfq_init_prio_data(cfqq, cic); | ||
3697 | cfq_link_cfqq_cfqg(cfqq, cfqg); | ||
3698 | cfq_log_cfqq(cfqd, cfqq, "alloced"); | ||
3699 | } else | ||
3700 | cfqq = &cfqd->oom_cfqq; | ||
3701 | } | ||
3702 | out: | ||
3703 | if (new_cfqq) | ||
3704 | kmem_cache_free(cfq_pool, new_cfqq); | ||
3705 | |||
3706 | rcu_read_unlock(); | ||
3707 | return cfqq; | ||
3708 | } | ||
3709 | |||
3710 | static struct cfq_queue ** | 3759 | static struct cfq_queue ** |
3711 | cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) | 3760 | cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio) |
3712 | { | 3761 | { |
3713 | switch (ioprio_class) { | 3762 | switch (ioprio_class) { |
3714 | case IOPRIO_CLASS_RT: | 3763 | case IOPRIO_CLASS_RT: |
3715 | return &cfqd->async_cfqq[0][ioprio]; | 3764 | return &cfqg->async_cfqq[0][ioprio]; |
3716 | case IOPRIO_CLASS_NONE: | 3765 | case IOPRIO_CLASS_NONE: |
3717 | ioprio = IOPRIO_NORM; | 3766 | ioprio = IOPRIO_NORM; |
3718 | /* fall through */ | 3767 | /* fall through */ |
3719 | case IOPRIO_CLASS_BE: | 3768 | case IOPRIO_CLASS_BE: |
3720 | return &cfqd->async_cfqq[1][ioprio]; | 3769 | return &cfqg->async_cfqq[1][ioprio]; |
3721 | case IOPRIO_CLASS_IDLE: | 3770 | case IOPRIO_CLASS_IDLE: |
3722 | return &cfqd->async_idle_cfqq; | 3771 | return &cfqg->async_idle_cfqq; |
3723 | default: | 3772 | default: |
3724 | BUG(); | 3773 | BUG(); |
3725 | } | 3774 | } |
@@ -3727,12 +3776,20 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) | |||
3727 | 3776 | ||
3728 | static struct cfq_queue * | 3777 | static struct cfq_queue * |
3729 | cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, | 3778 | cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, |
3730 | struct bio *bio, gfp_t gfp_mask) | 3779 | struct bio *bio) |
3731 | { | 3780 | { |
3732 | int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); | 3781 | int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); |
3733 | int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); | 3782 | int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); |
3734 | struct cfq_queue **async_cfqq = NULL; | 3783 | struct cfq_queue **async_cfqq = NULL; |
3735 | struct cfq_queue *cfqq = NULL; | 3784 | struct cfq_queue *cfqq; |
3785 | struct cfq_group *cfqg; | ||
3786 | |||
3787 | rcu_read_lock(); | ||
3788 | cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); | ||
3789 | if (!cfqg) { | ||
3790 | cfqq = &cfqd->oom_cfqq; | ||
3791 | goto out; | ||
3792 | } | ||
3736 | 3793 | ||
3737 | if (!is_sync) { | 3794 | if (!is_sync) { |
3738 | if (!ioprio_valid(cic->ioprio)) { | 3795 | if (!ioprio_valid(cic->ioprio)) { |
@@ -3740,22 +3797,32 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, | |||
3740 | ioprio = task_nice_ioprio(tsk); | 3797 | ioprio = task_nice_ioprio(tsk); |
3741 | ioprio_class = task_nice_ioclass(tsk); | 3798 | ioprio_class = task_nice_ioclass(tsk); |
3742 | } | 3799 | } |
3743 | async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); | 3800 | async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio); |
3744 | cfqq = *async_cfqq; | 3801 | cfqq = *async_cfqq; |
3802 | if (cfqq) | ||
3803 | goto out; | ||
3745 | } | 3804 | } |
3746 | 3805 | ||
3747 | if (!cfqq) | 3806 | cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, |
3748 | cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); | 3807 | cfqd->queue->node); |
3808 | if (!cfqq) { | ||
3809 | cfqq = &cfqd->oom_cfqq; | ||
3810 | goto out; | ||
3811 | } | ||
3749 | 3812 | ||
3750 | /* | 3813 | cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); |
3751 | * pin the queue now that it's allocated, scheduler exit will prune it | 3814 | cfq_init_prio_data(cfqq, cic); |
3752 | */ | 3815 | cfq_link_cfqq_cfqg(cfqq, cfqg); |
3753 | if (!is_sync && !(*async_cfqq)) { | 3816 | cfq_log_cfqq(cfqd, cfqq, "alloced"); |
3817 | |||
3818 | if (async_cfqq) { | ||
3819 | /* a new async queue is created, pin and remember */ | ||
3754 | cfqq->ref++; | 3820 | cfqq->ref++; |
3755 | *async_cfqq = cfqq; | 3821 | *async_cfqq = cfqq; |
3756 | } | 3822 | } |
3757 | 3823 | out: | |
3758 | cfqq->ref++; | 3824 | cfqq->ref++; |
3825 | rcu_read_unlock(); | ||
3759 | return cfqq; | 3826 | return cfqq; |
3760 | } | 3827 | } |
3761 | 3828 | ||
@@ -4289,8 +4356,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, | |||
4289 | const bool is_sync = rq_is_sync(rq); | 4356 | const bool is_sync = rq_is_sync(rq); |
4290 | struct cfq_queue *cfqq; | 4357 | struct cfq_queue *cfqq; |
4291 | 4358 | ||
4292 | might_sleep_if(gfp_mask & __GFP_WAIT); | ||
4293 | |||
4294 | spin_lock_irq(q->queue_lock); | 4359 | spin_lock_irq(q->queue_lock); |
4295 | 4360 | ||
4296 | check_ioprio_changed(cic, bio); | 4361 | check_ioprio_changed(cic, bio); |
@@ -4298,7 +4363,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, | |||
4298 | new_queue: | 4363 | new_queue: |
4299 | cfqq = cic_to_cfqq(cic, is_sync); | 4364 | cfqq = cic_to_cfqq(cic, is_sync); |
4300 | if (!cfqq || cfqq == &cfqd->oom_cfqq) { | 4365 | if (!cfqq || cfqq == &cfqd->oom_cfqq) { |
4301 | cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); | 4366 | if (cfqq) |
4367 | cfq_put_queue(cfqq); | ||
4368 | cfqq = cfq_get_queue(cfqd, is_sync, cic, bio); | ||
4302 | cic_set_cfqq(cic, cfqq, is_sync); | 4369 | cic_set_cfqq(cic, cfqq, is_sync); |
4303 | } else { | 4370 | } else { |
4304 | /* | 4371 | /* |
@@ -4404,21 +4471,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) | |||
4404 | cancel_work_sync(&cfqd->unplug_work); | 4471 | cancel_work_sync(&cfqd->unplug_work); |
4405 | } | 4472 | } |
4406 | 4473 | ||
4407 | static void cfq_put_async_queues(struct cfq_data *cfqd) | ||
4408 | { | ||
4409 | int i; | ||
4410 | |||
4411 | for (i = 0; i < IOPRIO_BE_NR; i++) { | ||
4412 | if (cfqd->async_cfqq[0][i]) | ||
4413 | cfq_put_queue(cfqd->async_cfqq[0][i]); | ||
4414 | if (cfqd->async_cfqq[1][i]) | ||
4415 | cfq_put_queue(cfqd->async_cfqq[1][i]); | ||
4416 | } | ||
4417 | |||
4418 | if (cfqd->async_idle_cfqq) | ||
4419 | cfq_put_queue(cfqd->async_idle_cfqq); | ||
4420 | } | ||
4421 | |||
4422 | static void cfq_exit_queue(struct elevator_queue *e) | 4474 | static void cfq_exit_queue(struct elevator_queue *e) |
4423 | { | 4475 | { |
4424 | struct cfq_data *cfqd = e->elevator_data; | 4476 | struct cfq_data *cfqd = e->elevator_data; |
@@ -4431,8 +4483,6 @@ static void cfq_exit_queue(struct elevator_queue *e) | |||
4431 | if (cfqd->active_queue) | 4483 | if (cfqd->active_queue) |
4432 | __cfq_slice_expired(cfqd, cfqd->active_queue, 0); | 4484 | __cfq_slice_expired(cfqd, cfqd->active_queue, 0); |
4433 | 4485 | ||
4434 | cfq_put_async_queues(cfqd); | ||
4435 | |||
4436 | spin_unlock_irq(q->queue_lock); | 4486 | spin_unlock_irq(q->queue_lock); |
4437 | 4487 | ||
4438 | cfq_shutdown_timer_wq(cfqd); | 4488 | cfq_shutdown_timer_wq(cfqd); |
@@ -4486,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) | |||
4486 | goto out_free; | 4536 | goto out_free; |
4487 | 4537 | ||
4488 | cfq_init_cfqg_base(cfqd->root_group); | 4538 | cfq_init_cfqg_base(cfqd->root_group); |
4539 | cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; | ||
4540 | cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; | ||
4489 | #endif | 4541 | #endif |
4490 | cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; | ||
4491 | cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; | ||
4492 | 4542 | ||
4493 | /* | 4543 | /* |
4494 | * Not strictly needed (since RB_ROOT just clears the node and we | 4544 | * Not strictly needed (since RB_ROOT just clears the node and we |
@@ -4499,7 +4549,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) | |||
4499 | cfqd->prio_trees[i] = RB_ROOT; | 4549 | cfqd->prio_trees[i] = RB_ROOT; |
4500 | 4550 | ||
4501 | /* | 4551 | /* |
4502 | * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. | 4552 | * Our fallback cfqq if cfq_get_queue() runs into OOM issues. |
4503 | * Grab a permanent reference to it, so that the normal code flow | 4553 | * Grab a permanent reference to it, so that the normal code flow |
4504 | * will not attempt to free it. oom_cfqq is linked to root_group | 4554 | * will not attempt to free it. oom_cfqq is linked to root_group |
4505 | * but shouldn't hold a reference as it'll never be unlinked. Lose | 4555 | * but shouldn't hold a reference as it'll never be unlinked. Lose |
@@ -4683,13 +4733,18 @@ static struct elevator_type iosched_cfq = { | |||
4683 | 4733 | ||
4684 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 4734 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
4685 | static struct blkcg_policy blkcg_policy_cfq = { | 4735 | static struct blkcg_policy blkcg_policy_cfq = { |
4686 | .pd_size = sizeof(struct cfq_group), | 4736 | .dfl_cftypes = cfq_blkcg_files, |
4687 | .cpd_size = sizeof(struct cfq_group_data), | 4737 | .legacy_cftypes = cfq_blkcg_legacy_files, |
4688 | .cftypes = cfq_blkcg_files, | ||
4689 | 4738 | ||
4739 | .cpd_alloc_fn = cfq_cpd_alloc, | ||
4690 | .cpd_init_fn = cfq_cpd_init, | 4740 | .cpd_init_fn = cfq_cpd_init, |
4741 | .cpd_free_fn = cfq_cpd_free, | ||
4742 | .cpd_bind_fn = cfq_cpd_bind, | ||
4743 | |||
4744 | .pd_alloc_fn = cfq_pd_alloc, | ||
4691 | .pd_init_fn = cfq_pd_init, | 4745 | .pd_init_fn = cfq_pd_init, |
4692 | .pd_offline_fn = cfq_pd_offline, | 4746 | .pd_offline_fn = cfq_pd_offline, |
4747 | .pd_free_fn = cfq_pd_free, | ||
4693 | .pd_reset_stats_fn = cfq_pd_reset_stats, | 4748 | .pd_reset_stats_fn = cfq_pd_reset_stats, |
4694 | }; | 4749 | }; |
4695 | #endif | 4750 | #endif |
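With pd_size/cpd_size gone, a policy now supplies allocation callbacks plus two cftype arrays, one per hierarchy flavour, and registers the descriptor as before. The skeleton below shows the shape of a policy written against the new interface; the foo_* names are hypothetical and the file tables are left empty, so treat it as a sketch rather than a drop-in example.

        #include <linux/module.h>
        #include <linux/slab.h>
        #include <linux/blk-cgroup.h>

        /* per-blkg data: embed blkg_policy_data as the first member */
        struct foo_group {
                struct blkg_policy_data pd;
                u64 nr_dispatched;              /* policy-private field */
        };

        static struct blkg_policy_data *foo_pd_alloc(gfp_t gfp, int node)
        {
                struct foo_group *fg = kzalloc_node(sizeof(*fg), gfp, node);

                return fg ? &fg->pd : NULL;
        }

        static void foo_pd_free(struct blkg_policy_data *pd)
        {
                kfree(container_of(pd, struct foo_group, pd));
        }

        static struct cftype foo_dfl_files[] = { { } };         /* terminator only */
        static struct cftype foo_legacy_files[] = { { } };      /* terminator only */

        static struct blkcg_policy blkcg_policy_foo = {
                .dfl_cftypes    = foo_dfl_files,        /* unified hierarchy */
                .legacy_cftypes = foo_legacy_files,     /* traditional hierarchies */
                .pd_alloc_fn    = foo_pd_alloc,         /* replaces the old pd_size */
                .pd_free_fn     = foo_pd_free,
        };

        static int __init foo_init(void)
        {
                return blkcg_policy_register(&blkcg_policy_foo);
        }
        module_init(foo_init);
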
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ae0f438c2ee6..24489126f8ca 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -53,8 +53,6 @@ struct wb_writeback_work { | |||
53 | unsigned int for_background:1; | 53 | unsigned int for_background:1; |
54 | unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ | 54 | unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ |
55 | unsigned int auto_free:1; /* free on completion */ | 55 | unsigned int auto_free:1; /* free on completion */ |
56 | unsigned int single_wait:1; | ||
57 | unsigned int single_done:1; | ||
58 | enum wb_reason reason; /* why was writeback initiated? */ | 56 | enum wb_reason reason; /* why was writeback initiated? */ |
59 | 57 | ||
60 | struct list_head list; /* pending work list */ | 58 | struct list_head list; /* pending work list */ |
@@ -178,14 +176,11 @@ static void wb_wakeup(struct bdi_writeback *wb) | |||
178 | static void wb_queue_work(struct bdi_writeback *wb, | 176 | static void wb_queue_work(struct bdi_writeback *wb, |
179 | struct wb_writeback_work *work) | 177 | struct wb_writeback_work *work) |
180 | { | 178 | { |
181 | trace_writeback_queue(wb->bdi, work); | 179 | trace_writeback_queue(wb, work); |
182 | 180 | ||
183 | spin_lock_bh(&wb->work_lock); | 181 | spin_lock_bh(&wb->work_lock); |
184 | if (!test_bit(WB_registered, &wb->state)) { | 182 | if (!test_bit(WB_registered, &wb->state)) |
185 | if (work->single_wait) | ||
186 | work->single_done = 1; | ||
187 | goto out_unlock; | 183 | goto out_unlock; |
188 | } | ||
189 | if (work->done) | 184 | if (work->done) |
190 | atomic_inc(&work->done->cnt); | 185 | atomic_inc(&work->done->cnt); |
191 | list_add_tail(&work->list, &wb->work_list); | 186 | list_add_tail(&work->list, &wb->work_list); |
@@ -706,7 +701,7 @@ EXPORT_SYMBOL_GPL(wbc_account_io); | |||
706 | 701 | ||
707 | /** | 702 | /** |
708 | * inode_congested - test whether an inode is congested | 703 | * inode_congested - test whether an inode is congested |
709 | * @inode: inode to test for congestion | 704 | * @inode: inode to test for congestion (may be NULL) |
710 | * @cong_bits: mask of WB_[a]sync_congested bits to test | 705 | * @cong_bits: mask of WB_[a]sync_congested bits to test |
711 | * | 706 | * |
712 | * Tests whether @inode is congested. @cong_bits is the mask of congestion | 707 | * Tests whether @inode is congested. @cong_bits is the mask of congestion |
@@ -716,6 +711,9 @@ EXPORT_SYMBOL_GPL(wbc_account_io); | |||
716 | * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg | 711 | * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg |
717 | * associated with @inode is congested; otherwise, the root wb's congestion | 712 | * associated with @inode is congested; otherwise, the root wb's congestion |
718 | * state is used. | 713 | * state is used. |
714 | * | ||
715 | * @inode is allowed to be NULL as this function is often called on | ||
716 | * mapping->host which is NULL for the swapper space. | ||
719 | */ | 717 | */ |
720 | int inode_congested(struct inode *inode, int cong_bits) | 718 | int inode_congested(struct inode *inode, int cong_bits) |
721 | { | 719 | { |
@@ -738,32 +736,6 @@ int inode_congested(struct inode *inode, int cong_bits) | |||
738 | EXPORT_SYMBOL_GPL(inode_congested); | 736 | EXPORT_SYMBOL_GPL(inode_congested); |
739 | 737 | ||
740 | /** | 738 | /** |
741 | * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work | ||
742 | * @bdi: bdi the work item was issued to | ||
743 | * @work: work item to wait for | ||
744 | * | ||
745 | * Wait for the completion of @work which was issued to one of @bdi's | ||
746 | * bdi_writeback's. The caller must have set @work->single_wait before | ||
747 | * issuing it. This wait operates independently of | ||
748 | * wb_wait_for_completion() and also disables automatic freeing of @work. | ||
749 | */ | ||
750 | static void wb_wait_for_single_work(struct backing_dev_info *bdi, | ||
751 | struct wb_writeback_work *work) | ||
752 | { | ||
753 | if (WARN_ON_ONCE(!work->single_wait)) | ||
754 | return; | ||
755 | |||
756 | wait_event(bdi->wb_waitq, work->single_done); | ||
757 | |||
758 | /* | ||
759 | * Paired with smp_wmb() in wb_do_writeback() and ensures that all | ||
760 | * modifications to @work prior to assertion of ->single_done is | ||
761 | * visible to the caller once this function returns. | ||
762 | */ | ||
763 | smp_rmb(); | ||
764 | } | ||
765 | |||
766 | /** | ||
767 | * wb_split_bdi_pages - split nr_pages to write according to bandwidth | 739 | * wb_split_bdi_pages - split nr_pages to write according to bandwidth |
768 | * @wb: target bdi_writeback to split @nr_pages to | 740 | * @wb: target bdi_writeback to split @nr_pages to |
769 | * @nr_pages: number of pages to write for the whole bdi | 741 | * @nr_pages: number of pages to write for the whole bdi |
@@ -792,38 +764,6 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) | |||
792 | } | 764 | } |
793 | 765 | ||
794 | /** | 766 | /** |
795 | * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb | ||
796 | * @wb: target bdi_writeback | ||
797 | * @base_work: source wb_writeback_work | ||
798 | * | ||
799 | * Try to make a clone of @base_work and issue it to @wb. If cloning | ||
800 | * succeeds, %true is returned; otherwise, @base_work is issued directly | ||
801 | * and %false is returned. In the latter case, the caller is required to | ||
802 | * wait for @base_work's completion using wb_wait_for_single_work(). | ||
803 | * | ||
804 | * A clone is auto-freed on completion. @base_work never is. | ||
805 | */ | ||
806 | static bool wb_clone_and_queue_work(struct bdi_writeback *wb, | ||
807 | struct wb_writeback_work *base_work) | ||
808 | { | ||
809 | struct wb_writeback_work *work; | ||
810 | |||
811 | work = kmalloc(sizeof(*work), GFP_ATOMIC); | ||
812 | if (work) { | ||
813 | *work = *base_work; | ||
814 | work->auto_free = 1; | ||
815 | work->single_wait = 0; | ||
816 | } else { | ||
817 | work = base_work; | ||
818 | work->auto_free = 0; | ||
819 | work->single_wait = 1; | ||
820 | } | ||
821 | work->single_done = 0; | ||
822 | wb_queue_work(wb, work); | ||
823 | return work != base_work; | ||
824 | } | ||
825 | |||
826 | /** | ||
827 | * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi | 767 | * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi |
828 | * @bdi: target backing_dev_info | 768 | * @bdi: target backing_dev_info |
829 | * @base_work: wb_writeback_work to issue | 769 | * @base_work: wb_writeback_work to issue |
@@ -838,15 +778,19 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, | |||
838 | struct wb_writeback_work *base_work, | 778 | struct wb_writeback_work *base_work, |
839 | bool skip_if_busy) | 779 | bool skip_if_busy) |
840 | { | 780 | { |
841 | long nr_pages = base_work->nr_pages; | 781 | int next_memcg_id = 0; |
842 | int next_blkcg_id = 0; | ||
843 | struct bdi_writeback *wb; | 782 | struct bdi_writeback *wb; |
844 | struct wb_iter iter; | 783 | struct wb_iter iter; |
845 | 784 | ||
846 | might_sleep(); | 785 | might_sleep(); |
847 | restart: | 786 | restart: |
848 | rcu_read_lock(); | 787 | rcu_read_lock(); |
849 | bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) { | 788 | bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) { |
789 | DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done); | ||
790 | struct wb_writeback_work fallback_work; | ||
791 | struct wb_writeback_work *work; | ||
792 | long nr_pages; | ||
793 | |||
850 | /* SYNC_ALL writes out I_DIRTY_TIME too */ | 794 | /* SYNC_ALL writes out I_DIRTY_TIME too */ |
851 | if (!wb_has_dirty_io(wb) && | 795 | if (!wb_has_dirty_io(wb) && |
852 | (base_work->sync_mode == WB_SYNC_NONE || | 796 | (base_work->sync_mode == WB_SYNC_NONE || |
@@ -855,13 +799,30 @@ restart: | |||
855 | if (skip_if_busy && writeback_in_progress(wb)) | 799 | if (skip_if_busy && writeback_in_progress(wb)) |
856 | continue; | 800 | continue; |
857 | 801 | ||
858 | base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages); | 802 | nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages); |
859 | if (!wb_clone_and_queue_work(wb, base_work)) { | 803 | |
860 | next_blkcg_id = wb->blkcg_css->id + 1; | 804 | work = kmalloc(sizeof(*work), GFP_ATOMIC); |
861 | rcu_read_unlock(); | 805 | if (work) { |
862 | wb_wait_for_single_work(bdi, base_work); | 806 | *work = *base_work; |
863 | goto restart; | 807 | work->nr_pages = nr_pages; |
808 | work->auto_free = 1; | ||
809 | wb_queue_work(wb, work); | ||
810 | continue; | ||
864 | } | 811 | } |
812 | |||
813 | /* alloc failed, execute synchronously using on-stack fallback */ | ||
814 | work = &fallback_work; | ||
815 | *work = *base_work; | ||
816 | work->nr_pages = nr_pages; | ||
817 | work->auto_free = 0; | ||
818 | work->done = &fallback_work_done; | ||
819 | |||
820 | wb_queue_work(wb, work); | ||
821 | |||
822 | next_memcg_id = wb->memcg_css->id + 1; | ||
823 | rcu_read_unlock(); | ||
824 | wb_wait_for_completion(bdi, &fallback_work_done); | ||
825 | goto restart; | ||
865 | } | 826 | } |
866 | rcu_read_unlock(); | 827 | rcu_read_unlock(); |
867 | } | 828 | } |
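The per-wb work item is now either a self-freeing heap clone or, when the GFP_ATOMIC allocation fails, an on-stack fallback that is executed synchronously before the walk moves on. Condensed, the fallback path boils down to the pattern below (same names as in the hunk above, shown in isolation for readability rather than as standalone code):

        DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
        struct wb_writeback_work fallback_work = *base_work;

        fallback_work.nr_pages  = wb_split_bdi_pages(wb, base_work->nr_pages);
        fallback_work.auto_free = 0;            /* on-stack, must never be kfree'd */
        fallback_work.done      = &fallback_work_done;

        wb_queue_work(wb, &fallback_work);
        wb_wait_for_completion(bdi, &fallback_work_done);       /* must finish before
                                                                   the stack frame is reused */
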
@@ -902,8 +863,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, | |||
902 | 863 | ||
903 | if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { | 864 | if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { |
904 | base_work->auto_free = 0; | 865 | base_work->auto_free = 0; |
905 | base_work->single_wait = 0; | ||
906 | base_work->single_done = 0; | ||
907 | wb_queue_work(&bdi->wb, base_work); | 866 | wb_queue_work(&bdi->wb, base_work); |
908 | } | 867 | } |
909 | } | 868 | } |
@@ -924,7 +883,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, | |||
924 | */ | 883 | */ |
925 | work = kzalloc(sizeof(*work), GFP_ATOMIC); | 884 | work = kzalloc(sizeof(*work), GFP_ATOMIC); |
926 | if (!work) { | 885 | if (!work) { |
927 | trace_writeback_nowork(wb->bdi); | 886 | trace_writeback_nowork(wb); |
928 | wb_wakeup(wb); | 887 | wb_wakeup(wb); |
929 | return; | 888 | return; |
930 | } | 889 | } |
@@ -954,7 +913,7 @@ void wb_start_background_writeback(struct bdi_writeback *wb) | |||
954 | * We just wake up the flusher thread. It will perform background | 913 | * We just wake up the flusher thread. It will perform background |
955 | * writeback as soon as there is no other work to do. | 914 | * writeback as soon as there is no other work to do. |
956 | */ | 915 | */ |
957 | trace_writeback_wake_background(wb->bdi); | 916 | trace_writeback_wake_background(wb); |
958 | wb_wakeup(wb); | 917 | wb_wakeup(wb); |
959 | } | 918 | } |
960 | 919 | ||
@@ -1660,14 +1619,14 @@ static long wb_writeback(struct bdi_writeback *wb, | |||
1660 | } else if (work->for_background) | 1619 | } else if (work->for_background) |
1661 | oldest_jif = jiffies; | 1620 | oldest_jif = jiffies; |
1662 | 1621 | ||
1663 | trace_writeback_start(wb->bdi, work); | 1622 | trace_writeback_start(wb, work); |
1664 | if (list_empty(&wb->b_io)) | 1623 | if (list_empty(&wb->b_io)) |
1665 | queue_io(wb, work); | 1624 | queue_io(wb, work); |
1666 | if (work->sb) | 1625 | if (work->sb) |
1667 | progress = writeback_sb_inodes(work->sb, wb, work); | 1626 | progress = writeback_sb_inodes(work->sb, wb, work); |
1668 | else | 1627 | else |
1669 | progress = __writeback_inodes_wb(wb, work); | 1628 | progress = __writeback_inodes_wb(wb, work); |
1670 | trace_writeback_written(wb->bdi, work); | 1629 | trace_writeback_written(wb, work); |
1671 | 1630 | ||
1672 | wb_update_bandwidth(wb, wb_start); | 1631 | wb_update_bandwidth(wb, wb_start); |
1673 | 1632 | ||
@@ -1692,7 +1651,7 @@ static long wb_writeback(struct bdi_writeback *wb, | |||
1692 | * we'll just busyloop. | 1651 | * we'll just busyloop. |
1693 | */ | 1652 | */ |
1694 | if (!list_empty(&wb->b_more_io)) { | 1653 | if (!list_empty(&wb->b_more_io)) { |
1695 | trace_writeback_wait(wb->bdi, work); | 1654 | trace_writeback_wait(wb, work); |
1696 | inode = wb_inode(wb->b_more_io.prev); | 1655 | inode = wb_inode(wb->b_more_io.prev); |
1697 | spin_lock(&inode->i_lock); | 1656 | spin_lock(&inode->i_lock); |
1698 | spin_unlock(&wb->list_lock); | 1657 | spin_unlock(&wb->list_lock); |
@@ -1797,26 +1756,14 @@ static long wb_do_writeback(struct bdi_writeback *wb) | |||
1797 | set_bit(WB_writeback_running, &wb->state); | 1756 | set_bit(WB_writeback_running, &wb->state); |
1798 | while ((work = get_next_work_item(wb)) != NULL) { | 1757 | while ((work = get_next_work_item(wb)) != NULL) { |
1799 | struct wb_completion *done = work->done; | 1758 | struct wb_completion *done = work->done; |
1800 | bool need_wake_up = false; | ||
1801 | 1759 | ||
1802 | trace_writeback_exec(wb->bdi, work); | 1760 | trace_writeback_exec(wb, work); |
1803 | 1761 | ||
1804 | wrote += wb_writeback(wb, work); | 1762 | wrote += wb_writeback(wb, work); |
1805 | 1763 | ||
1806 | if (work->single_wait) { | 1764 | if (work->auto_free) |
1807 | WARN_ON_ONCE(work->auto_free); | ||
1808 | /* paired w/ rmb in wb_wait_for_single_work() */ | ||
1809 | smp_wmb(); | ||
1810 | work->single_done = 1; | ||
1811 | need_wake_up = true; | ||
1812 | } else if (work->auto_free) { | ||
1813 | kfree(work); | 1765 | kfree(work); |
1814 | } | ||
1815 | |||
1816 | if (done && atomic_dec_and_test(&done->cnt)) | 1766 | if (done && atomic_dec_and_test(&done->cnt)) |
1817 | need_wake_up = true; | ||
1818 | |||
1819 | if (need_wake_up) | ||
1820 | wake_up_all(&wb->bdi->wb_waitq); | 1767 | wake_up_all(&wb->bdi->wb_waitq); |
1821 | } | 1768 | } |
1822 | 1769 | ||
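The simplified completion handling is a plain counting scheme: wb_queue_work() takes a reference on done->cnt for every queued item, and the execution loop above drops that reference and wakes bdi->wb_waitq once the count reaches zero. The waiting side is not part of this hunk; as a sketch it amounts to the wait below (the real wb_wait_for_completion() may additionally manage an initial reference held by the issuer):

        /* issuer: queue one or more works with ->done pointing at the same
         * wb_completion, then block until all of them have finished */
        wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
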
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 2d48d28e1640..91e004518237 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c | |||
@@ -92,6 +92,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) | |||
92 | } | 92 | } |
93 | 93 | ||
94 | /** | 94 | /** |
95 | * kernfs_path_len - determine the length of the full path of a given node | ||
96 | * @kn: kernfs_node of interest | ||
97 | * | ||
98 | * The returned length doesn't include the space for the terminating '\0'. | ||
99 | */ | ||
100 | size_t kernfs_path_len(struct kernfs_node *kn) | ||
101 | { | ||
102 | size_t len = 0; | ||
103 | unsigned long flags; | ||
104 | |||
105 | spin_lock_irqsave(&kernfs_rename_lock, flags); | ||
106 | |||
107 | do { | ||
108 | len += strlen(kn->name) + 1; | ||
109 | kn = kn->parent; | ||
110 | } while (kn && kn->parent); | ||
111 | |||
112 | spin_unlock_irqrestore(&kernfs_rename_lock, flags); | ||
113 | |||
114 | return len; | ||
115 | } | ||
116 | |||
117 | /** | ||
95 | * kernfs_path - build full path of a given node | 118 | * kernfs_path - build full path of a given node |
96 | * @kn: kernfs_node of interest | 119 | * @kn: kernfs_node of interest |
97 | * @buf: buffer to copy @kn's name into | 120 | * @buf: buffer to copy @kn's name into |
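Since the returned length excludes the terminating NUL, a caller that wants the full path in a freshly allocated buffer has to add one byte before calling kernfs_path(). A minimal sketch of such a caller (the allocation flags and error handling here are illustrative):

        size_t len = kernfs_path_len(kn) + 1;   /* +1 for the trailing '\0' */
        char *buf = kzalloc(len, GFP_KERNEL);

        if (!buf)
                return -ENOMEM;
        kernfs_path(kn, buf, len);
        pr_debug("node path: %s\n", buf);
        kfree(buf);
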
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0fe9df983ab7..5a5d79ee256f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h | |||
@@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi | |||
286 | * %current's blkcg equals the effective blkcg of its memcg. No | 286 | * %current's blkcg equals the effective blkcg of its memcg. No |
287 | * need to use the relatively expensive cgroup_get_e_css(). | 287 | * need to use the relatively expensive cgroup_get_e_css(). |
288 | */ | 288 | */ |
289 | if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) | 289 | if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id))) |
290 | return wb; | 290 | return wb; |
291 | return NULL; | 291 | return NULL; |
292 | } | 292 | } |
@@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) | |||
402 | } | 402 | } |
403 | 403 | ||
404 | struct wb_iter { | 404 | struct wb_iter { |
405 | int start_blkcg_id; | 405 | int start_memcg_id; |
406 | struct radix_tree_iter tree_iter; | 406 | struct radix_tree_iter tree_iter; |
407 | void **slot; | 407 | void **slot; |
408 | }; | 408 | }; |
@@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, | |||
414 | 414 | ||
415 | WARN_ON_ONCE(!rcu_read_lock_held()); | 415 | WARN_ON_ONCE(!rcu_read_lock_held()); |
416 | 416 | ||
417 | if (iter->start_blkcg_id >= 0) { | 417 | if (iter->start_memcg_id >= 0) { |
418 | iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id); | 418 | iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id); |
419 | iter->start_blkcg_id = -1; | 419 | iter->start_memcg_id = -1; |
420 | } else { | 420 | } else { |
421 | iter->slot = radix_tree_next_slot(iter->slot, titer, 0); | 421 | iter->slot = radix_tree_next_slot(iter->slot, titer, 0); |
422 | } | 422 | } |
@@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, | |||
430 | 430 | ||
431 | static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter, | 431 | static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter, |
432 | struct backing_dev_info *bdi, | 432 | struct backing_dev_info *bdi, |
433 | int start_blkcg_id) | 433 | int start_memcg_id) |
434 | { | 434 | { |
435 | iter->start_blkcg_id = start_blkcg_id; | 435 | iter->start_memcg_id = start_memcg_id; |
436 | 436 | ||
437 | if (start_blkcg_id) | 437 | if (start_memcg_id) |
438 | return __wb_iter_next(iter, bdi); | 438 | return __wb_iter_next(iter, bdi); |
439 | else | 439 | else |
440 | return &bdi->wb; | 440 | return &bdi->wb; |
441 | } | 441 | } |
442 | 442 | ||
443 | /** | 443 | /** |
444 | * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order | 444 | * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order |
445 | * @wb_cur: cursor struct bdi_writeback pointer | 445 | * @wb_cur: cursor struct bdi_writeback pointer |
446 | * @bdi: bdi to walk wb's of | 446 | * @bdi: bdi to walk wb's of |
447 | * @iter: pointer to struct wb_iter to be used as iteration buffer | 447 | * @iter: pointer to struct wb_iter to be used as iteration buffer |
448 | * @start_blkcg_id: blkcg ID to start iteration from | 448 | * @start_memcg_id: memcg ID to start iteration from |
449 | * | 449 | * |
450 | * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending | 450 | * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending |
451 | * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter | 451 | * memcg ID order starting from @start_memcg_id. @iter is struct wb_iter |
452 | * to be used as temp storage during iteration. rcu_read_lock() must be | 452 | * to be used as temp storage during iteration. rcu_read_lock() must be |
453 | * held throughout iteration. | 453 | * held throughout iteration. |
454 | */ | 454 | */ |
455 | #define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ | 455 | #define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \ |
456 | for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \ | 456 | for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \ |
457 | (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi)) | 457 | (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi)) |
458 | 458 | ||
459 | #else /* CONFIG_CGROUP_WRITEBACK */ | 459 | #else /* CONFIG_CGROUP_WRITEBACK */ |
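The renamed iterator keeps the same contract: it walks the bdi's writeback structures in ascending memcg ID order and must run entirely under rcu_read_lock(), and a caller that needs to drop the lock mid-walk restarts from the last seen memcg ID + 1, exactly as bdi_split_work_to_wbs() does. A minimal walk looks like this (the pr_debug() body is just a placeholder):

        struct bdi_writeback *wb;
        struct wb_iter iter;

        rcu_read_lock();
        bdi_for_each_wb(wb, bdi, &iter, 0) {    /* 0 starts from the root wb */
                if (wb_has_dirty_io(wb))
                        pr_debug("wb %p has dirty inodes\n", wb);
        }
        rcu_read_unlock();
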
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index a4cd1641e9e2..0a5cc7a1109b 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h | |||
@@ -14,12 +14,15 @@ | |||
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/cgroup.h> | 16 | #include <linux/cgroup.h> |
17 | #include <linux/u64_stats_sync.h> | 17 | #include <linux/percpu_counter.h> |
18 | #include <linux/seq_file.h> | 18 | #include <linux/seq_file.h> |
19 | #include <linux/radix-tree.h> | 19 | #include <linux/radix-tree.h> |
20 | #include <linux/blkdev.h> | 20 | #include <linux/blkdev.h> |
21 | #include <linux/atomic.h> | 21 | #include <linux/atomic.h> |
22 | 22 | ||
23 | /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ | ||
24 | #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) | ||
25 | |||
23 | /* Max limits for throttle policy */ | 26 | /* Max limits for throttle policy */ |
24 | #define THROTL_IOPS_MAX UINT_MAX | 27 | #define THROTL_IOPS_MAX UINT_MAX |
25 | 28 | ||
@@ -45,7 +48,7 @@ struct blkcg { | |||
45 | struct blkcg_gq *blkg_hint; | 48 | struct blkcg_gq *blkg_hint; |
46 | struct hlist_head blkg_list; | 49 | struct hlist_head blkg_list; |
47 | 50 | ||
48 | struct blkcg_policy_data *pd[BLKCG_MAX_POLS]; | 51 | struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; |
49 | 52 | ||
50 | struct list_head all_blkcgs_node; | 53 | struct list_head all_blkcgs_node; |
51 | #ifdef CONFIG_CGROUP_WRITEBACK | 54 | #ifdef CONFIG_CGROUP_WRITEBACK |
@@ -53,14 +56,19 @@ struct blkcg { | |||
53 | #endif | 56 | #endif |
54 | }; | 57 | }; |
55 | 58 | ||
59 | /* | ||
60 | * blkg_[rw]stat->aux_cnt is excluded for local stats but included for | ||
61 | * recursive. Used to carry stats of dead children, and, for blkg_rwstat, | ||
62 | * to carry result values from read and sum operations. | ||
63 | */ | ||
56 | struct blkg_stat { | 64 | struct blkg_stat { |
57 | struct u64_stats_sync syncp; | 65 | struct percpu_counter cpu_cnt; |
58 | uint64_t cnt; | 66 | atomic64_t aux_cnt; |
59 | }; | 67 | }; |
60 | 68 | ||
61 | struct blkg_rwstat { | 69 | struct blkg_rwstat { |
62 | struct u64_stats_sync syncp; | 70 | struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; |
63 | uint64_t cnt[BLKG_RWSTAT_NR]; | 71 | atomic64_t aux_cnt[BLKG_RWSTAT_NR]; |
64 | }; | 72 | }; |
65 | 73 | ||
66 | /* | 74 | /* |
@@ -68,32 +76,28 @@ struct blkg_rwstat { | |||
68 | * request_queue (q). This is used by blkcg policies which need to track | 76 | * request_queue (q). This is used by blkcg policies which need to track |
69 | * information per blkcg - q pair. | 77 | * information per blkcg - q pair. |
70 | * | 78 | * |
71 | * There can be multiple active blkcg policies and each has its private | 79 | * There can be multiple active blkcg policies and each blkg:policy pair is |
72 | * data on each blkg, the size of which is determined by | 80 | * represented by a blkg_policy_data which is allocated and freed by each |
73 | * blkcg_policy->pd_size. blkcg core allocates and frees such areas | 81 | * policy's pd_alloc/free_fn() methods. A policy can allocate private data |
74 | * together with blkg and invokes pd_init/exit_fn() methods. | 82 | * area by allocating larger data structure which embeds blkg_policy_data |
75 | * | 83 | * at the beginning. |
76 | * Such private data must embed struct blkg_policy_data (pd) at the | ||
77 | * beginning and pd_size can't be smaller than pd. | ||
78 | */ | 84 | */ |
79 | struct blkg_policy_data { | 85 | struct blkg_policy_data { |
80 | /* the blkg and policy id this per-policy data belongs to */ | 86 | /* the blkg and policy id this per-policy data belongs to */ |
81 | struct blkcg_gq *blkg; | 87 | struct blkcg_gq *blkg; |
82 | int plid; | 88 | int plid; |
83 | |||
84 | /* used during policy activation */ | ||
85 | struct list_head alloc_node; | ||
86 | }; | 89 | }; |
87 | 90 | ||
88 | /* | 91 | /* |
89 | * Policies that need to keep per-blkcg data which is independent | 92 | * Policies that need to keep per-blkcg data which is independent from any |
90 | * from any request_queue associated to it must specify its size | 93 | * request_queue associated to it should implement cpd_alloc/free_fn() |
91 | * with the cpd_size field of the blkcg_policy structure and | 94 | * methods. A policy can allocate private data area by allocating larger |
92 | * embed a blkcg_policy_data in it. cpd_init() is invoked to let | 95 | * data structure which embeds blkcg_policy_data at the beginning. |
93 | * each policy handle per-blkcg data. | 96 | * cpd_init() is invoked to let each policy handle per-blkcg data. |
94 | */ | 97 | */ |
95 | struct blkcg_policy_data { | 98 | struct blkcg_policy_data { |
96 | /* the policy id this per-policy data belongs to */ | 99 | /* the blkcg and policy id this per-policy data belongs to */ |
100 | struct blkcg *blkcg; | ||
97 | int plid; | 101 | int plid; |
98 | }; | 102 | }; |
99 | 103 | ||
@@ -123,40 +127,50 @@ struct blkcg_gq { | |||
123 | /* is this blkg online? protected by both blkcg and q locks */ | 127 | /* is this blkg online? protected by both blkcg and q locks */ |
124 | bool online; | 128 | bool online; |
125 | 129 | ||
130 | struct blkg_rwstat stat_bytes; | ||
131 | struct blkg_rwstat stat_ios; | ||
132 | |||
126 | struct blkg_policy_data *pd[BLKCG_MAX_POLS]; | 133 | struct blkg_policy_data *pd[BLKCG_MAX_POLS]; |
127 | 134 | ||
128 | struct rcu_head rcu_head; | 135 | struct rcu_head rcu_head; |
129 | }; | 136 | }; |
130 | 137 | ||
131 | typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); | 138 | typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); |
132 | typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); | 139 | typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); |
133 | typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); | 140 | typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); |
134 | typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); | 141 | typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); |
135 | typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); | 142 | typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); |
136 | typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); | 143 | typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); |
144 | typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); | ||
145 | typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); | ||
146 | typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); | ||
147 | typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); | ||
137 | 148 | ||
138 | struct blkcg_policy { | 149 | struct blkcg_policy { |
139 | int plid; | 150 | int plid; |
140 | /* policy specific private data size */ | ||
141 | size_t pd_size; | ||
142 | /* policy specific per-blkcg data size */ | ||
143 | size_t cpd_size; | ||
144 | /* cgroup files for the policy */ | 151 | /* cgroup files for the policy */ |
145 | struct cftype *cftypes; | 152 | struct cftype *dfl_cftypes; |
153 | struct cftype *legacy_cftypes; | ||
146 | 154 | ||
147 | /* operations */ | 155 | /* operations */ |
156 | blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; | ||
148 | blkcg_pol_init_cpd_fn *cpd_init_fn; | 157 | blkcg_pol_init_cpd_fn *cpd_init_fn; |
158 | blkcg_pol_free_cpd_fn *cpd_free_fn; | ||
159 | blkcg_pol_bind_cpd_fn *cpd_bind_fn; | ||
160 | |||
161 | blkcg_pol_alloc_pd_fn *pd_alloc_fn; | ||
149 | blkcg_pol_init_pd_fn *pd_init_fn; | 162 | blkcg_pol_init_pd_fn *pd_init_fn; |
150 | blkcg_pol_online_pd_fn *pd_online_fn; | 163 | blkcg_pol_online_pd_fn *pd_online_fn; |
151 | blkcg_pol_offline_pd_fn *pd_offline_fn; | 164 | blkcg_pol_offline_pd_fn *pd_offline_fn; |
152 | blkcg_pol_exit_pd_fn *pd_exit_fn; | 165 | blkcg_pol_free_pd_fn *pd_free_fn; |
153 | blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; | 166 | blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; |
154 | }; | 167 | }; |
155 | 168 | ||
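Per-blkcg policy data follows the same embed-and-convert convention as the per-blkg data: the policy allocates a larger structure with blkcg_policy_data at the front, returns it from cpd_alloc_fn, and converts back with container_of() when it needs its private fields. A sketch with hypothetical foo_* names (blkcg_policy_foo stands for whatever descriptor the policy registers):

        struct foo_blkcg_data {
                struct blkcg_policy_data cpd;   /* must come first */
                unsigned int dfl_weight;        /* policy-private per-blkcg field */
        };

        static struct blkcg_policy_data *foo_cpd_alloc(gfp_t gfp)
        {
                struct foo_blkcg_data *fbd = kzalloc(sizeof(*fbd), gfp);

                return fbd ? &fbd->cpd : NULL;
        }

        static void foo_cpd_free(struct blkcg_policy_data *cpd)
        {
                kfree(container_of(cpd, struct foo_blkcg_data, cpd));
        }

        /* e.g. inside a cftype handler, recover the private data: */
        static struct foo_blkcg_data *blkcg_to_fbd(struct blkcg *blkcg)
        {
                struct blkcg_policy_data *cpd =
                        blkcg_to_cpd(blkcg, &blkcg_policy_foo);

                return cpd ? container_of(cpd, struct foo_blkcg_data, cpd) : NULL;
        }
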
156 | extern struct blkcg blkcg_root; | 169 | extern struct blkcg blkcg_root; |
157 | extern struct cgroup_subsys_state * const blkcg_root_css; | 170 | extern struct cgroup_subsys_state * const blkcg_root_css; |
158 | 171 | ||
159 | struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); | 172 | struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, |
173 | struct request_queue *q, bool update_hint); | ||
160 | struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, | 174 | struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, |
161 | struct request_queue *q); | 175 | struct request_queue *q); |
162 | int blkcg_init_queue(struct request_queue *q); | 176 | int blkcg_init_queue(struct request_queue *q); |
@@ -171,6 +185,7 @@ int blkcg_activate_policy(struct request_queue *q, | |||
171 | void blkcg_deactivate_policy(struct request_queue *q, | 185 | void blkcg_deactivate_policy(struct request_queue *q, |
172 | const struct blkcg_policy *pol); | 186 | const struct blkcg_policy *pol); |
173 | 187 | ||
188 | const char *blkg_dev_name(struct blkcg_gq *blkg); | ||
174 | void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, | 189 | void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, |
175 | u64 (*prfill)(struct seq_file *, | 190 | u64 (*prfill)(struct seq_file *, |
176 | struct blkg_policy_data *, int), | 191 | struct blkg_policy_data *, int), |
@@ -182,19 +197,24 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | |||
182 | u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); | 197 | u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); |
183 | u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | 198 | u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, |
184 | int off); | 199 | int off); |
200 | int blkg_print_stat_bytes(struct seq_file *sf, void *v); | ||
201 | int blkg_print_stat_ios(struct seq_file *sf, void *v); | ||
202 | int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v); | ||
203 | int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v); | ||
185 | 204 | ||
186 | u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); | 205 | u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, |
187 | struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, | 206 | struct blkcg_policy *pol, int off); |
188 | int off); | 207 | struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, |
208 | struct blkcg_policy *pol, int off); | ||
189 | 209 | ||
190 | struct blkg_conf_ctx { | 210 | struct blkg_conf_ctx { |
191 | struct gendisk *disk; | 211 | struct gendisk *disk; |
192 | struct blkcg_gq *blkg; | 212 | struct blkcg_gq *blkg; |
193 | u64 v; | 213 | char *body; |
194 | }; | 214 | }; |
195 | 215 | ||
196 | int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, | 216 | int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, |
197 | const char *input, struct blkg_conf_ctx *ctx); | 217 | char *input, struct blkg_conf_ctx *ctx); |
198 | void blkg_conf_finish(struct blkg_conf_ctx *ctx); | 218 | void blkg_conf_finish(struct blkg_conf_ctx *ctx); |
199 | 219 | ||
200 | 220 | ||
@@ -205,7 +225,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) | |||
205 | 225 | ||
206 | static inline struct blkcg *task_blkcg(struct task_struct *tsk) | 226 | static inline struct blkcg *task_blkcg(struct task_struct *tsk) |
207 | { | 227 | { |
208 | return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); | 228 | return css_to_blkcg(task_css(tsk, io_cgrp_id)); |
209 | } | 229 | } |
210 | 230 | ||
211 | static inline struct blkcg *bio_blkcg(struct bio *bio) | 231 | static inline struct blkcg *bio_blkcg(struct bio *bio) |
@@ -218,7 +238,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) | |||
218 | static inline struct cgroup_subsys_state * | 238 | static inline struct cgroup_subsys_state * |
219 | task_get_blkcg_css(struct task_struct *task) | 239 | task_get_blkcg_css(struct task_struct *task) |
220 | { | 240 | { |
221 | return task_get_css(task, blkio_cgrp_id); | 241 | return task_get_css(task, io_cgrp_id); |
222 | } | 242 | } |
223 | 243 | ||
224 | /** | 244 | /** |
@@ -233,6 +253,52 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) | |||
233 | } | 253 | } |
234 | 254 | ||
235 | /** | 255 | /** |
256 | * __blkg_lookup - internal version of blkg_lookup() | ||
257 | * @blkcg: blkcg of interest | ||
258 | * @q: request_queue of interest | ||
259 | * @update_hint: whether to update lookup hint with the result or not | ||
260 | * | ||
261 | * This is the internal version and shouldn't be used by policy | ||
262 | * implementations. Looks up blkgs for the @blkcg - @q pair regardless of | ||
263 | * @q's bypass state. If @update_hint is %true, the caller should be | ||
264 | * holding @q->queue_lock and lookup hint is updated on success. | ||
265 | */ | ||
266 | static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, | ||
267 | struct request_queue *q, | ||
268 | bool update_hint) | ||
269 | { | ||
270 | struct blkcg_gq *blkg; | ||
271 | |||
272 | if (blkcg == &blkcg_root) | ||
273 | return q->root_blkg; | ||
274 | |||
275 | blkg = rcu_dereference(blkcg->blkg_hint); | ||
276 | if (blkg && blkg->q == q) | ||
277 | return blkg; | ||
278 | |||
279 | return blkg_lookup_slowpath(blkcg, q, update_hint); | ||
280 | } | ||
281 | |||
282 | /** | ||
283 | * blkg_lookup - lookup blkg for the specified blkcg - q pair | ||
284 | * @blkcg: blkcg of interest | ||
285 | * @q: request_queue of interest | ||
286 | * | ||
287 | * Lookup blkg for the @blkcg - @q pair. This function should be called | ||
288 | * under RCU read lock and is guaranteed to return %NULL if @q is bypassing | ||
289 | * - see blk_queue_bypass_start() for details. | ||
290 | */ | ||
291 | static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, | ||
292 | struct request_queue *q) | ||
293 | { | ||
294 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
295 | |||
296 | if (unlikely(blk_queue_bypass(q))) | ||
297 | return NULL; | ||
298 | return __blkg_lookup(blkcg, q, false); | ||
299 | } | ||
300 | |||
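blkg_lookup() is now an inline fast path: the root blkcg maps straight to q->root_blkg, the per-blkcg hint covers the common case, and only a miss drops into blkg_lookup_slowpath(). Callers hold the RCU read lock and must cope with a NULL result while the queue is bypassing, along the lines of:

        struct blkcg_gq *blkg;

        rcu_read_lock();
        blkg = blkg_lookup(blkcg, q);
        if (blkg)
                blkg_get(blkg);         /* pin it if it must outlive the RCU section */
        rcu_read_unlock();
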
301 | /** | ||
236 | * blkg_to_pdata - get policy private data | 302 | * blkg_to_pdata - get policy private data |
237 | * @blkg: blkg of interest | 303 | * @blkg: blkg of interest |
238 | * @pol: policy of interest | 304 | * @pol: policy of interest |
@@ -248,7 +314,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, | |||
248 | static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, | 314 | static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, |
249 | struct blkcg_policy *pol) | 315 | struct blkcg_policy *pol) |
250 | { | 316 | { |
251 | return blkcg ? blkcg->pd[pol->plid] : NULL; | 317 | return blkcg ? blkcg->cpd[pol->plid] : NULL; |
252 | } | 318 | } |
253 | 319 | ||
254 | /** | 320 | /** |
@@ -262,6 +328,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) | |||
262 | return pd ? pd->blkg : NULL; | 328 | return pd ? pd->blkg : NULL; |
263 | } | 329 | } |
264 | 330 | ||
331 | static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) | ||
332 | { | ||
333 | return cpd ? cpd->blkcg : NULL; | ||
334 | } | ||
335 | |||
265 | /** | 336 | /** |
266 | * blkg_path - format cgroup path of blkg | 337 | * blkg_path - format cgroup path of blkg |
267 | * @blkg: blkg of interest | 338 | * @blkg: blkg of interest |
@@ -309,9 +380,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) | |||
309 | call_rcu(&blkg->rcu_head, __blkg_release_rcu); | 380 | call_rcu(&blkg->rcu_head, __blkg_release_rcu); |
310 | } | 381 | } |
311 | 382 | ||
312 | struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, | ||
313 | bool update_hint); | ||
314 | |||
315 | /** | 383 | /** |
316 | * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants | 384 | * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants |
317 | * @d_blkg: loop cursor pointing to the current descendant | 385 | * @d_blkg: loop cursor pointing to the current descendant |
@@ -373,8 +441,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, | |||
373 | * or if either the blkcg or queue is going away. Fall back to | 441 | * or if either the blkcg or queue is going away. Fall back to |
374 | * root_rl in such cases. | 442 | * root_rl in such cases. |
375 | */ | 443 | */ |
376 | blkg = blkg_lookup_create(blkcg, q); | 444 | blkg = blkg_lookup(blkcg, q); |
377 | if (IS_ERR(blkg)) | 445 | if (unlikely(!blkg)) |
378 | goto root_rl; | 446 | goto root_rl; |
379 | 447 | ||
380 | blkg_get(blkg); | 448 | blkg_get(blkg); |
@@ -394,8 +462,7 @@ root_rl: | |||
394 | */ | 462 | */ |
395 | static inline void blk_put_rl(struct request_list *rl) | 463 | static inline void blk_put_rl(struct request_list *rl) |
396 | { | 464 | { |
397 | /* root_rl may not have blkg set */ | 465 | if (rl->blkg->blkcg != &blkcg_root) |
398 | if (rl->blkg && rl->blkg->blkcg != &blkcg_root) | ||
399 | blkg_put(rl->blkg); | 466 | blkg_put(rl->blkg); |
400 | } | 467 | } |
401 | 468 | ||
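The blk_get_rl()/blk_put_rl() change above leans on a simple fallback idea: if the per-cgroup group cannot be found, use the always-present root, and never take or drop a reference on the root itself. A hedged userspace sketch of that get/put shape; struct group, lookup(), get_rl() and put_rl() are invented stand-ins, not kernel interfaces.

/* Illustrative only -- not kernel code; refcounting here is simplified. */
#include <stdatomic.h>
#include <stdio.h>

struct group {
	_Atomic int refcnt;
};

static struct group root  = { 1 };	/* pinned for the queue's lifetime */
static struct group child = { 1 };

static struct group *lookup(int id)
{
	if (id == 0)
		return &root;		/* stand-in for blkg_lookup() */
	if (id == 1)
		return &child;
	return NULL;			/* group gone or never created */
}

static struct group *get_rl(int id)
{
	struct group *g = lookup(id);

	if (!g)
		return &root;		/* root fallback on lookup failure */
	if (g != &root)
		atomic_fetch_add(&g->refcnt, 1);
	return g;
}

static void put_rl(struct group *g)
{
	if (g != &root)			/* the root holds no extra reference */
		atomic_fetch_sub(&g->refcnt, 1);
}

int main(void)
{
	struct group *a = get_rl(1);	/* known child: reference taken */
	struct group *b = get_rl(42);	/* unknown id: falls back to root */

	printf("child refcnt=%d, fallback is %s\n",
	       atomic_load(&child.refcnt), b == &root ? "root" : "child");
	put_rl(a);
	put_rl(b);
	return 0;
}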
@@ -433,9 +500,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, | |||
433 | #define blk_queue_for_each_rl(rl, q) \ | 500 | #define blk_queue_for_each_rl(rl, q) \ |
434 | for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) | 501 | for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) |
435 | 502 | ||
436 | static inline void blkg_stat_init(struct blkg_stat *stat) | 503 | static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp) |
437 | { | 504 | { |
438 | u64_stats_init(&stat->syncp); | 505 | int ret; |
506 | |||
507 | ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); | ||
508 | if (ret) | ||
509 | return ret; | ||
510 | |||
511 | atomic64_set(&stat->aux_cnt, 0); | ||
512 | return 0; | ||
513 | } | ||
514 | |||
515 | static inline void blkg_stat_exit(struct blkg_stat *stat) | ||
516 | { | ||
517 | percpu_counter_destroy(&stat->cpu_cnt); | ||
439 | } | 518 | } |
440 | 519 | ||
441 | /** | 520 | /** |
@@ -443,34 +522,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat) | |||
443 | * @stat: target blkg_stat | 522 | * @stat: target blkg_stat |
444 | * @val: value to add | 523 | * @val: value to add |
445 | * | 524 | * |
446 | * Add @val to @stat. The caller is responsible for synchronizing calls to | 525 | * Add @val to @stat. The caller must ensure that IRQs on the same CPU |
447 | * this function. | 526 | * don't re-enter this function for the same counter. |
448 | */ | 527 | */ |
449 | static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) | 528 | static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) |
450 | { | 529 | { |
451 | u64_stats_update_begin(&stat->syncp); | 530 | __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); |
452 | stat->cnt += val; | ||
453 | u64_stats_update_end(&stat->syncp); | ||
454 | } | 531 | } |
455 | 532 | ||
456 | /** | 533 | /** |
457 | * blkg_stat_read - read the current value of a blkg_stat | 534 | * blkg_stat_read - read the current value of a blkg_stat |
458 | * @stat: blkg_stat to read | 535 | * @stat: blkg_stat to read |
459 | * | ||
460 | * Read the current value of @stat. This function can be called without | ||
461 | * synchroniztion and takes care of u64 atomicity. | ||
462 | */ | 536 | */ |
463 | static inline uint64_t blkg_stat_read(struct blkg_stat *stat) | 537 | static inline uint64_t blkg_stat_read(struct blkg_stat *stat) |
464 | { | 538 | { |
465 | unsigned int start; | 539 | return percpu_counter_sum_positive(&stat->cpu_cnt); |
466 | uint64_t v; | ||
467 | |||
468 | do { | ||
469 | start = u64_stats_fetch_begin_irq(&stat->syncp); | ||
470 | v = stat->cnt; | ||
471 | } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); | ||
472 | |||
473 | return v; | ||
474 | } | 540 | } |
475 | 541 | ||
476 | /** | 542 | /** |
@@ -479,24 +545,46 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) | |||
479 | */ | 545 | */ |
480 | static inline void blkg_stat_reset(struct blkg_stat *stat) | 546 | static inline void blkg_stat_reset(struct blkg_stat *stat) |
481 | { | 547 | { |
482 | stat->cnt = 0; | 548 | percpu_counter_set(&stat->cpu_cnt, 0); |
549 | atomic64_set(&stat->aux_cnt, 0); | ||
483 | } | 550 | } |
484 | 551 | ||
485 | /** | 552 | /** |
486 | * blkg_stat_merge - merge a blkg_stat into another | 553 | * blkg_stat_add_aux - add a blkg_stat into another's aux count |
487 | * @to: the destination blkg_stat | 554 | * @to: the destination blkg_stat |
488 | * @from: the source | 555 | * @from: the source |
489 | * | 556 | * |
490 | * Add @from's count to @to. | 557 | * Add @from's count including the aux one to @to's aux count. |
491 | */ | 558 | */ |
492 | static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) | 559 | static inline void blkg_stat_add_aux(struct blkg_stat *to, |
560 | struct blkg_stat *from) | ||
493 | { | 561 | { |
494 | blkg_stat_add(to, blkg_stat_read(from)); | 562 | atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt), |
563 | &to->aux_cnt); | ||
495 | } | 564 | } |
496 | 565 | ||
497 | static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) | 566 | static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) |
498 | { | 567 | { |
499 | u64_stats_init(&rwstat->syncp); | 568 | int i, ret; |
569 | |||
570 | for (i = 0; i < BLKG_RWSTAT_NR; i++) { | ||
571 | ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); | ||
572 | if (ret) { | ||
573 | while (--i >= 0) | ||
574 | percpu_counter_destroy(&rwstat->cpu_cnt[i]); | ||
575 | return ret; | ||
576 | } | ||
577 | atomic64_set(&rwstat->aux_cnt[i], 0); | ||
578 | } | ||
579 | return 0; | ||
580 | } | ||
581 | |||
582 | static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) | ||
583 | { | ||
584 | int i; | ||
585 | |||
586 | for (i = 0; i < BLKG_RWSTAT_NR; i++) | ||
587 | percpu_counter_destroy(&rwstat->cpu_cnt[i]); | ||
500 | } | 588 | } |
501 | 589 | ||
502 | /** | 590 | /** |
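To make the percpu conversion above concrete, here is a hedged userspace sketch of the scheme the new blkg_stat helpers follow: hot-path increments land in a per-CPU slot and are folded into a shared count only in batches, reads sum everything, and a separate atomic aux counter keeps totals handed over from groups that go away (blkg_stat_add_aux). This is not the kernel percpu_counter/blkg_stat API; all names below are illustrative.

/* Illustrative only -- not the kernel percpu_counter/blkg_stat API. */
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS	4
#define BATCH	32			/* stands in for BLKG_STAT_CPU_BATCH */

struct grp_stat {
	long cpu_cnt[NR_CPUS];		/* per-CPU deltas, updated locklessly */
	_Atomic long shared;		/* batches folded in from the CPUs */
	_Atomic long aux_cnt;		/* totals inherited from dead groups */
};

static void stat_add(struct grp_stat *s, int cpu, long val)
{
	s->cpu_cnt[cpu] += val;
	if (s->cpu_cnt[cpu] >= BATCH || s->cpu_cnt[cpu] <= -BATCH) {
		atomic_fetch_add(&s->shared, s->cpu_cnt[cpu]);
		s->cpu_cnt[cpu] = 0;	/* keeps the common case cheap */
	}
}

static long stat_read(struct grp_stat *s)
{
	long sum = atomic_load(&s->shared);

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += s->cpu_cnt[cpu];	/* like percpu_counter_sum_positive() */
	return sum > 0 ? sum : 0;
}

static void stat_add_aux(struct grp_stat *to, struct grp_stat *from)
{
	/* mirrors blkg_stat_add_aux(): fold @from's live and aux counts */
	atomic_fetch_add(&to->aux_cnt,
			 stat_read(from) + atomic_load(&from->aux_cnt));
}

int main(void)
{
	struct grp_stat parent = {{0}}, child = {{0}};

	for (int i = 0; i < 100; i++)
		stat_add(&child, i % NR_CPUS, 1);

	stat_add_aux(&parent, &child);	/* e.g. when the child group is freed */
	printf("parent aux count = %ld\n", atomic_load(&parent.aux_cnt));
	return 0;
}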
@@ -511,39 +599,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) | |||
511 | static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, | 599 | static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, |
512 | int rw, uint64_t val) | 600 | int rw, uint64_t val) |
513 | { | 601 | { |
514 | u64_stats_update_begin(&rwstat->syncp); | 602 | struct percpu_counter *cnt; |
515 | 603 | ||
516 | if (rw & REQ_WRITE) | 604 | if (rw & REQ_WRITE) |
517 | rwstat->cnt[BLKG_RWSTAT_WRITE] += val; | 605 | cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; |
518 | else | 606 | else |
519 | rwstat->cnt[BLKG_RWSTAT_READ] += val; | 607 | cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; |
608 | |||
609 | __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); | ||
610 | |||
520 | if (rw & REQ_SYNC) | 611 | if (rw & REQ_SYNC) |
521 | rwstat->cnt[BLKG_RWSTAT_SYNC] += val; | 612 | cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; |
522 | else | 613 | else |
523 | rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; | 614 | cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; |
524 | 615 | ||
525 | u64_stats_update_end(&rwstat->syncp); | 616 | __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); |
526 | } | 617 | } |
527 | 618 | ||
528 | /** | 619 | /** |
529 | * blkg_rwstat_read - read the current values of a blkg_rwstat | 620 | * blkg_rwstat_read - read the current values of a blkg_rwstat |
530 | * @rwstat: blkg_rwstat to read | 621 | * @rwstat: blkg_rwstat to read |
531 | * | 622 | * |
532 | * Read the current snapshot of @rwstat and return it as the return value. | 623 | * Read the current snapshot of @rwstat and return it in the aux counts. |
533 | * This function can be called without synchronization and takes care of | ||
534 | * u64 atomicity. | ||
535 | */ | 624 | */ |
536 | static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) | 625 | static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) |
537 | { | 626 | { |
538 | unsigned int start; | 627 | struct blkg_rwstat result; |
539 | struct blkg_rwstat tmp; | 628 | int i; |
540 | |||
541 | do { | ||
542 | start = u64_stats_fetch_begin_irq(&rwstat->syncp); | ||
543 | tmp = *rwstat; | ||
544 | } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); | ||
545 | 629 | ||
546 | return tmp; | 630 | for (i = 0; i < BLKG_RWSTAT_NR; i++) |
631 | atomic64_set(&result.aux_cnt[i], | ||
632 | percpu_counter_sum_positive(&rwstat->cpu_cnt[i])); | ||
633 | return result; | ||
547 | } | 634 | } |
548 | 635 | ||
549 | /** | 636 | /** |
@@ -558,7 +645,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) | |||
558 | { | 645 | { |
559 | struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); | 646 | struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); |
560 | 647 | ||
561 | return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; | 648 | return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + |
649 | atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); | ||
562 | } | 650 | } |
563 | 651 | ||
564 | /** | 652 | /** |
@@ -567,26 +655,71 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) | |||
567 | */ | 655 | */ |
568 | static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) | 656 | static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) |
569 | { | 657 | { |
570 | memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); | 658 | int i; |
659 | |||
660 | for (i = 0; i < BLKG_RWSTAT_NR; i++) { | ||
661 | percpu_counter_set(&rwstat->cpu_cnt[i], 0); | ||
662 | atomic64_set(&rwstat->aux_cnt[i], 0); | ||
663 | } | ||
571 | } | 664 | } |
572 | 665 | ||
573 | /** | 666 | /** |
574 | * blkg_rwstat_merge - merge a blkg_rwstat into another | 667 | * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count |
575 | * @to: the destination blkg_rwstat | 668 | * @to: the destination blkg_rwstat |
576 | * @from: the source | 669 | * @from: the source |
577 | * | 670 | * |
578 | * Add @from's counts to @to. | 671 | * Add @from's count including the aux one to @to's aux count. |
579 | */ | 672 | */ |
580 | static inline void blkg_rwstat_merge(struct blkg_rwstat *to, | 673 | static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, |
581 | struct blkg_rwstat *from) | 674 | struct blkg_rwstat *from) |
582 | { | 675 | { |
583 | struct blkg_rwstat v = blkg_rwstat_read(from); | 676 | struct blkg_rwstat v = blkg_rwstat_read(from); |
584 | int i; | 677 | int i; |
585 | 678 | ||
586 | u64_stats_update_begin(&to->syncp); | ||
587 | for (i = 0; i < BLKG_RWSTAT_NR; i++) | 679 | for (i = 0; i < BLKG_RWSTAT_NR; i++) |
588 | to->cnt[i] += v.cnt[i]; | 680 | atomic64_add(atomic64_read(&v.aux_cnt[i]) + |
589 | u64_stats_update_end(&to->syncp); | 681 | atomic64_read(&from->aux_cnt[i]), |
682 | &to->aux_cnt[i]); | ||
683 | } | ||
684 | |||
685 | #ifdef CONFIG_BLK_DEV_THROTTLING | ||
686 | extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, | ||
687 | struct bio *bio); | ||
688 | #else | ||
689 | static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, | ||
690 | struct bio *bio) { return false; } | ||
691 | #endif | ||
692 | |||
693 | static inline bool blkcg_bio_issue_check(struct request_queue *q, | ||
694 | struct bio *bio) | ||
695 | { | ||
696 | struct blkcg *blkcg; | ||
697 | struct blkcg_gq *blkg; | ||
698 | bool throtl = false; | ||
699 | |||
700 | rcu_read_lock(); | ||
701 | blkcg = bio_blkcg(bio); | ||
702 | |||
703 | blkg = blkg_lookup(blkcg, q); | ||
704 | if (unlikely(!blkg)) { | ||
705 | spin_lock_irq(q->queue_lock); | ||
706 | blkg = blkg_lookup_create(blkcg, q); | ||
707 | if (IS_ERR(blkg)) | ||
708 | blkg = NULL; | ||
709 | spin_unlock_irq(q->queue_lock); | ||
710 | } | ||
711 | |||
712 | throtl = blk_throtl_bio(q, blkg, bio); | ||
713 | |||
714 | if (!throtl) { | ||
715 | blkg = blkg ?: q->root_blkg; | ||
716 | blkg_rwstat_add(&blkg->stat_bytes, bio->bi_rw, | ||
717 | bio->bi_iter.bi_size); | ||
718 | blkg_rwstat_add(&blkg->stat_ios, bio->bi_rw, 1); | ||
719 | } | ||
720 | |||
721 | rcu_read_unlock(); | ||
722 | return !throtl; | ||
590 | } | 723 | } |
591 | 724 | ||
592 | #else /* CONFIG_BLK_CGROUP */ | 725 | #else /* CONFIG_BLK_CGROUP */ |
@@ -642,6 +775,9 @@ static inline void blk_put_rl(struct request_list *rl) { } | |||
642 | static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } | 775 | static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } |
643 | static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } | 776 | static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } |
644 | 777 | ||
778 | static inline bool blkcg_bio_issue_check(struct request_queue *q, | ||
779 | struct bio *bio) { return true; } | ||
780 | |||
645 | #define blk_queue_for_each_rl(rl, q) \ | 781 | #define blk_queue_for_each_rl(rl, q) \ |
646 | for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) | 782 | for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) |
647 | 783 | ||
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 1f36945fd23d..1a96fdaa33d5 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h | |||
@@ -27,7 +27,7 @@ SUBSYS(cpuacct) | |||
27 | #endif | 27 | #endif |
28 | 28 | ||
29 | #if IS_ENABLED(CONFIG_BLK_CGROUP) | 29 | #if IS_ENABLED(CONFIG_BLK_CGROUP) |
30 | SUBSYS(blkio) | 30 | SUBSYS(io) |
31 | #endif | 31 | #endif |
32 | 32 | ||
33 | #if IS_ENABLED(CONFIG_MEMCG) | 33 | #if IS_ENABLED(CONFIG_MEMCG) |
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 123be25ea15a..5d4e9c4b821d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h | |||
@@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) | |||
266 | } | 266 | } |
267 | 267 | ||
268 | int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); | 268 | int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); |
269 | size_t kernfs_path_len(struct kernfs_node *kn); | ||
269 | char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, | 270 | char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, |
270 | size_t buflen); | 271 | size_t buflen); |
271 | void pr_cont_kernfs_name(struct kernfs_node *kn); | 272 | void pr_cont_kernfs_name(struct kernfs_node *kn); |
@@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) | |||
332 | static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) | 333 | static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) |
333 | { return -ENOSYS; } | 334 | { return -ENOSYS; } |
334 | 335 | ||
336 | static inline size_t kernfs_path_len(struct kernfs_node *kn) | ||
337 | { return 0; } | ||
338 | |||
335 | static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, | 339 | static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, |
336 | size_t buflen) | 340 | size_t buflen) |
337 | { return NULL; } | 341 | { return NULL; } |
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index a7aa607a4c55..fff846b512e6 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h | |||
@@ -131,6 +131,66 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, | |||
131 | TP_ARGS(inode, flags) | 131 | TP_ARGS(inode, flags) |
132 | ); | 132 | ); |
133 | 133 | ||
134 | #ifdef CREATE_TRACE_POINTS | ||
135 | #ifdef CONFIG_CGROUP_WRITEBACK | ||
136 | |||
137 | static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb) | ||
138 | { | ||
139 | return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1; | ||
140 | } | ||
141 | |||
142 | static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb) | ||
143 | { | ||
144 | struct cgroup *cgrp = wb->memcg_css->cgroup; | ||
145 | char *path; | ||
146 | |||
147 | path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1); | ||
148 | WARN_ON_ONCE(path != buf); | ||
149 | } | ||
150 | |||
151 | static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc) | ||
152 | { | ||
153 | if (wbc->wb) | ||
154 | return __trace_wb_cgroup_size(wbc->wb); | ||
155 | else | ||
156 | return 2; | ||
157 | } | ||
158 | |||
159 | static inline void __trace_wbc_assign_cgroup(char *buf, | ||
160 | struct writeback_control *wbc) | ||
161 | { | ||
162 | if (wbc->wb) | ||
163 | __trace_wb_assign_cgroup(buf, wbc->wb); | ||
164 | else | ||
165 | strcpy(buf, "/"); | ||
166 | } | ||
167 | |||
168 | #else /* CONFIG_CGROUP_WRITEBACK */ | ||
169 | |||
170 | static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb) | ||
171 | { | ||
172 | return 2; | ||
173 | } | ||
174 | |||
175 | static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb) | ||
176 | { | ||
177 | strcpy(buf, "/"); | ||
178 | } | ||
179 | |||
180 | static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc) | ||
181 | { | ||
182 | return 2; | ||
183 | } | ||
184 | |||
185 | static inline void __trace_wbc_assign_cgroup(char *buf, | ||
186 | struct writeback_control *wbc) | ||
187 | { | ||
188 | strcpy(buf, "/"); | ||
189 | } | ||
190 | |||
191 | #endif /* CONFIG_CGROUP_WRITEBACK */ | ||
192 | #endif /* CREATE_TRACE_POINTS */ | ||
193 | |||
134 | DECLARE_EVENT_CLASS(writeback_write_inode_template, | 194 | DECLARE_EVENT_CLASS(writeback_write_inode_template, |
135 | 195 | ||
136 | TP_PROTO(struct inode *inode, struct writeback_control *wbc), | 196 | TP_PROTO(struct inode *inode, struct writeback_control *wbc), |
@@ -141,6 +201,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, | |||
141 | __array(char, name, 32) | 201 | __array(char, name, 32) |
142 | __field(unsigned long, ino) | 202 | __field(unsigned long, ino) |
143 | __field(int, sync_mode) | 203 | __field(int, sync_mode) |
204 | __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) | ||
144 | ), | 205 | ), |
145 | 206 | ||
146 | TP_fast_assign( | 207 | TP_fast_assign( |
@@ -148,12 +209,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, | |||
148 | dev_name(inode_to_bdi(inode)->dev), 32); | 209 | dev_name(inode_to_bdi(inode)->dev), 32); |
149 | __entry->ino = inode->i_ino; | 210 | __entry->ino = inode->i_ino; |
150 | __entry->sync_mode = wbc->sync_mode; | 211 | __entry->sync_mode = wbc->sync_mode; |
212 | __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); | ||
151 | ), | 213 | ), |
152 | 214 | ||
153 | TP_printk("bdi %s: ino=%lu sync_mode=%d", | 215 | TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s", |
154 | __entry->name, | 216 | __entry->name, |
155 | __entry->ino, | 217 | __entry->ino, |
156 | __entry->sync_mode | 218 | __entry->sync_mode, |
219 | __get_str(cgroup) | ||
157 | ) | 220 | ) |
158 | ); | 221 | ); |
159 | 222 | ||
@@ -172,8 +235,8 @@ DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, | |||
172 | ); | 235 | ); |
173 | 236 | ||
174 | DECLARE_EVENT_CLASS(writeback_work_class, | 237 | DECLARE_EVENT_CLASS(writeback_work_class, |
175 | TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), | 238 | TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), |
176 | TP_ARGS(bdi, work), | 239 | TP_ARGS(wb, work), |
177 | TP_STRUCT__entry( | 240 | TP_STRUCT__entry( |
178 | __array(char, name, 32) | 241 | __array(char, name, 32) |
179 | __field(long, nr_pages) | 242 | __field(long, nr_pages) |
@@ -183,10 +246,11 @@ DECLARE_EVENT_CLASS(writeback_work_class, | |||
183 | __field(int, range_cyclic) | 246 | __field(int, range_cyclic) |
184 | __field(int, for_background) | 247 | __field(int, for_background) |
185 | __field(int, reason) | 248 | __field(int, reason) |
249 | __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) | ||
186 | ), | 250 | ), |
187 | TP_fast_assign( | 251 | TP_fast_assign( |
188 | strncpy(__entry->name, | 252 | strncpy(__entry->name, |
189 | bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); | 253 | wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32); |
190 | __entry->nr_pages = work->nr_pages; | 254 | __entry->nr_pages = work->nr_pages; |
191 | __entry->sb_dev = work->sb ? work->sb->s_dev : 0; | 255 | __entry->sb_dev = work->sb ? work->sb->s_dev : 0; |
192 | __entry->sync_mode = work->sync_mode; | 256 | __entry->sync_mode = work->sync_mode; |
@@ -194,9 +258,10 @@ DECLARE_EVENT_CLASS(writeback_work_class, | |||
194 | __entry->range_cyclic = work->range_cyclic; | 258 | __entry->range_cyclic = work->range_cyclic; |
195 | __entry->for_background = work->for_background; | 259 | __entry->for_background = work->for_background; |
196 | __entry->reason = work->reason; | 260 | __entry->reason = work->reason; |
261 | __trace_wb_assign_cgroup(__get_str(cgroup), wb); | ||
197 | ), | 262 | ), |
198 | TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " | 263 | TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " |
199 | "kupdate=%d range_cyclic=%d background=%d reason=%s", | 264 | "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s", |
200 | __entry->name, | 265 | __entry->name, |
201 | MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), | 266 | MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), |
202 | __entry->nr_pages, | 267 | __entry->nr_pages, |
@@ -204,13 +269,14 @@ DECLARE_EVENT_CLASS(writeback_work_class, | |||
204 | __entry->for_kupdate, | 269 | __entry->for_kupdate, |
205 | __entry->range_cyclic, | 270 | __entry->range_cyclic, |
206 | __entry->for_background, | 271 | __entry->for_background, |
207 | __print_symbolic(__entry->reason, WB_WORK_REASON) | 272 | __print_symbolic(__entry->reason, WB_WORK_REASON), |
273 | __get_str(cgroup) | ||
208 | ) | 274 | ) |
209 | ); | 275 | ); |
210 | #define DEFINE_WRITEBACK_WORK_EVENT(name) \ | 276 | #define DEFINE_WRITEBACK_WORK_EVENT(name) \ |
211 | DEFINE_EVENT(writeback_work_class, name, \ | 277 | DEFINE_EVENT(writeback_work_class, name, \ |
212 | TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ | 278 | TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ |
213 | TP_ARGS(bdi, work)) | 279 | TP_ARGS(wb, work)) |
214 | DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); | 280 | DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); |
215 | DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); | 281 | DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); |
216 | DEFINE_WRITEBACK_WORK_EVENT(writeback_start); | 282 | DEFINE_WRITEBACK_WORK_EVENT(writeback_start); |
@@ -230,26 +296,42 @@ TRACE_EVENT(writeback_pages_written, | |||
230 | ); | 296 | ); |
231 | 297 | ||
232 | DECLARE_EVENT_CLASS(writeback_class, | 298 | DECLARE_EVENT_CLASS(writeback_class, |
233 | TP_PROTO(struct backing_dev_info *bdi), | 299 | TP_PROTO(struct bdi_writeback *wb), |
234 | TP_ARGS(bdi), | 300 | TP_ARGS(wb), |
235 | TP_STRUCT__entry( | 301 | TP_STRUCT__entry( |
236 | __array(char, name, 32) | 302 | __array(char, name, 32) |
303 | __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) | ||
237 | ), | 304 | ), |
238 | TP_fast_assign( | 305 | TP_fast_assign( |
239 | strncpy(__entry->name, dev_name(bdi->dev), 32); | 306 | strncpy(__entry->name, dev_name(wb->bdi->dev), 32); |
307 | __trace_wb_assign_cgroup(__get_str(cgroup), wb); | ||
240 | ), | 308 | ), |
241 | TP_printk("bdi %s", | 309 | TP_printk("bdi %s: cgroup=%s", |
242 | __entry->name | 310 | __entry->name, |
311 | __get_str(cgroup) | ||
243 | ) | 312 | ) |
244 | ); | 313 | ); |
245 | #define DEFINE_WRITEBACK_EVENT(name) \ | 314 | #define DEFINE_WRITEBACK_EVENT(name) \ |
246 | DEFINE_EVENT(writeback_class, name, \ | 315 | DEFINE_EVENT(writeback_class, name, \ |
247 | TP_PROTO(struct backing_dev_info *bdi), \ | 316 | TP_PROTO(struct bdi_writeback *wb), \ |
248 | TP_ARGS(bdi)) | 317 | TP_ARGS(wb)) |
249 | 318 | ||
250 | DEFINE_WRITEBACK_EVENT(writeback_nowork); | 319 | DEFINE_WRITEBACK_EVENT(writeback_nowork); |
251 | DEFINE_WRITEBACK_EVENT(writeback_wake_background); | 320 | DEFINE_WRITEBACK_EVENT(writeback_wake_background); |
252 | DEFINE_WRITEBACK_EVENT(writeback_bdi_register); | 321 | |
322 | TRACE_EVENT(writeback_bdi_register, | ||
323 | TP_PROTO(struct backing_dev_info *bdi), | ||
324 | TP_ARGS(bdi), | ||
325 | TP_STRUCT__entry( | ||
326 | __array(char, name, 32) | ||
327 | ), | ||
328 | TP_fast_assign( | ||
329 | strncpy(__entry->name, dev_name(bdi->dev), 32); | ||
330 | ), | ||
331 | TP_printk("bdi %s", | ||
332 | __entry->name | ||
333 | ) | ||
334 | ); | ||
253 | 335 | ||
254 | DECLARE_EVENT_CLASS(wbc_class, | 336 | DECLARE_EVENT_CLASS(wbc_class, |
255 | TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), | 337 | TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), |
@@ -265,6 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class, | |||
265 | __field(int, range_cyclic) | 347 | __field(int, range_cyclic) |
266 | __field(long, range_start) | 348 | __field(long, range_start) |
267 | __field(long, range_end) | 349 | __field(long, range_end) |
350 | __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) | ||
268 | ), | 351 | ), |
269 | 352 | ||
270 | TP_fast_assign( | 353 | TP_fast_assign( |
@@ -278,11 +361,12 @@ DECLARE_EVENT_CLASS(wbc_class, | |||
278 | __entry->range_cyclic = wbc->range_cyclic; | 361 | __entry->range_cyclic = wbc->range_cyclic; |
279 | __entry->range_start = (long)wbc->range_start; | 362 | __entry->range_start = (long)wbc->range_start; |
280 | __entry->range_end = (long)wbc->range_end; | 363 | __entry->range_end = (long)wbc->range_end; |
364 | __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); | ||
281 | ), | 365 | ), |
282 | 366 | ||
283 | TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " | 367 | TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " |
284 | "bgrd=%d reclm=%d cyclic=%d " | 368 | "bgrd=%d reclm=%d cyclic=%d " |
285 | "start=0x%lx end=0x%lx", | 369 | "start=0x%lx end=0x%lx cgroup=%s", |
286 | __entry->name, | 370 | __entry->name, |
287 | __entry->nr_to_write, | 371 | __entry->nr_to_write, |
288 | __entry->pages_skipped, | 372 | __entry->pages_skipped, |
@@ -292,7 +376,9 @@ DECLARE_EVENT_CLASS(wbc_class, | |||
292 | __entry->for_reclaim, | 376 | __entry->for_reclaim, |
293 | __entry->range_cyclic, | 377 | __entry->range_cyclic, |
294 | __entry->range_start, | 378 | __entry->range_start, |
295 | __entry->range_end) | 379 | __entry->range_end, |
380 | __get_str(cgroup) | ||
381 | ) | ||
296 | ) | 382 | ) |
297 | 383 | ||
298 | #define DEFINE_WBC_EVENT(name) \ | 384 | #define DEFINE_WBC_EVENT(name) \ |
@@ -312,6 +398,7 @@ TRACE_EVENT(writeback_queue_io, | |||
312 | __field(long, age) | 398 | __field(long, age) |
313 | __field(int, moved) | 399 | __field(int, moved) |
314 | __field(int, reason) | 400 | __field(int, reason) |
401 | __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) | ||
315 | ), | 402 | ), |
316 | TP_fast_assign( | 403 | TP_fast_assign( |
317 | unsigned long *older_than_this = work->older_than_this; | 404 | unsigned long *older_than_this = work->older_than_this; |
@@ -321,13 +408,15 @@ TRACE_EVENT(writeback_queue_io, | |||
321 | (jiffies - *older_than_this) * 1000 / HZ : -1; | 408 | (jiffies - *older_than_this) * 1000 / HZ : -1; |
322 | __entry->moved = moved; | 409 | __entry->moved = moved; |
323 | __entry->reason = work->reason; | 410 | __entry->reason = work->reason; |
411 | __trace_wb_assign_cgroup(__get_str(cgroup), wb); | ||
324 | ), | 412 | ), |
325 | TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s", | 413 | TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s", |
326 | __entry->name, | 414 | __entry->name, |
327 | __entry->older, /* older_than_this in jiffies */ | 415 | __entry->older, /* older_than_this in jiffies */ |
328 | __entry->age, /* older_than_this in relative milliseconds */ | 416 | __entry->age, /* older_than_this in relative milliseconds */ |
329 | __entry->moved, | 417 | __entry->moved, |
330 | __print_symbolic(__entry->reason, WB_WORK_REASON) | 418 | __print_symbolic(__entry->reason, WB_WORK_REASON), |
419 | __get_str(cgroup) | ||
331 | ) | 420 | ) |
332 | ); | 421 | ); |
333 | 422 | ||
@@ -381,11 +470,11 @@ TRACE_EVENT(global_dirty_state, | |||
381 | 470 | ||
382 | TRACE_EVENT(bdi_dirty_ratelimit, | 471 | TRACE_EVENT(bdi_dirty_ratelimit, |
383 | 472 | ||
384 | TP_PROTO(struct backing_dev_info *bdi, | 473 | TP_PROTO(struct bdi_writeback *wb, |
385 | unsigned long dirty_rate, | 474 | unsigned long dirty_rate, |
386 | unsigned long task_ratelimit), | 475 | unsigned long task_ratelimit), |
387 | 476 | ||
388 | TP_ARGS(bdi, dirty_rate, task_ratelimit), | 477 | TP_ARGS(wb, dirty_rate, task_ratelimit), |
389 | 478 | ||
390 | TP_STRUCT__entry( | 479 | TP_STRUCT__entry( |
391 | __array(char, bdi, 32) | 480 | __array(char, bdi, 32) |
@@ -395,36 +484,39 @@ TRACE_EVENT(bdi_dirty_ratelimit, | |||
395 | __field(unsigned long, dirty_ratelimit) | 484 | __field(unsigned long, dirty_ratelimit) |
396 | __field(unsigned long, task_ratelimit) | 485 | __field(unsigned long, task_ratelimit) |
397 | __field(unsigned long, balanced_dirty_ratelimit) | 486 | __field(unsigned long, balanced_dirty_ratelimit) |
487 | __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) | ||
398 | ), | 488 | ), |
399 | 489 | ||
400 | TP_fast_assign( | 490 | TP_fast_assign( |
401 | strlcpy(__entry->bdi, dev_name(bdi->dev), 32); | 491 | strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); |
402 | __entry->write_bw = KBps(bdi->wb.write_bandwidth); | 492 | __entry->write_bw = KBps(wb->write_bandwidth); |
403 | __entry->avg_write_bw = KBps(bdi->wb.avg_write_bandwidth); | 493 | __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); |
404 | __entry->dirty_rate = KBps(dirty_rate); | 494 | __entry->dirty_rate = KBps(dirty_rate); |
405 | __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit); | 495 | __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); |
406 | __entry->task_ratelimit = KBps(task_ratelimit); | 496 | __entry->task_ratelimit = KBps(task_ratelimit); |
407 | __entry->balanced_dirty_ratelimit = | 497 | __entry->balanced_dirty_ratelimit = |
408 | KBps(bdi->wb.balanced_dirty_ratelimit); | 498 | KBps(wb->balanced_dirty_ratelimit); |
499 | __trace_wb_assign_cgroup(__get_str(cgroup), wb); | ||
409 | ), | 500 | ), |
410 | 501 | ||
411 | TP_printk("bdi %s: " | 502 | TP_printk("bdi %s: " |
412 | "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " | 503 | "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " |
413 | "dirty_ratelimit=%lu task_ratelimit=%lu " | 504 | "dirty_ratelimit=%lu task_ratelimit=%lu " |
414 | "balanced_dirty_ratelimit=%lu", | 505 | "balanced_dirty_ratelimit=%lu cgroup=%s", |
415 | __entry->bdi, | 506 | __entry->bdi, |
416 | __entry->write_bw, /* write bandwidth */ | 507 | __entry->write_bw, /* write bandwidth */ |
417 | __entry->avg_write_bw, /* avg write bandwidth */ | 508 | __entry->avg_write_bw, /* avg write bandwidth */ |
418 | __entry->dirty_rate, /* bdi dirty rate */ | 509 | __entry->dirty_rate, /* bdi dirty rate */ |
419 | __entry->dirty_ratelimit, /* base ratelimit */ | 510 | __entry->dirty_ratelimit, /* base ratelimit */ |
420 | __entry->task_ratelimit, /* ratelimit with position control */ | 511 | __entry->task_ratelimit, /* ratelimit with position control */ |
421 | __entry->balanced_dirty_ratelimit /* the balanced ratelimit */ | 512 | __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ |
513 | __get_str(cgroup) | ||
422 | ) | 514 | ) |
423 | ); | 515 | ); |
424 | 516 | ||
425 | TRACE_EVENT(balance_dirty_pages, | 517 | TRACE_EVENT(balance_dirty_pages, |
426 | 518 | ||
427 | TP_PROTO(struct backing_dev_info *bdi, | 519 | TP_PROTO(struct bdi_writeback *wb, |
428 | unsigned long thresh, | 520 | unsigned long thresh, |
429 | unsigned long bg_thresh, | 521 | unsigned long bg_thresh, |
430 | unsigned long dirty, | 522 | unsigned long dirty, |
@@ -437,7 +529,7 @@ TRACE_EVENT(balance_dirty_pages, | |||
437 | long pause, | 529 | long pause, |
438 | unsigned long start_time), | 530 | unsigned long start_time), |
439 | 531 | ||
440 | TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, | 532 | TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, |
441 | dirty_ratelimit, task_ratelimit, | 533 | dirty_ratelimit, task_ratelimit, |
442 | dirtied, period, pause, start_time), | 534 | dirtied, period, pause, start_time), |
443 | 535 | ||
@@ -456,11 +548,12 @@ TRACE_EVENT(balance_dirty_pages, | |||
456 | __field( long, pause) | 548 | __field( long, pause) |
457 | __field(unsigned long, period) | 549 | __field(unsigned long, period) |
458 | __field( long, think) | 550 | __field( long, think) |
551 | __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) | ||
459 | ), | 552 | ), |
460 | 553 | ||
461 | TP_fast_assign( | 554 | TP_fast_assign( |
462 | unsigned long freerun = (thresh + bg_thresh) / 2; | 555 | unsigned long freerun = (thresh + bg_thresh) / 2; |
463 | strlcpy(__entry->bdi, dev_name(bdi->dev), 32); | 556 | strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); |
464 | 557 | ||
465 | __entry->limit = global_wb_domain.dirty_limit; | 558 | __entry->limit = global_wb_domain.dirty_limit; |
466 | __entry->setpoint = (global_wb_domain.dirty_limit + | 559 | __entry->setpoint = (global_wb_domain.dirty_limit + |
@@ -478,6 +571,7 @@ TRACE_EVENT(balance_dirty_pages, | |||
478 | __entry->period = period * 1000 / HZ; | 571 | __entry->period = period * 1000 / HZ; |
479 | __entry->pause = pause * 1000 / HZ; | 572 | __entry->pause = pause * 1000 / HZ; |
480 | __entry->paused = (jiffies - start_time) * 1000 / HZ; | 573 | __entry->paused = (jiffies - start_time) * 1000 / HZ; |
574 | __trace_wb_assign_cgroup(__get_str(cgroup), wb); | ||
481 | ), | 575 | ), |
482 | 576 | ||
483 | 577 | ||
@@ -486,7 +580,7 @@ TRACE_EVENT(balance_dirty_pages, | |||
486 | "bdi_setpoint=%lu bdi_dirty=%lu " | 580 | "bdi_setpoint=%lu bdi_dirty=%lu " |
487 | "dirty_ratelimit=%lu task_ratelimit=%lu " | 581 | "dirty_ratelimit=%lu task_ratelimit=%lu " |
488 | "dirtied=%u dirtied_pause=%u " | 582 | "dirtied=%u dirtied_pause=%u " |
489 | "paused=%lu pause=%ld period=%lu think=%ld", | 583 | "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s", |
490 | __entry->bdi, | 584 | __entry->bdi, |
491 | __entry->limit, | 585 | __entry->limit, |
492 | __entry->setpoint, | 586 | __entry->setpoint, |
@@ -500,7 +594,8 @@ TRACE_EVENT(balance_dirty_pages, | |||
500 | __entry->paused, /* ms */ | 594 | __entry->paused, /* ms */ |
501 | __entry->pause, /* ms */ | 595 | __entry->pause, /* ms */ |
502 | __entry->period, /* ms */ | 596 | __entry->period, /* ms */ |
503 | __entry->think /* ms */ | 597 | __entry->think, /* ms */ |
598 | __get_str(cgroup) | ||
504 | ) | 599 | ) |
505 | ); | 600 | ); |
506 | 601 | ||
@@ -514,6 +609,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue, | |||
514 | __field(unsigned long, ino) | 609 | __field(unsigned long, ino) |
515 | __field(unsigned long, state) | 610 | __field(unsigned long, state) |
516 | __field(unsigned long, dirtied_when) | 611 | __field(unsigned long, dirtied_when) |
612 | __dynamic_array(char, cgroup, | ||
613 | __trace_wb_cgroup_size(inode_to_wb(inode))) | ||
517 | ), | 614 | ), |
518 | 615 | ||
519 | TP_fast_assign( | 616 | TP_fast_assign( |
@@ -522,14 +619,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue, | |||
522 | __entry->ino = inode->i_ino; | 619 | __entry->ino = inode->i_ino; |
523 | __entry->state = inode->i_state; | 620 | __entry->state = inode->i_state; |
524 | __entry->dirtied_when = inode->dirtied_when; | 621 | __entry->dirtied_when = inode->dirtied_when; |
622 | __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode)); | ||
525 | ), | 623 | ), |
526 | 624 | ||
527 | TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu", | 625 | TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s", |
528 | __entry->name, | 626 | __entry->name, |
529 | __entry->ino, | 627 | __entry->ino, |
530 | show_inode_state(__entry->state), | 628 | show_inode_state(__entry->state), |
531 | __entry->dirtied_when, | 629 | __entry->dirtied_when, |
532 | (jiffies - __entry->dirtied_when) / HZ | 630 | (jiffies - __entry->dirtied_when) / HZ, |
631 | __get_str(cgroup) | ||
533 | ) | 632 | ) |
534 | ); | 633 | ); |
535 | 634 | ||
@@ -585,6 +684,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, | |||
585 | __field(unsigned long, writeback_index) | 684 | __field(unsigned long, writeback_index) |
586 | __field(long, nr_to_write) | 685 | __field(long, nr_to_write) |
587 | __field(unsigned long, wrote) | 686 | __field(unsigned long, wrote) |
687 | __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) | ||
588 | ), | 688 | ), |
589 | 689 | ||
590 | TP_fast_assign( | 690 | TP_fast_assign( |
@@ -596,10 +696,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, | |||
596 | __entry->writeback_index = inode->i_mapping->writeback_index; | 696 | __entry->writeback_index = inode->i_mapping->writeback_index; |
597 | __entry->nr_to_write = nr_to_write; | 697 | __entry->nr_to_write = nr_to_write; |
598 | __entry->wrote = nr_to_write - wbc->nr_to_write; | 698 | __entry->wrote = nr_to_write - wbc->nr_to_write; |
699 | __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); | ||
599 | ), | 700 | ), |
600 | 701 | ||
601 | TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " | 702 | TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " |
602 | "index=%lu to_write=%ld wrote=%lu", | 703 | "index=%lu to_write=%ld wrote=%lu cgroup=%s", |
603 | __entry->name, | 704 | __entry->name, |
604 | __entry->ino, | 705 | __entry->ino, |
605 | show_inode_state(__entry->state), | 706 | show_inode_state(__entry->state), |
@@ -607,7 +708,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, | |||
607 | (jiffies - __entry->dirtied_when) / HZ, | 708 | (jiffies - __entry->dirtied_when) / HZ, |
608 | __entry->writeback_index, | 709 | __entry->writeback_index, |
609 | __entry->nr_to_write, | 710 | __entry->nr_to_write, |
610 | __entry->wrote | 711 | __entry->wrote, |
712 | __get_str(cgroup) | ||
611 | ) | 713 | ) |
612 | ); | 714 | ); |
613 | 715 | ||
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ee8d7fd07be3..2df8ddcb0ca0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -523,7 +523,7 @@ static int cgwb_create(struct backing_dev_info *bdi, | |||
523 | int ret = 0; | 523 | int ret = 0; |
524 | 524 | ||
525 | memcg = mem_cgroup_from_css(memcg_css); | 525 | memcg = mem_cgroup_from_css(memcg_css); |
526 | blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys); | 526 | blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); |
527 | blkcg = css_to_blkcg(blkcg_css); | 527 | blkcg = css_to_blkcg(blkcg_css); |
528 | memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); | 528 | memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); |
529 | blkcg_cgwb_list = &blkcg->cgwb_list; | 529 | blkcg_cgwb_list = &blkcg->cgwb_list; |
@@ -645,7 +645,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, | |||
645 | 645 | ||
646 | /* see whether the blkcg association has changed */ | 646 | /* see whether the blkcg association has changed */ |
647 | blkcg_css = cgroup_get_e_css(memcg_css->cgroup, | 647 | blkcg_css = cgroup_get_e_css(memcg_css->cgroup, |
648 | &blkio_cgrp_subsys); | 648 | &io_cgrp_subsys); |
649 | if (unlikely(wb->blkcg_css != blkcg_css || | 649 | if (unlikely(wb->blkcg_css != blkcg_css || |
650 | !wb_tryget(wb))) | 650 | !wb_tryget(wb))) |
651 | wb = NULL; | 651 | wb = NULL; |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5cccc127ef81..0a931cdd4f6b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1289,7 +1289,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, | |||
1289 | wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); | 1289 | wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); |
1290 | wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; | 1290 | wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; |
1291 | 1291 | ||
1292 | trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit); | 1292 | trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); |
1293 | } | 1293 | } |
1294 | 1294 | ||
1295 | static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, | 1295 | static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, |
@@ -1683,7 +1683,7 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
1683 | * do a reset, as it may be a light dirtier. | 1683 | * do a reset, as it may be a light dirtier. |
1684 | */ | 1684 | */ |
1685 | if (pause < min_pause) { | 1685 | if (pause < min_pause) { |
1686 | trace_balance_dirty_pages(bdi, | 1686 | trace_balance_dirty_pages(wb, |
1687 | sdtc->thresh, | 1687 | sdtc->thresh, |
1688 | sdtc->bg_thresh, | 1688 | sdtc->bg_thresh, |
1689 | sdtc->dirty, | 1689 | sdtc->dirty, |
@@ -1712,7 +1712,7 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
1712 | } | 1712 | } |
1713 | 1713 | ||
1714 | pause: | 1714 | pause: |
1715 | trace_balance_dirty_pages(bdi, | 1715 | trace_balance_dirty_pages(wb, |
1716 | sdtc->thresh, | 1716 | sdtc->thresh, |
1717 | sdtc->bg_thresh, | 1717 | sdtc->bg_thresh, |
1718 | sdtc->dirty, | 1718 | sdtc->dirty, |