aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-09-10 21:56:14 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2015-09-10 21:56:14 -0400
commitb0a1ea51bda4c2bcdde460221e1772f3a4f8c44f (patch)
tree9684c11b72718cd7e96e5eb93298690269ecf447 /include
parent33e247c7e58d335d70ecb84fd869091e2e4b8dcb (diff)
parent69d7fde5909b614114343974cfc52cb8ff30b544 (diff)
Merge branch 'for-4.3/blkcg' of git://git.kernel.dk/linux-block
Pull blk-cg updates from Jens Axboe: "A bit later in the cycle, but this has been in the block tree for a while. This is basically four patchsets from Tejun, that improve our buffered cgroup writeback. It was dependent on the other cgroup changes, but they went in earlier in this cycle. Series 1 is a set of 5 patches that has cgroup writeback updates: - bdi_writeback iteration fix which could lead to some wb's being skipped or repeated during e.g. sync under memory pressure. - Simplification of wb work wait mechanism. - Writeback tracepoints updated to report cgroup. Series 2 is a set of updates for the CFQ cgroup writeback handling: cfq has always charged all async IOs to the root cgroup. It didn't have much choice as writeback didn't know about cgroups and there was no way to tell who to blame for a given writeback IO. writeback finally grew support for cgroups and now tags each writeback IO with the appropriate cgroup to charge it against. This patchset updates cfq so that it follows the blkcg each bio is tagged with. Async cfq_queues are now shared across cfq_group, which is per-cgroup, instead of per-request_queue cfq_data. This makes all IOs follow the weight based IO resource distribution implemented by cfq. - Switched from GFP_ATOMIC to GFP_NOWAIT as suggested by Jeff. - Other misc review points addressed, acks added and rebased. Series 3 is the blkcg policy cleanup patches: This patchset contains assorted cleanups for blkcg_policy methods and blk[c]g_policy_data handling. - alloc/free added for blkg_policy_data. exit dropped. - alloc/free added for blkcg_policy_data. - blk-throttle's async percpu allocation is replaced with direct allocation. - all methods now take blk[c]g_policy_data instead of blkcg_gq or blkcg. And finally, series 4 is a set of patches cleaning up the blkcg stats handling: blkcg's stats have always been somewhat of a mess. This patchset tries to improve the situation a bit. 
- The following patches added to consolidate blkcg entry point and blkg creation. This in itself is an improvement and helps collecting common stats on bio issue. - per-blkg stats now accounted on bio issue rather than request completion so that bio based and request based drivers can behave the same way. The issue was spotted by Vivek. - cfq-iosched implements custom recursive stats and blk-throttle implements custom per-cpu stats. This patchset makes blkcg core support both by default. - cfq-iosched and blk-throttle keep track of the same stats multiple times. Unify them" * 'for-4.3/blkcg' of git://git.kernel.dk/linux-block: (45 commits) blkcg: use CGROUP_WEIGHT_* scale for io.weight on the unified hierarchy blkcg: s/CFQ_WEIGHT_*/CFQ_WEIGHT_LEGACY_*/ blkcg: implement interface for the unified hierarchy blkcg: misc preparations for unified hierarchy interface blkcg: separate out tg_conf_updated() from tg_set_conf() blkcg: move body parsing from blkg_conf_prep() to its callers blkcg: mark existing cftypes as legacy blkcg: rename subsystem name from blkio to io blkcg: refine error codes returned during blkcg configuration blkcg: remove unnecessary NULL checks from __cfqg_set_weight_device() blkcg: reduce stack usage of blkg_rwstat_recursive_sum() blkcg: remove cfqg_stats->sectors blkcg: move io_service_bytes and io_serviced stats into blkcg_gq blkcg: make blkg_[rw]stat_recursive_sum() to be able to index into blkcg_gq blkcg: make blkcg_[rw]stat per-cpu blkcg: add blkg_[rw]stat->aux_cnt and replace cfq_group->dead_stats with it blkcg: consolidate blkg creation in blkcg_bio_issue_check() blk-throttle: improve queue bypass handling blkcg: move root blkg lookup optimization from throtl_lookup_tg() to __blkg_lookup() blkcg: inline [__]blkg_lookup() ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/backing-dev.h26
-rw-r--r--include/linux/blk-cgroup.h340
-rw-r--r--include/linux/cgroup_subsys.h2
-rw-r--r--include/linux/kernfs.h4
-rw-r--r--include/trace/events/writeback.h180
5 files changed, 397 insertions, 155 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0fe9df983ab7..5a5d79ee256f 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi
286 * %current's blkcg equals the effective blkcg of its memcg. No 286 * %current's blkcg equals the effective blkcg of its memcg. No
287 * need to use the relatively expensive cgroup_get_e_css(). 287 * need to use the relatively expensive cgroup_get_e_css().
288 */ 288 */
289 if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) 289 if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id)))
290 return wb; 290 return wb;
291 return NULL; 291 return NULL;
292} 292}
@@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
402} 402}
403 403
404struct wb_iter { 404struct wb_iter {
405 int start_blkcg_id; 405 int start_memcg_id;
406 struct radix_tree_iter tree_iter; 406 struct radix_tree_iter tree_iter;
407 void **slot; 407 void **slot;
408}; 408};
@@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
414 414
415 WARN_ON_ONCE(!rcu_read_lock_held()); 415 WARN_ON_ONCE(!rcu_read_lock_held());
416 416
417 if (iter->start_blkcg_id >= 0) { 417 if (iter->start_memcg_id >= 0) {
418 iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id); 418 iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
419 iter->start_blkcg_id = -1; 419 iter->start_memcg_id = -1;
420 } else { 420 } else {
421 iter->slot = radix_tree_next_slot(iter->slot, titer, 0); 421 iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
422 } 422 }
@@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
430 430
431static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter, 431static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
432 struct backing_dev_info *bdi, 432 struct backing_dev_info *bdi,
433 int start_blkcg_id) 433 int start_memcg_id)
434{ 434{
435 iter->start_blkcg_id = start_blkcg_id; 435 iter->start_memcg_id = start_memcg_id;
436 436
437 if (start_blkcg_id) 437 if (start_memcg_id)
438 return __wb_iter_next(iter, bdi); 438 return __wb_iter_next(iter, bdi);
439 else 439 else
440 return &bdi->wb; 440 return &bdi->wb;
441} 441}
442 442
443/** 443/**
444 * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order 444 * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
445 * @wb_cur: cursor struct bdi_writeback pointer 445 * @wb_cur: cursor struct bdi_writeback pointer
446 * @bdi: bdi to walk wb's of 446 * @bdi: bdi to walk wb's of
447 * @iter: pointer to struct wb_iter to be used as iteration buffer 447 * @iter: pointer to struct wb_iter to be used as iteration buffer
448 * @start_blkcg_id: blkcg ID to start iteration from 448 * @start_memcg_id: memcg ID to start iteration from
449 * 449 *
450 * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending 450 * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
451 * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter 451 * memcg ID order starting from @start_memcg_id. @iter is struct wb_iter
452 * to be used as temp storage during iteration. rcu_read_lock() must be 452 * to be used as temp storage during iteration. rcu_read_lock() must be
453 * held throughout iteration. 453 * held throughout iteration.
454 */ 454 */
455#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ 455#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \
456 for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \ 456 for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \
457 (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi)) 457 (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
458 458
459#else /* CONFIG_CGROUP_WRITEBACK */ 459#else /* CONFIG_CGROUP_WRITEBACK */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index a4cd1641e9e2..0a5cc7a1109b 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -14,12 +14,15 @@
14 */ 14 */
15 15
16#include <linux/cgroup.h> 16#include <linux/cgroup.h>
17#include <linux/u64_stats_sync.h> 17#include <linux/percpu_counter.h>
18#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <linux/radix-tree.h> 19#include <linux/radix-tree.h>
20#include <linux/blkdev.h> 20#include <linux/blkdev.h>
21#include <linux/atomic.h> 21#include <linux/atomic.h>
22 22
23/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
24#define BLKG_STAT_CPU_BATCH (INT_MAX / 2)
25
23/* Max limits for throttle policy */ 26/* Max limits for throttle policy */
24#define THROTL_IOPS_MAX UINT_MAX 27#define THROTL_IOPS_MAX UINT_MAX
25 28
@@ -45,7 +48,7 @@ struct blkcg {
45 struct blkcg_gq *blkg_hint; 48 struct blkcg_gq *blkg_hint;
46 struct hlist_head blkg_list; 49 struct hlist_head blkg_list;
47 50
48 struct blkcg_policy_data *pd[BLKCG_MAX_POLS]; 51 struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
49 52
50 struct list_head all_blkcgs_node; 53 struct list_head all_blkcgs_node;
51#ifdef CONFIG_CGROUP_WRITEBACK 54#ifdef CONFIG_CGROUP_WRITEBACK
@@ -53,14 +56,19 @@ struct blkcg {
53#endif 56#endif
54}; 57};
55 58
59/*
60 * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
61 * recursive. Used to carry stats of dead children, and, for blkg_rwstat,
62 * to carry result values from read and sum operations.
63 */
56struct blkg_stat { 64struct blkg_stat {
57 struct u64_stats_sync syncp; 65 struct percpu_counter cpu_cnt;
58 uint64_t cnt; 66 atomic64_t aux_cnt;
59}; 67};
60 68
61struct blkg_rwstat { 69struct blkg_rwstat {
62 struct u64_stats_sync syncp; 70 struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR];
63 uint64_t cnt[BLKG_RWSTAT_NR]; 71 atomic64_t aux_cnt[BLKG_RWSTAT_NR];
64}; 72};
65 73
66/* 74/*
@@ -68,32 +76,28 @@ struct blkg_rwstat {
68 * request_queue (q). This is used by blkcg policies which need to track 76 * request_queue (q). This is used by blkcg policies which need to track
69 * information per blkcg - q pair. 77 * information per blkcg - q pair.
70 * 78 *
71 * There can be multiple active blkcg policies and each has its private 79 * There can be multiple active blkcg policies and each blkg:policy pair is
72 * data on each blkg, the size of which is determined by 80 * represented by a blkg_policy_data which is allocated and freed by each
73 * blkcg_policy->pd_size. blkcg core allocates and frees such areas 81 * policy's pd_alloc/free_fn() methods. A policy can allocate private data
74 * together with blkg and invokes pd_init/exit_fn() methods. 82 * area by allocating larger data structure which embeds blkg_policy_data
75 * 83 * at the beginning.
76 * Such private data must embed struct blkg_policy_data (pd) at the
77 * beginning and pd_size can't be smaller than pd.
78 */ 84 */
79struct blkg_policy_data { 85struct blkg_policy_data {
80 /* the blkg and policy id this per-policy data belongs to */ 86 /* the blkg and policy id this per-policy data belongs to */
81 struct blkcg_gq *blkg; 87 struct blkcg_gq *blkg;
82 int plid; 88 int plid;
83
84 /* used during policy activation */
85 struct list_head alloc_node;
86}; 89};
87 90
88/* 91/*
89 * Policies that need to keep per-blkcg data which is independent 92 * Policies that need to keep per-blkcg data which is independent from any
90 * from any request_queue associated to it must specify its size 93 * request_queue associated to it should implement cpd_alloc/free_fn()
91 * with the cpd_size field of the blkcg_policy structure and 94 * methods. A policy can allocate private data area by allocating larger
92 * embed a blkcg_policy_data in it. cpd_init() is invoked to let 95 * data structure which embeds blkcg_policy_data at the beginning.
93 * each policy handle per-blkcg data. 96 * cpd_init() is invoked to let each policy handle per-blkcg data.
94 */ 97 */
95struct blkcg_policy_data { 98struct blkcg_policy_data {
96 /* the policy id this per-policy data belongs to */ 99 /* the blkcg and policy id this per-policy data belongs to */
100 struct blkcg *blkcg;
97 int plid; 101 int plid;
98}; 102};
99 103
@@ -123,40 +127,50 @@ struct blkcg_gq {
123 /* is this blkg online? protected by both blkcg and q locks */ 127 /* is this blkg online? protected by both blkcg and q locks */
124 bool online; 128 bool online;
125 129
130 struct blkg_rwstat stat_bytes;
131 struct blkg_rwstat stat_ios;
132
126 struct blkg_policy_data *pd[BLKCG_MAX_POLS]; 133 struct blkg_policy_data *pd[BLKCG_MAX_POLS];
127 134
128 struct rcu_head rcu_head; 135 struct rcu_head rcu_head;
129}; 136};
130 137
131typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); 138typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
132typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); 139typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
133typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); 140typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
134typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); 141typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
135typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); 142typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
136typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); 143typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
144typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
145typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
146typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
147typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
137 148
138struct blkcg_policy { 149struct blkcg_policy {
139 int plid; 150 int plid;
140 /* policy specific private data size */
141 size_t pd_size;
142 /* policy specific per-blkcg data size */
143 size_t cpd_size;
144 /* cgroup files for the policy */ 151 /* cgroup files for the policy */
145 struct cftype *cftypes; 152 struct cftype *dfl_cftypes;
153 struct cftype *legacy_cftypes;
146 154
147 /* operations */ 155 /* operations */
156 blkcg_pol_alloc_cpd_fn *cpd_alloc_fn;
148 blkcg_pol_init_cpd_fn *cpd_init_fn; 157 blkcg_pol_init_cpd_fn *cpd_init_fn;
158 blkcg_pol_free_cpd_fn *cpd_free_fn;
159 blkcg_pol_bind_cpd_fn *cpd_bind_fn;
160
161 blkcg_pol_alloc_pd_fn *pd_alloc_fn;
149 blkcg_pol_init_pd_fn *pd_init_fn; 162 blkcg_pol_init_pd_fn *pd_init_fn;
150 blkcg_pol_online_pd_fn *pd_online_fn; 163 blkcg_pol_online_pd_fn *pd_online_fn;
151 blkcg_pol_offline_pd_fn *pd_offline_fn; 164 blkcg_pol_offline_pd_fn *pd_offline_fn;
152 blkcg_pol_exit_pd_fn *pd_exit_fn; 165 blkcg_pol_free_pd_fn *pd_free_fn;
153 blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; 166 blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
154}; 167};
155 168
156extern struct blkcg blkcg_root; 169extern struct blkcg blkcg_root;
157extern struct cgroup_subsys_state * const blkcg_root_css; 170extern struct cgroup_subsys_state * const blkcg_root_css;
158 171
159struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); 172struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
173 struct request_queue *q, bool update_hint);
160struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 174struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
161 struct request_queue *q); 175 struct request_queue *q);
162int blkcg_init_queue(struct request_queue *q); 176int blkcg_init_queue(struct request_queue *q);
@@ -171,6 +185,7 @@ int blkcg_activate_policy(struct request_queue *q,
171void blkcg_deactivate_policy(struct request_queue *q, 185void blkcg_deactivate_policy(struct request_queue *q,
172 const struct blkcg_policy *pol); 186 const struct blkcg_policy *pol);
173 187
188const char *blkg_dev_name(struct blkcg_gq *blkg);
174void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, 189void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
175 u64 (*prfill)(struct seq_file *, 190 u64 (*prfill)(struct seq_file *,
176 struct blkg_policy_data *, int), 191 struct blkg_policy_data *, int),
@@ -182,19 +197,24 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
182u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); 197u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
183u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, 198u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
184 int off); 199 int off);
200int blkg_print_stat_bytes(struct seq_file *sf, void *v);
201int blkg_print_stat_ios(struct seq_file *sf, void *v);
202int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
203int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
185 204
186u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); 205u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
187struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, 206 struct blkcg_policy *pol, int off);
188 int off); 207struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
208 struct blkcg_policy *pol, int off);
189 209
190struct blkg_conf_ctx { 210struct blkg_conf_ctx {
191 struct gendisk *disk; 211 struct gendisk *disk;
192 struct blkcg_gq *blkg; 212 struct blkcg_gq *blkg;
193 u64 v; 213 char *body;
194}; 214};
195 215
196int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, 216int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
197 const char *input, struct blkg_conf_ctx *ctx); 217 char *input, struct blkg_conf_ctx *ctx);
198void blkg_conf_finish(struct blkg_conf_ctx *ctx); 218void blkg_conf_finish(struct blkg_conf_ctx *ctx);
199 219
200 220
@@ -205,7 +225,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
205 225
206static inline struct blkcg *task_blkcg(struct task_struct *tsk) 226static inline struct blkcg *task_blkcg(struct task_struct *tsk)
207{ 227{
208 return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); 228 return css_to_blkcg(task_css(tsk, io_cgrp_id));
209} 229}
210 230
211static inline struct blkcg *bio_blkcg(struct bio *bio) 231static inline struct blkcg *bio_blkcg(struct bio *bio)
@@ -218,7 +238,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
218static inline struct cgroup_subsys_state * 238static inline struct cgroup_subsys_state *
219task_get_blkcg_css(struct task_struct *task) 239task_get_blkcg_css(struct task_struct *task)
220{ 240{
221 return task_get_css(task, blkio_cgrp_id); 241 return task_get_css(task, io_cgrp_id);
222} 242}
223 243
224/** 244/**
@@ -233,6 +253,52 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
233} 253}
234 254
235/** 255/**
256 * __blkg_lookup - internal version of blkg_lookup()
257 * @blkcg: blkcg of interest
258 * @q: request_queue of interest
259 * @update_hint: whether to update lookup hint with the result or not
260 *
261 * This is internal version and shouldn't be used by policy
262 * implementations. Looks up blkgs for the @blkcg - @q pair regardless of
263 * @q's bypass state. If @update_hint is %true, the caller should be
264 * holding @q->queue_lock and lookup hint is updated on success.
265 */
266static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
267 struct request_queue *q,
268 bool update_hint)
269{
270 struct blkcg_gq *blkg;
271
272 if (blkcg == &blkcg_root)
273 return q->root_blkg;
274
275 blkg = rcu_dereference(blkcg->blkg_hint);
276 if (blkg && blkg->q == q)
277 return blkg;
278
279 return blkg_lookup_slowpath(blkcg, q, update_hint);
280}
281
282/**
283 * blkg_lookup - lookup blkg for the specified blkcg - q pair
284 * @blkcg: blkcg of interest
285 * @q: request_queue of interest
286 *
287 * Lookup blkg for the @blkcg - @q pair. This function should be called
288 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
289 * - see blk_queue_bypass_start() for details.
290 */
291static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
292 struct request_queue *q)
293{
294 WARN_ON_ONCE(!rcu_read_lock_held());
295
296 if (unlikely(blk_queue_bypass(q)))
297 return NULL;
298 return __blkg_lookup(blkcg, q, false);
299}
300
301/**
236 * blkg_to_pdata - get policy private data 302 * blkg_to_pdata - get policy private data
237 * @blkg: blkg of interest 303 * @blkg: blkg of interest
238 * @pol: policy of interest 304 * @pol: policy of interest
@@ -248,7 +314,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
248static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, 314static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
249 struct blkcg_policy *pol) 315 struct blkcg_policy *pol)
250{ 316{
251 return blkcg ? blkcg->pd[pol->plid] : NULL; 317 return blkcg ? blkcg->cpd[pol->plid] : NULL;
252} 318}
253 319
254/** 320/**
@@ -262,6 +328,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
262 return pd ? pd->blkg : NULL; 328 return pd ? pd->blkg : NULL;
263} 329}
264 330
331static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
332{
333 return cpd ? cpd->blkcg : NULL;
334}
335
265/** 336/**
266 * blkg_path - format cgroup path of blkg 337 * blkg_path - format cgroup path of blkg
267 * @blkg: blkg of interest 338 * @blkg: blkg of interest
@@ -309,9 +380,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
309 call_rcu(&blkg->rcu_head, __blkg_release_rcu); 380 call_rcu(&blkg->rcu_head, __blkg_release_rcu);
310} 381}
311 382
312struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
313 bool update_hint);
314
315/** 383/**
316 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants 384 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
317 * @d_blkg: loop cursor pointing to the current descendant 385 * @d_blkg: loop cursor pointing to the current descendant
@@ -373,8 +441,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q,
373 * or if either the blkcg or queue is going away. Fall back to 441 * or if either the blkcg or queue is going away. Fall back to
374 * root_rl in such cases. 442 * root_rl in such cases.
375 */ 443 */
376 blkg = blkg_lookup_create(blkcg, q); 444 blkg = blkg_lookup(blkcg, q);
377 if (IS_ERR(blkg)) 445 if (unlikely(!blkg))
378 goto root_rl; 446 goto root_rl;
379 447
380 blkg_get(blkg); 448 blkg_get(blkg);
@@ -394,8 +462,7 @@ root_rl:
394 */ 462 */
395static inline void blk_put_rl(struct request_list *rl) 463static inline void blk_put_rl(struct request_list *rl)
396{ 464{
397 /* root_rl may not have blkg set */ 465 if (rl->blkg->blkcg != &blkcg_root)
398 if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
399 blkg_put(rl->blkg); 466 blkg_put(rl->blkg);
400} 467}
401 468
@@ -433,9 +500,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
433#define blk_queue_for_each_rl(rl, q) \ 500#define blk_queue_for_each_rl(rl, q) \
434 for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) 501 for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
435 502
436static inline void blkg_stat_init(struct blkg_stat *stat) 503static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
437{ 504{
438 u64_stats_init(&stat->syncp); 505 int ret;
506
507 ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
508 if (ret)
509 return ret;
510
511 atomic64_set(&stat->aux_cnt, 0);
512 return 0;
513}
514
515static inline void blkg_stat_exit(struct blkg_stat *stat)
516{
517 percpu_counter_destroy(&stat->cpu_cnt);
439} 518}
440 519
441/** 520/**
@@ -443,34 +522,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat)
443 * @stat: target blkg_stat 522 * @stat: target blkg_stat
444 * @val: value to add 523 * @val: value to add
445 * 524 *
446 * Add @val to @stat. The caller is responsible for synchronizing calls to 525 * Add @val to @stat. The caller must ensure that IRQ on the same CPU
447 * this function. 526 * don't re-enter this function for the same counter.
448 */ 527 */
449static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) 528static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
450{ 529{
451 u64_stats_update_begin(&stat->syncp); 530 __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
452 stat->cnt += val;
453 u64_stats_update_end(&stat->syncp);
454} 531}
455 532
456/** 533/**
457 * blkg_stat_read - read the current value of a blkg_stat 534 * blkg_stat_read - read the current value of a blkg_stat
458 * @stat: blkg_stat to read 535 * @stat: blkg_stat to read
459 *
460 * Read the current value of @stat. This function can be called without
461 * synchroniztion and takes care of u64 atomicity.
462 */ 536 */
463static inline uint64_t blkg_stat_read(struct blkg_stat *stat) 537static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
464{ 538{
465 unsigned int start; 539 return percpu_counter_sum_positive(&stat->cpu_cnt);
466 uint64_t v;
467
468 do {
469 start = u64_stats_fetch_begin_irq(&stat->syncp);
470 v = stat->cnt;
471 } while (u64_stats_fetch_retry_irq(&stat->syncp, start));
472
473 return v;
474} 540}
475 541
476/** 542/**
@@ -479,24 +545,46 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
479 */ 545 */
480static inline void blkg_stat_reset(struct blkg_stat *stat) 546static inline void blkg_stat_reset(struct blkg_stat *stat)
481{ 547{
482 stat->cnt = 0; 548 percpu_counter_set(&stat->cpu_cnt, 0);
549 atomic64_set(&stat->aux_cnt, 0);
483} 550}
484 551
485/** 552/**
486 * blkg_stat_merge - merge a blkg_stat into another 553 * blkg_stat_add_aux - add a blkg_stat into another's aux count
487 * @to: the destination blkg_stat 554 * @to: the destination blkg_stat
488 * @from: the source 555 * @from: the source
489 * 556 *
490 * Add @from's count to @to. 557 * Add @from's count including the aux one to @to's aux count.
491 */ 558 */
492static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) 559static inline void blkg_stat_add_aux(struct blkg_stat *to,
560 struct blkg_stat *from)
493{ 561{
494 blkg_stat_add(to, blkg_stat_read(from)); 562 atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt),
563 &to->aux_cnt);
495} 564}
496 565
497static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) 566static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
498{ 567{
499 u64_stats_init(&rwstat->syncp); 568 int i, ret;
569
570 for (i = 0; i < BLKG_RWSTAT_NR; i++) {
571 ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
572 if (ret) {
573 while (--i >= 0)
574 percpu_counter_destroy(&rwstat->cpu_cnt[i]);
575 return ret;
576 }
577 atomic64_set(&rwstat->aux_cnt[i], 0);
578 }
579 return 0;
580}
581
582static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
583{
584 int i;
585
586 for (i = 0; i < BLKG_RWSTAT_NR; i++)
587 percpu_counter_destroy(&rwstat->cpu_cnt[i]);
500} 588}
501 589
502/** 590/**
@@ -511,39 +599,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
511static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, 599static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
512 int rw, uint64_t val) 600 int rw, uint64_t val)
513{ 601{
514 u64_stats_update_begin(&rwstat->syncp); 602 struct percpu_counter *cnt;
515 603
516 if (rw & REQ_WRITE) 604 if (rw & REQ_WRITE)
517 rwstat->cnt[BLKG_RWSTAT_WRITE] += val; 605 cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
518 else 606 else
519 rwstat->cnt[BLKG_RWSTAT_READ] += val; 607 cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
608
609 __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
610
520 if (rw & REQ_SYNC) 611 if (rw & REQ_SYNC)
521 rwstat->cnt[BLKG_RWSTAT_SYNC] += val; 612 cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
522 else 613 else
523 rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; 614 cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
524 615
525 u64_stats_update_end(&rwstat->syncp); 616 __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
526} 617}
527 618
528/** 619/**
529 * blkg_rwstat_read - read the current values of a blkg_rwstat 620 * blkg_rwstat_read - read the current values of a blkg_rwstat
530 * @rwstat: blkg_rwstat to read 621 * @rwstat: blkg_rwstat to read
531 * 622 *
532 * Read the current snapshot of @rwstat and return it as the return value. 623 * Read the current snapshot of @rwstat and return it in the aux counts.
533 * This function can be called without synchronization and takes care of
534 * u64 atomicity.
535 */ 624 */
536static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) 625static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
537{ 626{
538 unsigned int start; 627 struct blkg_rwstat result;
539 struct blkg_rwstat tmp; 628 int i;
540
541 do {
542 start = u64_stats_fetch_begin_irq(&rwstat->syncp);
543 tmp = *rwstat;
544 } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
545 629
546 return tmp; 630 for (i = 0; i < BLKG_RWSTAT_NR; i++)
631 atomic64_set(&result.aux_cnt[i],
632 percpu_counter_sum_positive(&rwstat->cpu_cnt[i]));
633 return result;
547} 634}
548 635
549/** 636/**
@@ -558,7 +645,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
558{ 645{
559 struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); 646 struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
560 647
561 return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; 648 return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
649 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
562} 650}
563 651
564/** 652/**
@@ -567,26 +655,71 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
567 */ 655 */
568static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) 656static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
569{ 657{
570 memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); 658 int i;
659
660 for (i = 0; i < BLKG_RWSTAT_NR; i++) {
661 percpu_counter_set(&rwstat->cpu_cnt[i], 0);
662 atomic64_set(&rwstat->aux_cnt[i], 0);
663 }
571} 664}
572 665
573/** 666/**
574 * blkg_rwstat_merge - merge a blkg_rwstat into another 667 * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
575 * @to: the destination blkg_rwstat 668 * @to: the destination blkg_rwstat
576 * @from: the source 669 * @from: the source
577 * 670 *
578 * Add @from's counts to @to. 671 * Add @from's count including the aux one to @to's aux count.
579 */ 672 */
580static inline void blkg_rwstat_merge(struct blkg_rwstat *to, 673static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
581 struct blkg_rwstat *from) 674 struct blkg_rwstat *from)
582{ 675{
583 struct blkg_rwstat v = blkg_rwstat_read(from); 676 struct blkg_rwstat v = blkg_rwstat_read(from);
584 int i; 677 int i;
585 678
586 u64_stats_update_begin(&to->syncp);
587 for (i = 0; i < BLKG_RWSTAT_NR; i++) 679 for (i = 0; i < BLKG_RWSTAT_NR; i++)
588 to->cnt[i] += v.cnt[i]; 680 atomic64_add(atomic64_read(&v.aux_cnt[i]) +
589 u64_stats_update_end(&to->syncp); 681 atomic64_read(&from->aux_cnt[i]),
682 &to->aux_cnt[i]);
683}
684
685#ifdef CONFIG_BLK_DEV_THROTTLING
686extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
687 struct bio *bio);
688#else
689static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
690 struct bio *bio) { return false; }
691#endif
692
693static inline bool blkcg_bio_issue_check(struct request_queue *q,
694 struct bio *bio)
695{
696 struct blkcg *blkcg;
697 struct blkcg_gq *blkg;
698 bool throtl = false;
699
700 rcu_read_lock();
701 blkcg = bio_blkcg(bio);
702
703 blkg = blkg_lookup(blkcg, q);
704 if (unlikely(!blkg)) {
705 spin_lock_irq(q->queue_lock);
706 blkg = blkg_lookup_create(blkcg, q);
707 if (IS_ERR(blkg))
708 blkg = NULL;
709 spin_unlock_irq(q->queue_lock);
710 }
711
712 throtl = blk_throtl_bio(q, blkg, bio);
713
714 if (!throtl) {
715 blkg = blkg ?: q->root_blkg;
716 blkg_rwstat_add(&blkg->stat_bytes, bio->bi_flags,
717 bio->bi_iter.bi_size);
718 blkg_rwstat_add(&blkg->stat_ios, bio->bi_flags, 1);
719 }
720
721 rcu_read_unlock();
722 return !throtl;
590} 723}
591 724
592#else /* CONFIG_BLK_CGROUP */ 725#else /* CONFIG_BLK_CGROUP */
@@ -642,6 +775,9 @@ static inline void blk_put_rl(struct request_list *rl) { }
642static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } 775static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
643static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } 776static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
644 777
778static inline bool blkcg_bio_issue_check(struct request_queue *q,
779 struct bio *bio) { return true; }
780
645#define blk_queue_for_each_rl(rl, q) \ 781#define blk_queue_for_each_rl(rl, q) \
646 for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) 782 for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
647 783
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 1f36945fd23d..1a96fdaa33d5 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -27,7 +27,7 @@ SUBSYS(cpuacct)
27#endif 27#endif
28 28
29#if IS_ENABLED(CONFIG_BLK_CGROUP) 29#if IS_ENABLED(CONFIG_BLK_CGROUP)
30SUBSYS(blkio) 30SUBSYS(io)
31#endif 31#endif
32 32
33#if IS_ENABLED(CONFIG_MEMCG) 33#if IS_ENABLED(CONFIG_MEMCG)
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 123be25ea15a..5d4e9c4b821d 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
266} 266}
267 267
268int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); 268int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
269size_t kernfs_path_len(struct kernfs_node *kn);
269char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, 270char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
270 size_t buflen); 271 size_t buflen);
271void pr_cont_kernfs_name(struct kernfs_node *kn); 272void pr_cont_kernfs_name(struct kernfs_node *kn);
@@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
332static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) 333static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
333{ return -ENOSYS; } 334{ return -ENOSYS; }
334 335
336static inline size_t kernfs_path_len(struct kernfs_node *kn)
337{ return 0; }
338
335static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, 339static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
336 size_t buflen) 340 size_t buflen)
337{ return NULL; } 341{ return NULL; }
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index a7aa607a4c55..fff846b512e6 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -131,6 +131,66 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
131 TP_ARGS(inode, flags) 131 TP_ARGS(inode, flags)
132); 132);
133 133
134#ifdef CREATE_TRACE_POINTS
135#ifdef CONFIG_CGROUP_WRITEBACK
136
137static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
138{
139 return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1;
140}
141
142static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
143{
144 struct cgroup *cgrp = wb->memcg_css->cgroup;
145 char *path;
146
147 path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1);
148 WARN_ON_ONCE(path != buf);
149}
150
151static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
152{
153 if (wbc->wb)
154 return __trace_wb_cgroup_size(wbc->wb);
155 else
156 return 2;
157}
158
159static inline void __trace_wbc_assign_cgroup(char *buf,
160 struct writeback_control *wbc)
161{
162 if (wbc->wb)
163 __trace_wb_assign_cgroup(buf, wbc->wb);
164 else
165 strcpy(buf, "/");
166}
167
168#else /* CONFIG_CGROUP_WRITEBACK */
169
170static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
171{
172 return 2;
173}
174
175static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
176{
177 strcpy(buf, "/");
178}
179
180static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
181{
182 return 2;
183}
184
185static inline void __trace_wbc_assign_cgroup(char *buf,
186 struct writeback_control *wbc)
187{
188 strcpy(buf, "/");
189}
190
191#endif /* CONFIG_CGROUP_WRITEBACK */
192#endif /* CREATE_TRACE_POINTS */
193
134DECLARE_EVENT_CLASS(writeback_write_inode_template, 194DECLARE_EVENT_CLASS(writeback_write_inode_template,
135 195
136 TP_PROTO(struct inode *inode, struct writeback_control *wbc), 196 TP_PROTO(struct inode *inode, struct writeback_control *wbc),
@@ -141,6 +201,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
141 __array(char, name, 32) 201 __array(char, name, 32)
142 __field(unsigned long, ino) 202 __field(unsigned long, ino)
143 __field(int, sync_mode) 203 __field(int, sync_mode)
204 __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
144 ), 205 ),
145 206
146 TP_fast_assign( 207 TP_fast_assign(
@@ -148,12 +209,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
148 dev_name(inode_to_bdi(inode)->dev), 32); 209 dev_name(inode_to_bdi(inode)->dev), 32);
149 __entry->ino = inode->i_ino; 210 __entry->ino = inode->i_ino;
150 __entry->sync_mode = wbc->sync_mode; 211 __entry->sync_mode = wbc->sync_mode;
212 __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
151 ), 213 ),
152 214
153 TP_printk("bdi %s: ino=%lu sync_mode=%d", 215 TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
154 __entry->name, 216 __entry->name,
155 __entry->ino, 217 __entry->ino,
156 __entry->sync_mode 218 __entry->sync_mode,
219 __get_str(cgroup)
157 ) 220 )
158); 221);
159 222
@@ -172,8 +235,8 @@ DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,
172); 235);
173 236
174DECLARE_EVENT_CLASS(writeback_work_class, 237DECLARE_EVENT_CLASS(writeback_work_class,
175 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), 238 TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
176 TP_ARGS(bdi, work), 239 TP_ARGS(wb, work),
177 TP_STRUCT__entry( 240 TP_STRUCT__entry(
178 __array(char, name, 32) 241 __array(char, name, 32)
179 __field(long, nr_pages) 242 __field(long, nr_pages)
@@ -183,10 +246,11 @@ DECLARE_EVENT_CLASS(writeback_work_class,
183 __field(int, range_cyclic) 246 __field(int, range_cyclic)
184 __field(int, for_background) 247 __field(int, for_background)
185 __field(int, reason) 248 __field(int, reason)
249 __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
186 ), 250 ),
187 TP_fast_assign( 251 TP_fast_assign(
188 strncpy(__entry->name, 252 strncpy(__entry->name,
189 bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); 253 wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32);
190 __entry->nr_pages = work->nr_pages; 254 __entry->nr_pages = work->nr_pages;
191 __entry->sb_dev = work->sb ? work->sb->s_dev : 0; 255 __entry->sb_dev = work->sb ? work->sb->s_dev : 0;
192 __entry->sync_mode = work->sync_mode; 256 __entry->sync_mode = work->sync_mode;
@@ -194,9 +258,10 @@ DECLARE_EVENT_CLASS(writeback_work_class,
194 __entry->range_cyclic = work->range_cyclic; 258 __entry->range_cyclic = work->range_cyclic;
195 __entry->for_background = work->for_background; 259 __entry->for_background = work->for_background;
196 __entry->reason = work->reason; 260 __entry->reason = work->reason;
261 __trace_wb_assign_cgroup(__get_str(cgroup), wb);
197 ), 262 ),
198 TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " 263 TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
199 "kupdate=%d range_cyclic=%d background=%d reason=%s", 264 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
200 __entry->name, 265 __entry->name,
201 MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), 266 MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
202 __entry->nr_pages, 267 __entry->nr_pages,
@@ -204,13 +269,14 @@ DECLARE_EVENT_CLASS(writeback_work_class,
204 __entry->for_kupdate, 269 __entry->for_kupdate,
205 __entry->range_cyclic, 270 __entry->range_cyclic,
206 __entry->for_background, 271 __entry->for_background,
207 __print_symbolic(__entry->reason, WB_WORK_REASON) 272 __print_symbolic(__entry->reason, WB_WORK_REASON),
273 __get_str(cgroup)
208 ) 274 )
209); 275);
210#define DEFINE_WRITEBACK_WORK_EVENT(name) \ 276#define DEFINE_WRITEBACK_WORK_EVENT(name) \
211DEFINE_EVENT(writeback_work_class, name, \ 277DEFINE_EVENT(writeback_work_class, name, \
212 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ 278 TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
213 TP_ARGS(bdi, work)) 279 TP_ARGS(wb, work))
214DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); 280DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
215DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); 281DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
216DEFINE_WRITEBACK_WORK_EVENT(writeback_start); 282DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
@@ -230,26 +296,42 @@ TRACE_EVENT(writeback_pages_written,
230); 296);
231 297
232DECLARE_EVENT_CLASS(writeback_class, 298DECLARE_EVENT_CLASS(writeback_class,
233 TP_PROTO(struct backing_dev_info *bdi), 299 TP_PROTO(struct bdi_writeback *wb),
234 TP_ARGS(bdi), 300 TP_ARGS(wb),
235 TP_STRUCT__entry( 301 TP_STRUCT__entry(
236 __array(char, name, 32) 302 __array(char, name, 32)
303 __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
237 ), 304 ),
238 TP_fast_assign( 305 TP_fast_assign(
239 strncpy(__entry->name, dev_name(bdi->dev), 32); 306 strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
307 __trace_wb_assign_cgroup(__get_str(cgroup), wb);
240 ), 308 ),
241 TP_printk("bdi %s", 309 TP_printk("bdi %s: cgroup=%s",
242 __entry->name 310 __entry->name,
311 __get_str(cgroup)
243 ) 312 )
244); 313);
245#define DEFINE_WRITEBACK_EVENT(name) \ 314#define DEFINE_WRITEBACK_EVENT(name) \
246DEFINE_EVENT(writeback_class, name, \ 315DEFINE_EVENT(writeback_class, name, \
247 TP_PROTO(struct backing_dev_info *bdi), \ 316 TP_PROTO(struct bdi_writeback *wb), \
248 TP_ARGS(bdi)) 317 TP_ARGS(wb))
249 318
250DEFINE_WRITEBACK_EVENT(writeback_nowork); 319DEFINE_WRITEBACK_EVENT(writeback_nowork);
251DEFINE_WRITEBACK_EVENT(writeback_wake_background); 320DEFINE_WRITEBACK_EVENT(writeback_wake_background);
252DEFINE_WRITEBACK_EVENT(writeback_bdi_register); 321
322TRACE_EVENT(writeback_bdi_register,
323 TP_PROTO(struct backing_dev_info *bdi),
324 TP_ARGS(bdi),
325 TP_STRUCT__entry(
326 __array(char, name, 32)
327 ),
328 TP_fast_assign(
329 strncpy(__entry->name, dev_name(bdi->dev), 32);
330 ),
331 TP_printk("bdi %s",
332 __entry->name
333 )
334);
253 335
254DECLARE_EVENT_CLASS(wbc_class, 336DECLARE_EVENT_CLASS(wbc_class,
255 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), 337 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
@@ -265,6 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
265 __field(int, range_cyclic) 347 __field(int, range_cyclic)
266 __field(long, range_start) 348 __field(long, range_start)
267 __field(long, range_end) 349 __field(long, range_end)
350 __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
268 ), 351 ),
269 352
270 TP_fast_assign( 353 TP_fast_assign(
@@ -278,11 +361,12 @@ DECLARE_EVENT_CLASS(wbc_class,
278 __entry->range_cyclic = wbc->range_cyclic; 361 __entry->range_cyclic = wbc->range_cyclic;
279 __entry->range_start = (long)wbc->range_start; 362 __entry->range_start = (long)wbc->range_start;
280 __entry->range_end = (long)wbc->range_end; 363 __entry->range_end = (long)wbc->range_end;
364 __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
281 ), 365 ),
282 366
283 TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " 367 TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
284 "bgrd=%d reclm=%d cyclic=%d " 368 "bgrd=%d reclm=%d cyclic=%d "
285 "start=0x%lx end=0x%lx", 369 "start=0x%lx end=0x%lx cgroup=%s",
286 __entry->name, 370 __entry->name,
287 __entry->nr_to_write, 371 __entry->nr_to_write,
288 __entry->pages_skipped, 372 __entry->pages_skipped,
@@ -292,7 +376,9 @@ DECLARE_EVENT_CLASS(wbc_class,
292 __entry->for_reclaim, 376 __entry->for_reclaim,
293 __entry->range_cyclic, 377 __entry->range_cyclic,
294 __entry->range_start, 378 __entry->range_start,
295 __entry->range_end) 379 __entry->range_end,
380 __get_str(cgroup)
381 )
296) 382)
297 383
298#define DEFINE_WBC_EVENT(name) \ 384#define DEFINE_WBC_EVENT(name) \
@@ -312,6 +398,7 @@ TRACE_EVENT(writeback_queue_io,
312 __field(long, age) 398 __field(long, age)
313 __field(int, moved) 399 __field(int, moved)
314 __field(int, reason) 400 __field(int, reason)
401 __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
315 ), 402 ),
316 TP_fast_assign( 403 TP_fast_assign(
317 unsigned long *older_than_this = work->older_than_this; 404 unsigned long *older_than_this = work->older_than_this;
@@ -321,13 +408,15 @@ TRACE_EVENT(writeback_queue_io,
321 (jiffies - *older_than_this) * 1000 / HZ : -1; 408 (jiffies - *older_than_this) * 1000 / HZ : -1;
322 __entry->moved = moved; 409 __entry->moved = moved;
323 __entry->reason = work->reason; 410 __entry->reason = work->reason;
411 __trace_wb_assign_cgroup(__get_str(cgroup), wb);
324 ), 412 ),
325 TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s", 413 TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
326 __entry->name, 414 __entry->name,
327 __entry->older, /* older_than_this in jiffies */ 415 __entry->older, /* older_than_this in jiffies */
328 __entry->age, /* older_than_this in relative milliseconds */ 416 __entry->age, /* older_than_this in relative milliseconds */
329 __entry->moved, 417 __entry->moved,
330 __print_symbolic(__entry->reason, WB_WORK_REASON) 418 __print_symbolic(__entry->reason, WB_WORK_REASON),
419 __get_str(cgroup)
331 ) 420 )
332); 421);
333 422
@@ -381,11 +470,11 @@ TRACE_EVENT(global_dirty_state,
381 470
382TRACE_EVENT(bdi_dirty_ratelimit, 471TRACE_EVENT(bdi_dirty_ratelimit,
383 472
384 TP_PROTO(struct backing_dev_info *bdi, 473 TP_PROTO(struct bdi_writeback *wb,
385 unsigned long dirty_rate, 474 unsigned long dirty_rate,
386 unsigned long task_ratelimit), 475 unsigned long task_ratelimit),
387 476
388 TP_ARGS(bdi, dirty_rate, task_ratelimit), 477 TP_ARGS(wb, dirty_rate, task_ratelimit),
389 478
390 TP_STRUCT__entry( 479 TP_STRUCT__entry(
391 __array(char, bdi, 32) 480 __array(char, bdi, 32)
@@ -395,36 +484,39 @@ TRACE_EVENT(bdi_dirty_ratelimit,
395 __field(unsigned long, dirty_ratelimit) 484 __field(unsigned long, dirty_ratelimit)
396 __field(unsigned long, task_ratelimit) 485 __field(unsigned long, task_ratelimit)
397 __field(unsigned long, balanced_dirty_ratelimit) 486 __field(unsigned long, balanced_dirty_ratelimit)
487 __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
398 ), 488 ),
399 489
400 TP_fast_assign( 490 TP_fast_assign(
401 strlcpy(__entry->bdi, dev_name(bdi->dev), 32); 491 strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
402 __entry->write_bw = KBps(bdi->wb.write_bandwidth); 492 __entry->write_bw = KBps(wb->write_bandwidth);
403 __entry->avg_write_bw = KBps(bdi->wb.avg_write_bandwidth); 493 __entry->avg_write_bw = KBps(wb->avg_write_bandwidth);
404 __entry->dirty_rate = KBps(dirty_rate); 494 __entry->dirty_rate = KBps(dirty_rate);
405 __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit); 495 __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
406 __entry->task_ratelimit = KBps(task_ratelimit); 496 __entry->task_ratelimit = KBps(task_ratelimit);
407 __entry->balanced_dirty_ratelimit = 497 __entry->balanced_dirty_ratelimit =
408 KBps(bdi->wb.balanced_dirty_ratelimit); 498 KBps(wb->balanced_dirty_ratelimit);
499 __trace_wb_assign_cgroup(__get_str(cgroup), wb);
409 ), 500 ),
410 501
411 TP_printk("bdi %s: " 502 TP_printk("bdi %s: "
412 "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " 503 "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
413 "dirty_ratelimit=%lu task_ratelimit=%lu " 504 "dirty_ratelimit=%lu task_ratelimit=%lu "
414 "balanced_dirty_ratelimit=%lu", 505 "balanced_dirty_ratelimit=%lu cgroup=%s",
415 __entry->bdi, 506 __entry->bdi,
416 __entry->write_bw, /* write bandwidth */ 507 __entry->write_bw, /* write bandwidth */
417 __entry->avg_write_bw, /* avg write bandwidth */ 508 __entry->avg_write_bw, /* avg write bandwidth */
418 __entry->dirty_rate, /* bdi dirty rate */ 509 __entry->dirty_rate, /* bdi dirty rate */
419 __entry->dirty_ratelimit, /* base ratelimit */ 510 __entry->dirty_ratelimit, /* base ratelimit */
420 __entry->task_ratelimit, /* ratelimit with position control */ 511 __entry->task_ratelimit, /* ratelimit with position control */
421 __entry->balanced_dirty_ratelimit /* the balanced ratelimit */ 512 __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
513 __get_str(cgroup)
422 ) 514 )
423); 515);
424 516
425TRACE_EVENT(balance_dirty_pages, 517TRACE_EVENT(balance_dirty_pages,
426 518
427 TP_PROTO(struct backing_dev_info *bdi, 519 TP_PROTO(struct bdi_writeback *wb,
428 unsigned long thresh, 520 unsigned long thresh,
429 unsigned long bg_thresh, 521 unsigned long bg_thresh,
430 unsigned long dirty, 522 unsigned long dirty,
@@ -437,7 +529,7 @@ TRACE_EVENT(balance_dirty_pages,
437 long pause, 529 long pause,
438 unsigned long start_time), 530 unsigned long start_time),
439 531
440 TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, 532 TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
441 dirty_ratelimit, task_ratelimit, 533 dirty_ratelimit, task_ratelimit,
442 dirtied, period, pause, start_time), 534 dirtied, period, pause, start_time),
443 535
@@ -456,11 +548,12 @@ TRACE_EVENT(balance_dirty_pages,
456 __field( long, pause) 548 __field( long, pause)
457 __field(unsigned long, period) 549 __field(unsigned long, period)
458 __field( long, think) 550 __field( long, think)
551 __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
459 ), 552 ),
460 553
461 TP_fast_assign( 554 TP_fast_assign(
462 unsigned long freerun = (thresh + bg_thresh) / 2; 555 unsigned long freerun = (thresh + bg_thresh) / 2;
463 strlcpy(__entry->bdi, dev_name(bdi->dev), 32); 556 strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
464 557
465 __entry->limit = global_wb_domain.dirty_limit; 558 __entry->limit = global_wb_domain.dirty_limit;
466 __entry->setpoint = (global_wb_domain.dirty_limit + 559 __entry->setpoint = (global_wb_domain.dirty_limit +
@@ -478,6 +571,7 @@ TRACE_EVENT(balance_dirty_pages,
478 __entry->period = period * 1000 / HZ; 571 __entry->period = period * 1000 / HZ;
479 __entry->pause = pause * 1000 / HZ; 572 __entry->pause = pause * 1000 / HZ;
480 __entry->paused = (jiffies - start_time) * 1000 / HZ; 573 __entry->paused = (jiffies - start_time) * 1000 / HZ;
574 __trace_wb_assign_cgroup(__get_str(cgroup), wb);
481 ), 575 ),
482 576
483 577
@@ -486,7 +580,7 @@ TRACE_EVENT(balance_dirty_pages,
486 "bdi_setpoint=%lu bdi_dirty=%lu " 580 "bdi_setpoint=%lu bdi_dirty=%lu "
487 "dirty_ratelimit=%lu task_ratelimit=%lu " 581 "dirty_ratelimit=%lu task_ratelimit=%lu "
488 "dirtied=%u dirtied_pause=%u " 582 "dirtied=%u dirtied_pause=%u "
489 "paused=%lu pause=%ld period=%lu think=%ld", 583 "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
490 __entry->bdi, 584 __entry->bdi,
491 __entry->limit, 585 __entry->limit,
492 __entry->setpoint, 586 __entry->setpoint,
@@ -500,7 +594,8 @@ TRACE_EVENT(balance_dirty_pages,
500 __entry->paused, /* ms */ 594 __entry->paused, /* ms */
501 __entry->pause, /* ms */ 595 __entry->pause, /* ms */
502 __entry->period, /* ms */ 596 __entry->period, /* ms */
503 __entry->think /* ms */ 597 __entry->think, /* ms */
598 __get_str(cgroup)
504 ) 599 )
505); 600);
506 601
@@ -514,6 +609,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
514 __field(unsigned long, ino) 609 __field(unsigned long, ino)
515 __field(unsigned long, state) 610 __field(unsigned long, state)
516 __field(unsigned long, dirtied_when) 611 __field(unsigned long, dirtied_when)
612 __dynamic_array(char, cgroup,
613 __trace_wb_cgroup_size(inode_to_wb(inode)))
517 ), 614 ),
518 615
519 TP_fast_assign( 616 TP_fast_assign(
@@ -522,14 +619,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
522 __entry->ino = inode->i_ino; 619 __entry->ino = inode->i_ino;
523 __entry->state = inode->i_state; 620 __entry->state = inode->i_state;
524 __entry->dirtied_when = inode->dirtied_when; 621 __entry->dirtied_when = inode->dirtied_when;
622 __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
525 ), 623 ),
526 624
527 TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu", 625 TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
528 __entry->name, 626 __entry->name,
529 __entry->ino, 627 __entry->ino,
530 show_inode_state(__entry->state), 628 show_inode_state(__entry->state),
531 __entry->dirtied_when, 629 __entry->dirtied_when,
532 (jiffies - __entry->dirtied_when) / HZ 630 (jiffies - __entry->dirtied_when) / HZ,
631 __get_str(cgroup)
533 ) 632 )
534); 633);
535 634
@@ -585,6 +684,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
585 __field(unsigned long, writeback_index) 684 __field(unsigned long, writeback_index)
586 __field(long, nr_to_write) 685 __field(long, nr_to_write)
587 __field(unsigned long, wrote) 686 __field(unsigned long, wrote)
687 __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
588 ), 688 ),
589 689
590 TP_fast_assign( 690 TP_fast_assign(
@@ -596,10 +696,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
596 __entry->writeback_index = inode->i_mapping->writeback_index; 696 __entry->writeback_index = inode->i_mapping->writeback_index;
597 __entry->nr_to_write = nr_to_write; 697 __entry->nr_to_write = nr_to_write;
598 __entry->wrote = nr_to_write - wbc->nr_to_write; 698 __entry->wrote = nr_to_write - wbc->nr_to_write;
699 __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
599 ), 700 ),
600 701
601 TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " 702 TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
602 "index=%lu to_write=%ld wrote=%lu", 703 "index=%lu to_write=%ld wrote=%lu cgroup=%s",
603 __entry->name, 704 __entry->name,
604 __entry->ino, 705 __entry->ino,
605 show_inode_state(__entry->state), 706 show_inode_state(__entry->state),
@@ -607,7 +708,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
607 (jiffies - __entry->dirtied_when) / HZ, 708 (jiffies - __entry->dirtied_when) / HZ,
608 __entry->writeback_index, 709 __entry->writeback_index,
609 __entry->nr_to_write, 710 __entry->nr_to_write,
610 __entry->wrote 711 __entry->wrote,
712 __get_str(cgroup)
611 ) 713 )
612); 714);
613 715