author		Divyesh Shah <dpshah@google.com>	2010-04-09 00:15:35 -0400
committer	Jens Axboe <jens.axboe@oracle.com>	2010-04-09 02:36:08 -0400
commit		812df48d127365ffd0869aa139738f572a86759c (patch)
tree		772ef559057cd432ad874cd429287e7a912b1bb3 /block/blk-cgroup.c
parent		cdc1184cf4a7bd99f5473a91244197accc49146b (diff)
blkio: Add more debug-only per-cgroup stats
1) group_wait_time - This is the amount of time the cgroup had to wait to get a
   timeslice for one of its queues from when it became busy, i.e., went from 0
   to 1 request queued. This is different from io_wait_time, which is the
   cumulative total of the amount of time spent by each IO in that cgroup
   waiting in the scheduler queue. This stat is a great way to find out any
   jobs in the fleet that are being starved or waiting for longer than what is
   expected (due to an IO controller bug or any other issue).

2) empty_time - This is the amount of time a cgroup spends without any pending
   requests. This stat is useful when a job does not seem to be able to use its
   assigned disk share, by helping check whether that is happening due to an IO
   controller bug or because the job is not submitting enough IOs.

3) idle_time - This is the amount of time spent by the IO scheduler idling for
   a given cgroup in anticipation of a better request than the existing ones
   from other queues/cgroups.

All these stats are recorded using start and stop events. When reading these
stats, we do not add the delta between the current time and the last start
time if we're between the start and stop events. We avoid doing this to make
sure that these numbers are always monotonically increasing when read. Since
we're using sched_clock(), which may use the TSC as its source, including the
current delta could induce some inconsistency (due to TSC resync across CPUs).

Signed-off-by: Divyesh Shah <dpshah@google.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
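For context on how these counters surface to user space: each one is added as a
read-only per-cgroup file (see the blkio_files[] additions below) and, like the
other time-based blkio stats, is accumulated from sched_clock() in nanoseconds.
The following is a minimal user-space sketch of reading them, not part of this
patch; the /sys/fs/cgroup/blkio mount point, the "test-job" cgroup name, and the
"<major>:<minor> <value>" line format are assumptions for illustration only, and
the files exist only when the kernel is built with CONFIG_DEBUG_BLK_CGROUP=y.

	#include <stdio.h>

	/*
	 * Sketch only: dump one of the new debug-only blkio stats for a cgroup.
	 * Each line is assumed to be "<major>:<minor> <value-in-nanoseconds>".
	 */
	static void dump_stat(const char *cgroup_dir, const char *stat)
	{
		char path[256];
		char line[128];
		FILE *fp;

		snprintf(path, sizeof(path), "%s/blkio.%s", cgroup_dir, stat);
		fp = fopen(path, "r");
		if (!fp) {
			perror(path);
			return;
		}
		while (fgets(line, sizeof(line), fp))
			printf("%s %s", stat, line);
		fclose(fp);
	}

	int main(void)
	{
		/* Hypothetical cgroup-v1 blkio mount point and cgroup name. */
		const char *cg = "/sys/fs/cgroup/blkio/test-job";

		dump_stat(cg, "group_wait_time");
		dump_stat(cg, "idle_time");
		dump_stat(cg, "empty_time");
		return 0;
	}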
Diffstat (limited to 'block/blk-cgroup.c')
-rw-r--r--	block/blk-cgroup.c	159
1 file changed, 156 insertions(+), 3 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1e0c4970b35d..1ecff7a39f2c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -105,6 +105,76 @@ static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
 }
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+					struct blkio_group *curr_blkg)
+{
+	if (blkio_blkg_waiting(&blkg->stats))
+		return;
+	if (blkg == curr_blkg)
+		return;
+	blkg->stats.start_group_wait_time = sched_clock();
+	blkio_mark_blkg_waiting(&blkg->stats);
+}
+
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
+{
+	unsigned long long now;
+
+	if (!blkio_blkg_waiting(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_group_wait_time))
+		stats->group_wait_time += now - stats->start_group_wait_time;
+	blkio_clear_blkg_waiting(stats);
+}
+
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_end_empty_time(struct blkio_group_stats *stats)
+{
+	unsigned long long now;
+
+	if (!blkio_blkg_empty(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_empty_time))
+		stats->empty_time += now - stats->start_empty_time;
+	blkio_clear_blkg_empty(stats);
+}
+
+void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	BUG_ON(blkio_blkg_idling(&blkg->stats));
+	blkg->stats.start_idle_time = sched_clock();
+	blkio_mark_blkg_idling(&blkg->stats);
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
+
+void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
+{
+	unsigned long flags;
+	unsigned long long now;
+	struct blkio_group_stats *stats;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	stats = &blkg->stats;
+	if (blkio_blkg_idling(stats)) {
+		now = sched_clock();
+		if (time_after64(now, stats->start_idle_time))
+			stats->idle_time += now - stats->start_idle_time;
+		blkio_clear_blkg_idling(stats);
+	}
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
+
 void blkiocg_update_set_active_queue_stats(struct blkio_group *blkg)
 {
 	unsigned long flags;
@@ -116,9 +186,14 @@ void blkiocg_update_set_active_queue_stats(struct blkio_group *blkg)
 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
 			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
 	stats->avg_queue_size_samples++;
+	blkio_update_group_wait_time(stats);
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_set_active_queue_stats);
+#else
+static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+					struct blkio_group *curr_blkg) {}
+static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
 #endif
 
 void blkiocg_update_request_add_stats(struct blkio_group *blkg,
@@ -130,6 +205,8 @@ void blkiocg_update_request_add_stats(struct blkio_group *blkg,
 	spin_lock_irqsave(&blkg->stats_lock, flags);
 	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
 			sync);
+	blkio_end_empty_time(&blkg->stats);
+	blkio_set_start_group_wait_time(blkg, curr_blkg);
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_request_add_stats);
@@ -156,6 +233,33 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
 
+void blkiocg_set_start_empty_time(struct blkio_group *blkg, bool ignore)
+{
+	unsigned long flags;
+	struct blkio_group_stats *stats;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	stats = &blkg->stats;
+
+	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
+			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
+		spin_unlock_irqrestore(&blkg->stats_lock, flags);
+		return;
+	}
+
+	/*
+	 * If ignore is set, we do not panic on the empty flag being set
+	 * already. This is to avoid cases where there are superfluous timeslice
+	 * complete events (for eg., forced_dispatch in CFQ) when no IOs are
+	 * served which could result in triggering the empty check incorrectly.
+	 */
+	BUG_ON(!ignore && blkio_blkg_empty(stats));
+	stats->start_empty_time = sched_clock();
+	blkio_mark_blkg_empty(stats);
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
+
 void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 			uint64_t bytes, bool direction, bool sync)
 {
@@ -317,19 +421,44 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 {
 	struct blkio_cgroup *blkcg;
 	struct blkio_group *blkg;
+	struct blkio_group_stats *stats;
 	struct hlist_node *n;
 	uint64_t queued[BLKIO_STAT_TOTAL];
 	int i;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	bool idling, waiting, empty;
+	unsigned long long now = sched_clock();
+#endif
 
 	blkcg = cgroup_to_blkio_cgroup(cgroup);
 	spin_lock_irq(&blkcg->lock);
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
 		spin_lock(&blkg->stats_lock);
+		stats = &blkg->stats;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+		idling = blkio_blkg_idling(stats);
+		waiting = blkio_blkg_waiting(stats);
+		empty = blkio_blkg_empty(stats);
+#endif
 		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
-			queued[i] = blkg->stats.stat_arr[BLKIO_STAT_QUEUED][i];
-		memset(&blkg->stats, 0, sizeof(struct blkio_group_stats));
+			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
+		memset(stats, 0, sizeof(struct blkio_group_stats));
 		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
-			blkg->stats.stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
+			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+		if (idling) {
+			blkio_mark_blkg_idling(stats);
+			stats->start_idle_time = now;
+		}
+		if (waiting) {
+			blkio_mark_blkg_waiting(stats);
+			stats->start_group_wait_time = now;
+		}
+		if (empty) {
+			blkio_mark_blkg_empty(stats);
+			stats->start_empty_time = now;
+		}
+#endif
 		spin_unlock(&blkg->stats_lock);
 	}
 	spin_unlock_irq(&blkcg->lock);
@@ -401,6 +530,15 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
 			sum = 0;
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
 	}
+	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.group_wait_time, cb, dev);
+	if (type == BLKIO_STAT_IDLE_TIME)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.idle_time, cb, dev);
+	if (type == BLKIO_STAT_EMPTY_TIME)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.empty_time, cb, dev);
 	if (type == BLKIO_STAT_DEQUEUE)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 					blkg->stats.dequeue, cb, dev);
@@ -458,6 +596,9 @@ SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
 SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
+SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
 #endif
 #undef SHOW_FUNCTION_PER_GROUP
 
@@ -518,6 +659,18 @@ struct cftype blkio_files[] = {
 		.read_map = blkiocg_avg_queue_size_read,
 	},
 	{
+		.name = "group_wait_time",
+		.read_map = blkiocg_group_wait_time_read,
+	},
+	{
+		.name = "idle_time",
+		.read_map = blkiocg_idle_time_read,
+	},
+	{
+		.name = "empty_time",
+		.read_map = blkiocg_empty_time_read,
+	},
+	{
 		.name = "dequeue",
 		.read_map = blkiocg_dequeue_read,
 	},