author	Divyesh Shah <dpshah@google.com>	2010-04-09 00:15:35 -0400
committer	Jens Axboe <jens.axboe@oracle.com>	2010-04-09 02:36:08 -0400
commit	812df48d127365ffd0869aa139738f572a86759c (patch)
tree	772ef559057cd432ad874cd429287e7a912b1bb3
parent	cdc1184cf4a7bd99f5473a91244197accc49146b (diff)
blkio: Add more debug-only per-cgroup stats
1) group_wait_time - This is the amount of time the cgroup had to wait to get a timeslice for one of its queues from when it became busy, i.e., went from 0 to 1 request queued. This is different from io_wait_time, which is the cumulative total of the time spent by each IO in that cgroup waiting in the scheduler queue. This stat is a great way to find jobs in the fleet that are being starved or waiting longer than expected (due to an IO controller bug or any other issue).

2) empty_time - This is the amount of time a cgroup spends without any pending requests. This stat is useful when a job does not seem to be able to use its assigned disk share; it helps check whether that is due to an IO controller bug or because the job is not submitting enough IOs.

3) idle_time - This is the amount of time spent by the IO scheduler idling for a given cgroup in anticipation of a better request than the existing ones from other queues/cgroups.

All these stats are recorded using start and stop events. When reading these stats, we do not add the delta between the current time and the last start time if we're between the start and stop events. We avoid doing this to make sure that these numbers are always monotonically increasing when read. Since we're using sched_clock(), which may use the TSC as its source, including the current delta could induce some inconsistency (due to TSC resync across CPUs).

Signed-off-by: Divyesh Shah <dpshah@google.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
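All three stats follow the same start/stop accounting described above. The following is a minimal userspace sketch of that pattern, not the kernel code: struct stat_clock, now_ns() and the helper names are invented for illustration, with clock_gettime() standing in for sched_clock().

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct stat_clock {
	uint64_t total;		/* accumulated time; what a reader sees */
	uint64_t start;		/* timestamp of the last start event */
	bool running;		/* between a start and a stop event? */
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void stat_start(struct stat_clock *s)
{
	if (s->running)
		return;
	s->start = now_ns();
	s->running = true;
}

static void stat_stop(struct stat_clock *s)
{
	uint64_t t;

	if (!s->running)
		return;
	t = now_ns();
	if (t > s->start)	/* guard against the clock going backwards */
		s->total += t - s->start;
	s->running = false;
}

static uint64_t stat_read(const struct stat_clock *s)
{
	/* Deliberately ignore any in-flight delta; see the rationale above. */
	return s->total;
}

int main(void)
{
	struct stat_clock wait = { 0 };

	stat_start(&wait);
	/* ... the group waits for a timeslice here ... */
	stat_stop(&wait);
	printf("group_wait_time: %llu ns\n",
	       (unsigned long long)stat_read(&wait));
	return 0;
}

Because stat_read() returns only the accumulated total, a reader that races with an in-progress wait/idle/empty period still sees a monotonically increasing value, which is the property the patch preserves.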
-rw-r--r--	Documentation/cgroups/blkio-controller.txt	29
-rw-r--r--	block/blk-cgroup.c	159
-rw-r--r--	block/blk-cgroup.h	54
-rw-r--r--	block/cfq-iosched.c	50
4 files changed, 271 insertions(+), 21 deletions(-)
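Each of the new debug files reports the accumulated time in nanoseconds, keyed by device. As a rough usage sketch, a reader could look like the following; the /cgroup/blkio mount point and the "test1" group name are assumptions (mount the blkio controller and create a group first).

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/cgroup/blkio/test1/blkio.group_wait_time", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Each line pairs a device with its accumulated time in nanoseconds. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}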
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 6e52e7c512a4..db054ea3e7fb 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -150,6 +150,35 @@ Details of cgroup files
 	  cgroup's existence. Queue size samples are taken each time one of the
 	  queues of this cgroup gets a timeslice.
 
+- blkio.group_wait_time
+	- Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y.
+	  This is the amount of time the cgroup had to wait since it became busy
+	  (i.e., went from 0 to 1 request queued) to get a timeslice for one of
+	  its queues. This is different from the io_wait_time which is the
+	  cumulative total of the amount of time spent by each IO in that cgroup
+	  waiting in the scheduler queue. This is in nanoseconds. If this is
+	  read when the cgroup is in a waiting (for timeslice) state, the stat
+	  will only report the group_wait_time accumulated till the last time it
+	  got a timeslice and will not include the current delta.
+
+- blkio.empty_time
+	- Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y.
+	  This is the amount of time a cgroup spends without any pending
+	  requests when not being served, i.e., it does not include any time
+	  spent idling for one of the queues of the cgroup. This is in
+	  nanoseconds. If this is read when the cgroup is in an empty state,
+	  the stat will only report the empty_time accumulated till the last
+	  time it had a pending request and will not include the current delta.
+
+- blkio.idle_time
+	- Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y.
+	  This is the amount of time spent by the IO scheduler idling for a
+	  given cgroup in anticipation of a better request than the existing ones
+	  from other queues/cgroups. This is in nanoseconds. If this is read
+	  when the cgroup is in an idling state, the stat will only report the
+	  idle_time accumulated till the last idle period and will not include
+	  the current delta.
+
 - blkio.dequeue
 	- Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
 	  gives the statistics about how many a times a group was dequeued
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1e0c4970b35d..1ecff7a39f2c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -105,6 +105,76 @@ static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
 }
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+					struct blkio_group *curr_blkg)
+{
+	if (blkio_blkg_waiting(&blkg->stats))
+		return;
+	if (blkg == curr_blkg)
+		return;
+	blkg->stats.start_group_wait_time = sched_clock();
+	blkio_mark_blkg_waiting(&blkg->stats);
+}
+
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
+{
+	unsigned long long now;
+
+	if (!blkio_blkg_waiting(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_group_wait_time))
+		stats->group_wait_time += now - stats->start_group_wait_time;
+	blkio_clear_blkg_waiting(stats);
+}
+
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_end_empty_time(struct blkio_group_stats *stats)
+{
+	unsigned long long now;
+
+	if (!blkio_blkg_empty(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_empty_time))
+		stats->empty_time += now - stats->start_empty_time;
+	blkio_clear_blkg_empty(stats);
+}
+
+void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	BUG_ON(blkio_blkg_idling(&blkg->stats));
+	blkg->stats.start_idle_time = sched_clock();
+	blkio_mark_blkg_idling(&blkg->stats);
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
+
+void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
+{
+	unsigned long flags;
+	unsigned long long now;
+	struct blkio_group_stats *stats;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	stats = &blkg->stats;
+	if (blkio_blkg_idling(stats)) {
+		now = sched_clock();
+		if (time_after64(now, stats->start_idle_time))
+			stats->idle_time += now - stats->start_idle_time;
+		blkio_clear_blkg_idling(stats);
+	}
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
+
 void blkiocg_update_set_active_queue_stats(struct blkio_group *blkg)
 {
 	unsigned long flags;
@@ -116,9 +186,14 @@ void blkiocg_update_set_active_queue_stats(struct blkio_group *blkg)
 		stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
 		stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
 	stats->avg_queue_size_samples++;
+	blkio_update_group_wait_time(stats);
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_set_active_queue_stats);
+#else
+static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+					struct blkio_group *curr_blkg) {}
+static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
 #endif
 
 void blkiocg_update_request_add_stats(struct blkio_group *blkg,
@@ -130,6 +205,8 @@ void blkiocg_update_request_add_stats(struct blkio_group *blkg,
 	spin_lock_irqsave(&blkg->stats_lock, flags);
 	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
 			sync);
+	blkio_end_empty_time(&blkg->stats);
+	blkio_set_start_group_wait_time(blkg, curr_blkg);
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_request_add_stats);
@@ -156,6 +233,33 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
 
+void blkiocg_set_start_empty_time(struct blkio_group *blkg, bool ignore)
+{
+	unsigned long flags;
+	struct blkio_group_stats *stats;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	stats = &blkg->stats;
+
+	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
+			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
+		spin_unlock_irqrestore(&blkg->stats_lock, flags);
+		return;
+	}
+
+	/*
+	 * If ignore is set, we do not panic on the empty flag being set
+	 * already. This is to avoid cases where there are superfluous timeslice
+	 * complete events (for eg., forced_dispatch in CFQ) when no IOs are
+	 * served which could result in triggering the empty check incorrectly.
+	 */
+	BUG_ON(!ignore && blkio_blkg_empty(stats));
+	stats->start_empty_time = sched_clock();
+	blkio_mark_blkg_empty(stats);
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
+
 void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 			uint64_t bytes, bool direction, bool sync)
 {
@@ -317,19 +421,44 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 {
 	struct blkio_cgroup *blkcg;
 	struct blkio_group *blkg;
+	struct blkio_group_stats *stats;
 	struct hlist_node *n;
 	uint64_t queued[BLKIO_STAT_TOTAL];
 	int i;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	bool idling, waiting, empty;
+	unsigned long long now = sched_clock();
+#endif
 
 	blkcg = cgroup_to_blkio_cgroup(cgroup);
 	spin_lock_irq(&blkcg->lock);
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
 		spin_lock(&blkg->stats_lock);
+		stats = &blkg->stats;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+		idling = blkio_blkg_idling(stats);
+		waiting = blkio_blkg_waiting(stats);
+		empty = blkio_blkg_empty(stats);
+#endif
 		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
-			queued[i] = blkg->stats.stat_arr[BLKIO_STAT_QUEUED][i];
-		memset(&blkg->stats, 0, sizeof(struct blkio_group_stats));
+			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
+		memset(stats, 0, sizeof(struct blkio_group_stats));
 		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
-			blkg->stats.stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
+			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+		if (idling) {
+			blkio_mark_blkg_idling(stats);
+			stats->start_idle_time = now;
+		}
+		if (waiting) {
+			blkio_mark_blkg_waiting(stats);
+			stats->start_group_wait_time = now;
+		}
+		if (empty) {
+			blkio_mark_blkg_empty(stats);
+			stats->start_empty_time = now;
+		}
+#endif
 		spin_unlock(&blkg->stats_lock);
 	}
 	spin_unlock_irq(&blkcg->lock);
@@ -401,6 +530,15 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
 			sum = 0;
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
 	}
+	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.group_wait_time, cb, dev);
+	if (type == BLKIO_STAT_IDLE_TIME)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.idle_time, cb, dev);
+	if (type == BLKIO_STAT_EMPTY_TIME)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.empty_time, cb, dev);
 	if (type == BLKIO_STAT_DEQUEUE)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 				blkg->stats.dequeue, cb, dev);
@@ -458,6 +596,9 @@ SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
 SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
+SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
 #endif
 #undef SHOW_FUNCTION_PER_GROUP
 
@@ -518,6 +659,18 @@ struct cftype blkio_files[] = {
 		.read_map = blkiocg_avg_queue_size_read,
 	},
 	{
+		.name = "group_wait_time",
+		.read_map = blkiocg_group_wait_time_read,
+	},
+	{
+		.name = "idle_time",
+		.read_map = blkiocg_idle_time_read,
+	},
+	{
+		.name = "empty_time",
+		.read_map = blkiocg_empty_time_read,
+	},
+	{
 		.name = "dequeue",
 		.read_map = blkiocg_dequeue_read,
 	},
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index bea7f3b9a88e..bfce085b1962 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -43,6 +43,9 @@ enum stat_type {
 	BLKIO_STAT_SECTORS,
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	BLKIO_STAT_AVG_QUEUE_SIZE,
+	BLKIO_STAT_IDLE_TIME,
+	BLKIO_STAT_EMPTY_TIME,
+	BLKIO_STAT_GROUP_WAIT_TIME,
 	BLKIO_STAT_DEQUEUE
 #endif
 };
@@ -55,6 +58,13 @@ enum stat_sub_type {
 	BLKIO_STAT_TOTAL
 };
 
+/* blkg state flags */
+enum blkg_state_flags {
+	BLKG_waiting = 0,
+	BLKG_idling,
+	BLKG_empty,
+};
+
 struct blkio_cgroup {
 	struct cgroup_subsys_state css;
 	unsigned int weight;
@@ -74,6 +84,21 @@ struct blkio_group_stats {
 	uint64_t avg_queue_size_samples;
 	/* How many times this group has been removed from service tree */
 	unsigned long dequeue;
+
+	/* Total time spent waiting for it to be assigned a timeslice. */
+	uint64_t group_wait_time;
+	uint64_t start_group_wait_time;
+
+	/* Time spent idling for this blkio_group */
+	uint64_t idle_time;
+	uint64_t start_idle_time;
+	/*
+	 * Total time when we have requests queued and do not contain the
+	 * current active queue.
+	 */
+	uint64_t empty_time;
+	uint64_t start_empty_time;
+	uint16_t flags;
 #endif
 };
 
@@ -137,12 +162,41 @@ static inline char *blkg_path(struct blkio_group *blkg)
 void blkiocg_update_set_active_queue_stats(struct blkio_group *blkg);
 void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
 				unsigned long dequeue);
+void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg);
+void blkiocg_update_idle_time_stats(struct blkio_group *blkg);
+void blkiocg_set_start_empty_time(struct blkio_group *blkg, bool ignore);
+
+#define BLKG_FLAG_FNS(name)						\
+static inline void blkio_mark_blkg_##name(				\
+		struct blkio_group_stats *stats)			\
+{									\
+	stats->flags |= (1 << BLKG_##name);				\
+}									\
+static inline void blkio_clear_blkg_##name(				\
+		struct blkio_group_stats *stats)			\
+{									\
+	stats->flags &= ~(1 << BLKG_##name);				\
+}									\
+static inline int blkio_blkg_##name(struct blkio_group_stats *stats)	\
+{									\
+	return (stats->flags & (1 << BLKG_##name)) != 0;		\
+}									\
+
+BLKG_FLAG_FNS(waiting)
+BLKG_FLAG_FNS(idling)
+BLKG_FLAG_FNS(empty)
+#undef BLKG_FLAG_FNS
 #else
 static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
 static inline void blkiocg_update_set_active_queue_stats(
 					struct blkio_group *blkg) {}
 static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
 						unsigned long dequeue) {}
+static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+{}
+static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
+static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg,
+						bool ignore) {}
 #endif
 
 #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 8e0b86a9111a..b6e095c7ef5e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -886,7 +886,7 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
 }
 
 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
-				struct cfq_queue *cfqq)
+				struct cfq_queue *cfqq, bool forced)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	unsigned int used_sl, charge_sl;
@@ -916,6 +916,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
 					st->min_vdisktime);
 	blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
+	blkiocg_set_start_empty_time(&cfqg->blkg, forced);
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
@@ -1528,6 +1529,12 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 	return cfqq == RQ_CFQQ(rq);
 }
 
+static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	del_timer(&cfqd->idle_slice_timer);
+	blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
+}
+
 static void __cfq_set_active_queue(struct cfq_data *cfqd,
 				   struct cfq_queue *cfqq)
 {
@@ -1547,7 +1554,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
 		cfq_clear_cfqq_fifo_expire(cfqq);
 		cfq_mark_cfqq_slice_new(cfqq);
 
-		del_timer(&cfqd->idle_slice_timer);
+		cfq_del_timer(cfqd, cfqq);
 	}
 
 	cfqd->active_queue = cfqq;
@@ -1558,12 +1565,12 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
  */
 static void
 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-		    bool timed_out)
+		    bool timed_out, bool forced)
 {
 	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
 
 	if (cfq_cfqq_wait_request(cfqq))
-		del_timer(&cfqd->idle_slice_timer);
+		cfq_del_timer(cfqd, cfqq);
 
 	cfq_clear_cfqq_wait_request(cfqq);
 	cfq_clear_cfqq_wait_busy(cfqq);
@@ -1585,7 +1592,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
 	}
 
-	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
+	cfq_group_served(cfqd, cfqq->cfqg, cfqq, forced);
 
 	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
 		cfq_del_cfqq_rr(cfqd, cfqq);
@@ -1604,12 +1611,13 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	}
 }
 
-static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
+static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out,
+					bool forced)
 {
 	struct cfq_queue *cfqq = cfqd->active_queue;
 
 	if (cfqq)
-		__cfq_slice_expired(cfqd, cfqq, timed_out);
+		__cfq_slice_expired(cfqd, cfqq, timed_out, forced);
 }
 
 /*
@@ -1865,6 +1873,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 		sl = cfqd->cfq_slice_idle;
 
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
+	blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
 	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
 }
 
@@ -2176,7 +2185,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 	}
 
 expire:
-	cfq_slice_expired(cfqd, 0);
+	cfq_slice_expired(cfqd, 0, false);
 new_queue:
 	/*
 	 * Current queue expired. Check if we have to switch to a new
@@ -2202,7 +2211,7 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
 	BUG_ON(!list_empty(&cfqq->fifo));
 
 	/* By default cfqq is not expired if it is empty. Do it explicitly */
-	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
+	__cfq_slice_expired(cfqq->cfqd, cfqq, 0, true);
 	return dispatched;
 }
 
@@ -2218,7 +2227,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL)
 		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
 
-	cfq_slice_expired(cfqd, 0);
+	cfq_slice_expired(cfqd, 0, true);
 	BUG_ON(cfqd->busy_queues);
 
 	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
@@ -2382,10 +2391,15 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 	    cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
 	    cfq_class_idle(cfqq))) {
 		cfqq->slice_end = jiffies + 1;
-		cfq_slice_expired(cfqd, 0);
+		cfq_slice_expired(cfqd, 0, false);
 	}
 
 	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
+	/*
+	 * This is needed since we don't exactly match the mod_timer() and
+	 * del_timer() calls in CFQ.
+	 */
+	blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
 	return 1;
 }
 
@@ -2413,7 +2427,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	orig_cfqg = cfqq->orig_cfqg;
 
 	if (unlikely(cfqd->active_queue == cfqq)) {
-		__cfq_slice_expired(cfqd, cfqq, 0);
+		__cfq_slice_expired(cfqd, cfqq, 0, false);
 		cfq_schedule_dispatch(cfqd);
 	}
 
@@ -2514,7 +2528,7 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	struct cfq_queue *__cfqq, *next;
 
 	if (unlikely(cfqq == cfqd->active_queue)) {
-		__cfq_slice_expired(cfqd, cfqq, 0);
+		__cfq_slice_expired(cfqd, cfqq, 0, false);
 		cfq_schedule_dispatch(cfqd);
 	}
 
@@ -3143,7 +3157,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	cfq_log_cfqq(cfqd, cfqq, "preempt");
-	cfq_slice_expired(cfqd, 1);
+	cfq_slice_expired(cfqd, 1, false);
 
 	/*
 	 * Put the new queue at the front of the of the current list,
@@ -3191,7 +3205,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (cfq_cfqq_wait_request(cfqq)) {
 		if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
 		    cfqd->busy_queues > 1) {
-			del_timer(&cfqd->idle_slice_timer);
+			cfq_del_timer(cfqd, cfqq);
 			cfq_clear_cfqq_wait_request(cfqq);
 			__blk_run_queue(cfqd->queue);
 		} else
@@ -3352,7 +3366,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	 * - when there is a close cooperator
 	 */
 	if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
-		cfq_slice_expired(cfqd, 1);
+		cfq_slice_expired(cfqd, 1, false);
 	else if (sync && cfqq_empty &&
 		 !cfq_close_cooperator(cfqd, cfqq)) {
 		cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
@@ -3612,7 +3626,7 @@ static void cfq_idle_slice_timer(unsigned long data)
 			cfq_clear_cfqq_deep(cfqq);
 	}
 expire:
-	cfq_slice_expired(cfqd, timed_out);
+	cfq_slice_expired(cfqd, timed_out, false);
 out_kick:
 	cfq_schedule_dispatch(cfqd);
 out_cont:
@@ -3655,7 +3669,7 @@ static void cfq_exit_queue(struct elevator_queue *e)
 	spin_lock_irq(q->queue_lock);
 
 	if (cfqd->active_queue)
-		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
+		__cfq_slice_expired(cfqd, cfqd->active_queue, 0, false);
 
 	while (!list_empty(&cfqd->cic_list)) {
 		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,