author    Omar Sandoval <osandov@fb.com>    2017-03-21 11:56:08 -0400
committer Jens Axboe <axboe@fb.com>         2017-03-21 12:03:11 -0400
commit    34dbad5d26e2f4b88e60f0e9ad03f99480802812 (patch)
tree      07aa5a1c8c06fd114cbf6f03716eb4080c7acc82 /block/blk-wbt.c
parent    4875253fddd7b6d322f028ad023d44b6efb7f73b (diff)
blk-stat: convert to callback-based statistics reporting
Currently, statistics are gathered in ~0.13s windows, and users grab the
statistics whenever they need them. This is not ideal for both in-tree
users:

1. Writeback throttling wants its own dynamically sized window of
   statistics. Since the blk-stats statistics are reset after every
   window and the wbt windows don't line up with the blk-stats windows,
   wbt doesn't see every I/O.

2. Polling currently grabs the statistics on every I/O. Again, depending
   on how the window lines up, we may miss some I/Os. It's also
   unnecessary overhead to get the statistics on every I/O; the hybrid
   polling heuristic would be just as happy with the statistics from the
   previous full window.

This reworks the blk-stats infrastructure to be callback-based: users
register a callback that they want called at a given time with all of
the statistics from the window during which the callback was active.
Users can dynamically bucketize the statistics. wbt and polling both
currently use read vs. write, but polling can be extended to further
subdivide based on request size.

The callbacks are kept on an RCU list, and each callback has percpu
stats buffers. There will only be a few users, so the overhead on the
I/O completion side is low. The stats flushing is also simplified
considerably: since the timer function is responsible for clearing the
statistics, we don't have to worry about stale statistics.

wbt is a trivial conversion. After the conversion, the windowing problem
mentioned above is fixed.

For polling, we register an extra callback that caches the previous
window's statistics in the struct request_queue for the hybrid polling
heuristic to use.

Since we no longer have a single stats buffer for the request queue,
this also removes the sysfs and debugfs stats entries. To replace those,
we add a debugfs entry for the poll statistics.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
Diffstat (limited to 'block/blk-wbt.c')
-rw-r--r--    block/blk-wbt.c    51
1 file changed, 19 insertions(+), 32 deletions(-)
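
Before the diff itself, here is a minimal sketch of the lifecycle of the new
callback-based blk-stat API as this patch wires it into wbt. It is pieced
together from the calls visible in the hunks below, not from the
authoritative header (see block/blk-stat.h in the tree for that); win_nsec
stands in for rwb->cur_win_nsec, and the bucket count of 2 reflects the
read/write split chosen here.

	/*
	 * Lifecycle sketch, assembled from this patch. The callback is
	 * bucketized by data direction: blk_stat_rq_ddir() maps each
	 * request to bucket 0 (read) or 1 (write), so we ask for two
	 * buckets and stash rwb in cb->data for wb_timer_fn() to use.
	 */
	struct blk_stat_callback *cb;

	cb = blk_stat_alloc_callback(wb_timer_fn, blk_stat_rq_ddir, 2, rwb);
	if (!cb)
		return -ENOMEM;

	blk_stat_add_callback(q, cb);		/* join the queue's RCU list */
	blk_stat_activate_nsecs(cb, win_nsec);	/* gather stats for one window */

	/*
	 * When the window expires, wb_timer_fn(cb) is invoked with the
	 * window's aggregated statistics in cb->stat[0] and cb->stat[1];
	 * the timer path clears the stats, so nothing stale carries over.
	 */

	blk_stat_remove_callback(q, cb);	/* teardown */
	blk_stat_free_callback(cb);
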
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index aafe5b551224..ffa80e11cf14 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -277,7 +277,7 @@ enum {
 	LAT_EXCEEDED,
 };
 
-static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
+static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 {
 	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
 	u64 thislat;
@@ -308,8 +308,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 	 * waited or still has writes in flights, consider us doing
 	 * just writes as well.
 	 */
-	if ((stat[WRITE].nr_samples && blk_stat_is_current(stat)) ||
-	    wb_recent_wait(rwb) || wbt_inflight(rwb))
+	if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
+	    wbt_inflight(rwb))
 		return LAT_UNKNOWN_WRITES;
 	return LAT_UNKNOWN;
 	}
@@ -329,14 +329,6 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 	return LAT_OK;
 }
 
-static int latency_exceeded(struct rq_wb *rwb)
-{
-	struct blk_rq_stat stat[2];
-
-	blk_queue_stat_get(rwb->queue, stat);
-	return __latency_exceeded(rwb, stat);
-}
-
 static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
 {
 	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
@@ -355,7 +347,6 @@ static void scale_up(struct rq_wb *rwb)
 
 	rwb->scale_step--;
 	rwb->unknown_cnt = 0;
-	blk_stat_clear(rwb->queue);
 
 	rwb->scaled_max = calc_wb_limits(rwb);
 
@@ -385,15 +376,12 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle)
 
 	rwb->scaled_max = false;
 	rwb->unknown_cnt = 0;
-	blk_stat_clear(rwb->queue);
 	calc_wb_limits(rwb);
 	rwb_trace_step(rwb, "step down");
 }
 
 static void rwb_arm_timer(struct rq_wb *rwb)
 {
-	unsigned long expires;
-
 	if (rwb->scale_step > 0) {
 		/*
 		 * We should speed this up, using some variant of a fast
@@ -411,17 +399,16 @@ static void rwb_arm_timer(struct rq_wb *rwb)
 		rwb->cur_win_nsec = rwb->win_nsec;
 	}
 
-	expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
-	mod_timer(&rwb->window_timer, expires);
+	blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
 }
 
-static void wb_timer_fn(unsigned long data)
+static void wb_timer_fn(struct blk_stat_callback *cb)
 {
-	struct rq_wb *rwb = (struct rq_wb *) data;
+	struct rq_wb *rwb = cb->data;
 	unsigned int inflight = wbt_inflight(rwb);
 	int status;
 
-	status = latency_exceeded(rwb);
+	status = latency_exceeded(rwb, cb->stat);
 
 	trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
 			inflight);
@@ -614,7 +601,7 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
 
 	__wbt_wait(rwb, bio->bi_opf, lock);
 
-	if (!timer_pending(&rwb->window_timer))
+	if (!blk_stat_is_active(rwb->cb))
 		rwb_arm_timer(rwb);
 
 	if (current_is_kswapd())
@@ -675,7 +662,7 @@ void wbt_disable_default(struct request_queue *q)
 	struct rq_wb *rwb = q->rq_wb;
 
 	if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) {
-		del_timer_sync(&rwb->window_timer);
+		blk_stat_remove_callback(q, rwb->cb);
 		rwb->win_nsec = rwb->min_lat_nsec = 0;
 		wbt_update_limits(rwb);
 	}
@@ -699,24 +686,23 @@ int wbt_init(struct request_queue *q)
 	struct rq_wb *rwb;
 	int i;
 
-	/*
-	 * For now, we depend on the stats window being larger than
-	 * our monitoring window. Ensure that this isn't inadvertently
-	 * violated.
-	 */
-	BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC);
 	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
 
 	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
 	if (!rwb)
 		return -ENOMEM;
 
+	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, blk_stat_rq_ddir, 2, rwb);
+	if (!rwb->cb) {
+		kfree(rwb);
+		return -ENOMEM;
+	}
+
 	for (i = 0; i < WBT_NUM_RWQ; i++) {
 		atomic_set(&rwb->rq_wait[i].inflight, 0);
 		init_waitqueue_head(&rwb->rq_wait[i].wait);
 	}
 
-	setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
 	rwb->wc = 1;
 	rwb->queue_depth = RWB_DEF_DEPTH;
 	rwb->last_comp = rwb->last_issue = jiffies;
@@ -726,10 +712,10 @@ int wbt_init(struct request_queue *q)
 	wbt_update_limits(rwb);
 
 	/*
-	 * Assign rwb, and turn on stats tracking for this queue
+	 * Assign rwb and add the stats callback.
 	 */
 	q->rq_wb = rwb;
-	blk_stat_enable(q);
+	blk_stat_add_callback(q, rwb->cb);
 
 	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
 
@@ -744,7 +730,8 @@ void wbt_exit(struct request_queue *q)
 	struct rq_wb *rwb = q->rq_wb;
 
 	if (rwb) {
-		del_timer_sync(&rwb->window_timer);
+		blk_stat_remove_callback(q, rwb->cb);
+		blk_stat_free_callback(rwb->cb);
 		q->rq_wb = NULL;
 		kfree(rwb);
 	}
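
The bucket function handed to blk_stat_alloc_callback() above decides which
of the callback's per-bucket stats a completed request is accounted to. A
plausible definition of blk_stat_rq_ddir(), consistent with the two-bucket
read/write usage in this patch (the real helper lives in block/blk-stat.c),
is:

	/*
	 * Likely shape of the direction bucketing helper: rq_data_dir()
	 * evaluates to READ (0) or WRITE (1), which directly indexes the
	 * callback's stat array. A user wanting finer buckets, e.g. by
	 * request size for polling, would supply its own function and a
	 * larger bucket count.
	 */
	static int blk_stat_rq_ddir(const struct request *rq)
	{
		return rq_data_dir(rq);
	}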