author		Wu Fengguang <fengguang.wu@intel.com>	2010-08-29 13:22:30 -0400
committer	Wu Fengguang <fengguang.wu@intel.com>	2011-07-10 01:09:01 -0400
commit		e98be2d599207c6b31e9bb340d52a231b2f3662d (patch)
tree		3ae28e7d621a6e2ddf8e7462f8d282901c113d5c
parent		f7d2b1ecd0c714adefc7d3a942ef87beb828a763 (diff)
writeback: bdi write bandwidth estimation
The estimation value will start from 100 MB/s and adapt to the real bandwidth within seconds.

It tries to update the bandwidth only when the disk is fully utilized; any inactive period of more than one second is skipped.

The estimated bandwidth reflects how fast the device can write out when _fully utilized_, and won't drop to 0 when it goes idle: the value remains constant while the disk is idle. At busy write time, fluctuations aside, it will also remain high unless knocked down by concurrent reads that compete with the async writes for disk time and bandwidth.

The estimation is not done purely in the flusher because there is no guarantee that write_cache_pages() will return in time to update the bandwidth.

The bdi->avg_write_bandwidth smoothing is very effective at filtering out sudden spikes, however it may be a little biased in the long term.

The overheads are low because the bdi bandwidth update only occurs at 200ms intervals.

The 200ms update interval is suitable because the instantaneous bandwidth simply cannot be measured, due to large fluctuations: NFS commits can be as large as seconds' worth of data; one XFS completion may be as large as half a second's worth of data if we go on to increase the write chunk to that size; in ext4, fluctuations with a period of around 5 seconds are observed, and SSD tests show another pattern of irregular periods of up to 20 seconds. That's why the estimation is not only done at 200ms intervals, but also averaged over a 3-second period, with a further level of smoothing applied in avg_write_bandwidth.

CC: Li Shaohua <shaohua.li@intel.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
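As a rough illustration of the arithmetic described above, here is a minimal userspace sketch of the two-stage estimator. It is not the kernel code itself: HZ=1000, 4KB pages and the sample loop are assumptions chosen for the example, and the real patch operates on the BDI_WRITTEN percpu counter under wb->list_lock.

	/*
	 * Userspace sketch of the two-stage write bandwidth estimator.
	 * Names mirror the patch; constants below are example assumptions.
	 */
	#include <stdio.h>

	#define HZ		1000
	#define PERIOD		4096	/* roundup_pow_of_two(3 * HZ) */

	static unsigned long write_bandwidth = 25600;	/* 100MB/s in 4KB pages/s */
	static unsigned long avg_write_bandwidth = 25600;

	static void update_bandwidth(unsigned long pages_written, unsigned long elapsed)
	{
		unsigned long old = write_bandwidth;
		unsigned long long bw = (unsigned long long)pages_written * HZ;

		if (elapsed > PERIOD) {		/* long idle gap: take the raw rate */
			bw /= elapsed;
			avg_write_bandwidth = bw;
			goto out;
		}
		/* running average: new sample weighted by elapsed, old estimate by the rest */
		bw += (unsigned long long)write_bandwidth * (PERIOD - elapsed);
		bw >>= 12;			/* ilog2(PERIOD): cheap divide by 4096 */

		/* second stage: move avg 1/8 of the way, only when the trend is consistent */
		if (avg_write_bandwidth > old && old >= bw)
			avg_write_bandwidth -= (avg_write_bandwidth - old) >> 3;
		if (avg_write_bandwidth < old && old <= bw)
			avg_write_bandwidth += (old - avg_write_bandwidth) >> 3;
	out:
		write_bandwidth = bw;
	}

	int main(void)
	{
		int i;

		/* feed 200ms samples from a disk really doing 50MB/s (12800 pages/s) */
		for (i = 0; i < 50; i++)
			update_bandwidth(12800 / 5, HZ / 5);
		printf("bw=%lu avg=%lu pages/s\n", write_bandwidth, avg_write_bandwidth);
		return 0;
	}

Running this, write_bandwidth converges toward the device's true rate within a few seconds, while avg_write_bandwidth trails it more smoothly; that lag is the spike filtering described above.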
-rw-r--r--	fs/fs-writeback.c		13
-rw-r--r--	include/linux/backing-dev.h	5
-rw-r--r--	include/linux/writeback.h	3
-rw-r--r--	mm/backing-dev.c		12
-rw-r--r--	mm/page-writeback.c		87
5 files changed, 120 insertions, 0 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2c947da39f6e..5826992910e9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -693,6 +693,16 @@ static inline bool over_bground_thresh(void)
 }
 
 /*
+ * Called under wb->list_lock.  If there are multiple wb per bdi,
+ * only the flusher working on the first wb should do it.
+ */
+static void wb_update_bandwidth(struct bdi_writeback *wb,
+				unsigned long start_time)
+{
+	__bdi_update_bandwidth(wb->bdi, start_time);
+}
+
+/*
  * Explicit flushing or periodic writeback of "old" data.
  *
  * Define "old": the first time one of an inode's pages is dirtied, we mark the
@@ -710,6 +720,7 @@ static inline bool over_bground_thresh(void)
 static long wb_writeback(struct bdi_writeback *wb,
 			 struct wb_writeback_work *work)
 {
+	unsigned long wb_start = jiffies;
 	long nr_pages = work->nr_pages;
 	unsigned long oldest_jif;
 	struct inode *inode;
@@ -758,6 +769,8 @@ static long wb_writeback(struct bdi_writeback *wb,
 		progress = __writeback_inodes_wb(wb, work);
 		trace_writeback_written(wb->bdi, work);
 
+		wb_update_bandwidth(wb, wb_start);
+
 		/*
 		 * Did we write something? Try for more
 		 *
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 469d56443c63..a008982e7c08 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -73,6 +73,11 @@ struct backing_dev_info {
 
 	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
 
+	unsigned long bw_time_stamp;	/* last time write bw is updated */
+	unsigned long written_stamp;	/* pages written at bw_time_stamp */
+	unsigned long write_bandwidth;	/* the estimated write bandwidth */
+	unsigned long avg_write_bandwidth; /* further smoothed write bw */
+
 	struct prop_local_percpu completions;
 	int dirty_exceeded;
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index df1b7f18f100..66862f2d90c8 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -118,6 +118,9 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
 			      unsigned long dirty);
 
+void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+			    unsigned long start_time);
+
 void page_writeback_init(void);
 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 					unsigned long nr_pages_dirtied);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 83f18a1d9d10..a76cdd160277 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -638,6 +638,11 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
 }
 
+/*
+ * Initial write bandwidth: 100 MB/s
+ */
+#define INIT_BW		(100 << (20 - PAGE_SHIFT))
+
 int bdi_init(struct backing_dev_info *bdi)
 {
 	int i, err;
@@ -660,6 +665,13 @@ int bdi_init(struct backing_dev_info *bdi)
 	}
 
 	bdi->dirty_exceeded = 0;
+
+	bdi->bw_time_stamp = jiffies;
+	bdi->written_stamp = 0;
+
+	bdi->write_bandwidth = INIT_BW;
+	bdi->avg_write_bandwidth = INIT_BW;
+
 	err = prop_local_init_percpu(&bdi->completions);
 
 	if (err) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8cd71376c63d..446bdf7b975b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -37,6 +37,11 @@
 #include <trace/events/writeback.h>
 
 /*
+ * Estimate write bandwidth at 200ms intervals.
+ */
+#define BANDWIDTH_INTERVAL	max(HZ/5, 1)
+
+/*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
  * will look to see if it needs to force writeback or throttling.
  */
@@ -471,6 +476,85 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
 	return bdi_dirty;
 }
 
+static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
+				       unsigned long elapsed,
+				       unsigned long written)
+{
+	const unsigned long period = roundup_pow_of_two(3 * HZ);
+	unsigned long avg = bdi->avg_write_bandwidth;
+	unsigned long old = bdi->write_bandwidth;
+	u64 bw;
+
+	/*
+	 * bw = written * HZ / elapsed
+	 *
+	 *                   bw * elapsed + write_bandwidth * (period - elapsed)
+	 * write_bandwidth = ---------------------------------------------------
+	 *                                          period
+	 */
+	bw = written - bdi->written_stamp;
+	bw *= HZ;
+	if (unlikely(elapsed > period)) {
+		do_div(bw, elapsed);
+		avg = bw;
+		goto out;
+	}
+	bw += (u64)bdi->write_bandwidth * (period - elapsed);
+	bw >>= ilog2(period);
+
+	/*
+	 * one more level of smoothing, for filtering out sudden spikes
+	 */
+	if (avg > old && old >= (unsigned long)bw)
+		avg -= (avg - old) >> 3;
+
+	if (avg < old && old <= (unsigned long)bw)
+		avg += (old - avg) >> 3;
+
+out:
+	bdi->write_bandwidth = bw;
+	bdi->avg_write_bandwidth = avg;
+}
+
+void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+			    unsigned long start_time)
+{
+	unsigned long now = jiffies;
+	unsigned long elapsed = now - bdi->bw_time_stamp;
+	unsigned long written;
+
+	/*
+	 * rate-limit, only update once every 200ms.
+	 */
+	if (elapsed < BANDWIDTH_INTERVAL)
+		return;
+
+	written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+
+	/*
+	 * Skip quiet periods when disk bandwidth is under-utilized.
+	 * (at least 1s idle time between two flusher runs)
+	 */
+	if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+		goto snapshot;
+
+	bdi_update_write_bandwidth(bdi, elapsed, written);
+
+snapshot:
+	bdi->written_stamp = written;
+	bdi->bw_time_stamp = now;
+}
+
+static void bdi_update_bandwidth(struct backing_dev_info *bdi,
+				 unsigned long start_time)
+{
+	if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
+		return;
+	spin_lock(&bdi->wb.list_lock);
+	__bdi_update_bandwidth(bdi, start_time);
+	spin_unlock(&bdi->wb.list_lock);
+}
+
 /*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data. It looks at the number of dirty pages in the machine and will force
@@ -490,6 +574,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 	unsigned long pause = 1;
 	bool dirty_exceeded = false;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	unsigned long start_time = jiffies;
 
 	for (;;) {
 		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
@@ -544,6 +629,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (!bdi->dirty_exceeded)
 			bdi->dirty_exceeded = 1;
 
+		bdi_update_bandwidth(bdi, start_time);
+
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
 		 * filesystems (i.e. NFS) in which data may have been
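
A note on the fixed-point arithmetic above, as a worked example: with 4KB pages (PAGE_SHIFT = 12), INIT_BW = 100 << (20 - 12) = 25600 pages per second, i.e. 100 MB/s. Likewise, period = roundup_pow_of_two(3 * HZ) is deliberately a power of two (4096 jiffies at HZ=1000, so the effective averaging window is closer to 4 seconds than 3) precisely so that the running-average division reduces to bw >>= ilog2(period); only the rare elapsed > period path pays for a real do_div().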