aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorWu Fengguang <fengguang.wu@intel.com>2011-06-12 12:51:31 -0400
committerWu Fengguang <fengguang.wu@intel.com>2011-10-03 09:08:56 -0400
commitbe3ffa276446e1b691a2bf84e7621e5a6fb49db9 (patch)
treeca1b112195a9a8b63265f3204748cb23cff5b653 /mm
parentaf6a311384bce6c88e15c80ab22ab051a918b4eb (diff)
writeback: dirty rate control
It's all about bdi->dirty_ratelimit, which aims to be (write_bw / N)
when there are N dd tasks.

On write() syscall, use bdi->dirty_ratelimit
============================================

    balance_dirty_pages(pages_dirtied)
    {
        task_ratelimit = bdi->dirty_ratelimit * bdi_position_ratio();
        pause = pages_dirtied / task_ratelimit;
        sleep(pause);
    }

On every 200ms, update bdi->dirty_ratelimit
===========================================

    bdi_update_dirty_ratelimit()
    {
        task_ratelimit = bdi->dirty_ratelimit * bdi_position_ratio();
        balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate;
        bdi->dirty_ratelimit = balanced_dirty_ratelimit
    }

Estimation of balanced bdi->dirty_ratelimit
===========================================

balanced task_ratelimit
-----------------------

balance_dirty_pages() needs to throttle tasks dirtying pages such that
the total amount of dirty pages stays below the specified dirty limit in
order to avoid memory deadlocks. Furthermore we desire fairness in that
tasks get throttled proportionally to the amount of pages they dirty.

IOW we want to throttle tasks such that we match the dirty rate to the
writeout bandwidth; this yields a stable amount of dirty pages:

    dirty_rate == write_bw                                          (1)

The fairness requirement gives us:

    task_ratelimit = balanced_dirty_ratelimit
                   == write_bw / N                                  (2)

where N is the number of dd tasks. We don't know N beforehand, but
can still estimate balanced_dirty_ratelimit within 200ms.
Start by throttling each dd task at rate

    task_ratelimit = task_ratelimit_0                               (3)
    (any non-zero initial value is OK)

After 200ms, we measured

    dirty_rate = # of pages dirtied by all dd's / 200ms
    write_bw   = # of pages written to the disk / 200ms

For the aggressive dd dirtiers, the equality holds

    dirty_rate == N * task_rate
               == N * task_ratelimit_0                              (4)
Or
    task_ratelimit_0 == dirty_rate / N                              (5)

Now we conclude that the balanced task ratelimit can be estimated by

    balanced_dirty_ratelimit = task_ratelimit_0 * (write_bw / dirty_rate)   (6)

Because with (4) and (5) we can get the desired equality (1):

    balanced_dirty_ratelimit == (dirty_rate / N) * (write_bw / dirty_rate)
                             == write_bw / N

Then using the balanced task ratelimit we can compute task pause times like:

    task_pause = task->nr_dirtied / task_ratelimit

task_ratelimit with position control
------------------------------------

However, while the above gives us means of matching the dirty rate to
the writeout bandwidth, it at best provides us with a stable dirty page
count (assuming a static system). In order to control the dirty page
count such that it is high enough to provide performance, but does not
exceed the specified limit, we need another control.

The dirty position control works by extending (2) to

    task_ratelimit = balanced_dirty_ratelimit * pos_ratio           (7)

where pos_ratio is a negative feedback function that is subject to

1) f(setpoint) = 1.0
2) df/dx < 0

That is, if the dirty pages are ABOVE the setpoint, we throttle each
task a bit more HEAVILY than balanced_dirty_ratelimit, so that the dirty
pages are created less fast than they are cleaned, and thus DROP to the
setpoint (and the reverse).
Based on (7) and the assumption that both dirty_ratelimit and pos_ratio
remain CONSTANT for the past 200ms, we get

    task_ratelimit_0 = balanced_dirty_ratelimit * pos_ratio         (8)

Putting (8) into (6), we get the formula used in
bdi_update_dirty_ratelimit():

    balanced_dirty_ratelimit *= pos_ratio * (write_bw / dirty_rate)  (9)

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c1
-rw-r--r--mm/page-writeback.c83
2 files changed, 82 insertions, 2 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index fea7e6efd1d7..ba20f94cde93 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -686,6 +686,7 @@ int bdi_init(struct backing_dev_info *bdi)
686 bdi->bw_time_stamp = jiffies; 686 bdi->bw_time_stamp = jiffies;
687 bdi->written_stamp = 0; 687 bdi->written_stamp = 0;
688 688
689 bdi->dirty_ratelimit = INIT_BW;
689 bdi->write_bandwidth = INIT_BW; 690 bdi->write_bandwidth = INIT_BW;
690 bdi->avg_write_bandwidth = INIT_BW; 691 bdi->avg_write_bandwidth = INIT_BW;
691 692
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 4b954c9fe846..1721b6523c04 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -777,6 +777,79 @@ static void global_update_bandwidth(unsigned long thresh,
777 spin_unlock(&dirty_lock); 777 spin_unlock(&dirty_lock);
778} 778}
779 779
780/*
781 * Maintain bdi->dirty_ratelimit, the base dirty throttle rate, by re-estimating it from the measured dirty rate and bdi->avg_write_bandwidth.
782 *
783 * Normal bdi tasks will be curbed at or below it in the long term.
784 * Obviously it should be around (write_bw / N) when there are N dd tasks.
785 */
786static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
787 unsigned long thresh,
788 unsigned long bg_thresh,
789 unsigned long dirty,
790 unsigned long bdi_thresh,
791 unsigned long bdi_dirty,
792 unsigned long dirtied,
793 unsigned long elapsed)
794{
795 unsigned long write_bw = bdi->avg_write_bandwidth;
796 unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
797 unsigned long dirty_rate;
798 unsigned long task_ratelimit;
799 unsigned long balanced_dirty_ratelimit;
800 unsigned long pos_ratio;
801
802 /*
803 * Dirty rate since the last stamp (bdi->dirtied_stamp), scaled to a per-second value by HZ / elapsed.
804 * It will match the writeout rate in the long term, except when dirty pages are truncated by userspace or re-dirtied by the FS.
805 */
806 dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
807
808 pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
809 bdi_thresh, bdi_dirty);
810 /*
811 * task_ratelimit reflects each dd's dirty rate for the past 200ms: dirty_ratelimit scaled by pos_ratio (presumably fixed-point, hence the RATELIMIT_CALC_SHIFT).
812 */
813 task_ratelimit = (u64)dirty_ratelimit *
814 pos_ratio >> RATELIMIT_CALC_SHIFT;
815 task_ratelimit++; /* the +1 helps ramp up dirty_ratelimit from tiny values */
816
817 /*
818 * A linear estimation of the "balanced" throttle rate. The theory is,
819 * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
820 * dirty_rate will be measured to be (N * task_ratelimit). So the below
821 * formula will yield the balanced rate limit (write_bw / N).
822 *
823 * Note that the expanded form is not a pure rate feedback:
824 * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1)
825 * but also takes pos_ratio into account:
826 * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2)
827 *
828 * (1) is not realistic because pos_ratio also takes part in balancing
829 * the dirty rate. Consider the state
830 * pos_ratio = 0.5 (3)
831 * rate = 2 * (write_bw / N) (4)
832 * If (1) is used, it will get stuck in that state! Because each dd will
833 * be throttled at
834 * task_ratelimit = pos_ratio * rate = (write_bw / N) (5)
835 * yielding
836 * dirty_rate = N * task_ratelimit = write_bw (6)
837 * put (6) into (1) we get
838 * rate_(i+1) = rate_(i) (7)
839 *
840 * So we end up using (2) to always keep
841 * rate_(i+1) ~= (write_bw / N) (8)
842 * regardless of the value of pos_ratio. As long as (8) is satisfied,
843 * pos_ratio is able to drive itself to 1.0, which is not only where
844 * the dirty count meets the setpoint, but also where the slope of
845 * pos_ratio is flattest and hence task_ratelimit fluctuates least.
846 */
847 balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
848 dirty_rate | 1); /* "| 1" keeps the divisor non-zero when no pages were dirtied */
849
850 bdi->dirty_ratelimit = max(balanced_dirty_ratelimit, 1UL); /* never store a zero rate limit */
851}
852
780void __bdi_update_bandwidth(struct backing_dev_info *bdi, 853void __bdi_update_bandwidth(struct backing_dev_info *bdi,
781 unsigned long thresh, 854 unsigned long thresh,
782 unsigned long bg_thresh, 855 unsigned long bg_thresh,
@@ -787,6 +860,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
787{ 860{
788 unsigned long now = jiffies; 861 unsigned long now = jiffies;
789 unsigned long elapsed = now - bdi->bw_time_stamp; 862 unsigned long elapsed = now - bdi->bw_time_stamp;
863 unsigned long dirtied;
790 unsigned long written; 864 unsigned long written;
791 865
792 /* 866 /*
@@ -795,6 +869,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
795 if (elapsed < BANDWIDTH_INTERVAL) 869 if (elapsed < BANDWIDTH_INTERVAL)
796 return; 870 return;
797 871
872 dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
798 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); 873 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
799 874
800 /* 875 /*
@@ -804,12 +879,16 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
804 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) 879 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
805 goto snapshot; 880 goto snapshot;
806 881
807 if (thresh) 882 if (thresh) {
808 global_update_bandwidth(thresh, dirty, now); 883 global_update_bandwidth(thresh, dirty, now);
809 884 bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
885 bdi_thresh, bdi_dirty,
886 dirtied, elapsed);
887 }
810 bdi_update_write_bandwidth(bdi, elapsed, written); 888 bdi_update_write_bandwidth(bdi, elapsed, written);
811 889
812snapshot: 890snapshot:
891 bdi->dirtied_stamp = dirtied;
813 bdi->written_stamp = written; 892 bdi->written_stamp = written;
814 bdi->bw_time_stamp = now; 893 bdi->bw_time_stamp = now;
815} 894}