author     Wu Fengguang <fengguang.wu@intel.com>    2011-06-12 12:51:31 -0400
committer  Wu Fengguang <fengguang.wu@intel.com>    2011-10-03 09:08:56 -0400
commit     be3ffa276446e1b691a2bf84e7621e5a6fb49db9
tree       ca1b112195a9a8b63265f3204748cb23cff5b653 /mm/page-writeback.c
parent     af6a311384bce6c88e15c80ab22ab051a918b4eb
writeback: dirty rate control
It's all about bdi->dirty_ratelimit, which aims to be (write_bw / N)
when there are N dd tasks.

On write() syscall, use bdi->dirty_ratelimit
============================================

    balance_dirty_pages(pages_dirtied)
    {
        task_ratelimit = bdi->dirty_ratelimit * bdi_position_ratio();
        pause = pages_dirtied / task_ratelimit;
        sleep(pause);
    }

On every 200ms, update bdi->dirty_ratelimit
===========================================

    bdi_update_dirty_ratelimit()
    {
        task_ratelimit = bdi->dirty_ratelimit * bdi_position_ratio();
        balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate;
        bdi->dirty_ratelimit = balanced_dirty_ratelimit;
    }

Estimation of balanced bdi->dirty_ratelimit
===========================================

balanced task_ratelimit
-----------------------

balance_dirty_pages() needs to throttle tasks dirtying pages such that
the total amount of dirty pages stays below the specified dirty limit,
in order to avoid memory deadlocks. Furthermore we desire fairness, in
that tasks get throttled proportionally to the amount of pages they
dirty.

IOW we want to throttle tasks such that we match the dirty rate to the
writeout bandwidth; this yields a stable amount of dirty pages:

        dirty_rate == write_bw                                       (1)

The fairness requirement gives us:

        task_ratelimit = balanced_dirty_ratelimit
                      == write_bw / N                                (2)

where N is the number of dd tasks. We don't know N beforehand, but can
still estimate balanced_dirty_ratelimit within 200ms.

Start by throttling each dd task at rate

        task_ratelimit = task_ratelimit_0                            (3)
        (any non-zero initial value is OK)

After 200ms, we measure

        dirty_rate = # of pages dirtied by all dd's / 200ms
        write_bw   = # of pages written to the disk / 200ms

For the aggressive dd dirtiers, the equality holds:

        dirty_rate == N * task_rate
                   == N * task_ratelimit_0                           (4)
Or
        task_ratelimit_0 == dirty_rate / N                           (5)

Now we conclude that the balanced task ratelimit can be estimated by

                                                      write_bw
        balanced_dirty_ratelimit = task_ratelimit_0 * ----------    (6)
                                                      dirty_rate

because with (4) and (5) we get the desired equality (1):

                                                       write_bw
        balanced_dirty_ratelimit == (dirty_rate / N) * ----------
                                                       dirty_rate
                                 == write_bw / N

Then, using the balanced task ratelimit, we can compute task pause
times like:

        task_pause = task->nr_dirtied / task_ratelimit

task_ratelimit with position control
------------------------------------

However, while the above gives us a means of matching the dirty rate to
the writeout bandwidth, it at best provides a stable dirty page count
(assuming a static system). In order to control the dirty page count
such that it is high enough to provide performance, but does not exceed
the specified limit, we need another control.

The dirty position control works by extending (2) to

        task_ratelimit = balanced_dirty_ratelimit * pos_ratio        (7)

where pos_ratio is a negative feedback function subject to

        1) f(setpoint) = 1.0
        2) df/dx < 0

That is, if the dirty pages are ABOVE the setpoint, we throttle each
task a bit more HEAVILY than balanced_dirty_ratelimit, so that dirty
pages are created more slowly than they are cleaned and the dirty count
thus DROPS back to the setpoint (and the reverse below the setpoint).

Based on (7) and the assumption that both dirty_ratelimit and pos_ratio
remain CONSTANT for the past 200ms, we get

        task_ratelimit_0 = balanced_dirty_ratelimit * pos_ratio      (8)

Putting (8) into (6), we get the formula used in
bdi_update_dirty_ratelimit():

                                                write_bw
        balanced_dirty_ratelimit *= pos_ratio * ----------           (9)
                                                dirty_rate

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
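[Editorial illustration, not part of the commit.] As a concrete check of
estimation (6), with invented numbers: suppose N = 4 dd tasks,
write_bw = 100 MB/s and an arbitrary task_ratelimit_0 = 1 MB/s. The
measured dirty_rate is then 4 MB/s, so (6) yields 1 * 100 / 4 = 25 MB/s
= write_bw / N, and further iterations stay at that fixed point. The
standalone userspace C sketch below replays the iteration:

    /* Illustrative userspace sketch, not kernel code: iterate formula (6)
     * and watch task_ratelimit converge to write_bw / N.
     * All numbers are invented for demonstration. */
    #include <stdio.h>

    int main(void)
    {
            const double write_bw = 100.0;  /* MB/s, measured writeout bandwidth */
            const int N = 4;                /* number of dd tasks, unknown to the kernel */
            double ratelimit = 1.0;         /* task_ratelimit_0: any non-zero value */

            for (int i = 1; i <= 3; i++) {
                    /* per (4): N throttled dirtiers produce this dirty rate */
                    double dirty_rate = N * ratelimit;
                    /* formula (6): new rate = old rate * write_bw / dirty_rate */
                    ratelimit *= write_bw / dirty_rate;
                    printf("step %d: task_ratelimit = %.2f MB/s\n", i, ratelimit);
            }
            /* prints 25.00 at every step: once at the fixed point,
             * dirty_rate == write_bw and the multiplier in (6) is 1 */
            return 0;
    }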
Diffstat (limited to 'mm/page-writeback.c')

-rw-r--r--  mm/page-writeback.c | 83 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 81 insertions(+), 2 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 4b954c9fe846..1721b6523c04 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -777,6 +777,79 @@ static void global_update_bandwidth(unsigned long thresh,
 	spin_unlock(&dirty_lock);
 }
 
+/*
+ * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
+ *
+ * Normal bdi tasks will be curbed at or below it in the long term.
+ * Obviously it should be around (write_bw / N) when there are N dd tasks.
+ */
+static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
+				       unsigned long thresh,
+				       unsigned long bg_thresh,
+				       unsigned long dirty,
+				       unsigned long bdi_thresh,
+				       unsigned long bdi_dirty,
+				       unsigned long dirtied,
+				       unsigned long elapsed)
+{
+	unsigned long write_bw = bdi->avg_write_bandwidth;
+	unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
+	unsigned long dirty_rate;
+	unsigned long task_ratelimit;
+	unsigned long balanced_dirty_ratelimit;
+	unsigned long pos_ratio;
+
+	/*
+	 * The dirty rate will match the writeout rate in the long term,
+	 * except when dirty pages are truncated by userspace or re-dirtied
+	 * by the FS.
+	 */
+	dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+
+	pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
+				       bdi_thresh, bdi_dirty);
+	/*
+	 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
+	 */
+	task_ratelimit = (u64)dirty_ratelimit *
+					pos_ratio >> RATELIMIT_CALC_SHIFT;
+	task_ratelimit++; /* it helps ramp up dirty_ratelimit from tiny values */
+
+	/*
+	 * A linear estimation of the "balanced" throttle rate. The theory is
+	 * that if there are N dd tasks, each throttled at task_ratelimit, the
+	 * bdi's dirty_rate will be measured to be (N * task_ratelimit). So
+	 * the formula below will yield the balanced rate limit (write_bw / N).
+	 *
+	 * Note that the expanded form is not a pure rate feedback:
+	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate)              (1)
+	 * but also takes pos_ratio into account:
+	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio  (2)
+	 *
+	 * (1) is not realistic because pos_ratio also takes part in balancing
+	 * the dirty rate. Consider the state
+	 *	pos_ratio = 0.5                                              (3)
+	 *	rate = 2 * (write_bw / N)                                    (4)
+	 * If (1) is used, it will get stuck in that state! Because each dd
+	 * will be throttled at
+	 *	task_ratelimit = pos_ratio * rate = (write_bw / N)           (5)
+	 * yielding
+	 *	dirty_rate = N * task_ratelimit = write_bw                   (6)
+	 * and putting (6) into (1) we get
+	 *	rate_(i+1) = rate_(i)                                        (7)
+	 *
+	 * So we end up using (2) to always keep
+	 *	rate_(i+1) ~= (write_bw / N)                                 (8)
+	 * regardless of the value of pos_ratio. As long as (8) is satisfied,
+	 * pos_ratio is able to drive itself to 1.0, which is not only where
+	 * the dirty count meets the setpoint, but also where the slope of
+	 * pos_ratio is flattest and hence task_ratelimit fluctuates least.
+	 */
+	balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
+					   dirty_rate | 1);
+
+	bdi->dirty_ratelimit = max(balanced_dirty_ratelimit, 1UL);
+}
+
 void __bdi_update_bandwidth(struct backing_dev_info *bdi,
 			    unsigned long thresh,
 			    unsigned long bg_thresh,
@@ -787,6 +860,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
 {
 	unsigned long now = jiffies;
 	unsigned long elapsed = now - bdi->bw_time_stamp;
+	unsigned long dirtied;
 	unsigned long written;
 
 	/*
@@ -795,6 +869,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
 	if (elapsed < BANDWIDTH_INTERVAL)
 		return;
 
+	dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
 	written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
 
 	/*
@@ -804,12 +879,16 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
 	if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
 		goto snapshot;
 
-	if (thresh)
+	if (thresh) {
 		global_update_bandwidth(thresh, dirty, now);
-
+		bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
+					   bdi_thresh, bdi_dirty,
+					   dirtied, elapsed);
+	}
 	bdi_update_write_bandwidth(bdi, elapsed, written);
 
 snapshot:
+	bdi->dirtied_stamp = dirtied;
 	bdi->written_stamp = written;
 	bdi->bw_time_stamp = now;
 }
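[Editorial aside, not part of the patch.] The stuck-state argument in
the new comment block, formulas (1)-(8) above, can be replayed with a
small userspace sketch. This is an illustration only, not kernel code;
pos_ratio is held artificially constant at 0.5, and all numbers are
invented:

    /* Compare pure rate feedback (1) against form (2) used by the patch,
     * starting from the stuck state (3)-(4) in the comment block. */
    #include <stdio.h>

    int main(void)
    {
            const double write_bw = 100.0, N = 4.0, pos_ratio = 0.5;
            double pure = 2.0 * write_bw / N;  /* state (4): rate = 2 * (write_bw / N) */
            double with_pos = pure;

            for (int i = 1; i <= 3; i++) {
                    /* per (5): each dd actually dirties at pos_ratio * rate */
                    double dirty_pure = N * pos_ratio * pure;
                    double dirty_with = N * pos_ratio * with_pos;

                    pure *= write_bw / dirty_pure;                  /* feedback (1) */
                    with_pos *= pos_ratio * write_bw / dirty_with;  /* feedback (2) */
                    printf("step %d: (1) -> %.1f   (2) -> %.1f\n", i, pure, with_pos);
            }
            /* (1) stays stuck at 50.0, as predicted by (7); (2) drops to
             * 25.0 = write_bw / N and stays there, satisfying (8). */
            return 0;
    }

In the kernel, pos_ratio is of course not constant: once (8) holds, the
dirty count drifts toward the setpoint and pos_ratio drives itself back
to 1.0, which is the comment block's closing observation.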