author    Wu Fengguang <fengguang.wu@intel.com>  2011-12-06 14:17:17 -0500
committer Wu Fengguang <fengguang.wu@intel.com>  2011-12-18 01:20:30 -0500
commit    5b9b357435a51ff14835c06d8b00765a4c68f313 (patch)
tree      858bdc6ce0984aa0a9abc88d4c53931e6b299312 /mm/page-writeback.c
parent    7ccb9ad5364d6ac0c803096c67e76a7545cf7a77 (diff)
writeback: avoid tiny dirty poll intervals
The LKP tests see a big 56% regression for the case fio_mmap_randwrite_64k. Shaohua managed to root-cause it to the much smaller dirty pause times and hence the much more frequent invocations of the IO-less balance_dirty_pages(). Since fio_mmap_randwrite_64k effectively contains both reads and writes, the more frequent pauses triggered more idling in the cfq IO scheduler.

The solution is to increase the pause time all the way up to the max 200ms in this case, which is found to restore most of the performance. This will help reduce CPU overheads in other cases, too.

Note that I don't expect many performance-critical workloads to run this access pattern: the mmap read-on-write is rather inefficient and could be avoided by doing normal write syscalls.

CC: Jan Kara <jack@suse.cz>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reported-by: Li Shaohua <shaohua.li@intel.com>
Tested-by: Li Shaohua <shaohua.li@intel.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
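
To make the mechanism concrete, here is a minimal stand-alone sketch of the pause-stretching arithmetic described above. It is not kernel code: HZ=1000, PAGE_SHIFT=12 and the two example dirty_ratelimit values are assumptions chosen purely for illustration.

#include <stdio.h>

/* Illustrative constants; the real values come from the kernel config. */
#define HZ			1000
#define PAGE_SHIFT		12
#define MAX_PAUSE		(HZ / 5)			/* 200ms */
#define DIRTY_POLL_THRESH	(128 >> (PAGE_SHIFT - 10))	/* 32 pages */

/* dirty_ratelimit is in pages per second, t in jiffies. */
static void demo(long dirty_ratelimit)
{
	long t = 4;					/* a tiny candidate pause */
	long pages = dirty_ratelimit * t / 1024;	/* 1024 == roundup_pow_of_two(HZ) */

	if (pages < DIRTY_POLL_THRESH) {
		/* Stretch the pause all the way up to the max... */
		t = MAX_PAUSE;
		pages = dirty_ratelimit * t / 1024;
		if (pages > DIRTY_POLL_THRESH) {
			/* ...but no further than needed to reach the threshold. */
			pages = DIRTY_POLL_THRESH;
			t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
		}
	}
	printf("ratelimit %5ld pages/s -> pause %3ld jiffies (%2ld pages per poll)\n",
	       dirty_ratelimit, t, pages);
}

int main(void)
{
	demo(16);	/* very slow case: pause pinned at the 200ms max */
	demo(500);	/* faster case: pause sized to reach 32 pages per poll */
	return 0;
}

With these example numbers, a task that dirties only a few pages per second sleeps a full 200ms between balance_dirty_pages() polls instead of waking up every few jiffies, which is the effect the commit message credits with restoring the fio_mmap_randwrite_64k performance.
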
Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--  mm/page-writeback.c  | 25
1 file changed, 24 insertions(+), 1 deletion(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5830991f261a..422cf4edab47 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -42,6 +42,12 @@
 #define MAX_PAUSE		max(HZ/5, 1)
 
 /*
+ * Try to keep balance_dirty_pages() call intervals higher than this many pages
+ * by raising pause time to max_pause when falls below it.
+ */
+#define DIRTY_POLL_THRESH	(128 >> (PAGE_SHIFT - 10))
+
+/*
  * Estimate write bandwidth at 200ms intervals.
  */
 #define BANDWIDTH_INTERVAL	max(HZ/5, 1)
@@ -1026,6 +1032,23 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
 	t = min(t, 1 + max_pause / 2);
 	pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
 
+	/*
+	 * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
+	 * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
+	 * When the 16 consecutive reads are often interrupted by some dirty
+	 * throttling pause during the async writes, cfq will go into idles
+	 * (deadline is fine). So push nr_dirtied_pause as high as possible
+	 * until reaches DIRTY_POLL_THRESH=32 pages.
+	 */
+	if (pages < DIRTY_POLL_THRESH) {
+		t = max_pause;
+		pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
+		if (pages > DIRTY_POLL_THRESH) {
+			pages = DIRTY_POLL_THRESH;
+			t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
+		}
+	}
+
 	pause = HZ * pages / (task_ratelimit + 1);
 	if (pause > max_pause) {
 		t = max_pause;
@@ -1036,7 +1059,7 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
 	/*
 	 * The minimal pause time will normally be half the target pause time.
 	 */
-	return 1 + t / 2;
+	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
 }
 
 /*
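
Two smaller details of the patch, sketched stand-alone below (the page sizes and jiffy values are example assumptions, not values taken from the patch): DIRTY_POLL_THRESH always works out to 128KB worth of pages regardless of page size, and the changed return statement only hands back the usual "1 + t / 2" minimum when that threshold could actually be reached; otherwise the whole stretched pause is used as the minimum.

#include <stdio.h>

/* DIRTY_POLL_THRESH recomputed for a given page size: always 128KB worth. */
static long dirty_poll_thresh(int page_shift)
{
	return 128 >> (page_shift - 10);
}

/* The new return expression of bdi_min_pause(), in isolation. */
static long min_pause(long pages, long t, long thresh)
{
	return pages >= thresh ? 1 + t / 2 : t;
}

int main(void)
{
	printf("4KB pages:  DIRTY_POLL_THRESH = %ld pages\n", dirty_poll_thresh(12));	/* 32 */
	printf("64KB pages: DIRTY_POLL_THRESH = %ld pages\n", dirty_poll_thresh(16));	/*  2 */

	/* Threshold reached: the minimum pause stays near half the target. */
	printf("min pause for 32 pages, t = 64:  %ld jiffies\n", min_pause(32, 64, 32));
	/* Even the 200ms max could not reach it: sleep the whole pause. */
	printf("min pause for  3 pages, t = 200: %ld jiffies\n", min_pause(3, 200, 32));
	return 0;
}
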