path: root/mm/page-writeback.c
author		Wu Fengguang <fengguang.wu@intel.com>	2011-06-11 20:10:12 -0400
committer	Wu Fengguang <fengguang.wu@intel.com>	2011-10-03 09:08:57 -0400
commit		9d823e8f6b1b7b39f952d7d1795f29162143a433 (patch)
tree		2ef4c0d29353452dd2f894e7dbd240a31bdd0a02 /mm/page-writeback.c
parent		7381131cbcf7e15d201a0ffd782a4698efe4e740 (diff)
writeback: per task dirty rate limit
Add two fields to task_struct:

1) account dirtied pages in the individual tasks, for accuracy
2) per-task balance_dirty_pages() call intervals, for flexibility

The balance_dirty_pages() call interval (i.e. nr_dirtied_pause) will scale
near-sqrt to the safety gap between the dirty pages and the threshold.

The main problem of a per-task nr_dirtied is that if 1k+ tasks start dirtying
pages at exactly the same time, each task will be assigned a large initial
nr_dirtied_pause, so the dirty threshold will be exceeded long before each
task reaches its nr_dirtied_pause and hence calls balance_dirty_pages().

The solution is to watch the number of pages dirtied on each CPU in between
the calls into balance_dirty_pages(). If it exceeds ratelimit_pages (3% of
the dirty threshold), force a call to balance_dirty_pages() for a chance to
set bdi->dirty_exceeded. In normal situations this safeguarding condition is
not expected to trigger at all.

On the sqrt in dirty_poll_interval(): it will serve as an initial guess when
dirty pages are still in the freerun area. When dirty pages are floating
inside the dirty control scope [freerun, limit], a followup patch will use a
refined dirty poll interval to get the desired pause time.

    thresh-dirty (MB)    sqrt
                    1      16
                    2      22
                    4      32
                    8      45
                   16      64
                   32      90
                   64     128
                  128     181
                  256     256
                  512     362
                 1024     512

The above table means that, given a 1MB (or 1GB) gap and the dd tasks polling
balance_dirty_pages() on every 16 (or 512) pages, the dirty limit won't be
exceeded as long as there are fewer than 16 (or 512) concurrent dd's.

So sqrt naturally leads to lower overhead and allows more concurrent tasks on
large memory servers, which have large (thresh-freerun) gaps.

peter: keep the per-CPU ratelimit for safeguarding the 1k+ tasks case

CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Andrea Righi <andrea@betterlinux.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
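For reference, the scaling above can be reproduced outside the kernel. The
following user-space sketch (an illustration only, not part of the patch;
it assumes 4KB pages) recomputes the table, printing both the exact sqrt of
the gap in pages and the value dirty_poll_interval() actually returns, which
is that sqrt rounded down to a power of two:

/* build: cc -O2 poll_interval.c -lm */
#include <math.h>
#include <stdio.h>

/* floor(log2(x)) for x > 0, standing in for the kernel's ilog2() */
static unsigned long ilog2_ul(unsigned long x)
{
	unsigned long r = 0;

	while (x >>= 1)
		r++;
	return r;
}

/* Same formula as the dirty_poll_interval() added by this patch. */
static unsigned long dirty_poll_interval(unsigned long dirty, unsigned long thresh)
{
	if (thresh > dirty)
		return 1UL << (ilog2_ul(thresh - dirty) >> 1);
	return 1;
}

int main(void)
{
	unsigned long mb;

	/* Gaps of 1MB..1024MB expressed in 4KB pages. */
	for (mb = 1; mb <= 1024; mb *= 2) {
		unsigned long gap = mb << (20 - 12);

		printf("%5luMB gap: sqrt=%4lu  poll every %4lu dirtied pages\n",
		       mb, (unsigned long)sqrt((double)gap),
		       dirty_poll_interval(0, gap));
	}
	return 0;
}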
Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--	mm/page-writeback.c | 89
1 file changed, 50 insertions(+), 39 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d4a6e91bd9e5..daff320d263f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -54,20 +54,6 @@
  */
 static long ratelimit_pages = 32;
 
-/*
- * When balance_dirty_pages decides that the caller needs to perform some
- * non-background writeback, this is how many pages it will attempt to write.
- * It should be somewhat larger than dirtied pages to ensure that reasonably
- * large amounts of I/O are submitted.
- */
-static inline long sync_writeback_pages(unsigned long dirtied)
-{
-	if (dirtied < ratelimit_pages)
-		dirtied = ratelimit_pages;
-
-	return dirtied + dirtied / 2;
-}
-
 /* The following parameters are exported via /proc/sys/vm */
 
 /*
@@ -169,6 +155,8 @@ static void update_completion_period(void)
 	int shift = calc_period_shift();
 	prop_change_shift(&vm_completions, shift);
 	prop_change_shift(&vm_dirties, shift);
+
+	writeback_set_ratelimit();
 }
 
 int dirty_background_ratio_handler(struct ctl_table *table, int write,
@@ -979,6 +967,23 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
 }
 
 /*
+ * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
+ * will look to see if it needs to start dirty throttling.
+ *
+ * If dirty_poll_interval is too low, big NUMA machines will call the expensive
+ * global_page_state() too often. So scale it near-sqrt to the safety margin
+ * (the number of pages we may dirty without exceeding the dirty limits).
+ */
+static unsigned long dirty_poll_interval(unsigned long dirty,
+					 unsigned long thresh)
+{
+	if (thresh > dirty)
+		return 1UL << (ilog2(thresh - dirty) >> 1);
+
+	return 1;
+}
+
+/*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
  * the caller to perform writeback if the system is over `vm_dirty_ratio'.
@@ -1112,6 +1117,9 @@ static void balance_dirty_pages(struct address_space *mapping,
 	if (clear_dirty_exceeded && bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;
 
+	current->nr_dirtied = 0;
+	current->nr_dirtied_pause = dirty_poll_interval(nr_dirty, dirty_thresh);
+
 	if (writeback_in_progress(bdi))
 		return;
 
@@ -1138,7 +1146,7 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
 	}
 }
 
-static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
+static DEFINE_PER_CPU(int, bdp_ratelimits);
 
 /**
  * balance_dirty_pages_ratelimited_nr - balance dirty memory state
@@ -1158,31 +1166,39 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 					unsigned long nr_pages_dirtied)
 {
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
-	unsigned long ratelimit;
-	unsigned long *p;
+	int ratelimit;
+	int *p;
 
 	if (!bdi_cap_account_dirty(bdi))
 		return;
 
-	ratelimit = ratelimit_pages;
-	if (mapping->backing_dev_info->dirty_exceeded)
-		ratelimit = 8;
+	ratelimit = current->nr_dirtied_pause;
+	if (bdi->dirty_exceeded)
+		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
+
+	current->nr_dirtied += nr_pages_dirtied;
 
+	preempt_disable();
 	/*
-	 * Check the rate limiting. Also, we do not want to throttle real-time
-	 * tasks in balance_dirty_pages(). Period.
+	 * This prevents one CPU to accumulate too many dirtied pages without
+	 * calling into balance_dirty_pages(), which can happen when there are
+	 * 1000+ tasks, all of them start dirtying pages at exactly the same
+	 * time, hence all honoured too large initial task->nr_dirtied_pause.
 	 */
-	preempt_disable();
 	p = &__get_cpu_var(bdp_ratelimits);
-	*p += nr_pages_dirtied;
-	if (unlikely(*p >= ratelimit)) {
-		ratelimit = sync_writeback_pages(*p);
+	if (unlikely(current->nr_dirtied >= ratelimit))
 		*p = 0;
-		preempt_enable();
-		balance_dirty_pages(mapping, ratelimit);
-		return;
+	else {
+		*p += nr_pages_dirtied;
+		if (unlikely(*p >= ratelimit_pages)) {
+			*p = 0;
+			ratelimit = 0;
+		}
 	}
 	preempt_enable();
+
+	if (unlikely(current->nr_dirtied >= ratelimit))
+		balance_dirty_pages(mapping, current->nr_dirtied);
 }
 EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
 
@@ -1277,22 +1293,17 @@ void laptop_sync_completion(void)
  *
  * Here we set ratelimit_pages to a level which ensures that when all CPUs are
  * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
- * thresholds before writeback cuts in.
- *
- * But the limit should not be set too high.  Because it also controls the
- * amount of memory which the balance_dirty_pages() caller has to write back.
- * If this is too large then the caller will block on the IO queue all the
- * time.  So limit it to four megabytes - the balance_dirty_pages() caller
- * will write six megabyte chunks, max.
+ * thresholds.
  */
 
 void writeback_set_ratelimit(void)
 {
-	ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	global_dirty_limits(&background_thresh, &dirty_thresh);
+	ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
 	if (ratelimit_pages < 16)
 		ratelimit_pages = 16;
-	if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
-		ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
 }
 
 static int __cpuinit
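To make the restructured ratelimit check above easier to follow, here is a
small user-space model of the decision it implements (a sketch only, not
kernel code: preemption handling, the bdi lookup and the real
balance_dirty_pages() are elided, ratelimit_pages is an assumed constant, the
clamp to 8 pages assumes 4KB pages, and the names merely mirror the patch):

#include <stdio.h>

/* Stand-ins for the two fields this patch adds to task_struct. */
struct task_model {
	int nr_dirtied;		/* pages dirtied since the last balance call */
	int nr_dirtied_pause;	/* per-task poll interval */
};

static int bdp_ratelimits;			/* models one CPU's bdp_ratelimits counter */
static const int ratelimit_pages = 1024;	/* assumed 3%-of-threshold safeguard */

/* Returns 1 when balance_dirty_pages() would be entered. */
static int would_balance(struct task_model *tsk, int nr_pages_dirtied,
			 int dirty_exceeded)
{
	int ratelimit = tsk->nr_dirtied_pause;

	if (dirty_exceeded && ratelimit > 8)
		ratelimit = 8;		/* 32KB worth of 4KB pages, as in the patch */

	tsk->nr_dirtied += nr_pages_dirtied;

	if (tsk->nr_dirtied >= ratelimit) {
		bdp_ratelimits = 0;	/* this task is about to balance anyway */
	} else {
		/* Safeguard: many tasks on one CPU, none at its pause yet. */
		bdp_ratelimits += nr_pages_dirtied;
		if (bdp_ratelimits >= ratelimit_pages) {
			bdp_ratelimits = 0;
			ratelimit = 0;	/* force the balance call */
		}
	}
	return tsk->nr_dirtied >= ratelimit;
}

int main(void)
{
	struct task_model tsk = { .nr_dirtied = 0, .nr_dirtied_pause = 256 };
	int i;

	/* One task dirtying 64 pages per call; it balances on every 4th call. */
	for (i = 0; i < 8; i++) {
		int balance = would_balance(&tsk, 64, 0);

		printf("nr_dirtied=%3d -> balance=%d\n", tsk.nr_dirtied, balance);
		if (balance)
			tsk.nr_dirtied = 0;	/* balance_dirty_pages() resets it */
	}
	return 0;
}

In the real code the same two checks on current->nr_dirtied bracket the
preempt_disable()/preempt_enable() region, so the per-CPU counter is only
touched with preemption disabled.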