author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2007-10-17 02:25:50 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-17 11:42:45 -0400
commit		04fbfdc14e5f48463820d6b9807daa5e9c92c51f
tree		c62905212c8c6373b2258c7f528398d3c831b075 /mm
parent		145ca25eb2fbd20d4faf1bad4628c7650332058f
mm: per device dirty threshold
Scale writeback cache per backing device, proportional to its writeout speed.

By decoupling the BDI dirty thresholds a number of problems we currently have
will go away, namely:

 - mutual interference starvation (for any number of BDIs);
 - deadlocks with stacked BDIs (loop, FUSE and local NFS mounts).

It might be that all dirty pages are for a single BDI while other BDIs are
idling. By giving each BDI a 'fair' share of the dirty limit, each one can
have dirty pages outstanding and make progress.

A global threshold also creates a deadlock for stacked BDIs; when A writes to
B, and A generates enough dirty pages to get throttled, B will never start
writeback until the dirty pages go away. Again, by giving each BDI its own
'independent' dirty limit, this problem is avoided.

So the problem is to determine how to distribute the total dirty limit across
the BDIs fairly and efficiently. A BDI that has a large dirty limit but does
not have any dirty pages outstanding is a waste.

What is done is to keep a floating proportion between the BDIs based on
writeback completions. This way faster/more active devices get a larger share
than slower/idle devices.

[akpm@linux-foundation.org: fix warnings]
[hugh@veritas.com: fix occasional hang when a task couldn't get out of balance_dirty_pages]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
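To picture the arithmetic: each BDI is granted the global dirty limit scaled by its recent share of writeout completions, and that grant is then clipped against the dirty pages actually available to it. A minimal userspace sketch of the idea (illustrative only; bdi_share() and the sample numbers are assumptions, not code from the patch):

#include <stdio.h>

/* Hypothetical helper: a BDI's slice of the global dirty limit is
 * proportional to its share of recent writeback completions. */
static long bdi_share(long dirty_limit, long bdi_completions, long total_completions)
{
	if (total_completions == 0)
		return 0;
	return (long)((long long)dirty_limit * bdi_completions / total_completions);
}

int main(void)
{
	long dirty_limit = 100000;	/* assumed global dirty threshold, in pages */

	/* A device that did 80% of recent writeout gets 80% of the limit;
	 * an idle or slow device gets correspondingly less. */
	printf("fast bdi: %ld pages\n", bdi_share(dirty_limit, 80, 100));
	printf("slow bdi: %ld pages\n", bdi_share(dirty_limit, 20, 100));
	return 0;
}

In the patch itself this split is computed in get_dirty_limits() via bdi_writeout_fraction() and clipped by clip_bdi_dirty_limit(), with the proportions maintained by the floating-proportion descriptor vm_completions.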
Diffstat (limited to 'mm')
-rw-r--r--	mm/backing-dev.c	19
-rw-r--r--	mm/page-writeback.c	203
2 files changed, 185 insertions(+), 37 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index a47065e084a..b0ceb29da4c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -12,11 +12,17 @@ int bdi_init(struct backing_dev_info *bdi)
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
 		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
-		if (err) {
-			for (j = 0; j < i; j++)
-				percpu_counter_destroy(&bdi->bdi_stat[i]);
-			break;
-		}
+		if (err)
+			goto err;
+	}
+
+	bdi->dirty_exceeded = 0;
+	err = prop_local_init_percpu(&bdi->completions);
+
+	if (err) {
+err:
+		for (j = 0; j < i; j++)
+			percpu_counter_destroy(&bdi->bdi_stat[i]);
 	}
 
 	return err;
@@ -29,6 +35,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+	prop_local_destroy_percpu(&bdi->completions);
 }
 EXPORT_SYMBOL(bdi_destroy);
 
@@ -81,3 +89,4 @@ long congestion_wait(int rw, long timeout)
 	return ret;
 }
 EXPORT_SYMBOL(congestion_wait);
+
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f1d201fdcf9..b0360546ac8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2,6 +2,7 @@
  * mm/page-writeback.c
  *
  * Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  * Contains functions related to writing back dirty pages at the
  * address_space level.
@@ -49,8 +50,6 @@
  */
 static long ratelimit_pages = 32;
 
-static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
-
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
  * non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +102,103 @@ EXPORT_SYMBOL(laptop_mode);
 static void background_writeout(unsigned long _min_pages);
 
 /*
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by keeping a floating proportion between BDIs, based on page
+ * writeback completions [end_page_writeback()]. Those devices that write out
+ * pages fastest will get the larger share, while the slower will get a smaller
+ * share.
+ *
+ * We use page writeout completions because we are interested in getting rid of
+ * dirty pages. Having them written out is the primary goal.
+ *
+ * We introduce a concept of time, a period over which we measure these events,
+ * because demand can/will vary over time. The length of this period itself is
+ * measured in page writeback completions.
+ *
+ */
+static struct prop_descriptor vm_completions;
+
+static unsigned long determine_dirtyable_memory(void);
+
+/*
+ * couple the period to the dirty_ratio:
+ *
+ *   period/2 ~ roundup_pow_of_two(dirty limit)
+ */
+static int calc_period_shift(void)
+{
+	unsigned long dirty_total;
+
+	dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+	return 2 + ilog2(dirty_total - 1);
+}
+
+/*
+ * update the period when the dirty ratio changes.
+ */
+int dirty_ratio_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int old_ratio = vm_dirty_ratio;
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+		int shift = calc_period_shift();
+		prop_change_shift(&vm_completions, shift);
+	}
+	return ret;
+}
+
+/*
+ * Increment the BDI's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+	__prop_inc_percpu(&vm_completions, &bdi->completions);
+}
+
+/*
+ * Obtain an accurate fraction of the BDI's portion.
+ */
+static void bdi_writeout_fraction(struct backing_dev_info *bdi,
+		long *numerator, long *denominator)
+{
+	if (bdi_cap_writeback_dirty(bdi)) {
+		prop_fraction_percpu(&vm_completions, &bdi->completions,
+				numerator, denominator);
+	} else {
+		*numerator = 0;
+		*denominator = 1;
+	}
+}
+
+/*
+ * Clip the earned share of dirty pages to that which is actually available.
+ * This avoids exceeding the total dirty_limit when the floating averages
+ * fluctuate too quickly.
+ */
+static void
+clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+{
+	long avail_dirty;
+
+	avail_dirty = dirty -
+		(global_page_state(NR_FILE_DIRTY) +
+		 global_page_state(NR_WRITEBACK) +
+		 global_page_state(NR_UNSTABLE_NFS));
+
+	if (avail_dirty < 0)
+		avail_dirty = 0;
+
+	avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
+		bdi_stat(bdi, BDI_WRITEBACK);
+
+	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
+}
+
+/*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
  *
@@ -158,8 +254,8 @@ static unsigned long determine_dirtyable_memory(void)
 }
 
 static void
-get_dirty_limits(long *pbackground, long *pdirty,
-		struct address_space *mapping)
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+		struct backing_dev_info *bdi)
 {
 	int background_ratio;		/* Percentages */
 	int dirty_ratio;
@@ -193,6 +289,22 @@ get_dirty_limits(long *pbackground, long *pdirty,
 	}
 	*pbackground = background;
 	*pdirty = dirty;
+
+	if (bdi) {
+		u64 bdi_dirty = dirty;
+		long numerator, denominator;
+
+		/*
+		 * Calculate this BDI's share of the dirty ratio.
+		 */
+		bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+		bdi_dirty *= numerator;
+		do_div(bdi_dirty, denominator);
+
+		*pbdi_dirty = bdi_dirty;
+		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
+	}
 }
 
 /*
@@ -204,9 +316,11 @@ get_dirty_limits(long *pbackground, long *pdirty,
  */
 static void balance_dirty_pages(struct address_space *mapping)
 {
-	long nr_reclaimable;
+	long bdi_nr_reclaimable;
+	long bdi_nr_writeback;
 	long background_thresh;
 	long dirty_thresh;
+	long bdi_thresh;
 	unsigned long pages_written = 0;
 	unsigned long write_chunk = sync_writeback_pages();
 
@@ -221,15 +335,15 @@ static void balance_dirty_pages(struct address_space *mapping)
 		.range_cyclic	= 1,
 	};
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
-		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-					global_page_state(NR_UNSTABLE_NFS);
-		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
-			dirty_thresh)
-				break;
+		get_dirty_limits(&background_thresh, &dirty_thresh,
+				&bdi_thresh, bdi);
+		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+			break;
 
-		if (!dirty_exceeded)
-			dirty_exceeded = 1;
+		if (!bdi->dirty_exceeded)
+			bdi->dirty_exceeded = 1;
 
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
@@ -237,26 +351,42 @@ static void balance_dirty_pages(struct address_space *mapping)
 		 * written to the server's write cache, but has not yet
 		 * been flushed to permanent storage.
 		 */
-		if (nr_reclaimable) {
+		if (bdi_nr_reclaimable) {
 			writeback_inodes(&wbc);
-			get_dirty_limits(&background_thresh,
-					&dirty_thresh, mapping);
-			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-				global_page_state(NR_UNSTABLE_NFS);
-			if (nr_reclaimable +
-				global_page_state(NR_WRITEBACK)
-					<= dirty_thresh)
-						break;
 			pages_written += write_chunk - wbc.nr_to_write;
-			if (pages_written >= write_chunk)
-				break;		/* We've done our duty */
+			get_dirty_limits(&background_thresh, &dirty_thresh,
+					&bdi_thresh, bdi);
+		}
+
+		/*
+		 * In order to avoid the stacked BDI deadlock we need
+		 * to ensure we accurately count the 'dirty' pages when
+		 * the threshold is low.
+		 *
+		 * Otherwise it would be possible to get thresh+n pages
+		 * reported dirty, even though there are thresh-m pages
+		 * actually dirty; with m+n sitting in the percpu
+		 * deltas.
+		 */
+		if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+			bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+		} else if (bdi_nr_reclaimable) {
+			bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 		}
+
+		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+			break;
+		if (pages_written >= write_chunk)
+			break;		/* We've done our duty */
+
 		congestion_wait(WRITE, HZ/10);
 	}
 
-	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
-			<= dirty_thresh && dirty_exceeded)
-		dirty_exceeded = 0;
+	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
+			bdi->dirty_exceeded)
+		bdi->dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
 		return;		/* pdflush is already working this queue */
@@ -270,7 +400,9 @@ static void balance_dirty_pages(struct address_space *mapping)
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-	    (!laptop_mode && (nr_reclaimable > background_thresh)))
+	    (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+			      + global_page_state(NR_UNSTABLE_NFS)
+			      > background_thresh)))
 		pdflush_operation(background_writeout, 0);
 }
 
@@ -306,7 +438,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 	unsigned long *p;
 
 	ratelimit = ratelimit_pages;
-	if (dirty_exceeded)
+	if (mapping->backing_dev_info->dirty_exceeded)
 		ratelimit = 8;
 
 	/*
@@ -342,7 +474,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 	}
 
 	for ( ; ; ) {
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 
 		/*
 		 * Boost the allowable dirty threshold a bit for page
@@ -377,7 +509,7 @@ static void background_writeout(unsigned long _min_pages)
 	long background_thresh;
 	long dirty_thresh;
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 		if (global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) < background_thresh
 				&& min_pages <= 0)
@@ -580,9 +712,14 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
  */
 void __init page_writeback_init(void)
 {
+	int shift;
+
 	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
+
+	shift = calc_period_shift();
+	prop_descriptor_init(&vm_completions, shift);
 }
 
 /**
@@ -988,8 +1125,10 @@ int test_clear_page_writeback(struct page *page)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-			if (bdi_cap_writeback_dirty(bdi))
+			if (bdi_cap_writeback_dirty(bdi)) {
 				__dec_bdi_stat(bdi, BDI_WRITEBACK);
+				__bdi_writeout_inc(bdi);
+			}
 		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
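As a numerical check of the period coupling documented in the new comment block, period/2 ~ roundup_pow_of_two(dirty limit): the userspace sketch below uses assumed example values (4 KiB pages, ~4 GiB of dirtyable memory, vm_dirty_ratio of 10%), not figures taken from the patch.

#include <stdio.h>

/* Userspace stand-in for the kernel's ilog2(): floor(log2(v)). */
static int ilog2_ul(unsigned long v)
{
	int l = -1;

	while (v) {
		v >>= 1;
		l++;
	}
	return l;
}

int main(void)
{
	unsigned long dirtyable = 1048576;			/* ~4 GiB in 4 KiB pages (assumed) */
	unsigned long dirty_total = dirtyable * 10 / 100;	/* vm_dirty_ratio = 10% (assumed) */
	int shift = 2 + ilog2_ul(dirty_total - 1);		/* as in calc_period_shift() */

	printf("dirty_total = %lu pages, shift = %d\n", dirty_total, shift);
	printf("period = %lu completions, period/2 = %lu\n",
	       1UL << shift, 1UL << (shift - 1));
	return 0;
}

With these numbers dirty_total is 104857 pages, shift comes out to 18, and period/2 is 131072, which is indeed roundup_pow_of_two(104857); a larger dirty limit therefore lengthens the period over which the per-BDI completion proportions are measured.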