about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- include/linux/backing-dev.h 4
-rw-r--r-- include/linux/writeback.h 4
-rw-r--r-- kernel/sysctl.c 2
-rw-r--r-- mm/backing-dev.c 19
-rw-r--r-- mm/page-writeback.c 203
5 files changed, 194 insertions, 38 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 4d9222c2f222..48a62baace58 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -10,6 +10,7 @@
10 10
11#include <linux/percpu_counter.h> 11#include <linux/percpu_counter.h>
12#include <linux/log2.h> 12#include <linux/log2.h>
13#include <linux/proportions.h>
13#include <asm/atomic.h> 14#include <asm/atomic.h>
14 15
15struct page; 16struct page;
@@ -44,6 +45,9 @@ struct backing_dev_info {
44 void *unplug_io_data; 45 void *unplug_io_data;
45 46
46 struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; 47 struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
48
49 struct prop_local_percpu completions;
50 int dirty_exceeded;
47}; 51};
48 52
49int bdi_init(struct backing_dev_info *bdi); 53int bdi_init(struct backing_dev_info *bdi);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d1321a81c9c4..52be879793ed 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -97,6 +97,10 @@ extern int dirty_expire_interval;
97extern int block_dump; 97extern int block_dump;
98extern int laptop_mode; 98extern int laptop_mode;
99 99
100extern int dirty_ratio_handler(struct ctl_table *table, int write,
101 struct file *filp, void __user *buffer, size_t *lenp,
102 loff_t *ppos);
103
100struct ctl_table; 104struct ctl_table;
101struct file; 105struct file;
102int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *, 106int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 96efbb859997..c676b5ec88f5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -813,7 +813,7 @@ static ctl_table vm_table[] = {
813 .data = &vm_dirty_ratio, 813 .data = &vm_dirty_ratio,
814 .maxlen = sizeof(vm_dirty_ratio), 814 .maxlen = sizeof(vm_dirty_ratio),
815 .mode = 0644, 815 .mode = 0644,
816 .proc_handler = &proc_dointvec_minmax, 816 .proc_handler = &dirty_ratio_handler,
817 .strategy = &sysctl_intvec, 817 .strategy = &sysctl_intvec,
818 .extra1 = &zero, 818 .extra1 = &zero,
819 .extra2 = &one_hundred, 819 .extra2 = &one_hundred,
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index a47065e084a4..b0ceb29da4c7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -12,11 +12,17 @@ int bdi_init(struct backing_dev_info *bdi)
12 12
13 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 13 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
14 err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0); 14 err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
15 if (err) { 15 if (err)
16 for (j = 0; j < i; j++) 16 goto err;
17 percpu_counter_destroy(&bdi->bdi_stat[i]); 17 }
18 break; 18
19 } 19 bdi->dirty_exceeded = 0;
20 err = prop_local_init_percpu(&bdi->completions);
21
22 if (err) {
23err:
24 for (j = 0; j < i; j++)
25 percpu_counter_destroy(&bdi->bdi_stat[i]);
20 } 26 }
21 27
22 return err; 28 return err;
@@ -29,6 +35,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
29 35
30 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 36 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
31 percpu_counter_destroy(&bdi->bdi_stat[i]); 37 percpu_counter_destroy(&bdi->bdi_stat[i]);
38
39 prop_local_destroy_percpu(&bdi->completions);
32} 40}
33EXPORT_SYMBOL(bdi_destroy); 41EXPORT_SYMBOL(bdi_destroy);
34 42
@@ -81,3 +89,4 @@ long congestion_wait(int rw, long timeout)
81 return ret; 89 return ret;
82} 90}
83EXPORT_SYMBOL(congestion_wait); 91EXPORT_SYMBOL(congestion_wait);
92
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f1d201fdcf9c..b0360546ac86 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2,6 +2,7 @@
2 * mm/page-writeback.c 2 * mm/page-writeback.c
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 * 6 *
6 * Contains functions related to writing back dirty pages at the 7 * Contains functions related to writing back dirty pages at the
7 * address_space level. 8 * address_space level.
@@ -49,8 +50,6 @@
49 */ 50 */
50static long ratelimit_pages = 32; 51static long ratelimit_pages = 32;
51 52
52static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
53
54/* 53/*
55 * When balance_dirty_pages decides that the caller needs to perform some 54 * When balance_dirty_pages decides that the caller needs to perform some
56 * non-background writeback, this is how many pages it will attempt to write. 55 * non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +102,103 @@ EXPORT_SYMBOL(laptop_mode);
103static void background_writeout(unsigned long _min_pages); 102static void background_writeout(unsigned long _min_pages);
104 103
105/* 104/*
105 * Scale the writeback cache size proportional to the relative writeout speeds.
106 *
107 * We do this by keeping a floating proportion between BDIs, based on page
108 * writeback completions [end_page_writeback()]. Those devices that write out
109 * pages fastest will get the larger share, while the slower will get a smaller
110 * share.
111 *
112 * We use page writeout completions because we are interested in getting rid of
113 * dirty pages. Having them written out is the primary goal.
114 *
115 * We introduce a concept of time, a period over which we measure these events,
116 * because demand can/will vary over time. The length of this period itself is
117 * measured in page writeback completions.
118 *
119 */
120static struct prop_descriptor vm_completions;
121
122static unsigned long determine_dirtyable_memory(void);
123
124/*
125 * couple the period to the dirty_ratio:
126 *
127 * period/2 ~ roundup_pow_of_two(dirty limit)
128 */
129static int calc_period_shift(void)
130{
131 unsigned long dirty_total;
132
133 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
134 return 2 + ilog2(dirty_total - 1);
135}
136
137/*
138 * update the period when the dirty ratio changes.
139 */
140int dirty_ratio_handler(struct ctl_table *table, int write,
141 struct file *filp, void __user *buffer, size_t *lenp,
142 loff_t *ppos)
143{
144 int old_ratio = vm_dirty_ratio;
145 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
146 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
147 int shift = calc_period_shift();
148 prop_change_shift(&vm_completions, shift);
149 }
150 return ret;
151}
152
153/*
154 * Increment the BDI's writeout completion count and the global writeout
155 * completion count. Called from test_clear_page_writeback().
156 */
157static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
158{
159 __prop_inc_percpu(&vm_completions, &bdi->completions);
160}
161
162/*
163 * Obtain an accurate fraction of the BDI's portion.
164 */
165static void bdi_writeout_fraction(struct backing_dev_info *bdi,
166 long *numerator, long *denominator)
167{
168 if (bdi_cap_writeback_dirty(bdi)) {
169 prop_fraction_percpu(&vm_completions, &bdi->completions,
170 numerator, denominator);
171 } else {
172 *numerator = 0;
173 *denominator = 1;
174 }
175}
176
177/*
178 * Clip the earned share of dirty pages to that which is actually available.
179 * This avoids exceeding the total dirty_limit when the floating averages
180 * fluctuate too quickly.
181 */
182static void
183clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
184{
185 long avail_dirty;
186
187 avail_dirty = dirty -
188 (global_page_state(NR_FILE_DIRTY) +
189 global_page_state(NR_WRITEBACK) +
190 global_page_state(NR_UNSTABLE_NFS));
191
192 if (avail_dirty < 0)
193 avail_dirty = 0;
194
195 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
196 bdi_stat(bdi, BDI_WRITEBACK);
197
198 *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
199}
200
201/*
106 * Work out the current dirty-memory clamping and background writeout 202 * Work out the current dirty-memory clamping and background writeout
107 * thresholds. 203 * thresholds.
108 * 204 *
@@ -158,8 +254,8 @@ static unsigned long determine_dirtyable_memory(void)
158} 254}
159 255
160static void 256static void
161get_dirty_limits(long *pbackground, long *pdirty, 257get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
162 struct address_space *mapping) 258 struct backing_dev_info *bdi)
163{ 259{
164 int background_ratio; /* Percentages */ 260 int background_ratio; /* Percentages */
165 int dirty_ratio; 261 int dirty_ratio;
@@ -193,6 +289,22 @@ get_dirty_limits(long *pbackground, long *pdirty,
193 } 289 }
194 *pbackground = background; 290 *pbackground = background;
195 *pdirty = dirty; 291 *pdirty = dirty;
292
293 if (bdi) {
294 u64 bdi_dirty = dirty;
295 long numerator, denominator;
296
297 /*
298 * Calculate this BDI's share of the dirty ratio.
299 */
300 bdi_writeout_fraction(bdi, &numerator, &denominator);
301
302 bdi_dirty *= numerator;
303 do_div(bdi_dirty, denominator);
304
305 *pbdi_dirty = bdi_dirty;
306 clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
307 }
196} 308}
197 309
198/* 310/*
@@ -204,9 +316,11 @@ get_dirty_limits(long *pbackground, long *pdirty,
204 */ 316 */
205static void balance_dirty_pages(struct address_space *mapping) 317static void balance_dirty_pages(struct address_space *mapping)
206{ 318{
207 long nr_reclaimable; 319 long bdi_nr_reclaimable;
320 long bdi_nr_writeback;
208 long background_thresh; 321 long background_thresh;
209 long dirty_thresh; 322 long dirty_thresh;
323 long bdi_thresh;
210 unsigned long pages_written = 0; 324 unsigned long pages_written = 0;
211 unsigned long write_chunk = sync_writeback_pages(); 325 unsigned long write_chunk = sync_writeback_pages();
212 326
@@ -221,15 +335,15 @@ static void balance_dirty_pages(struct address_space *mapping)
221 .range_cyclic = 1, 335 .range_cyclic = 1,
222 }; 336 };
223 337
224 get_dirty_limits(&background_thresh, &dirty_thresh, mapping); 338 get_dirty_limits(&background_thresh, &dirty_thresh,
225 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 339 &bdi_thresh, bdi);
226 global_page_state(NR_UNSTABLE_NFS); 340 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
227 if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= 341 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
228 dirty_thresh) 342 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
229 break; 343 break;
230 344
231 if (!dirty_exceeded) 345 if (!bdi->dirty_exceeded)
232 dirty_exceeded = 1; 346 bdi->dirty_exceeded = 1;
233 347
234 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 348 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
235 * Unstable writes are a feature of certain networked 349 * Unstable writes are a feature of certain networked
@@ -237,26 +351,42 @@ static void balance_dirty_pages(struct address_space *mapping)
237 * written to the server's write cache, but has not yet 351 * written to the server's write cache, but has not yet
238 * been flushed to permanent storage. 352 * been flushed to permanent storage.
239 */ 353 */
240 if (nr_reclaimable) { 354 if (bdi_nr_reclaimable) {
241 writeback_inodes(&wbc); 355 writeback_inodes(&wbc);
242 get_dirty_limits(&background_thresh,
243 &dirty_thresh, mapping);
244 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
245 global_page_state(NR_UNSTABLE_NFS);
246 if (nr_reclaimable +
247 global_page_state(NR_WRITEBACK)
248 <= dirty_thresh)
249 break;
250 pages_written += write_chunk - wbc.nr_to_write; 356 pages_written += write_chunk - wbc.nr_to_write;
251 if (pages_written >= write_chunk) 357 get_dirty_limits(&background_thresh, &dirty_thresh,
252 break; /* We've done our duty */ 358 &bdi_thresh, bdi);
359 }
360
361 /*
362 * In order to avoid the stacked BDI deadlock we need
363 * to ensure we accurately count the 'dirty' pages when
364 * the threshold is low.
365 *
366 * Otherwise it would be possible to get thresh+n pages
367 * reported dirty, even though there are thresh-m pages
368 * actually dirty; with m+n sitting in the percpu
369 * deltas.
370 */
371 if (bdi_thresh < 2*bdi_stat_error(bdi)) {
372 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
373 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
374 } else if (bdi_nr_reclaimable) {
375 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
376 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
253 } 377 }
378
379 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
380 break;
381 if (pages_written >= write_chunk)
382 break; /* We've done our duty */
383
254 congestion_wait(WRITE, HZ/10); 384 congestion_wait(WRITE, HZ/10);
255 } 385 }
256 386
257 if (nr_reclaimable + global_page_state(NR_WRITEBACK) 387 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
258 <= dirty_thresh && dirty_exceeded) 388 bdi->dirty_exceeded)
259 dirty_exceeded = 0; 389 bdi->dirty_exceeded = 0;
260 390
261 if (writeback_in_progress(bdi)) 391 if (writeback_in_progress(bdi))
262 return; /* pdflush is already working this queue */ 392 return; /* pdflush is already working this queue */
@@ -270,7 +400,9 @@ static void balance_dirty_pages(struct address_space *mapping)
270 * background_thresh, to keep the amount of dirty memory low. 400 * background_thresh, to keep the amount of dirty memory low.
271 */ 401 */
272 if ((laptop_mode && pages_written) || 402 if ((laptop_mode && pages_written) ||
273 (!laptop_mode && (nr_reclaimable > background_thresh))) 403 (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
404 + global_page_state(NR_UNSTABLE_NFS)
405 > background_thresh)))
274 pdflush_operation(background_writeout, 0); 406 pdflush_operation(background_writeout, 0);
275} 407}
276 408
@@ -306,7 +438,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
306 unsigned long *p; 438 unsigned long *p;
307 439
308 ratelimit = ratelimit_pages; 440 ratelimit = ratelimit_pages;
309 if (dirty_exceeded) 441 if (mapping->backing_dev_info->dirty_exceeded)
310 ratelimit = 8; 442 ratelimit = 8;
311 443
312 /* 444 /*
@@ -342,7 +474,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
342 } 474 }
343 475
344 for ( ; ; ) { 476 for ( ; ; ) {
345 get_dirty_limits(&background_thresh, &dirty_thresh, NULL); 477 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
346 478
347 /* 479 /*
348 * Boost the allowable dirty threshold a bit for page 480 * Boost the allowable dirty threshold a bit for page
@@ -377,7 +509,7 @@ static void background_writeout(unsigned long _min_pages)
377 long background_thresh; 509 long background_thresh;
378 long dirty_thresh; 510 long dirty_thresh;
379 511
380 get_dirty_limits(&background_thresh, &dirty_thresh, NULL); 512 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
381 if (global_page_state(NR_FILE_DIRTY) + 513 if (global_page_state(NR_FILE_DIRTY) +
382 global_page_state(NR_UNSTABLE_NFS) < background_thresh 514 global_page_state(NR_UNSTABLE_NFS) < background_thresh
383 && min_pages <= 0) 515 && min_pages <= 0)
@@ -580,9 +712,14 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
580 */ 712 */
581void __init page_writeback_init(void) 713void __init page_writeback_init(void)
582{ 714{
715 int shift;
716
583 mod_timer(&wb_timer, jiffies + dirty_writeback_interval); 717 mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
584 writeback_set_ratelimit(); 718 writeback_set_ratelimit();
585 register_cpu_notifier(&ratelimit_nb); 719 register_cpu_notifier(&ratelimit_nb);
720
721 shift = calc_period_shift();
722 prop_descriptor_init(&vm_completions, shift);
586} 723}
587 724
588/** 725/**
@@ -988,8 +1125,10 @@ int test_clear_page_writeback(struct page *page)
988 radix_tree_tag_clear(&mapping->page_tree, 1125 radix_tree_tag_clear(&mapping->page_tree,
989 page_index(page), 1126 page_index(page),
990 PAGECACHE_TAG_WRITEBACK); 1127 PAGECACHE_TAG_WRITEBACK);
991 if (bdi_cap_writeback_dirty(bdi)) 1128 if (bdi_cap_writeback_dirty(bdi)) {
992 __dec_bdi_stat(bdi, BDI_WRITEBACK); 1129 __dec_bdi_stat(bdi, BDI_WRITEBACK);
1130 __bdi_writeout_inc(bdi);
1131 }
993 } 1132 }
994 write_unlock_irqrestore(&mapping->tree_lock, flags); 1133 write_unlock_irqrestore(&mapping->tree_lock, flags);
995 } else { 1134 } else {