author		Peter Zijlstra <a.p.zijlstra@chello.nl>		2007-10-17 02:25:50 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-17 11:42:45 -0400
commit		04fbfdc14e5f48463820d6b9807daa5e9c92c51f (patch)
tree		c62905212c8c6373b2258c7f528398d3c831b075
parent		145ca25eb2fbd20d4faf1bad4628c7650332058f (diff)
mm: per device dirty threshold
Scale writeback cache per backing device, proportional to its writeout speed.

By decoupling the BDI dirty thresholds a number of problems we currently have
will go away, namely:

- mutual interference starvation (for any number of BDIs);
- deadlocks with stacked BDIs (loop, FUSE and local NFS mounts).

It might be that all dirty pages are for a single BDI while other BDIs are
idling. By giving each BDI a 'fair' share of the dirty limit, each one can
have dirty pages outstanding and make progress.

A global threshold also creates a deadlock for stacked BDIs; when A writes to
B, and A generates enough dirty pages to get throttled, B will never start
writeback until the dirty pages go away. Again, by giving each BDI its own
'independent' dirty limit, this problem is avoided.

So the problem is to determine how to distribute the total dirty limit across
the BDIs fairly and efficiently. A BDI that has a large dirty limit but does
not have any dirty pages outstanding is a waste.

What is done is to keep a floating proportion between the BDIs based on
writeback completions. This way faster/more active devices get a larger share
than slower/idle devices.

[akpm@linux-foundation.org: fix warnings]
[hugh@veritas.com: Fix occasional hang when a task couldn't get out of balance_dirty_pages]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
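The per-device threshold itself is computed in get_dirty_limits() below: each
BDI receives a slice of the global dirty limit proportional to its recent
share of writeback completions, and the slice is clipped so it cannot exceed
what is actually available. The following is only a rough, self-contained
sketch of that arithmetic in plain userspace C; the device names and numbers
are made up for illustration and bdi_share() is not a kernel function.

#include <stdio.h>

/* Illustrative stand-in for one backing device's recent writeout activity. */
struct bdi_sample {
	const char *name;
	unsigned long completions;	/* recent page writeback completions */
};

/*
 * Give each device a share of the global dirty threshold proportional to its
 * completions -- the same division get_dirty_limits() performs below with
 * bdi_writeout_fraction() and do_div().
 */
static unsigned long bdi_share(unsigned long dirty_thresh,
			       unsigned long completions,
			       unsigned long total_completions)
{
	if (!total_completions)
		return 0;
	return (unsigned long long)dirty_thresh * completions /
		total_completions;
}

int main(void)
{
	struct bdi_sample bdis[] = {
		{ "fast-disk", 900 },	/* completes writeback quickly */
		{ "slow-usb",  100 },	/* completes writeback slowly */
	};
	unsigned long dirty_thresh = 100000;	/* global dirty limit, in pages */
	unsigned long total = 0;
	unsigned int i;

	for (i = 0; i < 2; i++)
		total += bdis[i].completions;

	for (i = 0; i < 2; i++)
		printf("%-9s gets %6lu of %lu dirty pages\n", bdis[i].name,
		       bdi_share(dirty_thresh, bdis[i].completions, total),
		       dirty_thresh);

	return 0;
}

With these sample figures the fast disk earns 90000 of the 100000 dirty pages
and the slow USB stick 10000. In the kernel the proportion is 'floating':
prop_fraction_percpu() ages the completion counts over a period derived from
the dirty limit, so an idle device's share decays rather than staying pinned
at its historical value.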
-rw-r--r--	include/linux/backing-dev.h	4
-rw-r--r--	include/linux/writeback.h	4
-rw-r--r--	kernel/sysctl.c	2
-rw-r--r--	mm/backing-dev.c	19
-rw-r--r--	mm/page-writeback.c	203
5 files changed, 194 insertions(+), 38 deletions(-)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 4d9222c2f222..48a62baace58 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -10,6 +10,7 @@
 
 #include <linux/percpu_counter.h>
 #include <linux/log2.h>
+#include <linux/proportions.h>
 #include <asm/atomic.h>
 
 struct page;
@@ -44,6 +45,9 @@ struct backing_dev_info {
 	void *unplug_io_data;
 
 	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
+
+	struct prop_local_percpu completions;
+	int dirty_exceeded;
 };
 
 int bdi_init(struct backing_dev_info *bdi);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d1321a81c9c4..52be879793ed 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -97,6 +97,10 @@ extern int dirty_expire_interval;
 extern int block_dump;
 extern int laptop_mode;
 
+extern int dirty_ratio_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
 struct ctl_table;
 struct file;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 96efbb859997..c676b5ec88f5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -813,7 +813,7 @@ static ctl_table vm_table[] = {
 		.data = &vm_dirty_ratio,
 		.maxlen = sizeof(vm_dirty_ratio),
 		.mode = 0644,
-		.proc_handler = &proc_dointvec_minmax,
+		.proc_handler = &dirty_ratio_handler,
 		.strategy = &sysctl_intvec,
 		.extra1 = &zero,
 		.extra2 = &one_hundred,
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index a47065e084a4..b0ceb29da4c7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -12,11 +12,17 @@ int bdi_init(struct backing_dev_info *bdi)
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
 		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
-		if (err) {
-			for (j = 0; j < i; j++)
-				percpu_counter_destroy(&bdi->bdi_stat[i]);
-			break;
-		}
+		if (err)
+			goto err;
+	}
+
+	bdi->dirty_exceeded = 0;
+	err = prop_local_init_percpu(&bdi->completions);
+
+	if (err) {
+err:
+		for (j = 0; j < i; j++)
+			percpu_counter_destroy(&bdi->bdi_stat[i]);
 	}
 
 	return err;
@@ -29,6 +35,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+	prop_local_destroy_percpu(&bdi->completions);
 }
 EXPORT_SYMBOL(bdi_destroy);
 
@@ -81,3 +89,4 @@ long congestion_wait(int rw, long timeout)
 	return ret;
 }
 EXPORT_SYMBOL(congestion_wait);
+
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f1d201fdcf9c..b0360546ac86 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2,6 +2,7 @@
  * mm/page-writeback.c
  *
  * Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  * Contains functions related to writing back dirty pages at the
  * address_space level.
@@ -49,8 +50,6 @@
  */
 static long ratelimit_pages = 32;
 
-static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
-
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
  * non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +102,103 @@ EXPORT_SYMBOL(laptop_mode);
 static void background_writeout(unsigned long _min_pages);
 
 /*
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by keeping a floating proportion between BDIs, based on page
+ * writeback completions [end_page_writeback()]. Those devices that write out
+ * pages fastest will get the larger share, while the slower will get a smaller
+ * share.
+ *
+ * We use page writeout completions because we are interested in getting rid of
+ * dirty pages. Having them written out is the primary goal.
+ *
+ * We introduce a concept of time, a period over which we measure these events,
+ * because demand can/will vary over time. The length of this period itself is
+ * measured in page writeback completions.
+ *
+ */
+static struct prop_descriptor vm_completions;
+
+static unsigned long determine_dirtyable_memory(void);
+
+/*
+ * couple the period to the dirty_ratio:
+ *
+ *   period/2 ~ roundup_pow_of_two(dirty limit)
+ */
+static int calc_period_shift(void)
+{
+	unsigned long dirty_total;
+
+	dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+	return 2 + ilog2(dirty_total - 1);
+}
+
+/*
+ * update the period when the dirty ratio changes.
+ */
+int dirty_ratio_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int old_ratio = vm_dirty_ratio;
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+		int shift = calc_period_shift();
+		prop_change_shift(&vm_completions, shift);
+	}
+	return ret;
+}
+
+/*
+ * Increment the BDI's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+	__prop_inc_percpu(&vm_completions, &bdi->completions);
+}
+
+/*
+ * Obtain an accurate fraction of the BDI's portion.
+ */
+static void bdi_writeout_fraction(struct backing_dev_info *bdi,
+		long *numerator, long *denominator)
+{
+	if (bdi_cap_writeback_dirty(bdi)) {
+		prop_fraction_percpu(&vm_completions, &bdi->completions,
+				numerator, denominator);
+	} else {
+		*numerator = 0;
+		*denominator = 1;
+	}
+}
+
+/*
+ * Clip the earned share of dirty pages to that which is actually available.
+ * This avoids exceeding the total dirty_limit when the floating averages
+ * fluctuate too quickly.
+ */
+static void
+clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+{
+	long avail_dirty;
+
+	avail_dirty = dirty -
+		(global_page_state(NR_FILE_DIRTY) +
+		 global_page_state(NR_WRITEBACK) +
+		 global_page_state(NR_UNSTABLE_NFS));
+
+	if (avail_dirty < 0)
+		avail_dirty = 0;
+
+	avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
+		bdi_stat(bdi, BDI_WRITEBACK);
+
+	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
+}
+
+/*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
  *
@@ -158,8 +254,8 @@ static unsigned long determine_dirtyable_memory(void)
 }
 
 static void
-get_dirty_limits(long *pbackground, long *pdirty,
-		 struct address_space *mapping)
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+		 struct backing_dev_info *bdi)
 {
 	int background_ratio;		/* Percentages */
 	int dirty_ratio;
@@ -193,6 +289,22 @@ get_dirty_limits(long *pbackground, long *pdirty,
 	}
 	*pbackground = background;
 	*pdirty = dirty;
+
+	if (bdi) {
+		u64 bdi_dirty = dirty;
+		long numerator, denominator;
+
+		/*
+		 * Calculate this BDI's share of the dirty ratio.
+		 */
+		bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+		bdi_dirty *= numerator;
+		do_div(bdi_dirty, denominator);
+
+		*pbdi_dirty = bdi_dirty;
+		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
+	}
 }
 
 /*
@@ -204,9 +316,11 @@ get_dirty_limits(long *pbackground, long *pdirty,
  */
 static void balance_dirty_pages(struct address_space *mapping)
 {
-	long nr_reclaimable;
+	long bdi_nr_reclaimable;
+	long bdi_nr_writeback;
 	long background_thresh;
 	long dirty_thresh;
+	long bdi_thresh;
 	unsigned long pages_written = 0;
 	unsigned long write_chunk = sync_writeback_pages();
 
@@ -221,15 +335,15 @@ static void balance_dirty_pages(struct address_space *mapping)
 			.range_cyclic	= 1,
 		};
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
-		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-					global_page_state(NR_UNSTABLE_NFS);
-		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
-			dirty_thresh)
+		get_dirty_limits(&background_thresh, &dirty_thresh,
+				&bdi_thresh, bdi);
+		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
 				break;
 
-		if (!dirty_exceeded)
-			dirty_exceeded = 1;
+		if (!bdi->dirty_exceeded)
+			bdi->dirty_exceeded = 1;
 
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
@@ -237,26 +351,42 @@ static void balance_dirty_pages(struct address_space *mapping)
 		 * written to the server's write cache, but has not yet
 		 * been flushed to permanent storage.
 		 */
-		if (nr_reclaimable) {
+		if (bdi_nr_reclaimable) {
 			writeback_inodes(&wbc);
-			get_dirty_limits(&background_thresh,
-					&dirty_thresh, mapping);
-			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-					global_page_state(NR_UNSTABLE_NFS);
-			if (nr_reclaimable +
-				global_page_state(NR_WRITEBACK)
-					<= dirty_thresh)
-						break;
 			pages_written += write_chunk - wbc.nr_to_write;
-			if (pages_written >= write_chunk)
-				break;		/* We've done our duty */
+			get_dirty_limits(&background_thresh, &dirty_thresh,
+					&bdi_thresh, bdi);
+		}
+
+		/*
+		 * In order to avoid the stacked BDI deadlock we need
+		 * to ensure we accurately count the 'dirty' pages when
+		 * the threshold is low.
+		 *
+		 * Otherwise it would be possible to get thresh+n pages
+		 * reported dirty, even though there are thresh-m pages
+		 * actually dirty; with m+n sitting in the percpu
+		 * deltas.
+		 */
+		if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+			bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+		} else if (bdi_nr_reclaimable) {
+			bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 		}
+
+		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+			break;
+		if (pages_written >= write_chunk)
+			break;		/* We've done our duty */
+
 		congestion_wait(WRITE, HZ/10);
 	}
 
-	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
-			<= dirty_thresh && dirty_exceeded)
-		dirty_exceeded = 0;
+	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
+			bdi->dirty_exceeded)
+		bdi->dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
 		return;		/* pdflush is already working this queue */
@@ -270,7 +400,9 @@ static void balance_dirty_pages(struct address_space *mapping)
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-	    (!laptop_mode && (nr_reclaimable > background_thresh)))
+	    (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+			      + global_page_state(NR_UNSTABLE_NFS)
+			      > background_thresh)))
 		pdflush_operation(background_writeout, 0);
 }
 
@@ -306,7 +438,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 	unsigned long *p;
 
 	ratelimit = ratelimit_pages;
-	if (dirty_exceeded)
+	if (mapping->backing_dev_info->dirty_exceeded)
 		ratelimit = 8;
 
 	/*
@@ -342,7 +474,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 	}
 
 	for ( ; ; ) {
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 
 		/*
 		 * Boost the allowable dirty threshold a bit for page
@@ -377,7 +509,7 @@ static void background_writeout(unsigned long _min_pages)
 		long background_thresh;
 		long dirty_thresh;
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 		if (global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) < background_thresh
 				&& min_pages <= 0)
@@ -580,9 +712,14 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
  */
 void __init page_writeback_init(void)
 {
+	int shift;
+
 	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
+
+	shift = calc_period_shift();
+	prop_descriptor_init(&vm_completions, shift);
 }
 
 /**
@@ -988,8 +1125,10 @@ int test_clear_page_writeback(struct page *page)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-			if (bdi_cap_writeback_dirty(bdi))
+			if (bdi_cap_writeback_dirty(bdi)) {
 				__dec_bdi_stat(bdi, BDI_WRITEBACK);
+				__bdi_writeout_inc(bdi);
+			}
 		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
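For a sense of the period length chosen by calc_period_shift() above: with,
say, 1,000,000 dirtyable pages and vm_dirty_ratio = 10 (illustrative numbers,
not from the patch), dirty_total is 100,000 pages, ilog2(99999) is 16, so the
shift is 18 and period/2 = 2^17 = 131072, which is roundup_pow_of_two(100000),
matching the comment above calc_period_shift(). A tiny standalone C sketch of
that arithmetic; ilog2_ul() here is a hand-rolled stand-in for the kernel's
ilog2(), used only so the example compiles outside the kernel.

#include <stdio.h>

/* floor(log2(x)) for x > 0; a stand-in for the kernel's ilog2() */
static int ilog2_ul(unsigned long x)
{
	int r = -1;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	/* illustrative figures only */
	unsigned long dirtyable_pages = 1000000;
	int vm_dirty_ratio = 10;

	unsigned long dirty_total = vm_dirty_ratio * dirtyable_pages / 100;
	int shift = 2 + ilog2_ul(dirty_total - 1);

	printf("dirty_total = %lu pages, shift = %d, period/2 = %lu\n",
	       dirty_total, shift, 1UL << (shift - 1));
	return 0;
}

This prints shift = 18 and period/2 = 131072 for the sample figures, so the
proportion is aged over roughly a dirty-limit's worth of writeback
completions, and dirty_ratio_handler() recomputes the shift whenever
vm_dirty_ratio changes.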