-rw-r--r--	include/linux/backing-dev.h	  4
-rw-r--r--	include/linux/writeback.h	  4
-rw-r--r--	kernel/sysctl.c			  2
-rw-r--r--	mm/backing-dev.c		 19
-rw-r--r--	mm/page-writeback.c		203
5 files changed, 194 insertions, 38 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 4d9222c2f222..48a62baace58 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -10,6 +10,7 @@
 
 #include <linux/percpu_counter.h>
 #include <linux/log2.h>
+#include <linux/proportions.h>
 #include <asm/atomic.h>
 
 struct page;
@@ -44,6 +45,9 @@ struct backing_dev_info {
 	void *unplug_io_data;
 
 	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
+
+	struct prop_local_percpu completions;
+	int dirty_exceeded;
 };
 
 int bdi_init(struct backing_dev_info *bdi);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d1321a81c9c4..52be879793ed 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -97,6 +97,10 @@ extern int dirty_expire_interval;
 extern int block_dump;
 extern int laptop_mode;
 
+extern int dirty_ratio_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
 struct ctl_table;
 struct file;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 96efbb859997..c676b5ec88f5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -813,7 +813,7 @@ static ctl_table vm_table[] = {
 		.data		= &vm_dirty_ratio,
 		.maxlen		= sizeof(vm_dirty_ratio),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &dirty_ratio_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index a47065e084a4..b0ceb29da4c7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -12,11 +12,17 @@ int bdi_init(struct backing_dev_info *bdi)
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
 		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
-		if (err) {
-			for (j = 0; j < i; j++)
-				percpu_counter_destroy(&bdi->bdi_stat[i]);
-			break;
-		}
+		if (err)
+			goto err;
+	}
+
+	bdi->dirty_exceeded = 0;
+	err = prop_local_init_percpu(&bdi->completions);
+
+	if (err) {
+err:
+		for (j = 0; j < i; j++)
+			percpu_counter_destroy(&bdi->bdi_stat[i]);
 	}
 
 	return err;
@@ -29,6 +35,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+	prop_local_destroy_percpu(&bdi->completions);
 }
 EXPORT_SYMBOL(bdi_destroy);
 
@@ -81,3 +89,4 @@ long congestion_wait(int rw, long timeout)
 	return ret;
 }
 EXPORT_SYMBOL(congestion_wait);
+
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f1d201fdcf9c..b0360546ac86 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2,6 +2,7 @@
  * mm/page-writeback.c
  *
  * Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  * Contains functions related to writing back dirty pages at the
  * address_space level.
@@ -49,8 +50,6 @@
  */
 static long ratelimit_pages = 32;
 
-static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
-
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
  * non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +102,103 @@ EXPORT_SYMBOL(laptop_mode);
 static void background_writeout(unsigned long _min_pages);
 
 /*
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by keeping a floating proportion between BDIs, based on page
+ * writeback completions [end_page_writeback()]. Those devices that write out
+ * pages fastest will get the larger share, while the slower will get a smaller
+ * share.
+ *
+ * We use page writeout completions because we are interested in getting rid of
+ * dirty pages. Having them written out is the primary goal.
+ *
+ * We introduce a concept of time, a period over which we measure these events,
+ * because demand can/will vary over time. The length of this period itself is
+ * measured in page writeback completions.
+ *
+ */
+static struct prop_descriptor vm_completions;
+
+static unsigned long determine_dirtyable_memory(void);
+
+/*
+ * couple the period to the dirty_ratio:
+ *
+ *   period/2 ~ roundup_pow_of_two(dirty limit)
+ */
+static int calc_period_shift(void)
+{
+	unsigned long dirty_total;
+
+	dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+	return 2 + ilog2(dirty_total - 1);
+}
+
+/*
+ * update the period when the dirty ratio changes.
+ */
+int dirty_ratio_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int old_ratio = vm_dirty_ratio;
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+		int shift = calc_period_shift();
+		prop_change_shift(&vm_completions, shift);
+	}
+	return ret;
+}
+
+/*
+ * Increment the BDI's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+	__prop_inc_percpu(&vm_completions, &bdi->completions);
+}
+
+/*
+ * Obtain an accurate fraction of the BDI's portion.
+ */
+static void bdi_writeout_fraction(struct backing_dev_info *bdi,
+		long *numerator, long *denominator)
+{
+	if (bdi_cap_writeback_dirty(bdi)) {
+		prop_fraction_percpu(&vm_completions, &bdi->completions,
+				numerator, denominator);
+	} else {
+		*numerator = 0;
+		*denominator = 1;
+	}
+}
+
+/*
+ * Clip the earned share of dirty pages to that which is actually available.
+ * This avoids exceeding the total dirty_limit when the floating averages
+ * fluctuate too quickly.
+ */
+static void
+clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+{
+	long avail_dirty;
+
+	avail_dirty = dirty -
+		(global_page_state(NR_FILE_DIRTY) +
+		 global_page_state(NR_WRITEBACK) +
+		 global_page_state(NR_UNSTABLE_NFS));
+
+	if (avail_dirty < 0)
+		avail_dirty = 0;
+
+	avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
+		bdi_stat(bdi, BDI_WRITEBACK);
+
+	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
+}
+
+/*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
  *
@@ -158,8 +254,8 @@ static unsigned long determine_dirtyable_memory(void)
 }
 
 static void
-get_dirty_limits(long *pbackground, long *pdirty,
-		struct address_space *mapping)
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+		struct backing_dev_info *bdi)
 {
 	int background_ratio;		/* Percentages */
 	int dirty_ratio;
@@ -193,6 +289,22 @@ get_dirty_limits(long *pbackground, long *pdirty,
 	}
 	*pbackground = background;
 	*pdirty = dirty;
+
+	if (bdi) {
+		u64 bdi_dirty = dirty;
+		long numerator, denominator;
+
+		/*
+		 * Calculate this BDI's share of the dirty ratio.
+		 */
+		bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+		bdi_dirty *= numerator;
+		do_div(bdi_dirty, denominator);
+
+		*pbdi_dirty = bdi_dirty;
+		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
+	}
 }
 
 /*
@@ -204,9 +316,11 @@ get_dirty_limits(long *pbackground, long *pdirty,
  */
 static void balance_dirty_pages(struct address_space *mapping)
 {
-	long nr_reclaimable;
+	long bdi_nr_reclaimable;
+	long bdi_nr_writeback;
 	long background_thresh;
 	long dirty_thresh;
+	long bdi_thresh;
 	unsigned long pages_written = 0;
 	unsigned long write_chunk = sync_writeback_pages();
 
@@ -221,15 +335,15 @@ static void balance_dirty_pages(struct address_space *mapping)
 			.range_cyclic	= 1,
 		};
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
-		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-					global_page_state(NR_UNSTABLE_NFS);
-		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
-			dirty_thresh)
-				break;
+		get_dirty_limits(&background_thresh, &dirty_thresh,
+				&bdi_thresh, bdi);
+		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+			break;
 
-		if (!dirty_exceeded)
-			dirty_exceeded = 1;
+		if (!bdi->dirty_exceeded)
+			bdi->dirty_exceeded = 1;
 
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
@@ -237,26 +351,42 @@ static void balance_dirty_pages(struct address_space *mapping)
 		 * written to the server's write cache, but has not yet
 		 * been flushed to permanent storage.
 		 */
-		if (nr_reclaimable) {
+		if (bdi_nr_reclaimable) {
 			writeback_inodes(&wbc);
-			get_dirty_limits(&background_thresh,
-					&dirty_thresh, mapping);
-			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-					global_page_state(NR_UNSTABLE_NFS);
-			if (nr_reclaimable +
-				global_page_state(NR_WRITEBACK)
-					<= dirty_thresh)
-						break;
 			pages_written += write_chunk - wbc.nr_to_write;
-			if (pages_written >= write_chunk)
-				break;		/* We've done our duty */
+			get_dirty_limits(&background_thresh, &dirty_thresh,
+					&bdi_thresh, bdi);
+		}
+
+		/*
+		 * In order to avoid the stacked BDI deadlock we need
+		 * to ensure we accurately count the 'dirty' pages when
+		 * the threshold is low.
+		 *
+		 * Otherwise it would be possible to get thresh+n pages
+		 * reported dirty, even though there are thresh-m pages
+		 * actually dirty; with m+n sitting in the percpu
+		 * deltas.
+		 */
+		if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+			bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+		} else if (bdi_nr_reclaimable) {
+			bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 		}
+
+		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+			break;
+		if (pages_written >= write_chunk)
+			break;		/* We've done our duty */
+
 		congestion_wait(WRITE, HZ/10);
 	}
 
-	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
-		<= dirty_thresh && dirty_exceeded)
-			dirty_exceeded = 0;
+	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
+			bdi->dirty_exceeded)
+		bdi->dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
 		return;		/* pdflush is already working this queue */
@@ -270,7 +400,9 @@ static void balance_dirty_pages(struct address_space *mapping)
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-			(!laptop_mode && (nr_reclaimable > background_thresh)))
+	    (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+			      + global_page_state(NR_UNSTABLE_NFS)
+			      > background_thresh)))
 		pdflush_operation(background_writeout, 0);
 }
 
@@ -306,7 +438,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 	unsigned long *p;
 
 	ratelimit = ratelimit_pages;
-	if (dirty_exceeded)
+	if (mapping->backing_dev_info->dirty_exceeded)
 		ratelimit = 8;
 
 	/*
@@ -342,7 +474,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 	}
 
 	for ( ; ; ) {
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 
 		/*
 		 * Boost the allowable dirty threshold a bit for page
@@ -377,7 +509,7 @@ static void background_writeout(unsigned long _min_pages)
 	long background_thresh;
 	long dirty_thresh;
 
-	get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 	if (global_page_state(NR_FILE_DIRTY) +
 		global_page_state(NR_UNSTABLE_NFS) < background_thresh
 			&& min_pages <= 0)
@@ -580,9 +712,14 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
  */
 void __init page_writeback_init(void)
 {
+	int shift;
+
 	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
+
+	shift = calc_period_shift();
+	prop_descriptor_init(&vm_completions, shift);
 }
 
 /**
@@ -988,8 +1125,10 @@ int test_clear_page_writeback(struct page *page)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-			if (bdi_cap_writeback_dirty(bdi))
+			if (bdi_cap_writeback_dirty(bdi)) {
 				__dec_bdi_stat(bdi, BDI_WRITEBACK);
+				__bdi_writeout_inc(bdi);
+			}
 		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {