Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--	mm/page-writeback.c	203
1 file changed, 171 insertions(+), 32 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f1d201fdcf9c..b0360546ac86 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2,6 +2,7 @@
  * mm/page-writeback.c
  *
  * Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  * Contains functions related to writing back dirty pages at the
  * address_space level.
@@ -49,8 +50,6 @@
  */
 static long ratelimit_pages = 32;
 
-static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
-
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
  * non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +102,103 @@ EXPORT_SYMBOL(laptop_mode);
 static void background_writeout(unsigned long _min_pages);
 
 /*
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by keeping a floating proportion between BDIs, based on page
+ * writeback completions [end_page_writeback()]. Those devices that write out
+ * pages fastest will get the larger share, while the slower will get a smaller
+ * share.
+ *
+ * We use page writeout completions because we are interested in getting rid of
+ * dirty pages. Having them written out is the primary goal.
+ *
+ * We introduce a concept of time, a period over which we measure these events,
+ * because demand can/will vary over time. The length of this period itself is
+ * measured in page writeback completions.
+ *
+ */
+static struct prop_descriptor vm_completions;
+
+static unsigned long determine_dirtyable_memory(void);
+
+/*
+ * couple the period to the dirty_ratio:
+ *
+ *   period/2 ~ roundup_pow_of_two(dirty limit)
+ */
+static int calc_period_shift(void)
+{
+	unsigned long dirty_total;
+
+	dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+	return 2 + ilog2(dirty_total - 1);
+}
+
+/*
+ * update the period when the dirty ratio changes.
+ */
+int dirty_ratio_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int old_ratio = vm_dirty_ratio;
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+		int shift = calc_period_shift();
+		prop_change_shift(&vm_completions, shift);
+	}
+	return ret;
+}
+
+/*
+ * Increment the BDI's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+	__prop_inc_percpu(&vm_completions, &bdi->completions);
+}
+
+/*
+ * Obtain an accurate fraction of the BDI's portion.
+ */
+static void bdi_writeout_fraction(struct backing_dev_info *bdi,
+		long *numerator, long *denominator)
+{
+	if (bdi_cap_writeback_dirty(bdi)) {
+		prop_fraction_percpu(&vm_completions, &bdi->completions,
+				numerator, denominator);
+	} else {
+		*numerator = 0;
+		*denominator = 1;
+	}
+}
+
+/*
+ * Clip the earned share of dirty pages to that which is actually available.
+ * This avoids exceeding the total dirty_limit when the floating averages
+ * fluctuate too quickly.
+ */
+static void
+clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+{
+	long avail_dirty;
+
+	avail_dirty = dirty -
+		(global_page_state(NR_FILE_DIRTY) +
+		 global_page_state(NR_WRITEBACK) +
+		 global_page_state(NR_UNSTABLE_NFS));
+
+	if (avail_dirty < 0)
+		avail_dirty = 0;
+
+	avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
+		bdi_stat(bdi, BDI_WRITEBACK);
+
+	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
+}
+
+/*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
  *
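To make the period coupling concrete: with the example figures below (made-up values, roughly 4 GB of dirtyable memory in 4 KB pages and a 40% dirty ratio), calc_period_shift() yields 20, i.e. a period of about a million writeout completions, twice the rounded-up dirty limit as the comment above states. A minimal userspace sketch of that arithmetic, with ilog2() modelled by a small helper:

#include <stdio.h>

/* Stand-in for the kernel's ilog2(): floor(log2(x)) for x > 0. */
static int ilog2_approx(unsigned long x)
{
	int log = -1;
	while (x) {
		x >>= 1;
		log++;
	}
	return log;
}

int main(void)
{
	/* Example values: ~4 GB of dirtyable memory in 4 KB pages, dirty_ratio = 40%. */
	unsigned long dirtyable_pages = 1048576;
	int vm_dirty_ratio = 40;

	unsigned long dirty_total = (vm_dirty_ratio * dirtyable_pages) / 100;
	int shift = 2 + ilog2_approx(dirty_total - 1);

	/* period = 2^shift, so period/2 ~ roundup_pow_of_two(dirty_total). */
	printf("dirty_total = %lu pages\n", dirty_total);
	printf("period shift = %d -> period = %lu completions\n",
	       shift, 1UL << shift);
	return 0;
}

In other words, the floating proportions age on the scale of the dirty limit itself, so a device's share reflects its recent writeout speed rather than its lifetime total.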
@@ -158,8 +254,8 @@ static unsigned long determine_dirtyable_memory(void)
 }
 
 static void
-get_dirty_limits(long *pbackground, long *pdirty,
-		 struct address_space *mapping)
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+		 struct backing_dev_info *bdi)
 {
 	int background_ratio;		/* Percentages */
 	int dirty_ratio;
@@ -193,6 +289,22 @@ get_dirty_limits(long *pbackground, long *pdirty,
 	}
 	*pbackground = background;
 	*pdirty = dirty;
+
+	if (bdi) {
+		u64 bdi_dirty = dirty;
+		long numerator, denominator;
+
+		/*
+		 * Calculate this BDI's share of the dirty ratio.
+		 */
+		bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+		bdi_dirty *= numerator;
+		do_div(bdi_dirty, denominator);
+
+		*pbdi_dirty = bdi_dirty;
+		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
+	}
 }
 
 /*
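The per-device threshold is therefore just the global dirty threshold scaled by the device's completion fraction, then clipped so the shares cannot add up to more than the global limit still allows. A standalone sketch of that computation with invented example numbers (the fraction and page counts are illustrative, not taken from the patch):

#include <stdio.h>

/*
 * Sketch of the BDI share calculation in get_dirty_limits() and
 * clip_bdi_dirty_limit(): bdi_dirty = dirty * numerator/denominator,
 * clipped to the global headroom plus what this BDI already holds.
 */
static long bdi_dirty_limit(long dirty, long numerator, long denominator,
			    long global_dirty_pages, long bdi_owned_pages)
{
	long bdi_dirty = (long)((unsigned long long)dirty * numerator / denominator);

	/* global_dirty_pages stands for NR_FILE_DIRTY + NR_WRITEBACK + NR_UNSTABLE_NFS */
	long avail = dirty - global_dirty_pages;
	if (avail < 0)
		avail = 0;
	/* bdi_owned_pages stands for BDI_RECLAIMABLE + BDI_WRITEBACK of this device */
	avail += bdi_owned_pages;

	return bdi_dirty < avail ? bdi_dirty : avail;
}

int main(void)
{
	/* Example: global limit 100000 pages, this BDI does 3/4 of the writeout. */
	long limit = bdi_dirty_limit(100000, 3, 4, 90000, 20000);
	printf("bdi_thresh = %ld pages\n", limit);	/* min(75000, 30000) = 30000 */
	return 0;
}

In the example a device earning 3/4 of the completions would nominally get 75000 pages, but is clipped to 30000 because only 10000 pages of global headroom remain plus the 20000 it already accounts for.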
@@ -204,9 +316,11 @@ get_dirty_limits(long *pbackground, long *pdirty,
  */
 static void balance_dirty_pages(struct address_space *mapping)
 {
-	long nr_reclaimable;
+	long bdi_nr_reclaimable;
+	long bdi_nr_writeback;
 	long background_thresh;
 	long dirty_thresh;
+	long bdi_thresh;
 	unsigned long pages_written = 0;
 	unsigned long write_chunk = sync_writeback_pages();
 
@@ -221,15 +335,15 @@ static void balance_dirty_pages(struct address_space *mapping)
 			.range_cyclic	= 1,
 		};
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
-		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-			global_page_state(NR_UNSTABLE_NFS);
-		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
-			dirty_thresh)
+		get_dirty_limits(&background_thresh, &dirty_thresh,
+				&bdi_thresh, bdi);
+		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
 			break;
 
-		if (!dirty_exceeded)
-			dirty_exceeded = 1;
+		if (!bdi->dirty_exceeded)
+			bdi->dirty_exceeded = 1;
 
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
@@ -237,26 +351,42 @@ static void balance_dirty_pages(struct address_space *mapping)
 		 * written to the server's write cache, but has not yet
 		 * been flushed to permanent storage.
 		 */
-		if (nr_reclaimable) {
+		if (bdi_nr_reclaimable) {
 			writeback_inodes(&wbc);
-			get_dirty_limits(&background_thresh,
-					&dirty_thresh, mapping);
-			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-				global_page_state(NR_UNSTABLE_NFS);
-			if (nr_reclaimable +
-				global_page_state(NR_WRITEBACK)
-					<= dirty_thresh)
-						break;
 			pages_written += write_chunk - wbc.nr_to_write;
-			if (pages_written >= write_chunk)
-				break;		/* We've done our duty */
+			get_dirty_limits(&background_thresh, &dirty_thresh,
+					&bdi_thresh, bdi);
+		}
+
+		/*
+		 * In order to avoid the stacked BDI deadlock we need
+		 * to ensure we accurately count the 'dirty' pages when
+		 * the threshold is low.
+		 *
+		 * Otherwise it would be possible to get thresh+n pages
+		 * reported dirty, even though there are thresh-m pages
+		 * actually dirty; with m+n sitting in the percpu
+		 * deltas.
+		 */
+		if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+			bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+		} else if (bdi_nr_reclaimable) {
+			bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 		}
+
+		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+			break;
+		if (pages_written >= write_chunk)
+			break;		/* We've done our duty */
+
 		congestion_wait(WRITE, HZ/10);
 	}
 
-	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
-			<= dirty_thresh && dirty_exceeded)
-		dirty_exceeded = 0;
+	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
+			bdi->dirty_exceeded)
+		bdi->dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
 		return;		/* pdflush is already working this queue */
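The bdi_stat_error() test above matters because the BDI statistics are batched per-CPU counters: each CPU can sit on up to a batch of not-yet-folded deltas, so a cheap read may be off by roughly the number of CPUs times the batch size. When bdi_thresh is within a couple of that error bound, the cheap read alone could mis-report the device as under (or over) its limit, hence the fall-back to the exact but more expensive sum. A toy model of that error bound (invented batch size and CPU count, purely illustrative):

#include <stdio.h>

/* Toy model of a batched per-CPU counter: deltas below the batch size
 * stay in per-CPU storage and are invisible to the cheap global read. */
#define NR_CPUS		4
#define PCPU_BATCH	32

struct pcpu_counter {
	long global;		/* what a cheap bdi_stat()-style read returns */
	long pcpu[NR_CPUS];	/* per-CPU deltas, folded in only at the batch size */
};

static void counter_add(struct pcpu_counter *c, int cpu, long n)
{
	c->pcpu[cpu] += n;
	if (c->pcpu[cpu] >= PCPU_BATCH || c->pcpu[cpu] <= -PCPU_BATCH) {
		c->global += c->pcpu[cpu];
		c->pcpu[cpu] = 0;
	}
}

static long counter_sum(const struct pcpu_counter *c)	/* bdi_stat_sum() analogue */
{
	long sum = c->global;
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += c->pcpu[cpu];
	return sum;
}

int main(void)
{
	struct pcpu_counter c = { 0 };

	/* Each CPU dirties a little less than one batch: the cheap read sees nothing. */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		counter_add(&c, cpu, PCPU_BATCH - 1);

	printf("cheap read = %ld, exact sum = %ld, max error = %d\n",
	       c.global, counter_sum(&c), NR_CPUS * (PCPU_BATCH - 1));
	return 0;
}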
@@ -270,7 +400,9 @@ static void balance_dirty_pages(struct address_space *mapping)
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-			(!laptop_mode && (nr_reclaimable > background_thresh)))
+			(!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+					  + global_page_state(NR_UNSTABLE_NFS)
+					  > background_thresh)))
 		pdflush_operation(background_writeout, 0);
 }
 
@@ -306,7 +438,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 	unsigned long *p;
 
 	ratelimit = ratelimit_pages;
-	if (dirty_exceeded)
+	if (mapping->backing_dev_info->dirty_exceeded)
 		ratelimit = 8;
 
 	/*
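Lowering the ratelimit to 8 is significant because balance_dirty_pages() is normally entered only about once per ratelimit_pages dirtied pages on each CPU; once a device is over its threshold, the dirtying task is pushed back into the throttling path almost immediately. A rough, self-contained model of that gating (the counter handling is simplified and the names are invented for illustration):

#include <stdio.h>
#include <stdbool.h>

/* Rough model of balance_dirty_pages_ratelimited_nr(): a per-CPU count of
 * dirtied pages, flushed into the expensive balancing path only when it
 * crosses the current ratelimit. */
static unsigned long ratelimit_pages_model = 32;
static unsigned long percpu_dirtied;	/* stands in for the per-CPU counter */

static void balance_dirty_pages_model(bool bdi_dirty_exceeded, unsigned long nr_dirtied)
{
	unsigned long ratelimit = ratelimit_pages_model;

	if (bdi_dirty_exceeded)
		ratelimit = 8;	/* throttle much more often once over the BDI limit */

	percpu_dirtied += nr_dirtied;
	if (percpu_dirtied >= ratelimit) {
		percpu_dirtied = 0;
		printf("would call balance_dirty_pages() now\n");
	}
}

int main(void)
{
	for (int i = 0; i < 40; i++)
		balance_dirty_pages_model(false, 1);	/* fires roughly once per 32 pages */
	for (int i = 0; i < 40; i++)
		balance_dirty_pages_model(true, 1);	/* fires roughly once per 8 pages */
	return 0;
}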
@@ -342,7 +474,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 	}
 
 	for ( ; ; ) {
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 
 		/*
 		 * Boost the allowable dirty threshold a bit for page
@@ -377,7 +509,7 @@ static void background_writeout(unsigned long _min_pages)
 		long background_thresh;
 		long dirty_thresh;
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 		if (global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) < background_thresh
 				&& min_pages <= 0)
@@ -580,9 +712,14 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
  */
 void __init page_writeback_init(void)
 {
+	int shift;
+
 	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
+
+	shift = calc_period_shift();
+	prop_descriptor_init(&vm_completions, shift);
 }
 
 /**
@@ -988,8 +1125,10 @@ int test_clear_page_writeback(struct page *page)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-			if (bdi_cap_writeback_dirty(bdi))
+			if (bdi_cap_writeback_dirty(bdi)) {
 				__dec_bdi_stat(bdi, BDI_WRITEBACK);
+				__bdi_writeout_inc(bdi);
+			}
 		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
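This hook in test_clear_page_writeback() is what feeds the whole scheme: every page that completes writeback credits its BDI, and the floating proportion over those credits is what get_dirty_limits() later converts into a per-device threshold. A tiny simulation of the steady state, using a plain exponentially decayed count in place of the kernel's proportions library (illustrative only):

#include <stdio.h>

/* Two devices completing writeback at different rates; an exponentially
 * aged completion count per device approximates the floating proportion. */
int main(void)
{
	double share[2] = { 0.0, 0.0 };
	const int rate[2] = { 3, 1 };	/* completions per step: fast disk vs. slow stick */
	const double decay = 0.999;	/* stands in for the period-based aging */

	for (int step = 0; step < 100000; step++) {
		for (int i = 0; i < 2; i++)
			share[i] = share[i] * decay + rate[i];
	}

	double total = share[0] + share[1];
	/* Converges to roughly 75% / 25%: each device's fraction of the
	 * writeout speed, which is the fraction of the dirty limit it earns. */
	printf("bdi0: %.1f%%  bdi1: %.1f%%\n",
	       100.0 * share[0] / total, 100.0 * share[1] / total);
	return 0;
}

With one device completing three times as fast as the other, the shares settle near 75% and 25%, which is the split of the dirty limit the two devices would receive.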