author     Linus Torvalds <torvalds@linux-foundation.org>   2011-07-26 13:39:54 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2011-07-26 13:39:54 -0400
commit     f01ef569cddb1a8627b1c6b3a134998ad1cf4b22
tree       29ea1a0942c8549c24411e976cd6891c7e995e89 /mm
parent     a93a1329271038f0e8337061d3b41b3b212a851e
parent     bcff25fc8aa47a13faff8b4b992589813f7b450a
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback: (27 commits)
mm: properly reflect task dirty limits in dirty_exceeded logic
writeback: don't busy retry writeback on new/freeing inodes
writeback: scale IO chunk size up to half device bandwidth
writeback: trace global_dirty_state
writeback: introduce max-pause and pass-good dirty limits
writeback: introduce smoothed global dirty limit
writeback: consolidate variable names in balance_dirty_pages()
writeback: show bdi write bandwidth in debugfs
writeback: bdi write bandwidth estimation
writeback: account per-bdi accumulated written pages
writeback: make writeback_control.nr_to_write straight
writeback: skip tmpfs early in balance_dirty_pages_ratelimited_nr()
writeback: trace event writeback_queue_io
writeback: trace event writeback_single_inode
writeback: remove .nonblocking and .encountered_congestion
writeback: remove writeback_control.more_io
writeback: skip balance_dirty_pages() for in-memory fs
writeback: add bdi_dirty_limit() kernel-doc
writeback: avoid extra sync work at enqueue time
writeback: elevate queue_io() into wb_writeback()
...
Fix up trivial conflicts in fs/fs-writeback.c and mm/filemap.c
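
Several commits in this pull (bdi write bandwidth estimation, the smoothed global dirty limit, the new BdiWriteBandwidth debugfs field) revolve around keeping a per-device estimate of recent write bandwidth that adapts quickly but does not jitter. The estimator added to mm/page-writeback.c below folds each ~200ms sample into the previous estimate over a roughly 3-second window. A standalone toy version of that blend, assuming 4 KiB pages and HZ = 250 (illustrative constants only, not the kernel code itself):

#include <stdio.h>

#define HZ 250
#define PERIOD 1024     /* roundup_pow_of_two(3 * HZ) for HZ == 250 */

static unsigned long write_bandwidth = 25600;   /* pages/s: the 100 MB/s seed at 4 KiB pages */

/* Fold the pages written during the last `elapsed` jiffies into the estimate. */
static void update_write_bandwidth(unsigned long pages_written, unsigned long elapsed)
{
        /* pages_written * HZ == measured bandwidth (pages/s) * elapsed (jiffies) */
        unsigned long long bw = (unsigned long long)pages_written * HZ;

        /* weighted blend: the new sample carries `elapsed`, the old estimate the rest */
        bw += (unsigned long long)write_bandwidth * (PERIOD - elapsed);
        write_bandwidth = (unsigned long)(bw / PERIOD);
}

int main(void)
{
        /* the device settles at ~50 MB/s (12800 pages/s), sampled every 200ms */
        for (int i = 1; i <= 10; i++) {
                update_write_bandwidth(12800 / 5, HZ / 5);
                printf("sample %2d: estimate = %lu pages/s\n", i, write_bandwidth);
        }
        return 0;
}

Each sample closes only a small fraction of the gap toward the new rate, which is exactly the behaviour the debugfs BdiWriteBandwidth field exposes.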
Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c     |  82
-rw-r--r--  mm/filemap.c         |   6
-rw-r--r--  mm/page-writeback.c  | 280
-rw-r--r--  mm/rmap.c            |   4
4 files changed, 300 insertions(+), 72 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8290b1e88257..d6edf8d14f9c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer;
 static int bdi_sync_supers(void *);
 static void sync_supers_timer_fn(unsigned long);
 
+void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+{
+        if (wb1 < wb2) {
+                spin_lock(&wb1->list_lock);
+                spin_lock_nested(&wb2->list_lock, 1);
+        } else {
+                spin_lock(&wb2->list_lock);
+                spin_lock_nested(&wb1->list_lock, 1);
+        }
+}
+
 #ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
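
The bdi_lock_two() helper added above always takes the two list locks in ascending address order, so two concurrent callers working on the same pair of bdi_writeback structures can never acquire them in opposite orders and deadlock; spin_lock_nested(..., 1) only tells lockdep that the second lock of the same class is a deliberate nested acquisition. A minimal userspace analogue of the ordering idiom, using pthreads (the struct and function names here are made up for illustration):

#include <pthread.h>

struct writeback_ctx {
        pthread_mutex_t list_lock;
        /* ... per-device writeback lists would live here ... */
};

/* Always lock the lower-addressed context first, so every caller agrees
 * on the ordering and an ABBA deadlock is impossible. */
static void ctx_lock_two(struct writeback_ctx *a, struct writeback_ctx *b)
{
        if (a < b) {
                pthread_mutex_lock(&a->list_lock);
                pthread_mutex_lock(&b->list_lock);
        } else {
                pthread_mutex_lock(&b->list_lock);
                pthread_mutex_lock(&a->list_lock);
        }
}

static void ctx_unlock_two(struct writeback_ctx *a, struct writeback_ctx *b)
{
        pthread_mutex_unlock(&a->list_lock);
        pthread_mutex_unlock(&b->list_lock);
}

int main(void)
{
        struct writeback_ctx x = { PTHREAD_MUTEX_INITIALIZER };
        struct writeback_ctx y = { PTHREAD_MUTEX_INITIALIZER };

        ctx_lock_two(&x, &y);   /* same order no matter how the pair is passed */
        ctx_unlock_two(&y, &x);
        return 0;
}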
@@ -67,34 +78,42 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
         struct inode *inode;
 
         nr_dirty = nr_io = nr_more_io = 0;
-        spin_lock(&inode_wb_list_lock);
+        spin_lock(&wb->list_lock);
         list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
                 nr_dirty++;
         list_for_each_entry(inode, &wb->b_io, i_wb_list)
                 nr_io++;
         list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
                 nr_more_io++;
-        spin_unlock(&inode_wb_list_lock);
+        spin_unlock(&wb->list_lock);
 
         global_dirty_limits(&background_thresh, &dirty_thresh);
         bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
 
 #define K(x) ((x) << (PAGE_SHIFT - 10))
         seq_printf(m,
-                   "BdiWriteback: %8lu kB\n"
-                   "BdiReclaimable: %8lu kB\n"
-                   "BdiDirtyThresh: %8lu kB\n"
-                   "DirtyThresh: %8lu kB\n"
-                   "BackgroundThresh: %8lu kB\n"
-                   "b_dirty: %8lu\n"
-                   "b_io: %8lu\n"
-                   "b_more_io: %8lu\n"
-                   "bdi_list: %8u\n"
-                   "state: %8lx\n",
+                   "BdiWriteback: %10lu kB\n"
+                   "BdiReclaimable: %10lu kB\n"
+                   "BdiDirtyThresh: %10lu kB\n"
+                   "DirtyThresh: %10lu kB\n"
+                   "BackgroundThresh: %10lu kB\n"
+                   "BdiWritten: %10lu kB\n"
+                   "BdiWriteBandwidth: %10lu kBps\n"
+                   "b_dirty: %10lu\n"
+                   "b_io: %10lu\n"
+                   "b_more_io: %10lu\n"
+                   "bdi_list: %10u\n"
+                   "state: %10lx\n",
                   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
                   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
-                   K(bdi_thresh), K(dirty_thresh),
-                   K(background_thresh), nr_dirty, nr_io, nr_more_io,
+                   K(bdi_thresh),
+                   K(dirty_thresh),
+                   K(background_thresh),
+                   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+                   (unsigned long) K(bdi->write_bandwidth),
+                   nr_dirty,
+                   nr_io,
+                   nr_more_io,
                   !list_empty(&bdi->bdi_list), bdi->state);
 #undef K
 
@@ -249,18 +268,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
         return wb_has_dirty_io(&bdi->wb);
 }
 
-static void bdi_flush_io(struct backing_dev_info *bdi)
-{
-        struct writeback_control wbc = {
-                .sync_mode = WB_SYNC_NONE,
-                .older_than_this = NULL,
-                .range_cyclic = 1,
-                .nr_to_write = 1024,
-        };
-
-        writeback_inodes_wb(&bdi->wb, &wbc);
-}
-
 /*
  * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
  * or we risk deadlocking on ->s_umount. The longer term solution would be
@@ -446,9 +453,10 @@ static int bdi_forker_thread(void *ptr)
                 if (IS_ERR(task)) {
                         /*
                          * If thread creation fails, force writeout of
-                         * the bdi from the thread.
+                         * the bdi from the thread. Hopefully 1024 is
+                         * large enough for efficient IO.
                          */
-                        bdi_flush_io(bdi);
+                        writeback_inodes_wb(&bdi->wb, 1024);
                 } else {
                         /*
                          * The spinlock makes sure we do not lose
@@ -629,9 +637,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
         INIT_LIST_HEAD(&wb->b_dirty);
         INIT_LIST_HEAD(&wb->b_io);
         INIT_LIST_HEAD(&wb->b_more_io);
+        spin_lock_init(&wb->list_lock);
         setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
 }
 
+/*
+ * Initial write bandwidth: 100 MB/s
+ */
+#define INIT_BW (100 << (20 - PAGE_SHIFT))
+
 int bdi_init(struct backing_dev_info *bdi)
 {
         int i, err;
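
The INIT_BW seed defined above expresses the initial guess of 100 MB/s in pages per second: 100 MB/s is 100 << 20 bytes/s, and dividing by the page size (1 << PAGE_SHIFT) leaves 100 << (20 - PAGE_SHIFT) pages/s. A quick standalone check of that arithmetic, assuming the common 4 KiB page size:

#include <stdio.h>

int main(void)
{
        const int page_shift = 12;                 /* 4 KiB pages, the common case */
        long init_bw = 100L << (20 - page_shift);  /* INIT_BW, in pages per second */

        /* 100 MB/s divided by 4 KiB pages = 25600 pages/s */
        printf("INIT_BW = %ld pages/s (= %ld MB/s)\n",
               init_bw, init_bw >> (20 - page_shift));
        return 0;
}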
@@ -654,6 +668,13 @@ int bdi_init(struct backing_dev_info *bdi)
         }
 
         bdi->dirty_exceeded = 0;
+
+        bdi->bw_time_stamp = jiffies;
+        bdi->written_stamp = 0;
+
+        bdi->write_bandwidth = INIT_BW;
+        bdi->avg_write_bandwidth = INIT_BW;
+
         err = prop_local_init_percpu(&bdi->completions);
 
         if (err) {
@@ -677,11 +698,12 @@ void bdi_destroy(struct backing_dev_info *bdi)
         if (bdi_has_dirty_io(bdi)) {
                 struct bdi_writeback *dst = &default_backing_dev_info.wb;
 
-                spin_lock(&inode_wb_list_lock);
+                bdi_lock_two(&bdi->wb, dst);
                 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
                 list_splice(&bdi->wb.b_io, &dst->b_io);
                 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
-                spin_unlock(&inode_wb_list_lock);
+                spin_unlock(&bdi->wb.list_lock);
+                spin_unlock(&dst->list_lock);
         }
 
         bdi_unregister(bdi);
diff --git a/mm/filemap.c b/mm/filemap.c
index 10a171113273..867d40222ec7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -78,7 +78,7 @@
  * ->i_mutex (generic_file_buffered_write)
  * ->mmap_sem (fault_in_pages_readable->do_page_fault)
  *
- * inode_wb_list_lock
+ * bdi->wb.list_lock
  * sb_lock (fs/fs-writeback.c)
  * ->mapping->tree_lock (__sync_single_inode)
  *
@@ -96,9 +96,9 @@
  * ->zone.lru_lock (check_pte_range->isolate_lru_page)
  * ->private_lock (page_remove_rmap->set_page_dirty)
  * ->tree_lock (page_remove_rmap->set_page_dirty)
- * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
+ * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
  * ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * inode_wb_list_lock (zap_pte_range->set_page_dirty)
+ * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
  * ->inode->i_lock (zap_pte_range->set_page_dirty)
  * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
  *
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d8767b381b9c..d1960744f881 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -37,6 +37,16 @@
 #include <trace/events/writeback.h>
 
 /*
+ * Sleep at most 200ms at a time in balance_dirty_pages().
+ */
+#define MAX_PAUSE max(HZ/5, 1)
+
+/*
+ * Estimate write bandwidth at 200ms intervals.
+ */
+#define BANDWIDTH_INTERVAL max(HZ/5, 1)
+
+/*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
  * will look to see if it needs to force writeback or throttling.
  */
@@ -111,6 +121,7 @@ EXPORT_SYMBOL(laptop_mode);
 
 /* End of sysctl-exported parameters */
 
+unsigned long global_dirty_limit;
 
 /*
  * Scale the writeback cache size proportional to the relative writeout speeds.
@@ -219,6 +230,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
  */
 static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
 {
+        __inc_bdi_stat(bdi, BDI_WRITTEN);
         __prop_inc_percpu_max(&vm_completions, &bdi->completions,
                               bdi->max_prop_frac);
 }
@@ -244,13 +256,8 @@ void task_dirty_inc(struct task_struct *tsk)
 static void bdi_writeout_fraction(struct backing_dev_info *bdi,
                 long *numerator, long *denominator)
 {
-        if (bdi_cap_writeback_dirty(bdi)) {
-                prop_fraction_percpu(&vm_completions, &bdi->completions,
+        prop_fraction_percpu(&vm_completions, &bdi->completions,
                                 numerator, denominator);
-        } else {
-                *numerator = 0;
-                *denominator = 1;
-        }
 }
 
 static inline void task_dirties_fraction(struct task_struct *tsk,
@@ -274,12 +281,13 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
  * effectively curb the growth of dirty pages. Light dirtiers with high enough
  * dirty threshold may never get throttled.
  */
+#define TASK_LIMIT_FRACTION 8
 static unsigned long task_dirty_limit(struct task_struct *tsk,
                                        unsigned long bdi_dirty)
 {
         long numerator, denominator;
         unsigned long dirty = bdi_dirty;
-        u64 inv = dirty >> 3;
+        u64 inv = dirty / TASK_LIMIT_FRACTION;
 
         task_dirties_fraction(tsk, &numerator, &denominator);
         inv *= numerator;
@@ -290,6 +298,12 @@ static unsigned long task_dirty_limit(struct task_struct *tsk,
         return max(dirty, bdi_dirty/2);
 }
 
+/* Minimum limit for any task */
+static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
+{
+        return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
+}
+
 /*
  *
  */
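
With TASK_LIMIT_FRACTION at 8, task_dirty_limit() can lower a task's effective bdi threshold by at most bdi_dirty/8 (scaled by that task's share of recent dirtying, and never below half the threshold), while the new task_min_dirty_limit() is the matching 7/8 floor that the dirty_exceeded clearing logic later in this series compares against. A small standalone model of the arithmetic; the threshold and fraction values are made up for illustration:

#include <stdio.h>

#define TASK_LIMIT_FRACTION 8

/* Mirror of the kernel logic: subtract up to bdi_dirty/8, scaled by the task's
 * share of recently dirtied pages, but never fall below half of bdi_dirty. */
static unsigned long task_dirty_limit(unsigned long bdi_dirty,
                                      long numerator, long denominator)
{
        unsigned long dirty = bdi_dirty;
        unsigned long long inv = bdi_dirty / TASK_LIMIT_FRACTION;

        inv *= numerator;
        inv /= denominator;
        dirty -= (unsigned long)inv;

        return dirty > bdi_dirty / 2 ? dirty : bdi_dirty / 2;
}

static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
{
        return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
}

int main(void)
{
        unsigned long bdi_thresh = 8000;        /* pages, arbitrary */

        /* a task that did essentially all recent dirtying: 8000 - 1000 = 7000 */
        printf("heavy dirtier limit: %lu\n", task_dirty_limit(bdi_thresh, 1, 1));
        /* a task that did ~10 percent of it: 8000 - 100 = 7900 */
        printf("light dirtier limit: %lu\n", task_dirty_limit(bdi_thresh, 1, 10));
        /* floor used when clearing dirty_exceeded: 7/8 of 8000 = 7000 */
        printf("minimum task limit:  %lu\n", task_min_dirty_limit(bdi_thresh));
        return 0;
}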
@@ -397,6 +411,11 @@ unsigned long determine_dirtyable_memory(void)
         return x + 1;   /* Ensure that we never return 0 */
 }
 
+static unsigned long hard_dirty_limit(unsigned long thresh)
+{
+        return max(thresh, global_dirty_limit);
+}
+
 /*
  * global_dirty_limits - background-writeback and dirty-throttling thresholds
  *
@@ -435,12 +454,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
         }
         *pbackground = background;
         *pdirty = dirty;
+        trace_global_dirty_state(background, dirty);
 }
 
-/*
+/**
  * bdi_dirty_limit - @bdi's share of dirty throttling threshold
+ * @bdi: the backing_dev_info to query
+ * @dirty: global dirty limit in pages
  *
- * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+ * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+ * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
+ * And the "limit" in the name is not seriously taken as hard limit in
+ * balance_dirty_pages().
+ *
+ * It allocates high/low dirty limits to fast/slow devices, in order to prevent
  * - starving fast devices
  * - piling up dirty pages (that will take long time to sync) on slow devices
 
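
The expanded kernel-doc spells out what bdi_dirty_limit() returns: each backing device receives a slice of the global dirty threshold roughly proportional to its share of recently completed writeback, as tracked by the vm_completions proportion counters. Very roughly, and ignoring the per-bdi min/max ratio adjustments the real function applies, the split amounts to the following back-of-the-envelope sketch (not the kernel implementation):

#include <stdio.h>

/* A bdi that completed numerator/denominator of the recently tracked
 * writeback gets roughly that share of the global dirty limit. */
static unsigned long approx_bdi_dirty_limit(unsigned long dirty_thresh,
                                            long numerator, long denominator)
{
        unsigned long long bdi_dirty = dirty_thresh;

        bdi_dirty *= numerator;
        bdi_dirty /= denominator;
        return (unsigned long)bdi_dirty;
}

int main(void)
{
        /* a device doing ~30% of recent writeout, global limit of 1000 pages */
        printf("bdi share: %lu pages\n", approx_bdi_dirty_limit(1000, 3, 10));
        return 0;
}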
@@ -468,6 +495,153 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
         return bdi_dirty;
 }
 
+static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
+                                       unsigned long elapsed,
+                                       unsigned long written)
+{
+        const unsigned long period = roundup_pow_of_two(3 * HZ);
+        unsigned long avg = bdi->avg_write_bandwidth;
+        unsigned long old = bdi->write_bandwidth;
+        u64 bw;
+
+        /*
+         * bw = written * HZ / elapsed
+         *
+         *                   bw * elapsed + write_bandwidth * (period - elapsed)
+         * write_bandwidth = ---------------------------------------------------
+         *                                          period
+         */
+        bw = written - bdi->written_stamp;
+        bw *= HZ;
+        if (unlikely(elapsed > period)) {
+                do_div(bw, elapsed);
+                avg = bw;
+                goto out;
+        }
+        bw += (u64)bdi->write_bandwidth * (period - elapsed);
+        bw >>= ilog2(period);
+
+        /*
+         * one more level of smoothing, for filtering out sudden spikes
+         */
+        if (avg > old && old >= (unsigned long)bw)
+                avg -= (avg - old) >> 3;
+
+        if (avg < old && old <= (unsigned long)bw)
+                avg += (old - avg) >> 3;
+
+out:
+        bdi->write_bandwidth = bw;
+        bdi->avg_write_bandwidth = avg;
+}
+
+/*
+ * The global dirtyable memory and dirty threshold could be suddenly knocked
+ * down by a large amount (eg. on the startup of KVM in a swapless system).
+ * This may throw the system into deep dirty exceeded state and throttle
+ * heavy/light dirtiers alike. To retain good responsiveness, maintain
+ * global_dirty_limit for tracking slowly down to the knocked down dirty
+ * threshold.
+ */
+static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+{
+        unsigned long limit = global_dirty_limit;
+
+        /*
+         * Follow up in one step.
+         */
+        if (limit < thresh) {
+                limit = thresh;
+                goto update;
+        }
+
+        /*
+         * Follow down slowly. Use the higher one as the target, because thresh
+         * may drop below dirty. This is exactly the reason to introduce
+         * global_dirty_limit which is guaranteed to lie above the dirty pages.
+         */
+        thresh = max(thresh, dirty);
+        if (limit > thresh) {
+                limit -= (limit - thresh) >> 5;
+                goto update;
+        }
+        return;
+update:
+        global_dirty_limit = limit;
+}
+
+static void global_update_bandwidth(unsigned long thresh,
+                                    unsigned long dirty,
+                                    unsigned long now)
+{
+        static DEFINE_SPINLOCK(dirty_lock);
+        static unsigned long update_time;
+
+        /*
+         * check locklessly first to optimize away locking for the most time
+         */
+        if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+                return;
+
+        spin_lock(&dirty_lock);
+        if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
+                update_dirty_limit(thresh, dirty);
+                update_time = now;
+        }
+        spin_unlock(&dirty_lock);
+}
+
+void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+                            unsigned long thresh,
+                            unsigned long dirty,
+                            unsigned long bdi_thresh,
+                            unsigned long bdi_dirty,
+                            unsigned long start_time)
+{
+        unsigned long now = jiffies;
+        unsigned long elapsed = now - bdi->bw_time_stamp;
+        unsigned long written;
+
+        /*
+         * rate-limit, only update once every 200ms.
+         */
+        if (elapsed < BANDWIDTH_INTERVAL)
+                return;
+
+        written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+
+        /*
+         * Skip quiet periods when disk bandwidth is under-utilized.
+         * (at least 1s idle time between two flusher runs)
+         */
+        if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+                goto snapshot;
+
+        if (thresh)
+                global_update_bandwidth(thresh, dirty, now);
+
+        bdi_update_write_bandwidth(bdi, elapsed, written);
+
+snapshot:
+        bdi->written_stamp = written;
+        bdi->bw_time_stamp = now;
+}
+
+static void bdi_update_bandwidth(struct backing_dev_info *bdi,
+                                 unsigned long thresh,
+                                 unsigned long dirty,
+                                 unsigned long bdi_thresh,
+                                 unsigned long bdi_dirty,
+                                 unsigned long start_time)
+{
+        if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
+                return;
+        spin_lock(&bdi->wb.list_lock);
+        __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
+                               start_time);
+        spin_unlock(&bdi->wb.list_lock);
+}
+
 /*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data. It looks at the number of dirty pages in the machine and will force
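
update_dirty_limit() above gives global_dirty_limit deliberately asymmetric tracking: it jumps up to a raised threshold in a single step, but follows a lowered threshold down by only 1/32 of the remaining gap per 200ms update, so a sudden collapse of dirtyable memory (the KVM-on-swapless example in the comment) does not instantly strand every dirtier far above the hard limit. A standalone simulation of that decay; plain C with illustrative numbers, not kernel code:

#include <stdio.h>

static unsigned long limit = 100000;    /* pretend current global_dirty_limit, in pages */

/* Same tracking rule as update_dirty_limit(): follow a rise in one step,
 * follow a fall by 1/32 of the remaining gap per invocation. */
static void track_dirty_limit(unsigned long thresh, unsigned long dirty)
{
        unsigned long target = thresh > dirty ? thresh : dirty;

        if (limit < thresh)
                limit = thresh;
        else if (limit > target)
                limit -= (limit - target) >> 5;
}

int main(void)
{
        /* threshold suddenly drops to 20000 pages while 30000 pages are dirty */
        for (int i = 1; i <= 5; i++) {
                track_dirty_limit(20000, 30000);        /* one call per 200ms interval */
                printf("after update %d: limit = %lu pages\n", i, limit);
        }
        return 0;
}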
@@ -478,27 +652,25 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
 static void balance_dirty_pages(struct address_space *mapping,
                                 unsigned long write_chunk)
 {
-        long nr_reclaimable, bdi_nr_reclaimable;
-        long nr_writeback, bdi_nr_writeback;
+        unsigned long nr_reclaimable, bdi_nr_reclaimable;
+        unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
+        unsigned long bdi_dirty;
         unsigned long background_thresh;
         unsigned long dirty_thresh;
         unsigned long bdi_thresh;
+        unsigned long task_bdi_thresh;
+        unsigned long min_task_bdi_thresh;
         unsigned long pages_written = 0;
         unsigned long pause = 1;
         bool dirty_exceeded = false;
+        bool clear_dirty_exceeded = true;
         struct backing_dev_info *bdi = mapping->backing_dev_info;
+        unsigned long start_time = jiffies;
 
         for (;;) {
-                struct writeback_control wbc = {
-                        .sync_mode = WB_SYNC_NONE,
-                        .older_than_this = NULL,
-                        .nr_to_write = write_chunk,
-                        .range_cyclic = 1,
-                };
-
                 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
                                         global_page_state(NR_UNSTABLE_NFS);
-                nr_writeback = global_page_state(NR_WRITEBACK);
+                nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
 
                 global_dirty_limits(&background_thresh, &dirty_thresh);
 
@@ -507,12 +679,12 @@ static void balance_dirty_pages(struct address_space *mapping,
                  * catch-up. This avoids (excessively) small writeouts
                  * when the bdi limits are ramping up.
                  */
-                if (nr_reclaimable + nr_writeback <=
-                                (background_thresh + dirty_thresh) / 2)
+                if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
                         break;
 
                 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
-                bdi_thresh = task_dirty_limit(current, bdi_thresh);
+                min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
+                task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
 
                 /*
                  * In order to avoid the stacked BDI deadlock we need
@@ -524,12 +696,14 @@ static void balance_dirty_pages(struct address_space *mapping,
                  * actually dirty; with m+n sitting in the percpu
                  * deltas.
                  */
-                if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+                if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
                         bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
-                        bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+                        bdi_dirty = bdi_nr_reclaimable +
+                                    bdi_stat_sum(bdi, BDI_WRITEBACK);
                 } else {
                         bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
-                        bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+                        bdi_dirty = bdi_nr_reclaimable +
+                                    bdi_stat(bdi, BDI_WRITEBACK);
                 }
 
                 /*
@@ -538,9 +712,10 @@ static void balance_dirty_pages(struct address_space *mapping,
                  * bdi or process from holding back light ones; The latter is
                  * the last resort safeguard.
                  */
-                dirty_exceeded =
-                        (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
-                        || (nr_reclaimable + nr_writeback > dirty_thresh);
+                dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
+                                 (nr_dirty > dirty_thresh);
+                clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
+                                       (nr_dirty <= dirty_thresh);
 
                 if (!dirty_exceeded)
                         break;
@@ -548,6 +723,9 @@ static void balance_dirty_pages(struct address_space *mapping,
                 if (!bdi->dirty_exceeded)
                         bdi->dirty_exceeded = 1;
 
+                bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
+                                     bdi_thresh, bdi_dirty, start_time);
+
                 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
                  * Unstable writes are a feature of certain networked
                  * filesystems (i.e. NFS) in which data may have been
@@ -557,17 +735,40 @@ static void balance_dirty_pages(struct address_space *mapping,
                  * threshold otherwise wait until the disk writes catch
                  * up.
                  */
-                trace_wbc_balance_dirty_start(&wbc, bdi);
-                if (bdi_nr_reclaimable > bdi_thresh) {
-                        writeback_inodes_wb(&bdi->wb, &wbc);
-                        pages_written += write_chunk - wbc.nr_to_write;
-                        trace_wbc_balance_dirty_written(&wbc, bdi);
+                trace_balance_dirty_start(bdi);
+                if (bdi_nr_reclaimable > task_bdi_thresh) {
+                        pages_written += writeback_inodes_wb(&bdi->wb,
+                                                             write_chunk);
+                        trace_balance_dirty_written(bdi, pages_written);
                         if (pages_written >= write_chunk)
                                 break;  /* We've done our duty */
                 }
-                trace_wbc_balance_dirty_wait(&wbc, bdi);
                 __set_current_state(TASK_UNINTERRUPTIBLE);
                 io_schedule_timeout(pause);
+                trace_balance_dirty_wait(bdi);
+
+                dirty_thresh = hard_dirty_limit(dirty_thresh);
+                /*
+                 * max-pause area. If dirty exceeded but still within this
+                 * area, no need to sleep for more than 200ms: (a) 8 pages per
+                 * 200ms is typically more than enough to curb heavy dirtiers;
+                 * (b) the pause time limit makes the dirtiers more responsive.
+                 */
+                if (nr_dirty < dirty_thresh +
+                               dirty_thresh / DIRTY_MAXPAUSE_AREA &&
+                    time_after(jiffies, start_time + MAX_PAUSE))
+                        break;
+                /*
+                 * pass-good area. When some bdi gets blocked (eg. NFS server
+                 * not responding), or write bandwidth dropped dramatically due
+                 * to concurrent reads, or dirty threshold suddenly dropped and
+                 * the dirty pages cannot be brought down anytime soon (eg. on
+                 * slow USB stick), at least let go of the good bdi's.
+                 */
+                if (nr_dirty < dirty_thresh +
+                               dirty_thresh / DIRTY_PASSGOOD_AREA &&
+                    bdi_dirty < bdi_thresh)
+                        break;
 
                 /*
                  * Increase the delay for each loop, up to our previous
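
The "8 pages per 200ms" in the max-pause comment comes from combining MAX_PAUSE (max(HZ/5, 1), i.e. roughly 200ms per trip through the loop) with the ratelimit of 8 pages that a task on a dirty-exceeded bdi may dirty before being sent back into balance_dirty_pages() (visible as "ratelimit = 8" further down in this diff). A quick back-of-the-envelope check of the resulting cap, assuming 4 KiB pages:

#include <stdio.h>

int main(void)
{
        const int hz = 250;                             /* example HZ */
        const int max_pause = hz / 5 > 1 ? hz / 5 : 1;  /* MAX_PAUSE: ~200ms in jiffies */
        const int ratelimit_pages = 8;                  /* per trip once dirty_exceeded is set */
        const int page_kb = 4;                          /* assume 4 KiB pages */

        /* 8 pages per 200ms -> 40 pages/s -> 160 KB/s per throttled dirtier */
        int pages_per_sec = ratelimit_pages * hz / max_pause;
        printf("MAX_PAUSE = %d jiffies (%d ms)\n", max_pause, max_pause * 1000 / hz);
        printf("throttled dirtying rate <= %d pages/s = %d KB/s\n",
               pages_per_sec, pages_per_sec * page_kb);
        return 0;
}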
@@ -578,7 +779,8 @@ static void balance_dirty_pages(struct address_space *mapping,
                         pause = HZ / 10;
         }
 
-        if (!dirty_exceeded && bdi->dirty_exceeded)
+        /* Clear dirty_exceeded flag only when no task can exceed the limit */
+        if (clear_dirty_exceeded && bdi->dirty_exceeded)
                 bdi->dirty_exceeded = 0;
 
         if (writeback_in_progress(bdi))
@@ -626,9 +828,13 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
                                         unsigned long nr_pages_dirtied)
 {
+        struct backing_dev_info *bdi = mapping->backing_dev_info;
         unsigned long ratelimit;
         unsigned long *p;
 
+        if (!bdi_cap_account_dirty(bdi))
+                return;
+
         ratelimit = ratelimit_pages;
         if (mapping->backing_dev_info->dirty_exceeded)
                 ratelimit = 8;
@@ -892,12 +1098,12 @@ int write_cache_pages(struct address_space *mapping,
                         range_whole = 1;
                 cycled = 1;     /* ignore range_cyclic tests */
         }
-        if (wbc->sync_mode == WB_SYNC_ALL)
+        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                 tag = PAGECACHE_TAG_TOWRITE;
         else
                 tag = PAGECACHE_TAG_DIRTY;
 retry:
-        if (wbc->sync_mode == WB_SYNC_ALL)
+        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                 tag_pages_for_writeback(mapping, index, end);
         done_index = index;
         while (!done && (index <= end)) {
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -31,11 +31,11 @@
  * mmlist_lock (in mmput, drain_mmlist and others)
  * mapping->private_lock (in __set_page_dirty_buffers)
  * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
  * sb_lock (within inode_lock in fs/fs-writeback.c)
  * mapping->tree_lock (widely used, in set_page_dirty,
  * in arch-dependent flush_dcache_mmap_lock,
- * within inode_wb_list_lock in __sync_single_inode)
+ * within bdi.wb->list_lock in __sync_single_inode)
  *
  * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
  * ->tasklist_lock