diff options
author | Jan Kara <jack@suse.cz> | 2011-01-13 18:45:48 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-01-13 20:32:32 -0500 |
commit | b9543dac5bbc4aef0a598965b6b34f6259ab9a9b (patch) | |
tree | 255244e6bb9340d4b8ce1fa6b4d2ed4adece4b15 | |
parent | aa373cf550994623efb5d49a4d8775bafd10bbc1 (diff) |
writeback: avoid livelocking WB_SYNC_ALL writeback
When wb_writeback() is called in WB_SYNC_ALL mode, work->nr_to_write is
usually set to LONG_MAX. The logic in wb_writeback() then calls
__writeback_inodes_sb() with nr_to_write == MAX_WRITEBACK_PAGES and we
easily end up with non-positive nr_to_write after the function returns, if
the inode has more than MAX_WRITEBACK_PAGES dirty pages at the moment.
When nr_to_write is <= 0, wb_writeback() decides we need another round of
writeback, but this is wrong in some cases. For example, when a single
large file is continuously dirtied, we would never finish syncing it
because each pass would be able to write MAX_WRITEBACK_PAGES and the inode's
dirty timestamp never gets updated (as the inode is never completely clean).
Thus __writeback_inodes_sb() would write the redirtied inode again and
again.
Fix the issue by setting nr_to_write to LONG_MAX in WB_SYNC_ALL mode. We
do not need nr_to_write in WB_SYNC_ALL mode anyway since
write_cache_pages() does livelock avoidance using page tagging in
WB_SYNC_ALL mode.
This makes wb_writeback() call __writeback_inodes_sb() only once in
WB_SYNC_ALL mode. The latter function won't livelock because it works on:
- a finite set of files, by doing queue_io() once at the beginning
- a finite set of pages, by using PAGECACHE_TAG_TOWRITE page tagging
After this patch, the program from http://lkml.org/lkml/2010/10/24/154 is no
longer able to stall sync forever.
[fengguang.wu@intel.com: fix locking comment]
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jan Engelhardt <jengelh@medozas.de>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | fs/fs-writeback.c | 27 |
1 files changed, 23 insertions, 4 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9e72d04e706e..e8063c938dd2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -630,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb, | |||
630 | }; | 630 | }; |
631 | unsigned long oldest_jif; | 631 | unsigned long oldest_jif; |
632 | long wrote = 0; | 632 | long wrote = 0; |
633 | long write_chunk; | ||
633 | struct inode *inode; | 634 | struct inode *inode; |
634 | 635 | ||
635 | if (wbc.for_kupdate) { | 636 | if (wbc.for_kupdate) { |
@@ -642,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb, | |||
642 | wbc.range_end = LLONG_MAX; | 643 | wbc.range_end = LLONG_MAX; |
643 | } | 644 | } |
644 | 645 | ||
646 | /* | ||
647 | * WB_SYNC_ALL mode does livelock avoidance by syncing dirty | ||
648 | * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX | ||
649 | * here avoids calling into writeback_inodes_wb() more than once. | ||
650 | * | ||
651 | * The intended call sequence for WB_SYNC_ALL writeback is: | ||
652 | * | ||
653 | * wb_writeback() | ||
654 | * __writeback_inodes_sb() <== called only once | ||
655 | * write_cache_pages() <== called once for each inode | ||
656 | * (quickly) tag currently dirty pages | ||
657 | * (maybe slowly) sync all tagged pages | ||
658 | */ | ||
659 | if (wbc.sync_mode == WB_SYNC_NONE) | ||
660 | write_chunk = MAX_WRITEBACK_PAGES; | ||
661 | else | ||
662 | write_chunk = LONG_MAX; | ||
663 | |||
645 | wbc.wb_start = jiffies; /* livelock avoidance */ | 664 | wbc.wb_start = jiffies; /* livelock avoidance */ |
646 | for (;;) { | 665 | for (;;) { |
647 | /* | 666 | /* |
@@ -668,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb, | |||
668 | break; | 687 | break; |
669 | 688 | ||
670 | wbc.more_io = 0; | 689 | wbc.more_io = 0; |
671 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | 690 | wbc.nr_to_write = write_chunk; |
672 | wbc.pages_skipped = 0; | 691 | wbc.pages_skipped = 0; |
673 | 692 | ||
674 | trace_wbc_writeback_start(&wbc, wb->bdi); | 693 | trace_wbc_writeback_start(&wbc, wb->bdi); |
@@ -678,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb, | |||
678 | writeback_inodes_wb(wb, &wbc); | 697 | writeback_inodes_wb(wb, &wbc); |
679 | trace_wbc_writeback_written(&wbc, wb->bdi); | 698 | trace_wbc_writeback_written(&wbc, wb->bdi); |
680 | 699 | ||
681 | work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | 700 | work->nr_pages -= write_chunk - wbc.nr_to_write; |
682 | wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; | 701 | wrote += write_chunk - wbc.nr_to_write; |
683 | 702 | ||
684 | /* | 703 | /* |
685 | * If we consumed everything, see if we have more | 704 | * If we consumed everything, see if we have more |
@@ -694,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb, | |||
694 | /* | 713 | /* |
695 | * Did we write something? Try for more | 714 | * Did we write something? Try for more |
696 | */ | 715 | */ |
697 | if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) | 716 | if (wbc.nr_to_write < write_chunk) |
698 | continue; | 717 | continue; |
699 | /* | 718 | /* |
700 | * Nothing written. Wait for some inode to | 719 | * Nothing written. Wait for some inode to |