aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Jan Kara <jack@suse.cz> 2010-08-09 20:19:12 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2010-08-09 23:44:59 -0400
commit: f446daaea9d4a420d16c606f755f3689dcb2d0ce (patch)
tree: be2afc18f79aa4ff9be245b0a036aa06185b5dc4
parent: ebf8aa44beed48cd17893a83d92a4403e5f9d9e2 (diff)
mm: implement writeback livelock avoidance using page tagging
We try to avoid livelocks of writeback when some process steadily creates dirty pages in a mapping we are writing out. For memory-cleaning writeback, using nr_to_write works reasonably well, but we cannot really use it for data integrity writeback. This patch tries to solve the problem.

The idea is simple: tag all pages that should be written back with a special tag (TOWRITE) in the radix tree. This can be done rather quickly and thus livelocks should not happen in practice. Then we start doing the hard work of locking pages and sending them to disk only for those pages that have the TOWRITE tag set.

Note: Adding a new radix tree tag grows the radix tree node from 288 to 296 bytes for 32-bit archs and from 552 to 560 bytes for 64-bit archs. However, the number of slab/slub items per page remains the same (13 and 7 respectively).

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/linux/fs.h | 1
-rw-r--r-- include/linux/radix-tree.h | 2
-rw-r--r-- mm/page-writeback.c | 70
3 files changed, 55 insertions, 18 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e5106e49bd2c..488efec09d14 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -687,6 +687,7 @@ struct block_device {
687 */ 687 */
688#define PAGECACHE_TAG_DIRTY 0 688#define PAGECACHE_TAG_DIRTY 0
689#define PAGECACHE_TAG_WRITEBACK 1 689#define PAGECACHE_TAG_WRITEBACK 1
690#define PAGECACHE_TAG_TOWRITE 2
690 691
691int mapping_tagged(struct address_space *mapping, int tag); 692int mapping_tagged(struct address_space *mapping, int tag);
692 693
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index a4b00e9cca90..634b8e674ac5 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -55,7 +55,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
55 55
56/*** radix-tree API starts here ***/ 56/*** radix-tree API starts here ***/
57 57
58#define RADIX_TREE_MAX_TAGS 2 58#define RADIX_TREE_MAX_TAGS 3
59 59
60/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ 60/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
61struct radix_tree_root { 61struct radix_tree_root {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 37498ef61548..df8202ebc7b8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -805,6 +805,41 @@ void __init page_writeback_init(void)
805} 805}
806 806
807/** 807/**
808 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
809 * @mapping: address space structure to write
810 * @start: starting page index
811 * @end: ending page index (inclusive)
812 *
813 * This function scans the page range from @start to @end (inclusive) and tags
814 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
815 * that write_cache_pages (or whoever calls this function) will then use
816 * TOWRITE tag to identify pages eligible for writeback. This mechanism is
817 * used to avoid livelocking of writeback by a process steadily creating new
818 * dirty pages in the file (thus it is important for this function to be quick
819 * so that it can tag pages faster than a dirtying process can create them).
820 */
821/*
822 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
823 */
824#define WRITEBACK_TAG_BATCH 4096
825void tag_pages_for_writeback(struct address_space *mapping,
826 pgoff_t start, pgoff_t end)
827{
828 unsigned long tagged;
829
830 do {
831 spin_lock_irq(&mapping->tree_lock);
832 tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
833 &start, end, WRITEBACK_TAG_BATCH,
834 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
835 spin_unlock_irq(&mapping->tree_lock);
836 WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
837 cond_resched();
838 } while (tagged >= WRITEBACK_TAG_BATCH);
839}
840EXPORT_SYMBOL(tag_pages_for_writeback);
841
842/**
808 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 843 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
809 * @mapping: address space structure to write 844 * @mapping: address space structure to write
810 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 845 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
@@ -818,6 +853,13 @@ void __init page_writeback_init(void)
818 * the call was made get new I/O started against them. If wbc->sync_mode is 853 * the call was made get new I/O started against them. If wbc->sync_mode is
819 * WB_SYNC_ALL then we were called for data integrity and we must wait for 854 * WB_SYNC_ALL then we were called for data integrity and we must wait for
820 * existing IO to complete. 855 * existing IO to complete.
856 *
857 * To avoid livelocks (when other process dirties new pages), we first tag
858 * pages which should be written back with TOWRITE tag and only then start
859 * writing them. For data-integrity sync we have to be careful so that we do
860 * not miss some pages (e.g., because some other process has cleared TOWRITE
861 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
862 * by the process clearing the DIRTY tag (and submitting the page for IO).
821 */ 863 */
822int write_cache_pages(struct address_space *mapping, 864int write_cache_pages(struct address_space *mapping,
823 struct writeback_control *wbc, writepage_t writepage, 865 struct writeback_control *wbc, writepage_t writepage,
@@ -833,6 +875,7 @@ int write_cache_pages(struct address_space *mapping,
833 pgoff_t done_index; 875 pgoff_t done_index;
834 int cycled; 876 int cycled;
835 int range_whole = 0; 877 int range_whole = 0;
878 int tag;
836 879
837 pagevec_init(&pvec, 0); 880 pagevec_init(&pvec, 0);
838 if (wbc->range_cyclic) { 881 if (wbc->range_cyclic) {
@@ -849,29 +892,19 @@ int write_cache_pages(struct address_space *mapping,
849 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 892 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
850 range_whole = 1; 893 range_whole = 1;
851 cycled = 1; /* ignore range_cyclic tests */ 894 cycled = 1; /* ignore range_cyclic tests */
852
853 /*
854 * If this is a data integrity sync, cap the writeback to the
855 * current end of file. Any extension to the file that occurs
856 * after this is a new write and we don't need to write those
857 * pages out to fulfil our data integrity requirements. If we
858 * try to write them out, we can get stuck in this scan until
859 * the concurrent writer stops adding dirty pages and extending
860 * EOF.
861 */
862 if (wbc->sync_mode == WB_SYNC_ALL &&
863 wbc->range_end == LLONG_MAX) {
864 end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
865 }
866 } 895 }
867 896 if (wbc->sync_mode == WB_SYNC_ALL)
897 tag = PAGECACHE_TAG_TOWRITE;
898 else
899 tag = PAGECACHE_TAG_DIRTY;
868retry: 900retry:
901 if (wbc->sync_mode == WB_SYNC_ALL)
902 tag_pages_for_writeback(mapping, index, end);
869 done_index = index; 903 done_index = index;
870 while (!done && (index <= end)) { 904 while (!done && (index <= end)) {
871 int i; 905 int i;
872 906
873 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 907 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
874 PAGECACHE_TAG_DIRTY,
875 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 908 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
876 if (nr_pages == 0) 909 if (nr_pages == 0)
877 break; 910 break;
@@ -1327,6 +1360,9 @@ int test_set_page_writeback(struct page *page)
1327 radix_tree_tag_clear(&mapping->page_tree, 1360 radix_tree_tag_clear(&mapping->page_tree,
1328 page_index(page), 1361 page_index(page),
1329 PAGECACHE_TAG_DIRTY); 1362 PAGECACHE_TAG_DIRTY);
1363 radix_tree_tag_clear(&mapping->page_tree,
1364 page_index(page),
1365 PAGECACHE_TAG_TOWRITE);
1330 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1366 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1331 } else { 1367 } else {
1332 ret = TestSetPageWriteback(page); 1368 ret = TestSetPageWriteback(page);