aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTheodore Ts'o <tytso@mit.edu>2009-09-29 13:31:31 -0400
committerTheodore Ts'o <tytso@mit.edu>2009-09-29 13:31:31 -0400
commit55138e0bc29c0751e2152df9ad35deea542f29b3 (patch)
tree40dbc77de3e7a426030bd740ba8e2fd0d2171523
parent71780577306fd1e76c7a92e3b308db624d03adb9 (diff)
ext4: Adjust ext4_da_writepages() to write out larger contiguous chunks
Work around problems in the writeback code to force out writebacks in larger chunks than just 4mb, which is just too small. This also works around limitations in the ext4 block allocator, which can't allocate more than 2048 blocks at a time. So we need to defeat the round-robin characteristics of the writeback code and try to write out as many blocks in one inode before allowing the writeback code to move on to another inode. We add a new per-filesystem tunable, max_writeback_mb_bump, which caps this to a default of 128mb per inode. Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r--fs/ext4/ext4.h1
-rw-r--r--fs/ext4/inode.c105
-rw-r--r--fs/ext4/super.c3
-rw-r--r--include/trace/events/ext4.h14
4 files changed, 107 insertions, 16 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e227eea23f05..a58438e18d0b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -942,6 +942,7 @@ struct ext4_sb_info {
942 unsigned int s_mb_stats; 942 unsigned int s_mb_stats;
943 unsigned int s_mb_order2_reqs; 943 unsigned int s_mb_order2_reqs;
944 unsigned int s_mb_group_prealloc; 944 unsigned int s_mb_group_prealloc;
945 unsigned int s_max_writeback_mb_bump;
945 /* where last allocation was done - for stream allocation */ 946 /* where last allocation was done - for stream allocation */
946 unsigned long s_mb_last_group; 947 unsigned long s_mb_last_group;
947 unsigned long s_mb_last_start; 948 unsigned long s_mb_last_start;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5fb72a98ccbe..20e2d704dc2e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1145,6 +1145,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
1145} 1145}
1146 1146
1147/* 1147/*
1148 * Return the number of dirty pages in the given inode starting at
1149 * page frame idx.
1150 */
1151static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1152 unsigned int max_pages)
1153{
1154 struct address_space *mapping = inode->i_mapping;
1155 pgoff_t index;
1156 struct pagevec pvec;
1157 pgoff_t num = 0;
1158 int i, nr_pages, done = 0;
1159
1160 if (max_pages == 0)
1161 return 0;
1162 pagevec_init(&pvec, 0);
1163 while (!done) {
1164 index = idx;
1165 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1166 PAGECACHE_TAG_DIRTY,
1167 (pgoff_t)PAGEVEC_SIZE);
1168 if (nr_pages == 0)
1169 break;
1170 for (i = 0; i < nr_pages; i++) {
1171 struct page *page = pvec.pages[i];
1172 struct buffer_head *bh, *head;
1173
1174 lock_page(page);
1175 if (unlikely(page->mapping != mapping) ||
1176 !PageDirty(page) ||
1177 PageWriteback(page) ||
1178 page->index != idx) {
1179 done = 1;
1180 unlock_page(page);
1181 break;
1182 }
1183 head = page_buffers(page);
1184 bh = head;
1185 do {
1186 if (!buffer_delay(bh) &&
1187 !buffer_unwritten(bh)) {
1188 done = 1;
1189 break;
1190 }
1191 } while ((bh = bh->b_this_page) != head);
1192 unlock_page(page);
1193 if (done)
1194 break;
1195 idx++;
1196 num++;
1197 if (num >= max_pages)
1198 break;
1199 }
1200 pagevec_release(&pvec);
1201 }
1202 return num;
1203}
1204
1205/*
1148 * The ext4_get_blocks() function tries to look up the requested blocks, 1206 * The ext4_get_blocks() function tries to look up the requested blocks,
1149 * and returns if the blocks are already mapped. 1207 * and returns if the blocks are already mapped.
1150 * 1208 *
@@ -2743,8 +2801,10 @@ static int ext4_da_writepages(struct address_space *mapping,
2743 int no_nrwrite_index_update; 2801 int no_nrwrite_index_update;
2744 int pages_written = 0; 2802 int pages_written = 0;
2745 long pages_skipped; 2803 long pages_skipped;
2804 unsigned int max_pages;
2746 int range_cyclic, cycled = 1, io_done = 0; 2805 int range_cyclic, cycled = 1, io_done = 0;
2747 int needed_blocks, ret = 0, nr_to_writebump = 0; 2806 int needed_blocks, ret = 0;
2807 long desired_nr_to_write, nr_to_writebump = 0;
2748 loff_t range_start = wbc->range_start; 2808 loff_t range_start = wbc->range_start;
2749 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2809 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2750 2810
@@ -2771,16 +2831,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2771 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2831 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2772 return -EROFS; 2832 return -EROFS;
2773 2833
2774 /*
2775 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2776 * This make sure small files blocks are allocated in
2777 * single attempt. This ensure that small files
2778 * get less fragmented.
2779 */
2780 if (wbc->nr_to_write < sbi->s_mb_stream_request) {
2781 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
2782 wbc->nr_to_write = sbi->s_mb_stream_request;
2783 }
2784 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2834 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2785 range_whole = 1; 2835 range_whole = 1;
2786 2836
@@ -2795,6 +2845,36 @@ static int ext4_da_writepages(struct address_space *mapping,
2795 } else 2845 } else
2796 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2846 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2797 2847
2848 /*
2849 * This works around two forms of stupidity. The first is in
2850 * the writeback code, which caps the maximum number of pages
2851 * written to be 1024 pages. This is wrong on multiple
2852 * levels; different architectures have a different page size,
2853 * which changes the maximum amount of data which gets
2854 * written. Secondly, 4 megabytes is way too small. XFS
2855 * forces this value to be 16 megabytes by multiplying
2856 * nr_to_write parameter by four, and then relies on its
2857 * allocator to allocate larger extents to make them
2858 * contiguous. Unfortunately this brings us to the second
2859 * stupidity, which is that ext4's mballoc code only allocates
2860 * at most 2048 blocks. So we force contiguous writes up to
2861 * the number of dirty blocks in the inode, or
2862 * sbi->s_max_writeback_mb_bump whichever is smaller.
2863 */
2864 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2865 if (!range_cyclic && range_whole)
2866 desired_nr_to_write = wbc->nr_to_write * 8;
2867 else
2868 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2869 max_pages);
2870 if (desired_nr_to_write > max_pages)
2871 desired_nr_to_write = max_pages;
2872
2873 if (wbc->nr_to_write < desired_nr_to_write) {
2874 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2875 wbc->nr_to_write = desired_nr_to_write;
2876 }
2877
2798 mpd.wbc = wbc; 2878 mpd.wbc = wbc;
2799 mpd.inode = mapping->host; 2879 mpd.inode = mapping->host;
2800 2880
@@ -2914,7 +2994,8 @@ retry:
2914out_writepages: 2994out_writepages:
2915 if (!no_nrwrite_index_update) 2995 if (!no_nrwrite_index_update)
2916 wbc->no_nrwrite_index_update = 0; 2996 wbc->no_nrwrite_index_update = 0;
2917 wbc->nr_to_write -= nr_to_writebump; 2997 if (wbc->nr_to_write > nr_to_writebump)
2998 wbc->nr_to_write -= nr_to_writebump;
2918 wbc->range_start = range_start; 2999 wbc->range_start = range_start;
2919 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3000 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2920 return ret; 3001 return ret;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index df539ba27779..16817737ba52 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2197,6 +2197,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2197EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2197EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2198EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2198EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2199EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2199EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2200EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
2200 2201
2201static struct attribute *ext4_attrs[] = { 2202static struct attribute *ext4_attrs[] = {
2202 ATTR_LIST(delayed_allocation_blocks), 2203 ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2211,7 @@ static struct attribute *ext4_attrs[] = {
2210 ATTR_LIST(mb_order2_req), 2211 ATTR_LIST(mb_order2_req),
2211 ATTR_LIST(mb_stream_req), 2212 ATTR_LIST(mb_stream_req),
2212 ATTR_LIST(mb_group_prealloc), 2213 ATTR_LIST(mb_group_prealloc),
2214 ATTR_LIST(max_writeback_mb_bump),
2213 NULL, 2215 NULL,
2214}; 2216};
2215 2217
@@ -2679,6 +2681,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2679 } 2681 }
2680 2682
2681 sbi->s_stripe = ext4_get_stripe_size(sbi); 2683 sbi->s_stripe = ext4_get_stripe_size(sbi);
2684 sbi->s_max_writeback_mb_bump = 128;
2682 2685
2683 /* 2686 /*
2684 * set up enough so that it can read an inode 2687 * set up enough so that it can read an inode
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index c1bd8f1e8b94..7c6bbb7198a3 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -236,6 +236,7 @@ TRACE_EVENT(ext4_da_writepages,
236 __field( char, for_kupdate ) 236 __field( char, for_kupdate )
237 __field( char, for_reclaim ) 237 __field( char, for_reclaim )
238 __field( char, range_cyclic ) 238 __field( char, range_cyclic )
239 __field( pgoff_t, writeback_index )
239 ), 240 ),
240 241
241 TP_fast_assign( 242 TP_fast_assign(
@@ -249,15 +250,17 @@ TRACE_EVENT(ext4_da_writepages,
249 __entry->for_kupdate = wbc->for_kupdate; 250 __entry->for_kupdate = wbc->for_kupdate;
250 __entry->for_reclaim = wbc->for_reclaim; 251 __entry->for_reclaim = wbc->for_reclaim;
251 __entry->range_cyclic = wbc->range_cyclic; 252 __entry->range_cyclic = wbc->range_cyclic;
253 __entry->writeback_index = inode->i_mapping->writeback_index;
252 ), 254 ),
253 255
254 TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d", 256 TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu",
255 jbd2_dev_to_name(__entry->dev), 257 jbd2_dev_to_name(__entry->dev),
256 (unsigned long) __entry->ino, __entry->nr_to_write, 258 (unsigned long) __entry->ino, __entry->nr_to_write,
257 __entry->pages_skipped, __entry->range_start, 259 __entry->pages_skipped, __entry->range_start,
258 __entry->range_end, __entry->nonblocking, 260 __entry->range_end, __entry->nonblocking,
259 __entry->for_kupdate, __entry->for_reclaim, 261 __entry->for_kupdate, __entry->for_reclaim,
260 __entry->range_cyclic) 262 __entry->range_cyclic,
263 (unsigned long) __entry->writeback_index)
261); 264);
262 265
263TRACE_EVENT(ext4_da_write_pages, 266TRACE_EVENT(ext4_da_write_pages,
@@ -309,6 +312,7 @@ TRACE_EVENT(ext4_da_writepages_result,
309 __field( char, encountered_congestion ) 312 __field( char, encountered_congestion )
310 __field( char, more_io ) 313 __field( char, more_io )
311 __field( char, no_nrwrite_index_update ) 314 __field( char, no_nrwrite_index_update )
315 __field( pgoff_t, writeback_index )
312 ), 316 ),
313 317
314 TP_fast_assign( 318 TP_fast_assign(
@@ -320,14 +324,16 @@ TRACE_EVENT(ext4_da_writepages_result,
320 __entry->encountered_congestion = wbc->encountered_congestion; 324 __entry->encountered_congestion = wbc->encountered_congestion;
321 __entry->more_io = wbc->more_io; 325 __entry->more_io = wbc->more_io;
322 __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; 326 __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update;
327 __entry->writeback_index = inode->i_mapping->writeback_index;
323 ), 328 ),
324 329
325 TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d", 330 TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu",
326 jbd2_dev_to_name(__entry->dev), 331 jbd2_dev_to_name(__entry->dev),
327 (unsigned long) __entry->ino, __entry->ret, 332 (unsigned long) __entry->ino, __entry->ret,
328 __entry->pages_written, __entry->pages_skipped, 333 __entry->pages_written, __entry->pages_skipped,
329 __entry->encountered_congestion, __entry->more_io, 334 __entry->encountered_congestion, __entry->more_io,
330 __entry->no_nrwrite_index_update) 335 __entry->no_nrwrite_index_update,
336 (unsigned long) __entry->writeback_index)
331); 337);
332 338
333TRACE_EVENT(ext4_da_write_begin, 339TRACE_EVENT(ext4_da_write_begin,