aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/ext4/ext4.h1
-rw-r--r--fs/ext4/inode.c105
-rw-r--r--fs/ext4/super.c3
-rw-r--r--include/trace/events/ext4.h14
4 files changed, 107 insertions, 16 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e227eea23f05..a58438e18d0b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -942,6 +942,7 @@ struct ext4_sb_info {
942 unsigned int s_mb_stats; 942 unsigned int s_mb_stats;
943 unsigned int s_mb_order2_reqs; 943 unsigned int s_mb_order2_reqs;
944 unsigned int s_mb_group_prealloc; 944 unsigned int s_mb_group_prealloc;
945 unsigned int s_max_writeback_mb_bump;
945 /* where last allocation was done - for stream allocation */ 946 /* where last allocation was done - for stream allocation */
946 unsigned long s_mb_last_group; 947 unsigned long s_mb_last_group;
947 unsigned long s_mb_last_start; 948 unsigned long s_mb_last_start;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5fb72a98ccbe..20e2d704dc2e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1145,6 +1145,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
1145} 1145}
1146 1146
1147/* 1147/*
1148 * Return the number of dirty pages in the given inode starting at
1149 * page frame idx.
1150 */
1151static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1152 unsigned int max_pages)
1153{
1154 struct address_space *mapping = inode->i_mapping;
1155 pgoff_t index;
1156 struct pagevec pvec;
1157 pgoff_t num = 0;
1158 int i, nr_pages, done = 0;
1159
1160 if (max_pages == 0)
1161 return 0;
1162 pagevec_init(&pvec, 0);
1163 while (!done) {
1164 index = idx;
1165 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1166 PAGECACHE_TAG_DIRTY,
1167 (pgoff_t)PAGEVEC_SIZE);
1168 if (nr_pages == 0)
1169 break;
1170 for (i = 0; i < nr_pages; i++) {
1171 struct page *page = pvec.pages[i];
1172 struct buffer_head *bh, *head;
1173
1174 lock_page(page);
1175 if (unlikely(page->mapping != mapping) ||
1176 !PageDirty(page) ||
1177 PageWriteback(page) ||
1178 page->index != idx) {
1179 done = 1;
1180 unlock_page(page);
1181 break;
1182 }
1183 head = page_buffers(page);
1184 bh = head;
1185 do {
1186 if (!buffer_delay(bh) &&
1187 !buffer_unwritten(bh)) {
1188 done = 1;
1189 break;
1190 }
1191 } while ((bh = bh->b_this_page) != head);
1192 unlock_page(page);
1193 if (done)
1194 break;
1195 idx++;
1196 num++;
1197 if (num >= max_pages)
1198 break;
1199 }
1200 pagevec_release(&pvec);
1201 }
1202 return num;
1203}
1204
1205/*
1148 * The ext4_get_blocks() function tries to look up the requested blocks, 1206 * The ext4_get_blocks() function tries to look up the requested blocks,
1149 * and returns if the blocks are already mapped. 1207 * and returns if the blocks are already mapped.
1150 * 1208 *
@@ -2743,8 +2801,10 @@ static int ext4_da_writepages(struct address_space *mapping,
2743 int no_nrwrite_index_update; 2801 int no_nrwrite_index_update;
2744 int pages_written = 0; 2802 int pages_written = 0;
2745 long pages_skipped; 2803 long pages_skipped;
2804 unsigned int max_pages;
2746 int range_cyclic, cycled = 1, io_done = 0; 2805 int range_cyclic, cycled = 1, io_done = 0;
2747 int needed_blocks, ret = 0, nr_to_writebump = 0; 2806 int needed_blocks, ret = 0;
2807 long desired_nr_to_write, nr_to_writebump = 0;
2748 loff_t range_start = wbc->range_start; 2808 loff_t range_start = wbc->range_start;
2749 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2809 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2750 2810
@@ -2771,16 +2831,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2771 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2831 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2772 return -EROFS; 2832 return -EROFS;
2773 2833
2774 /*
2775 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2776 * This make sure small files blocks are allocated in
2777 * single attempt. This ensure that small files
2778 * get less fragmented.
2779 */
2780 if (wbc->nr_to_write < sbi->s_mb_stream_request) {
2781 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
2782 wbc->nr_to_write = sbi->s_mb_stream_request;
2783 }
2784 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2834 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2785 range_whole = 1; 2835 range_whole = 1;
2786 2836
@@ -2795,6 +2845,36 @@ static int ext4_da_writepages(struct address_space *mapping,
2795 } else 2845 } else
2796 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2846 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2797 2847
2848 /*
2849 * This works around two forms of stupidity. The first is in
2850 * the writeback code, which caps the maximum number of pages
2851 * written to be 1024 pages. This is wrong on multiple
2852 * levels; different architectures have a different page size,
2853 * which changes the maximum amount of data which gets
2854 * written. Secondly, 4 megabytes is way too small. XFS
2855 * forces this value to be 16 megabytes by multiplying
2856 * nr_to_write parameter by four, and then relies on its
2857 * allocator to allocate larger extents to make them
2858 * contiguous. Unfortunately this brings us to the second
2859 * stupidity, which is that ext4's mballoc code only allocates
2860 * at most 2048 blocks. So we force contiguous writes up to
2861 * the number of dirty blocks in the inode, or
2862 * sbi->s_max_writeback_mb_bump, whichever is smaller.
2863 */
2864 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2865 if (!range_cyclic && range_whole)
2866 desired_nr_to_write = wbc->nr_to_write * 8;
2867 else
2868 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2869 max_pages);
2870 if (desired_nr_to_write > max_pages)
2871 desired_nr_to_write = max_pages;
2872
2873 if (wbc->nr_to_write < desired_nr_to_write) {
2874 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2875 wbc->nr_to_write = desired_nr_to_write;
2876 }
2877
2798 mpd.wbc = wbc; 2878 mpd.wbc = wbc;
2799 mpd.inode = mapping->host; 2879 mpd.inode = mapping->host;
2800 2880
@@ -2914,7 +2994,8 @@ retry:
2914out_writepages: 2994out_writepages:
2915 if (!no_nrwrite_index_update) 2995 if (!no_nrwrite_index_update)
2916 wbc->no_nrwrite_index_update = 0; 2996 wbc->no_nrwrite_index_update = 0;
2917 wbc->nr_to_write -= nr_to_writebump; 2997 if (wbc->nr_to_write > nr_to_writebump)
2998 wbc->nr_to_write -= nr_to_writebump;
2918 wbc->range_start = range_start; 2999 wbc->range_start = range_start;
2919 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3000 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2920 return ret; 3001 return ret;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index df539ba27779..16817737ba52 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2197,6 +2197,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2197EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2197EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2198EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2198EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2199EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2199EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2200EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
2200 2201
2201static struct attribute *ext4_attrs[] = { 2202static struct attribute *ext4_attrs[] = {
2202 ATTR_LIST(delayed_allocation_blocks), 2203 ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2211,7 @@ static struct attribute *ext4_attrs[] = {
2210 ATTR_LIST(mb_order2_req), 2211 ATTR_LIST(mb_order2_req),
2211 ATTR_LIST(mb_stream_req), 2212 ATTR_LIST(mb_stream_req),
2212 ATTR_LIST(mb_group_prealloc), 2213 ATTR_LIST(mb_group_prealloc),
2214 ATTR_LIST(max_writeback_mb_bump),
2213 NULL, 2215 NULL,
2214}; 2216};
2215 2217
@@ -2679,6 +2681,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2679 } 2681 }
2680 2682
2681 sbi->s_stripe = ext4_get_stripe_size(sbi); 2683 sbi->s_stripe = ext4_get_stripe_size(sbi);
2684 sbi->s_max_writeback_mb_bump = 128;
2682 2685
2683 /* 2686 /*
2684 * set up enough so that it can read an inode 2687 * set up enough so that it can read an inode
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index c1bd8f1e8b94..7c6bbb7198a3 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -236,6 +236,7 @@ TRACE_EVENT(ext4_da_writepages,
236 __field( char, for_kupdate ) 236 __field( char, for_kupdate )
237 __field( char, for_reclaim ) 237 __field( char, for_reclaim )
238 __field( char, range_cyclic ) 238 __field( char, range_cyclic )
239 __field( pgoff_t, writeback_index )
239 ), 240 ),
240 241
241 TP_fast_assign( 242 TP_fast_assign(
@@ -249,15 +250,17 @@ TRACE_EVENT(ext4_da_writepages,
249 __entry->for_kupdate = wbc->for_kupdate; 250 __entry->for_kupdate = wbc->for_kupdate;
250 __entry->for_reclaim = wbc->for_reclaim; 251 __entry->for_reclaim = wbc->for_reclaim;
251 __entry->range_cyclic = wbc->range_cyclic; 252 __entry->range_cyclic = wbc->range_cyclic;
253 __entry->writeback_index = inode->i_mapping->writeback_index;
252 ), 254 ),
253 255
254 TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d", 256 TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu",
255 jbd2_dev_to_name(__entry->dev), 257 jbd2_dev_to_name(__entry->dev),
256 (unsigned long) __entry->ino, __entry->nr_to_write, 258 (unsigned long) __entry->ino, __entry->nr_to_write,
257 __entry->pages_skipped, __entry->range_start, 259 __entry->pages_skipped, __entry->range_start,
258 __entry->range_end, __entry->nonblocking, 260 __entry->range_end, __entry->nonblocking,
259 __entry->for_kupdate, __entry->for_reclaim, 261 __entry->for_kupdate, __entry->for_reclaim,
260 __entry->range_cyclic) 262 __entry->range_cyclic,
263 (unsigned long) __entry->writeback_index)
261); 264);
262 265
263TRACE_EVENT(ext4_da_write_pages, 266TRACE_EVENT(ext4_da_write_pages,
@@ -309,6 +312,7 @@ TRACE_EVENT(ext4_da_writepages_result,
309 __field( char, encountered_congestion ) 312 __field( char, encountered_congestion )
310 __field( char, more_io ) 313 __field( char, more_io )
311 __field( char, no_nrwrite_index_update ) 314 __field( char, no_nrwrite_index_update )
315 __field( pgoff_t, writeback_index )
312 ), 316 ),
313 317
314 TP_fast_assign( 318 TP_fast_assign(
@@ -320,14 +324,16 @@ TRACE_EVENT(ext4_da_writepages_result,
320 __entry->encountered_congestion = wbc->encountered_congestion; 324 __entry->encountered_congestion = wbc->encountered_congestion;
321 __entry->more_io = wbc->more_io; 325 __entry->more_io = wbc->more_io;
322 __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; 326 __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update;
327 __entry->writeback_index = inode->i_mapping->writeback_index;
323 ), 328 ),
324 329
325 TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d", 330 TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu",
326 jbd2_dev_to_name(__entry->dev), 331 jbd2_dev_to_name(__entry->dev),
327 (unsigned long) __entry->ino, __entry->ret, 332 (unsigned long) __entry->ino, __entry->ret,
328 __entry->pages_written, __entry->pages_skipped, 333 __entry->pages_written, __entry->pages_skipped,
329 __entry->encountered_congestion, __entry->more_io, 334 __entry->encountered_congestion, __entry->more_io,
330 __entry->no_nrwrite_index_update) 335 __entry->no_nrwrite_index_update,
336 (unsigned long) __entry->writeback_index)
331); 337);
332 338
333TRACE_EVENT(ext4_da_write_begin, 339TRACE_EVENT(ext4_da_write_begin,