diff options
-rw-r--r-- | fs/ext4/ext4.h | 1 | ||||
-rw-r--r-- | fs/ext4/inode.c | 105 | ||||
-rw-r--r-- | fs/ext4/super.c | 3 | ||||
-rw-r--r-- | include/trace/events/ext4.h | 14 |
4 files changed, 107 insertions, 16 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e227eea23f05..a58438e18d0b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -942,6 +942,7 @@ struct ext4_sb_info { | |||
942 | unsigned int s_mb_stats; | 942 | unsigned int s_mb_stats; |
943 | unsigned int s_mb_order2_reqs; | 943 | unsigned int s_mb_order2_reqs; |
944 | unsigned int s_mb_group_prealloc; | 944 | unsigned int s_mb_group_prealloc; |
945 | unsigned int s_max_writeback_mb_bump; | ||
945 | /* where last allocation was done - for stream allocation */ | 946 | /* where last allocation was done - for stream allocation */ |
946 | unsigned long s_mb_last_group; | 947 | unsigned long s_mb_last_group; |
947 | unsigned long s_mb_last_start; | 948 | unsigned long s_mb_last_start; |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5fb72a98ccbe..20e2d704dc2e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -1145,6 +1145,64 @@ static int check_block_validity(struct inode *inode, const char *msg, | |||
1145 | } | 1145 | } |
1146 | 1146 | ||
1147 | /* | 1147 | /* |
1148 | * Return the number of contiguous dirty delayed/unwritten pages in | ||
1149 | * the given inode, starting at page frame idx. | ||
1150 | */ | ||
1151 | static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, | ||
1152 | unsigned int max_pages) | ||
1153 | { | ||
1154 | struct address_space *mapping = inode->i_mapping; | ||
1155 | pgoff_t index; | ||
1156 | struct pagevec pvec; | ||
1157 | pgoff_t num = 0; | ||
1158 | int i, nr_pages, done = 0; | ||
1159 | |||
1160 | if (max_pages == 0) | ||
1161 | return 0; | ||
1162 | pagevec_init(&pvec, 0); | ||
1163 | while (!done) { | ||
1164 | index = idx; | ||
1165 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | ||
1166 | PAGECACHE_TAG_DIRTY, | ||
1167 | (pgoff_t)PAGEVEC_SIZE); | ||
1168 | if (nr_pages == 0) | ||
1169 | break; | ||
1170 | for (i = 0; i < nr_pages; i++) { | ||
1171 | struct page *page = pvec.pages[i]; | ||
1172 | struct buffer_head *bh, *head; | ||
1173 | |||
1174 | lock_page(page); | ||
1175 | if (unlikely(page->mapping != mapping) || | ||
1176 | !PageDirty(page) || | ||
1177 | PageWriteback(page) || | ||
1178 | page->index != idx) { | ||
1179 | done = 1; | ||
1180 | unlock_page(page); | ||
1181 | break; | ||
1182 | } | ||
1183 | head = page_buffers(page); | ||
1184 | bh = head; | ||
1185 | do { | ||
1186 | if (!buffer_delay(bh) && | ||
1187 | !buffer_unwritten(bh)) { | ||
1188 | done = 1; | ||
1189 | break; | ||
1190 | } | ||
1191 | } while ((bh = bh->b_this_page) != head); | ||
1192 | unlock_page(page); | ||
1193 | if (done) | ||
1194 | break; | ||
1195 | idx++; | ||
1196 | num++; | ||
1197 | if (num >= max_pages) | ||
1198 | break; | ||
1199 | } | ||
1200 | pagevec_release(&pvec); | ||
1201 | } | ||
1202 | return num; | ||
1203 | } | ||
1204 | |||
1205 | /* | ||
1148 | * The ext4_get_blocks() function tries to look up the requested blocks, | 1206 | * The ext4_get_blocks() function tries to look up the requested blocks, |
1149 | * and returns if the blocks are already mapped. | 1207 | * and returns if the blocks are already mapped. |
1150 | * | 1208 | * |
@@ -2743,8 +2801,10 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2743 | int no_nrwrite_index_update; | 2801 | int no_nrwrite_index_update; |
2744 | int pages_written = 0; | 2802 | int pages_written = 0; |
2745 | long pages_skipped; | 2803 | long pages_skipped; |
2804 | unsigned int max_pages; | ||
2746 | int range_cyclic, cycled = 1, io_done = 0; | 2805 | int range_cyclic, cycled = 1, io_done = 0; |
2747 | int needed_blocks, ret = 0, nr_to_writebump = 0; | 2806 | int needed_blocks, ret = 0; |
2807 | long desired_nr_to_write, nr_to_writebump = 0; | ||
2748 | loff_t range_start = wbc->range_start; | 2808 | loff_t range_start = wbc->range_start; |
2749 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | 2809 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); |
2750 | 2810 | ||
@@ -2771,16 +2831,6 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2771 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) | 2831 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) |
2772 | return -EROFS; | 2832 | return -EROFS; |
2773 | 2833 | ||
2774 | /* | ||
2775 | * Make sure nr_to_write is >= sbi->s_mb_stream_request | ||
2776 | * This make sure small files blocks are allocated in | ||
2777 | * single attempt. This ensure that small files | ||
2778 | * get less fragmented. | ||
2779 | */ | ||
2780 | if (wbc->nr_to_write < sbi->s_mb_stream_request) { | ||
2781 | nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; | ||
2782 | wbc->nr_to_write = sbi->s_mb_stream_request; | ||
2783 | } | ||
2784 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | 2834 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
2785 | range_whole = 1; | 2835 | range_whole = 1; |
2786 | 2836 | ||
@@ -2795,6 +2845,36 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2795 | } else | 2845 | } else |
2796 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | 2846 | index = wbc->range_start >> PAGE_CACHE_SHIFT; |
2797 | 2847 | ||
2848 | /* | ||
2849 | * This works around two forms of stupidity. The first is in | ||
2850 | * the writeback code, which caps the maximum number of pages | ||
2851 | * written to be 1024 pages. This is wrong on multiple | ||
2852 | * levels; different architectures have a different page size, | ||
2853 | * which changes the maximum amount of data which gets | ||
2854 | * written. Secondly, 4 megabytes is way too small. XFS | ||
2855 | * forces this value to be 16 megabytes by multiplying | ||
2856 | * nr_to_write parameter by four, and then relies on its | ||
2857 | * allocator to allocate larger extents to make them | ||
2858 | * contiguous. Unfortunately this brings us to the second | ||
2859 | * stupidity, which is that ext4's mballoc code only allocates | ||
2860 | * at most 2048 blocks. So we force contiguous writes up to | ||
2861 | * the number of dirty blocks in the inode, or | ||
2862 | * sbi->s_max_writeback_mb_bump, whichever is smaller. | ||
2863 | */ | ||
2864 | max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); | ||
2865 | if (!range_cyclic && range_whole) | ||
2866 | desired_nr_to_write = wbc->nr_to_write * 8; | ||
2867 | else | ||
2868 | desired_nr_to_write = ext4_num_dirty_pages(inode, index, | ||
2869 | max_pages); | ||
2870 | if (desired_nr_to_write > max_pages) | ||
2871 | desired_nr_to_write = max_pages; | ||
2872 | |||
2873 | if (wbc->nr_to_write < desired_nr_to_write) { | ||
2874 | nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; | ||
2875 | wbc->nr_to_write = desired_nr_to_write; | ||
2876 | } | ||
2877 | |||
2798 | mpd.wbc = wbc; | 2878 | mpd.wbc = wbc; |
2799 | mpd.inode = mapping->host; | 2879 | mpd.inode = mapping->host; |
2800 | 2880 | ||
@@ -2914,7 +2994,8 @@ retry: | |||
2914 | out_writepages: | 2994 | out_writepages: |
2915 | if (!no_nrwrite_index_update) | 2995 | if (!no_nrwrite_index_update) |
2916 | wbc->no_nrwrite_index_update = 0; | 2996 | wbc->no_nrwrite_index_update = 0; |
2917 | wbc->nr_to_write -= nr_to_writebump; | 2997 | if (wbc->nr_to_write > nr_to_writebump) |
2998 | wbc->nr_to_write -= nr_to_writebump; | ||
2918 | wbc->range_start = range_start; | 2999 | wbc->range_start = range_start; |
2919 | trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); | 3000 | trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); |
2920 | return ret; | 3001 | return ret; |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index df539ba27779..16817737ba52 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -2197,6 +2197,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); | |||
2197 | EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); | 2197 | EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); |
2198 | EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); | 2198 | EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); |
2199 | EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); | 2199 | EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); |
2200 | EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); | ||
2200 | 2201 | ||
2201 | static struct attribute *ext4_attrs[] = { | 2202 | static struct attribute *ext4_attrs[] = { |
2202 | ATTR_LIST(delayed_allocation_blocks), | 2203 | ATTR_LIST(delayed_allocation_blocks), |
@@ -2210,6 +2211,7 @@ static struct attribute *ext4_attrs[] = { | |||
2210 | ATTR_LIST(mb_order2_req), | 2211 | ATTR_LIST(mb_order2_req), |
2211 | ATTR_LIST(mb_stream_req), | 2212 | ATTR_LIST(mb_stream_req), |
2212 | ATTR_LIST(mb_group_prealloc), | 2213 | ATTR_LIST(mb_group_prealloc), |
2214 | ATTR_LIST(max_writeback_mb_bump), | ||
2213 | NULL, | 2215 | NULL, |
2214 | }; | 2216 | }; |
2215 | 2217 | ||
@@ -2679,6 +2681,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2679 | } | 2681 | } |
2680 | 2682 | ||
2681 | sbi->s_stripe = ext4_get_stripe_size(sbi); | 2683 | sbi->s_stripe = ext4_get_stripe_size(sbi); |
2684 | sbi->s_max_writeback_mb_bump = 128; | ||
2682 | 2685 | ||
2683 | /* | 2686 | /* |
2684 | * set up enough so that it can read an inode | 2687 | * set up enough so that it can read an inode |
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index c1bd8f1e8b94..7c6bbb7198a3 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h | |||
@@ -236,6 +236,7 @@ TRACE_EVENT(ext4_da_writepages, | |||
236 | __field( char, for_kupdate ) | 236 | __field( char, for_kupdate ) |
237 | __field( char, for_reclaim ) | 237 | __field( char, for_reclaim ) |
238 | __field( char, range_cyclic ) | 238 | __field( char, range_cyclic ) |
239 | __field( pgoff_t, writeback_index ) | ||
239 | ), | 240 | ), |
240 | 241 | ||
241 | TP_fast_assign( | 242 | TP_fast_assign( |
@@ -249,15 +250,17 @@ TRACE_EVENT(ext4_da_writepages, | |||
249 | __entry->for_kupdate = wbc->for_kupdate; | 250 | __entry->for_kupdate = wbc->for_kupdate; |
250 | __entry->for_reclaim = wbc->for_reclaim; | 251 | __entry->for_reclaim = wbc->for_reclaim; |
251 | __entry->range_cyclic = wbc->range_cyclic; | 252 | __entry->range_cyclic = wbc->range_cyclic; |
253 | __entry->writeback_index = inode->i_mapping->writeback_index; | ||
252 | ), | 254 | ), |
253 | 255 | ||
254 | TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d", | 256 | TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu", |
255 | jbd2_dev_to_name(__entry->dev), | 257 | jbd2_dev_to_name(__entry->dev), |
256 | (unsigned long) __entry->ino, __entry->nr_to_write, | 258 | (unsigned long) __entry->ino, __entry->nr_to_write, |
257 | __entry->pages_skipped, __entry->range_start, | 259 | __entry->pages_skipped, __entry->range_start, |
258 | __entry->range_end, __entry->nonblocking, | 260 | __entry->range_end, __entry->nonblocking, |
259 | __entry->for_kupdate, __entry->for_reclaim, | 261 | __entry->for_kupdate, __entry->for_reclaim, |
260 | __entry->range_cyclic) | 262 | __entry->range_cyclic, |
263 | (unsigned long) __entry->writeback_index) | ||
261 | ); | 264 | ); |
262 | 265 | ||
263 | TRACE_EVENT(ext4_da_write_pages, | 266 | TRACE_EVENT(ext4_da_write_pages, |
@@ -309,6 +312,7 @@ TRACE_EVENT(ext4_da_writepages_result, | |||
309 | __field( char, encountered_congestion ) | 312 | __field( char, encountered_congestion ) |
310 | __field( char, more_io ) | 313 | __field( char, more_io ) |
311 | __field( char, no_nrwrite_index_update ) | 314 | __field( char, no_nrwrite_index_update ) |
315 | __field( pgoff_t, writeback_index ) | ||
312 | ), | 316 | ), |
313 | 317 | ||
314 | TP_fast_assign( | 318 | TP_fast_assign( |
@@ -320,14 +324,16 @@ TRACE_EVENT(ext4_da_writepages_result, | |||
320 | __entry->encountered_congestion = wbc->encountered_congestion; | 324 | __entry->encountered_congestion = wbc->encountered_congestion; |
321 | __entry->more_io = wbc->more_io; | 325 | __entry->more_io = wbc->more_io; |
322 | __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; | 326 | __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; |
327 | __entry->writeback_index = inode->i_mapping->writeback_index; | ||
323 | ), | 328 | ), |
324 | 329 | ||
325 | TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d", | 330 | TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu", |
326 | jbd2_dev_to_name(__entry->dev), | 331 | jbd2_dev_to_name(__entry->dev), |
327 | (unsigned long) __entry->ino, __entry->ret, | 332 | (unsigned long) __entry->ino, __entry->ret, |
328 | __entry->pages_written, __entry->pages_skipped, | 333 | __entry->pages_written, __entry->pages_skipped, |
329 | __entry->encountered_congestion, __entry->more_io, | 334 | __entry->encountered_congestion, __entry->more_io, |
330 | __entry->no_nrwrite_index_update) | 335 | __entry->no_nrwrite_index_update, |
336 | (unsigned long) __entry->writeback_index) | ||
331 | ); | 337 | ); |
332 | 338 | ||
333 | TRACE_EVENT(ext4_da_write_begin, | 339 | TRACE_EVENT(ext4_da_write_begin, |