author	Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>	2008-10-16 10:10:36 -0400
committer	Theodore Ts'o <tytso@mit.edu>	2008-10-16 10:10:36 -0400
commit	22208dedbd7626e5fc4339c417f8d24cc21f79d7 (patch)
tree	d03fc8524dc7cb2f18c2038ffe4d2146903f630e /fs/ext4
parent	17bc6c30cf6bfffd816bdc53682dd46fc34a2cf4 (diff)
ext4: Fix file fragmentation during large file write.
The range_cyclic writeback mode uses the address_space writeback_index
as the start index for writeback. With delayed allocation we were
updating writeback_index incorrectly, resulting in highly fragmented
files. With this patch the number of extents for a 3GB file drops
from 4000 to 27.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
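
In short, ext4_da_writepages() now tracks the start index itself and
advances mapping->writeback_index only by the number of pages actually
written. A condensed sketch of that bookkeeping, hand-reduced from the
diff below (journalling, the ENOSPC retry, and the actual extent
flushing are elided; the mpd/ret plumbing is simplified):

	pgoff_t index;
	long pages_written = 0;	/* accumulated across the flush loop */
	int range_whole = (wbc->range_start == 0 &&
			   wbc->range_end == LLONG_MAX);

	if (wbc->range_cyclic)	/* resume where the last pass stopped */
		index = mapping->writeback_index;
	else
		index = wbc->range_start >> PAGE_CACHE_SHIFT;

	/* keep write_cache_pages() from doing its own accounting */
	wbc->no_nrwrite_index_update = 1;

	/* ... flush loop: one extent per iteration, each round adding
	 *     mpd.pages_written to pages_written (see the diff) ... */

	/*
	 * Advance only past what was actually written, so the next
	 * range_cyclic pass resumes at the correct page instead of
	 * jumping ahead and leaving fragmented allocations behind.
	 */
	index += pages_written;
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = index;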
Diffstat (limited to 'fs/ext4')
-rw-r--r--	fs/ext4/inode.c	91
1 file changed, 57 insertions(+), 34 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4ee3f0692eeb..27fc6b951221 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 	int ret = 0, err, nr_pages, i;
 	unsigned long index, end;
 	struct pagevec pvec;
+	long pages_skipped;
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
 	pagevec_init(&pvec, 0);
@@ -1655,7 +1656,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 	end = mpd->next_page - 1;
 
 	while (index <= end) {
-		/* XXX: optimize tail */
 		/*
 		 * We can use PAGECACHE_TAG_DIRTY lookup here because
 		 * even though we have cleared the dirty flag on the page
@@ -1673,8 +1673,13 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
+			pages_skipped = mpd->wbc->pages_skipped;
 			err = mapping->a_ops->writepage(page, mpd->wbc);
-			if (!err)
+			if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+				/*
+				 * have successfully written the page
+				 * without skipping the same
+				 */
 				mpd->pages_written++;
 			/*
 			 * In error case, we have to continue because
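
Net effect of the hunks above: mpage_da_submit_io() counts a page as
written only if ->writepage() neither failed nor skipped it. A skipped
page shows up as a bump of wbc->pages_skipped rather than as an error,
so the counter is sampled around the call. The idiom, lifted straight
from the hunk:

	pages_skipped = mpd->wbc->pages_skipped;
	err = mapping->a_ops->writepage(page, mpd->wbc);
	if (!err && (pages_skipped == mpd->wbc->pages_skipped))
		/* written without being skipped */
		mpd->pages_written++;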
@@ -2110,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
 			       struct writeback_control *wbc,
 			       struct mpage_da_data *mpd)
 {
-	long to_write;
 	int ret;
 
 	if (!mpd->get_block)
@@ -2125,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping,
 	mpd->pages_written = 0;
 	mpd->retval = 0;
 
-	to_write = wbc->nr_to_write;
-
 	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-
 	/*
 	 * Handle last extent of pages
 	 */
 	if (!mpd->io_done && mpd->next_page != mpd->first_page) {
 		if (mpage_da_map_blocks(mpd) == 0)
 			mpage_da_submit_io(mpd);
-	}
 
-	wbc->nr_to_write = to_write - mpd->pages_written;
+		mpd->io_done = 1;
+		ret = MPAGE_DA_EXTENT_TAIL;
+	}
+	wbc->nr_to_write -= mpd->pages_written;
 	return ret;
 }
 
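With this change mpage_da_writepages() handles at most one extent per
call: when it has to map and flush a trailing extent itself, it marks
mpd->io_done and returns MPAGE_DA_EXTENT_TAIL so the caller retries
for the remaining pages. The caller loop (shown in full in the
ext4_da_writepages() hunks below; the ENOSPC branch is elided here)
reduces to roughly:

	while (!ret && wbc->nr_to_write > 0) {
		ret = mpage_da_writepages(mapping, wbc, &mpd);
		if (ret == MPAGE_DA_EXTENT_TAIL) {
			/* got one extent, retry for the rest */
			pages_written += mpd.pages_written;
			ret = 0;
		} else if (wbc->nr_to_write)
			/* no more writeout needed, or congestion */
			break;
	}
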
@@ -2366,11 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
 static int ext4_da_writepages(struct address_space *mapping,
 			      struct writeback_control *wbc)
 {
+	pgoff_t index;
+	int range_whole = 0;
 	handle_t *handle = NULL;
 	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
+	int no_nrwrite_index_update;
+	long pages_written = 0, pages_skipped;
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
-	long to_write, pages_skipped = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
 	/*
@@ -2390,16 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping,
 		nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
 		wbc->nr_to_write = sbi->s_mb_stream_request;
 	}
+	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+		range_whole = 1;
 
-
-	pages_skipped = wbc->pages_skipped;
+	if (wbc->range_cyclic)
+		index = mapping->writeback_index;
+	else
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 
 	mpd.wbc = wbc;
 	mpd.inode = mapping->host;
 
-restart_loop:
-	to_write = wbc->nr_to_write;
-	while (!ret && to_write > 0) {
+	/*
+	 * we don't want write_cache_pages to update
+	 * nr_to_write and writeback_index
+	 */
+	no_nrwrite_index_update = wbc->no_nrwrite_index_update;
+	wbc->no_nrwrite_index_update = 1;
+	pages_skipped = wbc->pages_skipped;
+
+	while (!ret && wbc->nr_to_write > 0) {
 
 		/*
 		 * we insert one extent at a time. So we need
@@ -2420,46 +2436,53 @@ restart_loop:
 			dump_stack();
 			goto out_writepages;
 		}
-		to_write -= wbc->nr_to_write;
-
 		mpd.get_block = ext4_da_get_block_write;
 		ret = mpage_da_writepages(mapping, wbc, &mpd);
 
 		ext4_journal_stop(handle);
 
-		if (mpd.retval == -ENOSPC)
+		if (mpd.retval == -ENOSPC) {
+			/* commit the transaction which would
+			 * free blocks released in the transaction
+			 * and try again
+			 */
 			jbd2_journal_force_commit_nested(sbi->s_journal);
-
-		/* reset the retry count */
-		if (ret == MPAGE_DA_EXTENT_TAIL) {
+			wbc->pages_skipped = pages_skipped;
+			ret = 0;
+		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
 			/*
 			 * got one extent now try with
 			 * rest of the pages
 			 */
-			to_write += wbc->nr_to_write;
+			pages_written += mpd.pages_written;
+			wbc->pages_skipped = pages_skipped;
 			ret = 0;
-		} else if (wbc->nr_to_write) {
+		} else if (wbc->nr_to_write)
 			/*
 			 * There is no more writeout needed
 			 * or we requested for a noblocking writeout
 			 * and we found the device congested
 			 */
-			to_write += wbc->nr_to_write;
 			break;
-		}
-		wbc->nr_to_write = to_write;
-	}
-
-	if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) {
-		/* We skipped pages in this loop */
-		wbc->nr_to_write = to_write +
-			wbc->pages_skipped - pages_skipped;
-		wbc->pages_skipped = pages_skipped;
-		goto restart_loop;
 	}
+	if (pages_skipped != wbc->pages_skipped)
+		printk(KERN_EMERG "This should not happen leaving %s "
+				"with nr_to_write = %ld ret = %d\n",
+				__func__, wbc->nr_to_write, ret);
+
+	/* Update index */
+	index += pages_written;
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		/*
+		 * set the writeback_index so that range_cyclic
+		 * mode will write it back later
+		 */
+		mapping->writeback_index = index;
 
 out_writepages:
-	wbc->nr_to_write = to_write - nr_to_writebump;
+	if (!no_nrwrite_index_update)
+		wbc->no_nrwrite_index_update = 0;
+	wbc->nr_to_write -= nr_to_writebump;
 	return ret;
 }
 
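The no_nrwrite_index_update handling above relies on the flag
introduced by the parent commit (17bc6c30cf6b), which teaches
write_cache_pages() to leave wbc->nr_to_write and
mapping->writeback_index alone when the filesystem wants to account
for them itself. Roughly, and paraphrased rather than quoted from
mm/page-writeback.c of that era:

	if (!wbc->no_nrwrite_index_update) {
		if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
			mapping->writeback_index = index;
		/* nr_to_write is likewise decremented here when the
		 * flag is clear; ext4 now does both updates itself */
	}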