author	Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>	2008-08-19 21:55:02 -0400
committer	Theodore Ts'o <tytso@mit.edu>	2008-08-19 21:55:02 -0400
commit	a1d6cc563bfdf1bf2829d3e6ce4d8b774251796b (patch)
tree	ee81b6842191beb85f3f3baab817d115633ba456 /fs
parent	f3bd1f3fa8ca7ec70cfd87aa94dc5e1a260901f2 (diff)
ext4: Rework the ext4_da_writepages() function
With the changes below, we reserve only the journal credits needed to insert the single extent that can result from one call to get_block. This makes sure we don't take too many journal credits during writeout. We also no longer limit the number of pages to write: we loop through the dirty pages building the largest possible contiguous block request, then issue a single get_block request. We may get fewer blocks than we requested; if so, we end up not mapping some of the buffer_heads, which means those buffer_heads are still marked delayed. Later, in the writepage callback via __mpage_writepage, we redirty those pages.

We should also not limit/throttle wbc->nr_to_write in the filesystem writepages callback. Doing so causes wrong behaviour in generic_sync_sb_inodes, because wbc->nr_to_write can end up <= 0.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
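In outline, the reworked writeout loop looks like the sketch below. This is a condensed illustration of the patch, not code lifted from it: error paths, ordered-mode setup and the range_cont/pages_skipped bookkeeping are elided, and it uses only names that appear in the diff (ext4_journal_start, mpage_da_writepages, ext4_da_get_block_write, MPAGE_DA_EXTENT_TAIL).

	/*
	 * Condensed sketch of the per-transaction loop in
	 * ext4_da_writepages() after this patch.  Each pass reserves
	 * credits for a single extent only, because one get_block
	 * call maps at most one extent.
	 */
	to_write = wbc->nr_to_write;
	while (!ret && to_write > 0) {
		needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
		handle = ext4_journal_start(inode, needed_blocks);
		if (IS_ERR(handle))
			return PTR_ERR(handle);

		to_write -= wbc->nr_to_write;
		ret = mpage_da_writepages(mapping, wbc,
					  ext4_da_get_block_write);
		ext4_journal_stop(handle);

		if (ret == MPAGE_DA_EXTENT_TAIL) {
			/* one extent submitted; redirtied pages remain */
			to_write += wbc->nr_to_write;
			ret = 0;	/* go around with a new transaction */
		} else if (wbc->nr_to_write) {
			/* nothing left to write out */
			to_write += wbc->nr_to_write;
			break;
		}
	}
	wbc->nr_to_write = to_write;	/* report what is left undone */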
Diffstat (limited to 'fs')
-rw-r--r--	fs/ext4/inode.c	201
1 file changed, 113 insertions(+), 88 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ffc95ba48859..8dd22eade42c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -41,6 +41,8 @@
 #include "acl.h"
 #include "ext4_extents.h"
 
+#define MPAGE_DA_EXTENT_TAIL 0x01
+
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
@@ -1626,11 +1628,13 @@ struct mpage_da_data {
 	unsigned long first_page, next_page;	/* extent of pages */
 	get_block_t *get_block;
 	struct writeback_control *wbc;
+	int io_done;
+	long pages_written;
 };
 
 /*
  * mpage_da_submit_io - walks through extent of pages and try to write
- * them with __mpage_writepage()
+ * them with writepage() call back
  *
  * @mpd->inode: inode
  * @mpd->first_page: first page of the extent
@@ -1645,18 +1649,11 @@ struct mpage_da_data {
 static int mpage_da_submit_io(struct mpage_da_data *mpd)
 {
 	struct address_space *mapping = mpd->inode->i_mapping;
-	struct mpage_data mpd_pp = {
-		.bio = NULL,
-		.last_block_in_bio = 0,
-		.get_block = mpd->get_block,
-		.use_writepage = 1,
-	};
 	int ret = 0, err, nr_pages, i;
 	unsigned long index, end;
 	struct pagevec pvec;
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
-
 	pagevec_init(&pvec, 0);
 	index = mpd->first_page;
 	end = mpd->next_page - 1;
@@ -1674,8 +1671,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 				break;
 			index++;
 
-			err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
-
+			err = mapping->a_ops->writepage(page, mpd->wbc);
+			if (!err)
+				mpd->pages_written++;
 			/*
 			 * In error case, we have to continue because
 			 * remaining pages are still locked
@@ -1686,9 +1684,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 		}
 		pagevec_release(&pvec);
 	}
-	if (mpd_pp.bio)
-		mpage_bio_submit(WRITE, mpd_pp.bio);
-
 	return ret;
 }
 
@@ -1711,7 +1706,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 	int blocks = exbh->b_size >> inode->i_blkbits;
 	sector_t pblock = exbh->b_blocknr, cur_logical;
 	struct buffer_head *head, *bh;
-	unsigned long index, end;
+	pgoff_t index, end;
 	struct pagevec pvec;
 	int nr_pages, i;
 
@@ -1796,13 +1791,11 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
  *
  * The function skips space we know is already mapped to disk blocks.
  *
- * The function ignores errors ->get_block() returns, thus real
- * error handling is postponed to __mpage_writepage()
  */
 static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
+	int err = 0;
 	struct buffer_head *lbh = &mpd->lbh;
-	int err = 0, remain = lbh->b_size;
 	sector_t next = lbh->b_blocknr;
 	struct buffer_head new;
 
@@ -1812,35 +1805,32 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 	if (buffer_mapped(lbh) && !buffer_delay(lbh))
 		return;
 
-	while (remain) {
-		new.b_state = lbh->b_state;
-		new.b_blocknr = 0;
-		new.b_size = remain;
-		err = mpd->get_block(mpd->inode, next, &new, 1);
-		if (err) {
-			/*
-			 * Rather than implement own error handling
-			 * here, we just leave remaining blocks
-			 * unallocated and try again with ->writepage()
-			 */
-			break;
-		}
-		BUG_ON(new.b_size == 0);
+	new.b_state = lbh->b_state;
+	new.b_blocknr = 0;
+	new.b_size = lbh->b_size;
 
-		if (buffer_new(&new))
-			__unmap_underlying_blocks(mpd->inode, &new);
+	/*
+	 * If we didn't accumulate anything
+	 * to write, simply return
+	 */
+	if (!new.b_size)
+		return;
+	err = mpd->get_block(mpd->inode, next, &new, 1);
+	if (err)
+		return;
+	BUG_ON(new.b_size == 0);
 
-		/*
-		 * If blocks are delayed marked, we need to
-		 * put actual blocknr and drop delayed bit
-		 */
-		if (buffer_delay(lbh) || buffer_unwritten(lbh))
-			mpage_put_bnr_to_bhs(mpd, next, &new);
+	if (buffer_new(&new))
+		__unmap_underlying_blocks(mpd->inode, &new);
 
-		/* go for the remaining blocks */
-		next += new.b_size >> mpd->inode->i_blkbits;
-		remain -= new.b_size;
-	}
+	/*
+	 * If blocks are marked delayed, we need to
+	 * put the actual blocknr and drop the delayed bit
+	 */
+	if (buffer_delay(lbh) || buffer_unwritten(lbh))
+		mpage_put_bnr_to_bhs(mpd, next, &new);
+
+	return;
 }
 
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -1886,13 +1876,9 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	 * need to flush current extent and start new one
 	 */
 	mpage_da_map_blocks(mpd);
-
-	/*
-	 * Now start a new extent
-	 */
-	lbh->b_size = bh->b_size;
-	lbh->b_state = bh->b_state & BH_FLAGS;
-	lbh->b_blocknr = logical;
+	mpage_da_submit_io(mpd);
+	mpd->io_done = 1;
+	return;
 }
 
 /*
@@ -1912,17 +1898,35 @@ static int __mpage_da_writepage(struct page *page,
 	struct buffer_head *bh, *head, fake;
 	sector_t logical;
 
+	if (mpd->io_done) {
+		/*
+		 * Rest of the pages in the page_vec:
+		 * redirty them and skip them. We will
+		 * try to write them again after
+		 * starting a new transaction
+		 */
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return MPAGE_DA_EXTENT_TAIL;
+	}
 	/*
 	 * Can we merge this page to current extent?
 	 */
 	if (mpd->next_page != page->index) {
 		/*
 		 * Nope, we can't. So, we map non-allocated blocks
-		 * and start IO on them using __mpage_writepage()
+		 * and start IO on them using writepage()
 		 */
 		if (mpd->next_page != mpd->first_page) {
 			mpage_da_map_blocks(mpd);
 			mpage_da_submit_io(mpd);
+			/*
+			 * skip the rest of the pages in the page_vec
+			 */
+			mpd->io_done = 1;
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return MPAGE_DA_EXTENT_TAIL;
 		}
 
 		/*
@@ -1953,6 +1957,8 @@ static int __mpage_da_writepage(struct page *page,
 		set_buffer_dirty(bh);
 		set_buffer_uptodate(bh);
 		mpage_add_bh_to_extent(mpd, logical, bh);
+		if (mpd->io_done)
+			return MPAGE_DA_EXTENT_TAIL;
 	} else {
 		/*
 		 * Page with regular buffer heads, just add all dirty ones
@@ -1961,8 +1967,12 @@ static int __mpage_da_writepage(struct page *page,
 		bh = head;
 		do {
 			BUG_ON(buffer_locked(bh));
-			if (buffer_dirty(bh))
+			if (buffer_dirty(bh) &&
+				(!buffer_mapped(bh) || buffer_delay(bh))) {
 				mpage_add_bh_to_extent(mpd, logical, bh);
+				if (mpd->io_done)
+					return MPAGE_DA_EXTENT_TAIL;
+			}
 			logical++;
 		} while ((bh = bh->b_this_page) != head);
 	}
@@ -1981,22 +1991,13 @@ static int __mpage_da_writepage(struct page *page,
  *
  * This is a library function, which implements the writepages()
  * address_space_operation.
- *
- * In order to avoid duplication of logic that deals with partial pages,
- * multiple bio per page, etc, we find non-allocated blocks, allocate
- * them with minimal calls to ->get_block() and re-use __mpage_writepage()
- *
- * It's important that we call __mpage_writepage() only once for each
- * involved page, otherwise we'd have to implement more complicated logic
- * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
- *
- * See comments to mpage_writepages()
  */
 static int mpage_da_writepages(struct address_space *mapping,
 			       struct writeback_control *wbc,
 			       get_block_t get_block)
 {
 	struct mpage_da_data mpd;
+	long to_write;
 	int ret;
 
 	if (!get_block)
@@ -2010,17 +2011,22 @@ static int mpage_da_writepages(struct address_space *mapping,
 	mpd.first_page = 0;
 	mpd.next_page = 0;
 	mpd.get_block = get_block;
+	mpd.io_done = 0;
+	mpd.pages_written = 0;
+
+	to_write = wbc->nr_to_write;
 
 	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
 
 	/*
	 * Handle last extent of pages
 	 */
-	if (mpd.next_page != mpd.first_page) {
+	if (!mpd.io_done && mpd.next_page != mpd.first_page) {
 		mpage_da_map_blocks(&mpd);
 		mpage_da_submit_io(&mpd);
 	}
 
+	wbc->nr_to_write = to_write - mpd.pages_written;
 	return ret;
 }
 
@@ -2238,7 +2244,7 @@ static int ext4_da_writepage(struct page *page,
 #define EXT4_MAX_WRITEBACK_CREDITS 25
 
 static int ext4_da_writepages(struct address_space *mapping,
-				struct writeback_control *wbc)
+			      struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
 	handle_t *handle = NULL;
@@ -2246,42 +2252,53 @@ static int ext4_da_writepages(struct address_space *mapping,
 	int ret = 0;
 	long to_write;
 	loff_t range_start = 0;
+	long pages_skipped = 0;
 
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
 	 * a transaction for special inodes like journal inode on last iput()
 	 * because that could violate lock ordering on umount
 	 */
-	if (!mapping->nrpages)
+	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
 
-	/*
-	 * Estimate the worse case needed credits to write out
-	 * EXT4_MAX_BUF_BLOCKS pages
-	 */
-	needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
-
-	to_write = wbc->nr_to_write;
-	if (!wbc->range_cyclic) {
+	if (!wbc->range_cyclic)
 		/*
 		 * If range_cyclic is not set force range_cont
 		 * and save the old writeback_index
 		 */
 		wbc->range_cont = 1;
-		range_start = wbc->range_start;
-	}
 
-	while (!ret && to_write) {
+	range_start = wbc->range_start;
+	pages_skipped = wbc->pages_skipped;
+
+restart_loop:
+	to_write = wbc->nr_to_write;
+	while (!ret && to_write > 0) {
+
+		/*
+		 * We insert one extent at a time, so we need
+		 * the credits for a single extent allocation.
+		 * Journalled mode is currently not supported
+		 * by delalloc.
+		 */
+		BUG_ON(ext4_should_journal_data(inode));
+		needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+
 		/* start a new transaction*/
 		handle = ext4_journal_start(inode, needed_blocks);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
+			printk(KERN_EMERG "%s: jbd2_start: "
+			       "%ld pages, ino %lu; err %d\n", __func__,
			       wbc->nr_to_write, inode->i_ino, ret);
+			dump_stack();
 			goto out_writepages;
 		}
 		if (ext4_should_order_data(inode)) {
 			/*
 			 * With ordered mode we need to add
 			 * the inode to the journal handle
 			 * when we do block allocation.
 			 */
 			ret = ext4_jbd2_file_inode(handle, inode);
@@ -2289,20 +2306,20 @@ static int ext4_da_writepages(struct address_space *mapping,
 				ext4_journal_stop(handle);
 				goto out_writepages;
 			}
-
 		}
-		/*
-		 * set the max dirty pages could be write at a time
-		 * to fit into the reserved transaction credits
-		 */
-		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
-			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
 
 		to_write -= wbc->nr_to_write;
 		ret = mpage_da_writepages(mapping, wbc,
 					  ext4_da_get_block_write);
 		ext4_journal_stop(handle);
-		if (wbc->nr_to_write) {
+		if (ret == MPAGE_DA_EXTENT_TAIL) {
+			/*
			 * got one extent; now try with the
+			 * rest of the pages
+			 */
+			to_write += wbc->nr_to_write;
+			ret = 0;
+		} else if (wbc->nr_to_write) {
 			/*
 			 * There is no more writeout needed
 			 * or we requested for a noblocking writeout
@@ -2314,10 +2331,18 @@ static int ext4_da_writepages(struct address_space *mapping,
 			wbc->nr_to_write = to_write;
 	}
 
+	if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
+		/* We skipped pages in this loop */
+		wbc->range_start = range_start;
+		wbc->nr_to_write = to_write +
+				wbc->pages_skipped - pages_skipped;
+		wbc->pages_skipped = pages_skipped;
+		goto restart_loop;
+	}
+
 out_writepages:
 	wbc->nr_to_write = to_write;
-	if (range_start)
-		wbc->range_start = range_start;
+	wbc->range_start = range_start;
 	return ret;
 }
 
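To summarise the control flow this patch sets up inside a single transaction: __mpage_da_writepage() accumulates contiguous delayed/unmapped buffers into one extent; when the extent cannot grow any further, mpage_add_bh_to_extent() maps and submits it and sets mpd->io_done, after which every remaining page in the pagevec is simply redirtied for the next transaction. A condensed sketch of that handshake, with names as in the diff above and all locking and buffer-walking details elided:

	/* inside __mpage_da_writepage(), as reworked by this patch */
	if (mpd->io_done) {
		/*
		 * The current extent was already mapped and submitted;
		 * defer this page to the next transaction by redirtying
		 * it, and tell the caller we stopped at an extent tail.
		 */
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return MPAGE_DA_EXTENT_TAIL;
	}
	/* ...otherwise keep growing mpd->lbh via mpage_add_bh_to_extent() */

The MPAGE_DA_EXTENT_TAIL return value then propagates up through mpage_da_writepages() to ext4_da_writepages(), which restarts the loop with a fresh transaction for the redirtied tail.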