aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMingming Cao <cmm@us.ibm.com>2008-07-11 19:27:31 -0400
committerTheodore Ts'o <tytso@mit.edu>2008-07-11 19:27:31 -0400
commit61628a3f3a37af2bf25daf8e26fd6b76a78c4f76 (patch)
tree25375b739b2e3f65c8dff3d3dd2a78e0724d0f96
parent06d6cf6959d22037fcec598f4f954db5db3d7356 (diff)
ext4: Invert lock ordering of page_lock and transaction start in delalloc
With the reverse locking, we need to start a transaction before taking the page lock, so in ext4_da_writepages() we need to break the write-out into chunks, and restart the journal for each chunk to ensure the write-out fits in a single transaction. Updated patch from Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> which fixes the delalloc sync hang with journal lock inversion, and addresses the performance regression issue. Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r--fs/ext4/extents.c10
-rw-r--r--fs/ext4/inode.c201
2 files changed, 152 insertions, 59 deletions
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index dabc3b68d249..42c4c0c892ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2565,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2565 int err = 0, depth, ret; 2565 int err = 0, depth, ret;
2566 unsigned long allocated = 0; 2566 unsigned long allocated = 0;
2567 struct ext4_allocation_request ar; 2567 struct ext4_allocation_request ar;
2568 loff_t disksize;
2568 2569
2569 __clear_bit(BH_New, &bh_result->b_state); 2570 __clear_bit(BH_New, &bh_result->b_state);
2570 ext_debug("blocks %u/%lu requested for inode %u\n", 2571 ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2755,8 +2756,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2755 newblock = ext_pblock(&newex); 2756 newblock = ext_pblock(&newex);
2756 allocated = ext4_ext_get_actual_len(&newex); 2757 allocated = ext4_ext_get_actual_len(&newex);
2757outnew: 2758outnew:
2758 if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) 2759 if (extend_disksize) {
2759 EXT4_I(inode)->i_disksize = inode->i_size; 2760 disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
2761 if (disksize > i_size_read(inode))
2762 disksize = i_size_read(inode);
2763 if (disksize > EXT4_I(inode)->i_disksize)
2764 EXT4_I(inode)->i_disksize = disksize;
2765 }
2760 2766
2761 set_buffer_new(bh_result); 2767 set_buffer_new(bh_result);
2762 2768
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a6b800c58474..7923336ecf94 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -847,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
847 struct ext4_inode_info *ei = EXT4_I(inode); 847 struct ext4_inode_info *ei = EXT4_I(inode);
848 int count = 0; 848 int count = 0;
849 ext4_fsblk_t first_block = 0; 849 ext4_fsblk_t first_block = 0;
850 loff_t disksize;
850 851
851 852
852 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 853 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -922,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
922 * protect it if you're about to implement concurrent 923 * protect it if you're about to implement concurrent
923 * ext4_get_block() -bzzz 924 * ext4_get_block() -bzzz
924 */ 925 */
925 if (!err && extend_disksize && inode->i_size > ei->i_disksize) 926 if (!err && extend_disksize) {
926 ei->i_disksize = inode->i_size; 927 disksize = ((loff_t) iblock + count) << inode->i_blkbits;
928 if (disksize > i_size_read(inode))
929 disksize = i_size_read(inode);
930 if (disksize > ei->i_disksize)
931 ei->i_disksize = disksize;
932 }
927 if (err) 933 if (err)
928 goto cleanup; 934 goto cleanup;
929 935
@@ -1683,13 +1689,11 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1683 do { 1689 do {
1684 if (cur_logical >= logical + blocks) 1690 if (cur_logical >= logical + blocks)
1685 break; 1691 break;
1686
1687 if (buffer_delay(bh)) { 1692 if (buffer_delay(bh)) {
1688 bh->b_blocknr = pblock; 1693 bh->b_blocknr = pblock;
1689 clear_buffer_delay(bh); 1694 clear_buffer_delay(bh);
1690 } else if (buffer_mapped(bh)) { 1695 } else if (buffer_mapped(bh))
1691 BUG_ON(bh->b_blocknr != pblock); 1696 BUG_ON(bh->b_blocknr != pblock);
1692 }
1693 1697
1694 cur_logical++; 1698 cur_logical++;
1695 pblock++; 1699 pblock++;
@@ -1764,10 +1768,10 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
1764 if (buffer_delay(lbh)) 1768 if (buffer_delay(lbh))
1765 mpage_put_bnr_to_bhs(mpd, next, &new); 1769 mpage_put_bnr_to_bhs(mpd, next, &new);
1766 1770
1767 /* go for the remaining blocks */ 1771 /* go for the remaining blocks */
1768 next += new.b_size >> mpd->inode->i_blkbits; 1772 next += new.b_size >> mpd->inode->i_blkbits;
1769 remain -= new.b_size; 1773 remain -= new.b_size;
1770 } 1774 }
1771} 1775}
1772 1776
1773#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) 1777#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
@@ -1993,18 +1997,14 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1993static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, 1997static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
1994 struct buffer_head *bh_result, int create) 1998 struct buffer_head *bh_result, int create)
1995{ 1999{
1996 int ret, needed_blocks = ext4_writepage_trans_blocks(inode); 2000 int ret;
1997 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 2001 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1998 loff_t disksize = EXT4_I(inode)->i_disksize; 2002 loff_t disksize = EXT4_I(inode)->i_disksize;
1999 handle_t *handle = NULL; 2003 handle_t *handle = NULL;
2000 2004
2001 if (create) { 2005 handle = ext4_journal_current_handle();
2002 handle = ext4_journal_start(inode, needed_blocks); 2006 BUG_ON(handle == NULL);
2003 if (IS_ERR(handle)) { 2007 BUG_ON(create == 0);
2004 ret = PTR_ERR(handle);
2005 goto out;
2006 }
2007 }
2008 2008
2009 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, 2009 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2010 bh_result, create, 0, EXT4_DELALLOC_RSVED); 2010 bh_result, create, 0, EXT4_DELALLOC_RSVED);
@@ -2029,65 +2029,157 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2029 up_write(&EXT4_I(inode)->i_data_sem); 2029 up_write(&EXT4_I(inode)->i_data_sem);
2030 2030
2031 if (EXT4_I(inode)->i_disksize == disksize) { 2031 if (EXT4_I(inode)->i_disksize == disksize) {
2032 if (handle == NULL) 2032 ret = ext4_mark_inode_dirty(handle, inode);
2033 handle = ext4_journal_start(inode, 1); 2033 return ret;
2034 if (!IS_ERR(handle))
2035 ext4_mark_inode_dirty(handle, inode);
2036 } 2034 }
2037 } 2035 }
2038
2039 ret = 0; 2036 ret = 0;
2040 } 2037 }
2041
2042out:
2043 if (handle && !IS_ERR(handle))
2044 ext4_journal_stop(handle);
2045
2046 return ret; 2038 return ret;
2047} 2039}
2040
2041static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
2042{
2043 return !buffer_mapped(bh) || buffer_delay(bh);
2044}
2045
2048/* FIXME!! only support data=writeback mode */ 2046/* FIXME!! only support data=writeback mode */
2047/*
2048 * get called vi ext4_da_writepages after taking page lock
2049 * We may end up doing block allocation here in case
2050 * mpage_da_map_blocks failed to allocate blocks.
2051 */
2049static int ext4_da_writepage(struct page *page, 2052static int ext4_da_writepage(struct page *page,
2050 struct writeback_control *wbc) 2053 struct writeback_control *wbc)
2051{ 2054{
2052 struct inode *inode = page->mapping->host;
2053 handle_t *handle = NULL;
2054 int ret = 0; 2055 int ret = 0;
2055 int err; 2056 loff_t size;
2057 unsigned long len;
2058 handle_t *handle = NULL;
2059 struct buffer_head *page_bufs;
2060 struct inode *inode = page->mapping->host;
2056 2061
2057 if (ext4_journal_current_handle()) 2062 handle = ext4_journal_current_handle();
2058 goto out_fail; 2063 if (!handle) {
2064 /*
2065 * This can happen when we aren't called via
2066 * ext4_da_writepages() but directly (shrink_page_list).
2067 * We cannot easily start a transaction here so we just skip
2068 * writing the page in case we would have to do so.
2069 */
2070 size = i_size_read(inode);
2059 2071
2060 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2072 page_bufs = page_buffers(page);
2061 if (IS_ERR(handle)) { 2073 if (page->index == size >> PAGE_CACHE_SHIFT)
2062 ret = PTR_ERR(handle); 2074 len = size & ~PAGE_CACHE_MASK;
2063 goto out_fail; 2075 else
2076 len = PAGE_CACHE_SIZE;
2077
2078 if (walk_page_buffers(NULL, page_bufs, 0,
2079 len, NULL, ext4_bh_unmapped_or_delay)) {
2080 /*
2081 * We can't do block allocation under
2082 * page lock without a handle . So redirty
2083 * the page and return
2084 */
2085 BUG_ON(wbc->sync_mode != WB_SYNC_NONE);
2086 redirty_page_for_writepage(wbc, page);
2087 unlock_page(page);
2088 return 0;
2089 }
2064 } 2090 }
2065 2091
2066 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2092 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2067 ret = nobh_writepage(page, ext4_get_block, wbc); 2093 ret = nobh_writepage(page, ext4_da_get_block_write, wbc);
2068 else 2094 else
2069 ret = block_write_full_page(page, ext4_get_block, wbc); 2095 ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
2070
2071 if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
2072 EXT4_I(inode)->i_disksize = inode->i_size;
2073 ext4_mark_inode_dirty(handle, inode);
2074 }
2075 2096
2076 err = ext4_journal_stop(handle);
2077 if (!ret)
2078 ret = err;
2079 return ret;
2080
2081out_fail:
2082 redirty_page_for_writepage(wbc, page);
2083 unlock_page(page);
2084 return ret; 2097 return ret;
2085} 2098}
2086 2099
2100
2101/*
2102 * For now just follow the DIO way to estimate the max credits
2103 * needed to write out EXT4_MAX_WRITEBACK_PAGES.
2104 * todo: need to calculate the max credits need for
2105 * extent based files, currently the DIO credits is based on
2106 * indirect-blocks mapping way.
2107 *
2108 * Probably should have a generic way to calculate credits
2109 * for DIO, writepages, and truncate
2110 */
2111#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS
2112#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS
2113
2087static int ext4_da_writepages(struct address_space *mapping, 2114static int ext4_da_writepages(struct address_space *mapping,
2088 struct writeback_control *wbc) 2115 struct writeback_control *wbc)
2089{ 2116{
2090 return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write); 2117 struct inode *inode = mapping->host;
2118 handle_t *handle = NULL;
2119 int needed_blocks;
2120 int ret = 0;
2121 long to_write;
2122 loff_t range_start = 0;
2123
2124 /*
2125 * No pages to write? This is mainly a kludge to avoid starting
2126 * a transaction for special inodes like journal inode on last iput()
2127 * because that could violate lock ordering on umount
2128 */
2129 if (!mapping->nrpages)
2130 return 0;
2131
2132 /*
2133 * Estimate the worse case needed credits to write out
2134 * EXT4_MAX_BUF_BLOCKS pages
2135 */
2136 needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
2137
2138 to_write = wbc->nr_to_write;
2139 if (!wbc->range_cyclic) {
2140 /*
2141 * If range_cyclic is not set force range_cont
2142 * and save the old writeback_index
2143 */
2144 wbc->range_cont = 1;
2145 range_start = wbc->range_start;
2146 }
2147
2148 while (!ret && to_write) {
2149 /* start a new transaction*/
2150 handle = ext4_journal_start(inode, needed_blocks);
2151 if (IS_ERR(handle)) {
2152 ret = PTR_ERR(handle);
2153 goto out_writepages;
2154 }
2155 /*
2156 * set the max dirty pages could be write at a time
2157 * to fit into the reserved transaction credits
2158 */
2159 if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
2160 wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
2161
2162 to_write -= wbc->nr_to_write;
2163 ret = mpage_da_writepages(mapping, wbc,
2164 ext4_da_get_block_write);
2165 ext4_journal_stop(handle);
2166 if (wbc->nr_to_write) {
2167 /*
2168 * There is no more writeout needed
2169 * or we requested for a noblocking writeout
2170 * and we found the device congested
2171 */
2172 to_write += wbc->nr_to_write;
2173 break;
2174 }
2175 wbc->nr_to_write = to_write;
2176 }
2177
2178out_writepages:
2179 wbc->nr_to_write = to_write;
2180 if (range_start)
2181 wbc->range_start = range_start;
2182 return ret;
2091} 2183}
2092 2184
2093static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2185static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
@@ -2137,11 +2229,6 @@ out:
2137 return ret; 2229 return ret;
2138} 2230}
2139 2231
2140static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
2141{
2142 return !buffer_mapped(bh) || buffer_delay(bh);
2143}
2144
2145static int ext4_da_write_end(struct file *file, 2232static int ext4_da_write_end(struct file *file,
2146 struct address_space *mapping, 2233 struct address_space *mapping,
2147 loff_t pos, unsigned len, unsigned copied, 2234 loff_t pos, unsigned len, unsigned copied,