diff options
author | Mingming Cao <cmm@us.ibm.com> | 2008-07-11 19:27:31 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2008-07-11 19:27:31 -0400 |
commit | 61628a3f3a37af2bf25daf8e26fd6b76a78c4f76 (patch) | |
tree | 25375b739b2e3f65c8dff3d3dd2a78e0724d0f96 /fs/ext4 | |
parent | 06d6cf6959d22037fcec598f4f954db5db3d7356 (diff) |
ext4: Invert lock ordering of page_lock and transaction start in delalloc
With the reverse locking, we need to start a transaction before taking
the page lock, so in ext4_da_writepages() we need to break the write-out
into chunks, and restart the journal for each chunk to ensure the
write-out fits in a single transaction.
Updated patch from Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
which fixes the delalloc sync hang with journal lock inversion, and addresses
the performance regression issue.
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4')
-rw-r--r-- | fs/ext4/extents.c | 10 | ||||
-rw-r--r-- | fs/ext4/inode.c | 201 |
2 files changed, 152 insertions, 59 deletions
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index dabc3b68d249..42c4c0c892ed 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -2565,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2565 | int err = 0, depth, ret; | 2565 | int err = 0, depth, ret; |
2566 | unsigned long allocated = 0; | 2566 | unsigned long allocated = 0; |
2567 | struct ext4_allocation_request ar; | 2567 | struct ext4_allocation_request ar; |
2568 | loff_t disksize; | ||
2568 | 2569 | ||
2569 | __clear_bit(BH_New, &bh_result->b_state); | 2570 | __clear_bit(BH_New, &bh_result->b_state); |
2570 | ext_debug("blocks %u/%lu requested for inode %u\n", | 2571 | ext_debug("blocks %u/%lu requested for inode %u\n", |
@@ -2755,8 +2756,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2755 | newblock = ext_pblock(&newex); | 2756 | newblock = ext_pblock(&newex); |
2756 | allocated = ext4_ext_get_actual_len(&newex); | 2757 | allocated = ext4_ext_get_actual_len(&newex); |
2757 | outnew: | 2758 | outnew: |
2758 | if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) | 2759 | if (extend_disksize) { |
2759 | EXT4_I(inode)->i_disksize = inode->i_size; | 2760 | disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; |
2761 | if (disksize > i_size_read(inode)) | ||
2762 | disksize = i_size_read(inode); | ||
2763 | if (disksize > EXT4_I(inode)->i_disksize) | ||
2764 | EXT4_I(inode)->i_disksize = disksize; | ||
2765 | } | ||
2760 | 2766 | ||
2761 | set_buffer_new(bh_result); | 2767 | set_buffer_new(bh_result); |
2762 | 2768 | ||
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a6b800c58474..7923336ecf94 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -847,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
847 | struct ext4_inode_info *ei = EXT4_I(inode); | 847 | struct ext4_inode_info *ei = EXT4_I(inode); |
848 | int count = 0; | 848 | int count = 0; |
849 | ext4_fsblk_t first_block = 0; | 849 | ext4_fsblk_t first_block = 0; |
850 | loff_t disksize; | ||
850 | 851 | ||
851 | 852 | ||
852 | J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); | 853 | J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); |
@@ -922,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
922 | * protect it if you're about to implement concurrent | 923 | * protect it if you're about to implement concurrent |
923 | * ext4_get_block() -bzzz | 924 | * ext4_get_block() -bzzz |
924 | */ | 925 | */ |
925 | if (!err && extend_disksize && inode->i_size > ei->i_disksize) | 926 | if (!err && extend_disksize) { |
926 | ei->i_disksize = inode->i_size; | 927 | disksize = ((loff_t) iblock + count) << inode->i_blkbits; |
928 | if (disksize > i_size_read(inode)) | ||
929 | disksize = i_size_read(inode); | ||
930 | if (disksize > ei->i_disksize) | ||
931 | ei->i_disksize = disksize; | ||
932 | } | ||
927 | if (err) | 933 | if (err) |
928 | goto cleanup; | 934 | goto cleanup; |
929 | 935 | ||
@@ -1683,13 +1689,11 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, | |||
1683 | do { | 1689 | do { |
1684 | if (cur_logical >= logical + blocks) | 1690 | if (cur_logical >= logical + blocks) |
1685 | break; | 1691 | break; |
1686 | |||
1687 | if (buffer_delay(bh)) { | 1692 | if (buffer_delay(bh)) { |
1688 | bh->b_blocknr = pblock; | 1693 | bh->b_blocknr = pblock; |
1689 | clear_buffer_delay(bh); | 1694 | clear_buffer_delay(bh); |
1690 | } else if (buffer_mapped(bh)) { | 1695 | } else if (buffer_mapped(bh)) |
1691 | BUG_ON(bh->b_blocknr != pblock); | 1696 | BUG_ON(bh->b_blocknr != pblock); |
1692 | } | ||
1693 | 1697 | ||
1694 | cur_logical++; | 1698 | cur_logical++; |
1695 | pblock++; | 1699 | pblock++; |
@@ -1764,10 +1768,10 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) | |||
1764 | if (buffer_delay(lbh)) | 1768 | if (buffer_delay(lbh)) |
1765 | mpage_put_bnr_to_bhs(mpd, next, &new); | 1769 | mpage_put_bnr_to_bhs(mpd, next, &new); |
1766 | 1770 | ||
1767 | /* go for the remaining blocks */ | 1771 | /* go for the remaining blocks */ |
1768 | next += new.b_size >> mpd->inode->i_blkbits; | 1772 | next += new.b_size >> mpd->inode->i_blkbits; |
1769 | remain -= new.b_size; | 1773 | remain -= new.b_size; |
1770 | } | 1774 | } |
1771 | } | 1775 | } |
1772 | 1776 | ||
1773 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) | 1777 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) |
@@ -1993,18 +1997,14 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | |||
1993 | static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, | 1997 | static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, |
1994 | struct buffer_head *bh_result, int create) | 1998 | struct buffer_head *bh_result, int create) |
1995 | { | 1999 | { |
1996 | int ret, needed_blocks = ext4_writepage_trans_blocks(inode); | 2000 | int ret; |
1997 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | 2001 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; |
1998 | loff_t disksize = EXT4_I(inode)->i_disksize; | 2002 | loff_t disksize = EXT4_I(inode)->i_disksize; |
1999 | handle_t *handle = NULL; | 2003 | handle_t *handle = NULL; |
2000 | 2004 | ||
2001 | if (create) { | 2005 | handle = ext4_journal_current_handle(); |
2002 | handle = ext4_journal_start(inode, needed_blocks); | 2006 | BUG_ON(handle == NULL); |
2003 | if (IS_ERR(handle)) { | 2007 | BUG_ON(create == 0); |
2004 | ret = PTR_ERR(handle); | ||
2005 | goto out; | ||
2006 | } | ||
2007 | } | ||
2008 | 2008 | ||
2009 | ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, | 2009 | ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, |
2010 | bh_result, create, 0, EXT4_DELALLOC_RSVED); | 2010 | bh_result, create, 0, EXT4_DELALLOC_RSVED); |
@@ -2029,65 +2029,157 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, | |||
2029 | up_write(&EXT4_I(inode)->i_data_sem); | 2029 | up_write(&EXT4_I(inode)->i_data_sem); |
2030 | 2030 | ||
2031 | if (EXT4_I(inode)->i_disksize == disksize) { | 2031 | if (EXT4_I(inode)->i_disksize == disksize) { |
2032 | if (handle == NULL) | 2032 | ret = ext4_mark_inode_dirty(handle, inode); |
2033 | handle = ext4_journal_start(inode, 1); | 2033 | return ret; |
2034 | if (!IS_ERR(handle)) | ||
2035 | ext4_mark_inode_dirty(handle, inode); | ||
2036 | } | 2034 | } |
2037 | } | 2035 | } |
2038 | |||
2039 | ret = 0; | 2036 | ret = 0; |
2040 | } | 2037 | } |
2041 | |||
2042 | out: | ||
2043 | if (handle && !IS_ERR(handle)) | ||
2044 | ext4_journal_stop(handle); | ||
2045 | |||
2046 | return ret; | 2038 | return ret; |
2047 | } | 2039 | } |
2040 | |||
2041 | static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) | ||
2042 | { | ||
2043 | return !buffer_mapped(bh) || buffer_delay(bh); | ||
2044 | } | ||
2045 | |||
2048 | /* FIXME!! only support data=writeback mode */ | 2046 | /* FIXME!! only support data=writeback mode */ |
2047 | /* | ||
2048 | * get called via ext4_da_writepages after taking page lock | ||
2049 | * We may end up doing block allocation here in case | ||
2050 | * mpage_da_map_blocks failed to allocate blocks. | ||
2051 | */ | ||
2049 | static int ext4_da_writepage(struct page *page, | 2052 | static int ext4_da_writepage(struct page *page, |
2050 | struct writeback_control *wbc) | 2053 | struct writeback_control *wbc) |
2051 | { | 2054 | { |
2052 | struct inode *inode = page->mapping->host; | ||
2053 | handle_t *handle = NULL; | ||
2054 | int ret = 0; | 2055 | int ret = 0; |
2055 | int err; | 2056 | loff_t size; |
2057 | unsigned long len; | ||
2058 | handle_t *handle = NULL; | ||
2059 | struct buffer_head *page_bufs; | ||
2060 | struct inode *inode = page->mapping->host; | ||
2056 | 2061 | ||
2057 | if (ext4_journal_current_handle()) | 2062 | handle = ext4_journal_current_handle(); |
2058 | goto out_fail; | 2063 | if (!handle) { |
2064 | /* | ||
2065 | * This can happen when we aren't called via | ||
2066 | * ext4_da_writepages() but directly (shrink_page_list). | ||
2067 | * We cannot easily start a transaction here so we just skip | ||
2068 | * writing the page in case we would have to do so. | ||
2069 | */ | ||
2070 | size = i_size_read(inode); | ||
2059 | 2071 | ||
2060 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | 2072 | page_bufs = page_buffers(page); |
2061 | if (IS_ERR(handle)) { | 2073 | if (page->index == size >> PAGE_CACHE_SHIFT) |
2062 | ret = PTR_ERR(handle); | 2074 | len = size & ~PAGE_CACHE_MASK; |
2063 | goto out_fail; | 2075 | else |
2076 | len = PAGE_CACHE_SIZE; | ||
2077 | |||
2078 | if (walk_page_buffers(NULL, page_bufs, 0, | ||
2079 | len, NULL, ext4_bh_unmapped_or_delay)) { | ||
2080 | /* | ||
2081 | * We can't do block allocation under | ||
2082 | * page lock without a handle . So redirty | ||
2083 | * the page and return | ||
2084 | */ | ||
2085 | BUG_ON(wbc->sync_mode != WB_SYNC_NONE); | ||
2086 | redirty_page_for_writepage(wbc, page); | ||
2087 | unlock_page(page); | ||
2088 | return 0; | ||
2089 | } | ||
2064 | } | 2090 | } |
2065 | 2091 | ||
2066 | if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) | 2092 | if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) |
2067 | ret = nobh_writepage(page, ext4_get_block, wbc); | 2093 | ret = nobh_writepage(page, ext4_da_get_block_write, wbc); |
2068 | else | 2094 | else |
2069 | ret = block_write_full_page(page, ext4_get_block, wbc); | 2095 | ret = block_write_full_page(page, ext4_da_get_block_write, wbc); |
2070 | |||
2071 | if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) { | ||
2072 | EXT4_I(inode)->i_disksize = inode->i_size; | ||
2073 | ext4_mark_inode_dirty(handle, inode); | ||
2074 | } | ||
2075 | 2096 | ||
2076 | err = ext4_journal_stop(handle); | ||
2077 | if (!ret) | ||
2078 | ret = err; | ||
2079 | return ret; | ||
2080 | |||
2081 | out_fail: | ||
2082 | redirty_page_for_writepage(wbc, page); | ||
2083 | unlock_page(page); | ||
2084 | return ret; | 2097 | return ret; |
2085 | } | 2098 | } |
2086 | 2099 | ||
2100 | |||
2101 | /* | ||
2102 | * For now just follow the DIO way to estimate the max credits | ||
2103 | * needed to write out EXT4_MAX_WRITEBACK_PAGES. | ||
2104 | * todo: need to calculate the max credits need for | ||
2105 | * extent based files, currently the DIO credits is based on | ||
2106 | * indirect-blocks mapping way. | ||
2107 | * | ||
2108 | * Probably should have a generic way to calculate credits | ||
2109 | * for DIO, writepages, and truncate | ||
2110 | */ | ||
2111 | #define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS | ||
2112 | #define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS | ||
2113 | |||
2087 | static int ext4_da_writepages(struct address_space *mapping, | 2114 | static int ext4_da_writepages(struct address_space *mapping, |
2088 | struct writeback_control *wbc) | 2115 | struct writeback_control *wbc) |
2089 | { | 2116 | { |
2090 | return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write); | 2117 | struct inode *inode = mapping->host; |
2118 | handle_t *handle = NULL; | ||
2119 | int needed_blocks; | ||
2120 | int ret = 0; | ||
2121 | long to_write; | ||
2122 | loff_t range_start = 0; | ||
2123 | |||
2124 | /* | ||
2125 | * No pages to write? This is mainly a kludge to avoid starting | ||
2126 | * a transaction for special inodes like journal inode on last iput() | ||
2127 | * because that could violate lock ordering on umount | ||
2128 | */ | ||
2129 | if (!mapping->nrpages) | ||
2130 | return 0; | ||
2131 | |||
2132 | /* | ||
2133 | * Estimate the worse case needed credits to write out | ||
2134 | * EXT4_MAX_WRITEBACK_PAGES pages | ||
2135 | */ | ||
2136 | needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; | ||
2137 | |||
2138 | to_write = wbc->nr_to_write; | ||
2139 | if (!wbc->range_cyclic) { | ||
2140 | /* | ||
2141 | * If range_cyclic is not set force range_cont | ||
2142 | * and save the old writeback_index | ||
2143 | */ | ||
2144 | wbc->range_cont = 1; | ||
2145 | range_start = wbc->range_start; | ||
2146 | } | ||
2147 | |||
2148 | while (!ret && to_write) { | ||
2149 | /* start a new transaction*/ | ||
2150 | handle = ext4_journal_start(inode, needed_blocks); | ||
2151 | if (IS_ERR(handle)) { | ||
2152 | ret = PTR_ERR(handle); | ||
2153 | goto out_writepages; | ||
2154 | } | ||
2155 | /* | ||
2156 | * set the max dirty pages could be write at a time | ||
2157 | * to fit into the reserved transaction credits | ||
2158 | */ | ||
2159 | if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) | ||
2160 | wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; | ||
2161 | |||
2162 | to_write -= wbc->nr_to_write; | ||
2163 | ret = mpage_da_writepages(mapping, wbc, | ||
2164 | ext4_da_get_block_write); | ||
2165 | ext4_journal_stop(handle); | ||
2166 | if (wbc->nr_to_write) { | ||
2167 | /* | ||
2168 | * There is no more writeout needed | ||
2169 | * or we requested for a nonblocking writeout | ||
2170 | * and we found the device congested | ||
2171 | */ | ||
2172 | to_write += wbc->nr_to_write; | ||
2173 | break; | ||
2174 | } | ||
2175 | wbc->nr_to_write = to_write; | ||
2176 | } | ||
2177 | |||
2178 | out_writepages: | ||
2179 | wbc->nr_to_write = to_write; | ||
2180 | if (range_start) | ||
2181 | wbc->range_start = range_start; | ||
2182 | return ret; | ||
2091 | } | 2183 | } |
2092 | 2184 | ||
2093 | static int ext4_da_write_begin(struct file *file, struct address_space *mapping, | 2185 | static int ext4_da_write_begin(struct file *file, struct address_space *mapping, |
@@ -2137,11 +2229,6 @@ out: | |||
2137 | return ret; | 2229 | return ret; |
2138 | } | 2230 | } |
2139 | 2231 | ||
2140 | static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) | ||
2141 | { | ||
2142 | return !buffer_mapped(bh) || buffer_delay(bh); | ||
2143 | } | ||
2144 | |||
2145 | static int ext4_da_write_end(struct file *file, | 2232 | static int ext4_da_write_end(struct file *file, |
2146 | struct address_space *mapping, | 2233 | struct address_space *mapping, |
2147 | loff_t pos, unsigned len, unsigned copied, | 2234 | loff_t pos, unsigned len, unsigned copied, |