From 89a4e48f8479f8145eca9698f39fe188c982212f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Aug 2012 08:54:52 -0400 Subject: ext4: fix kernel BUG on large-scale rm -rf commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 968dee7722: "ext4: fix hole punch failure when depth is greater than 0" introduced a regression in v3.5.1/v3.6-rc1 which caused kernel crashes when users ran run "rm -rf" on large directory hierarchy on ext4 filesystems on RAID devices: BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 Process rm (pid: 18229, threadinfo ffff8801276bc000, task ffff880123631710) Call Trace: [] ? __ext4_handle_dirty_metadata+0x83/0x110 [] ext4_ext_truncate+0x193/0x1d0 [] ? ext4_mark_inode_dirty+0x7f/0x1f0 [] ext4_truncate+0xf5/0x100 [] ext4_evict_inode+0x461/0x490 [] evict+0xa2/0x1a0 [] iput+0x103/0x1f0 [] do_unlinkat+0x154/0x1c0 [] ? sys_newfstatat+0x2a/0x40 [] sys_unlinkat+0x1b/0x50 [] system_call_fastpath+0x16/0x1b Code: 8b 4d 20 0f b7 41 02 48 8d 04 40 48 8d 04 81 49 89 45 18 0f b7 49 02 48 83 c1 01 49 89 4d 00 e9 ae f8 ff ff 0f 1f 00 49 8b 45 28 <48> 8b 40 28 49 89 45 20 e9 85 f8 ff ff 0f 1f 80 00 00 00 RIP [] ext4_ext_remove_space+0xa34/0xdf0 This could be reproduced as follows: The problem in commit 968dee7722 was that caused the variable 'i' to be left uninitialized if the truncate required more space than was available in the journal. This resulted in the function ext4_ext_truncate_extend_restart() returning -EAGAIN, which caused ext4_ext_remove_space() to restart the truncate operation after starting a new jbd2 handle. Reported-by: Maciej Żenczykowski Reported-by: Marti Raudsepp Tested-by: Fengguang Wu Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cd0c7ed06772..aabbb3f53683 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2662,6 +2662,7 @@ cont: } path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); + i = 0; if (ext4_ext_check(inode, path[0].p_hdr, depth)) { err = -EIO; -- cgit v1.2.2 From ecb94f5fdf4b72547fca022421a9dca1672bddd4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Aug 2012 09:44:17 -0400 Subject: ext4: collapse a single extent tree block into the inode if possible If an inode has more than 4 extents, but then later some of the extents are merged together, we can optimize the file system by moving the extents up into the inode, and discarding the extent tree block. This is important, because if there are a large number of inodes with an external extent tree blocks where the contents could fit in the inode, this can significantly increase the fsck time of the file system. Google-Bug-Id: 6801242 Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 72 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 14 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index aabbb3f53683..e8755c21f4b9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1655,17 +1655,61 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, return merge_done; } +/* + * This function does a very simple check to see if we can collapse + * an extent tree with a single extent tree leaf block into the inode. + */ +static void ext4_ext_try_to_merge_up(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + size_t s; + unsigned max_root = ext4_ext_space_root(inode, 0); + ext4_fsblk_t blk; + + if ((path[0].p_depth != 1) || + (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || + (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) + return; + + /* + * We need to modify the block allocation bitmap and the block + * group descriptor to release the extent tree block. If we + * can't get the journal credits, give up. + */ + if (ext4_journal_extend(handle, 2)) + return; + + /* + * Copy the extent data up to the inode + */ + blk = ext4_idx_pblock(path[0].p_idx); + s = le16_to_cpu(path[1].p_hdr->eh_entries) * + sizeof(struct ext4_extent_idx); + s += sizeof(struct ext4_extent_header); + + memcpy(path[0].p_hdr, path[1].p_hdr, s); + path[0].p_depth = 0; + path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + + (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); + path[0].p_hdr->eh_max = cpu_to_le16(max_root); + + brelse(path[1].p_bh); + ext4_free_blocks(handle, inode, NULL, blk, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); +} + /* * This function tries to merge the @ex extent to neighbours in the tree. * return 1 if merge left else 0. */ -static int ext4_ext_try_to_merge(struct inode *inode, +static void ext4_ext_try_to_merge(handle_t *handle, + struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth; int merge_done = 0; - int ret = 0; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1675,9 +1719,9 @@ static int ext4_ext_try_to_merge(struct inode *inode, merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); if (!merge_done) - ret = ext4_ext_try_to_merge_right(inode, path, ex); + (void) ext4_ext_try_to_merge_right(inode, path, ex); - return ret; + ext4_ext_try_to_merge_up(handle, inode, path); } /* @@ -1893,7 +1937,7 @@ has_space: merge: /* try to merge extents */ if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) - ext4_ext_try_to_merge(inode, path, nearex); + ext4_ext_try_to_merge(handle, inode, path, nearex); /* time to correct all indexes above */ @@ -1901,7 +1945,7 @@ merge: if (err) goto cleanup; - err = ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: if (npath) { @@ -2924,9 +2968,9 @@ static int ext4_split_extent_at(handle_t *handle, ext4_ext_mark_initialized(ex); if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) - ext4_ext_try_to_merge(inode, path, ex); + ext4_ext_try_to_merge(handle, inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } @@ -2958,8 +3002,8 @@ static int ext4_split_extent_at(handle_t *handle, goto fix_extent_len; /* update the extent length and mark as initialized */ ex->ee_len = cpu_to_le16(ee_len); - ext4_ext_try_to_merge(inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } else if (err) goto fix_extent_len; @@ -3191,8 +3235,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (err) goto out; ext4_ext_mark_initialized(ex); - ext4_ext_try_to_merge(inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } @@ -3333,10 +3377,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ - ext4_ext_try_to_merge(inode, path, ex); + ext4_ext_try_to_merge(handle, inode, path, ex); /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: ext4_ext_show_leaf(inode, path); return err; -- cgit v1.2.2 From 67a5da564f97f31c4054d358e00b34d7ee570da5 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Fri, 17 Aug 2012 09:54:17 -0400 Subject: ext4: make the zero-out chunk size tunable Currently in ext4 the length of zero-out chunk is set to 7 file system blocks. But if an inode has uninitailized extents from using fallocate to preallocate space, and the workload issues many random writes, this can cause a fragmented extent tree that will unnecessarily grow the extent tree. So create a new sysfs tunable, extent_max_zeroout_kb, which controls the maximum size where blocks will be zeroed out instead of creating a new uninitialized extent. The default of this has been sent to 32kb. CC: Zach Brown CC: Andreas Dilger Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e8755c21f4b9..2f082abf4992 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3085,7 +3085,6 @@ out: return err ? err : map->m_len; } -#define EXT4_EXT_ZERO_LEN 7 /* * This function is called by ext4_ext_map_blocks() if someone tries to write * to an uninitialized extent. It may result in splitting the uninitialized @@ -3111,13 +3110,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_map_blocks *map, struct ext4_ext_path *path) { + struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex; struct ext4_extent *ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth; - int allocated; + int allocated, max_zeroout = 0; int err = 0; int split_flag = 0; @@ -3125,6 +3125,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, "block %llu, max_blocks %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len); + sbi = EXT4_SB(inode->i_sb); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) @@ -3224,9 +3225,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ - if (ee_len <= 2*EXT4_EXT_ZERO_LEN && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + if (EXT4_EXT_MAY_ZEROOUT & split_flag) + max_zeroout = sbi->s_extent_max_zeroout_kb >> + inode->i_sb->s_blocksize_bits; + + /* If extent is less than s_max_zeroout_kb, zeroout directly */ + if (max_zeroout && (ee_len <= max_zeroout)) { err = ext4_ext_zeroout(inode, ex); if (err) goto out; @@ -3250,9 +3254,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; - if (allocated > map->m_len) { - if (allocated <= EXT4_EXT_ZERO_LEN && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + if (max_zeroout && (allocated > map->m_len)) { + if (allocated <= max_zeroout) { /* case 3 */ zero_ex.ee_block = cpu_to_le32(map->m_lblk); @@ -3264,9 +3267,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, goto out; split_map.m_lblk = map->m_lblk; split_map.m_len = allocated; - } else if ((map->m_lblk - ee_block + map->m_len < - EXT4_EXT_ZERO_LEN) && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { /* case 2 */ if (map->m_lblk != ee_block) { zero_ex.ee_block = ex->ee_block; @@ -3286,7 +3287,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } allocated = ext4_split_extent(handle, inode, path, - &split_map, split_flag, 0); + &split_map, split_flag, 0); if (allocated < 0) err = allocated; -- cgit v1.2.2 From e3d2e433e32a4b7a05818cbc9158037e26c74c8e Mon Sep 17 00:00:00 2001 From: Ashish Sangwan Date: Sat, 18 Aug 2012 22:29:46 -0400 Subject: ext4: no need to add inode to orphan list during hole punch While performing punch hole for an inode, i_disksize is not changed. So, there is no need to add the inode to orphan list. Signed-off-by: Ashish Sangwan Signed-off-by: Namjae Jeon Acked-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2f082abf4992..2e56903e8d53 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4860,9 +4860,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) if (IS_ERR(handle)) return PTR_ERR(handle); - err = ext4_orphan_add(handle, inode); - if (err) - goto out; /* * Now we need to zero out the non-page-aligned data in the @@ -4948,7 +4945,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) up_write(&EXT4_I(inode)->i_data_sem); out: - ext4_orphan_del(handle, inode); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); -- cgit v1.2.2 From 30cb27d661f5fdc582add0a9c297946a18ee4a4b Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Sat, 18 Aug 2012 22:38:07 -0400 Subject: ext4: fix trivial typo in comment Signed-off-by: Wang Sheng-Hui Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2e56903e8d53..cc6d2b984e8f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3301,7 +3301,7 @@ out: * to an uninitialized extent. * * Writing to an uninitialized extent may result in splitting the uninitialized - * extent into multiple /initialized uninitialized extents (up to three) + * extent into multiple initialized/uninitialized extents (up to three) * There are three possibilities: * a> There is no split required: Entire extent should be uninitialized * b> Splits in two extents: Write is happening at either end of the extent -- cgit v1.2.2 From 18888cf0883c286f238d44ee565530fe82752f06 Mon Sep 17 00:00:00 2001 From: Andrey Sidorov Date: Wed, 19 Sep 2012 14:14:53 -0400 Subject: ext4: speed up truncate/unlink by not using bforget() unless needed Do not iterate over data blocks scanning for bh's to forget as they're never exist. This improves time taken by unlink / truncate syscall. Tested by continuously truncating file that is being written by dd. Another test is rm -rf of linux tree while tar unpacks it. With ordered data mode condition unlikely(!tbh) was always met in ext4_free_blocks. With journal data mode tbh was found only few times, so optimisation is also possible. Unlinking fallocated 60G file after doing sync && echo 3 > /proc/sys/vm/drop_caches && time rm --help X86 before (linux 3.6-rc4): # time rm -f test1 real 0m2.710s user 0m0.000s sys 0m1.530s X86 after: # time rm -f test1 real 0m0.644s user 0m0.003s sys 0m0.060s MIPS before (linux 2.6.37): # time rm -f test1 real 0m 4.93s user 0m 0.00s sys 0m 4.61s MIPS after: # time rm -f test1 real 0m 0.16s user 0m 0.00s sys 0m 0.06s Signed-off-by: "Theodore Ts'o" Signed-off-by: Andrey Sidorov --- fs/ext4/extents.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cc6d2b984e8f..a510917c175a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2318,10 +2318,13 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t pblk; - int flags = EXT4_FREE_BLOCKS_FORGET; + int flags = 0; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; + flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; + else if (ext4_should_journal_data(inode)) + flags |= EXT4_FREE_BLOCKS_FORGET; + /* * For bigalloc file systems, we never free a partial cluster * at the beginning of the extent. Instead, we make a note -- cgit v1.2.2 From 63fedaf1c2dd546da22ff5b4ae06e9b4efba5146 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 26 Sep 2012 21:09:06 -0400 Subject: ext4: remove unused function ext4_ext_check_cache Remove unused function ext4_ext_check_cache() and merge the code back to the ext4_ext_in_cache(). Reviewed-by: Carlos Maiolino Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 48 +++++++++--------------------------------------- 1 file changed, 9 insertions(+), 39 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a510917c175a..cbcc6b3f2ae0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2136,13 +2136,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } /* - * ext4_ext_check_cache() + * ext4_ext_in_cache() * Checks to see if the given block is in the cache. * If it is, the cached extent is stored in the given - * cache extent pointer. If the cached extent is a hole, - * this routine should be used instead of - * ext4_ext_in_cache if the calling function needs to - * know the size of the hole. + * cache extent pointer. * * @inode: The files inode * @block: The block to look for in the cache @@ -2151,8 +2148,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, * * Return 0 if cache is invalid; 1 if the cache is valid */ -static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_cache *ex){ +static int +ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_extent *ex) +{ struct ext4_ext_cache *cex; struct ext4_sb_info *sbi; int ret = 0; @@ -2169,7 +2168,9 @@ static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, goto errout; if (in_range(block, cex->ec_block, cex->ec_len)) { - memcpy(ex, cex, sizeof(struct ext4_ext_cache)); + ex->ee_block = cpu_to_le32(cex->ec_block); + ext4_ext_store_pblock(ex, cex->ec_start); + ex->ee_len = cpu_to_le16(cex->ec_len); ext_debug("%u cached by %u:%u:%llu\n", block, cex->ec_block, cex->ec_len, cex->ec_start); @@ -2181,37 +2182,6 @@ errout: return ret; } -/* - * ext4_ext_in_cache() - * Checks to see if the given block is in the cache. - * If it is, the cached extent is stored in the given - * extent pointer. - * - * @inode: The files inode - * @block: The block to look for in the cache - * @ex: Pointer where the cached extent will be stored - * if it contains block - * - * Return 0 if cache is invalid; 1 if the cache is valid - */ -static int -ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_extent *ex) -{ - struct ext4_ext_cache cex; - int ret = 0; - - if (ext4_ext_check_cache(inode, block, &cex)) { - ex->ee_block = cpu_to_le32(cex.ec_block); - ext4_ext_store_pblock(ex, cex.ec_start); - ex->ee_len = cpu_to_le16(cex.ec_len); - ret = 1; - } - - return ret; -} - - /* * ext4_ext_rm_idx: * removes index from the index block. -- cgit v1.2.2 From ba39ebb61401cfe0ccd58dd0cd4da88465528c0a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 27 Sep 2012 09:37:53 -0400 Subject: ext4: convert to use leXX_add_cpu() Convert cpu_to_leXX(leXX_to_cpu(E1) + E2) to use leXX_add_cpu(). dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cbcc6b3f2ae0..8f08c7b77179 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1177,7 +1177,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); - neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1); + le16_add_cpu(&neh->eh_depth, 1); ext4_mark_inode_dirty(handle, inode); out: brelse(bh); -- cgit v1.2.2 From f45ee3a1ea438af96e4fd2c0b16d195e67ef235f Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 28 Sep 2012 23:21:09 -0400 Subject: ext4: ext4_inode_info diet Generic inode has unused i_private pointer which may be used as cur_aio_dio storage. TODO: If cur_aio_dio will be passed as an argument to get_block_t this allow to have concurent AIO_DIO requests. Reviewed-by: Zheng Liu Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8f08c7b77179..a1f56c3e773b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3618,7 +3618,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, { int ret = 0; int err = 0; - ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + ext4_io_end_t *io = ext4_inode_aio(inode); ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " "block %llu, max_blocks %u, flags %x, allocated %u\n", @@ -3876,7 +3876,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; - ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + ext4_io_end_t *io = ext4_inode_aio(inode); ext4_lblk_t cluster_offset; ext_debug("blocks %u/%u requested for inode %lu\n", -- cgit v1.2.2 From 82e54229118785badffb4ef5ba4803df25fe007f Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 28 Sep 2012 23:36:25 -0400 Subject: ext4: fix unwritten counter leakage ext4_set_io_unwritten_flag() will increment i_unwritten counter, so once we mark end_io with EXT4_END_IO_UNWRITTEN we have to revert it back on error path. - add missed error checks to prevent counter leakage - ext4_end_io_nolock() will clear EXT4_END_IO_UNWRITTEN flag to signal that conversion finished. - add BUG_ON to ext4_free_end_io() to prevent similar leakage in future. Visible effect of this bug is that unaligned aio_stress may deadlock Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a1f56c3e773b..54a94426ef7b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3633,6 +3633,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { ret = ext4_split_unwritten_extents(handle, inode, map, path, flags); + if (ret <= 0) + goto out; /* * Flag the inode(non aio case) or end_io struct (aio case) * that this IO needs to conversion to written when IO is @@ -3878,6 +3880,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_allocation_request ar; ext4_io_end_t *io = ext4_inode_aio(inode); ext4_lblk_t cluster_offset; + int set_unwritten = 0; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); @@ -4100,13 +4103,8 @@ got_allocated_blocks: * For non asycn direct IO case, flag the inode state * that we need to perform conversion when IO is done. */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io) - ext4_set_io_unwritten_flag(inode, io); - else - ext4_set_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN); - } + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) + set_unwritten = 1; if (ext4_should_dioread_nolock(inode)) map->m_flags |= EXT4_MAP_UNINIT; } @@ -4118,6 +4116,15 @@ got_allocated_blocks: if (!err) err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + + if (!err && set_unwritten) { + if (io) + ext4_set_io_unwritten_flag(inode, io); + else + ext4_set_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN); + } + if (err && free_on_err) { int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; -- cgit v1.2.2 From 28a535f9a0df060569dcc786e5bc2e1de43d7dc7 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 29 Sep 2012 00:14:55 -0400 Subject: ext4: completed_io locking cleanup Current unwritten extent conversion state-machine is very fuzzy. - For unknown reason it performs conversion under i_mutex. What for? My diagnosis: We already protect extent tree with i_data_sem, truncate and punch_hole should wait for DIO, so the only data we have to protect is end_io->flags modification, but only flush_completed_IO and end_io_work modified this flags and we can serialize them via i_completed_io_lock. Currently all these games with mutex_trylock result in the following deadlock truncate: kworker: ext4_setattr ext4_end_io_work mutex_lock(i_mutex) inode_dio_wait(inode) ->BLOCK DEADLOCK<- mutex_trylock() inode_dio_done() #TEST_CASE1_BEGIN MNT=/mnt_scrach unlink $MNT/file fallocate -l $((1024*1024*1024)) $MNT/file aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file sleep 2 truncate -s 0 $MNT/file #TEST_CASE1_END Or use 286's xfstests https://github.com/dmonakhov/xfstests/blob/devel/286 This patch makes state machine simple and clean: (1) xxx_end_io schedule final extent conversion simply by calling ext4_add_complete_io(), which append it to ei->i_completed_io_list NOTE1: because of (2A) work should be queued only if ->i_completed_io_list was empty, otherwise the work is scheduled already. (2) ext4_flush_completed_IO is responsible for handling all pending end_io from ei->i_completed_io_list Flushing sequence consists of following stages: A) LOCKED: Atomically drain completed_io_list to local_list B) Perform extents conversion C) LOCKED: move converted io's to to_free list for final deletion This logic depends on context which we was called from. D) Final end_io context destruction NOTE1: i_mutex is no longer required because end_io->flags modification is protected by ei->ext4_complete_io_lock Full list of changes: - Move all completion end_io related routines to page-io.c in order to improve logic locality - Move open coded logic from various xx_end_xx routines to ext4_add_complete_io() - remove EXT4_IO_END_FSYNC - Improve SMP scalability by removing useless i_mutex which does not protect io->flags anymore. - Reduce lock contention on i_completed_io_lock by optimizing list walk. - Rename ext4_end_io_nolock to end4_end_io and make it static - Check flush completion status to ext4_ext_punch_hole(). Because it is not good idea to punch blocks from corrupted inode. Changes since V3 (in request to Jan's comments): Fall back to active flush_completed_IO() approach in order to prevent performance issues with nolocked DIO reads. Changes since V2: Fix use-after-free caused by race truncate vs end_io_work Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 54a94426ef7b..232077439aa8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4833,7 +4833,9 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) } /* finish any pending end_io work */ - ext4_flush_completed_IO(inode); + err = ext4_flush_completed_IO(inode); + if (err) + return err; credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, credits); -- cgit v1.2.2 From 02d262dffcf4c74e5c4612ee736bdb94f18ed5b9 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 30 Sep 2012 23:03:42 -0400 Subject: ext4: punch_hole should wait for DIO writers punch_hole is the place where we have to wait for all existing writers (writeback, aio, dio), but currently we simply flush pended end_io request which is not sufficient. Other issue is that punch_hole performed w/o i_mutex held which obviously result in dangerous data corruption due to write-after-free. This patch performs following changes: - Guard punch_hole with i_mutex - Recheck inode flags under i_mutex - Block all new dio readers in order to prevent information leak caused by read-after-free pattern. - punch_hole now wait for all writers in flight NOTE: XXX write-after-free race is still possible because new dirty pages may appear due to mmap(), and currently there is no easy way to stop writeback while punch_hole is in progress. [ Fixed error return from ext4_ext_punch_hole() to make sure that we release i_mutex before returning EPERM or ETXTBUSY -- Ted ] Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 53 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 17 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 232077439aa8..5920e75fc05f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4794,9 +4794,32 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) loff_t first_page_offset, last_page_offset; int credits, err = 0; + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + err = filemap_write_and_wait_range(mapping, + offset, offset + length - 1); + + if (err) + return err; + } + + mutex_lock(&inode->i_mutex); + /* It's not possible punch hole on append only file */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + err = -EPERM; + goto out_mutex; + } + if (IS_SWAPFILE(inode)) { + err = -ETXTBSY; + goto out_mutex; + } + /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) - return 0; + goto out_mutex; /* * If the hole extends beyond i_size, set the hole @@ -4814,33 +4837,25 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) first_page_offset = first_page << PAGE_CACHE_SHIFT; last_page_offset = last_page << PAGE_CACHE_SHIFT; - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - offset, offset + length - 1); - - if (err) - return err; - } - /* Now release the pages */ if (last_page_offset > first_page_offset) { truncate_pagecache_range(inode, first_page_offset, last_page_offset - 1); } - /* finish any pending end_io work */ + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); err = ext4_flush_completed_IO(inode); if (err) - return err; + goto out_dio; credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out_dio; + } /* @@ -4930,6 +4945,10 @@ out: inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); return err; } int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, -- cgit v1.2.2 From 6f2080e64487b9963f9c6ff8a252e1abce98f2d4 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 30 Sep 2012 23:03:50 -0400 Subject: ext4: fix ext_remove_space for punch_hole case Inode is allowed to have empty leaf only if it this is blockless inode. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5920e75fc05f..c1fcf489e056 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2589,7 +2589,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct ext4_ext_path *path = NULL; ext4_fsblk_t partial_cluster = 0; handle_t *handle; - int i = 0, err; + int i = 0, err = 0; ext_debug("truncate since %u to %u\n", start, end); @@ -2621,12 +2621,16 @@ again: return PTR_ERR(path); } depth = ext_depth(inode); + /* Leaf not may not exist only if inode has no blocks at all */ ex = path[depth].p_ext; if (!ex) { - ext4_ext_drop_refs(path); - kfree(path); - path = NULL; - goto cont; + if (depth) { + EXT4_ERROR_INODE(inode, + "path[%d].p_hdr == NULL", + depth); + err = -EIO; + } + goto out; } ee_block = le32_to_cpu(ex->ee_block); @@ -2658,8 +2662,6 @@ again: goto out; } } -cont: - /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. -- cgit v1.2.2 From c278531d39f3158bfee93dc67da0b77e09776de2 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 5 Oct 2012 11:31:55 -0400 Subject: ext4: fix ext4_flush_completed_IO wait semantics BUG #1) All places where we call ext4_flush_completed_IO are broken because buffered io and DIO/AIO goes through three stages 1) submitted io, 2) completed io (in i_completed_io_list) conversion pended 3) finished io (conversion done) And by calling ext4_flush_completed_IO we will flush only requests which were in (2) stage, which is wrong because: 1) punch_hole and truncate _must_ wait for all outstanding unwritten io regardless to it's state. 2) fsync and nolock_dio_read should also wait because there is a time window between end_page_writeback() and ext4_add_complete_io() As result integrity fsync is broken in case of buffered write to fallocated region: fsync blkdev_completion ->filemap_write_and_wait_range ->ext4_end_bio ->end_page_writeback <-- filemap_write_and_wait_range return ->ext4_flush_completed_IO sees empty i_completed_io_list but pended conversion still exist ->ext4_add_complete_io BUG #2) Race window becomes wider due to the 'ext4: completed_io locking cleanup V4' patch series This patch make following changes: 1) ext4_flush_completed_io() now first try to flush completed io and when wait for any outstanding unwritten io via ext4_unwritten_wait() 2) Rename function to more appropriate name. 3) Assert that all callers of ext4_flush_unwritten_io should hold i_mutex to prevent endless wait Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c1fcf489e056..1c94cca35ed1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4268,7 +4268,7 @@ void ext4_ext_truncate(struct inode *inode) * finish any pending end_io work so we won't run the risk of * converting any truncated blocks to initialized later */ - ext4_flush_completed_IO(inode); + ext4_flush_unwritten_io(inode); /* * probably first extent we're gonna free will be last in block @@ -4847,10 +4847,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) /* Wait all existing dio workers, newcomers will block on i_mutex */ ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - err = ext4_flush_completed_IO(inode); + err = ext4_flush_unwritten_io(inode); if (err) goto out_dio; + inode_dio_wait(inode); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, credits); -- cgit v1.2.2