From 89a4e48f8479f8145eca9698f39fe188c982212f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 17 Aug 2012 08:54:52 -0400
Subject: ext4: fix kernel BUG on large-scale rm -rf commands
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 968dee7722: "ext4: fix hole punch failure when depth is greater
than 0" introduced a regression in v3.5.1/v3.6-rc1 which caused kernel
crashes when users ran run "rm -rf" on large directory hierarchy on
ext4 filesystems on RAID devices:

    BUG: unable to handle kernel NULL pointer dereference at 0000000000000028

    Process rm (pid: 18229, threadinfo ffff8801276bc000, task ffff880123631710)
    Call Trace:
     [<ffffffff81236483>] ? __ext4_handle_dirty_metadata+0x83/0x110
     [<ffffffff812353d3>] ext4_ext_truncate+0x193/0x1d0
     [<ffffffff8120a8cf>] ? ext4_mark_inode_dirty+0x7f/0x1f0
     [<ffffffff81207e05>] ext4_truncate+0xf5/0x100
     [<ffffffff8120cd51>] ext4_evict_inode+0x461/0x490
     [<ffffffff811a1312>] evict+0xa2/0x1a0
     [<ffffffff811a1513>] iput+0x103/0x1f0
     [<ffffffff81196d84>] do_unlinkat+0x154/0x1c0
     [<ffffffff8118cc3a>] ? sys_newfstatat+0x2a/0x40
     [<ffffffff81197b0b>] sys_unlinkat+0x1b/0x50
     [<ffffffff816135e9>] system_call_fastpath+0x16/0x1b
    Code: 8b 4d 20 0f b7 41 02 48 8d 04 40 48 8d 04 81 49 89 45 18 0f b7 49 02 48 83 c1 01 49 89 4d 00 e9 ae f8 ff ff 0f 1f 00 49 8b 45 28 <48> 8b 40 28 49 89 45 20 e9 85 f8 ff ff 0f 1f 80 00 00 00

    RIP  [<ffffffff81233164>] ext4_ext_remove_space+0xa34/0xdf0

This could be reproduced as follows:

The problem in commit 968dee7722 was that caused the variable 'i' to
be left uninitialized if the truncate required more space than was
available in the journal.  This resulted in the function
ext4_ext_truncate_extend_restart() returning -EAGAIN, which caused
ext4_ext_remove_space() to restart the truncate operation after
starting a new jbd2 handle.

Reported-by: Maciej Żenczykowski <maze@google.com>
Reported-by: Marti Raudsepp <marti@juffo.org>
Tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/extents.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/ext4/extents.c')
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index cd0c7ed06772..aabbb3f53683 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2662,6 +2662,7 @@ cont:
 		}
 		path[0].p_depth = depth;
 		path[0].p_hdr = ext_inode_hdr(inode);
+		i = 0;
 
 		if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
 			err = -EIO;
-- 
cgit v1.2.2


From ecb94f5fdf4b72547fca022421a9dca1672bddd4 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 17 Aug 2012 09:44:17 -0400
Subject: ext4: collapse a single extent tree block into the inode if possible

If an inode has more than 4 extents, but then later some of the
extents are merged together, we can optimize the file system by moving
the extents up into the inode, and discarding the extent tree block.
This is important, because if there are a large number of inodes with
an external extent tree blocks where the contents could fit in the
inode, this can significantly increase the fsck time of the file
system.

Google-Bug-Id: 6801242

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 72 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 58 insertions(+), 14 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index aabbb3f53683..e8755c21f4b9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1655,17 +1655,61 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
 	return merge_done;
 }
 
+/*
+ * This function does a very simple check to see if we can collapse
+ * an extent tree with a single extent tree leaf block into the inode.
+ */
+static void ext4_ext_try_to_merge_up(handle_t *handle,
+				     struct inode *inode,
+				     struct ext4_ext_path *path)
+{
+	size_t s;
+	unsigned max_root = ext4_ext_space_root(inode, 0);
+	ext4_fsblk_t blk;
+
+	if ((path[0].p_depth != 1) ||
+	    (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
+	    (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
+		return;
+
+	/*
+	 * We need to modify the block allocation bitmap and the block
+	 * group descriptor to release the extent tree block.  If we
+	 * can't get the journal credits, give up.
+	 */
+	if (ext4_journal_extend(handle, 2))
+		return;
+
+	/*
+	 * Copy the extent data up to the inode
+	 */
+	blk = ext4_idx_pblock(path[0].p_idx);
+	s = le16_to_cpu(path[1].p_hdr->eh_entries) *
+		sizeof(struct ext4_extent_idx);
+	s += sizeof(struct ext4_extent_header);
+
+	memcpy(path[0].p_hdr, path[1].p_hdr, s);
+	path[0].p_depth = 0;
+	path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
+		(path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
+	path[0].p_hdr->eh_max = cpu_to_le16(max_root);
+
+	brelse(path[1].p_bh);
+	ext4_free_blocks(handle, inode, NULL, blk, 1,
+			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+}
+
 /*
  * This function tries to merge the @ex extent to neighbours in the tree.
  * return 1 if merge left else 0.
  */
-static int ext4_ext_try_to_merge(struct inode *inode,
+static void ext4_ext_try_to_merge(handle_t *handle,
+				  struct inode *inode,
 				  struct ext4_ext_path *path,
 				  struct ext4_extent *ex) {
 	struct ext4_extent_header *eh;
 	unsigned int depth;
 	int merge_done = 0;
-	int ret = 0;
 
 	depth = ext_depth(inode);
 	BUG_ON(path[depth].p_hdr == NULL);
@@ -1675,9 +1719,9 @@ static int ext4_ext_try_to_merge(struct inode *inode,
 		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
 
 	if (!merge_done)
-		ret = ext4_ext_try_to_merge_right(inode, path, ex);
+		(void) ext4_ext_try_to_merge_right(inode, path, ex);
 
-	return ret;
+	ext4_ext_try_to_merge_up(handle, inode, path);
 }
 
 /*
@@ -1893,7 +1937,7 @@ has_space:
 merge:
 	/* try to merge extents */
 	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
-		ext4_ext_try_to_merge(inode, path, nearex);
+		ext4_ext_try_to_merge(handle, inode, path, nearex);
 
 
 	/* time to correct all indexes above */
@@ -1901,7 +1945,7 @@ merge:
 	if (err)
 		goto cleanup;
 
-	err = ext4_ext_dirty(handle, inode, path + depth);
+	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
 
 cleanup:
 	if (npath) {
@@ -2924,9 +2968,9 @@ static int ext4_split_extent_at(handle_t *handle,
 			ext4_ext_mark_initialized(ex);
 
 		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
-			ext4_ext_try_to_merge(inode, path, ex);
+			ext4_ext_try_to_merge(handle, inode, path, ex);
 
-		err = ext4_ext_dirty(handle, inode, path + depth);
+		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
 		goto out;
 	}
 
@@ -2958,8 +3002,8 @@ static int ext4_split_extent_at(handle_t *handle,
 			goto fix_extent_len;
 		/* update the extent length and mark as initialized */
 		ex->ee_len = cpu_to_le16(ee_len);
-		ext4_ext_try_to_merge(inode, path, ex);
-		err = ext4_ext_dirty(handle, inode, path + depth);
+		ext4_ext_try_to_merge(handle, inode, path, ex);
+		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
 		goto out;
 	} else if (err)
 		goto fix_extent_len;
@@ -3191,8 +3235,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		if (err)
 			goto out;
 		ext4_ext_mark_initialized(ex);
-		ext4_ext_try_to_merge(inode, path, ex);
-		err = ext4_ext_dirty(handle, inode, path + depth);
+		ext4_ext_try_to_merge(handle, inode, path, ex);
+		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
 		goto out;
 	}
 
@@ -3333,10 +3377,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 	/* note: ext4_ext_correct_indexes() isn't needed here because
 	 * borders are not changed
 	 */
-	ext4_ext_try_to_merge(inode, path, ex);
+	ext4_ext_try_to_merge(handle, inode, path, ex);
 
 	/* Mark modified extent as dirty */
-	err = ext4_ext_dirty(handle, inode, path + depth);
+	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
 out:
 	ext4_ext_show_leaf(inode, path);
 	return err;
-- 
cgit v1.2.2


From 67a5da564f97f31c4054d358e00b34d7ee570da5 Mon Sep 17 00:00:00 2001
From: Zheng Liu <wenqing.lz@taobao.com>
Date: Fri, 17 Aug 2012 09:54:17 -0400
Subject: ext4: make the zero-out chunk size tunable

Currently in ext4 the length of zero-out chunk is set to 7 file system
blocks.  But if an inode has uninitailized extents from using
fallocate to preallocate space, and the workload issues many random
writes, this can cause a fragmented extent tree that will
unnecessarily grow the extent tree.

So create a new sysfs tunable, extent_max_zeroout_kb, which controls
the maximum size where blocks will be zeroed out instead of creating a
new uninitialized extent.  The default of this has been sent to 32kb.

CC: Zach Brown <zab@zabbo.net>
CC: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e8755c21f4b9..2f082abf4992 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3085,7 +3085,6 @@ out:
 	return err ? err : map->m_len;
 }
 
-#define EXT4_EXT_ZERO_LEN 7
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
@@ -3111,13 +3110,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct ext4_map_blocks *map,
 					   struct ext4_ext_path *path)
 {
+	struct ext4_sb_info *sbi;
 	struct ext4_extent_header *eh;
 	struct ext4_map_blocks split_map;
 	struct ext4_extent zero_ex;
 	struct ext4_extent *ex;
 	ext4_lblk_t ee_block, eof_block;
 	unsigned int ee_len, depth;
-	int allocated;
+	int allocated, max_zeroout = 0;
 	int err = 0;
 	int split_flag = 0;
 
@@ -3125,6 +3125,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		"block %llu, max_blocks %u\n", inode->i_ino,
 		(unsigned long long)map->m_lblk, map->m_len);
 
+	sbi = EXT4_SB(inode->i_sb);
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
 	if (eof_block < map->m_lblk + map->m_len)
@@ -3224,9 +3225,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	 */
 	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
 
-	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
-	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+	if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+		max_zeroout = sbi->s_extent_max_zeroout_kb >>
+			inode->i_sb->s_blocksize_bits;
+
+	/* If extent is less than s_max_zeroout_kb, zeroout directly */
+	if (max_zeroout && (ee_len <= max_zeroout)) {
 		err = ext4_ext_zeroout(inode, ex);
 		if (err)
 			goto out;
@@ -3250,9 +3254,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	split_map.m_lblk = map->m_lblk;
 	split_map.m_len = map->m_len;
 
-	if (allocated > map->m_len) {
-		if (allocated <= EXT4_EXT_ZERO_LEN &&
-		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+	if (max_zeroout && (allocated > map->m_len)) {
+		if (allocated <= max_zeroout) {
 			/* case 3 */
 			zero_ex.ee_block =
 					 cpu_to_le32(map->m_lblk);
@@ -3264,9 +3267,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 				goto out;
 			split_map.m_lblk = map->m_lblk;
 			split_map.m_len = allocated;
-		} else if ((map->m_lblk - ee_block + map->m_len <
-			   EXT4_EXT_ZERO_LEN) &&
-			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+		} else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
 			/* case 2 */
 			if (map->m_lblk != ee_block) {
 				zero_ex.ee_block = ex->ee_block;
@@ -3286,7 +3287,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	}
 
 	allocated = ext4_split_extent(handle, inode, path,
-				       &split_map, split_flag, 0);
+				      &split_map, split_flag, 0);
 	if (allocated < 0)
 		err = allocated;
 
-- 
cgit v1.2.2


From e3d2e433e32a4b7a05818cbc9158037e26c74c8e Mon Sep 17 00:00:00 2001
From: Ashish Sangwan <ashishsangwan2@gmail.com>
Date: Sat, 18 Aug 2012 22:29:46 -0400
Subject: ext4: no need to add inode to orphan list during hole punch

While performing punch hole for an inode, i_disksize is not changed.
So, there is no need to add the inode to orphan list.

Signed-off-by: Ashish Sangwan <ashish.sangwan2@gmail.com>
Signed-off-by: Namjae Jeon <linkinjeon@gmail.com>
Acked-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2f082abf4992..2e56903e8d53 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4860,9 +4860,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
-	err = ext4_orphan_add(handle, inode);
-	if (err)
-		goto out;
 
 	/*
 	 * Now we need to zero out the non-page-aligned data in the
@@ -4948,7 +4945,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 	up_write(&EXT4_I(inode)->i_data_sem);
 
 out:
-	ext4_orphan_del(handle, inode);
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_journal_stop(handle);
-- 
cgit v1.2.2


From 30cb27d661f5fdc582add0a9c297946a18ee4a4b Mon Sep 17 00:00:00 2001
From: Wang Sheng-Hui <shhuiw@gmail.com>
Date: Sat, 18 Aug 2012 22:38:07 -0400
Subject: ext4: fix trivial typo in comment

Signed-off-by: Wang Sheng-Hui <shhuiw@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2e56903e8d53..cc6d2b984e8f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3301,7 +3301,7 @@ out:
  * to an uninitialized extent.
  *
  * Writing to an uninitialized extent may result in splitting the uninitialized
- * extent into multiple /initialized uninitialized extents (up to three)
+ * extent into multiple initialized/uninitialized extents (up to three)
  * There are three possibilities:
  *   a> There is no split required: Entire extent should be uninitialized
  *   b> Splits in two extents: Write is happening at either end of the extent
-- 
cgit v1.2.2


From 18888cf0883c286f238d44ee565530fe82752f06 Mon Sep 17 00:00:00 2001
From: Andrey Sidorov <qrxd43@motorola.com>
Date: Wed, 19 Sep 2012 14:14:53 -0400
Subject: ext4: speed up truncate/unlink by not using bforget() unless needed

Do not iterate over data blocks scanning for bh's to forget as they're
never exist. This improves time taken by unlink / truncate syscall.
Tested by continuously truncating file that is being written by dd.
Another test is rm -rf of linux tree while tar unpacks it. With
ordered data mode condition unlikely(!tbh) was always met in
ext4_free_blocks. With journal data mode tbh was found only few times,
so optimisation is also possible.

Unlinking fallocated 60G file after doing sync && echo 3 >
/proc/sys/vm/drop_caches && time rm --help

X86 before (linux 3.6-rc4):
# time rm -f test1
real    0m2.710s
user    0m0.000s
sys     0m1.530s

X86 after:
# time rm -f test1
real    0m0.644s
user    0m0.003s
sys     0m0.060s

MIPS before (linux 2.6.37):
# time rm -f test1
real    0m 4.93s
user    0m 0.00s
sys     0m 4.61s

MIPS after:
# time rm -f test1
real    0m 0.16s
user    0m 0.00s
sys     0m 0.06s

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrey Sidorov <qrxd43@motorola.com>
---
 fs/ext4/extents.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index cc6d2b984e8f..a510917c175a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2318,10 +2318,13 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
 	ext4_fsblk_t pblk;
-	int flags = EXT4_FREE_BLOCKS_FORGET;
+	int flags = 0;
 
 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA;
+		flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+	else if (ext4_should_journal_data(inode))
+		flags |= EXT4_FREE_BLOCKS_FORGET;
+
 	/*
 	 * For bigalloc file systems, we never free a partial cluster
 	 * at the beginning of the extent.  Instead, we make a note
-- 
cgit v1.2.2


From 63fedaf1c2dd546da22ff5b4ae06e9b4efba5146 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 26 Sep 2012 21:09:06 -0400
Subject: ext4: remove unused function ext4_ext_check_cache

Remove unused function ext4_ext_check_cache() and merge the code back to
the ext4_ext_in_cache().

Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 48 +++++++++---------------------------------------
 1 file changed, 9 insertions(+), 39 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a510917c175a..cbcc6b3f2ae0 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2136,13 +2136,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 }
 
 /*
- * ext4_ext_check_cache()
+ * ext4_ext_in_cache()
  * Checks to see if the given block is in the cache.
  * If it is, the cached extent is stored in the given
- * cache extent pointer.  If the cached extent is a hole,
- * this routine should be used instead of
- * ext4_ext_in_cache if the calling function needs to
- * know the size of the hole.
+ * cache extent pointer.
  *
  * @inode: The files inode
  * @block: The block to look for in the cache
@@ -2151,8 +2148,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
  *
  * Return 0 if cache is invalid; 1 if the cache is valid
  */
-static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
-	struct ext4_ext_cache *ex){
+static int
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
+		  struct ext4_extent *ex)
+{
 	struct ext4_ext_cache *cex;
 	struct ext4_sb_info *sbi;
 	int ret = 0;
@@ -2169,7 +2168,9 @@ static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
 		goto errout;
 
 	if (in_range(block, cex->ec_block, cex->ec_len)) {
-		memcpy(ex, cex, sizeof(struct ext4_ext_cache));
+		ex->ee_block = cpu_to_le32(cex->ec_block);
+		ext4_ext_store_pblock(ex, cex->ec_start);
+		ex->ee_len = cpu_to_le16(cex->ec_len);
 		ext_debug("%u cached by %u:%u:%llu\n",
 				block,
 				cex->ec_block, cex->ec_len, cex->ec_start);
@@ -2181,37 +2182,6 @@ errout:
 	return ret;
 }
 
-/*
- * ext4_ext_in_cache()
- * Checks to see if the given block is in the cache.
- * If it is, the cached extent is stored in the given
- * extent pointer.
- *
- * @inode: The files inode
- * @block: The block to look for in the cache
- * @ex:    Pointer where the cached extent will be stored
- *         if it contains block
- *
- * Return 0 if cache is invalid; 1 if the cache is valid
- */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
-			struct ext4_extent *ex)
-{
-	struct ext4_ext_cache cex;
-	int ret = 0;
-
-	if (ext4_ext_check_cache(inode, block, &cex)) {
-		ex->ee_block = cpu_to_le32(cex.ec_block);
-		ext4_ext_store_pblock(ex, cex.ec_start);
-		ex->ee_len = cpu_to_le16(cex.ec_len);
-		ret = 1;
-	}
-
-	return ret;
-}
-
-
 /*
  * ext4_ext_rm_idx:
  * removes index from the index block.
-- 
cgit v1.2.2


From ba39ebb61401cfe0ccd58dd0cd4da88465528c0a Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Thu, 27 Sep 2012 09:37:53 -0400
Subject: ext4: convert to use leXX_add_cpu()

Convert cpu_to_leXX(leXX_to_cpu(E1) + E2) to use leXX_add_cpu().

dpatch engine is used to auto generate this patch.
(https://github.com/weiyj/dpatch)

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index cbcc6b3f2ae0..8f08c7b77179 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1177,7 +1177,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
 		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
 
-	neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1);
+	le16_add_cpu(&neh->eh_depth, 1);
 	ext4_mark_inode_dirty(handle, inode);
 out:
 	brelse(bh);
-- 
cgit v1.2.2


From f45ee3a1ea438af96e4fd2c0b16d195e67ef235f Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Fri, 28 Sep 2012 23:21:09 -0400
Subject: ext4: ext4_inode_info diet

Generic inode has unused i_private pointer which may be used as cur_aio_dio
storage.

TODO: If cur_aio_dio will be passed as an argument to get_block_t this allow
      to have concurent AIO_DIO requests.

Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 8f08c7b77179..a1f56c3e773b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3618,7 +3618,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 {
 	int ret = 0;
 	int err = 0;
-	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+	ext4_io_end_t *io = ext4_inode_aio(inode);
 
 	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
 		  "block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -3876,7 +3876,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	unsigned int allocated = 0, offset = 0;
 	unsigned int allocated_clusters = 0;
 	struct ext4_allocation_request ar;
-	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+	ext4_io_end_t *io = ext4_inode_aio(inode);
 	ext4_lblk_t cluster_offset;
 
 	ext_debug("blocks %u/%u requested for inode %lu\n",
-- 
cgit v1.2.2


From 82e54229118785badffb4ef5ba4803df25fe007f Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Fri, 28 Sep 2012 23:36:25 -0400
Subject: ext4: fix unwritten counter leakage

ext4_set_io_unwritten_flag() will increment i_unwritten counter, so
once we mark end_io with EXT4_END_IO_UNWRITTEN we have to revert it back
on error path.

 - add missed error checks to prevent counter leakage
 - ext4_end_io_nolock() will clear EXT4_END_IO_UNWRITTEN flag to signal
   that conversion finished.
 - add BUG_ON to ext4_free_end_io() to prevent similar leakage in future.

Visible effect of this bug is that unaligned aio_stress may deadlock

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a1f56c3e773b..54a94426ef7b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3633,6 +3633,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
 		ret = ext4_split_unwritten_extents(handle, inode, map,
 						   path, flags);
+		if (ret <= 0)
+			goto out;
 		/*
 		 * Flag the inode(non aio case) or end_io struct (aio case)
 		 * that this IO needs to conversion to written when IO is
@@ -3878,6 +3880,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = ext4_inode_aio(inode);
 	ext4_lblk_t cluster_offset;
+	int set_unwritten = 0;
 
 	ext_debug("blocks %u/%u requested for inode %lu\n",
 		  map->m_lblk, map->m_len, inode->i_ino);
@@ -4100,13 +4103,8 @@ got_allocated_blocks:
 		 * For non asycn direct IO case, flag the inode state
 		 * that we need to perform conversion when IO is done.
 		 */
-		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-			if (io)
-				ext4_set_io_unwritten_flag(inode, io);
-			else
-				ext4_set_inode_state(inode,
-						     EXT4_STATE_DIO_UNWRITTEN);
-		}
+		if ((flags & EXT4_GET_BLOCKS_PRE_IO))
+			set_unwritten = 1;
 		if (ext4_should_dioread_nolock(inode))
 			map->m_flags |= EXT4_MAP_UNINIT;
 	}
@@ -4118,6 +4116,15 @@ got_allocated_blocks:
 	if (!err)
 		err = ext4_ext_insert_extent(handle, inode, path,
 					     &newex, flags);
+
+	if (!err && set_unwritten) {
+		if (io)
+			ext4_set_io_unwritten_flag(inode, io);
+		else
+			ext4_set_inode_state(inode,
+					     EXT4_STATE_DIO_UNWRITTEN);
+	}
+
 	if (err && free_on_err) {
 		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
 			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
-- 
cgit v1.2.2


From 28a535f9a0df060569dcc786e5bc2e1de43d7dc7 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sat, 29 Sep 2012 00:14:55 -0400
Subject: ext4: completed_io locking cleanup

Current unwritten extent conversion state-machine is very fuzzy.
- For unknown reason it performs conversion under i_mutex. What for?
  My diagnosis:
  We already protect extent tree with i_data_sem, truncate and punch_hole
  should wait for DIO, so the only data we have to protect is end_io->flags
  modification, but only flush_completed_IO and end_io_work modified this
  flags and we can serialize them via i_completed_io_lock.

  Currently all these games with mutex_trylock result in the following deadlock
   truncate:                          kworker:
    ext4_setattr                       ext4_end_io_work
    mutex_lock(i_mutex)
    inode_dio_wait(inode)  ->BLOCK
                             DEADLOCK<- mutex_trylock()
                                        inode_dio_done()
  #TEST_CASE1_BEGIN
  MNT=/mnt_scrach
  unlink $MNT/file
  fallocate -l $((1024*1024*1024)) $MNT/file
  aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
  sleep 2
  truncate -s 0 $MNT/file
  #TEST_CASE1_END

Or use 286's xfstests https://github.com/dmonakhov/xfstests/blob/devel/286

This patch makes state machine simple and clean:

(1) xxx_end_io schedule final extent conversion simply by calling
    ext4_add_complete_io(), which append it to ei->i_completed_io_list
    NOTE1: because of (2A) work should be queued only if
    ->i_completed_io_list was empty, otherwise the work is scheduled already.

(2) ext4_flush_completed_IO is responsible for handling all pending
    end_io from ei->i_completed_io_list
    Flushing sequence consists of following stages:
    A) LOCKED: Atomically drain completed_io_list to local_list
    B) Perform extents conversion
    C) LOCKED: move converted io's to to_free list for final deletion
       	     This logic depends on context which we was called from.
    D) Final end_io context destruction
    NOTE1: i_mutex is no longer required because end_io->flags modification
    is protected by ei->ext4_complete_io_lock

Full list of changes:
- Move all completion end_io related routines to page-io.c in order to improve
  logic locality
- Move open coded logic from various xx_end_xx routines to ext4_add_complete_io()
- remove EXT4_IO_END_FSYNC
- Improve SMP scalability by removing useless i_mutex which does not
  protect io->flags anymore.
- Reduce lock contention on i_completed_io_lock by optimizing list walk.
- Rename ext4_end_io_nolock to end4_end_io and make it static
- Check flush completion status to ext4_ext_punch_hole(). Because it is
  not good idea to punch blocks from corrupted inode.

Changes since V3 (in request to Jan's comments):
  Fall back to active flush_completed_IO() approach in order to prevent
  performance issues with nolocked DIO reads.
Changes since V2:
  Fix use-after-free caused by race truncate vs end_io_work

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 54a94426ef7b..232077439aa8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4833,7 +4833,9 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 	}
 
 	/* finish any pending end_io work */
-	ext4_flush_completed_IO(inode);
+	err = ext4_flush_completed_IO(inode);
+	if (err)
+		return err;
 
 	credits = ext4_writepage_trans_blocks(inode);
 	handle = ext4_journal_start(inode, credits);
-- 
cgit v1.2.2


From 02d262dffcf4c74e5c4612ee736bdb94f18ed5b9 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sun, 30 Sep 2012 23:03:42 -0400
Subject: ext4: punch_hole should wait for DIO writers

punch_hole is the place where we have to wait for all existing writers
(writeback, aio, dio), but currently we simply flush pended end_io request
which is not sufficient. Other issue is that punch_hole performed w/o i_mutex
held which obviously result in dangerous data corruption due to
write-after-free.

This patch performs following changes:
- Guard punch_hole with i_mutex
- Recheck inode flags under i_mutex
- Block all new dio readers in order to prevent information leak caused by
  read-after-free pattern.
- punch_hole now wait for all writers in flight
  NOTE: XXX write-after-free race is still possible because new dirty pages
  may appear due to mmap(), and currently there is no easy way to stop
  writeback while punch_hole is in progress.

[ Fixed error return from ext4_ext_punch_hole() to make sure that we
  release i_mutex before returning EPERM or ETXTBUSY -- Ted ]

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 53 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 17 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 232077439aa8..5920e75fc05f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4794,9 +4794,32 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 	loff_t first_page_offset, last_page_offset;
 	int credits, err = 0;
 
+	/*
+	 * Write out all dirty pages to avoid race conditions
+	 * Then release them.
+	 */
+	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		err = filemap_write_and_wait_range(mapping,
+			offset, offset + length - 1);
+
+		if (err)
+			return err;
+	}
+
+	mutex_lock(&inode->i_mutex);
+	/* It's not possible punch hole on append only file */
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+		err = -EPERM;
+		goto out_mutex;
+	}
+	if (IS_SWAPFILE(inode)) {
+		err = -ETXTBSY;
+		goto out_mutex;
+	}
+
 	/* No need to punch hole beyond i_size */
 	if (offset >= inode->i_size)
-		return 0;
+		goto out_mutex;
 
 	/*
 	 * If the hole extends beyond i_size, set the hole
@@ -4814,33 +4837,25 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 	first_page_offset = first_page << PAGE_CACHE_SHIFT;
 	last_page_offset = last_page << PAGE_CACHE_SHIFT;
 
-	/*
-	 * Write out all dirty pages to avoid race conditions
-	 * Then release them.
-	 */
-	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-		err = filemap_write_and_wait_range(mapping,
-			offset, offset + length - 1);
-
-		if (err)
-			return err;
-	}
-
 	/* Now release the pages */
 	if (last_page_offset > first_page_offset) {
 		truncate_pagecache_range(inode, first_page_offset,
 					 last_page_offset - 1);
 	}
 
-	/* finish any pending end_io work */
+	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
 	err = ext4_flush_completed_IO(inode);
 	if (err)
-		return err;
+		goto out_dio;
 
 	credits = ext4_writepage_trans_blocks(inode);
 	handle = ext4_journal_start(inode, credits);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out_dio;
+	}
 
 
 	/*
@@ -4930,6 +4945,10 @@ out:
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_journal_stop(handle);
+out_dio:
+	ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+	mutex_unlock(&inode->i_mutex);
 	return err;
 }
 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-- 
cgit v1.2.2


From 6f2080e64487b9963f9c6ff8a252e1abce98f2d4 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sun, 30 Sep 2012 23:03:50 -0400
Subject: ext4: fix ext_remove_space for punch_hole case

Inode is allowed to have empty leaf only if it this is blockless inode.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5920e75fc05f..c1fcf489e056 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2589,7 +2589,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 	struct ext4_ext_path *path = NULL;
 	ext4_fsblk_t partial_cluster = 0;
 	handle_t *handle;
-	int i = 0, err;
+	int i = 0, err = 0;
 
 	ext_debug("truncate since %u to %u\n", start, end);
 
@@ -2621,12 +2621,16 @@ again:
 			return PTR_ERR(path);
 		}
 		depth = ext_depth(inode);
+		/* Leaf not may not exist only if inode has no blocks at all */
 		ex = path[depth].p_ext;
 		if (!ex) {
-			ext4_ext_drop_refs(path);
-			kfree(path);
-			path = NULL;
-			goto cont;
+			if (depth) {
+				EXT4_ERROR_INODE(inode,
+						 "path[%d].p_hdr == NULL",
+						 depth);
+				err = -EIO;
+			}
+			goto out;
 		}
 
 		ee_block = le32_to_cpu(ex->ee_block);
@@ -2658,8 +2662,6 @@ again:
 				goto out;
 		}
 	}
-cont:
-
 	/*
 	 * We start scanning from right side, freeing all the blocks
 	 * after i_size and walking into the tree depth-wise.
-- 
cgit v1.2.2


From c278531d39f3158bfee93dc67da0b77e09776de2 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Fri, 5 Oct 2012 11:31:55 -0400
Subject: ext4: fix ext4_flush_completed_IO wait semantics

BUG #1) All places where we call ext4_flush_completed_IO are broken
    because buffered io and DIO/AIO goes through three stages
    1) submitted io,
    2) completed io (in i_completed_io_list) conversion pended
    3) finished  io (conversion done)
    And by calling ext4_flush_completed_IO we will flush only
    requests which were in (2) stage, which is wrong because:
     1) punch_hole and truncate _must_ wait for all outstanding unwritten io
      regardless to it's state.
     2) fsync and nolock_dio_read should also wait because there is
        a time window between end_page_writeback() and ext4_add_complete_io()
        As result integrity fsync is broken in case of buffered write
        to fallocated region:
        fsync                                      blkdev_completion
	 ->filemap_write_and_wait_range
                                                   ->ext4_end_bio
                                                     ->end_page_writeback
          <-- filemap_write_and_wait_range return
	 ->ext4_flush_completed_IO
   	 sees empty i_completed_io_list but pended
   	 conversion still exist
                                                     ->ext4_add_complete_io

BUG #2) Race window becomes wider due to the 'ext4: completed_io
locking cleanup V4' patch series

This patch make following changes:
1) ext4_flush_completed_io() now first try to flush completed io and when
   wait for any outstanding unwritten io via ext4_unwritten_wait()
2) Rename function to more appropriate name.
3) Assert that all callers of ext4_flush_unwritten_io should hold i_mutex to
   prevent endless wait

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/extents.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/ext4/extents.c')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c1fcf489e056..1c94cca35ed1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4268,7 +4268,7 @@ void ext4_ext_truncate(struct inode *inode)
 	 * finish any pending end_io work so we won't run the risk of
 	 * converting any truncated blocks to initialized later
 	 */
-	ext4_flush_completed_IO(inode);
+	ext4_flush_unwritten_io(inode);
 
 	/*
 	 * probably first extent we're gonna free will be last in block
@@ -4847,10 +4847,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 
 	/* Wait all existing dio workers, newcomers will block on i_mutex */
 	ext4_inode_block_unlocked_dio(inode);
-	inode_dio_wait(inode);
-	err = ext4_flush_completed_IO(inode);
+	err = ext4_flush_unwritten_io(inode);
 	if (err)
 		goto out_dio;
+	inode_dio_wait(inode);
 
 	credits = ext4_writepage_trans_blocks(inode);
 	handle = ext4_journal_start(inode, credits);
-- 
cgit v1.2.2