From 67710892ec983aa79ad1e2a2642fe8e3a4a194ea Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 6 Jun 2016 11:51:25 +0100
Subject: Btrfs: be more precise on errors when getting an inode from disk

When we attempt to read an inode from disk, we end up always returning an
-ESTALE error to the caller regardless of the actual failure reason, which
can be an out of memory problem (when allocating a path), some error found
when reading from the fs/subvolume btree (like a genuine IO error) or the
inode does not exists. So lets start returning the real error code to the
callers so that they don't treat all -ESTALE errors as meaning that the
inode does not exists (such as during orphan cleanup). This will also be
needed for a subsequent patch in the same series dealing with a special
fsync case.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/inode.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3e61bd1f3f65..f9686541997b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3428,10 +3428,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.offset = 0;
 		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
 		ret = PTR_ERR_OR_ZERO(inode);
-		if (ret && ret != -ESTALE)
+		if (ret && ret != -ENOENT)
 			goto out;
 
-		if (ret == -ESTALE && root == root->fs_info->tree_root) {
+		if (ret == -ENOENT && root == root->fs_info->tree_root) {
 			struct btrfs_root *dead_root;
 			struct btrfs_fs_info *fs_info = root->fs_info;
 			int is_dead_root = 0;
@@ -3467,7 +3467,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * Inode is already gone but the orphan item is still there,
 		 * kill the orphan item.
 		 */
-		if (ret == -ESTALE) {
+		if (ret == -ENOENT) {
 			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
 				ret = PTR_ERR(trans);
@@ -3626,7 +3626,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 /*
  * read an inode from the btree into the in-memory inode
  */
-static void btrfs_read_locked_inode(struct inode *inode)
+static int btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
@@ -3645,14 +3645,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
 		filled = true;
 
 	path = btrfs_alloc_path();
-	if (!path)
+	if (!path) {
+		ret = -ENOMEM;
 		goto make_bad;
+	}
 
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
-	if (ret)
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
 		goto make_bad;
+	}
 
 	leaf = path->nodes[0];
 
@@ -3805,11 +3810,12 @@ cache_acl:
 	}
 
 	btrfs_update_iflags(inode);
-	return;
+	return 0;
 
 make_bad:
 	btrfs_free_path(path);
 	make_bad_inode(inode);
+	return ret;
 }
 
 /*
@@ -5595,7 +5601,9 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 		return ERR_PTR(-ENOMEM);
 
 	if (inode->i_state & I_NEW) {
-		btrfs_read_locked_inode(inode);
+		int ret;
+
+		ret = btrfs_read_locked_inode(inode);
 		if (!is_bad_inode(inode)) {
 			inode_tree_add(inode);
 			unlock_new_inode(inode);
@@ -5604,7 +5612,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 		} else {
 			unlock_new_inode(inode);
 			iput(inode);
-			inode = ERR_PTR(-ESTALE);
+			ASSERT(ret < 0);
+			inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
 		}
 	}
 
-- 
cgit v1.2.2


From 44f714dae50a2e795d3268a6831762aa6fa54f55 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 6 Jun 2016 16:11:13 +0100
Subject: Btrfs: improve performance on fsync against new inode after
 rename/unlink

With commit 56f23fdbb600 ("Btrfs: fix file/data loss caused by fsync after
rename and new inode") we got simple fix for a functional issue when the
following sequence of actions is done:

  at transaction N
  create file A at directory D
  at transaction N + M (where M >= 1)
  move/rename existing file A from directory D to directory E
  create a new file named A at directory D
  fsync the new file
  power fail

The solution was to simply detect such scenario and fallback to a full
transaction commit when we detect it. However this turned out to had a
significant impact on throughput (and a bit on latency too) for benchmarks
using the dbench tool, which simulates real workloads from smbd (Samba)
servers. For example on a test vm (with a debug kernel):

Unpatched:
Throughput 19.1572 MB/sec  32 clients  32 procs  max_latency=1005.229 ms

Patched:
Throughput 23.7015 MB/sec  32 clients  32 procs  max_latency=809.206 ms

The patched results (this patch is applied) are similar to the results of
a kernel with the commit 56f23fdbb600 ("Btrfs: fix file/data loss caused
by fsync after rename and new inode") reverted.

This change avoids the fallback to a transaction commit and instead makes
sure all the names of the conflicting inode (the one that had a name in a
past transaction that matches the name of the new file in the same parent
directory) are logged so that at log replay time we don't lose neither the
new file nor the old file, and the old file gets the name it was renamed
to.

This also ends up avoiding a full transaction commit for a similar case
that involves an unlink instead of a rename of the old file:

  at transaction N
  create file A at directory D
  at transaction N + M (where M >= 1)
  remove file A
  create a new file named A at directory D
  fsync the new file
  power fail

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/inode.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f9686541997b..453b9d0da5f1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4203,6 +4203,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int err = 0;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_trans_handle *trans;
+	u64 last_unlink_trans;
 
 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
@@ -4225,11 +4226,27 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (err)
 		goto out;
 
+	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
+
 	/* now the directory is empty */
 	err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
 				 dentry->d_name.name, dentry->d_name.len);
-	if (!err)
+	if (!err) {
 		btrfs_i_size_write(inode, 0);
+		/*
+		 * Propagate the last_unlink_trans value of the deleted dir to
+		 * its parent directory. This is to prevent an unrecoverable
+		 * log tree in the case we do something like this:
+		 * 1) create dir foo
+		 * 2) create snapshot under dir foo
+		 * 3) delete the snapshot
+		 * 4) rmdir foo
+		 * 5) mkdir foo
+		 * 6) fsync foo or some file inside foo
+		 */
+		if (last_unlink_trans >= trans->transid)
+			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
+	}
 out:
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
-- 
cgit v1.2.2


From 18513091af9483ba84328d42092bd4d42a3c958f Mon Sep 17 00:00:00 2001
From: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Date: Mon, 25 Jul 2016 15:51:40 +0800
Subject: btrfs: update btrfs_space_info's bytes_may_use timely

This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
	#!/bin/bash
	dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
	dev=$(losetup --show -f fs.img)
	mkfs.btrfs -f -M $dev
	mkdir /tmp/mntpoint
	mount $dev /tmp/mntpoint
	cd /tmp/mntpoint
	xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile

Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
|   bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
    |-> btrfs_reserve_extent()
        |-> btrfs_add_reserved_bytes()
        |   alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
        |   change bytes_may_use, and bytes_reserved += 64M. Now
        |   bytes_may_use + bytes_reserved == 128M, which is greater
        |   than btrfs_space_info's total_bytes, false enospc occurs.
        |   Note, the bytes_may_use decrease operation will be done in
        |   end of btrfs_fallocate(), which is too late.

Here is another simple case for buffered write:
                    CPU 1              |              CPU 2
                                       |
|-> cow_file_range()                   |-> __btrfs_buffered_write()
    |-> btrfs_reserve_extent()         |   |
    |                                  |   |
    |                                  |   |
    |    .....                         |   |-> btrfs_check_data_free_space()
    |                                  |
    |                                  |
    |-> extent_clear_unlock_delalloc() |

In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
	100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
		data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
		data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allcate new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.

To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.

As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.

Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both sucessful and failed
path, btrfs_prealloc_file_range()'s callers does not need to call
btrfs_free_reserved_data_space() any more.

Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 29636624a427..bcea20015d9a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -566,6 +566,8 @@ cont:
 						     PAGE_SET_WRITEBACK |
 						     page_error_op |
 						     PAGE_END_WRITEBACK);
+			btrfs_free_reserved_data_space_noquota(inode, start,
+						end - start + 1);
 			goto free_pages_out;
 		}
 	}
@@ -742,7 +744,7 @@ retry:
 		lock_extent(io_tree, async_extent->start,
 			    async_extent->start + async_extent->ram_size - 1);
 
-		ret = btrfs_reserve_extent(root,
+		ret = btrfs_reserve_extent(root, async_extent->ram_size,
 					   async_extent->compressed_size,
 					   async_extent->compressed_size,
 					   0, alloc_hint, &ins, 1, 1);
@@ -969,7 +971,8 @@ static noinline int cow_file_range(struct inode *inode,
 				     EXTENT_DEFRAG, PAGE_UNLOCK |
 				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
 				     PAGE_END_WRITEBACK);
-
+			btrfs_free_reserved_data_space_noquota(inode, start,
+						end - start + 1);
 			*nr_written = *nr_written +
 			     (end - start + PAGE_SIZE) / PAGE_SIZE;
 			*page_started = 1;
@@ -989,7 +992,7 @@ static noinline int cow_file_range(struct inode *inode,
 		unsigned long op;
 
 		cur_alloc_size = disk_num_bytes;
-		ret = btrfs_reserve_extent(root, cur_alloc_size,
+		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
 					   root->sectorsize, 0, alloc_hint,
 					   &ins, 1, 1);
 		if (ret < 0)
@@ -1489,8 +1492,10 @@ out_check:
 		extent_clear_unlock_delalloc(inode, cur_offset,
 					     cur_offset + num_bytes - 1,
 					     locked_page, EXTENT_LOCKED |
-					     EXTENT_DELALLOC, PAGE_UNLOCK |
-					     PAGE_SET_PRIVATE2);
+					     EXTENT_DELALLOC |
+					     EXTENT_CLEAR_DATA_RESV,
+					     PAGE_UNLOCK | PAGE_SET_PRIVATE2);
+
 		if (!nolock && nocow)
 			btrfs_end_write_no_snapshoting(root);
 		cur_offset = extent_end;
@@ -1807,7 +1812,9 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 			return;
 
 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
-		    && do_list && !(state->state & EXTENT_NORESERVE))
+		    && do_list && !(state->state & EXTENT_NORESERVE)
+		    && (*bits & (EXTENT_DO_ACCOUNTING |
+		    EXTENT_CLEAR_DATA_RESV)))
 			btrfs_free_reserved_data_space_noquota(inode,
 					state->start, len);
 
@@ -7252,7 +7259,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 	int ret;
 
 	alloc_hint = get_extent_allocation_hint(inode, start, len);
-	ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
+	ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0,
 				   alloc_hint, &ins, 1, 1);
 	if (ret)
 		return ERR_PTR(ret);
@@ -7752,6 +7759,13 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 				ret = PTR_ERR(em2);
 				goto unlock_err;
 			}
+			/*
+			 * For inode marked NODATACOW or extent marked PREALLOC,
+			 * use the existing or preallocated extent, so does not
+			 * need to adjust btrfs_space_info's bytes_may_use.
+			 */
+			btrfs_free_reserved_data_space_noquota(inode,
+					start, len);
 			goto unlock;
 		}
 	}
@@ -7786,7 +7800,6 @@ unlock:
 			i_size_write(inode, start + len);
 
 		adjust_dio_outstanding_extents(inode, dio_data, len);
-		btrfs_free_reserved_data_space(inode, start, len);
 		WARN_ON(dio_data->reserve < len);
 		dio_data->reserve -= len;
 		dio_data->unsubmitted_oe_range_end = start + len;
@@ -10306,6 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 	u64 last_alloc = (u64)-1;
 	int ret = 0;
 	bool own_trans = true;
+	u64 end = start + num_bytes - 1;
 
 	if (trans)
 		own_trans = false;
@@ -10327,8 +10341,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 		 * sized chunks.
 		 */
 		cur_bytes = min(cur_bytes, last_alloc);
-		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
-					   *alloc_hint, &ins, 1, 0);
+		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
+				min_size, 0, *alloc_hint, &ins, 1, 0);
 		if (ret) {
 			if (own_trans)
 				btrfs_end_transaction(trans, root);
@@ -10414,6 +10428,9 @@ next:
 		if (own_trans)
 			btrfs_end_transaction(trans, root);
 	}
+	if (cur_offset < end)
+		btrfs_free_reserved_data_space(inode, cur_offset,
+			end - cur_offset + 1);
 	return ret;
 }
 
-- 
cgit v1.2.2