From 45ea6095c8f0d6caad5658306416a5d254f1205e Mon Sep 17 00:00:00 2001
From: "slyich@gmail.com"
Date: Mon, 7 Nov 2011 16:08:01 -0500
Subject: btrfs: fix double-free 'tree_root' in 'btrfs_mount()'

On the error path 'tree_root' is freed in 'free_fs_info()'. No need to
free it explicitly. Noticed by SLUB in debug mode.

Complete reproducer under usermode linux (discovered on a real machine):

bdev=/dev/ubda
btr_root=/btr
/mkfs.btrfs $bdev
mount $bdev $btr_root
mkdir $btr_root/subvols/
cd $btr_root/subvols/
/btrfs su cr foo
/btrfs su cr bar
mount $bdev -osubvol=subvols/foo $btr_root/subvols/bar
umount $btr_root/subvols/bar

which gives:

device fsid 4d55aa28-45b1-474b-b4ec-da912322195e devid 1 transid 7 /dev/ubda
=============================================================================
BUG kmalloc-2048: Object already free
-----------------------------------------------------------------------------
INFO: Allocated in btrfs_mount+0x389/0x7f0 age=0 cpu=0 pid=277
INFO: Freed in btrfs_mount+0x51c/0x7f0 age=0 cpu=0 pid=277
INFO: Slab 0x0000000062886200 objects=15 used=9 fp=0x0000000070b4d2d0 flags=0x4081
INFO: Object 0x0000000070b4d2d0 @offset=21200 fp=0x0000000070b4a968
...
Call Trace:
70b31948: [<6008c522>] print_trailer+0xe2/0x130
70b31978: [<6008c5aa>] object_err+0x3a/0x50
70b319a8: [<6008e242>] free_debug_processing+0x142/0x2a0
70b319e0: [<600ebf6f>] btrfs_mount+0x55f/0x7f0
70b319f8: [<6008e5c1>] __slab_free+0x221/0x2d0

Signed-off-by: Sergei Trofimovich
Cc: Arne Jansen
Cc: Chris Mason
Cc: David Sterba
Signed-off-by: Chris Mason
---
 fs/btrfs/super.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 57080dffdfc6..dcd5aef6b614 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -933,8 +933,12 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	 * then open_ctree will properly initialize everything later.
 	 */
 	fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
+	if (!fs_info) {
+		error = -ENOMEM;
+		goto error_close_devices;
+	}
 	tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-	if (!fs_info || !tree_root) {
+	if (!tree_root) {
 		error = -ENOMEM;
 		goto error_close_devices;
 	}
@@ -964,7 +968,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 
 		btrfs_close_devices(fs_devices);
 		free_fs_info(fs_info);
-		kfree(tree_root);
 	} else {
 		char b[BDEVNAME_SIZE];
 
@@ -992,7 +995,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 error_close_devices:
 	btrfs_close_devices(fs_devices);
 	free_fs_info(fs_info);
-	kfree(tree_root);
 	return ERR_PTR(error);
 }
--
cgit v1.2.2


From 917c16b2b69fc2eeb432eabca73258f08c58361e Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Tue, 8 Nov 2011 14:49:59 -0500
Subject: Btrfs: fix oops on NULL trans handle in btrfs_truncate

If we fail to reserve space in the transaction during truncate, we can
error out with a NULL trans handle. The cleanup code needs an extra
check to make sure we aren't trying to use the bad handle.
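In outline, the defensive pattern is this (a simplified sketch, not the
exact truncate code; the way the handle is obtained here is illustrative,
the real hunk follows below):

    trans = btrfs_start_transaction(root, 2);
    if (IS_ERR(trans)) {
        err = PTR_ERR(trans);
        trans = NULL;   /* remember that we never got a handle */
    }
    ...
    if (trans) {        /* only dereference the handle if it is valid */
        ret = btrfs_update_inode(trans, root, inode);
        ...
    }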
Signed-off-by: Chris Mason
---
 fs/btrfs/inode.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9d0eaa57d4ee..f60e2490bd0d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6529,14 +6529,16 @@ end_trans:
 		ret = btrfs_orphan_del(NULL, inode);
 	}
 
-	trans->block_rsv = &root->fs_info->trans_block_rsv;
-	ret = btrfs_update_inode(trans, root, inode);
-	if (ret && !err)
-		err = ret;
+	if (trans) {
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
+		ret = btrfs_update_inode(trans, root, inode);
+		if (ret && !err)
+			err = ret;
 
-	nr = trans->blocks_used;
-	ret = btrfs_end_transaction_throttle(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+		nr = trans->blocks_used;
+		ret = btrfs_end_transaction_throttle(trans, root);
+		btrfs_btree_balance_dirty(root, nr);
+	}
 out:
 	btrfs_free_block_rsv(root, rsv);
--
cgit v1.2.2


From 7fd2ae21a42d178982679b86086661292b4afe4a Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 8 Nov 2011 15:47:34 -0500
Subject: Btrfs: fix our reservations for updating an inode when completing io

People have been reporting ENOSPC crashes in finish_ordered_io. This is
because we try to steal from the delalloc block rsv to satisfy a
reservation to update the inode. The problem with this is we don't
explicitly save space for updating the inode when doing delalloc.

We've gotten away with this because way back when we just stole from the
delalloc reserve without any questions, and this worked out fine because,
generally speaking, the leaf had already been modified, either by the
mtime update when we did the original write or because we just updated
the leaf when we inserted the file extent item. Only on rare occasions
had the leaf not actually been modified, and that was still ok because
we'd just use a block or two out of the over-reservation that is
delalloc.

Then came the delayed inode stuff. This is amazing, except it wants a
full reservation for updating the inode, since it may do it at some
point down the road after we've written the blocks and we have to re-cow
everything again. This worked out because the delayed inode stuff just
stole from the global reserve, that is until recently, when I changed
that because it caused other problems.

So here we are, we're doing everything right and being screwed for it.
So take an extra reservation for the inode at delalloc reservation time
and carry it through the life of the delalloc reservation. If we need it
we can steal it in the delayed inode stuff. If we have already stolen
it, try to do a normal metadata reservation. If that fails, try to steal
from the delalloc reservation. If _that_ fails we'll get a WARN_ON() so
I can start thinking of a better way to solve this, and in the meantime
we'll steal from the global reserve.

With this patch I ran xfstests 13 in a loop for a couple of hours and
didn't see any problems.
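The fallback order described above, as rough pseudocode of
btrfs_delayed_inode_reserve_metadata() (a sketch; the real logic is in
the diff below):

    if (inode still holds its delalloc meta reservation)
        migrate that reservation to the delayed rsv;    /* common case */
    else if (btrfs_block_rsv_add_noflush(...) == 0)
        done;                       /* got a reservation of our own */
    else if (migrating from the delalloc rsv succeeds)
        done;                       /* steal from delalloc after all */
    else {
        WARN_ON(1);                 /* shouldn't happen; needs a better fix */
        migrate from the global rsv;
    }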
Signed-off-by: Josef Bacik
Signed-off-by: Chris Mason
---
 fs/btrfs/btrfs_inode.h   |  4 +--
 fs/btrfs/delayed-inode.c | 65 +++++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/extent-tree.c   | 22 +++++++++++++---
 fs/btrfs/inode.c         |  1 +
 4 files changed, 83 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5a5d325a3935..634608d2a6d0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -147,14 +147,12 @@ struct btrfs_inode {
 	 * the btrfs file release call will add this inode to the
 	 * ordered operations list so that we make sure to flush out any
 	 * new data the application may have written before commit.
-	 *
-	 * yes, its silly to have a single bitflag, but we might grow more
-	 * of these.
 	 */
 	unsigned ordered_data_close:1;
 	unsigned orphan_meta_reserved:1;
 	unsigned dummy_inode:1;
 	unsigned in_defrag:1;
+	unsigned delalloc_meta_reserved:1;
 
 	/*
 	 * always compress this one file
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index bbe8496d5339..313ee14cf3b7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -617,12 +617,14 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
 static int btrfs_delayed_inode_reserve_metadata(
 					struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
+					struct inode *inode,
 					struct btrfs_delayed_node *node)
 {
 	struct btrfs_block_rsv *src_rsv;
 	struct btrfs_block_rsv *dst_rsv;
 	u64 num_bytes;
 	int ret;
+	int release = false;
 
 	src_rsv = trans->block_rsv;
 	dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -652,11 +654,67 @@ static int btrfs_delayed_inode_reserve_metadata(
 		if (!ret)
 			node->bytes_reserved = num_bytes;
 		return ret;
+	} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
+		spin_lock(&BTRFS_I(inode)->lock);
+		if (BTRFS_I(inode)->delalloc_meta_reserved) {
+			BTRFS_I(inode)->delalloc_meta_reserved = 0;
+			spin_unlock(&BTRFS_I(inode)->lock);
+			release = true;
+			goto migrate;
+		}
+		spin_unlock(&BTRFS_I(inode)->lock);
+
+		/* Ok we didn't have space pre-reserved.  This shouldn't happen
+		 * too often but it can happen if we do delalloc to an existing
+		 * inode which gets dirtied because of the time update, and then
+		 * isn't touched again until after the transaction commits and
+		 * then we try to write out the data.  First try to be nice and
+		 * reserve something strictly for us.  If not be a pain and try
+		 * to steal from the delalloc block rsv.
+		 */
+		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+		if (!ret)
+			goto out;
+
+		ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+		if (!ret)
+			goto out;
+
+		/*
+		 * Ok this is a problem, let's just steal from the global rsv
+		 * since this really shouldn't happen that often.
+		 */
+		WARN_ON(1);
+		ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
+					      dst_rsv, num_bytes);
+		goto out;
 	}
 
+migrate:
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
-	if (!ret)
-		node->bytes_reserved = num_bytes;
+	if (unlikely(ret)) {
+		/* This shouldn't happen */
+		BUG_ON(release);
+		return ret;
+	}
+
+out:
+	/*
+	 * Migrate only takes a reservation, it doesn't touch the size of the
+	 * block_rsv.  This is to simplify people who don't normally have things
+	 * migrated from their block rsv.  If they go to release their
+	 * reservation, that will decrease the size as well, so if migrate
+	 * reduced size we'd end up with a negative size.  But for the
+	 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
+	 * but we could in fact do this reserve/migrate dance several times
+	 * between the time we did the original reservation and we'd clean it
+	 * up.  So to take care of this, release the space for the meta
+	 * reservation here.  I think it may be time for a documentation page on
+	 * how block rsvs. work.
+	 */
+	if (release)
+		btrfs_block_rsv_release(root, src_rsv, num_bytes);
 
+	node->bytes_reserved = num_bytes;
 	return ret;
 }
 
@@ -1708,7 +1766,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
 		goto release_node;
 	}
 
-	ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
+	ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
+						   delayed_node);
 	if (ret)
 		goto release_node;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 18ea90c8943b..0b044e509e9f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4063,23 +4063,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
  */
 static unsigned drop_outstanding_extent(struct inode *inode)
 {
+	unsigned drop_inode_space = 0;
 	unsigned dropped_extents = 0;
 
 	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
 	BTRFS_I(inode)->outstanding_extents--;
 
+	if (BTRFS_I(inode)->outstanding_extents == 0 &&
+	    BTRFS_I(inode)->delalloc_meta_reserved) {
+		drop_inode_space = 1;
+		BTRFS_I(inode)->delalloc_meta_reserved = 0;
+	}
+
 	/*
 	 * If we have more or the same amount of outsanding extents than we have
 	 * reserved then we need to leave the reserved extents count alone.
 	 */
 	if (BTRFS_I(inode)->outstanding_extents >=
 	    BTRFS_I(inode)->reserved_extents)
-		return 0;
+		return drop_inode_space;
 
 	dropped_extents = BTRFS_I(inode)->reserved_extents -
 		BTRFS_I(inode)->outstanding_extents;
 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
-	return dropped_extents;
+	return dropped_extents + drop_inode_space;
 }
 
 /**
@@ -4165,9 +4172,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 		nr_extents = BTRFS_I(inode)->outstanding_extents -
 			BTRFS_I(inode)->reserved_extents;
 		BTRFS_I(inode)->reserved_extents += nr_extents;
+	}
 
-	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+	/*
+	 * Add an item to reserve for updating the inode when we complete the
+	 * delalloc io.
+	 */
+	if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+		nr_extents++;
+		BTRFS_I(inode)->delalloc_meta_reserved = 1;
 	}
+
+	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
 	spin_unlock(&BTRFS_I(inode)->lock);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f60e2490bd0d..2b920596c126 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6607,6 +6607,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->orphan_meta_reserved = 0;
 	ei->dummy_inode = 0;
 	ei->in_defrag = 0;
+	ei->delalloc_meta_reserved = 0;
 	ei->force_compress = BTRFS_COMPRESS_NONE;
 
 	ei->delayed_node = NULL;
--
cgit v1.2.2


From a90e8b6fb80db43b029e1e76205452afa8bdc77a Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Tue, 8 Nov 2011 16:47:55 +0200
Subject: Btrfs: fix memory leak in btrfs_parse_early_options()

Don't leak the subvol_name string in case multiple subvol= options are
given. The "latest option is effective" behavior (consistent with the
subvolid= and subvolrootid= options) is preserved.
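The last-option-wins idiom this preserves looks like the following
(illustrative; the one-line fix is in the diff below):

    case Opt_subvol:
        kfree(*subvol_name);    /* drop the value from any earlier subvol= */
        *subvol_name = match_strdup(&args[0]);
        break;

kfree(NULL) is a no-op, so the first subvol= option needs no special
casing.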
Signed-off-by: Ilya Dryomov
---
 fs/btrfs/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index dcd5aef6b614..6befcaf253bd 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -448,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_subvol:
+			kfree(*subvol_name);
 			*subvol_name = match_strdup(&args[0]);
 			break;
 		case Opt_subvolid:
--
cgit v1.2.2


From f23c8af8ca2789eeb0ab9ea90c214f9694d96cc5 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Tue, 8 Nov 2011 19:15:05 +0200
Subject: Btrfs: fix subvol_name leak on error in btrfs_mount()

btrfs_parse_early_options() can fail due to an error while scanning
devices (the -o device= option), but it still strdup()s the subvol_name
string:

mount -o subvol=SUBV,device=BAD_DEVICE

So free the subvol_name string on error.

Signed-off-by: Ilya Dryomov
---
 fs/btrfs/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6befcaf253bd..58e9492230ce 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -905,8 +905,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	error = btrfs_parse_early_options(data, mode, fs_type, &subvol_name,
 					  &subvol_objectid, &subvol_rootid,
 					  &fs_devices);
-	if (error)
+	if (error) {
+		kfree(subvol_name);
 		return ERR_PTR(error);
+	}
 
 	if (subvol_name) {
 		root = mount_subvol(subvol_name, flags, device_name, data);
--
cgit v1.2.2


From 4d34b2789538befa45a68a191dc12e0886a69f7d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Wed, 9 Nov 2011 00:08:15 +0200
Subject: Btrfs: avoid null dereference and leaks when bailing from open_ctree()

Fix bugs introduced by 6c41761f. Firstly, after failing to allocate any
of the tree roots (first 'goto fail' in open_ctree()) we would
dereference a NULL fs_info pointer in free_fs_info(). Secondly, after
failures from init_srcu_struct(), setup_bdi() and new_inode() we would
leak all earlier allocated roots: fs_info fields haven't been
initialized yet, so free_fs_info() is rendered useless.

Fix this by initializing the fs_info pointer and fs_info fields before
any allocations happen.
Signed-off-by: Ilya Dryomov
---
 fs/btrfs/disk-io.c | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e53a5bb85670..91db90b526c2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1890,31 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	u64 features;
 	struct btrfs_key location;
 	struct buffer_head *bh;
-	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
-						 GFP_NOFS);
-	struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
-					       GFP_NOFS);
+	struct btrfs_super_block *disk_super;
 	struct btrfs_root *tree_root = btrfs_sb(sb);
-	struct btrfs_fs_info *fs_info = NULL;
-	struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
-						GFP_NOFS);
-	struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
-					      GFP_NOFS);
+	struct btrfs_fs_info *fs_info = tree_root->fs_info;
+	struct btrfs_root *extent_root;
+	struct btrfs_root *csum_root;
+	struct btrfs_root *chunk_root;
+	struct btrfs_root *dev_root;
 	struct btrfs_root *log_tree_root;
 	int ret;
 	int err = -EINVAL;
 	int num_backups_tried = 0;
 	int backup_index = 0;
 
-	struct btrfs_super_block *disk_super;
+	extent_root = fs_info->extent_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	csum_root = fs_info->csum_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	chunk_root = fs_info->chunk_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	dev_root = fs_info->dev_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
 
-	if (!extent_root || !tree_root || !tree_root->fs_info ||
-	    !chunk_root || !dev_root || !csum_root) {
+	if (!extent_root || !csum_root || !chunk_root || !dev_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
-	fs_info = tree_root->fs_info;
 
 	ret = init_srcu_struct(&fs_info->subvol_srcu);
 	if (ret) {
@@ -1954,12 +1955,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->reloc_mutex);
 
 	init_completion(&fs_info->kobj_unregister);
-	fs_info->tree_root = tree_root;
-	fs_info->extent_root = extent_root;
-	fs_info->csum_root = csum_root;
-	fs_info->chunk_root = chunk_root;
-	fs_info->dev_root = dev_root;
-	fs_info->fs_devices = fs_devices;
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
--
cgit v1.2.2


From 586e46e2813c589d26258a599580421fb6fb576b Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Wed, 9 Nov 2011 13:26:37 +0200
Subject: Btrfs: close devices on all error paths in open_ctree()

Fix a bug introduced by 7e662854 where we would leave devices busy on
certain error paths in open_ctree(). fs_info is guaranteed to be
non-NULL now, so it's safe to dereference it on all error paths.
Signed-off-by: Ilya Dryomov
---
 fs/btrfs/disk-io.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 91db90b526c2..b6a5c0dd0dd8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2460,21 +2460,20 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->caching_workers);
 fail_alloc:
 fail_iput:
+	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
-
-	btrfs_close_devices(fs_info->fs_devices);
-	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 fail_bdi:
 	bdi_destroy(&fs_info->bdi);
 fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
+	btrfs_close_devices(fs_info->fs_devices);
 	free_fs_info(fs_info);
 	return ERR_PTR(err);
 
 recovery_tree_root:
-
 	if (!btrfs_test_opt(tree_root, RECOVERY))
 		goto fail_tree_roots;
--
cgit v1.2.2


From 04d21a244fdf79d0ac892eaaa9a46b682467277c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Wed, 9 Nov 2011 14:41:22 +0200
Subject: Btrfs: rework error handling in btrfs_mount()

Commits 6c41761f and 45ea6095 introduced the possibility of a NULL
pointer dereference on error paths. We would also leave all devices busy
and leak fs_info with all of its sub-structures on error when trying to
mount an already mounted fs to a different directory.

Fix this by doing all allocations before trying to open any of the
devices, and adjust the error path for the mount-already-mounted-fs
case.

Signed-off-by: Ilya Dryomov
---
 fs/btrfs/super.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 58e9492230ce..629281c65ff5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -891,7 +891,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	struct super_block *s;
 	struct dentry *root;
 	struct btrfs_fs_devices *fs_devices = NULL;
-	struct btrfs_root *tree_root = NULL;
 	struct btrfs_fs_info *fs_info = NULL;
 	fmode_t mode = FMODE_READ;
 	char *subvol_name = NULL;
@@ -920,15 +919,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	if (error)
 		return ERR_PTR(error);
 
-	error = btrfs_open_devices(fs_devices, mode, fs_type);
-	if (error)
-		return ERR_PTR(error);
-
-	if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
-		error = -EACCES;
-		goto error_close_devices;
-	}
-
 	/*
 	 * Setup a dummy root and fs_info for test/set super. This is because
 	 * we don't actually fill this stuff out until open_ctree, but we need
	 * it for searching for existing supers, so this lets us do that and
 	 * then open_ctree will properly initialize everything later.
 	 */
 	fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
-	if (!fs_info) {
-		error = -ENOMEM;
-		goto error_close_devices;
-	}
-	tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-	if (!tree_root) {
+	if (!fs_info)
+		return ERR_PTR(-ENOMEM);
+
+	fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	if (!fs_info->tree_root) {
 		error = -ENOMEM;
-		goto error_close_devices;
+		goto error_fs_info;
 	}
-	fs_info->tree_root = tree_root;
+	fs_info->tree_root->fs_info = fs_info;
 	fs_info->fs_devices = fs_devices;
-	tree_root->fs_info = fs_info;
 
 	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
 	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
 	if (!fs_info->super_copy || !fs_info->super_for_commit) {
 		error = -ENOMEM;
+		goto error_fs_info;
+	}
+
+	error = btrfs_open_devices(fs_devices, mode, fs_type);
+	if (error)
+		goto error_fs_info;
+
+	if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
+		error = -EACCES;
 		goto error_close_devices;
 	}
 
 	bdev = fs_devices->latest_bdev;
-	s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
+	s = sget(fs_type, btrfs_test_super, btrfs_set_super,
+		 fs_info->tree_root);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto error_close_devices;
@@ -966,7 +964,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	if (s->s_root) {
 		if ((flags ^ s->s_flags) & MS_RDONLY) {
 			deactivate_locked_super(s);
-			return ERR_PTR(-EBUSY);
+			error = -EBUSY;
+			goto error_close_devices;
 		}
 
 		btrfs_close_devices(fs_devices);
@@ -997,6 +996,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 
 error_close_devices:
 	btrfs_close_devices(fs_devices);
+error_fs_info:
 	free_fs_info(fs_info);
 	return ERR_PTR(error);
 }
--
cgit v1.2.2


From 2115133f8b9a8dbdb217d14080814df07ce90479 Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Thu, 10 Nov 2011 20:39:08 -0500
Subject: Btrfs: tweak the delayed inode reservations again

Josef sent along an incremental to the inode reservation code to make
sure we try and fall back to directly updating the inode item if things
go horribly wrong.

This reworks that patch slightly, adding a fallback function that will
always try to update the inode item directly without going through the
delayed_inode code.

Signed-off-by: Chris Mason
---
 fs/btrfs/delayed-inode.c |  9 +++----
 fs/btrfs/inode.c         | 64 +++++++++++++++++++++++++++++++++---------------
 2 files changed, 47 insertions(+), 26 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 313ee14cf3b7..6a1a6800776c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -692,11 +692,6 @@ static int btrfs_delayed_inode_reserve_metadata(
 
 migrate:
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
-	if (unlikely(ret)) {
-		/* This shouldn't happen */
-		BUG_ON(release);
-		return ret;
-	}
 
 out:
 	/*
@@ -712,9 +707,11 @@ out:
 	 * reservation here.  I think it may be time for a documentation page on
 	 * how block rsvs. work.
 	 */
+	if (!ret)
+		node->bytes_reserved = num_bytes;
+
 	if (release)
 		btrfs_block_rsv_release(root, src_rsv, num_bytes);
-	node->bytes_reserved = num_bytes;
 
 	return ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2b920596c126..7d394335bcb5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode,
 				   struct page *locked_page,
 				   u64 start, u64 end, int *page_started,
 				   unsigned long *nr_written, int unlock);
+static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode);
 
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
 				     struct inode *inode, struct inode *dir,
@@ -1741,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));
 		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-		ret = btrfs_update_inode(trans, root, inode);
+		ret = btrfs_update_inode_fallback(trans, root, inode);
 		BUG_ON(ret);
 	}
 	goto out;
@@ -1791,7 +1793,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
-		ret = btrfs_update_inode(trans, root, inode);
+		ret = btrfs_update_inode_fallback(trans, root, inode);
 		BUG_ON(ret);
 	}
 	ret = 0;
@@ -2426,7 +2428,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 /*
  * copy everything in the in-memory inode into the btree.
  */
-noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, struct inode *inode)
 {
 	struct btrfs_inode_item *inode_item;
@@ -2434,21 +2436,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	int ret;
 
-	/*
-	 * If the inode is a free space inode, we can deadlock during commit
-	 * if we put it into the delayed code.
-	 *
-	 * The data relocation inode should also be directly updated
-	 * without delay
-	 */
-	if (!btrfs_is_free_space_inode(root, inode)
-	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
-		ret = btrfs_delayed_update_inode(trans, root, inode);
-		if (!ret)
-			btrfs_set_inode_last_trans(trans, inode);
-		return ret;
-	}
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -2476,6 +2463,43 @@ failed:
 	return ret;
 }
 
+/*
+ * copy everything in the in-memory inode into the btree.
+ */
+noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode)
+{
+	int ret;
+
+	/*
+	 * If the inode is a free space inode, we can deadlock during commit
+	 * if we put it into the delayed code.
+	 *
+	 * The data relocation inode should also be directly updated
+	 * without delay
+	 */
+	if (!btrfs_is_free_space_inode(root, inode)
+	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+		ret = btrfs_delayed_update_inode(trans, root, inode);
+		if (!ret)
+			btrfs_set_inode_last_trans(trans, inode);
+		return ret;
+	}
+
+	return btrfs_update_inode_item(trans, root, inode);
+}
+
+static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode)
+{
+	int ret;
+
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret == -ENOSPC)
+		return btrfs_update_inode_item(trans, root, inode);
+	return ret;
+}
+
 /*
  * unlink helper that gets used here in inode.c and in the tree logging
  * recovery code. It remove a link in a directory with a given name, and
@@ -5632,7 +5656,7 @@ again:
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
 		ret = btrfs_ordered_update_i_size(inode, 0, ordered);
 		if (!ret)
-			err = btrfs_update_inode(trans, root, inode);
+			err = btrfs_update_inode_fallback(trans, root, inode);
 		goto out;
 	}
 
@@ -5670,7 +5694,7 @@ again:
 	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
 	ret = btrfs_ordered_update_i_size(inode, 0, ordered);
 	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
-		btrfs_update_inode(trans, root, inode);
+		btrfs_update_inode_fallback(trans, root, inode);
 	ret = 0;
 out_unlock:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
--
cgit v1.2.2


From 924cd8fbe41851eda2b68bf2ed501b2777fd77b4 Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Thu, 10 Nov 2011 20:45:04 -0500
Subject: Btrfs: fix nocow when deleting the item

btrfs_previous_item() just searches the b+ tree and does not COW the
nodes or leaves; if we modify the result it returns, the metadata will
be broken. Fix it.

Signed-off-by: Miao Xie
Signed-off-by: Chris Mason
---
 fs/btrfs/volumes.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f8e2943101a1..c37433d3cd82 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -999,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	key.objectid = device->devid;
 	key.offset = start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
-
+again:
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0) {
 		ret = btrfs_previous_item(root, path, key.objectid,
@@ -1012,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 					struct btrfs_dev_extent);
 		BUG_ON(found_key.offset > start || found_key.offset +
 		       btrfs_dev_extent_length(leaf, extent) < start);
+		key = found_key;
+		btrfs_release_path(path);
+		goto again;
 	} else if (ret == 0) {
 		leaf = path->nodes[0];
 		extent = btrfs_item_ptr(leaf, path->slots[0],
--
cgit v1.2.2


From ba38eb4de354d228f2792f93cde2c748a3a3f3b2 Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Thu, 10 Nov 2011 20:45:04 -0500
Subject: Btrfs: fix no reserved space for writing out inode cache

The inode cache code forgets to reserve space before writing the cache
out. When we run a stress test such as synctest, it triggers the
WARN_ON() in use_block_rsv():

WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]()
...
Call Trace:
[] warn_slowpath_common+0x80/0x98
[] warn_slowpath_null+0x15/0x17
[] btrfs_alloc_free_block+0xbf/0x281 [btrfs]
[] ? __set_page_dirty_nobuffers+0xfe/0x108
[] __btrfs_cow_block+0x118/0x3b5 [btrfs]
[] btrfs_cow_block+0x103/0x14e [btrfs]
[] btrfs_search_slot+0x249/0x6a4 [btrfs]
[] btrfs_lookup_inode+0x2a/0x8a [btrfs]
[] btrfs_update_inode+0xaa/0x141 [btrfs]
[] btrfs_save_ino_cache+0xea/0x202 [btrfs]
[] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs]
[] commit_fs_roots+0xaa/0x158 [btrfs]
[] btrfs_commit_transaction+0x405/0x731 [btrfs]
[] ? wake_up_bit+0x25/0x25
[] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs]
[] btrfs_sync_file+0x16a/0x198 [btrfs]
[] ? mntput+0x21/0x23
[] vfs_fsync_range+0x18/0x21
[] vfs_fsync+0x17/0x19
[] do_fsync+0x29/0x3e
[] sys_fsync+0xb/0xf
[] system_call_fastpath+0x16/0x1b

Sometimes the BUG_ON() in the delayed inode reservation code is also
triggered. So we must reserve enough space for the inode cache.

Note: if we cannot reserve enough space for the inode cache, we give up
writing it out.

Signed-off-by: Miao Xie
Signed-off-by: Chris Mason
---
 fs/btrfs/inode-map.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 53dcbdf446cd..f8962a957d65 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
 	struct btrfs_path *path;
 	struct inode *inode;
+	struct btrfs_block_rsv *rsv;
+	u64 num_bytes;
 	u64 alloc_hint = 0;
 	int ret;
 	int prealloc;
@@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 	if (!path)
 		return -ENOMEM;
 
+	rsv = trans->block_rsv;
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+	num_bytes = trans->bytes_reserved;
+	/*
+	 * 1 item for inode item insertion if need
+	 * 3 items for inode item update (in the worst case)
+	 * 1 item for free space object
+	 * 3 items for pre-allocation
+	 */
+	trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
+	ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
+					  trans->bytes_reserved);
+	if (ret)
+		goto out;
 again:
 	inode = lookup_free_ino_inode(root, path);
 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
 		ret = PTR_ERR(inode);
-		goto out;
+		goto out_release;
 	}
 
 	if (IS_ERR(inode)) {
@@ -434,7 +451,7 @@ again:
 
 		ret = create_free_ino_inode(root, trans, path);
 		if (ret)
-			goto out;
+			goto out_release;
 		goto again;
 	}
 
@@ -477,11 +494,14 @@ again:
 	}
 	btrfs_free_reserved_data_space(inode, prealloc);
 
+	ret = btrfs_write_out_ino_cache(root, trans, path);
 out_put:
 	iput(inode);
+out_release:
+	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
 out:
-	if (ret == 0)
-		ret = btrfs_write_out_ino_cache(root, trans, path);
+	trans->block_rsv = rsv;
+	trans->bytes_reserved = num_bytes;
 
 	btrfs_free_path(path);
 	return ret;
--
cgit v1.2.2


From 3254c87618354e58fa2a7b375c6664f567480c33 Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Thu, 10 Nov 2011 20:45:05 -0500
Subject: Btrfs: fix unreleased path in btrfs_orphan_cleanup()

When we ran a stress test for space relocation, a deadlock happened.
Debugging showed it was caused by carelessness: we forgot to unlock the
read lock on the extent buffers in btrfs_orphan_cleanup() before ending
the transaction handle. The transaction commit task waited for the task
that called btrfs_orphan_cleanup() to unlock the extent buffer, while
that task waited for the commit task to finish the transaction commit,
and so the deadlock happened. Fix it.
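The required lock ordering, simplified (the actual call sites are in the
diff below):

    btrfs_release_path(path);       /* drop the extent buffer read locks */
    ...
    btrfs_end_transaction(trans, root);  /* commit can no longer block on us */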
Signed-off-by: Miao Xie
Signed-off-by: Chris Mason
---
 fs/btrfs/inode.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7d394335bcb5..e16215f480d0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2201,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		if (ret)
 			goto out;
 	}
+	/* release the path since we're done with it */
+	btrfs_release_path(path);
+
 	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
 
 	if (root->orphan_block_rsv)
--
cgit v1.2.2


From 61b520a9d0083b9b361638e456af45fd75150c87 Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Thu, 10 Nov 2011 20:45:05 -0500
Subject: Btrfs: Abstract similar code for btrfs_block_rsv_add{, _noflush}

btrfs_block_rsv_add{, _noflush}() have similar code, so abstract that
code out into a common helper.

Signed-off-by: Miao Xie
Signed-off-by: Chris Mason
---
 fs/btrfs/extent-tree.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0b044e509e9f..0f47b3e2010e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3796,16 +3796,16 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
 	kfree(rsv);
 }
 
-int btrfs_block_rsv_add(struct btrfs_root *root,
-			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes)
+static inline int __block_rsv_add(struct btrfs_root *root,
+				  struct btrfs_block_rsv *block_rsv,
+				  u64 num_bytes, int flush)
 {
 	int ret;
 
 	if (num_bytes == 0)
 		return 0;
 
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 		return 0;
@@ -3814,22 +3814,18 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
 	return ret;
 }
 
+int btrfs_block_rsv_add(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv,
+			u64 num_bytes)
+{
+	return __block_rsv_add(root, block_rsv, num_bytes, 1);
+}
+
 int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
 				struct btrfs_block_rsv *block_rsv,
 				u64 num_bytes)
 {
-	int ret;
-
-	if (num_bytes == 0)
-		return 0;
-
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
-	if (!ret) {
-		block_rsv_add_bytes(block_rsv, num_bytes, 1);
-		return 0;
-	}
-
-	return ret;
+	return __block_rsv_add(root, block_rsv, num_bytes, 0);
 }
 
 int btrfs_block_rsv_check(struct btrfs_root *root,
--
cgit v1.2.2


From 76b9e23d25d5c99f994bee3172de39492e452e93 Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Thu, 10 Nov 2011 20:45:05 -0500
Subject: Btrfs: fix orphan backref nodes

If the root node of a fs/file tree is in the block group that is being
relocated, but the other nodes are not in the other block groups, and we
create a snapshot of this tree between the point where the relocation
tree creation ends and the point where ->create_reloc_tree is set back
to 0, Btrfs will create some backref nodes that are the lowest nodes of
the backref cache. But we forgot to add them to the ->leaves list of the
backref cache and deal with them, so eventually they trigger a BUG_ON():

kernel BUG at fs/btrfs/relocation.c:239!

This patch fixes it by adding them to the ->leaves list of the backref
cache.
Signed-off-by: Miao Xie
Signed-off-by: Chris Mason
---
 fs/btrfs/relocation.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 24d654ce7a06..dff29d5e151a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
 			list_add_tail(&new_edge->list[UPPER],
 				      &new_node->lower);
 		}
+	} else {
+		list_add_tail(&new_node->lower, &cache->leaves);
 	}
 
 	rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
--
cgit v1.2.2


From 2f120c05e67ae34c93786b1050c6828904314429 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Thu, 10 Nov 2011 20:45:05 -0500
Subject: Btrfs: only map pages if we know we need them when reading the space cache

People have been running into a warning when loading the space cache
because the page is already mapped when trying to read in a bitmap. The
way we read in entries and pages is kind of convoluted, so fix it so
that io_ctl_read_entry maps the entries if it needs to, and if it hits
the end of the page it simply unmaps the page. That way we can
unconditionally unmap the io_ctl before reading in the bitmap and we
should stop hitting these warnings. Thanks,

Signed-off-by: Josef Bacik
Signed-off-by: Chris Mason
---
 fs/btrfs/free-space-cache.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 7a15fcfb3e1f..181760f9d2ab 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -537,6 +537,13 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
 			    struct btrfs_free_space *entry, u8 *type)
 {
 	struct btrfs_free_space_entry *e;
+	int ret;
+
+	if (!io_ctl->cur) {
+		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+		if (ret)
+			return ret;
+	}
 
 	e = io_ctl->cur;
 	entry->offset = le64_to_cpu(e->offset);
@@ -550,10 +557,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
 
 	io_ctl_unmap_page(io_ctl);
 
-	if (io_ctl->index >= io_ctl->num_pages)
-		return 0;
-
-	return io_ctl_check_crc(io_ctl, io_ctl->index);
+	return 0;
 }
 
 static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
@@ -561,9 +565,6 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
 {
 	int ret;
 
-	if (io_ctl->cur && io_ctl->cur != io_ctl->orig)
-		io_ctl_unmap_page(io_ctl);
-
 	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
 	if (ret)
 		return ret;
@@ -699,6 +700,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 		num_entries--;
 	}
 
+	io_ctl_unmap_page(&io_ctl);
+
 	/*
 	 * We add the bitmaps at the end of the entries in order that
 	 * the bitmap entries are added to the cache.
--
cgit v1.2.2


From 62f30c5462374b991e7e3f42d49ce2265c1b82f1 Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Thu, 10 Nov 2011 20:45:05 -0500
Subject: Btrfs: fix deadlock caused by the race between relocation

We cannot do a flushable reservation for relocation when we create a
snapshot, because it may make the transaction commit task and the flush
task wait for each other, and then a deadlock happens.
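For reference, the two reservation flavors (per the __block_rsv_add()
helper introduced earlier in this series) differ only in the flush flag:

    /* flush == 1: may kick off and wait for writeback,
     * so it can deadlock against a transaction commit */
    btrfs_block_rsv_add(root, rsv, num_bytes);

    /* flush == 0: only takes space that is already free */
    btrfs_block_rsv_add_noflush(root, rsv, num_bytes);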
Signed-off-by: Miao Xie
Signed-off-by: Chris Mason
---
 fs/btrfs/transaction.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 960835eaf4da..6a0574e923bc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -882,8 +882,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
 
 	if (to_reserve > 0) {
-		ret = btrfs_block_rsv_add(root, &pending->block_rsv,
-					  to_reserve);
+		ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
+						  to_reserve);
 		if (ret) {
 			pending->error = ret;
 			goto fail;
--
cgit v1.2.2


From 69f4cb526bd02ae5af35846f9a710c099eec3347 Mon Sep 17 00:00:00 2001
From: Arne Jansen
Date: Fri, 11 Nov 2011 08:17:10 -0500
Subject: Btrfs: handle bio_add_page failure gracefully in scrub

Currently scrub fails with ENOMEM when bio_add_page fails. Unfortunately
dm-based targets accept only one page per bio, thus making scrub always
fail. This patch just submits the current bio when an error is
encountered and starts a new one.

Signed-off-by: Arne Jansen
Signed-off-by: Chris Mason
---
 fs/btrfs/scrub.c | 64 +++++++++++++++++++++++++-------------------------------
 1 file changed, 29 insertions(+), 35 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ed11d3866afd..f4190f22edfb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -944,50 +944,18 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
 static int scrub_submit(struct scrub_dev *sdev)
 {
 	struct scrub_bio *sbio;
-	struct bio *bio;
-	int i;
 
 	if (sdev->curr == -1)
 		return 0;
 
 	sbio = sdev->bios[sdev->curr];
-
-	bio = bio_alloc(GFP_NOFS, sbio->count);
-	if (!bio)
-		goto nomem;
-
-	bio->bi_private = sbio;
-	bio->bi_end_io = scrub_bio_end_io;
-	bio->bi_bdev = sdev->dev->bdev;
-	bio->bi_sector = sbio->physical >> 9;
-
-	for (i = 0; i < sbio->count; ++i) {
-		struct page *page;
-		int ret;
-
-		page = alloc_page(GFP_NOFS);
-		if (!page)
-			goto nomem;
-
-		ret = bio_add_page(bio, page, PAGE_SIZE, 0);
-		if (!ret) {
-			__free_page(page);
-			goto nomem;
-		}
-	}
-
 	sbio->err = 0;
 	sdev->curr = -1;
 	atomic_inc(&sdev->in_flight);
 
-	submit_bio(READ, bio);
+	submit_bio(READ, sbio->bio);
 
 	return 0;
-
-nomem:
-	scrub_free_bio(bio);
-
-	return -ENOMEM;
 }
 
 static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
@@ -995,6 +963,8 @@ static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
 		      u8 *csum, int force)
 {
 	struct scrub_bio *sbio;
+	struct page *page;
+	int ret;
 
 again:
 	/*
@@ -1015,12 +985,22 @@ again:
 	}
 	sbio = sdev->bios[sdev->curr];
 	if (sbio->count == 0) {
+		struct bio *bio;
+
 		sbio->physical = physical;
 		sbio->logical = logical;
+		bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
+		if (!bio)
+			return -ENOMEM;
+
+		bio->bi_private = sbio;
+		bio->bi_end_io = scrub_bio_end_io;
+		bio->bi_bdev = sdev->dev->bdev;
+		bio->bi_sector = sbio->physical >> 9;
+		sbio->err = 0;
+		sbio->bio = bio;
 	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
 		   sbio->logical + sbio->count * PAGE_SIZE != logical) {
-		int ret;
-
 		ret = scrub_submit(sdev);
 		if (ret)
 			return ret;
@@ -1030,6 +1010,20 @@ again:
 	sbio->spag[sbio->count].generation = gen;
 	sbio->spag[sbio->count].have_csum = 0;
 	sbio->spag[sbio->count].mirror_num = mirror_num;
+
+	page = alloc_page(GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+
+	ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
+	if (!ret) {
+		__free_page(page);
+		ret = scrub_submit(sdev);
+		if (ret)
+			return ret;
+		goto again;
+	}
+
 	if (csum) {
 		sbio->spag[sbio->count].have_csum = 1;
 		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
--
cgit v1.2.2


From 8965593e41dd2d0e2a2f1e6f245336005ea94a2c Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 11 Nov 2011 10:14:57 -0500
Subject: btrfs: rename the option to nospace_cache

Rename the no_space_cache option to nospace_cache to be more consistent
with the rest, where the simple prefix 'no' is used to negate an option.

The option was introduced during the -rc1 cycle and has not been widely
used, so the rename is safe.

Signed-off-by: David Sterba
Signed-off-by: Chris Mason
---
 fs/btrfs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 629281c65ff5..8bd9d6d0e07a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -197,7 +197,7 @@ static match_table_t tokens = {
 	{Opt_subvolrootid, "subvolrootid=%d"},
 	{Opt_defrag, "autodefrag"},
 	{Opt_inode_cache, "inode_cache"},
-	{Opt_no_space_cache, "no_space_cache"},
+	{Opt_no_space_cache, "nospace_cache"},
 	{Opt_recovery, "recovery"},
 	{Opt_err, NULL},
 };
@@ -711,7 +711,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (btrfs_test_opt(root, SPACE_CACHE))
 		seq_puts(seq, ",space_cache");
 	else
-		seq_puts(seq, ",no_space_cache");
+		seq_puts(seq, ",nospace_cache");
 	if (btrfs_test_opt(root, CLEAR_CACHE))
 		seq_puts(seq, ",clear_cache");
 	if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
--
cgit v1.2.2


From f1ebcc74d5b2159f44c96b479b6eb8afc7829095 Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Mon, 14 Nov 2011 20:48:06 -0500
Subject: Btrfs: fix tree corruption after multi-thread snapshots and inode_cache flush

The btrfs snapshotting code requires that once a root has been
snapshotted, we don't change it during a commit. But there are two cases
that lead to tree corruption:

1) multi-thread snapshots can commit several snapshots in a transaction,
   and this may change the src root when processing the following
   pending snapshots, which corrupts the earlier snapshots;

2) the free inode cache was changing the roots when it wrote out the
   cache, which led to corruption.

This fixes things by making sure we force COW the block after we create
a snapshot during committing a transaction, then any changes to the
roots will result in COW, and we get all the fs roots and snapshot roots
to be consistent.

Signed-off-by: Liu Bo
Signed-off-by: Miao Xie
Signed-off-by: Chris Mason
---
 fs/btrfs/ctree.c       | 17 ++++++++++++++++-
 fs/btrfs/ctree.h       |  2 ++
 fs/btrfs/transaction.c |  8 ++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0fe615e4ea38..dede441bdeee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct extent_buffer *buf)
 {
+	/* ensure we can see the force_cow */
+	smp_rmb();
+
+	/*
+	 * We do not need to cow a block if
+	 * 1) this block is not created or changed in this transaction;
+	 * 2) this block does not belong to TREE_RELOC tree;
+	 * 3) the root is not forced COW.
+	 *
+	 * What is forced COW:
+	 *    when we create snapshot during commiting the transaction,
+	 *    after we've finished coping src root, we must COW the shared
+	 *    block to ensure the metadata consistency.
+	 */
 	if (btrfs_header_generation(buf) == trans->transid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
-	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+	    !root->force_cow)
 		return 0;
 	return 1;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9ba59ff9292..b1cb3c052484 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1271,6 +1271,8 @@ struct btrfs_root {
 	 * for stat. It may be used for more later
 	 */
 	dev_t anon_dev;
+
+	int force_cow;
 };
 
 struct btrfs_ioctl_defrag_range_args {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 6a0574e923bc..81376d94cd3c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -785,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
 
 		btrfs_save_ino_cache(root, trans);
 
+		/* see comments in should_cow_block() */
+		root->force_cow = 0;
+		smp_wmb();
+
 		if (root->commit_root != root->node) {
 			mutex_lock(&root->fs_commit_mutex);
 			switch_commit_root(root);
@@ -947,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_tree_unlock(old);
 	free_extent_buffer(old);
 
+	/* see comments in should_cow_block() */
+	root->force_cow = 1;
+	smp_wmb();
+
 	btrfs_set_root_node(new_root_item, tmp);
 	/* record when the snapshot was created in key.offset */
 	key.offset = trans->transid;
--
cgit v1.2.2


From 8d514bbf37eecf0a3e309284728637816a36764b Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Wed, 16 Nov 2011 16:06:09 -0500
Subject: btrfs: fix double mntput() in mount_subvol()

Signed-off-by: Al Viro
---
 fs/btrfs/super.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8bd9d6d0e07a..969a7747e889 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -861,7 +861,6 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
 
 	if (!is_subvolume_inode(path.dentry->d_inode)) {
 		path_put(&path);
-		mntput(mnt);
 		error = -EINVAL;
 		printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
 				subvol_name);
--
cgit v1.2.2


From c13344958780b4046305ee6235d686c846535529 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Wed, 16 Nov 2011 16:12:14 -0500
Subject: switch create_mnt_ns() to saner calling conventions, fix double mntput() in nfs

Life is much saner if create_mnt_ns(mnt) drops mnt in case of error...
Switch it to such calling conventions, switch callers, and fix the
double mntput() in fs/nfs/super.c.

Signed-off-by: Al Viro
---
 fs/btrfs/super.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 969a7747e889..cfbedd7755b0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -843,10 +843,8 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
 		return ERR_CAST(mnt);
 
 	ns_private = create_mnt_ns(mnt);
-	if (IS_ERR(ns_private)) {
-		mntput(mnt);
+	if (IS_ERR(ns_private))
 		return ERR_CAST(ns_private);
-	}
 
 	/*
 	 * This will trigger the automount of the subvol so we can just
--
cgit v1.2.2


From ea441d1104cf1efb471fa81bc91e9fd1e6ae29fd Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Wed, 16 Nov 2011 21:43:59 -0500
Subject: new helper: mount_subtree()

Takes a vfsmount and a relative path, does a lookup within that vfsmount
(possibly triggering automounts) and returns the result as the root of a
subtree suitable for return by ->mount() (i.e. a reference to the dentry
and an active reference to its superblock grabbed, superblock locked
exclusive). btrfs and nfs switched to it instead of open-coding the
sucker.

Signed-off-by: Al Viro
---
 fs/btrfs/super.c | 35 ++++++-----------------------------
 1 file changed, 6 insertions(+), 29 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index cfbedd7755b0..17ee7fc5e64e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -825,13 +825,9 @@ static char *setup_root_args(char *args)
 static struct dentry *mount_subvol(const char *subvol_name, int flags,
 				   const char *device_name, char *data)
 {
-	struct super_block *s;
 	struct dentry *root;
 	struct vfsmount *mnt;
-	struct mnt_namespace *ns_private;
 	char *newargs;
-	struct path path;
-	int error;
 
 	newargs = setup_root_args(data);
 	if (!newargs)
@@ -842,36 +838,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
 	if (IS_ERR(mnt))
 		return ERR_CAST(mnt);
 
-	ns_private = create_mnt_ns(mnt);
-	if (IS_ERR(ns_private))
-		return ERR_CAST(ns_private);
-
-	/*
-	 * This will trigger the automount of the subvol so we can just
-	 * drop the mnt we have here and return the dentry that we
-	 * found.
-	 */
-	error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
-				LOOKUP_FOLLOW, &path);
-	put_mnt_ns(ns_private);
-	if (error)
-		return ERR_PTR(error);
+	root = mount_subtree(mnt, subvol_name);
 
-	if (!is_subvolume_inode(path.dentry->d_inode)) {
-		path_put(&path);
-		error = -EINVAL;
+	if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
+		struct super_block *s = root->d_sb;
+		dput(root);
+		root = ERR_PTR(-EINVAL);
+		deactivate_locked_super(s);
 		printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
 				subvol_name);
-		return ERR_PTR(-EINVAL);
 	}
 
-	/* Get a ref to the sb and the dentry we found and return it */
-	s = path.mnt->mnt_sb;
-	atomic_inc(&s->s_active);
-	root = dget(path.dentry);
-	path_put(&path);
-
-	down_write(&s->s_umount);
-
 	return root;
 }
--
cgit v1.2.2


From 387125fc722a8ed432066b85a552917343bdafca Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Fri, 18 Nov 2011 15:07:51 -0500
Subject: Btrfs: fix barrier flushes

When btrfs is writing the super blocks, it sends barrier flushes to make
sure writeback caching drives get all the metadata on disk in the right
order.

But we have two bugs in the way these are sent down. When doing full
commits (not via the tree log), we are sending the barrier down before
the last super, when it should be going down before the first. In
multi-device setups, we should be waiting for the barriers to complete
on all devices before writing any of the supers.

Both of these bugs can cause corruptions on power failures. We fix it
with some new code to send down empty barriers to all devices before
writing the first super.

Alexandre Oliva found the multi-device bug. Arne Jansen did the async
barrier loop.
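The resulting flow of write_all_supers(), in outline (a sketch; the full
implementation is in the diff below):

    /* phase 1: send an empty WRITE_FLUSH bio to every writeable device */
    list_for_each_entry_rcu(dev, head, dev_list)
        write_dev_flush(dev, 0);

    /* phase 2: wait for every one of those flushes to complete */
    list_for_each_entry_rcu(dev, head, dev_list)
        write_dev_flush(dev, 1);

    /* only now write the supers; the first copy on each device
     * goes down with FUA */
    write_dev_supers(...);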
Signed-off-by: Chris Mason Reported-by: Alexandre Oliva --- fs/btrfs/disk-io.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++------- fs/btrfs/volumes.h | 6 +++ 2 files changed, 134 insertions(+), 17 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b6a5c0dd0dd8..48d30138237f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2573,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; - int last_barrier = 0; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; - /* make sure only the last submit_bh does a barrier */ - if (do_barriers) { - for (i = 0; i < max_mirrors; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - device->total_bytes) - break; - last_barrier = i; - } - } - for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) @@ -2634,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device, bh->b_end_io = btrfs_end_buffer_write_sync; } - if (i == last_barrier && do_barriers) - ret = submit_bh(WRITE_FLUSH_FUA, bh); - else - ret = submit_bh(WRITE_SYNC, bh); - + /* + * we fua the first super. The others we allow + * to go down lazy. + */ + ret = submit_bh(WRITE_FUA, bh); if (ret) errors++; } return errors < i ? 0 : -1; } +/* + * endio for the write_dev_flush, this will wake anyone waiting + * for the barrier when it is done + */ +static void btrfs_end_empty_barrier(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/* + * trigger flushes for one the devices. If you pass wait == 0, the flushes are + * sent down. With wait == 1, it waits for the previous flush. 
+ * + * any device where the flush fails with eopnotsupp are flagged as not-barrier + * capable + */ +static int write_dev_flush(struct btrfs_device *device, int wait) +{ + struct bio *bio; + int ret = 0; + + if (device->nobarriers) + return 0; + + if (wait) { + bio = device->flush_bio; + if (!bio) + return 0; + + wait_for_completion(&device->flush_wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + device->nobarriers = 1; + } + if (!bio_flagged(bio, BIO_UPTODATE)) { + ret = -EIO; + } + + /* drop the reference from the wait == 0 run */ + bio_put(bio); + device->flush_bio = NULL; + + return ret; + } + + /* + * one reference for us, and we leave it for the + * caller + */ + device->flush_bio = NULL;; + bio = bio_alloc(GFP_NOFS, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; + device->flush_bio = bio; + + bio_get(bio); + submit_bio(WRITE_FLUSH, bio); + + return 0; +} + +/* + * send an empty flush down to each device in parallel, + * then wait for them + */ +static int barrier_all_devices(struct btrfs_fs_info *info) +{ + struct list_head *head; + struct btrfs_device *dev; + int errors = 0; + int ret; + + /* send down all the barriers */ + head = &info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 0); + if (ret) + errors++; + } + + /* wait for all the barriers */ + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 1); + if (ret) + errors++; + } + if (errors) + return -EIO; + return 0; +} + int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; @@ -2666,6 +2773,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; + + if (do_barriers) + barrier_all_devices(root->fs_info); + list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { total_errors++; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ab5b1c49f352..78f2d4d4f37f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -100,6 +100,12 @@ struct btrfs_device { struct reada_zone *reada_curr_zone; struct radix_tree_root reada_zones; struct radix_tree_root reada_extents; + + /* for sending down flush barriers */ + struct bio *flush_bio; + struct completion flush_wait; + int nobarriers; + }; struct btrfs_fs_devices { -- cgit v1.2.2 From 745c4d8e160afaf6c75e887c27ea4b75c8142b26 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Sun, 20 Nov 2011 07:31:57 -0500 Subject: btrfs: Fix up 32/64-bit compatibility for new ioctls This patch casts to unsigned long before casting to a pointer and fixes the following warnings: fs/btrfs/extent_io.c:2289:20: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] fs/btrfs/ioctl.c:2933:37: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] fs/btrfs/ioctl.c:2937:21: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] fs/btrfs/ioctl.c:3020:21: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] fs/btrfs/scrub.c:275:4: warning: cast to 
pointer from integer of different size [-Wint-to-pointer-cast] fs/btrfs/backref.c:686:27: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] Signed-off-by: Jeff Mahoney Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/ioctl.c | 9 ++++++--- fs/btrfs/scrub.c | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8855aad3929c..22c64fff1bd5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, return PTR_ERR(fspath); if (fspath > fspath_min) { - ipath->fspath->val[i] = (u64)fspath; + ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1f87c4d0e7a0..47fdba7853c3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2286,7 +2286,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } if (!uptodate) { u64 failed_mirror; - failed_mirror = (u64)bio->bi_bdev; + failed_mirror = (unsigned long)bio->bi_bdev; if (tree->ops && tree->ops->readpage_io_failed_hook) ret = tree->ops->readpage_io_failed_hook( bio, page, start, end, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4a34c472f126..bda53fcd1a09 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2930,11 +2930,13 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) goto out; for (i = 0; i < ipath->fspath->elem_cnt; ++i) { - rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val; + rel_ptr = ipath->fspath->val[i] - + (u64)(unsigned long)ipath->fspath->val; ipath->fspath->val[i] = rel_ptr; } - ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size); + ret = copy_to_user((void *)(unsigned long)ipa->fspath, + (void *)(unsigned long)ipath->fspath, size); if (ret) { ret = -EFAULT; goto out; @@ -3017,7 +3019,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, if (ret < 0) goto out; - ret = copy_to_user((void *)loi->inodes, (void *)inodes, size); + ret = copy_to_user((void *)(unsigned long)loi->inodes, + (void *)(unsigned long)inodes, size); if (ret) ret = -EFAULT; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f4190f22edfb..fab420db5121 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -272,7 +272,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) swarn->logical, swarn->dev->name, (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, - (char *)ipath->fspath->val[i]); + (char *)(unsigned long)ipath->fspath->val[i]); free_ipath(ipath); return 0; -- cgit v1.2.2 From 32240a913d9f3a5aad42175d7696590ea1bfdb08 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: btrfs: mirror_num should be int, not u64 My previous patch introduced some u64 for failed_mirror variables, this one makes it consistent again. 
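The conversion is mechanical, but the underlying idiom is worth spelling out: the failed mirror number is smuggled through bio->bi_bdev, a pointer-sized field, so it has to round-trip through unsigned long, which is pointer-sized on every ABI the kernel supports. A minimal user-space sketch of that round trip (the helper names here are illustrative, not from the btrfs code):

#include <assert.h>
#include <stdio.h>

/* Stash a small integer in a pointer-sized slot. Casting through
 * unsigned long keeps both 32-bit and 64-bit builds free of the
 * -Wint-to-pointer-cast / -Wpointer-to-int-cast warnings. */
static void *stash_mirror(int mirror)
{
	return (void *)(unsigned long)mirror;
}

/* Recover it: pointer -> unsigned long -> int, never a direct
 * pointer-to-int cast. */
static int unstash_mirror(void *slot)
{
	return (int)(unsigned long)slot;
}

int main(void)
{
	void *slot = stash_mirror(3);

	assert(unstash_mirror(slot) == 3);
	printf("failed_mirror = %d\n", unstash_mirror(slot));
	return 0;
}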
Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/extent_io.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 48d30138237f..94abc25392f6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -620,7 +620,7 @@ out: static int btree_io_failed_hook(struct bio *failed_bio, struct page *page, u64 start, u64 end, - u64 mirror_num, struct extent_state *state) + int mirror_num, struct extent_state *state) { struct extent_io_tree *tree; unsigned long len; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 47fdba7853c3..a877b8b212eb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2285,8 +2285,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err) clean_io_failure(start, page); } if (!uptodate) { - u64 failed_mirror; - failed_mirror = (unsigned long)bio->bi_bdev; + int failed_mirror; + failed_mirror = (int)(unsigned long)bio->bi_bdev; if (tree->ops && tree->ops->readpage_io_failed_hook) ret = tree->ops->readpage_io_failed_hook( bio, page, start, end, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index feb9be0e23bc..7604c3001322 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -70,7 +70,7 @@ struct extent_io_ops { unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, u64 failed_mirror, + u64 start, u64 end, int failed_mirror, struct extent_state *state); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, -- cgit v1.2.2 From 0f0fbf1d0e188d129756e9508090af4bdbfde00b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: Btrfs: fix to search one more bitmap for cluster setup Suppose there are two bitmaps [0, 256], [256, 512] and one extent [100, 120] in the free space cache, and we want to set up a cluster with offset=100, bytes=50. In this case, there will be only one bitmap [256, 512] in the temporary bitmaps list, and then setup_cluster_bitmap() won't search bitmap [0, 256]. The cause is, the list is constructed in setup_cluster_no_bitmap(), and only bitmaps with bitmap_entry->offset >= offset will be added into the list, and the very bitmap that covers offset has bitmap_entry->offset <= offset. Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 181760f9d2ab..8f792f41feab 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2453,10 +2453,22 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry; struct rb_node *node; int ret = -ENOSPC; + u64 bitmap_offset = offset_to_bitmap(ctl, offset); if (ctl->total_bitmaps == 0) return -ENOSPC; + /* + * The bitmap that covers offset won't be in the list unless offset + * is just its start offset. + */ + entry = list_first_entry(bitmaps, struct btrfs_free_space, list); + if (entry->offset != bitmap_offset) { + entry = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (entry && list_empty(&entry->list)) + list_add(&entry->list, bitmaps); + } + /* * First check our cached list of bitmaps and see if there is an entry * here that will work. 
-- cgit v1.2.2 From 52621cb6ed0e0e14358bb317bda7cd5fbd5c2a27 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: Btrfs: avoid unnecessary bitmap search for cluster setup setup_cluster_no_bitmap() searches all the extents and bitmaps starting from offset. Therefore, if it returns -ENOSPC, all the bitmaps starting from offset are in the bitmaps list, so it's sufficient to search from this list in setup_cluster_bitmap(). Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 42 ++++-------------------------------------- 1 file changed, 4 insertions(+), 38 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8f792f41feab..8c32434da2cb 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2451,7 +2451,6 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; - struct rb_node *node; int ret = -ENOSPC; u64 bitmap_offset = offset_to_bitmap(ctl, offset); @@ -2469,10 +2468,6 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, list_add(&entry->list, bitmaps); } - /* - * First check our cached list of bitmaps and see if there is an entry - * here that will work. - */ list_for_each_entry(entry, bitmaps, list) { if (entry->bytes < min_bytes) continue; @@ -2483,38 +2478,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, } /* - * If we do have entries on our list and we are here then we didn't find - * anything, so go ahead and get the next entry after the last entry in - * this list and start the search from there. + * The bitmaps list has all the bitmaps that record free space + * starting after offset, so no more search is required. */ - if (!list_empty(bitmaps)) { - entry = list_entry(bitmaps->prev, struct btrfs_free_space, - list); - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; - entry = rb_entry(node, struct btrfs_free_space, offset_index); - goto search; - } - - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); - if (!entry) - return -ENOSPC; - -search: - node = &entry->offset_index; - do { - entry = rb_entry(node, struct btrfs_free_space, offset_index); - node = rb_next(&entry->offset_index); - if (!entry->bitmap) - continue; - if (entry->bytes < min_bytes) - continue; - ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, min_bytes); - } while (ret && node); - - return ret; + return -ENOSPC; } /* @@ -2532,8 +2499,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct list_head bitmaps; struct btrfs_free_space *entry, *tmp; + LIST_HEAD(bitmaps); u64 min_bytes; int ret; @@ -2572,7 +2539,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } - INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes, min_bytes); if (ret) -- cgit v1.2.2 From fadc0d8be4dfca80f6c568bc5874931893c6709b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: btrfs: fix stat blocks accounting Round inode bytes and delalloc bytes up to real blocksize before converting to sector size. Otherwise, e.g. files smaller than 512 bytes are reported with zero blocks due to incorrect rounding. 
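A worked example makes the failure mode concrete: with a 4096-byte block size, a 100-byte file has inode_get_bytes() == 100, and 100 >> 9 == 0, so stat reported zero 512-byte sectors; rounding up to a full block first yields the expected 8. A small user-space sketch of the corrected arithmetic (the ALIGN macro below is assumed to behave like the kernel's power-of-two round-up):

#include <stdio.h>

/* Power-of-two round-up, same shape as the kernel's ALIGN(). */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long long)(a) - 1))

int main(void)
{
	unsigned long long blocksize = 4096;	/* fs block size */
	unsigned long long bytes = 100;		/* inode_get_bytes() */
	unsigned long long delalloc = 0;	/* delalloc_bytes */

	/* before the fix: 100 >> 9 == 0 blocks reported */
	printf("old st_blocks: %llu\n", (bytes + delalloc) >> 9);

	/* after the fix: round each component up to a full block,
	 * then convert to 512-byte sectors: 4096 >> 9 == 8 */
	printf("new st_blocks: %llu\n",
	       (ALIGN(bytes, blocksize) + ALIGN(delalloc, blocksize)) >> 9);
	return 0;
}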
Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e16215f480d0..8ad26b135a1c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6794,11 +6794,13 @@ static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; + u32 blocksize = inode->i_sb->s_blocksize; + generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; stat->blksize = PAGE_CACHE_SIZE; - stat->blocks = (inode_get_bytes(inode) + - BTRFS_I(inode)->delalloc_bytes) >> 9; + stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + + ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; return 0; } -- cgit v1.2.2 From 5bb1468238e20b15921909e9f9601e945f03bac7 Mon Sep 17 00:00:00 2001 From: Arnd Hannemann Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: Btrfs: prefix resize related printks with btrfs: For the user it is confusing to find something like: [10197.627710] new size for /dev/mapper/vg0-usr_share is 3221225472 in kernel log, because it doesn't point directly to btrfs. This patch prefixes those messages with "btrfs:" like other btrfs related printks. Signed-off-by: Arnd Hannemann Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bda53fcd1a09..a90e749ed6d2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1216,12 +1216,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", + printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", + printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_unlock; @@ -1267,7 +1267,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk(KERN_INFO "new size for %s is %llu\n", + printk(KERN_INFO "btrfs: new size for %s is %llu\n", device->name, (unsigned long long)new_size); if (new_size > old_size) { -- cgit v1.2.2 From 291c7d2f577428f896daa5002e784959328a80aa Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 14 Nov 2011 13:52:14 -0500 Subject: Btrfs: wait on caching if we're loading the free space cache We've been hitting panics when running xfstest 13 in a loop for long periods of time. And actually this problem has always existed so we've been hitting these things randomly for a while. Basically what happens is we get a thread coming into the allocator and reading the space cache off of disk and adding the entries to the free space cache as we go. Then we get another thread that comes in and tries to allocate from that block group. Since block_group->cached != BTRFS_CACHE_NO it goes ahead and tries to do the allocation. We do this because if we're doing the old slow way of caching we don't want to hold people up and wait for everything to finish. 
The problem with this is we could end up discarding the space cache at some arbitrary point in the future, which means we could very well end up allocating space that is either bad, or when the real caching happens it could end up thinking the space isn't in use when it really is and cause all sorts of other problems. The solution is to add a new flag to indicate we are loading the free space cache from disk, and always try to cache the block group if cache->cached != BTRFS_CACHE_FINISHED. That way if we are loading the space cache anybody else who tries to allocate from the block group will have to wait until it's finished to make sure it completes successfully. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 +- fs/btrfs/extent-tree.c | 119 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 81 insertions(+), 41 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b1cb3c052484..04a5dfcee5a1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -848,7 +848,8 @@ struct btrfs_free_cluster { enum btrfs_caching_type { BTRFS_CACHE_NO = 0, BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FINISHED = 2, + BTRFS_CACHE_FAST = 2, + BTRFS_CACHE_FINISHED = 3, }; enum btrfs_disk_cache_state { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0f47b3e2010e..5d86877f10e1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_root *root, int load_cache_only) { + DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl; int ret = 0; - smp_mb(); - if (cache->cached != BTRFS_CACHE_NO) + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + atomic_set(&caching_ctl->count, 1); + caching_ctl->work.func = caching_thread; + + spin_lock(&cache->lock); + /* + * This should be a rare occasion, but this could happen I think in the + * case where one thread starts to load the space cache info, and then + * some other thread starts a transaction commit which tries to do an + * allocation while the other thread is still loading the space cache + * info. The previous loop should have kept us from choosing this block + * group, but if we've moved to the state where we will wait on caching + * block groups we need to first check if we're doing a fast load here, + * so we can wait for it to finish, otherwise we could end up allocating + * from a block group who's cache gets evicted for one reason or + * another. 
+ */ + while (cache->cached == BTRFS_CACHE_FAST) { + struct btrfs_caching_control *ctl; + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&cache->lock); + + schedule(); + + finish_wait(&ctl->wait, &wait); + put_caching_control(ctl); + spin_lock(&cache->lock); + } + + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); return 0; + } + WARN_ON(cache->caching_ctl); + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_FAST; + spin_unlock(&cache->lock); /* * We can't do the read from on-disk cache during a commit since we need @@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, if (trans && (!trans->transaction->in_commit) && (root && root != root->fs_info->tree_root) && btrfs_test_opt(root, SPACE_CACHE)) { - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - return 0; - } - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); - ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); if (ret == 1) { + cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_FINISHED; cache->last_byte_to_unpin = (u64)-1; } else { - cache->cached = BTRFS_CACHE_NO; + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } } spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); if (ret == 1) { + put_caching_control(caching_ctl); free_excluded_extents(fs_info->extent_root, cache); return 0; } + } else { + /* + * We are not going to do the fast caching, set cached to the + * appropriate value and wakeup any waiters. + */ + spin_lock(&cache->lock); + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } + spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); } - if (load_cache_only) - return 0; - - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - BUG_ON(!caching_ctl); - - INIT_LIST_HEAD(&caching_ctl->list); - mutex_init(&caching_ctl->mutex); - init_waitqueue_head(&caching_ctl->wait); - caching_ctl->block_group = cache; - caching_ctl->progress = cache->key.objectid; - /* one for caching kthread, one for caching block group list */ - atomic_set(&caching_ctl->count, 2); - caching_ctl->work.func = caching_thread; - - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - kfree(caching_ctl); + if (load_cache_only) { + put_caching_control(caching_ctl); return 0; } - cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); down_write(&fs_info->extent_commit_sem); + atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->extent_commit_sem); @@ -5177,13 +5218,15 @@ search: } have_block_group: - if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) { u64 free_percent; + found_uncached_bg = true; ret = cache_block_group(block_group, trans, orig_root, 1); if (block_group->cached == BTRFS_CACHE_FINISHED) - goto have_block_group; + goto alloc; free_percent = btrfs_block_group_used(&block_group->item); free_percent *= 100; @@ -5205,7 +5248,6 @@ have_block_group: orig_root, 0); BUG_ON(ret); } - found_uncached_bg = true; /* * If loop is set for cached only, try the next block @@ -5215,10 +5257,7 
@@ have_block_group: goto loop; } - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) - found_uncached_bg = true; - +alloc: if (unlikely(block_group->ro)) goto loop; -- cgit v1.2.2 From f7d61dcd6873c49bcc42be2caa2af1c2511aa915 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2011 09:31:24 -0500 Subject: Btrfs: clear pages dirty for io and set them extent mapped When doing the io_ctl helpers to clean up the free space cache stuff I stopped using our normal prepare_pages stuff, which means I of course forgot to do things like set the pages extent mapped, which will cause us all sorts of wonderful problems. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8c32434da2cb..aedacdbf77e2 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -351,6 +351,11 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, } } + for (i = 0; i < io_ctl->num_pages; i++) { + clear_page_dirty_for_io(io_ctl->pages[i]); + set_page_extent_mapped(io_ctl->pages[i]); + } + return 0; } -- cgit v1.2.2 From 4d479cf010d56ec9c54f3099992d039918f1296b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 17 Nov 2011 11:34:31 -0500 Subject: Btrfs: sectorsize align offsets in fiemap We've been hitting BUG()'s in btrfs_cont_expand and btrfs_fallocate and anywhere else that calls btrfs_get_extent while running xfstests 13 in a loop. This is because fiemap is calling btrfs_get_extent with non-sectorsize aligned offsets, which will end up adding mappings that are not sectorsize aligned, which will cause problems in some cases for subsequent calls to btrfs_get_extent for similar areas that are sectorsize aligned. With this patch I ran xfstests 13 in a loop for a couple of hours and didn't hit the problem that I could previously hit in at most 20 minutes. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a877b8b212eb..9472d3de5e52 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3366,6 +3366,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); + len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); + /* * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size @@ -3413,7 +3416,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent_skip_holes(inode, off, last_for_get_extent, + em = get_extent_skip_holes(inode, start, last_for_get_extent, get_extent); if (!em) goto out; -- cgit v1.2.2 From 24a70313969fc3fc440216b40babdb42564acff3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 21 Nov 2011 09:39:11 -0500 Subject: Btrfs: remove free-space-cache.c WARN during log replay The log replay code only partially loads block groups, since the block group caching code is able to detect and deal with extents the logging code has pinned down. While the logging code is pinning down block groups, there is a bogus WARN_ON we're hitting if the code wasn't able to find an extent in the cache. 
This commit removes the warning because it can happen any time there isn't a valid free space cache for that block group. Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index aedacdbf77e2..6e5b7e463698 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1849,7 +1849,13 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - WARN_ON(1); + /* the tree logging code might be calling us before we + * have fully loaded the free space rbtree for this + * block group. So it is possible the entry won't + * be in the rbtree yet at all. The caching code + * will make sure not to put it in the rbtree if + * the logging code has pinned it. + */ goto out_lock; } } -- cgit v1.2.2 From 26bdef541d26fd6a5ddffdf8949ace22f94f809f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 16 Nov 2011 11:28:01 +0300 Subject: btrfs scrub: handle -ENOMEM from init_ipath() init_ipath() can return an ERR_PTR(-ENOMEM). Signed-off-by: Dan Carpenter --- fs/btrfs/scrub.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fab420db5121..c27bcb67f330 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -256,6 +256,11 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) btrfs_release_path(swarn->path); ipath = init_ipath(4096, local_root, swarn->path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto err; + } ret = paths_from_inode(inum, ipath); if (ret < 0) -- cgit v1.2.2 From aa38a711a893accf5b5192f3d705a120deaa81e0 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 18 Nov 2011 17:43:00 +0800 Subject: Btrfs: fix deadlock on metadata reservation when evicting an inode When I ran the xfstests, I found the test tasks were blocked on meta-data reservation. By debugging, I found the reason for this bug: start transaction | v reserve meta-data space | v flush delay allocation -> iput inode -> evict inode ^ | | v wait for delay allocation flush <- reserve meta-data space Besides that, the flush during eviction blocks the thread that is reclaiming memory, which can easily trigger OOM. Fix this bug by skipping the flush step when evicting an inode. 
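The shape of the fix is a common kernel pattern: fold the existing function into a static worker that takes a flush flag, then export two thin wrappers so the eviction path can opt out of flushing. A condensed, compilable sketch of that structure (simplified types; the real signatures are in the diff below):

#include <stdio.h>

typedef unsigned long long u64;

struct block_rsv { u64 reserved; };

/* Worker: flush == 1 may kick delalloc writeback to find space,
 * which is exactly what must not happen while evicting an inode,
 * since eviction can itself be reached from that writeback. */
static int __block_rsv_refill(struct block_rsv *rsv, u64 min, int flush)
{
	(void)flush;		/* placeholder for the flushing logic */
	rsv->reserved = min;
	return 0;
}

static int block_rsv_refill(struct block_rsv *rsv, u64 min)
{
	return __block_rsv_refill(rsv, min, 1);
}

/* Used on the eviction path to break the cycle. */
static int block_rsv_refill_noflush(struct block_rsv *rsv, u64 min)
{
	return __block_rsv_refill(rsv, min, 0);
}

int main(void)
{
	struct block_rsv rsv = { 0 };

	block_rsv_refill(&rsv, 4096);
	block_rsv_refill_noflush(&rsv, 4096);
	printf("reserved %llu bytes\n", rsv.reserved);
	return 0;
}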
Signed-off-by: Miao Xie --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/extent-tree.c | 22 ++++++++++++++++++---- fs/btrfs/inode.c | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 04a5dfcee5a1..50634abef9b4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2369,6 +2369,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved); +int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5d86877f10e1..b7e5f6898d07 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3887,9 +3887,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) +static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int flush) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -3908,7 +3908,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, if (!ret) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -3917,6 +3917,20 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); +} + +int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); +} + int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8ad26b135a1c..c5ccec23984c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3490,7 +3490,7 @@ void btrfs_evict_inode(struct inode *inode) * doing the truncate. */ while (1) { - ret = btrfs_block_rsv_refill(root, rsv, min_size); + ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); /* * Try and steal from the global reserve since we will -- cgit v1.2.2 From ece7d20e8be6730fbb29f4550de6b19b1a3a9387 Mon Sep 17 00:00:00 2001 From: Mike Fleetwood Date: Fri, 18 Nov 2011 18:55:01 +0000 Subject: Btrfs: Don't error on resizing FS to same size It seems overly harsh to fail a resize of a btrfs file system to the same size when a shrink or grow would succeed. User app GParted trips over this error. Allow it by bypassing the shrink or grow operation. 
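The resulting control flow is a plain three-way comparison: grow when the new size is larger, shrink when it is smaller, and fall through as a successful no-op when the sizes match (previously the equal case took the shrink branch and could fail). A sketch with illustrative stand-ins for btrfs_grow_device()/btrfs_shrink_device():

#include <stdio.h>

typedef unsigned long long u64;

static int resize_device(u64 old_size, u64 new_size)
{
	if (new_size > old_size)
		printf("grow to %llu\n", new_size);
	else if (new_size < old_size)
		printf("shrink to %llu\n", new_size);
	else
		printf("same size, nothing to do\n");
	return 0;
}

int main(void)
{
	resize_device(100, 200);	/* grow */
	resize_device(200, 100);	/* shrink */
	resize_device(100, 100);	/* used to take the shrink path */
	return 0;
}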
Signed-off-by: Mike Fleetwood --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a90e749ed6d2..72d461656f60 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1278,7 +1278,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); - } else { + } else if (new_size < old_size) { ret = btrfs_shrink_device(device, new_size); } -- cgit v1.2.2 From b772a86ea6d932ac29d5e50e67c977653c832f8a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 28 Nov 2011 16:43:00 +0800 Subject: Btrfs: fix oops when calling statfs on readonly device To reproduce this bug: # dd if=/dev/zero of=img bs=1M count=256 # mkfs.btrfs img # losetup -r /dev/loop1 img # mount /dev/loop1 /mnt OOPS!! It triggered BUG_ON(!nr_devices) in btrfs_calc_avail_data_space(). To fix this, instead of counting only writable devices, we check all open devices: # df -h /dev/loop1 Filesystem Size Used Avail Use% Mounted on /dev/loop1 250M 28K 238M 1% /mnt Signed-off-by: Li Zefan --- fs/btrfs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8bd9d6d0e07a..1a3ce9e0b495 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1083,7 +1083,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) int i = 0, nr_devices; int ret; - nr_devices = fs_info->fs_devices->rw_devices; + nr_devices = fs_info->fs_devices->open_devices; BUG_ON(!nr_devices); devices_info = kmalloc(sizeof(*devices_info) * nr_devices, @@ -1105,8 +1105,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) else min_stripe_size = BTRFS_STRIPE_LEN; - list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - if (!device->in_fs_metadata) + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->in_fs_metadata || !device->bdev) continue; avail_space = device->total_bytes - device->bytes_used; -- cgit v1.2.2 From f2d0f6765d6332f9be732965a0c6f3b8a55082b4 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Mon, 28 Nov 2011 12:04:43 -0200 Subject: Btrfs: initialize new bitmaps' list We're failing to create clusters with bitmaps because setup_cluster_no_bitmap checks that the list is empty before inserting the bitmap entry in the list for setup_cluster_bitmap, but the list field is only initialized when it is restored from the on-disk free space cache, or when it is written out to disk. Besides a potential race condition due to the multiple use of the list field, filesystem performance severely degrades over time: as we use up all non-bitmap free extents, the try-to-set-up-cluster dance is done at every metadata block allocation. For every block group, we fail to set up a cluster, and after failing on them all up to twice, we fall back to the much slower unclustered allocation. To make matters worse, before the unclustered allocation, we try to create new block groups until we reach the 1% threshold, which introduces additional bitmaps and thus block groups that we'll iterate over at each metadata block request. 
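The bug class deserves a standalone illustration: list_empty() on a list_head that was never initialized reads whatever the allocator left behind, so the emptiness check that guards the list_add() can go either way for a freshly created bitmap. A freestanding sketch (a minimal list_head with the same semantics as the kernel's):

#include <stdio.h>
#include <string.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *list)
{
	list->next = list;
	list->prev = list;
}

/* Same test the kernel uses: a list is empty when it points at itself. */
static int list_empty(const struct list_head *head)
{
	return head->next == head;
}

struct free_space {
	unsigned long long offset;
	struct list_head list;
};

int main(void)
{
	struct free_space info;

	/* kzalloc()-style zeroed memory is NOT an empty list... */
	memset(&info, 0, sizeof(info));
	printf("zeroed: list_empty = %d\n", list_empty(&info.list));

	/* ...so new bitmaps were silently skipped until the one-line
	 * INIT_LIST_HEAD() fix in the patch below. */
	INIT_LIST_HEAD(&info.list);
	printf("inited: list_empty = %d\n", list_empty(&info.list));
	return 0;
}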
--- fs/btrfs/free-space-cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6e5b7e463698..ff179b1e7423 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1470,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, { info->offset = offset_to_bitmap(ctl, offset); info->bytes = 0; + INIT_LIST_HEAD(&info->list); link_free_space(ctl, info); ctl->total_bitmaps++; -- cgit v1.2.2 From b78d09bceb524ee6481c21b77bda22d766b10e6a Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 30 Nov 2011 13:43:00 -0500 Subject: Btrfs: reset cluster's max_size when creating bitmap The field that indicates the size of the largest contiguous chunk of free space in the cluster is not initialized when setting up bitmaps, it's only increased when we find a larger contiguous chunk. We end up retaining a larger value than appropriate for highly-fragmented clusters, which may cause pointless searches for large contiguous groups, and even cause clusters that do not meet the density requirements to be set up. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ff179b1e7423..ec23d43d0c35 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2320,6 +2320,7 @@ again: if (!found) { start = i; + cluster->max_size = 0; found = true; } -- cgit v1.2.2 From 1b22bad779be7fe07242be04749ec969164528b8 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 30 Nov 2011 13:43:00 -0500 Subject: Btrfs: start search for new cluster at the beginning Instead of starting at zero (offset is always zero), request a cluster starting at search_start, that denotes the beginning of the current block group. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b7e5f6898d07..97c12067a4b0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5301,10 +5301,8 @@ alloc: spin_lock(&last_ptr->refill_lock); if (last_ptr->block_group && (last_ptr->block_group->ro || - !block_group_bits(last_ptr->block_group, data))) { - offset = 0; + !block_group_bits(last_ptr->block_group, data))) goto refill_cluster; - } offset = btrfs_alloc_from_cluster(block_group, last_ptr, num_bytes, search_start); @@ -5355,7 +5353,7 @@ refill_cluster: /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, - offset, num_bytes, + search_start, num_bytes, empty_cluster + empty_size); if (ret == 0) { /* -- cgit v1.2.2 From 425d83156ca27f74e2cc3f370138038c3c8947f8 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 30 Nov 2011 13:43:00 -0500 Subject: Btrfs: skip block groups without enough space for a cluster We test whether a block group has enough free space to hold the requested block, but when we're doing clustered allocation, we can save some cycles by testing whether it has enough room for the cluster upfront, otherwise we end up attempting to set up a cluster and failing. Only in the NO_EMPTY_SIZE loop do we attempt an unclustered allocation, and by then we'll have zeroed the cluster size, so this patch won't stop us from using the block group as a last resort. 
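The change is a one-line strengthening of a pure arithmetic predicate; a sketch with made-up numbers shows the case it catches: a group with 1 MiB free passes the old per-allocation test but can never host a 2 MiB cluster, so trying to set one up there is wasted work.

#include <stdio.h>

typedef unsigned long long u64;

int main(void)
{
	u64 free_space = 1 * 1024 * 1024;	/* left in this block group */
	u64 num_bytes = 16 * 1024;		/* the requested allocation */
	u64 empty_size = 0;
	u64 empty_cluster = 2 * 1024 * 1024;	/* cluster preallocation */

	/* old test: the group looks usable... */
	printf("old skip? %d\n", free_space < num_bytes + empty_size);

	/* new test: ...but it cannot fit the cluster, so skip it now
	 * instead of failing cluster setup later. */
	printf("new skip? %d\n",
	       free_space < num_bytes + empty_cluster + empty_size);
	return 0;
}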
Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 97c12067a4b0..71c8e7049d0c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5278,7 +5278,7 @@ alloc: spin_lock(&block_group->free_space_ctl->tree_lock); if (cached && block_group->free_space_ctl->free_space < - num_bytes + empty_size) { + num_bytes + empty_cluster + empty_size) { spin_unlock(&block_group->free_space_ctl->tree_lock); goto loop; } -- cgit v1.2.2 From be064d113906f04ea13088a8260e1e68ae0a4050 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 30 Nov 2011 13:43:00 -0500 Subject: Btrfs: skip allocation attempt from empty cluster If we don't have a cluster, don't bother trying to allocate from it, jumping right away to the attempt to allocate a new cluster. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71c8e7049d0c..813c6bb96c9a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5299,9 +5299,9 @@ alloc: * people trying to start a new cluster */ spin_lock(&last_ptr->refill_lock); - if (last_ptr->block_group && - (last_ptr->block_group->ro || - !block_group_bits(last_ptr->block_group, data))) + if (!last_ptr->block_group || + last_ptr->block_group->ro || + !block_group_bits(last_ptr->block_group, data)) goto refill_cluster; offset = btrfs_alloc_from_cluster(block_group, last_ptr, -- cgit v1.2.2 From f4a8e6563ea5366f563cb741a27fe90c5fa7f0fc Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 1 Dec 2011 09:30:36 -0500 Subject: Btrfs: fix meta data raid-repair merge problem Commit 4a54c8c16 introduced raid-repair, killing the individual readpage_io_failed_hook entries from inode.c and disk-io.c. Commit 4bb31e92 introduced new readahead code, adding a readpage_io_failed_hook to disk-io.c. The raid-repair commit had logic to disable raid-repair, if readpage_io_failed_hook is set. Thus, the readahead commit effectively disabled raid-repair for meta data. This commit changes the logic to always attempt raid-repair when needed and call the readpage_io_failed_hook in case raid-repair fails. This is much more straight forward and should have been like that from the beginning. Signed-off-by: Jan Schmidt Reported-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9472d3de5e52..be1bf627a14b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2287,14 +2287,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (!uptodate) { int failed_mirror; failed_mirror = (int)(unsigned long)bio->bi_bdev; - if (tree->ops && tree->ops->readpage_io_failed_hook) - ret = tree->ops->readpage_io_failed_hook( - bio, page, start, end, - failed_mirror, state); - else - ret = bio_readpage_error(bio, page, start, end, - failed_mirror, NULL); + /* + * The generic bio_readpage_error handles errors the + * following way: If possible, new read requests are + * created and submitted and will end up in + * end_bio_extent_readpage as well (if we're lucky, not + * in the !uptodate case). 
In that case it returns 0 and + * we just go on with the next page in our bio. If it + * can't handle the error it will return -EIO and we + * remain responsible for that page. + */ + ret = bio_readpage_error(bio, page, start, end, + failed_mirror, NULL); if (ret == 0) { +error_handled: uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) @@ -2302,6 +2308,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err) uncache_state(&cached); continue; } + if (tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook( + bio, page, start, end, + failed_mirror, state); + if (ret == 0) + goto error_handled; + } } if (uptodate) { -- cgit v1.2.2 From 062c05c46bd4358aad7a0e0cb5ffeb98ab935286 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 7 Dec 2011 19:50:42 -0500 Subject: Btrfs: try to allocate from cluster even at LOOP_NO_EMPTY_SIZE If we reach LOOP_NO_EMPTY_SIZE, we won't even try to use a cluster that others might have set up. Odds are that there won't be one, but if someone else succeeded in setting it up, we might as well use it, even if we don't try to set up a cluster again. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 813c6bb96c9a..db0b23b14f20 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5285,15 +5285,10 @@ alloc: spin_unlock(&block_group->free_space_ctl->tree_lock); /* - * Ok we want to try and use the cluster allocator, so lets look - * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will - * have tried the cluster allocator plenty of times at this - * point and not have found anything, so we are likely way too - * fragmented for the clustering stuff to find anything, so lets - * just skip it and let the allocator find whatever block it can - * find + * Ok we want to try and use the cluster allocator, so + * lets look there */ - if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { + if (last_ptr) { /* * the refill lock keeps out other * people trying to start a new cluster @@ -5342,6 +5337,20 @@ alloc: } spin_unlock(&last_ptr->lock); refill_cluster: + /* If we are on LOOP_NO_EMPTY_SIZE, we can't + * set up a new clusters, so lets just skip it + * and let the allocator find whatever block + * it can find. If we reach this point, we + * will have tried the cluster allocator + * plenty of times and not have found + * anything, so we are likely way too + * fragmented for the clustering stuff to find + * anything. */ + if (loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + goto unclustered_alloc; + } + /* * this cluster didn't work out, free it and * start over @@ -5389,6 +5398,7 @@ refill_cluster: goto loop; } +unclustered_alloc: offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); /* -- cgit v1.2.2 From 274bd4fb3ed6b72c1d77ef8850511f09fc6b8e4d Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 7 Dec 2011 20:08:40 -0500 Subject: Btrfs: try cluster but don't advance in search list When we find an existing cluster, we switch to its block group as the current block group, possibly skipping multiple blocks in the process. Furthermore, under heavy contention, multiple threads may fail to allocate from a cluster and then release just-created clusters just to proceed to create new ones in a different block group. 
This patch tries to allocate from an existing cluster regardless of its block group, and doesn't switch to that group, instead proceeding to try to allocate a cluster from the group it was iterating before the attempt. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 74 +++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 43 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index db0b23b14f20..05e1386b8bec 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5106,11 +5106,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root = orig_root->fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; + struct btrfs_block_group_cache *used_block_group; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; int done_chunk_alloc = 0; struct btrfs_space_info *space_info; - int last_ptr_loop = 0; int loop = 0; int index = 0; int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? @@ -5172,6 +5172,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); + used_block_group = block_group; /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -5209,6 +5210,7 @@ search: u64 offset; int cached; + used_block_group = block_group; btrfs_get_block_group(block_group); search_start = block_group->key.objectid; @@ -5294,49 +5296,33 @@ alloc: * people trying to start a new cluster */ spin_lock(&last_ptr->refill_lock); - if (!last_ptr->block_group || - last_ptr->block_group->ro || - !block_group_bits(last_ptr->block_group, data)) + used_block_group = last_ptr->block_group; + if (used_block_group != block_group && + (!used_block_group || + used_block_group->ro || + !block_group_bits(used_block_group, data))) { + used_block_group = block_group; goto refill_cluster; + } + + if (used_block_group != block_group) + btrfs_get_block_group(used_block_group); - offset = btrfs_alloc_from_cluster(block_group, last_ptr, - num_bytes, search_start); + offset = btrfs_alloc_from_cluster(used_block_group, + last_ptr, num_bytes, used_block_group->key.objectid); if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); goto checks; } - spin_lock(&last_ptr->lock); - /* - * whoops, this cluster doesn't actually point to - * this block group. 
Get a ref on the block - * group is does point to and try again - */ - if (!last_ptr_loop && last_ptr->block_group && - last_ptr->block_group != block_group && - index <= - get_block_group_index(last_ptr->block_group)) { - - btrfs_put_block_group(block_group); - block_group = last_ptr->block_group; - btrfs_get_block_group(block_group); - spin_unlock(&last_ptr->lock); - spin_unlock(&last_ptr->refill_lock); - - last_ptr_loop = 1; - search_start = block_group->key.objectid; - /* - * we know this block group is properly - * in the list because - * btrfs_remove_block_group, drops the - * cluster before it removes the block - * group from the list - */ - goto have_block_group; + WARN_ON(last_ptr->block_group != used_block_group); + if (used_block_group != block_group) { + btrfs_put_block_group(used_block_group); + used_block_group = block_group; } - spin_unlock(&last_ptr->lock); refill_cluster: + BUG_ON(used_block_group != block_group); /* If we are on LOOP_NO_EMPTY_SIZE, we can't * set up a new clusters, so lets just skip it * and let the allocator find whatever block @@ -5357,8 +5343,6 @@ refill_cluster: */ btrfs_return_cluster_to_free_space(NULL, last_ptr); - last_ptr_loop = 0; - /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, @@ -5425,14 +5409,14 @@ checks: search_start = stripe_align(root, offset); /* move on to the next group */ if (search_start + num_bytes >= search_end) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } /* move on to the next group */ if (search_start + num_bytes > - block_group->key.objectid + block_group->key.offset) { - btrfs_add_free_space(block_group, offset, num_bytes); + used_block_group->key.objectid + used_block_group->key.offset) { + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } @@ -5440,14 +5424,14 @@ checks: ins->offset = num_bytes; if (offset < search_start) - btrfs_add_free_space(block_group, offset, + btrfs_add_free_space(used_block_group, offset, search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, + ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, alloc_type); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } @@ -5456,15 +5440,19 @@ checks: ins->offset = num_bytes; if (offset < search_start) - btrfs_add_free_space(block_group, offset, + btrfs_add_free_space(used_block_group, offset, search_start - offset); BUG_ON(offset > search_start); + if (used_block_group != block_group) + btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); + if (used_block_group != block_group) + btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); -- cgit v1.2.2 From a5d16333612718569ffd26064270e535cb9c3928 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Dec 2011 20:08:40 -0500 Subject: Btrfs: check if the to-be-added device is writable If we call ioctl(BTRFS_IOC_ADD_DEV) directly, we'll succeed in adding a readonly device to a btrfs filesystem, and btrfs will write to that device, emitting kernel errors: [ 3109.833692] lost page write due to I/O error on loop2 [ 3109.833720] lost page write due to I/O error on loop2 ... 
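The user-space analogue of this fix: ask for write access when opening the device, so a read-only device is rejected up front with a clear error instead of producing lost writes later. A hedged sketch (the loop device path matches the reproducer above; O_EXCL on a block device requests exclusive access, roughly what FMODE_EXCL does in the kernel):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* A device set up with `losetup -r` is read-only; opening it
	 * O_RDWR fails immediately (typically with EACCES), which is
	 * the behavior adding FMODE_WRITE buys the kernel side. */
	int fd = open("/dev/loop2", O_RDWR | O_EXCL);

	if (fd < 0) {
		perror("open /dev/loop2 for writing");
		return 1;
	}
	close(fd);
	return 0;
}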
Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c37433d3cd82..0a8c8f8304b1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1611,7 +1611,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) return -EINVAL; - bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); -- cgit v1.2.2 From 1cf4ffdb3289624a6462c94f2ce05545b32ef736 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 7 Dec 2011 20:08:40 -0500 Subject: Btrfs: drop spin lock when memory alloc fails Drop spin lock in convert_extent_bit() when memory alloc fails, otherwise we will deadlock. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index be1bf627a14b..49f3c9dc09f4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -935,8 +935,10 @@ again: node = tree_search(tree, start); if (!node) { prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } err = insert_state(tree, prealloc, start, end, &bits); prealloc = NULL; BUG_ON(err == -EEXIST); @@ -992,8 +994,10 @@ hit_next: */ if (state->start < start) { prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } err = split_state(tree, state, prealloc, start); BUG_ON(err == -EEXIST); prealloc = NULL; @@ -1024,8 +1028,10 @@ hit_next: this_end = last_start - 1; prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } /* * Avoid to free 'prealloc' if it can be merged with @@ -1051,8 +1057,10 @@ hit_next: */ if (state->start <= end && state->end > end) { prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); -- cgit v1.2.2
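The pattern behind this last fix generalizes well beyond btrfs: once a spinlock is taken, every early return in the function has to funnel through the unlock, so allocation failures set an error code and jump to the common exit instead of returning directly. A compilable sketch of that shape (a pthread mutex standing in for the kernel spinlock, illustrative names throughout):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static int convert_bits(size_t n)
{
	void *prealloc = NULL;
	int err = 0;

	pthread_mutex_lock(&tree_lock);

	prealloc = malloc(n);
	if (!prealloc) {
		/* was: "return -ENOMEM" with tree_lock still held, so
		 * the next caller deadlocks; now we unwind through out. */
		err = -ENOMEM;
		goto out;
	}

	/* ... the actual work under the lock goes here ... */

out:
	pthread_mutex_unlock(&tree_lock);
	free(prealloc);
	return err;
}

int main(void)
{
	printf("convert_bits: %d\n", convert_bits(64));
	return 0;
}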