From efd049fb26a162c3830fd3cb1001fdc09b147f3b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 2 Feb 2010 20:50:10 +0000
Subject: Btrfs: do not try and lookup the file extent when finishing ordered
 io

When running the following fio job

[torrent]
filename=torrent-test
rw=randwrite
size=4g
filesize=4g
bs=4k
ioengine=sync

you would see long stalls where no work was being done.  That is because we were
doing all this extra work to read in the file extent outside of the transaction,
however in the random io case this ends up hurting us because the file extents
are not there to begin with.  So axe this logic, since we end up reading in the
file extent when we go to update it anyway.  This took the fio job from 11 mb/s
with several ~10 second stalls to 24 mb/s to a couple of 1-2 second stalls.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 46 ++--------------------------------------------
 1 file changed, 2 insertions(+), 44 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8cd109972fa6..6782aa19130d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1681,24 +1681,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
  * before we start the transaction.  It limits the amount of btree
  * reads required while inside the transaction.
  */
-static noinline void reada_csum(struct btrfs_root *root,
-				struct btrfs_path *path,
-				struct btrfs_ordered_extent *ordered_extent)
-{
-	struct btrfs_ordered_sum *sum;
-	u64 bytenr;
-
-	sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
-			 list);
-	bytenr = sum->sums[0].bytenr;
-
-	/*
-	 * we don't care about the results, the point of this search is
-	 * just to get the btree leaves into ram
-	 */
-	btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
-}
-
 /* as ordered data IO finishes, this gets called so we can finish
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
@@ -1709,7 +1691,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct btrfs_path *path;
 	int compressed = 0;
 	int ret;
 
@@ -1717,32 +1698,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	if (!ret)
 		return 0;
 
-	/*
-	 * before we join the transaction, try to do some of our IO.
-	 * This will limit the amount of IO that we have to do with
-	 * the transaction running.  We're unlikely to need to do any
-	 * IO if the file extents are new, the disk_i_size checks
-	 * covers the most common case.
-	 */
-	if (start < BTRFS_I(inode)->disk_i_size) {
-		path = btrfs_alloc_path();
-		if (path) {
-			ret = btrfs_lookup_file_extent(NULL, root, path,
-						       inode->i_ino,
-						       start, 0);
-			ordered_extent = btrfs_lookup_ordered_extent(inode,
-								     start);
-			if (!list_empty(&ordered_extent->list)) {
-				btrfs_release_path(root, path);
-				reada_csum(root, path, ordered_extent);
-			}
-			btrfs_free_path(path);
-		}
-	}
-
-	if (!ordered_extent)
-		ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
 	BUG_ON(!ordered_extent);
+
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
 		BUG_ON(!list_empty(&ordered_extent->list));
 		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-- 
cgit v1.2.2


From 23b5c50945f2294add0137799400329c0ebba290 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 4 Feb 2010 11:33:03 -0500
Subject: Btrfs: apply updated fallocate i_size fix

This version of the i_size fix for fallocate makes sure we only update
the i_size when the current fallocate is really operating outside of
i_size.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6782aa19130d..4deb280f8969 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5799,7 +5799,9 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 		inode->i_ctime = CURRENT_TIME;
 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-		    cur_offset > inode->i_size) {
+			(actual_len > inode->i_size) &&
+			(cur_offset > inode->i_size)) {
+
 			if (cur_offset > actual_len)
 				i_size  = actual_len;
 			else
-- 
cgit v1.2.2


From a9185b41a4f84971b930c519f0c63bd450c4810d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 5 Mar 2010 09:21:37 +0100
Subject: pass writeback_control to ->write_inode

This gives the filesystem more information about the writeback that
is happening.  Trond requested this for the NFS unstable write handling,
and other filesystems might benefit from this too by beeing able to
distinguish between the different callers in more detail.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4deb280f8969..c41db6d45ab6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3968,7 +3968,7 @@ err:
 	return ret;
 }
 
-int btrfs_write_inode(struct inode *inode, int wait)
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
@@ -3977,7 +3977,7 @@ int btrfs_write_inode(struct inode *inode, int wait)
 	if (root->fs_info->btree_inode == inode)
 		return 0;
 
-	if (wait) {
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		trans = btrfs_join_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
 		ret = btrfs_commit_transaction(trans, root);
-- 
cgit v1.2.2


From 73f73415caddbc01d9f10c03e0a677d5b3d11569 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 4 Dec 2009 17:38:27 +0000
Subject: Btrfs: change how we mount subvolumes

This work is in preperation for being able to set a different root as the
default mounting root.

There is currently a problem with how we mount subvolumes.  We cannot currently
mount a subvolume of a subvolume, you can only mount subvolumes/snapshots of the
default subvolume.  So say you take a snapshot of the default subvolume and call
it snap1, and then take a snapshot of snap1 and call it snap2, so now you have

/
/snap1
/snap1/snap2

as your available volumes.  Currently you can only mount / and /snap1,
you cannot mount /snap1/snap2.  To fix this problem instead of passing
subvolid=<name> you must pass in subvolid=<treeid>, where <treeid> is
the tree id that gets spit out via the subvolume listing you get from
the subvolume listing patches (btrfs filesystem list).  This allows us
to mount /, /snap1 and /snap1/snap2 as the root volume.

In addition to the above, we also now read the default dir item in the
tree root to get the root key that it points to.  For now this just
points at what has always been the default subvolme, but later on I plan
to change it to point at whatever root you want to be the new default
root, so you can just set the default mount and not have to mount with
-o subvolid=<treeid>.  I tested this out with the above scenario and it
worked perfectly.  Thanks,

mount -o subvol operates inside the selected subvolid.  For example:

mount -o subvol=snap1,subvolid=256 /dev/xxx /mnt

/mnt will have the snap1 directory for the subvolume with id
256.

mount -o subvol=snap /dev/xxx /mnt

/mnt will be the snap directory of whatever the default subvolume
is.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4deb280f8969..7d10d1ccb0fe 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2153,7 +2153,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.objectid = found_key.offset;
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
-		inode = btrfs_iget(root->fs_info->sb, &found_key, root);
+		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
 		if (IS_ERR(inode))
 			break;
 
@@ -3687,7 +3687,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
  * Returns in *is_new if the inode was read from disk
  */
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root)
+			 struct btrfs_root *root, int *new)
 {
 	struct inode *inode;
 
@@ -3702,6 +3702,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 
 		inode_tree_add(inode);
 		unlock_new_inode(inode);
+		if (new)
+			*new = 1;
 	}
 
 	return inode;
@@ -3754,7 +3756,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 		return NULL;
 
 	if (location.type == BTRFS_INODE_ITEM_KEY) {
-		inode = btrfs_iget(dir->i_sb, &location, root);
+		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
 		return inode;
 	}
 
@@ -3769,7 +3771,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 		else
 			inode = new_simple_dir(dir->i_sb, &location, sub_root);
 	} else {
-		inode = btrfs_iget(dir->i_sb, &location, sub_root);
+		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
 	}
 	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
 
-- 
cgit v1.2.2


From 1e701a3292e25a6c4939cad9f24951dc6b6ad853 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Mar 2010 09:42:04 -0500
Subject: Btrfs: add new defrag-range ioctl.

The btrfs defrag ioctl was limited to doing the entire file.  This
commit adds a new interface that can defrag a specific range inside
the file.

It can also force compression on the file, allowing you to selectively
compress individual files after they were created, even when mount -o
compress isn't turned on.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7d10d1ccb0fe..3657925c2461 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -379,7 +379,8 @@ again:
 	 * change at any time if we discover bad compression ratios.
 	 */
 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
-	    btrfs_test_opt(root, COMPRESS)) {
+	    (btrfs_test_opt(root, COMPRESS) ||
+	     (BTRFS_I(inode)->force_compress))) {
 		WARN_ON(pages);
 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 
@@ -483,8 +484,10 @@ again:
 		nr_pages_ret = 0;
 
 		/* flag the file so we don't compress in the future */
-		if (!btrfs_test_opt(root, FORCE_COMPRESS))
+		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
+		    !(BTRFS_I(inode)->force_compress)) {
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+		}
 	}
 	if (will_compress) {
 		*num_added += 1;
@@ -1211,7 +1214,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 0, nr_written);
-	else if (!btrfs_test_opt(root, COMPRESS))
+	else if (!btrfs_test_opt(root, COMPRESS) &&
+		 !(BTRFS_I(inode)->force_compress))
 		ret = cow_file_range(inode, locked_page, start, end,
 				      page_started, nr_written, 1);
 	else
@@ -3639,6 +3643,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->index_cnt = (u64)-1;
 	bi->last_unlink_trans = 0;
 	bi->ordered_data_close = 0;
+	bi->force_compress = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
-- 
cgit v1.2.2


From 0be2e98173f8badd5ccc7c2e994891746ba1caf4 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 11 Feb 2010 08:06:58 +0000
Subject: btrfs: fix btrfs_mkdir goto for no free objectids

btrfs_mkdir() must jump to the place of ending transaction after
btrfs_find_free_objectid() failed. Or this transaction can't end.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3657925c2461..50ce8840a99b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4508,7 +4508,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
 	if (err) {
 		err = -ENOSPC;
-		goto out_unlock;
+		goto out_fail;
 	}
 
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-- 
cgit v1.2.2


From 5a1a3df1f6c86926cfe8657e6f9b4b4c2f467d60 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 2 Feb 2010 20:51:14 +0000
Subject: Btrfs: cache ordered extent when completing io

When finishing io we run btrfs_dec_test_ordered_pending, and then immediately
run btrfs_lookup_ordered_extent, but btrfs_dec_test_ordered_pending does that
already, so we're searching twice when we don't have to.  This patch lets us
pass a btrfs_ordered_extent in to btrfs_dec_test_ordered_pending so if we do
complete io on that ordered extent we can just use the one we found then instead
of having to do another btrfs_lookup_ordered_extent.  This made my fio job with
the other patch go from 24 mb/s to 29 mb/s.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 50ce8840a99b..1824dda1d351 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1698,11 +1698,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	int compressed = 0;
 	int ret;
 
-	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+					     end - start + 1);
 	if (!ret)
 		return 0;
-
-	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
 	BUG_ON(!ordered_extent);
 
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
-- 
cgit v1.2.2


From 2ac55d41b5d6bf49e76bc85db5431240617e2f8f Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 3 Feb 2010 19:33:23 +0000
Subject: Btrfs: cache the extent state everywhere we possibly can V2

This patch just goes through and fixes everybody that does

lock_extent()
blah
unlock_extent()

to use

lock_extent_bits()
blah
unlock_extent_cached()

and pass around a extent_state so we only have to do the searches once per
function.  This gives me about a 3 mb/s boots on my random write test.  I have
not converted some things, like the relocation and ioctl's, since they aren't
heavily used and the relocation stuff is in the middle of being re-written.  I
also changed the clear_extent_bit() to only unset the cached state if we are
clearing EXTENT_LOCKED and related stuff, so we can do things like this

lock_extent_bits()
clear delalloc bits
unlock_extent_cached()

without losing our cached state.  I tested this thoroughly and turned on
LEAK_DEBUG to make sure we weren't leaking extent states, everything worked out
fine.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 111 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 69 insertions(+), 42 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1824dda1d351..2a337a09c650 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -573,8 +573,8 @@ retry:
 			unsigned long nr_written = 0;
 
 			lock_extent(io_tree, async_extent->start,
-				    async_extent->start +
-				    async_extent->ram_size - 1, GFP_NOFS);
+					 async_extent->start +
+					 async_extent->ram_size - 1, GFP_NOFS);
 
 			/* allocate blocks */
 			ret = cow_file_range(inode, async_cow->locked_page,
@@ -1512,12 +1512,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state)
 {
 	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
 		WARN_ON(1);
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
-				   GFP_NOFS);
+				   cached_state, GFP_NOFS);
 }
 
 /* see btrfs_writepage_start_hook for details on why this is required */
@@ -1530,6 +1531,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 {
 	struct btrfs_writepage_fixup *fixup;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	struct page *page;
 	struct inode *inode;
 	u64 page_start;
@@ -1548,7 +1550,8 @@ again:
 	page_start = page_offset(page);
 	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+			 &cached_state, GFP_NOFS);
 
 	/* already ordered? We're done */
 	if (PagePrivate2(page))
@@ -1556,17 +1559,18 @@ again:
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
-			      page_end, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
+				     page_end, &cached_state, GFP_NOFS);
 		unlock_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		goto again;
 	}
 
-	btrfs_set_extent_delalloc(inode, page_start, page_end);
+	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
 	ClearPageChecked(page);
 out:
-	unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			     &cached_state, GFP_NOFS);
 out_page:
 	unlock_page(page);
 	page_cache_release(page);
@@ -1695,6 +1699,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_state *cached_state = NULL;
 	int compressed = 0;
 	int ret;
 
@@ -1716,9 +1721,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		goto out;
 	}
 
-	lock_extent(io_tree, ordered_extent->file_offset,
-		    ordered_extent->file_offset + ordered_extent->len - 1,
-		    GFP_NOFS);
+	lock_extent_bits(io_tree, ordered_extent->file_offset,
+			 ordered_extent->file_offset + ordered_extent->len - 1,
+			 0, &cached_state, GFP_NOFS);
 
 	trans = btrfs_join_transaction(root, 1);
 
@@ -1745,9 +1750,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 				   ordered_extent->len);
 		BUG_ON(ret);
 	}
-	unlock_extent(io_tree, ordered_extent->file_offset,
-		    ordered_extent->file_offset + ordered_extent->len - 1,
-		    GFP_NOFS);
+	unlock_extent_cached(io_tree, ordered_extent->file_offset,
+			     ordered_extent->file_offset +
+			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
+
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
@@ -3084,6 +3090,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	char *kaddr;
 	u32 blocksize = root->sectorsize;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
@@ -3130,12 +3137,14 @@ again:
 	}
 	wait_on_page_writeback(page);
 
-	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
+			 GFP_NOFS);
 	set_page_extent_mapped(page);
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		unlock_page(page);
 		page_cache_release(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
@@ -3143,13 +3152,15 @@ again:
 		goto again;
 	}
 
-	clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
 			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
-			  GFP_NOFS);
+			  0, 0, &cached_state, GFP_NOFS);
 
-	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+					&cached_state);
 	if (ret) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		goto out_unlock;
 	}
 
@@ -3162,7 +3173,8 @@ again:
 	}
 	ClearPageChecked(page);
 	set_page_dirty(page);
-	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
+			     GFP_NOFS);
 
 out_unlock:
 	if (ret)
@@ -3180,6 +3192,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em;
+	struct extent_state *cached_state = NULL;
 	u64 mask = root->sectorsize - 1;
 	u64 hole_start = (inode->i_size + mask) & ~mask;
 	u64 block_end = (size + mask) & ~mask;
@@ -3195,11 +3208,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 		struct btrfs_ordered_extent *ordered;
 		btrfs_wait_ordered_range(inode, hole_start,
 					 block_end - hole_start);
-		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+				 &cached_state, GFP_NOFS);
 		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
 		if (!ordered)
 			break;
-		unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		unlock_extent_cached(io_tree, hole_start, block_end - 1,
+				     &cached_state, GFP_NOFS);
 		btrfs_put_ordered_extent(ordered);
 	}
 
@@ -3244,7 +3259,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 			break;
 	}
 
-	unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
+			     GFP_NOFS);
 	return err;
 }
 
@@ -4985,6 +5001,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
 	struct extent_io_tree *tree;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	u64 page_start = page_offset(page);
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
@@ -5003,7 +5020,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		btrfs_releasepage(page, GFP_NOFS);
 		return;
 	}
-	lock_extent(tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
+			 GFP_NOFS);
 	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
 					   page_offset(page));
 	if (ordered) {
@@ -5014,7 +5032,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
 				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
-				 NULL, GFP_NOFS);
+				 &cached_state, GFP_NOFS);
 		/*
 		 * whoever cleared the private bit is responsible
 		 * for the finish_ordered_io
@@ -5024,11 +5042,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 						page_start, page_end);
 		}
 		btrfs_put_ordered_extent(ordered);
-		lock_extent(tree, page_start, page_end, GFP_NOFS);
+		cached_state = NULL;
+		lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
+				 GFP_NOFS);
 	}
 	clear_extent_bit(tree, page_start, page_end,
 		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-		 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
+		 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
 	__btrfs_releasepage(page, GFP_NOFS);
 
 	ClearPageChecked(page);
@@ -5061,6 +5081,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	char *kaddr;
 	unsigned long zero_start;
 	loff_t size;
@@ -5099,7 +5120,8 @@ again:
 	}
 	wait_on_page_writeback(page);
 
-	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
+			 GFP_NOFS);
 	set_page_extent_mapped(page);
 
 	/*
@@ -5108,7 +5130,8 @@ again:
 	 */
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		unlock_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
@@ -5122,13 +5145,15 @@ again:
 	 * is probably a better way to do this, but for now keep consistent with
 	 * prepare_pages in the normal write path.
 	 */
-	clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
 			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
-			  GFP_NOFS);
+			  0, 0, &cached_state, GFP_NOFS);
 
-	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+					&cached_state);
 	if (ret) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		ret = VM_FAULT_SIGBUS;
 		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		goto out_unlock;
@@ -5154,7 +5179,7 @@ again:
 	BTRFS_I(inode)->last_trans = root->fs_info->generation;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 
-	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
 out_unlock:
 	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -5833,6 +5858,7 @@ stop_trans:
 static long btrfs_fallocate(struct inode *inode, int mode,
 			    loff_t offset, loff_t len)
 {
+	struct extent_state *cached_state = NULL;
 	u64 cur_offset;
 	u64 last_byte;
 	u64 alloc_start;
@@ -5871,16 +5897,17 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		/* the extent lock is ordered inside the running
 		 * transaction
 		 */
-		lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-			    GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+				 locked_end, 0, &cached_state, GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
 							    alloc_end - 1);
 		if (ordered &&
 		    ordered->file_offset + ordered->len > alloc_start &&
 		    ordered->file_offset < alloc_end) {
 			btrfs_put_ordered_extent(ordered);
-			unlock_extent(&BTRFS_I(inode)->io_tree,
-				      alloc_start, locked_end, GFP_NOFS);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     alloc_start, locked_end,
+					     &cached_state, GFP_NOFS);
 			/*
 			 * we can't wait on the range with the transaction
 			 * running or with the extent lock held
@@ -5922,8 +5949,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 			break;
 		}
 	}
-	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+			     &cached_state, GFP_NOFS);
 
 	btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
 				       alloc_end - alloc_start);
-- 
cgit v1.2.2


From 5a0e3ad6af8660be21ca98a971cd00f331318c05 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 24 Mar 2010 17:04:11 +0900
Subject: include cleanup: Update gfp.h and slab.h includes to prepare for
 breaking implicit slab.h inclusion from percpu.h

percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files.  percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.

percpu.h -> slab.h dependency is about to be removed.  Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability.  As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.

  http://userweb.kernel.org/~tj/misc/slabh-sweep.py

The script does the followings.

* Scan files for gfp and slab usages and update includes such that
  only the necessary includes are there.  ie. if only gfp is used,
  gfp.h, if slab is used, slab.h.

* When the script inserts a new include, it looks at the include
  blocks and try to put the new include such that its order conforms
  to its surrounding.  It's put in the include block which contains
  core kernel includes, in the same order that the rest are ordered -
  alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
  doesn't seem to be any matching order.

* If the script can't find a place to put a new include (mostly
  because the file doesn't have fitting include block), it prints out
  an error message indicating which .h file needs to be added to the
  file.

The conversion was done in the following steps.

1. The initial automatic conversion of all .c files updated slightly
   over 4000 files, deleting around 700 includes and adding ~480 gfp.h
   and ~3000 slab.h inclusions.  The script emitted errors for ~400
   files.

2. Each error was manually checked.  Some didn't need the inclusion,
   some needed manual addition while adding it to implementation .h or
   embedding .c file was more appropriate for others.  This step added
   inclusions to around 150 files.

3. The script was run again and the output was compared to the edits
   from #2 to make sure no file was left behind.

4. Several build tests were done and a couple of problems were fixed.
   e.g. lib/decompress_*.c used malloc/free() wrappers around slab
   APIs requiring slab.h to be added manually.

5. The script was run on all .h files but without automatically
   editing them as sprinkling gfp.h and slab.h inclusions around .h
   files could easily lead to inclusion dependency hell.  Most gfp.h
   inclusion directives were ignored as stuff from gfp.h was usually
   wildly available and often used in preprocessor macros.  Each
   slab.h inclusion directive was examined and added manually as
   necessary.

6. percpu.h was updated not to include slab.h.

7. Build test were done on the following configurations and failures
   were fixed.  CONFIG_GCOV_KERNEL was turned off for all tests (as my
   distributed build env didn't work with gcov compiles) and a few
   more options had to be turned off depending on archs to make things
   build (like ipr on powerpc/64 which failed due to missing writeq).

   * x86 and x86_64 UP and SMP allmodconfig and a custom test config.
   * powerpc and powerpc64 SMP allmodconfig
   * sparc and sparc64 SMP allmodconfig
   * ia64 SMP allmodconfig
   * s390 SMP allmodconfig
   * alpha SMP allmodconfig
   * um on x86_64 SMP allmodconfig

8. percpu.h modifications were reverted so that it could be applied as
   a separate patch and serve as bisection point.

Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable easily on most builds of
the specific arch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
---
 fs/btrfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 02bb099845fd..2fe494c84e60 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/falloc.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
-- 
cgit v1.2.2


From 287a0ab91d25ca982f895a76402e5893b47ed7a6 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 19 Mar 2010 18:07:23 +0000
Subject: Btrfs: kill max_extent mount option

As Yan pointed out, theres not much reason for all this complicated math to
account for file extents being split up into max_extent chunks, since they are
likely to all end up in the same leaf anyway.  Since there isn't much reason to
use max_extent, just remove the option altogether so we have one less thing we
need to test.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 59 ++++----------------------------------------------------
 1 file changed, 4 insertions(+), 55 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2a337a09c650..a85b90c86cb0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -796,7 +796,7 @@ static noinline int cow_file_range(struct inode *inode,
 	while (disk_num_bytes > 0) {
 		unsigned long op;
 
-		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
+		cur_alloc_size = disk_num_bytes;
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
 					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
@@ -1227,30 +1227,9 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 static int btrfs_split_extent_hook(struct inode *inode,
 				    struct extent_state *orig, u64 split)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 size;
-
 	if (!(orig->state & EXTENT_DELALLOC))
 		return 0;
 
-	size = orig->end - orig->start + 1;
-	if (size > root->fs_info->max_extent) {
-		u64 num_extents;
-		u64 new_size;
-
-		new_size = orig->end - split + 1;
-		num_extents = div64_u64(size + root->fs_info->max_extent - 1,
-					root->fs_info->max_extent);
-
-		/*
-		 * if we break a large extent up then leave oustanding_extents
-		 * be, since we've already accounted for the large extent.
-		 */
-		if (div64_u64(new_size + root->fs_info->max_extent - 1,
-			      root->fs_info->max_extent) < num_extents)
-			return 0;
-	}
-
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	BTRFS_I(inode)->outstanding_extents++;
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -1268,38 +1247,10 @@ static int btrfs_merge_extent_hook(struct inode *inode,
 				   struct extent_state *new,
 				   struct extent_state *other)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 new_size, old_size;
-	u64 num_extents;
-
 	/* not delalloc, ignore it */
 	if (!(other->state & EXTENT_DELALLOC))
 		return 0;
 
-	old_size = other->end - other->start + 1;
-	if (new->start < other->start)
-		new_size = other->end - new->start + 1;
-	else
-		new_size = new->end - other->start + 1;
-
-	/* we're not bigger than the max, unreserve the space and go */
-	if (new_size <= root->fs_info->max_extent) {
-		spin_lock(&BTRFS_I(inode)->accounting_lock);
-		BTRFS_I(inode)->outstanding_extents--;
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		return 0;
-	}
-
-	/*
-	 * If we grew by another max_extent, just return, we want to keep that
-	 * reserved amount.
-	 */
-	num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
-				root->fs_info->max_extent);
-	if (div64_u64(new_size + root->fs_info->max_extent - 1,
-		      root->fs_info->max_extent) > num_extents)
-		return 0;
-
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	BTRFS_I(inode)->outstanding_extents--;
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -1328,6 +1279,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 		BTRFS_I(inode)->outstanding_extents++;
 		spin_unlock(&BTRFS_I(inode)->accounting_lock);
 		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+
 		spin_lock(&root->fs_info->delalloc_lock);
 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
 		root->fs_info->delalloc_bytes += end - start + 1;
@@ -1356,6 +1308,7 @@ static int btrfs_clear_bit_hook(struct inode *inode,
 
 		if (bits & EXTENT_DO_ACCOUNTING) {
 			spin_lock(&BTRFS_I(inode)->accounting_lock);
+			WARN_ON(!BTRFS_I(inode)->outstanding_extents);
 			BTRFS_I(inode)->outstanding_extents--;
 			spin_unlock(&BTRFS_I(inode)->accounting_lock);
 			btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -5384,7 +5337,6 @@ free:
 void btrfs_drop_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-
 	if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
 		generic_delete_inode(inode);
 	else
@@ -5788,18 +5740,15 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
-	u64 alloc_size;
 	u64 cur_offset = start;
 	u64 num_bytes = end - start;
 	int ret = 0;
 	u64 i_size;
 
 	while (num_bytes > 0) {
-		alloc_size = min(num_bytes, root->fs_info->max_extent);
-
 		trans = btrfs_start_transaction(root, 1);
 
-		ret = btrfs_reserve_extent(trans, root, alloc_size,
+		ret = btrfs_reserve_extent(trans, root, num_bytes,
 					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
 		if (ret) {
-- 
cgit v1.2.2


From ecc11fabf7ce16309112afe68fac466193ee7520 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Thu, 4 Mar 2010 17:31:47 +0300
Subject: btrfs: replace inode uid,gid,mode initialization with helper function

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bfdc641d4e3..d601629b85d1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4121,16 +4121,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (ret != 0)
 		goto fail;
 
-	inode->i_uid = current_fsuid();
-
-	if (dir && (dir->i_mode & S_ISGID)) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
-	inode->i_mode = mode;
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = objectid;
 	inode_set_bytes(inode, 0);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-- 
cgit v1.2.2


From 5da9d01b66458b180a6bee0e637a1d0a3effc622 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Sun, 16 May 2010 10:46:25 -0400
Subject: Btrfs: Shrink delay allocated space in a synchronized

Shrink delayed allocation space in a synchronized manner is more
controllable than flushing all delay allocated space in an async
thread.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bfdc641d4e3..d53cad1afe26 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5611,6 +5611,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	return 0;
 }
 
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+{
+	struct btrfs_inode *binode;
+	struct inode *inode = NULL;
+
+	spin_lock(&root->fs_info->delalloc_lock);
+	while (!list_empty(&root->fs_info->delalloc_inodes)) {
+		binode = list_entry(root->fs_info->delalloc_inodes.next,
+				    struct btrfs_inode, delalloc_inodes);
+		inode = igrab(&binode->vfs_inode);
+		if (inode) {
+			list_move_tail(&binode->delalloc_inodes,
+				       &root->fs_info->delalloc_inodes);
+			break;
+		}
+
+		list_del_init(&binode->delalloc_inodes);
+		cond_resched_lock(&root->fs_info->delalloc_lock);
+	}
+	spin_unlock(&root->fs_info->delalloc_lock);
+
+	if (inode) {
+		write_inode_now(inode, 0);
+		if (delay_iput)
+			btrfs_add_delayed_iput(inode);
+		else
+			iput(inode);
+		return 1;
+	}
+	return 0;
+}
+
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 			 const char *symname)
 {
-- 
cgit v1.2.2


From 2ead6ae770d9f9dec9f4286bf0fd9001b4388c4b Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Sun, 16 May 2010 10:46:25 -0400
Subject: Btrfs: Kill init_btrfs_i()

All code in init_btrfs_i can be moved into btrfs_alloc_inode()

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 64 +++++++++++++++++++++++++-------------------------------
 1 file changed, 28 insertions(+), 36 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d53cad1afe26..f44425081c02 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3596,40 +3596,10 @@ again:
 	return 0;
 }
 
-static noinline void init_btrfs_i(struct inode *inode)
-{
-	struct btrfs_inode *bi = BTRFS_I(inode);
-
-	bi->generation = 0;
-	bi->sequence = 0;
-	bi->last_trans = 0;
-	bi->last_sub_trans = 0;
-	bi->logged_trans = 0;
-	bi->delalloc_bytes = 0;
-	bi->reserved_bytes = 0;
-	bi->disk_i_size = 0;
-	bi->flags = 0;
-	bi->index_cnt = (u64)-1;
-	bi->last_unlink_trans = 0;
-	bi->ordered_data_close = 0;
-	bi->force_compress = 0;
-	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
-	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
-			     inode->i_mapping, GFP_NOFS);
-	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
-			     inode->i_mapping, GFP_NOFS);
-	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
-	INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
-	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
-	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
-	mutex_init(&BTRFS_I(inode)->log_mutex);
-}
-
 static int btrfs_init_locked_inode(struct inode *inode, void *p)
 {
 	struct btrfs_iget_args *args = p;
 	inode->i_ino = args->ino;
-	init_btrfs_i(inode);
 	BTRFS_I(inode)->root = args->root;
 	btrfs_set_inode_space_info(args->root, inode);
 	return 0;
@@ -3692,8 +3662,6 @@ static struct inode *new_simple_dir(struct super_block *s,
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
-	init_btrfs_i(inode);
-
 	BTRFS_I(inode)->root = root;
 	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
 	BTRFS_I(inode)->dummy_inode = 1;
@@ -4092,7 +4060,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	 * btrfs_get_inode_index_count has an explanation for the magic
 	 * number
 	 */
-	init_btrfs_i(inode);
 	BTRFS_I(inode)->index_cnt = 2;
 	BTRFS_I(inode)->root = root;
 	BTRFS_I(inode)->generation = trans->transid;
@@ -5263,21 +5230,46 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 struct inode *btrfs_alloc_inode(struct super_block *sb)
 {
 	struct btrfs_inode *ei;
+	struct inode *inode;
 
 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+
+	ei->root = NULL;
+	ei->space_info = NULL;
+	ei->generation = 0;
+	ei->sequence = 0;
 	ei->last_trans = 0;
 	ei->last_sub_trans = 0;
 	ei->logged_trans = 0;
+	ei->delalloc_bytes = 0;
+	ei->reserved_bytes = 0;
+	ei->disk_i_size = 0;
+	ei->flags = 0;
+	ei->index_cnt = (u64)-1;
+	ei->last_unlink_trans = 0;
+
+	spin_lock_init(&ei->accounting_lock);
 	ei->outstanding_extents = 0;
 	ei->reserved_extents = 0;
-	ei->root = NULL;
-	spin_lock_init(&ei->accounting_lock);
+
+	ei->ordered_data_close = 0;
+	ei->dummy_inode = 0;
+	ei->force_compress = 0;
+
+	inode = &ei->vfs_inode;
+	extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
+	extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
+	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+	mutex_init(&ei->log_mutex);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	INIT_LIST_HEAD(&ei->i_orphan);
+	INIT_LIST_HEAD(&ei->delalloc_inodes);
 	INIT_LIST_HEAD(&ei->ordered_operations);
-	return &ei->vfs_inode;
+	RB_CLEAR_NODE(&ei->rb_node);
+
+	return inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
-- 
cgit v1.2.2


From a22285a6a32390195235171b89d157ed1a1fe932 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Sun, 16 May 2010 10:48:46 -0400
Subject: Btrfs: Integrate metadata reservation with start_transaction

Besides simplify the code, this change makes sure all metadata
reservation for normal metadata operations are released after
committing transaction.

Changes since V1:

Add code that check if unlink and rmdir will free space.

Add ENOSPC handling for clone ioctl.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 403 +++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 257 insertions(+), 146 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f44425081c02..c4b0fd12df68 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2135,7 +2135,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * do a destroy_inode
 		 */
 		if (is_bad_inode(inode)) {
-			trans = btrfs_start_transaction(root, 1);
+			trans = btrfs_start_transaction(root, 0);
 			btrfs_orphan_del(trans, inode);
 			btrfs_end_transaction(trans, root);
 			iput(inode);
@@ -2478,29 +2478,201 @@ out:
 	return ret;
 }
 
-static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+/* helper to check if there is any shared block in the path */
+static int check_path_shared(struct btrfs_root *root,
+			     struct btrfs_path *path)
+{
+	struct extent_buffer *eb;
+	int level;
+	int ret;
+	u64 refs;
+
+	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+		if (!path->nodes[level])
+			break;
+		eb = path->nodes[level];
+		if (!btrfs_block_can_be_shared(root, eb))
+			continue;
+		ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
+					       &refs, NULL);
+		if (refs > 1)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * helper to start transaction for unlink and rmdir.
+ *
+ * unlink and rmdir are special in btrfs, they do not always free space.
+ * so in enospc case, we should make sure they will free space before
+ * allowing them to use the global metadata reservation.
+ */
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
+						       struct dentry *dentry)
 {
-	struct btrfs_root *root;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_path *path;
+	struct btrfs_inode_ref *ref;
+	struct btrfs_dir_item *di;
 	struct inode *inode = dentry->d_inode;
+	u64 index;
+	int check_link = 1;
+	int err = -ENOSPC;
 	int ret;
-	unsigned long nr = 0;
 
-	root = BTRFS_I(dir)->root;
+	trans = btrfs_start_transaction(root, 10);
+	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+		return trans;
 
-	/*
-	 * 5 items for unlink inode
-	 * 1 for orphan
-	 */
-	ret = btrfs_reserve_metadata_space(root, 6);
-	if (ret)
-		return ret;
+	if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+		return ERR_PTR(-ENOSPC);
 
-	trans = btrfs_start_transaction(root, 1);
+	/* check if there is someone else holds reference */
+	if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
+		return ERR_PTR(-ENOSPC);
+
+	if (atomic_read(&inode->i_count) > 2)
+		return ERR_PTR(-ENOSPC);
+
+	if (xchg(&root->fs_info->enospc_unlink, 1))
+		return ERR_PTR(-ENOSPC);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		root->fs_info->enospc_unlink = 0;
+		return ERR_PTR(-ENOMEM);
+	}
+
+	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
-		btrfs_unreserve_metadata_space(root, 6);
-		return PTR_ERR(trans);
+		btrfs_free_path(path);
+		root->fs_info->enospc_unlink = 0;
+		return trans;
+	}
+
+	path->skip_locking = 1;
+	path->search_commit_root = 1;
+
+	ret = btrfs_lookup_inode(trans, root, path,
+				&BTRFS_I(dir)->location, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+	if (ret == 0) {
+		if (check_path_shared(root, path))
+			goto out;
+	} else {
+		check_link = 0;
 	}
+	btrfs_release_path(root, path);
+
+	ret = btrfs_lookup_inode(trans, root, path,
+				&BTRFS_I(inode)->location, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+	if (ret == 0) {
+		if (check_path_shared(root, path))
+			goto out;
+	} else {
+		check_link = 0;
+	}
+	btrfs_release_path(root, path);
+
+	if (ret == 0 && S_ISREG(inode->i_mode)) {
+		ret = btrfs_lookup_file_extent(trans, root, path,
+					       inode->i_ino, (u64)-1, 0);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		BUG_ON(ret == 0);
+		if (check_path_shared(root, path))
+			goto out;
+		btrfs_release_path(root, path);
+	}
+
+	if (!check_link) {
+		err = 0;
+		goto out;
+	}
+
+	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+				dentry->d_name.name, dentry->d_name.len, 0);
+	if (IS_ERR(di)) {
+		err = PTR_ERR(di);
+		goto out;
+	}
+	if (di) {
+		if (check_path_shared(root, path))
+			goto out;
+	} else {
+		err = 0;
+		goto out;
+	}
+	btrfs_release_path(root, path);
+
+	ref = btrfs_lookup_inode_ref(trans, root, path,
+				dentry->d_name.name, dentry->d_name.len,
+				inode->i_ino, dir->i_ino, 0);
+	if (IS_ERR(ref)) {
+		err = PTR_ERR(ref);
+		goto out;
+	}
+	BUG_ON(!ref);
+	if (check_path_shared(root, path))
+		goto out;
+	index = btrfs_inode_ref_index(path->nodes[0], ref);
+	btrfs_release_path(root, path);
+
+	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
+				dentry->d_name.name, dentry->d_name.len, 0);
+	if (IS_ERR(di)) {
+		err = PTR_ERR(di);
+		goto out;
+	}
+	BUG_ON(ret == -ENOENT);
+	if (check_path_shared(root, path))
+		goto out;
+
+	err = 0;
+out:
+	btrfs_free_path(path);
+	if (err) {
+		btrfs_end_transaction(trans, root);
+		root->fs_info->enospc_unlink = 0;
+		return ERR_PTR(err);
+	}
+
+	trans->block_rsv = &root->fs_info->global_block_rsv;
+	return trans;
+}
+
+static void __unlink_end_trans(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
+{
+	if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+		BUG_ON(!root->fs_info->enospc_unlink);
+		root->fs_info->enospc_unlink = 0;
+	}
+	btrfs_end_transaction_throttle(trans, root);
+}
+
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_trans_handle *trans;
+	struct inode *inode = dentry->d_inode;
+	int ret;
+	unsigned long nr = 0;
+
+	trans = __unlink_start_trans(dir, dentry);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
 	btrfs_set_trans_block_group(trans, dir);
 
@@ -2508,14 +2680,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
 				 dentry->d_name.name, dentry->d_name.len);
+	BUG_ON(ret);
 
-	if (inode->i_nlink == 0)
+	if (inode->i_nlink == 0) {
 		ret = btrfs_orphan_add(trans, inode);
+		BUG_ON(ret);
+	}
 
 	nr = trans->blocks_used;
-
-	btrfs_end_transaction_throttle(trans, root);
-	btrfs_unreserve_metadata_space(root, 6);
+	__unlink_end_trans(trans, root);
 	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
@@ -2587,7 +2760,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 	int err = 0;
-	int ret;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_trans_handle *trans;
 	unsigned long nr = 0;
@@ -2596,15 +2768,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	    inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		return -ENOTEMPTY;
 
-	ret = btrfs_reserve_metadata_space(root, 5);
-	if (ret)
-		return ret;
-
-	trans = btrfs_start_transaction(root, 1);
-	if (IS_ERR(trans)) {
-		btrfs_unreserve_metadata_space(root, 5);
+	trans = __unlink_start_trans(dir, dentry);
+	if (IS_ERR(trans))
 		return PTR_ERR(trans);
-	}
 
 	btrfs_set_trans_block_group(trans, dir);
 
@@ -2627,12 +2793,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		btrfs_i_size_write(inode, 0);
 out:
 	nr = trans->blocks_used;
-	ret = btrfs_end_transaction_throttle(trans, root);
-	btrfs_unreserve_metadata_space(root, 5);
+	__unlink_end_trans(trans, root);
 	btrfs_btree_balance_dirty(root, nr);
 
-	if (ret && !err)
-		err = ret;
 	return err;
 }
 
@@ -3145,7 +3308,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct extent_map *em;
+	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
 	u64 mask = root->sectorsize - 1;
 	u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3183,11 +3346,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 			u64 hint_byte = 0;
 			hole_size = last_byte - cur_offset;
 
-			err = btrfs_reserve_metadata_space(root, 2);
-			if (err)
+			trans = btrfs_start_transaction(root, 2);
+			if (IS_ERR(trans)) {
+				err = PTR_ERR(trans);
 				break;
-
-			trans = btrfs_start_transaction(root, 1);
+			}
 			btrfs_set_trans_block_group(trans, inode);
 
 			err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3205,14 +3368,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 					last_byte - 1, 0);
 
 			btrfs_end_transaction(trans, root);
-			btrfs_unreserve_metadata_space(root, 2);
 		}
 		free_extent_map(em);
+		em = NULL;
 		cur_offset = last_byte;
 		if (cur_offset >= block_end)
 			break;
 	}
 
+	free_extent_map(em);
 	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
 			     GFP_NOFS);
 	return err;
@@ -3239,10 +3403,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
 		}
 	}
 
-	ret = btrfs_reserve_metadata_space(root, 1);
-	if (ret)
-		return ret;
-
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 
@@ -3251,7 +3411,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
 
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	btrfs_unreserve_metadata_space(root, 1);
 	btrfs_btree_balance_dirty(root, nr);
 
 	if (attr->ia_size > inode->i_size) {
@@ -4223,26 +4382,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
+
 	/*
 	 * 2 for inode item and ref
 	 * 2 for dir items
 	 * 1 for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans)
-		goto fail;
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -4271,13 +4425,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-fail:
-	btrfs_unreserve_metadata_space(root, 5);
+	btrfs_btree_balance_dirty(root, nr);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
 	return err;
 }
 
@@ -4287,32 +4439,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = NULL;
-	int err;
 	int drop_inode = 0;
+	int err;
 	unsigned long nr = 0;
 	u64 objectid;
 	u64 index = 0;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
 	/*
 	 * 2 for inode item and ref
 	 * 2 for dir items
 	 * 1 for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans)
-		goto fail;
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino,
@@ -4344,8 +4490,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-fail:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4372,21 +4516,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (root->objectid != BTRFS_I(inode)->root->objectid)
 		return -EPERM;
 
-	/*
-	 * 1 item for inode ref
-	 * 2 items for dir items
-	 */
-	err = btrfs_reserve_metadata_space(root, 3);
-	if (err)
-		return err;
-
 	btrfs_inc_nlink(inode);
 
 	err = btrfs_set_inode_index(dir, &index);
 	if (err)
 		goto fail;
 
-	trans = btrfs_start_transaction(root, 1);
+	/*
+	 * 1 item for inode ref
+	 * 2 items for dir items
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto fail;
+	}
 
 	btrfs_set_trans_block_group(trans, dir);
 	atomic_inc(&inode->i_count);
@@ -4405,7 +4549,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
 fail:
-	btrfs_unreserve_metadata_space(root, 3);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4425,28 +4568,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	u64 index = 0;
 	unsigned long nr = 1;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
+
 	/*
 	 * 2 items for inode and ref
 	 * 2 items for dir items
 	 * 1 for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		err = -ENOMEM;
-		goto out_unlock;
-	}
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_fail;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -4486,9 +4621,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 out_fail:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-
-out_unlock:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_on_err)
 		iput(inode);
 	btrfs_btree_balance_dirty(root, nr);
@@ -5426,19 +5558,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
-
-	/*
-	 * We want to reserve the absolute worst case amount of items.  So if
-	 * both inodes are subvols and we need to unlink them then that would
-	 * require 4 item modifications, but if they are both normal inodes it
-	 * would require 5 item modifications, so we'll assume their normal
-	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
-	 * should cover the worst case number of items we'll modify.
-	 */
-	ret = btrfs_reserve_metadata_space(root, 11);
-	if (ret)
-		return ret;
-
 	/*
 	 * we're using rename to replace one file with another.
 	 * and the replacement file is large.  Start IO on it now so
@@ -5451,8 +5570,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	/* close the racy window with snapshot create/destroy ioctl */
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		down_read(&root->fs_info->subvol_sem);
+	/*
+	 * We want to reserve the absolute worst case amount of items.  So if
+	 * both inodes are subvols and we need to unlink them then that would
+	 * require 4 item modifications, but if they are both normal inodes it
+	 * would require 5 item modifications, so we'll assume their normal
+	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+	 * should cover the worst case number of items we'll modify.
+	 */
+	trans = btrfs_start_transaction(root, 20);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, new_dir);
 
 	if (dest != root)
@@ -5551,7 +5680,6 @@ out_fail:
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		up_read(&root->fs_info->subvol_sem);
 
-	btrfs_unreserve_metadata_space(root, 11);
 	return ret;
 }
 
@@ -5658,26 +5786,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
 	/*
 	 * 2 items for inode item and ref
 	 * 2 items for dir items
 	 * 1 item for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans)
-		goto out_fail;
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -5749,8 +5871,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-out_fail:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -5771,21 +5891,18 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 	u64 i_size;
 
 	while (num_bytes > 0) {
-		trans = btrfs_start_transaction(root, 1);
+		trans = btrfs_start_transaction(root, 3);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
 
 		ret = btrfs_reserve_extent(trans, root, num_bytes,
 					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
 		if (ret) {
-			WARN_ON(1);
-			goto stop_trans;
-		}
-
-		ret = btrfs_reserve_metadata_space(root, 3);
-		if (ret) {
-			btrfs_free_reserved_extent(root, ins.objectid,
-						   ins.offset);
-			goto stop_trans;
+			btrfs_end_transaction(trans, root);
+			break;
 		}
 
 		ret = insert_reserved_file_extent(trans, inode,
@@ -5819,14 +5936,8 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 		BUG_ON(ret);
 
 		btrfs_end_transaction(trans, root);
-		btrfs_unreserve_metadata_space(root, 3);
 	}
 	return ret;
-
-stop_trans:
-	btrfs_end_transaction(trans, root);
-	return ret;
-
 }
 
 static long btrfs_fallocate(struct inode *inode, int mode,
-- 
cgit v1.2.2


From 0ca1f7ceb1991099ed5273885ebcf4323948c72e Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Sun, 16 May 2010 10:48:47 -0400
Subject: Btrfs: Update metadata reservation for delayed allocation

Introduce metadata reservation context for delayed allocation
and update various related functions.

This patch also introduces EXTENT_FIRST_DELALLOC control bit for
set/clear_extent_bit. It tells set/clear_bit_hook whether they
are processing the first extent_state with EXTENT_DELALLOC bit
set. This change is important if set/clear_extent_bit involves
multiple extent_state.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 131 ++++++++++++++++++++++---------------------------------
 1 file changed, 53 insertions(+), 78 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c4b0fd12df68..6e54665d37f7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 				   inline_len, compressed_size,
 				   compressed_pages);
 	BUG_ON(ret);
+	btrfs_delalloc_release_metadata(inode, end + 1 - start);
 	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 	return 0;
 }
@@ -414,6 +415,7 @@ again:
 		trans = btrfs_join_transaction(root, 1);
 		BUG_ON(!trans);
 		btrfs_set_trans_block_group(trans, inode);
+		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 		/* lets try to make an inline extent */
 		if (ret || total_in < (actual_end - start)) {
@@ -439,7 +441,6 @@ again:
 			     start, end, NULL,
 			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
 			     EXTENT_CLEAR_DELALLOC |
-			     EXTENT_CLEAR_ACCOUNTING |
 			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
 
 			btrfs_end_transaction(trans, root);
@@ -734,6 +735,7 @@ static noinline int cow_file_range(struct inode *inode,
 	trans = btrfs_join_transaction(root, 1);
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	actual_end = min_t(u64, isize, end + 1);
 
@@ -753,7 +755,6 @@ static noinline int cow_file_range(struct inode *inode,
 				     EXTENT_CLEAR_UNLOCK_PAGE |
 				     EXTENT_CLEAR_UNLOCK |
 				     EXTENT_CLEAR_DELALLOC |
-				     EXTENT_CLEAR_ACCOUNTING |
 				     EXTENT_CLEAR_DIRTY |
 				     EXTENT_SET_WRITEBACK |
 				     EXTENT_END_WRITEBACK);
@@ -1226,15 +1227,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 }
 
 static int btrfs_split_extent_hook(struct inode *inode,
-				    struct extent_state *orig, u64 split)
+				   struct extent_state *orig, u64 split)
 {
+	/* not delalloc, ignore it */
 	if (!(orig->state & EXTENT_DELALLOC))
 		return 0;
 
-	spin_lock(&BTRFS_I(inode)->accounting_lock);
-	BTRFS_I(inode)->outstanding_extents++;
-	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
+	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
 	return 0;
 }
 
@@ -1252,10 +1251,7 @@ static int btrfs_merge_extent_hook(struct inode *inode,
 	if (!(other->state & EXTENT_DELALLOC))
 		return 0;
 
-	spin_lock(&BTRFS_I(inode)->accounting_lock);
-	BTRFS_I(inode)->outstanding_extents--;
-	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
+	atomic_dec(&BTRFS_I(inode)->outstanding_extents);
 	return 0;
 }
 
@@ -1264,8 +1260,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
  * bytes in this file, and to maintain the list of inodes that
  * have pending delalloc work to be done.
  */
-static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
-		       unsigned long old, unsigned long bits)
+static int btrfs_set_bit_hook(struct inode *inode,
+			      struct extent_state *state, int *bits)
 {
 
 	/*
@@ -1273,17 +1269,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 	 * but in this case, we are only testeing for the DELALLOC
 	 * bit, which is only set or cleared with irqs on
 	 */
-	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+		u64 len = state->end + 1 - state->start;
 
-		spin_lock(&BTRFS_I(inode)->accounting_lock);
-		BTRFS_I(inode)->outstanding_extents++;
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+		if (*bits & EXTENT_FIRST_DELALLOC)
+			*bits &= ~EXTENT_FIRST_DELALLOC;
+		else
+			atomic_inc(&BTRFS_I(inode)->outstanding_extents);
 
 		spin_lock(&root->fs_info->delalloc_lock);
-		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
-		root->fs_info->delalloc_bytes += end - start + 1;
+		BTRFS_I(inode)->delalloc_bytes += len;
+		root->fs_info->delalloc_bytes += len;
 		if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
 				      &root->fs_info->delalloc_inodes);
@@ -1297,45 +1294,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
  * extent_io.c clear_bit_hook, see set_bit_hook for why
  */
 static int btrfs_clear_bit_hook(struct inode *inode,
-				struct extent_state *state, unsigned long bits)
+				struct extent_state *state, int *bits)
 {
 	/*
 	 * set_bit and clear bit hooks normally require _irqsave/restore
 	 * but in this case, we are only testeing for the DELALLOC
 	 * bit, which is only set or cleared with irqs on
 	 */
-	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+		u64 len = state->end + 1 - state->start;
 
-		if (bits & EXTENT_DO_ACCOUNTING) {
-			spin_lock(&BTRFS_I(inode)->accounting_lock);
-			WARN_ON(!BTRFS_I(inode)->outstanding_extents);
-			BTRFS_I(inode)->outstanding_extents--;
-			spin_unlock(&BTRFS_I(inode)->accounting_lock);
-			btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
-		}
+		if (*bits & EXTENT_FIRST_DELALLOC)
+			*bits &= ~EXTENT_FIRST_DELALLOC;
+		else if (!(*bits & EXTENT_DO_ACCOUNTING))
+			atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+
+		if (*bits & EXTENT_DO_ACCOUNTING)
+			btrfs_delalloc_release_metadata(inode, len);
+
+		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+			btrfs_free_reserved_data_space(inode, len);
 
 		spin_lock(&root->fs_info->delalloc_lock);
-		if (state->end - state->start + 1 >
-		    root->fs_info->delalloc_bytes) {
-			printk(KERN_INFO "btrfs warning: delalloc account "
-			       "%llu %llu\n",
-			       (unsigned long long)
-			       state->end - state->start + 1,
-			       (unsigned long long)
-			       root->fs_info->delalloc_bytes);
-			btrfs_delalloc_free_space(root, inode, (u64)-1);
-			root->fs_info->delalloc_bytes = 0;
-			BTRFS_I(inode)->delalloc_bytes = 0;
-		} else {
-			btrfs_delalloc_free_space(root, inode,
-						  state->end -
-						  state->start + 1);
-			root->fs_info->delalloc_bytes -= state->end -
-				state->start + 1;
-			BTRFS_I(inode)->delalloc_bytes -= state->end -
-				state->start + 1;
-		}
+		root->fs_info->delalloc_bytes -= len;
+		BTRFS_I(inode)->delalloc_bytes -= len;
+
 		if (BTRFS_I(inode)->delalloc_bytes == 0 &&
 		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
@@ -1520,6 +1504,7 @@ again:
 		goto again;
 	}
 
+	BUG();
 	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
 	ClearPageChecked(page);
 out:
@@ -1650,7 +1635,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
+	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_state *cached_state = NULL;
@@ -1668,9 +1653,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 		if (!ret) {
 			trans = btrfs_join_transaction(root, 1);
+			btrfs_set_trans_block_group(trans, inode);
+			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 			ret = btrfs_update_inode(trans, root, inode);
 			BUG_ON(ret);
-			btrfs_end_transaction(trans, root);
 		}
 		goto out;
 	}
@@ -1680,6 +1666,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 			 0, &cached_state, GFP_NOFS);
 
 	trans = btrfs_join_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
 		compressed = 1;
@@ -1711,12 +1699,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
-	/* this also removes the ordered extent from the tree */
 	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
-	btrfs_end_transaction(trans, root);
 out:
+	btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+	if (trans)
+		btrfs_end_transaction(trans, root);
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
 	/* once for the tree */
@@ -3219,11 +3208,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 
 	if ((offset & (blocksize - 1)) == 0)
 		goto out;
-	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
-	if (ret)
-		goto out;
-
-	ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 	if (ret)
 		goto out;
 
@@ -3231,8 +3216,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 again:
 	page = grab_cache_page(mapping, index);
 	if (!page) {
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 		goto out;
 	}
 
@@ -3295,8 +3279,7 @@ again:
 
 out_unlock:
 	if (ret)
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 	unlock_page(page);
 	page_cache_release(page);
 out:
@@ -4878,6 +4861,7 @@ again:
 			}
 			flush_dcache_page(page);
 		} else if (create && PageUptodate(page)) {
+			WARN_ON(1);
 			if (!trans) {
 				kunmap(page);
 				free_extent_map(em);
@@ -5142,7 +5126,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	u64 page_start;
 	u64 page_end;
 
-	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 	if (ret) {
 		if (ret == -ENOMEM)
 			ret = VM_FAULT_OOM;
@@ -5151,13 +5135,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		goto out;
 	}
 
-	ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-	if (ret) {
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-		ret = VM_FAULT_SIGBUS;
-		goto out;
-	}
-
 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
 	lock_page(page);
@@ -5167,7 +5144,6 @@ again:
 
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_start >= size)) {
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
@@ -5208,7 +5184,6 @@ again:
 		unlock_extent_cached(io_tree, page_start, page_end,
 				     &cached_state, GFP_NOFS);
 		ret = VM_FAULT_SIGBUS;
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		goto out_unlock;
 	}
 	ret = 0;
@@ -5235,10 +5210,10 @@ again:
 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
 out_unlock:
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 	if (!ret)
 		return VM_FAULT_LOCKED;
 	unlock_page(page);
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 out:
 	return ret;
 }
@@ -5383,7 +5358,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->last_unlink_trans = 0;
 
 	spin_lock_init(&ei->accounting_lock);
-	ei->outstanding_extents = 0;
+	atomic_set(&ei->outstanding_extents, 0);
 	ei->reserved_extents = 0;
 
 	ei->ordered_data_close = 0;
@@ -5411,6 +5386,8 @@ void btrfs_destroy_inode(struct inode *inode)
 
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
+	WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
+	WARN_ON(BTRFS_I(inode)->reserved_extents);
 
 	/*
 	 * This can happen where we create an inode, but somebody else also
@@ -5970,8 +5947,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 			goto out;
 	}
 
-	ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
-					  alloc_end - alloc_start);
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
 	if (ret)
 		goto out;
 
@@ -6037,8 +6013,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 			     &cached_state, GFP_NOFS);
 
-	btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
-				       alloc_end - alloc_start);
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
-- 
cgit v1.2.2


From 8929ecfa50f266163832eeacfbc3642ed5eb83b6 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Sun, 16 May 2010 10:49:58 -0400
Subject: Btrfs: Introduce global metadata reservation

Reserve metadata space for extent tree, checksum tree and root tree

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6e54665d37f7..f9c40b839452 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4060,7 +4060,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
 
-	if (root->fs_info->btree_inode == inode)
+	if (BTRFS_I(inode)->dummy_inode)
 		return 0;
 
 	if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -4081,10 +4081,19 @@ void btrfs_dirty_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
+	int ret;
+
+	if (BTRFS_I(inode)->dummy_inode)
+		return;
 
 	trans = btrfs_join_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
-	btrfs_update_inode(trans, root, inode);
+
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret)
+		printk(KERN_ERR"btrfs: fail to dirty inode %lu error %d\n",
+			inode->i_ino, ret);
+
 	btrfs_end_transaction(trans, root);
 }
 
-- 
cgit v1.2.2


From d68fc57b7e3245cfacf2e3b47acfed1946a11786 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Sun, 16 May 2010 10:49:58 -0400
Subject: Btrfs: Metadata reservation for orphan inodes

reserve metadata space for handling orphan inodes

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 299 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 257 insertions(+), 42 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f9c40b839452..bef69bedf3cf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1981,33 +1981,197 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 	up_read(&root->fs_info->cleanup_work_sem);
 }
 
+/*
+ * calculate extra metadata reservation when snapshotting a subvolume
+ * contains orphan files.
+ */
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending,
+				u64 *bytes_to_reserve)
+{
+	struct btrfs_root *root;
+	struct btrfs_block_rsv *block_rsv;
+	u64 num_bytes;
+	int index;
+
+	root = pending->root;
+	if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+		return;
+
+	block_rsv = root->orphan_block_rsv;
+
+	/* orphan block reservation for the snapshot */
+	num_bytes = block_rsv->size;
+
+	/*
+	 * after the snapshot is created, COWing tree blocks may use more
+	 * space than it frees. So we should make sure there is enough
+	 * reserved space.
+	 */
+	index = trans->transid & 0x1;
+	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+		num_bytes += block_rsv->size -
+			     (block_rsv->reserved + block_rsv->freed[index]);
+	}
+
+	*bytes_to_reserve += num_bytes;
+}
+
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending)
+{
+	struct btrfs_root *root = pending->root;
+	struct btrfs_root *snap = pending->snap;
+	struct btrfs_block_rsv *block_rsv;
+	u64 num_bytes;
+	int index;
+	int ret;
+
+	if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+		return;
+
+	/* refill source subvolume's orphan block reservation */
+	block_rsv = root->orphan_block_rsv;
+	index = trans->transid & 0x1;
+	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+		num_bytes = block_rsv->size -
+			    (block_rsv->reserved + block_rsv->freed[index]);
+		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+					      root->orphan_block_rsv,
+					      num_bytes);
+		BUG_ON(ret);
+	}
+
+	/* setup orphan block reservation for the snapshot */
+	block_rsv = btrfs_alloc_block_rsv(snap);
+	BUG_ON(!block_rsv);
+
+	btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
+	snap->orphan_block_rsv = block_rsv;
+
+	num_bytes = root->orphan_block_rsv->size;
+	ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+				      block_rsv, num_bytes);
+	BUG_ON(ret);
+
+#if 0
+	/* insert orphan item for the snapshot */
+	WARN_ON(!root->orphan_item_inserted);
+	ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+				       snap->root_key.objectid);
+	BUG_ON(ret);
+	snap->orphan_item_inserted = 1;
+#endif
+}
+
+enum btrfs_orphan_cleanup_state {
+	ORPHAN_CLEANUP_STARTED	= 1,
+	ORPHAN_CLEANUP_DONE	= 2,
+};
+
+/*
+ * This is called in transaction commmit time. If there are no orphan
+ * files in the subvolume, it removes orphan item and frees block_rsv
+ * structure.
+ */
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root)
+{
+	int ret;
+
+	if (!list_empty(&root->orphan_list) ||
+	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
+		return;
+
+	if (root->orphan_item_inserted &&
+	    btrfs_root_refs(&root->root_item) > 0) {
+		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+					    root->root_key.objectid);
+		BUG_ON(ret);
+		root->orphan_item_inserted = 0;
+	}
+
+	if (root->orphan_block_rsv) {
+		WARN_ON(root->orphan_block_rsv->size > 0);
+		btrfs_free_block_rsv(root, root->orphan_block_rsv);
+		root->orphan_block_rsv = NULL;
+	}
+}
+
 /*
  * This creates an orphan entry for the given inode in case something goes
  * wrong in the middle of an unlink/truncate.
+ *
+ * NOTE: caller of this function should reserve 5 units of metadata for
+ *	 this function.
  */
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret = 0;
+	struct btrfs_block_rsv *block_rsv = NULL;
+	int reserve = 0;
+	int insert = 0;
+	int ret;
 
-	spin_lock(&root->list_lock);
+	if (!root->orphan_block_rsv) {
+		block_rsv = btrfs_alloc_block_rsv(root);
+		BUG_ON(!block_rsv);
+	}
 
-	/* already on the orphan list, we're good */
-	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-		spin_unlock(&root->list_lock);
-		return 0;
+	spin_lock(&root->orphan_lock);
+	if (!root->orphan_block_rsv) {
+		root->orphan_block_rsv = block_rsv;
+	} else if (block_rsv) {
+		btrfs_free_block_rsv(root, block_rsv);
+		block_rsv = NULL;
 	}
 
-	list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+#if 0
+		/*
+		 * For proper ENOSPC handling, we should do orphan
+		 * cleanup when mounting. But this introduces backward
+		 * compatibility issue.
+		 */
+		if (!xchg(&root->orphan_item_inserted, 1))
+			insert = 2;
+		else
+			insert = 1;
+#endif
+		insert = 1;
+	} else {
+		WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
+	}
 
-	spin_unlock(&root->list_lock);
+	if (!BTRFS_I(inode)->orphan_meta_reserved) {
+		BTRFS_I(inode)->orphan_meta_reserved = 1;
+		reserve = 1;
+	}
+	spin_unlock(&root->orphan_lock);
 
-	/*
-	 * insert an orphan item to track this unlinked/truncated file
-	 */
-	ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+	if (block_rsv)
+		btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
 
-	return ret;
+	/* grab metadata reservation from transaction handle */
+	if (reserve) {
+		ret = btrfs_orphan_reserve_metadata(trans, inode);
+		BUG_ON(ret);
+	}
+
+	/* insert an orphan item to track this unlinked/truncated file */
+	if (insert >= 1) {
+		ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+		BUG_ON(ret);
+	}
+
+	/* insert an orphan item to track subvolume contains orphan files */
+	if (insert >= 2) {
+		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+					       root->root_key.objectid);
+		BUG_ON(ret);
+	}
+	return 0;
 }
 
 /*
@@ -2017,26 +2181,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int delete_item = 0;
+	int release_rsv = 0;
 	int ret = 0;
 
-	spin_lock(&root->list_lock);
-
-	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-		spin_unlock(&root->list_lock);
-		return 0;
+	spin_lock(&root->orphan_lock);
+	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+		list_del_init(&BTRFS_I(inode)->i_orphan);
+		delete_item = 1;
 	}
 
-	list_del_init(&BTRFS_I(inode)->i_orphan);
-	if (!trans) {
-		spin_unlock(&root->list_lock);
-		return 0;
+	if (BTRFS_I(inode)->orphan_meta_reserved) {
+		BTRFS_I(inode)->orphan_meta_reserved = 0;
+		release_rsv = 1;
 	}
+	spin_unlock(&root->orphan_lock);
 
-	spin_unlock(&root->list_lock);
+	if (trans && delete_item) {
+		ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+		BUG_ON(ret);
+	}
 
-	ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+	if (release_rsv)
+		btrfs_orphan_release_metadata(inode);
 
-	return ret;
+	return 0;
 }
 
 /*
@@ -2053,7 +2222,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 	struct inode *inode;
 	int ret = 0, nr_unlink = 0, nr_truncate = 0;
 
-	if (!xchg(&root->clean_orphans, 0))
+	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
 		return;
 
 	path = btrfs_alloc_path();
@@ -2106,16 +2275,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
 		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
-		if (IS_ERR(inode))
-			break;
+		BUG_ON(IS_ERR(inode));
 
 		/*
 		 * add this inode to the orphan list so btrfs_orphan_del does
 		 * the proper thing when we hit it
 		 */
-		spin_lock(&root->list_lock);
+		spin_lock(&root->orphan_lock);
 		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-		spin_unlock(&root->list_lock);
+		spin_unlock(&root->orphan_lock);
 
 		/*
 		 * if this is a bad inode, means we actually succeeded in
@@ -2142,13 +2310,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		/* this will do delete_inode and everything for us */
 		iput(inode);
 	}
+	btrfs_free_path(path);
+
+	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
+
+	if (root->orphan_block_rsv)
+		btrfs_block_rsv_release(root, root->orphan_block_rsv,
+					(u64)-1);
+
+	if (root->orphan_block_rsv || root->orphan_item_inserted) {
+		trans = btrfs_join_transaction(root, 1);
+		btrfs_end_transaction(trans, root);
+	}
 
 	if (nr_unlink)
 		printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
 	if (nr_truncate)
 		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
-
-	btrfs_free_path(path);
 }
 
 /*
@@ -3181,6 +3359,7 @@ out:
 	if (pending_del_nr) {
 		ret = btrfs_del_items(trans, root, path, pending_del_slot,
 				      pending_del_nr);
+		BUG_ON(ret);
 	}
 	btrfs_free_path(path);
 	return err;
@@ -3386,7 +3565,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
 		}
 	}
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
 	btrfs_set_trans_block_group(trans, inode);
 
 	ret = btrfs_orphan_add(trans, inode);
@@ -3406,8 +3588,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
 		i_size_write(inode, attr->ia_size);
 		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
-		trans = btrfs_start_transaction(root, 1);
+		trans = btrfs_start_transaction(root, 0);
+		BUG_ON(IS_ERR(trans));
 		btrfs_set_trans_block_group(trans, inode);
+		trans->block_rsv = root->orphan_block_rsv;
+		BUG_ON(!trans->block_rsv);
 
 		ret = btrfs_update_inode(trans, root, inode);
 		BUG_ON(ret);
@@ -3487,10 +3672,21 @@ void btrfs_delete_inode(struct inode *inode)
 	btrfs_i_size_write(inode, 0);
 
 	while (1) {
-		trans = btrfs_start_transaction(root, 1);
+		trans = btrfs_start_transaction(root, 0);
+		BUG_ON(IS_ERR(trans));
 		btrfs_set_trans_block_group(trans, inode);
-		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+		trans->block_rsv = root->orphan_block_rsv;
+
+		ret = btrfs_block_rsv_check(trans, root,
+					    root->orphan_block_rsv, 0, 5);
+		if (ret) {
+			BUG_ON(ret != -EAGAIN);
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+			continue;
+		}
 
+		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
 		if (ret != -EAGAIN)
 			break;
 
@@ -3498,6 +3694,7 @@ void btrfs_delete_inode(struct inode *inode)
 		btrfs_end_transaction(trans, root);
 		trans = NULL;
 		btrfs_btree_balance_dirty(root, nr);
+
 	}
 
 	if (ret == 0) {
@@ -5247,8 +5444,10 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
+	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
+	trans->block_rsv = root->orphan_block_rsv;
 
 	/*
 	 * setattr is responsible for setting the ordered_data_close flag,
@@ -5271,6 +5470,23 @@ static void btrfs_truncate(struct inode *inode)
 		btrfs_add_ordered_operation(trans, root, inode);
 
 	while (1) {
+		if (!trans) {
+			trans = btrfs_start_transaction(root, 0);
+			BUG_ON(IS_ERR(trans));
+			btrfs_set_trans_block_group(trans, inode);
+			trans->block_rsv = root->orphan_block_rsv;
+		}
+
+		ret = btrfs_block_rsv_check(trans, root,
+					    root->orphan_block_rsv, 0, 5);
+		if (ret) {
+			BUG_ON(ret != -EAGAIN);
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+			trans = NULL;
+			continue;
+		}
+
 		ret = btrfs_truncate_inode_items(trans, root, inode,
 						 inode->i_size,
 						 BTRFS_EXTENT_DATA_KEY);
@@ -5282,10 +5498,8 @@ static void btrfs_truncate(struct inode *inode)
 
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
+		trans = NULL;
 		btrfs_btree_balance_dirty(root, nr);
-
-		trans = btrfs_start_transaction(root, 1);
-		btrfs_set_trans_block_group(trans, inode);
 	}
 
 	if (ret == 0 && inode->i_nlink > 0) {
@@ -5371,6 +5585,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->reserved_extents = 0;
 
 	ei->ordered_data_close = 0;
+	ei->orphan_meta_reserved = 0;
 	ei->dummy_inode = 0;
 	ei->force_compress = 0;
 
@@ -5417,13 +5632,13 @@ void btrfs_destroy_inode(struct inode *inode)
 		spin_unlock(&root->fs_info->ordered_extent_lock);
 	}
 
-	spin_lock(&root->list_lock);
+	spin_lock(&root->orphan_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
 		       inode->i_ino);
 		list_del_init(&BTRFS_I(inode)->i_orphan);
 	}
-	spin_unlock(&root->list_lock);
+	spin_unlock(&root->orphan_lock);
 
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
-- 
cgit v1.2.2


From efa56464562991b8c24f965199888806bd8c4b38 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Sun, 16 May 2010 10:49:59 -0400
Subject: Btrfs: Pre-allocate space for data relocation

Pre-allocate space for data relocation. This can detect ENOPSC
condition caused by fragmentation of free space.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 44 ++++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 20 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bef69bedf3cf..460dd512eebd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1175,6 +1175,13 @@ out_check:
 					       num_bytes, num_bytes, type);
 		BUG_ON(ret);
 
+		if (root->root_key.objectid ==
+		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
+			ret = btrfs_reloc_clone_csums(inode, cur_offset,
+						      num_bytes);
+			BUG_ON(ret);
+		}
+
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 				cur_offset, cur_offset + num_bytes - 1,
 				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -6080,16 +6087,15 @@ out_unlock:
 	return err;
 }
 
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
-			u64 alloc_hint, int mode, loff_t actual_len)
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+			      u64 start, u64 num_bytes, u64 min_size,
+			      loff_t actual_len, u64 *alloc_hint)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 cur_offset = start;
-	u64 num_bytes = end - start;
 	int ret = 0;
-	u64 i_size;
 
 	while (num_bytes > 0) {
 		trans = btrfs_start_transaction(root, 3);
@@ -6098,9 +6104,8 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 			break;
 		}
 
-		ret = btrfs_reserve_extent(trans, root, num_bytes,
-					   root->sectorsize, 0, alloc_hint,
-					   (u64)-1, &ins, 1);
+		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
+					   0, *alloc_hint, (u64)-1, &ins, 1);
 		if (ret) {
 			btrfs_end_transaction(trans, root);
 			break;
@@ -6117,20 +6122,19 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
-		alloc_hint = ins.objectid + ins.offset;
+		*alloc_hint = ins.objectid + ins.offset;
 
 		inode->i_ctime = CURRENT_TIME;
 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-			(actual_len > inode->i_size) &&
-			(cur_offset > inode->i_size)) {
-
+		    (actual_len > inode->i_size) &&
+		    (cur_offset > inode->i_size)) {
 			if (cur_offset > actual_len)
-				i_size  = actual_len;
+				i_size_write(inode, actual_len);
 			else
-				i_size = cur_offset;
-			i_size_write(inode, i_size);
-			btrfs_ordered_update_i_size(inode, i_size, NULL);
+				i_size_write(inode, cur_offset);
+			i_size_write(inode, cur_offset);
+			btrfs_ordered_update_i_size(inode, cur_offset, NULL);
 		}
 
 		ret = btrfs_update_inode(trans, root, inode);
@@ -6216,16 +6220,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-			ret = prealloc_file_range(inode,
-						  cur_offset, last_byte,
-						alloc_hint, mode, offset+len);
+			ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
+							last_byte - cur_offset,
+							1 << inode->i_blkbits,
+							offset + len,
+							&alloc_hint);
 			if (ret < 0) {
 				free_extent_map(em);
 				break;
 			}
 		}
-		if (em->block_start <= EXTENT_MAP_LAST_BYTE)
-			alloc_hint = em->block_start;
 		free_extent_map(em);
 
 		cur_offset = last_byte;
-- 
cgit v1.2.2


From 4b46fce23349bfca781a32e2707a18328ca5ae22 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Sun, 23 May 2010 11:00:55 -0400
Subject: Btrfs: add basic DIO read/write support

This provides basic DIO support for reading and writing.  It does not do the
work to recover from mismatching checksums, that will come later.  A few design
changes have been made from Jim's code (sorry Jim!)

1) Use the generic direct-io code.  Jim originally re-wrote all the generic DIO
code in order to account for all of BTRFS's oddities, but thanks to that work it
seems like the best bet is to just ignore compression and such and just opt to
fallback on buffered IO.

2) Fallback on buffered IO for compressed or inline extents.  Jim's code did
it's own buffering to make dio with compressed extents work.  Now we just
fallback onto normal buffered IO.

3) Use ordered extents for the writes so that all of the

lock_extent()
lookup_ordered()

type checks continue to work.

4) Do the lock_extent() lookup_ordered() loop in readpage so we don't race with
DIO writes.

I've tested this with fsx and everything works great.  This patch depends on my
dio and filemap.c patches to work.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 487 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 463 insertions(+), 24 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 460dd512eebd..1695440a59a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -698,6 +698,38 @@ retry:
 	return 0;
 }
 
+static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+				      u64 num_bytes)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	u64 alloc_hint = 0;
+
+	read_lock(&em_tree->lock);
+	em = search_extent_mapping(em_tree, start, num_bytes);
+	if (em) {
+		/*
+		 * if block start isn't an actual block number then find the
+		 * first block in this inode and use that as a hint.  If that
+		 * block is also bogus then just don't worry about it.
+		 */
+		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+			free_extent_map(em);
+			em = search_extent_mapping(em_tree, 0, 0);
+			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+				alloc_hint = em->block_start;
+			if (em)
+				free_extent_map(em);
+		} else {
+			alloc_hint = em->block_start;
+			free_extent_map(em);
+		}
+	}
+	read_unlock(&em_tree->lock);
+
+	return alloc_hint;
+}
+
 /*
  * when extent_io.c finds a delayed allocation range in the file,
  * the call backs end up in this code.  The basic idea is to
@@ -770,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
 	BUG_ON(disk_num_bytes >
 	       btrfs_super_total_bytes(&root->fs_info->super_copy));
 
-
-	read_lock(&BTRFS_I(inode)->extent_tree.lock);
-	em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
-				   start, num_bytes);
-	if (em) {
-		/*
-		 * if block start isn't an actual block number then find the
-		 * first block in this inode and use that as a hint.  If that
-		 * block is also bogus then just don't worry about it.
-		 */
-		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-			free_extent_map(em);
-			em = search_extent_mapping(em_tree, 0, 0);
-			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
-				alloc_hint = em->block_start;
-			if (em)
-				free_extent_map(em);
-		} else {
-			alloc_hint = em->block_start;
-			free_extent_map(em);
-		}
-	}
-	read_unlock(&BTRFS_I(inode)->extent_tree.lock);
+	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
 	while (disk_num_bytes > 0) {
@@ -5171,11 +5181,440 @@ out:
 	return em;
 }
 
+static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+						  u64 start, u64 len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_key ins;
+	u64 alloc_hint;
+	int ret;
+
+	btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+
+	trans = btrfs_join_transaction(root, 0);
+	if (!trans)
+		return ERR_PTR(-ENOMEM);
+
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	alloc_hint = get_extent_allocation_hint(inode, start, len);
+	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+				   alloc_hint, (u64)-1, &ins, 1);
+	if (ret) {
+		em = ERR_PTR(ret);
+		goto out;
+	}
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em) {
+		em = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	em->start = start;
+	em->orig_start = em->start;
+	em->len = ins.offset;
+
+	em->block_start = ins.objectid;
+	em->block_len = ins.offset;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	while (1) {
+		write_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		if (ret != -EEXIST)
+			break;
+		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+	}
+
+	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+					   ins.offset, ins.offset, 0);
+	if (ret) {
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+		em = ERR_PTR(ret);
+	}
+out:
+	btrfs_end_transaction(trans, root);
+	return em;
+}
+
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start = iblock << inode->i_blkbits;
+	u64 len = bh_result->b_size;
+
+	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	/*
+	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+	 * io.  INLINE is special, and we could probably kludge it in here, but
+	 * it's still buffered so for safety lets just fall back to the generic
+	 * buffered path.
+	 *
+	 * For COMPRESSED we _have_ to read the entire extent in so we can
+	 * decompress it, so there will be buffering required no matter what we
+	 * do, so go ahead and fallback to buffered.
+	 *
+	 * We return -ENOTBLK because thats what makes DIO go ahead and go back
+	 * to buffered IO.  Don't blame me, this is the price we pay for using
+	 * the generic code.
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+	    em->block_start == EXTENT_MAP_INLINE) {
+		free_extent_map(em);
+		return -ENOTBLK;
+	}
+
+	/* Just a good old fashioned hole, return */
+	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+		free_extent_map(em);
+		/* DIO will do one hole at a time, so just unlock a sector */
+		unlock_extent(&BTRFS_I(inode)->io_tree, start,
+			      start + root->sectorsize - 1, GFP_NOFS);
+		return 0;
+	}
+
+	/*
+	 * We don't allocate a new extent in the following cases
+	 *
+	 * 1) The inode is marked as NODATACOW.  In this case we'll just use the
+	 * existing extent.
+	 * 2) The extent is marked as PREALLOC.  We're good to go here and can
+	 * just use the extent.
+	 *
+	 */
+	if (!create)
+		goto map;
+
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+	     em->block_start != EXTENT_MAP_HOLE)) {
+		u64 block_start;
+		int type;
+		int ret;
+
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			type = BTRFS_ORDERED_PREALLOC;
+		else
+			type = BTRFS_ORDERED_NOCOW;
+		len = min(len, em->block_len - (start - em->start));
+		block_start = em->block_start + (start - em->start);
+		ret = btrfs_add_ordered_extent_dio(inode, start,
+						   start, len, len, type);
+		if (ret) {
+			free_extent_map(em);
+			return ret;
+		}
+	} else {
+		free_extent_map(em);
+		em = btrfs_new_extent_direct(inode, start, len);
+		if (IS_ERR(em))
+			return PTR_ERR(em);
+		len = min(len, em->block_len);
+	}
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+		      GFP_NOFS);
+map:
+	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+		inode->i_blkbits;
+	bh_result->b_size = em->len - (start - em->start);
+	bh_result->b_bdev = em->bdev;
+	set_buffer_mapped(bh_result);
+	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		set_buffer_new(bh_result);
+
+	free_extent_map(em);
+
+	return 0;
+}
+
+struct btrfs_dio_private {
+	struct inode *inode;
+	u64 logical_offset;
+	u64 disk_bytenr;
+	u64 bytes;
+	u32 *csums;
+	void *private;
+};
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start;
+	u32 *private = dip->csums;
+
+	start = dip->logical_offset;
+	do {
+		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+			struct page *page = bvec->bv_page;
+			char *kaddr;
+			u32 csum = ~(u32)0;
+			unsigned long flags;
+
+			local_irq_save(flags);
+			kaddr = kmap_atomic(page, KM_IRQ0);
+			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+					       csum, bvec->bv_len);
+			btrfs_csum_final(csum, (char *)&csum);
+			kunmap_atomic(kaddr, KM_IRQ0);
+			local_irq_restore(flags);
+
+			flush_dcache_page(bvec->bv_page);
+			if (csum != *private) {
+				printk(KERN_ERR "btrfs csum failed ino %lu off"
+				      " %llu csum %u private %u\n",
+				      inode->i_ino, (unsigned long long)start,
+				      csum, *private);
+				err = -EIO;
+			}
+		}
+
+		start += bvec->bv_len;
+		private++;
+		bvec++;
+	} while (bvec <= bvec_end);
+
+	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+		      dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static void btrfs_endio_direct_write(struct bio *bio, int err)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_ordered_extent *ordered = NULL;
+	struct extent_state *cached_state = NULL;
+	int ret;
+
+	if (err)
+		goto out_done;
+
+	ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+					     dip->logical_offset, dip->bytes);
+	if (!ret)
+		goto out_done;
+
+	BUG_ON(!ordered);
+
+	trans = btrfs_join_transaction(root, 1);
+	if (!trans) {
+		err = -ENOMEM;
+		goto out;
+	}
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+		ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+		if (!ret)
+			ret = btrfs_update_inode(trans, root, inode);
+		err = ret;
+		goto out;
+	}
+
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+			 ordered->file_offset + ordered->len - 1, 0,
+			 &cached_state, GFP_NOFS);
+
+	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+		ret = btrfs_mark_extent_written(trans, inode,
+						ordered->file_offset,
+						ordered->file_offset +
+						ordered->len);
+		if (ret) {
+			err = ret;
+			goto out_unlock;
+		}
+	} else {
+		ret = insert_reserved_file_extent(trans, inode,
+						  ordered->file_offset,
+						  ordered->start,
+						  ordered->disk_len,
+						  ordered->len,
+						  ordered->len,
+						  0, 0, 0,
+						  BTRFS_FILE_EXTENT_REG);
+		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+				   ordered->file_offset, ordered->len);
+		if (ret) {
+			err = ret;
+			WARN_ON(1);
+			goto out_unlock;
+		}
+	}
+
+	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
+	btrfs_ordered_update_i_size(inode, 0, ordered);
+	btrfs_update_inode(trans, root, inode);
+out_unlock:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+			     ordered->file_offset + ordered->len - 1,
+			     &cached_state, GFP_NOFS);
+out:
+	btrfs_delalloc_release_metadata(inode, ordered->len);
+	btrfs_end_transaction(trans, root);
+	btrfs_put_ordered_extent(ordered);
+	btrfs_put_ordered_extent(ordered);
+out_done:
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
+				loff_t file_offset)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_dio_private *dip;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	u64 start;
+	int skip_sum;
+	int write = rw & (1 << BIO_RW);
+	int ret = 0;
+
+	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	dip = kmalloc(sizeof(*dip), GFP_NOFS);
+	if (!dip) {
+		ret = -ENOMEM;
+		goto free_ordered;
+	}
+	dip->csums = NULL;
+
+	if (!skip_sum) {
+		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+		if (!dip->csums) {
+			ret = -ENOMEM;
+			goto free_ordered;
+		}
+	}
+
+	dip->private = bio->bi_private;
+	dip->inode = inode;
+	dip->logical_offset = file_offset;
+
+	start = dip->logical_offset;
+	dip->bytes = 0;
+	do {
+		dip->bytes += bvec->bv_len;
+		bvec++;
+	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+
+	dip->disk_bytenr = bio->bi_sector << 9;
+	bio->bi_private = dip;
+
+	if (write)
+		bio->bi_end_io = btrfs_endio_direct_write;
+	else
+		bio->bi_end_io = btrfs_endio_direct_read;
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	if (ret)
+		goto out_err;
+
+	if (write && !skip_sum)
+		btrfs_csum_one_bio(root, inode, bio, dip->logical_offset, 1);
+	else if (!skip_sum)
+		btrfs_lookup_bio_sums_dio(root, inode, bio,
+					  dip->logical_offset, dip->csums);
+
+	ret = btrfs_map_bio(root, rw, bio, 0, 0);
+	if (ret)
+		goto out_err;
+	return;
+out_err:
+	kfree(dip->csums);
+	kfree(dip);
+free_ordered:
+	/*
+	 * If this is a write, we need to clean up the reserved space and kill
+	 * the ordered extent.
+	 */
+	if (write) {
+		struct btrfs_ordered_extent *ordered;
+		ordered = btrfs_lookup_ordered_extent(inode,
+						      dip->logical_offset);
+		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+			btrfs_free_reserved_extent(root, ordered->start,
+						   ordered->disk_len);
+		btrfs_put_ordered_extent(ordered);
+		btrfs_put_ordered_extent(ordered);
+	}
+	bio_endio(bio, ret);
+}
+
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
-	return -EINVAL;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct btrfs_ordered_extent *ordered;
+	u64 lockstart, lockend;
+	ssize_t ret;
+
+	lockstart = offset;
+	lockend = offset + iov_length(iov, nr_segs) - 1;
+	while (1) {
+		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			    GFP_NOFS);
+		/*
+		 * We're concerned with the entire range that we're going to be
+		 * doing DIO to, so we need to make sure theres no ordered
+		 * extents in this range.
+		 */
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+		if (!ordered)
+			break;
+		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			      GFP_NOFS);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		cond_resched();
+	}
+
+	ret = __blockdev_direct_IO(rw, iocb, inode, NULL, iov, offset, nr_segs,
+				   btrfs_get_blocks_direct, NULL,
+				   btrfs_submit_direct, 0);
+
+	if (ret < 0 && ret != -EIOCBQUEUED) {
+		unlock_extent(&BTRFS_I(inode)->io_tree, offset,
+			      offset + iov_length(iov, nr_segs) - 1, GFP_NOFS);
+	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
+		/*
+		 * We're falling back to buffered, unlock the section we didn't
+		 * do IO on.
+		 */
+		unlock_extent(&BTRFS_I(inode)->io_tree, offset + ret,
+			      offset + iov_length(iov, nr_segs) - 1, GFP_NOFS);
+	}
+
+	return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-- 
cgit v1.2.2


From eaf25d933e64c2bf3c79b83e8820404f36fdfc52 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 25 May 2010 09:48:28 -0400
Subject: Btrfs: use async helpers for DIO write checksumming

The async helper threads offload crc work onto all the
CPUs, and make streaming writes much faster.  This
changes the O_DIRECT write code to use them.  The only
small complication was that we need to pass in the
logical offset in the file for each bio, because we can't
find it in the bio's pages.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 40 +++++++++++++++++++++++++++++++---------
 1 file changed, 31 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1695440a59a4..13a4aa222861 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1385,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  */
 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
 				    struct bio *bio, int mirror_num,
-				    unsigned long bio_flags)
+				    unsigned long bio_flags,
+				    u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -1404,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
  * are inserted into the btree
  */
 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num, unsigned long bio_flags)
+			  int mirror_num, unsigned long bio_flags,
+			  u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1415,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
  * on write, or reading the csums from the tree before a read
  */
 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num, unsigned long bio_flags)
+			  int mirror_num, unsigned long bio_flags,
+			  u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -1440,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		/* we're doing a write, do the async checksumming */
 		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
-				   bio_flags, __btrfs_submit_bio_start,
+				   bio_flags, bio_offset,
+				   __btrfs_submit_bio_start,
 				   __btrfs_submit_bio_done);
 	}
 
@@ -1844,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 
 	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
 						      failrec->last_mirror,
-						      failrec->bio_flags);
+						      failrec->bio_flags, 0);
 	return 0;
 }
 
@@ -5484,6 +5488,17 @@ out_done:
 	dio_end_io(bio, err);
 }
 
+static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+				    struct bio *bio, int mirror_num,
+				    unsigned long bio_flags, u64 offset)
+{
+	int ret;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
+	BUG_ON(ret);
+	return 0;
+}
+
 static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
 				loff_t file_offset)
 {
@@ -5535,13 +5550,20 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
 	if (ret)
 		goto out_err;
 
-	if (write && !skip_sum)
-		btrfs_csum_one_bio(root, inode, bio, dip->logical_offset, 1);
-	else if (!skip_sum)
+	if (write && !skip_sum) {
+		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+				   inode, rw, bio, 0, 0,
+				   dip->logical_offset,
+				   __btrfs_submit_bio_start_direct_io,
+				   __btrfs_submit_bio_done);
+		if (ret)
+			goto out_err;
+		return;
+	} else if (!skip_sum)
 		btrfs_lookup_bio_sums_dio(root, inode, bio,
 					  dip->logical_offset, dip->csums);
 
-	ret = btrfs_map_bio(root, rw, bio, 0, 0);
+	ret = btrfs_map_bio(root, rw, bio, 0, 1);
 	if (ret)
 		goto out_err;
 	return;
-- 
cgit v1.2.2


From 4845e44ffdb26be9b25610664228e8ecaf949a0d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 25 May 2010 20:56:50 -0400
Subject: Btrfs: rework O_DIRECT enospc handling

This changes O_DIRECT write code to mark extents as delalloc
while it is processing them.  Yan Zheng has reworked the
enospc accounting based on tracking delalloc extents and
this makes it much easier to track enospc in the O_DIRECT code.

There are a few space cases with the O_DIRECT code though,
it only sets the EXTENT_DELALLOC bits, instead of doing
EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, because
we don't want to mess with clearing the dirty and uptodate
bits when things go wrong.  This is important because there
are no pages in the page cache, so any extent state structs
that we put in the tree won't get freed by releasepage.  We have
to clear them ourselves as the DIO ends.

With this commit, we reserve space at in btrfs_file_aio_write,
and then as each btrfs_direct_IO call progresses it sets
EXTENT_DELALLOC on the range.

btrfs_get_blocks_direct is responsible for clearing the delalloc
at the same time it drops the extent lock.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 41 insertions(+), 11 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 13a4aa222861..00aefbdcc2df 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5327,8 +5327,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 			return PTR_ERR(em);
 		len = min(len, em->block_len);
 	}
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1,
-		      GFP_NOFS);
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+			  EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
+			  0, NULL, GFP_NOFS);
 map:
 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
 		inode->i_blkbits;
@@ -5596,14 +5597,18 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	u64 lockstart, lockend;
 	ssize_t ret;
+	int writing = rw & WRITE;
+	int write_bits = 0;
 
 	lockstart = offset;
 	lockend = offset + iov_length(iov, nr_segs) - 1;
+
 	while (1) {
-		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-			    GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 0, &cached_state, GFP_NOFS);
 		/*
 		 * We're concerned with the entire range that we're going to be
 		 * doing DIO to, so we need to make sure theres no ordered
@@ -5613,29 +5618,54 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 						     lockend - lockstart + 1);
 		if (!ordered)
 			break;
-		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-			      GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				     &cached_state, GFP_NOFS);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
 		cond_resched();
 	}
 
+	/*
+	 * we don't use btrfs_set_extent_delalloc because we don't want
+	 * the dirty or uptodate bits
+	 */
+	if (writing) {
+		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
+		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				     EXTENT_DELALLOC, 0, NULL, &cached_state,
+				     GFP_NOFS);
+		if (ret) {
+			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+					 lockend, EXTENT_LOCKED | write_bits,
+					 1, 0, &cached_state, GFP_NOFS);
+			goto out;
+		}
+	}
+
+	free_extent_state(cached_state);
+	cached_state = NULL;
+
 	ret = __blockdev_direct_IO(rw, iocb, inode, NULL, iov, offset, nr_segs,
 				   btrfs_get_blocks_direct, NULL,
 				   btrfs_submit_direct, 0);
 
 	if (ret < 0 && ret != -EIOCBQUEUED) {
-		unlock_extent(&BTRFS_I(inode)->io_tree, offset,
-			      offset + iov_length(iov, nr_segs) - 1, GFP_NOFS);
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
+			      offset + iov_length(iov, nr_segs) - 1,
+			      EXTENT_LOCKED | write_bits, 1, 0,
+			      &cached_state, GFP_NOFS);
 	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
 		/*
 		 * We're falling back to buffered, unlock the section we didn't
 		 * do IO on.
 		 */
-		unlock_extent(&BTRFS_I(inode)->io_tree, offset + ret,
-			      offset + iov_length(iov, nr_segs) - 1, GFP_NOFS);
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
+			      offset + iov_length(iov, nr_segs) - 1,
+			      EXTENT_LOCKED | write_bits, 1, 0,
+			      &cached_state, GFP_NOFS);
 	}
-
+out:
+	free_extent_state(cached_state);
 	return ret;
 }
 
-- 
cgit v1.2.2


From 3f7c579c41a3d20af76fd6ff1f6b949edf105fd1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 26 May 2010 10:59:53 -0400
Subject: Btrfs: move O_DIRECT space reservation to btrfs_direct_IO

This moves the delalloc space reservation done for O_DIRECT
into btrfs_direct_IO.  This way we don't leak reserved space
if the generic O_DIRECT write code errors out before it
calls into btrfs_direct_IO.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 00aefbdcc2df..ca9d5501d340 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5602,9 +5602,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	ssize_t ret;
 	int writing = rw & WRITE;
 	int write_bits = 0;
+	size_t count = iov_length(iov, nr_segs);
 
 	lockstart = offset;
-	lockend = offset + iov_length(iov, nr_segs) - 1;
+	lockend = offset + count - 1;
+
+	if (writing) {
+		ret = btrfs_delalloc_reserve_space(inode, count);
+		if (ret)
+			goto out;
+	}
 
 	while (1) {
 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-- 
cgit v1.2.2


From 94b604429a6cd94ddc128fa2772c57209bb1318f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 26 May 2010 11:02:00 -0400
Subject: Btrfs: avoid ENOSPC errors in btrfs_dirty_inode

btrfs_dirty_inode tries to sneak in without much waiting or
space reservation, mostly for performance reasons.  This
usually works well but can cause problems when there are
many many writers.

When btrfs_update_inode fails with ENOSPC, we fallback
to a slower btrfs_start_transaction call that will reserve
some space.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ca9d5501d340..5d62f21b2e45 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4308,10 +4308,18 @@ void btrfs_dirty_inode(struct inode *inode)
 	btrfs_set_trans_block_group(trans, inode);
 
 	ret = btrfs_update_inode(trans, root, inode);
-	if (ret)
-		printk(KERN_ERR"btrfs: fail to dirty inode %lu error %d\n",
-			inode->i_ino, ret);
+	if (ret && ret == -ENOSPC) {
+		/* whoops, lets try again with the full transaction */
+		btrfs_end_transaction(trans, root);
+		trans = btrfs_start_transaction(root, 1);
+		btrfs_set_trans_block_group(trans, inode);
 
+		ret = btrfs_update_inode(trans, root, inode);
+		if (ret) {
+			printk(KERN_ERR"btrfs: fail to dirty inode %lu error %d\n",
+				inode->i_ino, ret);
+		}
+	}
 	btrfs_end_transaction(trans, root);
 }
 
-- 
cgit v1.2.2


From 46bfbb5c0740c200d2b66afcbd1f9d64ab320940 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 26 May 2010 11:04:10 -0400
Subject: Btrfs: fix preallocation and nodatacow checks in O_DIRECT

The O_DIRECT code wasn't checking for multiple references
on preallocated or nodatacow extents.  This means it
wasn't honoring snapshots properly.

The fix here is to add an explicit check for multiple references
This also fixes the math for selecting the correct disk block,
making sure not to go past the end of the extent.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 156 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 140 insertions(+), 16 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5d62f21b2e45..5ab120d544bc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5255,6 +5255,106 @@ out:
 	return em;
 }
 
+/*
+ * returns 1 when the nocow is safe, < 1 on error, 0 if the
+ * block must be cow'd
+ */
+static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
+				      struct inode *inode, u64 offset, u64 len)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct extent_buffer *leaf;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	u64 disk_bytenr;
+	u64 backref_offset;
+	u64 extent_end;
+	u64 num_bytes;
+	int slot;
+	int found_type;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+				       offset, 0);
+	if (ret < 0)
+		goto out;
+
+	slot = path->slots[0];
+	if (ret == 1) {
+		if (slot == 0) {
+			/* can't find the item, must cow */
+			ret = 0;
+			goto out;
+		}
+		slot--;
+	}
+	ret = 0;
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (key.objectid != inode->i_ino ||
+	    key.type != BTRFS_EXTENT_DATA_KEY) {
+		/* not our file or wrong item type, must cow */
+		goto out;
+	}
+
+	if (key.offset > offset) {
+		/* Wrong offset, must cow */
+		goto out;
+	}
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+	found_type = btrfs_file_extent_type(leaf, fi);
+	if (found_type != BTRFS_FILE_EXTENT_REG &&
+	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+		/* not a regular extent, must cow */
+		goto out;
+	}
+	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+	backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+	if (extent_end < offset + len) {
+		/* extent doesn't include our full range, must cow */
+		goto out;
+	}
+
+	if (btrfs_extent_readonly(root, disk_bytenr))
+		goto out;
+
+	/*
+	 * look for other files referencing this extent, if we
+	 * find any we must cow
+	 */
+	if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+				  key.offset - backref_offset, disk_bytenr))
+		goto out;
+
+	/*
+	 * adjust disk_bytenr and num_bytes to cover just the bytes
+	 * in this extent we are about to write.  If there
+	 * are any csums in that range we have to cow in order
+	 * to keep the csums correct
+	 */
+	disk_bytenr += backref_offset;
+	disk_bytenr += offset - key.offset;
+	num_bytes = min(offset + len, extent_end) - offset;
+	if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+				goto out;
+	/*
+	 * all of the above have passed, it is safe to overwrite this extent
+	 * without cow
+	 */
+	ret = 1;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
@@ -5262,6 +5362,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 start = iblock << inode->i_blkbits;
 	u64 len = bh_result->b_size;
+	struct btrfs_trans_handle *trans;
 
 	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
 	if (IS_ERR(em))
@@ -5306,42 +5407,65 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	 * just use the extent.
 	 *
 	 */
-	if (!create)
+	if (!create) {
+		len = em->len - (start - em->start);
 		goto map;
+	}
 
 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
 	     em->block_start != EXTENT_MAP_HOLE)) {
-		u64 block_start;
 		int type;
 		int ret;
+		u64 block_start;
 
 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
 			type = BTRFS_ORDERED_PREALLOC;
 		else
 			type = BTRFS_ORDERED_NOCOW;
-		len = min(len, em->block_len - (start - em->start));
+		len = min(len, em->len - (start - em->start));
 		block_start = em->block_start + (start - em->start);
-		ret = btrfs_add_ordered_extent_dio(inode, start,
-						   start, len, len, type);
-		if (ret) {
-			free_extent_map(em);
-			return ret;
+
+		/*
+		 * we're not going to log anything, but we do need
+		 * to make sure the current transaction stays open
+		 * while we look for nocow cross refs
+		 */
+		trans = btrfs_join_transaction(root, 0);
+		if (!trans)
+			goto must_cow;
+
+		if (can_nocow_odirect(trans, inode, start, len) == 1) {
+			ret = btrfs_add_ordered_extent_dio(inode, start,
+					   block_start, len, len, type);
+			btrfs_end_transaction(trans, root);
+			if (ret) {
+				free_extent_map(em);
+				return ret;
+			}
+			goto unlock;
 		}
-	} else {
-		free_extent_map(em);
-		em = btrfs_new_extent_direct(inode, start, len);
-		if (IS_ERR(em))
-			return PTR_ERR(em);
-		len = min(len, em->block_len);
+		btrfs_end_transaction(trans, root);
 	}
+must_cow:
+	/*
+	 * this will cow the extent, reset the len in case we changed
+	 * it above
+	 */
+	len = bh_result->b_size;
+	free_extent_map(em);
+	em = btrfs_new_extent_direct(inode, start, len);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+	len = min(len, em->len - (start - em->start));
+unlock:
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
 			  EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
 			  0, NULL, GFP_NOFS);
 map:
 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
 		inode->i_blkbits;
-	bh_result->b_size = em->len - (start - em->start);
+	bh_result->b_size = len;
 	bh_result->b_bdev = em->bdev;
 	set_buffer_mapped(bh_result);
 	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
@@ -5547,7 +5671,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
 		bvec++;
 	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
 
-	dip->disk_bytenr = bio->bi_sector << 9;
+	dip->disk_bytenr = (u64)bio->bi_sector << 9;
 	bio->bi_private = dip;
 
 	if (write)
-- 
cgit v1.2.2


From 5a5f79b57069c5691f5b6fd8381fdf487f548ae5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 26 May 2010 21:33:37 -0400
Subject: Btrfs: allow unaligned DIO

In order to support DIO that isn't aligned to the filesystem blocksize,
we fall back to buffered for any unaligned DIOs.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5ab120d544bc..6866c36c26fb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5722,6 +5722,32 @@ free_ordered:
 	bio_endio(bio, ret);
 }
 
+static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
+			const struct iovec *iov, loff_t offset,
+			unsigned long nr_segs)
+{
+	int seg;
+	size_t size;
+	unsigned long addr;
+	unsigned blocksize_mask = root->sectorsize - 1;
+	ssize_t retval = -EINVAL;
+	loff_t end = offset;
+
+	if (offset & blocksize_mask)
+		goto out;
+
+	/* Check the memory alignment.  Blocks cannot straddle pages */
+	for (seg = 0; seg < nr_segs; seg++) {
+		addr = (unsigned long)iov[seg].iov_base;
+		size = iov[seg].iov_len;
+		end += size;
+		if ((addr & blocksize_mask) || (size & blocksize_mask)) 
+			goto out;
+	}
+	retval = 0;
+out:
+	return retval;
+}
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
@@ -5736,6 +5762,11 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	int write_bits = 0;
 	size_t count = iov_length(iov, nr_segs);
 
+	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
+			    offset, nr_segs)) {
+		return 0;
+	}
+
 	lockstart = offset;
 	lockend = offset + count - 1;
 
@@ -5784,9 +5815,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	free_extent_state(cached_state);
 	cached_state = NULL;
 
-	ret = __blockdev_direct_IO(rw, iocb, inode, NULL, iov, offset, nr_segs,
-				   btrfs_get_blocks_direct, NULL,
-				   btrfs_submit_direct, 0);
+	ret = __blockdev_direct_IO(rw, iocb, inode,
+		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+		   btrfs_submit_direct, 0);
 
 	if (ret < 0 && ret != -EIOCBQUEUED) {
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
-- 
cgit v1.2.2


From 9aeead73782c4b8e2a91def36dbf95db28605c95 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 27 May 2010 10:23:00 -0400
Subject: Btrfs: add more error checking to btrfs_dirty_inode

The ENOSPC code will now return ENOSPC to btrfs_start_transaction.
btrfs_dirty_inode needs to check for this and error out appropriately.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6866c36c26fb..2551b8018399 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4312,12 +4312,23 @@ void btrfs_dirty_inode(struct inode *inode)
 		/* whoops, lets try again with the full transaction */
 		btrfs_end_transaction(trans, root);
 		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			if (printk_ratelimit()) {
+				printk(KERN_ERR "btrfs: fail to "
+				       "dirty  inode %lu error %ld\n",
+				       inode->i_ino, PTR_ERR(trans));
+			}
+			return;
+		}
 		btrfs_set_trans_block_group(trans, inode);
 
 		ret = btrfs_update_inode(trans, root, inode);
 		if (ret) {
-			printk(KERN_ERR"btrfs: fail to dirty inode %lu error %d\n",
-				inode->i_ino, ret);
+			if (printk_ratelimit()) {
+				printk(KERN_ERR "btrfs: fail to "
+				       "dirty  inode %lu error %d\n",
+				       inode->i_ino, ret);
+			}
 		}
 	}
 	btrfs_end_transaction(trans, root);
-- 
cgit v1.2.2


From 836097797236fd727f82ec2f3f376ac41a430876 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 7 Jun 2010 18:26:37 +0000
Subject: Btrfs: fix fallocate regression

Seems that when btrfs_fallocate was converted to use the new ENOSPC stuff we
dropped passing the mode to the function that actually does the preallocation.
This breaks anybody who wants to use FALLOC_FL_KEEP_SIZE.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2551b8018399..d999c538cdc2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6893,7 +6893,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-			ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
+			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
 							last_byte - cur_offset,
 							1 << inode->i_blkbits,
 							offset + len,
-- 
cgit v1.2.2


From 0e4dcbef1c0c3e29f9c7f824359445d385b2649a Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Tue, 1 Jun 2010 08:23:11 +0000
Subject: Btrfs: uninitialized data is check_path_shared()

refs can be used with uninitialized data if btrfs_lookup_extent_info()
fails on the first pass through the loop.  In the original code if that
happens then check_path_shared() probably returns 1, this patch
changes it to return 1 for safety.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d999c538cdc2..f08427c70a78 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2673,7 +2673,7 @@ static int check_path_shared(struct btrfs_root *root,
 	struct extent_buffer *eb;
 	int level;
 	int ret;
-	u64 refs;
+	u64 refs = 1;
 
 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
 		if (!path->nodes[level])
-- 
cgit v1.2.2


From 7b6d91daee5cac6402186ff224c3af39d79f4a0e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 7 Aug 2010 18:20:39 +0200
Subject: block: unify flags for struct bio and struct request

Remove the current bio flags and reuse the request flags for the bio, too.
This allows to more easily trace the type of I/O from the filesystem
down to the block driver.  There were two flags in the bio that were
missing in the requests:  BIO_RW_UNPLUG and BIO_RW_AHEAD.  Also I've
renamed two request flags that had a superflous RW in them.

Note that the flags are in bio.h despite having the REQ_ name - as
blkdev.h includes bio.h that is the only way to go for now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/btrfs/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1bff92ad4744..e975d7180a88 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1429,7 +1429,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
-	if (!(rw & (1 << BIO_RW))) {
+	if (!(rw & REQ_WRITE)) {
 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
 			return btrfs_submit_compressed_read(inode, bio,
 						    mirror_num, bio_flags);
@@ -1841,7 +1841,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 	bio->bi_size = 0;
 
 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
-	if (failed_bio->bi_rw & (1 << BIO_RW))
+	if (failed_bio->bi_rw & REQ_WRITE)
 		rw = WRITE;
 	else
 		rw = READ;
@@ -5642,7 +5642,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
 	struct bio_vec *bvec = bio->bi_io_vec;
 	u64 start;
 	int skip_sum;
-	int write = rw & (1 << BIO_RW);
+	int write = rw & REQ_WRITE;
 	int ret = 0;
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
-- 
cgit v1.2.2


From 1025774ce411f2bd4b059ad7b53f0003569b74fa Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 4 Jun 2010 11:30:02 +0200
Subject: remove inode_setattr

Replace inode_setattr with opencoded variants of it in all callers.  This
moves the remaining call to vmtruncate into the filesystem methods where it
can be replaced with the proper truncate sequence.

In a few cases it was obvious that we would never end up calling vmtruncate
so it was left out in the opencoded variant:

 spufs: explicitly checks for ATTR_SIZE earlier
 btrfs,hugetlbfs,logfs,dlmfs: explicitly clears ATTR_SIZE earlier
 ufs: contains an opencoded simple_seattr + truncate that sets the filesize just above

In addition to that ncpfs called inode_setattr with handcrafted iattrs,
which allowed to trim down the opencoded variant.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1bff92ad4744..7f9e0536db1a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3656,13 +3656,15 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		if (err)
 			return err;
 	}
-	attr->ia_valid &= ~ATTR_SIZE;
 
-	if (attr->ia_valid)
-		err = inode_setattr(inode, attr);
+	if (attr->ia_valid) {
+		setattr_copy(inode, attr);
+		mark_inode_dirty(inode);
+
+		if (attr->ia_valid & ATTR_MODE)
+			err = btrfs_acl_chmod(inode);
+	}
 
-	if (!err && ((attr->ia_valid & ATTR_MODE)))
-		err = btrfs_acl_chmod(inode);
 	return err;
 }
 
-- 
cgit v1.2.2


From a4ffdde6e56fdf8c34ddadc2674d6eb978083369 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Jun 2010 17:38:30 -0400
Subject: simplify checks for I_CLEAR/I_FREEING

add I_CLEAR instead of replacing I_FREEING with it.  I_CLEAR is
equivalent to I_FREEING for almost all code looking at either;
it's there to keep track of having called clear_inode() exactly
once per inode lifetime, at some point after having set I_FREEING.
I_CLEAR and I_FREEING never get set at the same time with the
current code, so we can switch to setting i_flags to I_FREEING | I_CLEAR
instead of I_CLEAR without loss of information.  As the result of
such change, checks become simpler and the amount of code that needs
to know about I_CLEAR shrinks a lot.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7f9e0536db1a..95eac0116963 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3860,7 +3860,7 @@ again:
 			p = &parent->rb_right;
 		else {
 			WARN_ON(!(entry->vfs_inode.i_state &
-				  (I_WILL_FREE | I_FREEING | I_CLEAR)));
+				  (I_WILL_FREE | I_FREEING)));
 			rb_erase(parent, &root->inode_tree);
 			RB_CLEAR_NODE(parent);
 			spin_unlock(&root->inode_lock);
-- 
cgit v1.2.2


From bd55597520a2eaa0d71dd7683513a14bfd1bdf5c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Jun 2010 11:35:40 -0400
Subject: convert btrfs to ->evict_inode()

NB: do we want btrfs_wait_ordered_range() on eviction of
inodes with positive i_nlink on subvolume with zero root_refs?
If not, btrfs_evict_inode() can be simplified by unconditionally
bailing out in case of i_nlink > 0 in the very beginning...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95eac0116963..ce02199ec4e5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3668,7 +3668,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	return err;
 }
 
-void btrfs_delete_inode(struct inode *inode)
+void btrfs_evict_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3676,10 +3676,14 @@ void btrfs_delete_inode(struct inode *inode)
 	int ret;
 
 	truncate_inode_pages(&inode->i_data, 0);
+	if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0)
+		goto no_delete;
+
 	if (is_bad_inode(inode)) {
 		btrfs_orphan_del(NULL, inode);
 		goto no_delete;
 	}
+	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
 	if (root->fs_info->log_root_recovering) {
@@ -3729,7 +3733,7 @@ void btrfs_delete_inode(struct inode *inode)
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root, nr);
 no_delete:
-	clear_inode(inode);
+	end_writeback(inode);
 	return;
 }
 
-- 
cgit v1.2.2


From 45321ac54316eaeeebde0b5f728a1791e500974c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Jun 2010 13:43:19 -0400
Subject: Make ->drop_inode() just return whether inode needs to be dropped

... and let iput_final() do the actual eviction or retention

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ce02199ec4e5..2c54f04a0bf5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3943,7 +3943,7 @@ again:
 			if (atomic_read(&inode->i_count) > 1)
 				d_prune_aliases(inode);
 			/*
-			 * btrfs_drop_inode will remove it from
+			 * btrfs_drop_inode will have it removed from
 			 * the inode cache when its usage count
 			 * hits zero.
 			 */
@@ -6337,13 +6337,14 @@ free:
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
-void btrfs_drop_inode(struct inode *inode)
+int btrfs_drop_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
-		generic_delete_inode(inode);
+
+	if (btrfs_root_refs(&root->root_item) == 0)
+		return 1;
 	else
-		generic_drop_inode(inode);
+		return generic_drop_inode(inode);
 }
 
 static void init_once(void *foo)
-- 
cgit v1.2.2


From 696ac96c2757963cd6751c26215e3c6d328705aa Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 5 Jul 2010 15:15:02 +0300
Subject: btrfs: remove junk sb_dirt change

BTRFS does not define a '->write_super()' method, so it should
not mark its superblock as dirty. This looks like some left-over.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Acked-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/btrfs/inode.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2c54f04a0bf5..8976c3343a96 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2938,7 +2938,6 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 	ret = btrfs_update_inode(trans, root, dir);
 	BUG_ON(ret);
-	dir->i_sb->s_dirt = 1;
 
 	btrfs_free_path(path);
 	return 0;
-- 
cgit v1.2.2