1 file changed, 154 insertions, 62 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 82d0342763c5..0165b8672f09 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -692,7 +692,10 @@ next:
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root, struct inode *inode,
                         struct btrfs_path *path, u64 start, u64 end,
-                         u64 *drop_end, int drop_cache)
+                         u64 *drop_end, int drop_cache,
+                         int replace_extent,
+                         u32 extent_item_size,
+                         int *key_inserted)
 {
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
@@ -712,6 +715,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
        int modify_tree = -1;
        int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
        int found = 0;
+        int leafs_visited = 0;
        if (drop_cache)
                btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -733,6 +737,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
                                path->slots[0]--;
                }
                ret = 0;
+                leafs_visited++;
 next_slot:
                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
@@ -744,6 +749,7 @@ next_slot:
                                ret = 0;
                                break;
                        }
+                        leafs_visited++;
                        leaf = path->nodes[0];
                        recow = 1;
                }
@@ -766,7 +772,8 @@ next_slot:
                                btrfs_file_extent_num_bytes(leaf, fi);
                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                        extent_end = key.offset +
-                                btrfs_file_extent_inline_len(leaf, fi);
+                                btrfs_file_extent_inline_len(leaf,
+                                                     path->slots[0], fi);
                } else {
                        WARN_ON(1);
                        extent_end = search_start;
@@ -927,14 +934,44 @@ next_slot:
        }
        if (!ret && del_nr > 0) {
+                /*
+                 * Set path->slots[0] to first slot, so that after the delete
+                 * if items are moved off from our leaf to its immediate left
+                 * or right neighbor leafs, we end up with a correct and
+                 * adjusted path->slots[0] for our insertion.
+                 */
+                path->slots[0] = del_slot;
                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
                if (ret)
                        btrfs_abort_transaction(trans, root, ret);
+                leaf = path->nodes[0];
+                /*
+                 * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that
+                 * is, its contents got pushed to its neighbors), in which case
+                 * it means path->locks[0] == 0
+                 */
+                if (!ret && replace_extent && leafs_visited == 1 &&
+                    path->locks[0] &&
+                    btrfs_leaf_free_space(root, leaf) >=
+                    sizeof(struct btrfs_item) + extent_item_size) {
+                        key.objectid = ino;
+                        key.type = BTRFS_EXTENT_DATA_KEY;
+                        key.offset = start;
+                        setup_items_for_insert(root, path, &key,
+                                               &extent_item_size,
+                                               extent_item_size,
+                                               sizeof(struct btrfs_item) +
+                                               extent_item_size, 1);
+                        *key_inserted = 1;
+                }
        }
+        if (!replace_extent || !(*key_inserted))
+                btrfs_release_path(path);
        if (drop_end)
                *drop_end = found ? min(end, extent_end) : end;
-        btrfs_release_path(path);
        return ret;
 }
@@ -949,7 +986,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
        ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
-                                   drop_cache);
+                                   drop_cache, 0, 0, NULL);
        btrfs_free_path(path);
        return ret;
 }
@@ -1235,29 +1272,18 @@ static int prepare_uptodate_page(struct page *page, u64 pos,
 }
 /*
- * this gets pages into the page cache and locks them down, it also properly
+ * this just gets pages into the page cache and locks them down.
- * waits for data=ordered extents to finish before allowing the pages to be
- * modified.
 */
-static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
+static noinline int prepare_pages(struct inode *inode, struct page **pages,
-                         struct page **pages, size_t num_pages,
+                                  size_t num_pages, loff_t pos,
-                         loff_t pos, unsigned long first_index,
+                                  size_t write_bytes, bool force_uptodate)
-                         size_t write_bytes, bool force_uptodate)
 {
-        struct extent_state *cached_state = NULL;
        int i;
        unsigned long index = pos >> PAGE_CACHE_SHIFT;
-        struct inode *inode = file_inode(file);
        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
        int err = 0;
-        int faili = 0;
+        int faili;
-        u64 start_pos;
-        u64 last_pos;
-        start_pos = pos & ~((u64)root->sectorsize - 1);
-        last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
-again:
        for (i = 0; i < num_pages; i++) {
                pages[i] = find_or_create_page(inode->i_mapping, index + i,
                                               mask | __GFP_WRITE);
@@ -1280,57 +1306,85 @@ again:
                }
                wait_on_page_writeback(pages[i]);
        }
-        faili = num_pages - 1;
-        err = 0;
+        return 0;
+fail:
+        while (faili >= 0) {
+                unlock_page(pages[faili]);
+                page_cache_release(pages[faili]);
+                faili--;
+        }
+        return err;
+}
+/*
+ * If needed, this function locks the extent and properly waits for
+ * data=ordered extents to finish before allowing the pages to be modified.
+ *
+ * Return value:
+ * 1 - the extent is locked
+ * 0 - the extent is not locked, and everything is OK
+ * -EAGAIN - the pages were released and must be prepared again
+ * any other negative value - an error occurred
+ */
+static noinline int
+lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
+                                size_t num_pages, loff_t pos,
+                                u64 *lockstart, u64 *lockend,
+                                struct extent_state **cached_state)
+{
+        u64 start_pos;
+        u64 last_pos;
+        int i;
+        int ret = 0;
+        start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
+        last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
        if (start_pos < inode->i_size) {
                struct btrfs_ordered_extent *ordered;
                lock_extent_bits(&BTRFS_I(inode)->io_tree,
-                                 start_pos, last_pos - 1, 0, &cached_state);
+                                 start_pos, last_pos, 0, cached_state);
-                ordered = btrfs_lookup_first_ordered_extent(inode,
+                ordered = btrfs_lookup_first_ordered_extent(inode, last_pos);
-                                                            last_pos - 1);
                if (ordered &&
                    ordered->file_offset + ordered->len > start_pos &&
-                    ordered->file_offset < last_pos) {
+                    ordered->file_offset <= last_pos) {
                        btrfs_put_ordered_extent(ordered);
                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-                                             start_pos, last_pos - 1,
+                                             start_pos, last_pos,
-                                             &cached_state, GFP_NOFS);
+                                             cached_state, GFP_NOFS);
                        for (i = 0; i < num_pages; i++) {
                                unlock_page(pages[i]);
                                page_cache_release(pages[i]);
                        }
-                        err = btrfs_wait_ordered_range(inode, start_pos,
+                        ret = btrfs_wait_ordered_range(inode, start_pos,
-                                                       last_pos - start_pos);
+                                                last_pos - start_pos + 1);
-                        if (err)
+                        if (ret)
-                                goto fail;
+                                return ret;
-                        goto again;
+                        else
+                                return -EAGAIN;
                }
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
                clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
-                                  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+                                  last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
                                  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-                                  0, 0, &cached_state, GFP_NOFS);
+                                  0, 0, cached_state, GFP_NOFS);
-                unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+                *lockstart = start_pos;
-                                     start_pos, last_pos - 1, &cached_state,
+                *lockend = last_pos;
-                                     GFP_NOFS);
+                ret = 1;
        }
        for (i = 0; i < num_pages; i++) {
                if (clear_page_dirty_for_io(pages[i]))
                        account_page_redirty(pages[i]);
                set_page_extent_mapped(pages[i]);
                WARN_ON(!PageLocked(pages[i]));
        }
-        return 0;
-fail:
-        while (faili >= 0) {
-                unlock_page(pages[faili]);
-                page_cache_release(pages[faili]);
-                faili--;
-        }
-        return err;
+        return ret;
 }
 static noinline int check_can_nocow(struct inode *inode, loff_t pos,
@@ -1381,13 +1435,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
        struct inode *inode = file_inode(file);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct page **pages = NULL;
+        struct extent_state *cached_state = NULL;
        u64 release_bytes = 0;
+        u64 lockstart;
+        u64 lockend;
        unsigned long first_index;
        size_t num_written = 0;
        int nrptrs;
        int ret = 0;
        bool only_release_metadata = false;
        bool force_page_uptodate = false;
+        bool need_unlock;
        nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
                     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
@@ -1456,18 +1514,31 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                }
                release_bytes = reserve_bytes;
+                need_unlock = false;
+again:
                /*
                 * This is going to setup the pages array with the number of
                 * pages we want, so we don't really need to worry about the
                 * contents of pages from loop to loop
                 */
-                ret = prepare_pages(root, file, pages, num_pages,
+                ret = prepare_pages(inode, pages, num_pages,
-                                    pos, first_index, write_bytes,
+                                    pos, write_bytes,
                                    force_page_uptodate);
                if (ret)
                        break;
+                ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
+                                                      pos, &lockstart, &lockend,
+                                                      &cached_state);
+                if (ret < 0) {
+                        if (ret == -EAGAIN)
+                                goto again;
+                        break;
+                } else if (ret > 0) {
+                        need_unlock = true;
+                        ret = 0;
+                }
                copied = btrfs_copy_from_user(pos, num_pages,
                                           write_bytes, pages, i);
@@ -1512,19 +1583,21 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                }
                release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
-                if (copied > 0) {
+                if (copied > 0)
                        ret = btrfs_dirty_pages(root, inode, pages,
                                                dirty_pages, pos, copied,
                                                NULL);
-                        if (ret) {
+                if (need_unlock)
-                                btrfs_drop_pages(pages, num_pages);
+                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-                                break;
+                                             lockstart, lockend, &cached_state,
-                        }
+                                             GFP_NOFS);
+                if (ret) {
+                        btrfs_drop_pages(pages, num_pages);
+                        break;
                }
                release_bytes = 0;
-                btrfs_drop_pages(pages, num_pages);
                if (only_release_metadata && copied > 0) {
                        u64 lockstart = round_down(pos, root->sectorsize);
                        u64 lockend = lockstart +
@@ -1536,6 +1609,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                        only_release_metadata = false;
                }
+                btrfs_drop_pages(pages, num_pages);
                cond_resched();
                balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1857,12 +1932,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        if (file->private_data)
                btrfs_ioctl_trans_end(file);
+        /*
+         * We use start here because we will need to wait on the IO to complete
+         * in btrfs_sync_log, which could require joining a transaction (for
+         * example checking cross references in the nocow path).  If we use join
+         * here we could get into a situation where we're waiting on IO to
+         * happen that is blocked on a transaction trying to commit.  With start
+         * we inc the extwriter counter, so we wait for all extwriters to exit
+         * before we start blocking join'ers.  This comment is to keep somebody
+         * from thinking they are super smart and changing this to
+         * btrfs_join_transaction *cough*Josef*cough*.
+         */
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                mutex_unlock(&inode->i_mutex);
                goto out;
        }
+        trans->sync = true;
        ret = btrfs_log_dentry_safe(trans, root, dentry);
        if (ret < 0) {
@@ -1963,11 +2050,13 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
        struct btrfs_key key;
        int ret;
+        if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
+                goto out;
        key.objectid = btrfs_ino(inode);
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = offset;
        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        if (ret < 0)
                return ret;
@@ -2064,8 +2153,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        u64 drop_end;
        int ret = 0;
        int err = 0;
+        int rsv_count;
        bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
                          ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+        bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
        ret = btrfs_wait_ordered_range(inode, offset, len);
        if (ret)
@@ -2125,7 +2216,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                 * we need to try again.
                 */
                if ((!ordered ||
-                    (ordered->file_offset + ordered->len < lockstart ||
+                    (ordered->file_offset + ordered->len <= lockstart ||
                     ordered->file_offset > lockend)) &&
                     !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
                                     lockend, EXTENT_UPTODATE, 0,
@@ -2163,9 +2254,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        /*
         * 1 - update the inode
         * 1 - removing the extents in the range
-         * 1 - adding the hole extent
+         * 1 - adding the hole extent if no_holes isn't set
         */
-        trans = btrfs_start_transaction(root, 3);
+        rsv_count = no_holes ? 2 : 3;
+        trans = btrfs_start_transaction(root, rsv_count);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto out_free;
@@ -2179,7 +2271,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        while (cur_offset < lockend) {
                ret = __btrfs_drop_extents(trans, root, inode, path,
                                           cur_offset, lockend + 1,
-                                           &drop_end, 1);
+                                           &drop_end, 1, 0, 0, NULL);
                if (ret != -ENOSPC)
                        break;
@@ -2202,7 +2294,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                btrfs_end_transaction(trans, root);
                btrfs_btree_balance_dirty(root);
-                trans = btrfs_start_transaction(root, 3);
+                trans = btrfs_start_transaction(root, rsv_count);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        trans = NULL;

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 82d0342763c5..0165b8672f09 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c
@@ -692,7 +692,10 @@ next:
692	int __btrfs_drop_extents(struct btrfs_trans_handle *trans,	692	int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
693	struct btrfs_root *root, struct inode *inode,	693	struct btrfs_root *root, struct inode *inode,
694	struct btrfs_path *path, u64 start, u64 end,	694	struct btrfs_path *path, u64 start, u64 end,
695	u64 *drop_end, int drop_cache)	695	u64 *drop_end, int drop_cache,
		696	int replace_extent,
		697	u32 extent_item_size,
		698	int *key_inserted)
696	{	699	{
697	struct extent_buffer *leaf;	700	struct extent_buffer *leaf;
698	struct btrfs_file_extent_item *fi;	701	struct btrfs_file_extent_item *fi;
@@ -712,6 +715,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
712	int modify_tree = -1;	715	int modify_tree = -1;
713	int update_refs = (root->ref_cows || root == root->fs_info->tree_root);	716	int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
714	int found = 0;	717	int found = 0;
		718	int leafs_visited = 0;
715		719
716	if (drop_cache)	720	if (drop_cache)
717	btrfs_drop_extent_cache(inode, start, end - 1, 0);	721	btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -733,6 +737,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
733	path->slots[0]--;	737	path->slots[0]--;
734	}	738	}
735	ret = 0;	739	ret = 0;
		740	leafs_visited++;
736	next_slot:	741	next_slot:
737	leaf = path->nodes[0];	742	leaf = path->nodes[0];
738	if (path->slots[0] >= btrfs_header_nritems(leaf)) {	743	if (path->slots[0] >= btrfs_header_nritems(leaf)) {
@@ -744,6 +749,7 @@ next_slot:
744	ret = 0;	749	ret = 0;
745	break;	750	break;
746	}	751	}
		752	leafs_visited++;
747	leaf = path->nodes[0];	753	leaf = path->nodes[0];
748	recow = 1;	754	recow = 1;
749	}	755	}
@@ -766,7 +772,8 @@ next_slot:
766	btrfs_file_extent_num_bytes(leaf, fi);	772	btrfs_file_extent_num_bytes(leaf, fi);
767	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {	773	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
768	extent_end = key.offset +	774	extent_end = key.offset +
769	btrfs_file_extent_inline_len(leaf, fi);	775	btrfs_file_extent_inline_len(leaf,
		776	path->slots[0], fi);
770	} else {	777	} else {
771	WARN_ON(1);	778	WARN_ON(1);
772	extent_end = search_start;	779	extent_end = search_start;
@@ -927,14 +934,44 @@ next_slot:
927	}	934	}
928		935
929	if (!ret && del_nr > 0) {	936	if (!ret && del_nr > 0) {
		937	/*
		938	* Set path->slots[0] to first slot, so that after the delete
		939	* if items are moved off from our leaf to its immediate left or
		940	* right neighbor leafs, we end up with a correct and adjusted
		941	* path->slots[0] for our insertion.
		942	*/
		943	path->slots[0] = del_slot;
930	ret = btrfs_del_items(trans, root, path, del_slot, del_nr);	944	ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
931	if (ret)	945	if (ret)
932	btrfs_abort_transaction(trans, root, ret);	946	btrfs_abort_transaction(trans, root, ret);
		947
		948	leaf = path->nodes[0];
		949	/*
		950	* leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that
		951	* is, its contents got pushed to its neighbors), in which case
		952	* it means path->locks[0] == 0
		953	*/
		954	if (!ret && replace_extent && leafs_visited == 1 &&
		955	path->locks[0] &&
		956	btrfs_leaf_free_space(root, leaf) >=
		957	sizeof(struct btrfs_item) + extent_item_size) {
		958
		959	key.objectid = ino;
		960	key.type = BTRFS_EXTENT_DATA_KEY;
		961	key.offset = start;
		962	setup_items_for_insert(root, path, &key,
		963	&extent_item_size,
		964	extent_item_size,
		965	sizeof(struct btrfs_item) +
		966	extent_item_size, 1);
		967	*key_inserted = 1;
		968	}
933	}	969	}
934		970
		971	if (!replace_extent || !(*key_inserted))
		972	btrfs_release_path(path);
935	if (drop_end)	973	if (drop_end)
936	*drop_end = found ? min(end, extent_end) : end;	974	*drop_end = found ? min(end, extent_end) : end;
937	btrfs_release_path(path);
938	return ret;	975	return ret;
939	}	976	}
940		977
@@ -949,7 +986,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
949	if (!path)	986	if (!path)
950	return -ENOMEM;	987	return -ENOMEM;
951	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,	988	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
952	drop_cache);	989	drop_cache, 0, 0, NULL);
953	btrfs_free_path(path);	990	btrfs_free_path(path);
954	return ret;	991	return ret;
955	}	992	}
@@ -1235,29 +1272,18 @@ static int prepare_uptodate_page(struct page *page, u64 pos,
1235	}	1272	}
1236		1273
1237	/*	1274	/*
1238	* this gets pages into the page cache and locks them down, it also properly	1275	* this just gets pages into the page cache and locks them down.
1239	* waits for data=ordered extents to finish before allowing the pages to be
1240	* modified.
1241	*/	1276	*/
1242	static noinline int prepare_pages(struct btrfs_root *root, struct file *file,	1277	static noinline int prepare_pages(struct inode *inode, struct page **pages,
1243	struct page **pages, size_t num_pages,	1278	size_t num_pages, loff_t pos,
1244	loff_t pos, unsigned long first_index,	1279	size_t write_bytes, bool force_uptodate)
1245	size_t write_bytes, bool force_uptodate)
1246	{	1280	{
1247	struct extent_state *cached_state = NULL;
1248	int i;	1281	int i;
1249	unsigned long index = pos >> PAGE_CACHE_SHIFT;	1282	unsigned long index = pos >> PAGE_CACHE_SHIFT;
1250	struct inode *inode = file_inode(file);
1251	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);	1283	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1252	int err = 0;	1284	int err = 0;
1253	int faili = 0;	1285	int faili;
1254	u64 start_pos;
1255	u64 last_pos;
1256
1257	start_pos = pos & ~((u64)root->sectorsize - 1);
1258	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
1259		1286
1260	again:
1261	for (i = 0; i < num_pages; i++) {	1287	for (i = 0; i < num_pages; i++) {
1262	pages[i] = find_or_create_page(inode->i_mapping, index + i,	1288	pages[i] = find_or_create_page(inode->i_mapping, index + i,
1263	mask | __GFP_WRITE);	1289	mask | __GFP_WRITE);
@@ -1280,57 +1306,85 @@ again:
1280	}	1306	}
1281	wait_on_page_writeback(pages[i]);	1307	wait_on_page_writeback(pages[i]);
1282	}	1308	}
1283	faili = num_pages - 1;	1309
1284	err = 0;	1310	return 0;
		1311	fail:
		1312	while (faili >= 0) {
		1313	unlock_page(pages[faili]);
		1314	page_cache_release(pages[faili]);
		1315	faili--;
		1316	}
		1317	return err;
		1318
		1319	}
		1320
		1321	/*
		1322	* If needed, this function locks the extent and properly waits for
		1323	* data=ordered extents to finish before allowing the pages to be modified.
		1324	*
		1325	* Return value:
		1326	* 1 - the extent is locked
		1327	* 0 - the extent is not locked, and everything is OK
		1328	* -EAGAIN - the pages were released and must be prepared again
		1329	* any other negative value - an error occurred
		1330	*/
		1331	static noinline int
		1332	lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
		1333	size_t num_pages, loff_t pos,
		1334	u64 *lockstart, u64 *lockend,
		1335	struct extent_state **cached_state)
		1336	{
		1337	u64 start_pos;
		1338	u64 last_pos;
		1339	int i;
		1340	int ret = 0;
		1341
		1342	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
		1343	last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
		1344
1285	if (start_pos < inode->i_size) {	1345	if (start_pos < inode->i_size) {
1286	struct btrfs_ordered_extent *ordered;	1346	struct btrfs_ordered_extent *ordered;
1287	lock_extent_bits(&BTRFS_I(inode)->io_tree,	1347	lock_extent_bits(&BTRFS_I(inode)->io_tree,
1288	start_pos, last_pos - 1, 0, &cached_state);	1348	start_pos, last_pos, 0, cached_state);
1289	ordered = btrfs_lookup_first_ordered_extent(inode,	1349	ordered = btrfs_lookup_first_ordered_extent(inode, last_pos);
1290	last_pos - 1);
1291	if (ordered &&	1350	if (ordered &&
1292	ordered->file_offset + ordered->len > start_pos &&	1351	ordered->file_offset + ordered->len > start_pos &&
1293	ordered->file_offset < last_pos) {	1352	ordered->file_offset <= last_pos) {
1294	btrfs_put_ordered_extent(ordered);	1353	btrfs_put_ordered_extent(ordered);
1295	unlock_extent_cached(&BTRFS_I(inode)->io_tree,	1354	unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1296	start_pos, last_pos - 1,	1355	start_pos, last_pos,
1297	&cached_state, GFP_NOFS);	1356	cached_state, GFP_NOFS);
1298	for (i = 0; i < num_pages; i++) {	1357	for (i = 0; i < num_pages; i++) {
1299	unlock_page(pages[i]);	1358	unlock_page(pages[i]);
1300	page_cache_release(pages[i]);	1359	page_cache_release(pages[i]);
1301	}	1360	}
1302	err = btrfs_wait_ordered_range(inode, start_pos,	1361	ret = btrfs_wait_ordered_range(inode, start_pos,
1303	last_pos - start_pos);	1362	last_pos - start_pos + 1);
1304	if (err)	1363	if (ret)
1305	goto fail;	1364	return ret;
1306	goto again;	1365	else
		1366	return -EAGAIN;
1307	}	1367	}
1308	if (ordered)	1368	if (ordered)
1309	btrfs_put_ordered_extent(ordered);	1369	btrfs_put_ordered_extent(ordered);
1310		1370
1311	clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,	1371	clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
1312	last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |	1372	last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
1313	EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,	1373	EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1314	0, 0, &cached_state, GFP_NOFS);	1374	0, 0, cached_state, GFP_NOFS);
1315	unlock_extent_cached(&BTRFS_I(inode)->io_tree,	1375	*lockstart = start_pos;
1316	start_pos, last_pos - 1, &cached_state,	1376	*lockend = last_pos;
1317	GFP_NOFS);	1377	ret = 1;
1318	}	1378	}
		1379
1319	for (i = 0; i < num_pages; i++) {	1380	for (i = 0; i < num_pages; i++) {
1320	if (clear_page_dirty_for_io(pages[i]))	1381	if (clear_page_dirty_for_io(pages[i]))
1321	account_page_redirty(pages[i]);	1382	account_page_redirty(pages[i]);
1322	set_page_extent_mapped(pages[i]);	1383	set_page_extent_mapped(pages[i]);
1323	WARN_ON(!PageLocked(pages[i]));	1384	WARN_ON(!PageLocked(pages[i]));
1324	}	1385	}
1325	return 0;
1326	fail:
1327	while (faili >= 0) {
1328	unlock_page(pages[faili]);
1329	page_cache_release(pages[faili]);
1330	faili--;
1331	}
1332	return err;
1333		1386
		1387	return ret;
1334	}	1388	}
1335		1389
1336	static noinline int check_can_nocow(struct inode *inode, loff_t pos,	1390	static noinline int check_can_nocow(struct inode *inode, loff_t pos,
@@ -1381,13 +1435,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1381	struct inode *inode = file_inode(file);	1435	struct inode *inode = file_inode(file);
1382	struct btrfs_root *root = BTRFS_I(inode)->root;	1436	struct btrfs_root *root = BTRFS_I(inode)->root;
1383	struct page **pages = NULL;	1437	struct page **pages = NULL;
		1438	struct extent_state *cached_state = NULL;
1384	u64 release_bytes = 0;	1439	u64 release_bytes = 0;
		1440	u64 lockstart;
		1441	u64 lockend;
1385	unsigned long first_index;	1442	unsigned long first_index;
1386	size_t num_written = 0;	1443	size_t num_written = 0;
1387	int nrptrs;	1444	int nrptrs;
1388	int ret = 0;	1445	int ret = 0;
1389	bool only_release_metadata = false;	1446	bool only_release_metadata = false;
1390	bool force_page_uptodate = false;	1447	bool force_page_uptodate = false;
		1448	bool need_unlock;
1391		1449
1392	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /	1450	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
1393	PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /	1451	PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
@@ -1456,18 +1514,31 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1456	}	1514	}
1457		1515
1458	release_bytes = reserve_bytes;	1516	release_bytes = reserve_bytes;
1459		1517	need_unlock = false;
		1518	again:
1460	/*	1519	/*
1461	* This is going to setup the pages array with the number of	1520	* This is going to setup the pages array with the number of
1462	* pages we want, so we don't really need to worry about the	1521	* pages we want, so we don't really need to worry about the
1463	* contents of pages from loop to loop	1522	* contents of pages from loop to loop
1464	*/	1523	*/
1465	ret = prepare_pages(root, file, pages, num_pages,	1524	ret = prepare_pages(inode, pages, num_pages,
1466	pos, first_index, write_bytes,	1525	pos, write_bytes,
1467	force_page_uptodate);	1526	force_page_uptodate);
1468	if (ret)	1527	if (ret)
1469	break;	1528	break;
1470		1529
		1530	ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
		1531	pos, &lockstart, &lockend,
		1532	&cached_state);
		1533	if (ret < 0) {
		1534	if (ret == -EAGAIN)
		1535	goto again;
		1536	break;
		1537	} else if (ret > 0) {
		1538	need_unlock = true;
		1539	ret = 0;
		1540	}
		1541
1471	copied = btrfs_copy_from_user(pos, num_pages,	1542	copied = btrfs_copy_from_user(pos, num_pages,
1472	write_bytes, pages, i);	1543	write_bytes, pages, i);
1473		1544
@@ -1512,19 +1583,21 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1512	}	1583	}
1513		1584
1514	release_bytes = dirty_pages << PAGE_CACHE_SHIFT;	1585	release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
1515	if (copied > 0) {	1586
		1587	if (copied > 0)
1516	ret = btrfs_dirty_pages(root, inode, pages,	1588	ret = btrfs_dirty_pages(root, inode, pages,
1517	dirty_pages, pos, copied,	1589	dirty_pages, pos, copied,
1518	NULL);	1590	NULL);
1519	if (ret) {	1591	if (need_unlock)
1520	btrfs_drop_pages(pages, num_pages);	1592	unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1521	break;	1593	lockstart, lockend, &cached_state,
1522	}	1594	GFP_NOFS);
		1595	if (ret) {
		1596	btrfs_drop_pages(pages, num_pages);
		1597	break;
1523	}	1598	}
1524		1599
1525	release_bytes = 0;	1600	release_bytes = 0;
1526	btrfs_drop_pages(pages, num_pages);
1527
1528	if (only_release_metadata && copied > 0) {	1601	if (only_release_metadata && copied > 0) {
1529	u64 lockstart = round_down(pos, root->sectorsize);	1602	u64 lockstart = round_down(pos, root->sectorsize);
1530	u64 lockend = lockstart +	1603	u64 lockend = lockstart +
@@ -1536,6 +1609,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1536	only_release_metadata = false;	1609	only_release_metadata = false;
1537	}	1610	}
1538		1611
		1612	btrfs_drop_pages(pages, num_pages);
		1613
1539	cond_resched();	1614	cond_resched();
1540		1615
1541	balance_dirty_pages_ratelimited(inode->i_mapping);	1616	balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1857,12 +1932,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1857	if (file->private_data)	1932	if (file->private_data)
1858	btrfs_ioctl_trans_end(file);	1933	btrfs_ioctl_trans_end(file);
1859		1934
		1935	/*
		1936	* We use start here because we will need to wait on the IO to complete
		1937	* in btrfs_sync_log, which could require joining a transaction (for
		1938	* example checking cross references in the nocow path). If we use join
		1939	* here we could get into a situation where we're waiting on IO to
		1940	* happen that is blocked on a transaction trying to commit. With start
		1941	* we inc the extwriter counter, so we wait for all extwriters to exit
		1942	* before we start blocking join'ers. This comment is to keep somebody
		1943	* from thinking they are super smart and changing this to
		1944	* btrfs_join_transaction coughJosefcough.
		1945	*/
1860	trans = btrfs_start_transaction(root, 0);	1946	trans = btrfs_start_transaction(root, 0);
1861	if (IS_ERR(trans)) {	1947	if (IS_ERR(trans)) {
1862	ret = PTR_ERR(trans);	1948	ret = PTR_ERR(trans);
1863	mutex_unlock(&inode->i_mutex);	1949	mutex_unlock(&inode->i_mutex);
1864	goto out;	1950	goto out;
1865	}	1951	}
		1952	trans->sync = true;
1866		1953
1867	ret = btrfs_log_dentry_safe(trans, root, dentry);	1954	ret = btrfs_log_dentry_safe(trans, root, dentry);
1868	if (ret < 0) {	1955	if (ret < 0) {
@@ -1963,11 +2050,13 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
1963	struct btrfs_key key;	2050	struct btrfs_key key;
1964	int ret;	2051	int ret;
1965		2052
		2053	if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
		2054	goto out;
		2055
1966	key.objectid = btrfs_ino(inode);	2056	key.objectid = btrfs_ino(inode);
1967	key.type = BTRFS_EXTENT_DATA_KEY;	2057	key.type = BTRFS_EXTENT_DATA_KEY;
1968	key.offset = offset;	2058	key.offset = offset;
1969		2059
1970
1971	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);	2060	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1972	if (ret < 0)	2061	if (ret < 0)
1973	return ret;	2062	return ret;
@@ -2064,8 +2153,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2064	u64 drop_end;	2153	u64 drop_end;
2065	int ret = 0;	2154	int ret = 0;
2066	int err = 0;	2155	int err = 0;
		2156	int rsv_count;
2067	bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==	2157	bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2068	((offset + len - 1) >> PAGE_CACHE_SHIFT));	2158	((offset + len - 1) >> PAGE_CACHE_SHIFT));
		2159	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2069		2160
2070	ret = btrfs_wait_ordered_range(inode, offset, len);	2161	ret = btrfs_wait_ordered_range(inode, offset, len);
2071	if (ret)	2162	if (ret)
@@ -2125,7 +2216,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2125	* we need to try again.	2216	* we need to try again.
2126	*/	2217	*/
2127	if ((!ordered ||	2218	if ((!ordered ||
2128	(ordered->file_offset + ordered->len < lockstart ||	2219	(ordered->file_offset + ordered->len <= lockstart ||
2129	ordered->file_offset > lockend)) &&	2220	ordered->file_offset > lockend)) &&
2130	!test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,	2221	!test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
2131	lockend, EXTENT_UPTODATE, 0,	2222	lockend, EXTENT_UPTODATE, 0,
@@ -2163,9 +2254,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2163	/*	2254	/*
2164	* 1 - update the inode	2255	* 1 - update the inode
2165	* 1 - removing the extents in the range	2256	* 1 - removing the extents in the range
2166	* 1 - adding the hole extent	2257	* 1 - adding the hole extent if no_holes isn't set
2167	*/	2258	*/
2168	trans = btrfs_start_transaction(root, 3);	2259	rsv_count = no_holes ? 2 : 3;
		2260	trans = btrfs_start_transaction(root, rsv_count);
2169	if (IS_ERR(trans)) {	2261	if (IS_ERR(trans)) {
2170	err = PTR_ERR(trans);	2262	err = PTR_ERR(trans);
2171	goto out_free;	2263	goto out_free;
@@ -2179,7 +2271,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2179	while (cur_offset < lockend) {	2271	while (cur_offset < lockend) {
2180	ret = __btrfs_drop_extents(trans, root, inode, path,	2272	ret = __btrfs_drop_extents(trans, root, inode, path,
2181	cur_offset, lockend + 1,	2273	cur_offset, lockend + 1,
2182	&drop_end, 1);	2274	&drop_end, 1, 0, 0, NULL);
2183	if (ret != -ENOSPC)	2275	if (ret != -ENOSPC)
2184	break;	2276	break;
2185		2277
@@ -2202,7 +2294,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2202	btrfs_end_transaction(trans, root);	2294	btrfs_end_transaction(trans, root);
2203	btrfs_btree_balance_dirty(root);	2295	btrfs_btree_balance_dirty(root);
2204		2296
2205	trans = btrfs_start_transaction(root, 3);	2297	trans = btrfs_start_transaction(root, rsv_count);
2206	if (IS_ERR(trans)) {	2298	if (IS_ERR(trans)) {
2207	ret = PTR_ERR(trans);	2299	ret = PTR_ERR(trans);
2208	trans = NULL;	2300	trans = NULL;