aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-07-09 15:33:09 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-07-09 15:33:09 -0400
commite3a0dd98e1ddfd135b7ef889fcc0269e8c2ca445 (patch)
tree7f942b10ffe7ea1498e4b1d3a206291692647040 /fs
parentda89bd213fe719ec3552abbeb8be12d0cc0337ca (diff)
parent0e267c44c3a402d35111d1935be1167240b5b79f (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs update from Chris Mason: "These are the usual mixture of bugs, cleanups and performance fixes. Miao has some really nice tuning of our crc code as well as our transaction commits. Josef is peeling off more and more problems related to early enospc, and has a number of important bug fixes in here too" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (81 commits) Btrfs: wait ordered range before doing direct io Btrfs: only do the tree_mod_log_free_eb if this is our last ref Btrfs: hold the tree mod lock in __tree_mod_log_rewind Btrfs: make backref walking code handle skinny metadata Btrfs: fix crash regarding to ulist_add_merge Btrfs: fix several potential problems in copy_nocow_pages_for_inode Btrfs: cleanup the code of copy_nocow_pages_for_inode() Btrfs: fix oops when recovering the file data by scrub function Btrfs: make the chunk allocator completely tree lockless Btrfs: cleanup orphaned root orphan item Btrfs: fix wrong mirror number tuning Btrfs: cleanup redundant code in btrfs_submit_direct() Btrfs: remove btrfs_sector_sum structure Btrfs: check if we can nocow if we don't have data space Btrfs: stop using try_to_writeback_inodes_sb_nr to flush delalloc Btrfs: use a percpu to keep track of possibly pinned bytes Btrfs: check for actual acls rather than just xattrs when caching no acl Btrfs: move btrfs_truncate_page to btrfs_cont_expand instead of btrfs_truncate Btrfs: optimize reada_for_balance Btrfs: optimize read_block_for_search ...
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/backref.c72
-rw-r--r--fs/btrfs/backref.h2
-rw-r--r--fs/btrfs/ctree.c118
-rw-r--r--fs/btrfs/ctree.h105
-rw-r--r--fs/btrfs/delayed-inode.c14
-rw-r--r--fs/btrfs/dev-replace.c6
-rw-r--r--fs/btrfs/disk-io.c483
-rw-r--r--fs/btrfs/disk-io.h32
-rw-r--r--fs/btrfs/export.c5
-rw-r--r--fs/btrfs/extent-tree.c315
-rw-r--r--fs/btrfs/extent_io.c41
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/file-item.c144
-rw-r--r--fs/btrfs/file.c150
-rw-r--r--fs/btrfs/free-space-cache.c103
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/inode.c501
-rw-r--r--fs/btrfs/ioctl.c74
-rw-r--r--fs/btrfs/lzo.c4
-rw-r--r--fs/btrfs/ordered-data.c128
-rw-r--r--fs/btrfs/ordered-data.h27
-rw-r--r--fs/btrfs/qgroup.c283
-rw-r--r--fs/btrfs/relocation.c102
-rw-r--r--fs/btrfs/root-tree.c201
-rw-r--r--fs/btrfs/scrub.c90
-rw-r--r--fs/btrfs/send.c235
-rw-r--r--fs/btrfs/super.c25
-rw-r--r--fs/btrfs/transaction.c322
-rw-r--r--fs/btrfs/transaction.h50
-rw-r--r--fs/btrfs/tree-log.c41
-rw-r--r--fs/btrfs/ulist.c15
-rw-r--r--fs/btrfs/version.h4
-rw-r--r--fs/btrfs/volumes.c351
-rw-r--r--fs/btrfs/volumes.h7
34 files changed, 2327 insertions, 1726 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 290e347b6db3..eaf133384a8f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -255,13 +255,11 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
255 * to a logical address 255 * to a logical address
256 */ 256 */
257static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, 257static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
258 int search_commit_root, 258 struct btrfs_path *path, u64 time_seq,
259 u64 time_seq, 259 struct __prelim_ref *ref,
260 struct __prelim_ref *ref, 260 struct ulist *parents,
261 struct ulist *parents, 261 const u64 *extent_item_pos)
262 const u64 *extent_item_pos)
263{ 262{
264 struct btrfs_path *path;
265 struct btrfs_root *root; 263 struct btrfs_root *root;
266 struct btrfs_key root_key; 264 struct btrfs_key root_key;
267 struct extent_buffer *eb; 265 struct extent_buffer *eb;
@@ -269,11 +267,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
269 int root_level; 267 int root_level;
270 int level = ref->level; 268 int level = ref->level;
271 269
272 path = btrfs_alloc_path();
273 if (!path)
274 return -ENOMEM;
275 path->search_commit_root = !!search_commit_root;
276
277 root_key.objectid = ref->root_id; 270 root_key.objectid = ref->root_id;
278 root_key.type = BTRFS_ROOT_ITEM_KEY; 271 root_key.type = BTRFS_ROOT_ITEM_KEY;
279 root_key.offset = (u64)-1; 272 root_key.offset = (u64)-1;
@@ -314,7 +307,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
314 time_seq, ref->wanted_disk_byte, 307 time_seq, ref->wanted_disk_byte,
315 extent_item_pos); 308 extent_item_pos);
316out: 309out:
317 btrfs_free_path(path); 310 path->lowest_level = 0;
311 btrfs_release_path(path);
318 return ret; 312 return ret;
319} 313}
320 314
@@ -322,7 +316,7 @@ out:
322 * resolve all indirect backrefs from the list 316 * resolve all indirect backrefs from the list
323 */ 317 */
324static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, 318static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
325 int search_commit_root, u64 time_seq, 319 struct btrfs_path *path, u64 time_seq,
326 struct list_head *head, 320 struct list_head *head,
327 const u64 *extent_item_pos) 321 const u64 *extent_item_pos)
328{ 322{
@@ -349,9 +343,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
349 continue; 343 continue;
350 if (ref->count == 0) 344 if (ref->count == 0)
351 continue; 345 continue;
352 err = __resolve_indirect_ref(fs_info, search_commit_root, 346 err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
353 time_seq, ref, parents, 347 parents, extent_item_pos);
354 extent_item_pos);
355 if (err == -ENOMEM) 348 if (err == -ENOMEM)
356 goto out; 349 goto out;
357 if (err) 350 if (err)
@@ -604,6 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
604 int slot; 597 int slot;
605 struct extent_buffer *leaf; 598 struct extent_buffer *leaf;
606 struct btrfs_key key; 599 struct btrfs_key key;
600 struct btrfs_key found_key;
607 unsigned long ptr; 601 unsigned long ptr;
608 unsigned long end; 602 unsigned long end;
609 struct btrfs_extent_item *ei; 603 struct btrfs_extent_item *ei;
@@ -621,17 +615,21 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
621 615
622 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); 616 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
623 flags = btrfs_extent_flags(leaf, ei); 617 flags = btrfs_extent_flags(leaf, ei);
618 btrfs_item_key_to_cpu(leaf, &found_key, slot);
624 619
625 ptr = (unsigned long)(ei + 1); 620 ptr = (unsigned long)(ei + 1);
626 end = (unsigned long)ei + item_size; 621 end = (unsigned long)ei + item_size;
627 622
628 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 623 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
624 flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
629 struct btrfs_tree_block_info *info; 625 struct btrfs_tree_block_info *info;
630 626
631 info = (struct btrfs_tree_block_info *)ptr; 627 info = (struct btrfs_tree_block_info *)ptr;
632 *info_level = btrfs_tree_block_level(leaf, info); 628 *info_level = btrfs_tree_block_level(leaf, info);
633 ptr += sizeof(struct btrfs_tree_block_info); 629 ptr += sizeof(struct btrfs_tree_block_info);
634 BUG_ON(ptr > end); 630 BUG_ON(ptr > end);
631 } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
632 *info_level = found_key.offset;
635 } else { 633 } else {
636 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); 634 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
637 } 635 }
@@ -795,7 +793,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
795 struct btrfs_delayed_ref_head *head; 793 struct btrfs_delayed_ref_head *head;
796 int info_level = 0; 794 int info_level = 0;
797 int ret; 795 int ret;
798 int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
799 struct list_head prefs_delayed; 796 struct list_head prefs_delayed;
800 struct list_head prefs; 797 struct list_head prefs;
801 struct __prelim_ref *ref; 798 struct __prelim_ref *ref;
@@ -804,13 +801,17 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
804 INIT_LIST_HEAD(&prefs_delayed); 801 INIT_LIST_HEAD(&prefs_delayed);
805 802
806 key.objectid = bytenr; 803 key.objectid = bytenr;
807 key.type = BTRFS_EXTENT_ITEM_KEY;
808 key.offset = (u64)-1; 804 key.offset = (u64)-1;
805 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
806 key.type = BTRFS_METADATA_ITEM_KEY;
807 else
808 key.type = BTRFS_EXTENT_ITEM_KEY;
809 809
810 path = btrfs_alloc_path(); 810 path = btrfs_alloc_path();
811 if (!path) 811 if (!path)
812 return -ENOMEM; 812 return -ENOMEM;
813 path->search_commit_root = !!search_commit_root; 813 if (!trans)
814 path->search_commit_root = 1;
814 815
815 /* 816 /*
816 * grab both a lock on the path and a lock on the delayed ref head. 817 * grab both a lock on the path and a lock on the delayed ref head.
@@ -825,7 +826,7 @@ again:
825 goto out; 826 goto out;
826 BUG_ON(ret == 0); 827 BUG_ON(ret == 0);
827 828
828 if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) { 829 if (trans) {
829 /* 830 /*
830 * look if there are updates for this ref queued and lock the 831 * look if there are updates for this ref queued and lock the
831 * head 832 * head
@@ -869,7 +870,8 @@ again:
869 slot = path->slots[0]; 870 slot = path->slots[0];
870 btrfs_item_key_to_cpu(leaf, &key, slot); 871 btrfs_item_key_to_cpu(leaf, &key, slot);
871 if (key.objectid == bytenr && 872 if (key.objectid == bytenr &&
872 key.type == BTRFS_EXTENT_ITEM_KEY) { 873 (key.type == BTRFS_EXTENT_ITEM_KEY ||
874 key.type == BTRFS_METADATA_ITEM_KEY)) {
873 ret = __add_inline_refs(fs_info, path, bytenr, 875 ret = __add_inline_refs(fs_info, path, bytenr,
874 &info_level, &prefs); 876 &info_level, &prefs);
875 if (ret) 877 if (ret)
@@ -890,8 +892,8 @@ again:
890 892
891 __merge_refs(&prefs, 1); 893 __merge_refs(&prefs, 1);
892 894
893 ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq, 895 ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
894 &prefs, extent_item_pos); 896 extent_item_pos);
895 if (ret) 897 if (ret)
896 goto out; 898 goto out;
897 899
@@ -1283,12 +1285,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1283{ 1285{
1284 int ret; 1286 int ret;
1285 u64 flags; 1287 u64 flags;
1288 u64 size = 0;
1286 u32 item_size; 1289 u32 item_size;
1287 struct extent_buffer *eb; 1290 struct extent_buffer *eb;
1288 struct btrfs_extent_item *ei; 1291 struct btrfs_extent_item *ei;
1289 struct btrfs_key key; 1292 struct btrfs_key key;
1290 1293
1291 key.type = BTRFS_EXTENT_ITEM_KEY; 1294 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1295 key.type = BTRFS_METADATA_ITEM_KEY;
1296 else
1297 key.type = BTRFS_EXTENT_ITEM_KEY;
1292 key.objectid = logical; 1298 key.objectid = logical;
1293 key.offset = (u64)-1; 1299 key.offset = (u64)-1;
1294 1300
@@ -1301,9 +1307,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1301 return ret; 1307 return ret;
1302 1308
1303 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); 1309 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
1304 if (found_key->type != BTRFS_EXTENT_ITEM_KEY || 1310 if (found_key->type == BTRFS_METADATA_ITEM_KEY)
1311 size = fs_info->extent_root->leafsize;
1312 else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
1313 size = found_key->offset;
1314
1315 if ((found_key->type != BTRFS_EXTENT_ITEM_KEY &&
1316 found_key->type != BTRFS_METADATA_ITEM_KEY) ||
1305 found_key->objectid > logical || 1317 found_key->objectid > logical ||
1306 found_key->objectid + found_key->offset <= logical) { 1318 found_key->objectid + size <= logical) {
1307 pr_debug("logical %llu is not within any extent\n", 1319 pr_debug("logical %llu is not within any extent\n",
1308 (unsigned long long)logical); 1320 (unsigned long long)logical);
1309 return -ENOENT; 1321 return -ENOENT;
@@ -1459,7 +1471,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1459 iterate_extent_inodes_t *iterate, void *ctx) 1471 iterate_extent_inodes_t *iterate, void *ctx)
1460{ 1472{
1461 int ret; 1473 int ret;
1462 struct btrfs_trans_handle *trans; 1474 struct btrfs_trans_handle *trans = NULL;
1463 struct ulist *refs = NULL; 1475 struct ulist *refs = NULL;
1464 struct ulist *roots = NULL; 1476 struct ulist *roots = NULL;
1465 struct ulist_node *ref_node = NULL; 1477 struct ulist_node *ref_node = NULL;
@@ -1471,9 +1483,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1471 pr_debug("resolving all inodes for extent %llu\n", 1483 pr_debug("resolving all inodes for extent %llu\n",
1472 extent_item_objectid); 1484 extent_item_objectid);
1473 1485
1474 if (search_commit_root) { 1486 if (!search_commit_root) {
1475 trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
1476 } else {
1477 trans = btrfs_join_transaction(fs_info->extent_root); 1487 trans = btrfs_join_transaction(fs_info->extent_root);
1478 if (IS_ERR(trans)) 1488 if (IS_ERR(trans))
1479 return PTR_ERR(trans); 1489 return PTR_ERR(trans);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 0f446d7ca2c0..8f2e76702932 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -23,8 +23,6 @@
23#include "ulist.h" 23#include "ulist.h"
24#include "extent_io.h" 24#include "extent_io.h"
25 25
26#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
27
28struct inode_fs_paths { 26struct inode_fs_paths {
29 struct btrfs_path *btrfs_path; 27 struct btrfs_path *btrfs_path;
30 struct btrfs_root *fs_root; 28 struct btrfs_root *fs_root;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 17dffe33e8d0..5bf4c39e2ad6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1089,7 +1089,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1089 btrfs_set_node_ptr_generation(parent, parent_slot, 1089 btrfs_set_node_ptr_generation(parent, parent_slot,
1090 trans->transid); 1090 trans->transid);
1091 btrfs_mark_buffer_dirty(parent); 1091 btrfs_mark_buffer_dirty(parent);
1092 tree_mod_log_free_eb(root->fs_info, buf); 1092 if (last_ref)
1093 tree_mod_log_free_eb(root->fs_info, buf);
1093 btrfs_free_tree_block(trans, root, buf, parent_start, 1094 btrfs_free_tree_block(trans, root, buf, parent_start,
1094 last_ref); 1095 last_ref);
1095 } 1096 }
@@ -1161,8 +1162,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1161 * time_seq). 1162 * time_seq).
1162 */ 1163 */
1163static void 1164static void
1164__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, 1165__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1165 struct tree_mod_elem *first_tm) 1166 u64 time_seq, struct tree_mod_elem *first_tm)
1166{ 1167{
1167 u32 n; 1168 u32 n;
1168 struct rb_node *next; 1169 struct rb_node *next;
@@ -1172,6 +1173,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1172 unsigned long p_size = sizeof(struct btrfs_key_ptr); 1173 unsigned long p_size = sizeof(struct btrfs_key_ptr);
1173 1174
1174 n = btrfs_header_nritems(eb); 1175 n = btrfs_header_nritems(eb);
1176 tree_mod_log_read_lock(fs_info);
1175 while (tm && tm->seq >= time_seq) { 1177 while (tm && tm->seq >= time_seq) {
1176 /* 1178 /*
1177 * all the operations are recorded with the operator used for 1179 * all the operations are recorded with the operator used for
@@ -1226,6 +1228,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1226 if (tm->index != first_tm->index) 1228 if (tm->index != first_tm->index)
1227 break; 1229 break;
1228 } 1230 }
1231 tree_mod_log_read_unlock(fs_info);
1229 btrfs_set_header_nritems(eb, n); 1232 btrfs_set_header_nritems(eb, n);
1230} 1233}
1231 1234
@@ -1274,7 +1277,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1274 1277
1275 extent_buffer_get(eb_rewin); 1278 extent_buffer_get(eb_rewin);
1276 btrfs_tree_read_lock(eb_rewin); 1279 btrfs_tree_read_lock(eb_rewin);
1277 __tree_mod_log_rewind(eb_rewin, time_seq, tm); 1280 __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1278 WARN_ON(btrfs_header_nritems(eb_rewin) > 1281 WARN_ON(btrfs_header_nritems(eb_rewin) >
1279 BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root)); 1282 BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
1280 1283
@@ -1350,7 +1353,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
1350 btrfs_set_header_generation(eb, old_generation); 1353 btrfs_set_header_generation(eb, old_generation);
1351 } 1354 }
1352 if (tm) 1355 if (tm)
1353 __tree_mod_log_rewind(eb, time_seq, tm); 1356 __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
1354 else 1357 else
1355 WARN_ON(btrfs_header_level(eb) != 0); 1358 WARN_ON(btrfs_header_level(eb) != 0);
1356 WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); 1359 WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
@@ -2178,12 +2181,8 @@ static void reada_for_search(struct btrfs_root *root,
2178 } 2181 }
2179} 2182}
2180 2183
2181/* 2184static noinline void reada_for_balance(struct btrfs_root *root,
2182 * returns -EAGAIN if it had to drop the path, or zero if everything was in 2185 struct btrfs_path *path, int level)
2183 * cache
2184 */
2185static noinline int reada_for_balance(struct btrfs_root *root,
2186 struct btrfs_path *path, int level)
2187{ 2186{
2188 int slot; 2187 int slot;
2189 int nritems; 2188 int nritems;
@@ -2192,12 +2191,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
2192 u64 gen; 2191 u64 gen;
2193 u64 block1 = 0; 2192 u64 block1 = 0;
2194 u64 block2 = 0; 2193 u64 block2 = 0;
2195 int ret = 0;
2196 int blocksize; 2194 int blocksize;
2197 2195
2198 parent = path->nodes[level + 1]; 2196 parent = path->nodes[level + 1];
2199 if (!parent) 2197 if (!parent)
2200 return 0; 2198 return;
2201 2199
2202 nritems = btrfs_header_nritems(parent); 2200 nritems = btrfs_header_nritems(parent);
2203 slot = path->slots[level + 1]; 2201 slot = path->slots[level + 1];
@@ -2224,28 +2222,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
2224 block2 = 0; 2222 block2 = 0;
2225 free_extent_buffer(eb); 2223 free_extent_buffer(eb);
2226 } 2224 }
2227 if (block1 || block2) {
2228 ret = -EAGAIN;
2229
2230 /* release the whole path */
2231 btrfs_release_path(path);
2232
2233 /* read the blocks */
2234 if (block1)
2235 readahead_tree_block(root, block1, blocksize, 0);
2236 if (block2)
2237 readahead_tree_block(root, block2, blocksize, 0);
2238 2225
2239 if (block1) { 2226 if (block1)
2240 eb = read_tree_block(root, block1, blocksize, 0); 2227 readahead_tree_block(root, block1, blocksize, 0);
2241 free_extent_buffer(eb); 2228 if (block2)
2242 } 2229 readahead_tree_block(root, block2, blocksize, 0);
2243 if (block2) {
2244 eb = read_tree_block(root, block2, blocksize, 0);
2245 free_extent_buffer(eb);
2246 }
2247 }
2248 return ret;
2249} 2230}
2250 2231
2251 2232
@@ -2359,35 +2340,28 @@ read_block_for_search(struct btrfs_trans_handle *trans,
2359 tmp = btrfs_find_tree_block(root, blocknr, blocksize); 2340 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
2360 if (tmp) { 2341 if (tmp) {
2361 /* first we do an atomic uptodate check */ 2342 /* first we do an atomic uptodate check */
2362 if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) { 2343 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2363 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { 2344 *eb_ret = tmp;
2364 /* 2345 return 0;
2365 * we found an up to date block without 2346 }
2366 * sleeping, return
2367 * right away
2368 */
2369 *eb_ret = tmp;
2370 return 0;
2371 }
2372 /* the pages were up to date, but we failed
2373 * the generation number check. Do a full
2374 * read for the generation number that is correct.
2375 * We must do this without dropping locks so
2376 * we can trust our generation number
2377 */
2378 free_extent_buffer(tmp);
2379 btrfs_set_path_blocking(p);
2380 2347
2381 /* now we're allowed to do a blocking uptodate check */ 2348 /* the pages were up to date, but we failed
2382 tmp = read_tree_block(root, blocknr, blocksize, gen); 2349 * the generation number check. Do a full
2383 if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) { 2350 * read for the generation number that is correct.
2384 *eb_ret = tmp; 2351 * We must do this without dropping locks so
2385 return 0; 2352 * we can trust our generation number
2386 } 2353 */
2387 free_extent_buffer(tmp); 2354 btrfs_set_path_blocking(p);
2388 btrfs_release_path(p); 2355
2389 return -EIO; 2356 /* now we're allowed to do a blocking uptodate check */
2357 ret = btrfs_read_buffer(tmp, gen);
2358 if (!ret) {
2359 *eb_ret = tmp;
2360 return 0;
2390 } 2361 }
2362 free_extent_buffer(tmp);
2363 btrfs_release_path(p);
2364 return -EIO;
2391 } 2365 }
2392 2366
2393 /* 2367 /*
@@ -2448,11 +2422,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
2448 goto again; 2422 goto again;
2449 } 2423 }
2450 2424
2451 sret = reada_for_balance(root, p, level);
2452 if (sret)
2453 goto again;
2454
2455 btrfs_set_path_blocking(p); 2425 btrfs_set_path_blocking(p);
2426 reada_for_balance(root, p, level);
2456 sret = split_node(trans, root, p, level); 2427 sret = split_node(trans, root, p, level);
2457 btrfs_clear_path_blocking(p, NULL, 0); 2428 btrfs_clear_path_blocking(p, NULL, 0);
2458 2429
@@ -2472,11 +2443,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
2472 goto again; 2443 goto again;
2473 } 2444 }
2474 2445
2475 sret = reada_for_balance(root, p, level);
2476 if (sret)
2477 goto again;
2478
2479 btrfs_set_path_blocking(p); 2446 btrfs_set_path_blocking(p);
2447 reada_for_balance(root, p, level);
2480 sret = balance_level(trans, root, p, level); 2448 sret = balance_level(trans, root, p, level);
2481 btrfs_clear_path_blocking(p, NULL, 0); 2449 btrfs_clear_path_blocking(p, NULL, 0);
2482 2450
@@ -3143,7 +3111,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
3143 */ 3111 */
3144static noinline int insert_new_root(struct btrfs_trans_handle *trans, 3112static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3145 struct btrfs_root *root, 3113 struct btrfs_root *root,
3146 struct btrfs_path *path, int level, int log_removal) 3114 struct btrfs_path *path, int level)
3147{ 3115{
3148 u64 lower_gen; 3116 u64 lower_gen;
3149 struct extent_buffer *lower; 3117 struct extent_buffer *lower;
@@ -3194,7 +3162,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3194 btrfs_mark_buffer_dirty(c); 3162 btrfs_mark_buffer_dirty(c);
3195 3163
3196 old = root->node; 3164 old = root->node;
3197 tree_mod_log_set_root_pointer(root, c, log_removal); 3165 tree_mod_log_set_root_pointer(root, c, 0);
3198 rcu_assign_pointer(root->node, c); 3166 rcu_assign_pointer(root->node, c);
3199 3167
3200 /* the super has an extra ref to root->node */ 3168 /* the super has an extra ref to root->node */
@@ -3278,14 +3246,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
3278 /* 3246 /*
3279 * trying to split the root, lets make a new one 3247 * trying to split the root, lets make a new one
3280 * 3248 *
3281 * tree mod log: We pass 0 as log_removal parameter to 3249 * tree mod log: We don't log_removal old root in
3282 * insert_new_root, because that root buffer will be kept as a 3250 * insert_new_root, because that root buffer will be kept as a
3283 * normal node. We are going to log removal of half of the 3251 * normal node. We are going to log removal of half of the
3284 * elements below with tree_mod_log_eb_copy. We're holding a 3252 * elements below with tree_mod_log_eb_copy. We're holding a
3285 * tree lock on the buffer, which is why we cannot race with 3253 * tree lock on the buffer, which is why we cannot race with
3286 * other tree_mod_log users. 3254 * other tree_mod_log users.
3287 */ 3255 */
3288 ret = insert_new_root(trans, root, path, level + 1, 0); 3256 ret = insert_new_root(trans, root, path, level + 1);
3289 if (ret) 3257 if (ret)
3290 return ret; 3258 return ret;
3291 } else { 3259 } else {
@@ -3986,7 +3954,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
3986 return -EOVERFLOW; 3954 return -EOVERFLOW;
3987 3955
3988 /* first try to make some room by pushing left and right */ 3956 /* first try to make some room by pushing left and right */
3989 if (data_size) { 3957 if (data_size && path->nodes[1]) {
3990 wret = push_leaf_right(trans, root, path, data_size, 3958 wret = push_leaf_right(trans, root, path, data_size,
3991 data_size, 0, 0); 3959 data_size, 0, 0);
3992 if (wret < 0) 3960 if (wret < 0)
@@ -4005,7 +3973,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
4005 } 3973 }
4006 3974
4007 if (!path->nodes[1]) { 3975 if (!path->nodes[1]) {
4008 ret = insert_new_root(trans, root, path, 1, 1); 3976 ret = insert_new_root(trans, root, path, 1);
4009 if (ret) 3977 if (ret)
4010 return ret; 3978 return ret;
4011 } 3979 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d6dd49b51ba8..e795bf135e80 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -961,8 +961,8 @@ struct btrfs_dev_replace_item {
961#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) 961#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
962#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) 962#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
963#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) 963#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
964#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7) 964#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
965#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8) 965#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
966#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE 966#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
967 967
968enum btrfs_raid_types { 968enum btrfs_raid_types {
@@ -1102,6 +1102,18 @@ struct btrfs_space_info {
1102 account */ 1102 account */
1103 1103
1104 /* 1104 /*
1105 * bytes_pinned is kept in line with what is actually pinned, as in
1106 * we've called update_block_group and dropped the bytes_used counter
1107 * and increased the bytes_pinned counter. However this means that
1108 * bytes_pinned does not reflect the bytes that will be pinned once the
1109 * delayed refs are flushed, so this counter is inc'ed everytime we call
1110 * btrfs_free_extent so it is a realtime count of what will be freed
1111 * once the transaction is committed. It will be zero'ed everytime the
1112 * transaction commits.
1113 */
1114 struct percpu_counter total_bytes_pinned;
1115
1116 /*
1105 * we bump reservation progress every time we decrement 1117 * we bump reservation progress every time we decrement
1106 * bytes_reserved. This way people waiting for reservations 1118 * bytes_reserved. This way people waiting for reservations
1107 * know something good has happened and they can check 1119 * know something good has happened and they can check
@@ -1437,25 +1449,22 @@ struct btrfs_fs_info {
1437 atomic_t open_ioctl_trans; 1449 atomic_t open_ioctl_trans;
1438 1450
1439 /* 1451 /*
1440 * this is used by the balancing code to wait for all the pending 1452 * this is used to protect the following list -- ordered_roots.
1441 * ordered extents
1442 */ 1453 */
1443 spinlock_t ordered_extent_lock; 1454 spinlock_t ordered_root_lock;
1444 1455
1445 /* 1456 /*
1446 * all of the data=ordered extents pending writeback 1457 * all fs/file tree roots in which there are data=ordered extents
1458 * pending writeback are added into this list.
1459 *
1447 * these can span multiple transactions and basically include 1460 * these can span multiple transactions and basically include
1448 * every dirty data page that isn't from nodatacow 1461 * every dirty data page that isn't from nodatacow
1449 */ 1462 */
1450 struct list_head ordered_extents; 1463 struct list_head ordered_roots;
1451 1464
1452 spinlock_t delalloc_lock; 1465 spinlock_t delalloc_root_lock;
1453 /* 1466 /* all fs/file tree roots that have delalloc inodes. */
1454 * all of the inodes that have delalloc bytes. It is possible for 1467 struct list_head delalloc_roots;
1455 * this list to be empty even when there is still dirty data=ordered
1456 * extents waiting to finish IO.
1457 */
1458 struct list_head delalloc_inodes;
1459 1468
1460 /* 1469 /*
1461 * there is a pool of worker threads for checksumming during writes 1470 * there is a pool of worker threads for checksumming during writes
@@ -1498,8 +1507,6 @@ struct btrfs_fs_info {
1498 int do_barriers; 1507 int do_barriers;
1499 int closing; 1508 int closing;
1500 int log_root_recovering; 1509 int log_root_recovering;
1501 int enospc_unlink;
1502 int trans_no_join;
1503 1510
1504 u64 total_pinned; 1511 u64 total_pinned;
1505 1512
@@ -1594,6 +1601,12 @@ struct btrfs_fs_info {
1594 struct rb_root qgroup_tree; 1601 struct rb_root qgroup_tree;
1595 spinlock_t qgroup_lock; 1602 spinlock_t qgroup_lock;
1596 1603
1604 /*
1605 * used to avoid frequently calling ulist_alloc()/ulist_free()
1606 * when doing qgroup accounting, it must be protected by qgroup_lock.
1607 */
1608 struct ulist *qgroup_ulist;
1609
1597 /* protect user change for quota operations */ 1610 /* protect user change for quota operations */
1598 struct mutex qgroup_ioctl_lock; 1611 struct mutex qgroup_ioctl_lock;
1599 1612
@@ -1607,6 +1620,8 @@ struct btrfs_fs_info {
1607 struct mutex qgroup_rescan_lock; /* protects the progress item */ 1620 struct mutex qgroup_rescan_lock; /* protects the progress item */
1608 struct btrfs_key qgroup_rescan_progress; 1621 struct btrfs_key qgroup_rescan_progress;
1609 struct btrfs_workers qgroup_rescan_workers; 1622 struct btrfs_workers qgroup_rescan_workers;
1623 struct completion qgroup_rescan_completion;
1624 struct btrfs_work qgroup_rescan_work;
1610 1625
1611 /* filesystem state */ 1626 /* filesystem state */
1612 unsigned long fs_state; 1627 unsigned long fs_state;
@@ -1739,6 +1754,31 @@ struct btrfs_root {
1739 int force_cow; 1754 int force_cow;
1740 1755
1741 spinlock_t root_item_lock; 1756 spinlock_t root_item_lock;
1757 atomic_t refs;
1758
1759 spinlock_t delalloc_lock;
1760 /*
1761 * all of the inodes that have delalloc bytes. It is possible for
1762 * this list to be empty even when there is still dirty data=ordered
1763 * extents waiting to finish IO.
1764 */
1765 struct list_head delalloc_inodes;
1766 struct list_head delalloc_root;
1767 u64 nr_delalloc_inodes;
1768 /*
1769 * this is used by the balancing code to wait for all the pending
1770 * ordered extents
1771 */
1772 spinlock_t ordered_extent_lock;
1773
1774 /*
1775 * all of the data=ordered extents pending writeback
1776 * these can span multiple transactions and basically include
1777 * every dirty data page that isn't from nodatacow
1778 */
1779 struct list_head ordered_extents;
1780 struct list_head ordered_root;
1781 u64 nr_ordered_extents;
1742}; 1782};
1743 1783
1744struct btrfs_ioctl_defrag_range_args { 1784struct btrfs_ioctl_defrag_range_args {
@@ -3028,6 +3068,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
3028 num_items; 3068 num_items;
3029} 3069}
3030 3070
3071int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
3072 struct btrfs_root *root);
3031void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 3073void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
3032int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 3074int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
3033 struct btrfs_root *root, unsigned long count); 3075 struct btrfs_root *root, unsigned long count);
@@ -3039,6 +3081,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
3039 u64 bytenr, u64 num, int reserved); 3081 u64 bytenr, u64 num, int reserved);
3040int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, 3082int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
3041 u64 bytenr, u64 num_bytes); 3083 u64 bytenr, u64 num_bytes);
3084int btrfs_exclude_logged_extents(struct btrfs_root *root,
3085 struct extent_buffer *eb);
3042int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 3086int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
3043 struct btrfs_root *root, 3087 struct btrfs_root *root,
3044 u64 objectid, u64 offset, u64 bytenr); 3088 u64 objectid, u64 offset, u64 bytenr);
@@ -3155,6 +3199,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
3155int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3199int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3156 struct btrfs_block_rsv *dst_rsv, 3200 struct btrfs_block_rsv *dst_rsv,
3157 u64 num_bytes); 3201 u64 num_bytes);
3202int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
3203 struct btrfs_block_rsv *dest, u64 num_bytes,
3204 int min_factor);
3158void btrfs_block_rsv_release(struct btrfs_root *root, 3205void btrfs_block_rsv_release(struct btrfs_root *root,
3159 struct btrfs_block_rsv *block_rsv, 3206 struct btrfs_block_rsv *block_rsv,
3160 u64 num_bytes); 3207 u64 num_bytes);
@@ -3311,6 +3358,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
3311 smp_mb(); 3358 smp_mb();
3312 return fs_info->closing; 3359 return fs_info->closing;
3313} 3360}
3361
3362/*
3363 * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
3364 * anything except sleeping. This function is used to check the status of
3365 * the fs.
3366 */
3367static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
3368{
3369 return (root->fs_info->sb->s_flags & MS_RDONLY ||
3370 btrfs_fs_closing(root->fs_info));
3371}
3372
3314static inline void free_fs_info(struct btrfs_fs_info *fs_info) 3373static inline void free_fs_info(struct btrfs_fs_info *fs_info)
3315{ 3374{
3316 kfree(fs_info->balance_ctl); 3375 kfree(fs_info->balance_ctl);
@@ -3357,9 +3416,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
3357 struct btrfs_root_item *item); 3416 struct btrfs_root_item *item);
3358void btrfs_read_root_item(struct extent_buffer *eb, int slot, 3417void btrfs_read_root_item(struct extent_buffer *eb, int slot,
3359 struct btrfs_root_item *item); 3418 struct btrfs_root_item *item);
3360int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct 3419int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
3361 btrfs_root_item *item, struct btrfs_key *key); 3420 struct btrfs_path *path, struct btrfs_root_item *root_item,
3362int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); 3421 struct btrfs_key *root_key);
3363int btrfs_find_orphan_roots(struct btrfs_root *tree_root); 3422int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
3364void btrfs_set_root_node(struct btrfs_root_item *item, 3423void btrfs_set_root_node(struct btrfs_root_item *item,
3365 struct extent_buffer *node); 3424 struct extent_buffer *node);
@@ -3493,6 +3552,10 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
3493struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 3552struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
3494 size_t pg_offset, u64 start, u64 len, 3553 size_t pg_offset, u64 start, u64 len,
3495 int create); 3554 int create);
3555noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
3556 struct inode *inode, u64 offset, u64 *len,
3557 u64 *orig_start, u64 *orig_block_len,
3558 u64 *ram_bytes);
3496 3559
3497/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ 3560/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
3498#if defined(ClearPageFsMisc) && !defined(ClearPageChecked) 3561#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -3530,6 +3593,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3530 u32 min_type); 3593 u32 min_type);
3531 3594
3532int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 3595int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
3596int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
3597 int delay_iput);
3533int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 3598int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
3534 struct extent_state **cached_state); 3599 struct extent_state **cached_state);
3535int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 3600int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -3814,6 +3879,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
3814int btrfs_quota_disable(struct btrfs_trans_handle *trans, 3879int btrfs_quota_disable(struct btrfs_trans_handle *trans,
3815 struct btrfs_fs_info *fs_info); 3880 struct btrfs_fs_info *fs_info);
3816int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); 3881int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
3882void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
3883int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
3817int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, 3884int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
3818 struct btrfs_fs_info *fs_info, u64 src, u64 dst); 3885 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
3819int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, 3886int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index eb34438ddedb..375510913fe7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -535,20 +535,6 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
535 return next; 535 return next;
536} 536}
537 537
538static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
539 u64 root_id)
540{
541 struct btrfs_key root_key;
542
543 if (root->objectid == root_id)
544 return root;
545
546 root_key.objectid = root_id;
547 root_key.type = BTRFS_ROOT_ITEM_KEY;
548 root_key.offset = (u64)-1;
549 return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
550}
551
552static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, 538static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
553 struct btrfs_root *root, 539 struct btrfs_root *root,
554 struct btrfs_delayed_item *item) 540 struct btrfs_delayed_item *item)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 65241f32d3f8..4253ad580e39 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -400,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
400 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 400 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
401 btrfs_dev_replace_unlock(dev_replace); 401 btrfs_dev_replace_unlock(dev_replace);
402 402
403 btrfs_wait_ordered_extents(root, 0); 403 btrfs_wait_all_ordered_extents(root->fs_info, 0);
404 404
405 /* force writing the updated state information to disk */ 405 /* force writing the updated state information to disk */
406 trans = btrfs_start_transaction(root, 0); 406 trans = btrfs_start_transaction(root, 0);
@@ -470,12 +470,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
470 * flush all outstanding I/O and inode extent mappings before the 470 * flush all outstanding I/O and inode extent mappings before the
471 * copy operation is declared as being finished 471 * copy operation is declared as being finished
472 */ 472 */
473 ret = btrfs_start_delalloc_inodes(root, 0); 473 ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
474 if (ret) { 474 if (ret) {
475 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 475 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
476 return ret; 476 return ret;
477 } 477 }
478 btrfs_wait_ordered_extents(root, 0); 478 btrfs_wait_all_ordered_extents(root->fs_info, 0);
479 479
480 trans = btrfs_start_transaction(root, 0); 480 trans = btrfs_start_transaction(root, 0);
481 if (IS_ERR(trans)) { 481 if (IS_ERR(trans)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0292b3ead54..6b092a1c4e37 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1192,6 +1192,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1192 root->objectid = objectid; 1192 root->objectid = objectid;
1193 root->last_trans = 0; 1193 root->last_trans = 0;
1194 root->highest_objectid = 0; 1194 root->highest_objectid = 0;
1195 root->nr_delalloc_inodes = 0;
1196 root->nr_ordered_extents = 0;
1195 root->name = NULL; 1197 root->name = NULL;
1196 root->inode_tree = RB_ROOT; 1198 root->inode_tree = RB_ROOT;
1197 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); 1199 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
@@ -1200,10 +1202,16 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1200 1202
1201 INIT_LIST_HEAD(&root->dirty_list); 1203 INIT_LIST_HEAD(&root->dirty_list);
1202 INIT_LIST_HEAD(&root->root_list); 1204 INIT_LIST_HEAD(&root->root_list);
1205 INIT_LIST_HEAD(&root->delalloc_inodes);
1206 INIT_LIST_HEAD(&root->delalloc_root);
1207 INIT_LIST_HEAD(&root->ordered_extents);
1208 INIT_LIST_HEAD(&root->ordered_root);
1203 INIT_LIST_HEAD(&root->logged_list[0]); 1209 INIT_LIST_HEAD(&root->logged_list[0]);
1204 INIT_LIST_HEAD(&root->logged_list[1]); 1210 INIT_LIST_HEAD(&root->logged_list[1]);
1205 spin_lock_init(&root->orphan_lock); 1211 spin_lock_init(&root->orphan_lock);
1206 spin_lock_init(&root->inode_lock); 1212 spin_lock_init(&root->inode_lock);
1213 spin_lock_init(&root->delalloc_lock);
1214 spin_lock_init(&root->ordered_extent_lock);
1207 spin_lock_init(&root->accounting_lock); 1215 spin_lock_init(&root->accounting_lock);
1208 spin_lock_init(&root->log_extents_lock[0]); 1216 spin_lock_init(&root->log_extents_lock[0]);
1209 spin_lock_init(&root->log_extents_lock[1]); 1217 spin_lock_init(&root->log_extents_lock[1]);
@@ -1217,6 +1225,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1217 atomic_set(&root->log_writers, 0); 1225 atomic_set(&root->log_writers, 0);
1218 atomic_set(&root->log_batch, 0); 1226 atomic_set(&root->log_batch, 0);
1219 atomic_set(&root->orphan_inodes, 0); 1227 atomic_set(&root->orphan_inodes, 0);
1228 atomic_set(&root->refs, 1);
1220 root->log_transid = 0; 1229 root->log_transid = 0;
1221 root->last_log_commit = 0; 1230 root->last_log_commit = 0;
1222 extent_io_tree_init(&root->dirty_log_pages, 1231 extent_io_tree_init(&root->dirty_log_pages,
@@ -1235,39 +1244,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1235 spin_lock_init(&root->root_item_lock); 1244 spin_lock_init(&root->root_item_lock);
1236} 1245}
1237 1246
1238static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
1239 struct btrfs_fs_info *fs_info,
1240 u64 objectid,
1241 struct btrfs_root *root)
1242{
1243 int ret;
1244 u32 blocksize;
1245 u64 generation;
1246
1247 __setup_root(tree_root->nodesize, tree_root->leafsize,
1248 tree_root->sectorsize, tree_root->stripesize,
1249 root, fs_info, objectid);
1250 ret = btrfs_find_last_root(tree_root, objectid,
1251 &root->root_item, &root->root_key);
1252 if (ret > 0)
1253 return -ENOENT;
1254 else if (ret < 0)
1255 return ret;
1256
1257 generation = btrfs_root_generation(&root->root_item);
1258 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1259 root->commit_root = NULL;
1260 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1261 blocksize, generation);
1262 if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) {
1263 free_extent_buffer(root->node);
1264 root->node = NULL;
1265 return -EIO;
1266 }
1267 root->commit_root = btrfs_root_node(root);
1268 return 0;
1269}
1270
1271static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) 1247static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1272{ 1248{
1273 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); 1249 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
@@ -1452,70 +1428,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1452 return 0; 1428 return 0;
1453} 1429}
1454 1430
1455struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 1431struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1456 struct btrfs_key *location) 1432 struct btrfs_key *key)
1457{ 1433{
1458 struct btrfs_root *root; 1434 struct btrfs_root *root;
1459 struct btrfs_fs_info *fs_info = tree_root->fs_info; 1435 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1460 struct btrfs_path *path; 1436 struct btrfs_path *path;
1461 struct extent_buffer *l;
1462 u64 generation; 1437 u64 generation;
1463 u32 blocksize; 1438 u32 blocksize;
1464 int ret = 0; 1439 int ret;
1465 int slot;
1466 1440
1467 root = btrfs_alloc_root(fs_info); 1441 path = btrfs_alloc_path();
1468 if (!root) 1442 if (!path)
1469 return ERR_PTR(-ENOMEM); 1443 return ERR_PTR(-ENOMEM);
1470 if (location->offset == (u64)-1) { 1444
1471 ret = find_and_setup_root(tree_root, fs_info, 1445 root = btrfs_alloc_root(fs_info);
1472 location->objectid, root); 1446 if (!root) {
1473 if (ret) { 1447 ret = -ENOMEM;
1474 kfree(root); 1448 goto alloc_fail;
1475 return ERR_PTR(ret);
1476 }
1477 goto out;
1478 } 1449 }
1479 1450
1480 __setup_root(tree_root->nodesize, tree_root->leafsize, 1451 __setup_root(tree_root->nodesize, tree_root->leafsize,
1481 tree_root->sectorsize, tree_root->stripesize, 1452 tree_root->sectorsize, tree_root->stripesize,
1482 root, fs_info, location->objectid); 1453 root, fs_info, key->objectid);
1483 1454
1484 path = btrfs_alloc_path(); 1455 ret = btrfs_find_root(tree_root, key, path,
1485 if (!path) { 1456 &root->root_item, &root->root_key);
1486 kfree(root);
1487 return ERR_PTR(-ENOMEM);
1488 }
1489 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1490 if (ret == 0) {
1491 l = path->nodes[0];
1492 slot = path->slots[0];
1493 btrfs_read_root_item(l, slot, &root->root_item);
1494 memcpy(&root->root_key, location, sizeof(*location));
1495 }
1496 btrfs_free_path(path);
1497 if (ret) { 1457 if (ret) {
1498 kfree(root);
1499 if (ret > 0) 1458 if (ret > 0)
1500 ret = -ENOENT; 1459 ret = -ENOENT;
1501 return ERR_PTR(ret); 1460 goto find_fail;
1502 } 1461 }
1503 1462
1504 generation = btrfs_root_generation(&root->root_item); 1463 generation = btrfs_root_generation(&root->root_item);
1505 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1464 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1506 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1465 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1507 blocksize, generation); 1466 blocksize, generation);
1508 if (!root->node || !extent_buffer_uptodate(root->node)) { 1467 if (!root->node) {
1509 ret = (!root->node) ? -ENOMEM : -EIO; 1468 ret = -ENOMEM;
1510 1469 goto find_fail;
1511 free_extent_buffer(root->node); 1470 } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1512 kfree(root); 1471 ret = -EIO;
1513 return ERR_PTR(ret); 1472 goto read_fail;
1514 } 1473 }
1515
1516 root->commit_root = btrfs_root_node(root); 1474 root->commit_root = btrfs_root_node(root);
1517out: 1475out:
1518 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { 1476 btrfs_free_path(path);
1477 return root;
1478
1479read_fail:
1480 free_extent_buffer(root->node);
1481find_fail:
1482 kfree(root);
1483alloc_fail:
1484 root = ERR_PTR(ret);
1485 goto out;
1486}
1487
1488struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1489 struct btrfs_key *location)
1490{
1491 struct btrfs_root *root;
1492
1493 root = btrfs_read_tree_root(tree_root, location);
1494 if (IS_ERR(root))
1495 return root;
1496
1497 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
1519 root->ref_cows = 1; 1498 root->ref_cows = 1;
1520 btrfs_check_and_init_root_item(&root->root_item); 1499 btrfs_check_and_init_root_item(&root->root_item);
1521 } 1500 }
@@ -1523,6 +1502,66 @@ out:
1523 return root; 1502 return root;
1524} 1503}
1525 1504
1505int btrfs_init_fs_root(struct btrfs_root *root)
1506{
1507 int ret;
1508
1509 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1510 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
1511 GFP_NOFS);
1512 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1513 ret = -ENOMEM;
1514 goto fail;
1515 }
1516
1517 btrfs_init_free_ino_ctl(root);
1518 mutex_init(&root->fs_commit_mutex);
1519 spin_lock_init(&root->cache_lock);
1520 init_waitqueue_head(&root->cache_wait);
1521
1522 ret = get_anon_bdev(&root->anon_dev);
1523 if (ret)
1524 goto fail;
1525 return 0;
1526fail:
1527 kfree(root->free_ino_ctl);
1528 kfree(root->free_ino_pinned);
1529 return ret;
1530}
1531
1532struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1533 u64 root_id)
1534{
1535 struct btrfs_root *root;
1536
1537 spin_lock(&fs_info->fs_roots_radix_lock);
1538 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1539 (unsigned long)root_id);
1540 spin_unlock(&fs_info->fs_roots_radix_lock);
1541 return root;
1542}
1543
1544int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1545 struct btrfs_root *root)
1546{
1547 int ret;
1548
1549 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1550 if (ret)
1551 return ret;
1552
1553 spin_lock(&fs_info->fs_roots_radix_lock);
1554 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1555 (unsigned long)root->root_key.objectid,
1556 root);
1557 if (ret == 0)
1558 root->in_radix = 1;
1559 spin_unlock(&fs_info->fs_roots_radix_lock);
1560 radix_tree_preload_end();
1561
1562 return ret;
1563}
1564
1526struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 1565struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1527 struct btrfs_key *location) 1566 struct btrfs_key *location)
1528{ 1567{
@@ -1543,58 +1582,30 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1543 return fs_info->quota_root ? fs_info->quota_root : 1582 return fs_info->quota_root ? fs_info->quota_root :
1544 ERR_PTR(-ENOENT); 1583 ERR_PTR(-ENOENT);
1545again: 1584again:
1546 spin_lock(&fs_info->fs_roots_radix_lock); 1585 root = btrfs_lookup_fs_root(fs_info, location->objectid);
1547 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1548 (unsigned long)location->objectid);
1549 spin_unlock(&fs_info->fs_roots_radix_lock);
1550 if (root) 1586 if (root)
1551 return root; 1587 return root;
1552 1588
1553 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); 1589 root = btrfs_read_fs_root(fs_info->tree_root, location);
1554 if (IS_ERR(root)) 1590 if (IS_ERR(root))
1555 return root; 1591 return root;
1556 1592
1557 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); 1593 if (btrfs_root_refs(&root->root_item) == 0) {
1558 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), 1594 ret = -ENOENT;
1559 GFP_NOFS);
1560 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1561 ret = -ENOMEM;
1562 goto fail; 1595 goto fail;
1563 } 1596 }
1564 1597
1565 btrfs_init_free_ino_ctl(root); 1598 ret = btrfs_init_fs_root(root);
1566 mutex_init(&root->fs_commit_mutex);
1567 spin_lock_init(&root->cache_lock);
1568 init_waitqueue_head(&root->cache_wait);
1569
1570 ret = get_anon_bdev(&root->anon_dev);
1571 if (ret) 1599 if (ret)
1572 goto fail; 1600 goto fail;
1573 1601
1574 if (btrfs_root_refs(&root->root_item) == 0) {
1575 ret = -ENOENT;
1576 goto fail;
1577 }
1578
1579 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); 1602 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1580 if (ret < 0) 1603 if (ret < 0)
1581 goto fail; 1604 goto fail;
1582 if (ret == 0) 1605 if (ret == 0)
1583 root->orphan_item_inserted = 1; 1606 root->orphan_item_inserted = 1;
1584 1607
1585 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 1608 ret = btrfs_insert_fs_root(fs_info, root);
1586 if (ret)
1587 goto fail;
1588
1589 spin_lock(&fs_info->fs_roots_radix_lock);
1590 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1591 (unsigned long)root->root_key.objectid,
1592 root);
1593 if (ret == 0)
1594 root->in_radix = 1;
1595
1596 spin_unlock(&fs_info->fs_roots_radix_lock);
1597 radix_tree_preload_end();
1598 if (ret) { 1609 if (ret) {
1599 if (ret == -EEXIST) { 1610 if (ret == -EEXIST) {
1600 free_fs_root(root); 1611 free_fs_root(root);
@@ -1602,10 +1613,6 @@ again:
1602 } 1613 }
1603 goto fail; 1614 goto fail;
1604 } 1615 }
1605
1606 ret = btrfs_find_dead_roots(fs_info->tree_root,
1607 root->root_key.objectid);
1608 WARN_ON(ret);
1609 return root; 1616 return root;
1610fail: 1617fail:
1611 free_fs_root(root); 1618 free_fs_root(root);
@@ -1677,21 +1684,37 @@ static void end_workqueue_fn(struct btrfs_work *work)
1677static int cleaner_kthread(void *arg) 1684static int cleaner_kthread(void *arg)
1678{ 1685{
1679 struct btrfs_root *root = arg; 1686 struct btrfs_root *root = arg;
1687 int again;
1680 1688
1681 do { 1689 do {
1682 int again = 0; 1690 again = 0;
1683 1691
1684 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1692 /* Make the cleaner go to sleep early. */
1685 down_read_trylock(&root->fs_info->sb->s_umount)) { 1693 if (btrfs_need_cleaner_sleep(root))
1686 if (mutex_trylock(&root->fs_info->cleaner_mutex)) { 1694 goto sleep;
1687 btrfs_run_delayed_iputs(root); 1695
1688 again = btrfs_clean_one_deleted_snapshot(root); 1696 if (!mutex_trylock(&root->fs_info->cleaner_mutex))
1689 mutex_unlock(&root->fs_info->cleaner_mutex); 1697 goto sleep;
1690 } 1698
1691 btrfs_run_defrag_inodes(root->fs_info); 1699 /*
1692 up_read(&root->fs_info->sb->s_umount); 1700 * Avoid the problem that we change the status of the fs
1701 * during the above check and trylock.
1702 */
1703 if (btrfs_need_cleaner_sleep(root)) {
1704 mutex_unlock(&root->fs_info->cleaner_mutex);
1705 goto sleep;
1693 } 1706 }
1694 1707
1708 btrfs_run_delayed_iputs(root);
1709 again = btrfs_clean_one_deleted_snapshot(root);
1710 mutex_unlock(&root->fs_info->cleaner_mutex);
1711
1712 /*
1713 * The defragger has dealt with the R/O remount and umount,
1714 * needn't do anything special here.
1715 */
1716 btrfs_run_defrag_inodes(root->fs_info);
1717sleep:
1695 if (!try_to_freeze() && !again) { 1718 if (!try_to_freeze() && !again) {
1696 set_current_state(TASK_INTERRUPTIBLE); 1719 set_current_state(TASK_INTERRUPTIBLE);
1697 if (!kthread_should_stop()) 1720 if (!kthread_should_stop())
@@ -1725,7 +1748,7 @@ static int transaction_kthread(void *arg)
1725 } 1748 }
1726 1749
1727 now = get_seconds(); 1750 now = get_seconds();
1728 if (!cur->blocked && 1751 if (cur->state < TRANS_STATE_BLOCKED &&
1729 (now < cur->start_time || now - cur->start_time < 30)) { 1752 (now < cur->start_time || now - cur->start_time < 30)) {
1730 spin_unlock(&root->fs_info->trans_lock); 1753 spin_unlock(&root->fs_info->trans_lock);
1731 delay = HZ * 5; 1754 delay = HZ * 5;
@@ -2035,11 +2058,11 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2035 list_del(&gang[0]->root_list); 2058 list_del(&gang[0]->root_list);
2036 2059
2037 if (gang[0]->in_radix) { 2060 if (gang[0]->in_radix) {
2038 btrfs_free_fs_root(fs_info, gang[0]); 2061 btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2039 } else { 2062 } else {
2040 free_extent_buffer(gang[0]->node); 2063 free_extent_buffer(gang[0]->node);
2041 free_extent_buffer(gang[0]->commit_root); 2064 free_extent_buffer(gang[0]->commit_root);
2042 kfree(gang[0]); 2065 btrfs_put_fs_root(gang[0]);
2043 } 2066 }
2044 } 2067 }
2045 2068
@@ -2050,7 +2073,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2050 if (!ret) 2073 if (!ret)
2051 break; 2074 break;
2052 for (i = 0; i < ret; i++) 2075 for (i = 0; i < ret; i++)
2053 btrfs_free_fs_root(fs_info, gang[i]); 2076 btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2054 } 2077 }
2055} 2078}
2056 2079
@@ -2082,14 +2105,8 @@ int open_ctree(struct super_block *sb,
2082 int backup_index = 0; 2105 int backup_index = 0;
2083 2106
2084 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); 2107 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
2085 extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
2086 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
2087 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 2108 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
2088 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); 2109 if (!tree_root || !chunk_root) {
2089 quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
2090
2091 if (!tree_root || !extent_root || !csum_root ||
2092 !chunk_root || !dev_root || !quota_root) {
2093 err = -ENOMEM; 2110 err = -ENOMEM;
2094 goto fail; 2111 goto fail;
2095 } 2112 }
@@ -2132,9 +2149,9 @@ int open_ctree(struct super_block *sb,
2132 INIT_LIST_HEAD(&fs_info->trans_list); 2149 INIT_LIST_HEAD(&fs_info->trans_list);
2133 INIT_LIST_HEAD(&fs_info->dead_roots); 2150 INIT_LIST_HEAD(&fs_info->dead_roots);
2134 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2151 INIT_LIST_HEAD(&fs_info->delayed_iputs);
2135 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 2152 INIT_LIST_HEAD(&fs_info->delalloc_roots);
2136 INIT_LIST_HEAD(&fs_info->caching_block_groups); 2153 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2137 spin_lock_init(&fs_info->delalloc_lock); 2154 spin_lock_init(&fs_info->delalloc_root_lock);
2138 spin_lock_init(&fs_info->trans_lock); 2155 spin_lock_init(&fs_info->trans_lock);
2139 spin_lock_init(&fs_info->fs_roots_radix_lock); 2156 spin_lock_init(&fs_info->fs_roots_radix_lock);
2140 spin_lock_init(&fs_info->delayed_iput_lock); 2157 spin_lock_init(&fs_info->delayed_iput_lock);
@@ -2170,7 +2187,6 @@ int open_ctree(struct super_block *sb,
2170 fs_info->max_inline = 8192 * 1024; 2187 fs_info->max_inline = 8192 * 1024;
2171 fs_info->metadata_ratio = 0; 2188 fs_info->metadata_ratio = 0;
2172 fs_info->defrag_inodes = RB_ROOT; 2189 fs_info->defrag_inodes = RB_ROOT;
2173 fs_info->trans_no_join = 0;
2174 fs_info->free_chunk_space = 0; 2190 fs_info->free_chunk_space = 0;
2175 fs_info->tree_mod_log = RB_ROOT; 2191 fs_info->tree_mod_log = RB_ROOT;
2176 2192
@@ -2181,8 +2197,8 @@ int open_ctree(struct super_block *sb,
2181 fs_info->thread_pool_size = min_t(unsigned long, 2197 fs_info->thread_pool_size = min_t(unsigned long,
2182 num_online_cpus() + 2, 8); 2198 num_online_cpus() + 2, 8);
2183 2199
2184 INIT_LIST_HEAD(&fs_info->ordered_extents); 2200 INIT_LIST_HEAD(&fs_info->ordered_roots);
2185 spin_lock_init(&fs_info->ordered_extent_lock); 2201 spin_lock_init(&fs_info->ordered_root_lock);
2186 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 2202 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2187 GFP_NOFS); 2203 GFP_NOFS);
2188 if (!fs_info->delayed_root) { 2204 if (!fs_info->delayed_root) {
@@ -2275,6 +2291,7 @@ int open_ctree(struct super_block *sb,
2275 fs_info->qgroup_seq = 1; 2291 fs_info->qgroup_seq = 1;
2276 fs_info->quota_enabled = 0; 2292 fs_info->quota_enabled = 0;
2277 fs_info->pending_quota_state = 0; 2293 fs_info->pending_quota_state = 0;
2294 fs_info->qgroup_ulist = NULL;
2278 mutex_init(&fs_info->qgroup_rescan_lock); 2295 mutex_init(&fs_info->qgroup_rescan_lock);
2279 2296
2280 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 2297 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
@@ -2639,33 +2656,44 @@ retry_root_backup:
2639 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2656 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
2640 tree_root->commit_root = btrfs_root_node(tree_root); 2657 tree_root->commit_root = btrfs_root_node(tree_root);
2641 2658
2642 ret = find_and_setup_root(tree_root, fs_info, 2659 location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
2643 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2660 location.type = BTRFS_ROOT_ITEM_KEY;
2644 if (ret) 2661 location.offset = 0;
2662
2663 extent_root = btrfs_read_tree_root(tree_root, &location);
2664 if (IS_ERR(extent_root)) {
2665 ret = PTR_ERR(extent_root);
2645 goto recovery_tree_root; 2666 goto recovery_tree_root;
2667 }
2646 extent_root->track_dirty = 1; 2668 extent_root->track_dirty = 1;
2669 fs_info->extent_root = extent_root;
2647 2670
2648 ret = find_and_setup_root(tree_root, fs_info, 2671 location.objectid = BTRFS_DEV_TREE_OBJECTID;
2649 BTRFS_DEV_TREE_OBJECTID, dev_root); 2672 dev_root = btrfs_read_tree_root(tree_root, &location);
2650 if (ret) 2673 if (IS_ERR(dev_root)) {
2674 ret = PTR_ERR(dev_root);
2651 goto recovery_tree_root; 2675 goto recovery_tree_root;
2676 }
2652 dev_root->track_dirty = 1; 2677 dev_root->track_dirty = 1;
2678 fs_info->dev_root = dev_root;
2679 btrfs_init_devices_late(fs_info);
2653 2680
2654 ret = find_and_setup_root(tree_root, fs_info, 2681 location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2655 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2682 csum_root = btrfs_read_tree_root(tree_root, &location);
2656 if (ret) 2683 if (IS_ERR(csum_root)) {
2684 ret = PTR_ERR(csum_root);
2657 goto recovery_tree_root; 2685 goto recovery_tree_root;
2686 }
2658 csum_root->track_dirty = 1; 2687 csum_root->track_dirty = 1;
2688 fs_info->csum_root = csum_root;
2659 2689
2660 ret = find_and_setup_root(tree_root, fs_info, 2690 location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2661 BTRFS_QUOTA_TREE_OBJECTID, quota_root); 2691 quota_root = btrfs_read_tree_root(tree_root, &location);
2662 if (ret) { 2692 if (!IS_ERR(quota_root)) {
2663 kfree(quota_root);
2664 quota_root = fs_info->quota_root = NULL;
2665 } else {
2666 quota_root->track_dirty = 1; 2693 quota_root->track_dirty = 1;
2667 fs_info->quota_enabled = 1; 2694 fs_info->quota_enabled = 1;
2668 fs_info->pending_quota_state = 1; 2695 fs_info->pending_quota_state = 1;
2696 fs_info->quota_root = quota_root;
2669 } 2697 }
2670 2698
2671 fs_info->generation = generation; 2699 fs_info->generation = generation;
@@ -2818,11 +2846,9 @@ retry_root_backup:
2818 2846
2819 location.objectid = BTRFS_FS_TREE_OBJECTID; 2847 location.objectid = BTRFS_FS_TREE_OBJECTID;
2820 location.type = BTRFS_ROOT_ITEM_KEY; 2848 location.type = BTRFS_ROOT_ITEM_KEY;
2821 location.offset = (u64)-1; 2849 location.offset = 0;
2822 2850
2823 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); 2851 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
2824 if (!fs_info->fs_root)
2825 goto fail_qgroup;
2826 if (IS_ERR(fs_info->fs_root)) { 2852 if (IS_ERR(fs_info->fs_root)) {
2827 err = PTR_ERR(fs_info->fs_root); 2853 err = PTR_ERR(fs_info->fs_root);
2828 goto fail_qgroup; 2854 goto fail_qgroup;
@@ -2854,6 +2880,8 @@ retry_root_backup:
2854 return ret; 2880 return ret;
2855 } 2881 }
2856 2882
2883 btrfs_qgroup_rescan_resume(fs_info);
2884
2857 return 0; 2885 return 0;
2858 2886
2859fail_qgroup: 2887fail_qgroup:
@@ -3259,7 +3287,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
3259 BTRFS_BLOCK_GROUP_RAID10)) { 3287 BTRFS_BLOCK_GROUP_RAID10)) {
3260 num_tolerated_disk_barrier_failures = 1; 3288 num_tolerated_disk_barrier_failures = 1;
3261 } else if (flags & 3289 } else if (flags &
3262 BTRFS_BLOCK_GROUP_RAID5) { 3290 BTRFS_BLOCK_GROUP_RAID6) {
3263 num_tolerated_disk_barrier_failures = 2; 3291 num_tolerated_disk_barrier_failures = 2;
3264 } 3292 }
3265 } 3293 }
@@ -3367,7 +3395,9 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
3367 return ret; 3395 return ret;
3368} 3396}
3369 3397
3370void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 3398/* Drop a fs root from the radix tree and free it. */
3399void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
3400 struct btrfs_root *root)
3371{ 3401{
3372 spin_lock(&fs_info->fs_roots_radix_lock); 3402 spin_lock(&fs_info->fs_roots_radix_lock);
3373 radix_tree_delete(&fs_info->fs_roots_radix, 3403 radix_tree_delete(&fs_info->fs_roots_radix,
@@ -3398,7 +3428,12 @@ static void free_fs_root(struct btrfs_root *root)
3398 kfree(root->free_ino_ctl); 3428 kfree(root->free_ino_ctl);
3399 kfree(root->free_ino_pinned); 3429 kfree(root->free_ino_pinned);
3400 kfree(root->name); 3430 kfree(root->name);
3401 kfree(root); 3431 btrfs_put_fs_root(root);
3432}
3433
3434void btrfs_free_fs_root(struct btrfs_root *root)
3435{
3436 free_fs_root(root);
3402} 3437}
3403 3438
3404int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) 3439int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -3654,7 +3689,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3654 INIT_LIST_HEAD(&splice); 3689 INIT_LIST_HEAD(&splice);
3655 3690
3656 mutex_lock(&root->fs_info->ordered_operations_mutex); 3691 mutex_lock(&root->fs_info->ordered_operations_mutex);
3657 spin_lock(&root->fs_info->ordered_extent_lock); 3692 spin_lock(&root->fs_info->ordered_root_lock);
3658 3693
3659 list_splice_init(&t->ordered_operations, &splice); 3694 list_splice_init(&t->ordered_operations, &splice);
3660 while (!list_empty(&splice)) { 3695 while (!list_empty(&splice)) {
@@ -3662,14 +3697,14 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3662 ordered_operations); 3697 ordered_operations);
3663 3698
3664 list_del_init(&btrfs_inode->ordered_operations); 3699 list_del_init(&btrfs_inode->ordered_operations);
3665 spin_unlock(&root->fs_info->ordered_extent_lock); 3700 spin_unlock(&root->fs_info->ordered_root_lock);
3666 3701
3667 btrfs_invalidate_inodes(btrfs_inode->root); 3702 btrfs_invalidate_inodes(btrfs_inode->root);
3668 3703
3669 spin_lock(&root->fs_info->ordered_extent_lock); 3704 spin_lock(&root->fs_info->ordered_root_lock);
3670 } 3705 }
3671 3706
3672 spin_unlock(&root->fs_info->ordered_extent_lock); 3707 spin_unlock(&root->fs_info->ordered_root_lock);
3673 mutex_unlock(&root->fs_info->ordered_operations_mutex); 3708 mutex_unlock(&root->fs_info->ordered_operations_mutex);
3674} 3709}
3675 3710
@@ -3677,15 +3712,36 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
3677{ 3712{
3678 struct btrfs_ordered_extent *ordered; 3713 struct btrfs_ordered_extent *ordered;
3679 3714
3680 spin_lock(&root->fs_info->ordered_extent_lock); 3715 spin_lock(&root->ordered_extent_lock);
3681 /* 3716 /*
3682 * This will just short circuit the ordered completion stuff which will 3717 * This will just short circuit the ordered completion stuff which will
3683 * make sure the ordered extent gets properly cleaned up. 3718 * make sure the ordered extent gets properly cleaned up.
3684 */ 3719 */
3685 list_for_each_entry(ordered, &root->fs_info->ordered_extents, 3720 list_for_each_entry(ordered, &root->ordered_extents,
3686 root_extent_list) 3721 root_extent_list)
3687 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); 3722 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
3688 spin_unlock(&root->fs_info->ordered_extent_lock); 3723 spin_unlock(&root->ordered_extent_lock);
3724}
3725
3726static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
3727{
3728 struct btrfs_root *root;
3729 struct list_head splice;
3730
3731 INIT_LIST_HEAD(&splice);
3732
3733 spin_lock(&fs_info->ordered_root_lock);
3734 list_splice_init(&fs_info->ordered_roots, &splice);
3735 while (!list_empty(&splice)) {
3736 root = list_first_entry(&splice, struct btrfs_root,
3737 ordered_root);
3738 list_del_init(&root->ordered_root);
3739
3740 btrfs_destroy_ordered_extents(root);
3741
3742 cond_resched_lock(&fs_info->ordered_root_lock);
3743 }
3744 spin_unlock(&fs_info->ordered_root_lock);
3689} 3745}
3690 3746
3691int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, 3747int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -3707,6 +3763,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3707 3763
3708 while ((node = rb_first(&delayed_refs->root)) != NULL) { 3764 while ((node = rb_first(&delayed_refs->root)) != NULL) {
3709 struct btrfs_delayed_ref_head *head = NULL; 3765 struct btrfs_delayed_ref_head *head = NULL;
3766 bool pin_bytes = false;
3710 3767
3711 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 3768 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3712 atomic_set(&ref->refs, 1); 3769 atomic_set(&ref->refs, 1);
@@ -3727,8 +3784,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3727 } 3784 }
3728 3785
3729 if (head->must_insert_reserved) 3786 if (head->must_insert_reserved)
3730 btrfs_pin_extent(root, ref->bytenr, 3787 pin_bytes = true;
3731 ref->num_bytes, 1);
3732 btrfs_free_delayed_extent_op(head->extent_op); 3788 btrfs_free_delayed_extent_op(head->extent_op);
3733 delayed_refs->num_heads--; 3789 delayed_refs->num_heads--;
3734 if (list_empty(&head->cluster)) 3790 if (list_empty(&head->cluster))
@@ -3739,9 +3795,13 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3739 ref->in_tree = 0; 3795 ref->in_tree = 0;
3740 rb_erase(&ref->rb_node, &delayed_refs->root); 3796 rb_erase(&ref->rb_node, &delayed_refs->root);
3741 delayed_refs->num_entries--; 3797 delayed_refs->num_entries--;
3742 if (head)
3743 mutex_unlock(&head->mutex);
3744 spin_unlock(&delayed_refs->lock); 3798 spin_unlock(&delayed_refs->lock);
3799 if (head) {
3800 if (pin_bytes)
3801 btrfs_pin_extent(root, ref->bytenr,
3802 ref->num_bytes, 1);
3803 mutex_unlock(&head->mutex);
3804 }
3745 btrfs_put_delayed_ref(ref); 3805 btrfs_put_delayed_ref(ref);
3746 3806
3747 cond_resched(); 3807 cond_resched();
@@ -3778,24 +3838,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
3778 3838
3779 INIT_LIST_HEAD(&splice); 3839 INIT_LIST_HEAD(&splice);
3780 3840
3781 spin_lock(&root->fs_info->delalloc_lock); 3841 spin_lock(&root->delalloc_lock);
3782 list_splice_init(&root->fs_info->delalloc_inodes, &splice); 3842 list_splice_init(&root->delalloc_inodes, &splice);
3783 3843
3784 while (!list_empty(&splice)) { 3844 while (!list_empty(&splice)) {
3785 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 3845 btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
3786 delalloc_inodes); 3846 delalloc_inodes);
3787 3847
3788 list_del_init(&btrfs_inode->delalloc_inodes); 3848 list_del_init(&btrfs_inode->delalloc_inodes);
3789 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 3849 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
3790 &btrfs_inode->runtime_flags); 3850 &btrfs_inode->runtime_flags);
3791 spin_unlock(&root->fs_info->delalloc_lock); 3851 spin_unlock(&root->delalloc_lock);
3792 3852
3793 btrfs_invalidate_inodes(btrfs_inode->root); 3853 btrfs_invalidate_inodes(btrfs_inode->root);
3794 3854
3795 spin_lock(&root->fs_info->delalloc_lock); 3855 spin_lock(&root->delalloc_lock);
3796 } 3856 }
3797 3857
3798 spin_unlock(&root->fs_info->delalloc_lock); 3858 spin_unlock(&root->delalloc_lock);
3859}
3860
3861static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
3862{
3863 struct btrfs_root *root;
3864 struct list_head splice;
3865
3866 INIT_LIST_HEAD(&splice);
3867
3868 spin_lock(&fs_info->delalloc_root_lock);
3869 list_splice_init(&fs_info->delalloc_roots, &splice);
3870 while (!list_empty(&splice)) {
3871 root = list_first_entry(&splice, struct btrfs_root,
3872 delalloc_root);
3873 list_del_init(&root->delalloc_root);
3874 root = btrfs_grab_fs_root(root);
3875 BUG_ON(!root);
3876 spin_unlock(&fs_info->delalloc_root_lock);
3877
3878 btrfs_destroy_delalloc_inodes(root);
3879 btrfs_put_fs_root(root);
3880
3881 spin_lock(&fs_info->delalloc_root_lock);
3882 }
3883 spin_unlock(&fs_info->delalloc_root_lock);
3799} 3884}
3800 3885
3801static int btrfs_destroy_marked_extents(struct btrfs_root *root, 3886static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3879,19 +3964,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3879 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, 3964 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
3880 cur_trans->dirty_pages.dirty_bytes); 3965 cur_trans->dirty_pages.dirty_bytes);
3881 3966
3882 /* FIXME: cleanup wait for commit */ 3967 cur_trans->state = TRANS_STATE_COMMIT_START;
3883 cur_trans->in_commit = 1;
3884 cur_trans->blocked = 1;
3885 wake_up(&root->fs_info->transaction_blocked_wait); 3968 wake_up(&root->fs_info->transaction_blocked_wait);
3886 3969
3887 btrfs_evict_pending_snapshots(cur_trans); 3970 btrfs_evict_pending_snapshots(cur_trans);
3888 3971
3889 cur_trans->blocked = 0; 3972 cur_trans->state = TRANS_STATE_UNBLOCKED;
3890 wake_up(&root->fs_info->transaction_wait); 3973 wake_up(&root->fs_info->transaction_wait);
3891 3974
3892 cur_trans->commit_done = 1;
3893 wake_up(&cur_trans->commit_wait);
3894
3895 btrfs_destroy_delayed_inodes(root); 3975 btrfs_destroy_delayed_inodes(root);
3896 btrfs_assert_delayed_root_empty(root); 3976 btrfs_assert_delayed_root_empty(root);
3897 3977
@@ -3900,6 +3980,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3900 btrfs_destroy_pinned_extent(root, 3980 btrfs_destroy_pinned_extent(root,
3901 root->fs_info->pinned_extents); 3981 root->fs_info->pinned_extents);
3902 3982
3983 cur_trans->state =TRANS_STATE_COMPLETED;
3984 wake_up(&cur_trans->commit_wait);
3985
3903 /* 3986 /*
3904 memset(cur_trans, 0, sizeof(*cur_trans)); 3987 memset(cur_trans, 0, sizeof(*cur_trans));
3905 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 3988 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
@@ -3915,7 +3998,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3915 3998
3916 spin_lock(&root->fs_info->trans_lock); 3999 spin_lock(&root->fs_info->trans_lock);
3917 list_splice_init(&root->fs_info->trans_list, &list); 4000 list_splice_init(&root->fs_info->trans_list, &list);
3918 root->fs_info->trans_no_join = 1; 4001 root->fs_info->running_transaction = NULL;
3919 spin_unlock(&root->fs_info->trans_lock); 4002 spin_unlock(&root->fs_info->trans_lock);
3920 4003
3921 while (!list_empty(&list)) { 4004 while (!list_empty(&list)) {
@@ -3923,37 +4006,31 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3923 4006
3924 btrfs_destroy_ordered_operations(t, root); 4007 btrfs_destroy_ordered_operations(t, root);
3925 4008
3926 btrfs_destroy_ordered_extents(root); 4009 btrfs_destroy_all_ordered_extents(root->fs_info);
3927 4010
3928 btrfs_destroy_delayed_refs(t, root); 4011 btrfs_destroy_delayed_refs(t, root);
3929 4012
3930 /* FIXME: cleanup wait for commit */ 4013 /*
3931 t->in_commit = 1; 4014 * FIXME: cleanup wait for commit
3932 t->blocked = 1; 4015 * We needn't acquire the lock here, because we are during
4016 * the umount, there is no other task which will change it.
4017 */
4018 t->state = TRANS_STATE_COMMIT_START;
3933 smp_mb(); 4019 smp_mb();
3934 if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) 4020 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
3935 wake_up(&root->fs_info->transaction_blocked_wait); 4021 wake_up(&root->fs_info->transaction_blocked_wait);
3936 4022
3937 btrfs_evict_pending_snapshots(t); 4023 btrfs_evict_pending_snapshots(t);
3938 4024
3939 t->blocked = 0; 4025 t->state = TRANS_STATE_UNBLOCKED;
3940 smp_mb(); 4026 smp_mb();
3941 if (waitqueue_active(&root->fs_info->transaction_wait)) 4027 if (waitqueue_active(&root->fs_info->transaction_wait))
3942 wake_up(&root->fs_info->transaction_wait); 4028 wake_up(&root->fs_info->transaction_wait);
3943 4029
3944 t->commit_done = 1;
3945 smp_mb();
3946 if (waitqueue_active(&t->commit_wait))
3947 wake_up(&t->commit_wait);
3948
3949 btrfs_destroy_delayed_inodes(root); 4030 btrfs_destroy_delayed_inodes(root);
3950 btrfs_assert_delayed_root_empty(root); 4031 btrfs_assert_delayed_root_empty(root);
3951 4032
3952 btrfs_destroy_delalloc_inodes(root); 4033 btrfs_destroy_all_delalloc_inodes(root->fs_info);
3953
3954 spin_lock(&root->fs_info->trans_lock);
3955 root->fs_info->running_transaction = NULL;
3956 spin_unlock(&root->fs_info->trans_lock);
3957 4034
3958 btrfs_destroy_marked_extents(root, &t->dirty_pages, 4035 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3959 EXTENT_DIRTY); 4036 EXTENT_DIRTY);
@@ -3961,15 +4038,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3961 btrfs_destroy_pinned_extent(root, 4038 btrfs_destroy_pinned_extent(root,
3962 root->fs_info->pinned_extents); 4039 root->fs_info->pinned_extents);
3963 4040
4041 t->state = TRANS_STATE_COMPLETED;
4042 smp_mb();
4043 if (waitqueue_active(&t->commit_wait))
4044 wake_up(&t->commit_wait);
4045
3964 atomic_set(&t->use_count, 0); 4046 atomic_set(&t->use_count, 0);
3965 list_del_init(&t->list); 4047 list_del_init(&t->list);
3966 memset(t, 0, sizeof(*t)); 4048 memset(t, 0, sizeof(*t));
3967 kmem_cache_free(btrfs_transaction_cachep, t); 4049 kmem_cache_free(btrfs_transaction_cachep, t);
3968 } 4050 }
3969 4051
3970 spin_lock(&root->fs_info->trans_lock);
3971 root->fs_info->trans_no_join = 0;
3972 spin_unlock(&root->fs_info->trans_lock);
3973 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 4052 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3974 4053
3975 return 0; 4054 return 0;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index be69ce1b07a2..b71acd6e1e5b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -63,14 +63,40 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
63int btrfs_commit_super(struct btrfs_root *root); 63int btrfs_commit_super(struct btrfs_root *root);
64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
65 u64 bytenr, u32 blocksize); 65 u64 bytenr, u32 blocksize);
66struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 66struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
67 struct btrfs_key *location); 67 struct btrfs_key *location);
68int btrfs_init_fs_root(struct btrfs_root *root);
69int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
70 struct btrfs_root *root);
68struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 71struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
69 struct btrfs_key *location); 72 struct btrfs_key *location);
70int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 73int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
71void btrfs_btree_balance_dirty(struct btrfs_root *root); 74void btrfs_btree_balance_dirty(struct btrfs_root *root);
72void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); 75void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
73void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 76void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
77 struct btrfs_root *root);
78void btrfs_free_fs_root(struct btrfs_root *root);
79
80/*
81 * This function is used to grab the root, and avoid it is freed when we
82 * access it. But it doesn't ensure that the tree is not dropped.
83 *
84 * If you want to ensure the whole tree is safe, you should use
85 * fs_info->subvol_srcu
86 */
87static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
88{
89 if (atomic_inc_not_zero(&root->refs))
90 return root;
91 return NULL;
92}
93
94static inline void btrfs_put_fs_root(struct btrfs_root *root)
95{
96 if (atomic_dec_and_test(&root->refs))
97 kfree(root);
98}
99
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 100void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 101int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
76 int atomic); 102 int atomic);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 81ee29eeb7ca..4b8691607373 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -82,11 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
82 goto fail; 82 goto fail;
83 } 83 }
84 84
85 if (btrfs_root_refs(&root->root_item) == 0) {
86 err = -ENOENT;
87 goto fail;
88 }
89
90 key.objectid = objectid; 85 key.objectid = objectid;
91 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 86 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
92 key.offset = 0; 87 key.offset = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df472ab1b5ac..0236de711989 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -24,6 +24,7 @@
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/percpu_counter.h>
27#include "compat.h" 28#include "compat.h"
28#include "hash.h" 29#include "hash.h"
29#include "ctree.h" 30#include "ctree.h"
@@ -2526,6 +2527,51 @@ static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2526 return 0; 2527 return 0;
2527} 2528}
2528 2529
2530static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2531{
2532 u64 num_bytes;
2533
2534 num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2535 sizeof(struct btrfs_extent_inline_ref));
2536 if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2537 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2538
2539 /*
2540 * We don't ever fill up leaves all the way so multiply by 2 just to be
2541 * closer to what we're really going to want to ouse.
2542 */
2543 return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2544}
2545
2546int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2547 struct btrfs_root *root)
2548{
2549 struct btrfs_block_rsv *global_rsv;
2550 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2551 u64 num_bytes;
2552 int ret = 0;
2553
2554 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2555 num_heads = heads_to_leaves(root, num_heads);
2556 if (num_heads > 1)
2557 num_bytes += (num_heads - 1) * root->leafsize;
2558 num_bytes <<= 1;
2559 global_rsv = &root->fs_info->global_block_rsv;
2560
2561 /*
2562 * If we can't allocate any more chunks lets make sure we have _lots_ of
2563 * wiggle room since running delayed refs can create more delayed refs.
2564 */
2565 if (global_rsv->space_info->full)
2566 num_bytes <<= 1;
2567
2568 spin_lock(&global_rsv->lock);
2569 if (global_rsv->reserved <= num_bytes)
2570 ret = 1;
2571 spin_unlock(&global_rsv->lock);
2572 return ret;
2573}
2574
2529/* 2575/*
2530 * this starts processing the delayed reference count updates and 2576 * this starts processing the delayed reference count updates and
2531 * extent insertions we have queued up so far. count can be 2577 * extent insertions we have queued up so far. count can be
@@ -2573,7 +2619,8 @@ progress:
2573 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); 2619 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2574 if (old) { 2620 if (old) {
2575 DEFINE_WAIT(__wait); 2621 DEFINE_WAIT(__wait);
2576 if (delayed_refs->num_entries < 16348) 2622 if (delayed_refs->flushing ||
2623 !btrfs_should_throttle_delayed_refs(trans, root))
2577 return 0; 2624 return 0;
2578 2625
2579 prepare_to_wait(&delayed_refs->wait, &__wait, 2626 prepare_to_wait(&delayed_refs->wait, &__wait,
@@ -2608,7 +2655,7 @@ again:
2608 2655
2609 while (1) { 2656 while (1) {
2610 if (!(run_all || run_most) && 2657 if (!(run_all || run_most) &&
2611 delayed_refs->num_heads_ready < 64) 2658 !btrfs_should_throttle_delayed_refs(trans, root))
2612 break; 2659 break;
2613 2660
2614 /* 2661 /*
@@ -2629,6 +2676,7 @@ again:
2629 spin_unlock(&delayed_refs->lock); 2676 spin_unlock(&delayed_refs->lock);
2630 btrfs_abort_transaction(trans, root, ret); 2677 btrfs_abort_transaction(trans, root, ret);
2631 atomic_dec(&delayed_refs->procs_running_refs); 2678 atomic_dec(&delayed_refs->procs_running_refs);
2679 wake_up(&delayed_refs->wait);
2632 return ret; 2680 return ret;
2633 } 2681 }
2634 2682
@@ -3310,6 +3358,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3310 struct btrfs_space_info *found; 3358 struct btrfs_space_info *found;
3311 int i; 3359 int i;
3312 int factor; 3360 int factor;
3361 int ret;
3313 3362
3314 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3363 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3315 BTRFS_BLOCK_GROUP_RAID10)) 3364 BTRFS_BLOCK_GROUP_RAID10))
@@ -3333,6 +3382,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3333 if (!found) 3382 if (!found)
3334 return -ENOMEM; 3383 return -ENOMEM;
3335 3384
3385 ret = percpu_counter_init(&found->total_bytes_pinned, 0);
3386 if (ret) {
3387 kfree(found);
3388 return ret;
3389 }
3390
3336 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3391 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3337 INIT_LIST_HEAD(&found->block_groups[i]); 3392 INIT_LIST_HEAD(&found->block_groups[i]);
3338 init_rwsem(&found->groups_sem); 3393 init_rwsem(&found->groups_sem);
@@ -3565,10 +3620,11 @@ alloc:
3565 } 3620 }
3566 3621
3567 /* 3622 /*
3568 * If we have less pinned bytes than we want to allocate then 3623 * If we don't have enough pinned space to deal with this
3569 * don't bother committing the transaction, it won't help us. 3624 * allocation don't bother committing the transaction.
3570 */ 3625 */
3571 if (data_sinfo->bytes_pinned < bytes) 3626 if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
3627 bytes) < 0)
3572 committed = 1; 3628 committed = 1;
3573 spin_unlock(&data_sinfo->lock); 3629 spin_unlock(&data_sinfo->lock);
3574 3630
@@ -3577,6 +3633,7 @@ commit_trans:
3577 if (!committed && 3633 if (!committed &&
3578 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3634 !atomic_read(&root->fs_info->open_ioctl_trans)) {
3579 committed = 1; 3635 committed = 1;
3636
3580 trans = btrfs_join_transaction(root); 3637 trans = btrfs_join_transaction(root);
3581 if (IS_ERR(trans)) 3638 if (IS_ERR(trans))
3582 return PTR_ERR(trans); 3639 return PTR_ERR(trans);
@@ -3609,6 +3666,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3609 3666
3610 data_sinfo = root->fs_info->data_sinfo; 3667 data_sinfo = root->fs_info->data_sinfo;
3611 spin_lock(&data_sinfo->lock); 3668 spin_lock(&data_sinfo->lock);
3669 WARN_ON(data_sinfo->bytes_may_use < bytes);
3612 data_sinfo->bytes_may_use -= bytes; 3670 data_sinfo->bytes_may_use -= bytes;
3613 trace_btrfs_space_reservation(root->fs_info, "space_info", 3671 trace_btrfs_space_reservation(root->fs_info, "space_info",
3614 data_sinfo->flags, bytes, 0); 3672 data_sinfo->flags, bytes, 0);
@@ -3886,12 +3944,11 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3886 unsigned long nr_pages) 3944 unsigned long nr_pages)
3887{ 3945{
3888 struct super_block *sb = root->fs_info->sb; 3946 struct super_block *sb = root->fs_info->sb;
3889 int started;
3890 3947
3891 /* If we can not start writeback, just sync all the delalloc file. */ 3948 if (down_read_trylock(&sb->s_umount)) {
3892 started = try_to_writeback_inodes_sb_nr(sb, nr_pages, 3949 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
3893 WB_REASON_FS_FREE_SPACE); 3950 up_read(&sb->s_umount);
3894 if (!started) { 3951 } else {
3895 /* 3952 /*
3896 * We needn't worry the filesystem going from r/w to r/o though 3953 * We needn't worry the filesystem going from r/w to r/o though
3897 * we don't acquire ->s_umount mutex, because the filesystem 3954 * we don't acquire ->s_umount mutex, because the filesystem
@@ -3899,9 +3956,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3899 * the filesystem is readonly(all dirty pages are written to 3956 * the filesystem is readonly(all dirty pages are written to
3900 * the disk). 3957 * the disk).
3901 */ 3958 */
3902 btrfs_start_delalloc_inodes(root, 0); 3959 btrfs_start_all_delalloc_inodes(root->fs_info, 0);
3903 if (!current->journal_info) 3960 if (!current->journal_info)
3904 btrfs_wait_ordered_extents(root, 0); 3961 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3905 } 3962 }
3906} 3963}
3907 3964
@@ -3931,7 +3988,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3931 if (delalloc_bytes == 0) { 3988 if (delalloc_bytes == 0) {
3932 if (trans) 3989 if (trans)
3933 return; 3990 return;
3934 btrfs_wait_ordered_extents(root, 0); 3991 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3935 return; 3992 return;
3936 } 3993 }
3937 3994
@@ -3959,7 +4016,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3959 4016
3960 loops++; 4017 loops++;
3961 if (wait_ordered && !trans) { 4018 if (wait_ordered && !trans) {
3962 btrfs_wait_ordered_extents(root, 0); 4019 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3963 } else { 4020 } else {
3964 time_left = schedule_timeout_killable(1); 4021 time_left = schedule_timeout_killable(1);
3965 if (time_left) 4022 if (time_left)
@@ -3997,7 +4054,8 @@ static int may_commit_transaction(struct btrfs_root *root,
3997 4054
3998 /* See if there is enough pinned space to make this reservation */ 4055 /* See if there is enough pinned space to make this reservation */
3999 spin_lock(&space_info->lock); 4056 spin_lock(&space_info->lock);
4000 if (space_info->bytes_pinned >= bytes) { 4057 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4058 bytes) >= 0) {
4001 spin_unlock(&space_info->lock); 4059 spin_unlock(&space_info->lock);
4002 goto commit; 4060 goto commit;
4003 } 4061 }
@@ -4012,7 +4070,8 @@ static int may_commit_transaction(struct btrfs_root *root,
4012 4070
4013 spin_lock(&space_info->lock); 4071 spin_lock(&space_info->lock);
4014 spin_lock(&delayed_rsv->lock); 4072 spin_lock(&delayed_rsv->lock);
4015 if (space_info->bytes_pinned + delayed_rsv->size < bytes) { 4073 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4074 bytes - delayed_rsv->size) >= 0) {
4016 spin_unlock(&delayed_rsv->lock); 4075 spin_unlock(&delayed_rsv->lock);
4017 spin_unlock(&space_info->lock); 4076 spin_unlock(&space_info->lock);
4018 return -ENOSPC; 4077 return -ENOSPC;
@@ -4297,6 +4356,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4297 spin_unlock(&block_rsv->lock); 4356 spin_unlock(&block_rsv->lock);
4298} 4357}
4299 4358
4359int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
4360 struct btrfs_block_rsv *dest, u64 num_bytes,
4361 int min_factor)
4362{
4363 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4364 u64 min_bytes;
4365
4366 if (global_rsv->space_info != dest->space_info)
4367 return -ENOSPC;
4368
4369 spin_lock(&global_rsv->lock);
4370 min_bytes = div_factor(global_rsv->size, min_factor);
4371 if (global_rsv->reserved < min_bytes + num_bytes) {
4372 spin_unlock(&global_rsv->lock);
4373 return -ENOSPC;
4374 }
4375 global_rsv->reserved -= num_bytes;
4376 if (global_rsv->reserved < global_rsv->size)
4377 global_rsv->full = 0;
4378 spin_unlock(&global_rsv->lock);
4379
4380 block_rsv_add_bytes(dest, num_bytes, 1);
4381 return 0;
4382}
4383
4300static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4384static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4301 struct btrfs_block_rsv *block_rsv, 4385 struct btrfs_block_rsv *block_rsv,
4302 struct btrfs_block_rsv *dest, u64 num_bytes) 4386 struct btrfs_block_rsv *dest, u64 num_bytes)
@@ -5030,14 +5114,14 @@ static int update_block_group(struct btrfs_root *root,
5030 int factor; 5114 int factor;
5031 5115
5032 /* block accounting for super block */ 5116 /* block accounting for super block */
5033 spin_lock(&info->delalloc_lock); 5117 spin_lock(&info->delalloc_root_lock);
5034 old_val = btrfs_super_bytes_used(info->super_copy); 5118 old_val = btrfs_super_bytes_used(info->super_copy);
5035 if (alloc) 5119 if (alloc)
5036 old_val += num_bytes; 5120 old_val += num_bytes;
5037 else 5121 else
5038 old_val -= num_bytes; 5122 old_val -= num_bytes;
5039 btrfs_set_super_bytes_used(info->super_copy, old_val); 5123 btrfs_set_super_bytes_used(info->super_copy, old_val);
5040 spin_unlock(&info->delalloc_lock); 5124 spin_unlock(&info->delalloc_root_lock);
5041 5125
5042 while (total) { 5126 while (total) {
5043 cache = btrfs_lookup_block_group(info, bytenr); 5127 cache = btrfs_lookup_block_group(info, bytenr);
@@ -5189,6 +5273,80 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5189 return ret; 5273 return ret;
5190} 5274}
5191 5275
5276static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
5277{
5278 int ret;
5279 struct btrfs_block_group_cache *block_group;
5280 struct btrfs_caching_control *caching_ctl;
5281
5282 block_group = btrfs_lookup_block_group(root->fs_info, start);
5283 if (!block_group)
5284 return -EINVAL;
5285
5286 cache_block_group(block_group, 0);
5287 caching_ctl = get_caching_control(block_group);
5288
5289 if (!caching_ctl) {
5290 /* Logic error */
5291 BUG_ON(!block_group_cache_done(block_group));
5292 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5293 } else {
5294 mutex_lock(&caching_ctl->mutex);
5295
5296 if (start >= caching_ctl->progress) {
5297 ret = add_excluded_extent(root, start, num_bytes);
5298 } else if (start + num_bytes <= caching_ctl->progress) {
5299 ret = btrfs_remove_free_space(block_group,
5300 start, num_bytes);
5301 } else {
5302 num_bytes = caching_ctl->progress - start;
5303 ret = btrfs_remove_free_space(block_group,
5304 start, num_bytes);
5305 if (ret)
5306 goto out_lock;
5307
5308 num_bytes = (start + num_bytes) -
5309 caching_ctl->progress;
5310 start = caching_ctl->progress;
5311 ret = add_excluded_extent(root, start, num_bytes);
5312 }
5313out_lock:
5314 mutex_unlock(&caching_ctl->mutex);
5315 put_caching_control(caching_ctl);
5316 }
5317 btrfs_put_block_group(block_group);
5318 return ret;
5319}
5320
5321int btrfs_exclude_logged_extents(struct btrfs_root *log,
5322 struct extent_buffer *eb)
5323{
5324 struct btrfs_file_extent_item *item;
5325 struct btrfs_key key;
5326 int found_type;
5327 int i;
5328
5329 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
5330 return 0;
5331
5332 for (i = 0; i < btrfs_header_nritems(eb); i++) {
5333 btrfs_item_key_to_cpu(eb, &key, i);
5334 if (key.type != BTRFS_EXTENT_DATA_KEY)
5335 continue;
5336 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5337 found_type = btrfs_file_extent_type(eb, item);
5338 if (found_type == BTRFS_FILE_EXTENT_INLINE)
5339 continue;
5340 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5341 continue;
5342 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5343 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
5344 __exclude_logged_extent(log, key.objectid, key.offset);
5345 }
5346
5347 return 0;
5348}
5349
5192/** 5350/**
5193 * btrfs_update_reserved_bytes - update the block_group and space info counters 5351 * btrfs_update_reserved_bytes - update the block_group and space info counters
5194 * @cache: The cache we are manipulating 5352 * @cache: The cache we are manipulating
@@ -5251,6 +5409,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5251 struct btrfs_caching_control *next; 5409 struct btrfs_caching_control *next;
5252 struct btrfs_caching_control *caching_ctl; 5410 struct btrfs_caching_control *caching_ctl;
5253 struct btrfs_block_group_cache *cache; 5411 struct btrfs_block_group_cache *cache;
5412 struct btrfs_space_info *space_info;
5254 5413
5255 down_write(&fs_info->extent_commit_sem); 5414 down_write(&fs_info->extent_commit_sem);
5256 5415
@@ -5273,6 +5432,9 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5273 5432
5274 up_write(&fs_info->extent_commit_sem); 5433 up_write(&fs_info->extent_commit_sem);
5275 5434
5435 list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
5436 percpu_counter_set(&space_info->total_bytes_pinned, 0);
5437
5276 update_global_block_rsv(fs_info); 5438 update_global_block_rsv(fs_info);
5277} 5439}
5278 5440
@@ -5370,6 +5532,27 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5370 return 0; 5532 return 0;
5371} 5533}
5372 5534
5535static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
5536 u64 owner, u64 root_objectid)
5537{
5538 struct btrfs_space_info *space_info;
5539 u64 flags;
5540
5541 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5542 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
5543 flags = BTRFS_BLOCK_GROUP_SYSTEM;
5544 else
5545 flags = BTRFS_BLOCK_GROUP_METADATA;
5546 } else {
5547 flags = BTRFS_BLOCK_GROUP_DATA;
5548 }
5549
5550 space_info = __find_space_info(fs_info, flags);
5551 BUG_ON(!space_info); /* Logic bug */
5552 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
5553}
5554
5555
5373static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5556static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5374 struct btrfs_root *root, 5557 struct btrfs_root *root,
5375 u64 bytenr, u64 num_bytes, u64 parent, 5558 u64 bytenr, u64 num_bytes, u64 parent,
@@ -5590,6 +5773,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5590 goto out; 5773 goto out;
5591 } 5774 }
5592 } 5775 }
5776 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
5777 root_objectid);
5593 } else { 5778 } else {
5594 if (found_extent) { 5779 if (found_extent) {
5595 BUG_ON(is_data && refs_to_drop != 5780 BUG_ON(is_data && refs_to_drop !=
@@ -5713,6 +5898,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5713 u64 parent, int last_ref) 5898 u64 parent, int last_ref)
5714{ 5899{
5715 struct btrfs_block_group_cache *cache = NULL; 5900 struct btrfs_block_group_cache *cache = NULL;
5901 int pin = 1;
5716 int ret; 5902 int ret;
5717 5903
5718 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5904 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -5745,8 +5931,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5745 5931
5746 btrfs_add_free_space(cache, buf->start, buf->len); 5932 btrfs_add_free_space(cache, buf->start, buf->len);
5747 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); 5933 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5934 pin = 0;
5748 } 5935 }
5749out: 5936out:
5937 if (pin)
5938 add_pinned_bytes(root->fs_info, buf->len,
5939 btrfs_header_level(buf),
5940 root->root_key.objectid);
5941
5750 /* 5942 /*
5751 * Deleting the buffer, clear the corrupt flag since it doesn't matter 5943 * Deleting the buffer, clear the corrupt flag since it doesn't matter
5752 * anymore. 5944 * anymore.
@@ -5763,6 +5955,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5763 int ret; 5955 int ret;
5764 struct btrfs_fs_info *fs_info = root->fs_info; 5956 struct btrfs_fs_info *fs_info = root->fs_info;
5765 5957
5958 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
5959
5766 /* 5960 /*
5767 * tree log blocks never actually go into the extent allocation 5961 * tree log blocks never actually go into the extent allocation
5768 * tree, just update pinning info and exit early. 5962 * tree, just update pinning info and exit early.
@@ -6560,52 +6754,26 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6560{ 6754{
6561 int ret; 6755 int ret;
6562 struct btrfs_block_group_cache *block_group; 6756 struct btrfs_block_group_cache *block_group;
6563 struct btrfs_caching_control *caching_ctl;
6564 u64 start = ins->objectid;
6565 u64 num_bytes = ins->offset;
6566
6567 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6568 cache_block_group(block_group, 0);
6569 caching_ctl = get_caching_control(block_group);
6570
6571 if (!caching_ctl) {
6572 BUG_ON(!block_group_cache_done(block_group));
6573 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6574 if (ret)
6575 goto out;
6576 } else {
6577 mutex_lock(&caching_ctl->mutex);
6578 6757
6579 if (start >= caching_ctl->progress) { 6758 /*
6580 ret = add_excluded_extent(root, start, num_bytes); 6759 * Mixed block groups will exclude before processing the log so we only
6581 } else if (start + num_bytes <= caching_ctl->progress) { 6760 * need to do the exlude dance if this fs isn't mixed.
6582 ret = btrfs_remove_free_space(block_group, 6761 */
6583 start, num_bytes); 6762 if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
6584 } else { 6763 ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
6585 num_bytes = caching_ctl->progress - start;
6586 ret = btrfs_remove_free_space(block_group,
6587 start, num_bytes);
6588 if (ret)
6589 goto out_lock;
6590
6591 start = caching_ctl->progress;
6592 num_bytes = ins->objectid + ins->offset -
6593 caching_ctl->progress;
6594 ret = add_excluded_extent(root, start, num_bytes);
6595 }
6596out_lock:
6597 mutex_unlock(&caching_ctl->mutex);
6598 put_caching_control(caching_ctl);
6599 if (ret) 6764 if (ret)
6600 goto out; 6765 return ret;
6601 } 6766 }
6602 6767
6768 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6769 if (!block_group)
6770 return -EINVAL;
6771
6603 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 6772 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6604 RESERVE_ALLOC_NO_ACCOUNT); 6773 RESERVE_ALLOC_NO_ACCOUNT);
6605 BUG_ON(ret); /* logic error */ 6774 BUG_ON(ret); /* logic error */
6606 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 6775 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6607 0, owner, offset, ins, 1); 6776 0, owner, offset, ins, 1);
6608out:
6609 btrfs_put_block_group(block_group); 6777 btrfs_put_block_group(block_group);
6610 return ret; 6778 return ret;
6611} 6779}
@@ -7384,7 +7552,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7384 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7552 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7385 7553
7386 while (1) { 7554 while (1) {
7387 if (!for_reloc && btrfs_fs_closing(root->fs_info)) { 7555 if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
7388 pr_debug("btrfs: drop snapshot early exit\n"); 7556 pr_debug("btrfs: drop snapshot early exit\n");
7389 err = -EAGAIN; 7557 err = -EAGAIN;
7390 goto out_end_trans; 7558 goto out_end_trans;
@@ -7447,8 +7615,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7447 } 7615 }
7448 7616
7449 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 7617 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7450 ret = btrfs_find_last_root(tree_root, root->root_key.objectid, 7618 ret = btrfs_find_root(tree_root, &root->root_key, path,
7451 NULL, NULL); 7619 NULL, NULL);
7452 if (ret < 0) { 7620 if (ret < 0) {
7453 btrfs_abort_transaction(trans, tree_root, ret); 7621 btrfs_abort_transaction(trans, tree_root, ret);
7454 err = ret; 7622 err = ret;
@@ -7465,11 +7633,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7465 } 7633 }
7466 7634
7467 if (root->in_radix) { 7635 if (root->in_radix) {
7468 btrfs_free_fs_root(tree_root->fs_info, root); 7636 btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
7469 } else { 7637 } else {
7470 free_extent_buffer(root->node); 7638 free_extent_buffer(root->node);
7471 free_extent_buffer(root->commit_root); 7639 free_extent_buffer(root->commit_root);
7472 kfree(root); 7640 btrfs_put_fs_root(root);
7473 } 7641 }
7474out_end_trans: 7642out_end_trans:
7475 btrfs_end_transaction_throttle(trans, tree_root); 7643 btrfs_end_transaction_throttle(trans, tree_root);
@@ -7782,6 +7950,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7782 struct btrfs_space_info *space_info; 7950 struct btrfs_space_info *space_info;
7783 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 7951 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7784 struct btrfs_device *device; 7952 struct btrfs_device *device;
7953 struct btrfs_trans_handle *trans;
7785 u64 min_free; 7954 u64 min_free;
7786 u64 dev_min = 1; 7955 u64 dev_min = 1;
7787 u64 dev_nr = 0; 7956 u64 dev_nr = 0;
@@ -7868,6 +8037,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7868 do_div(min_free, dev_min); 8037 do_div(min_free, dev_min);
7869 } 8038 }
7870 8039
8040 /* We need to do this so that we can look at pending chunks */
8041 trans = btrfs_join_transaction(root);
8042 if (IS_ERR(trans)) {
8043 ret = PTR_ERR(trans);
8044 goto out;
8045 }
8046
7871 mutex_lock(&root->fs_info->chunk_mutex); 8047 mutex_lock(&root->fs_info->chunk_mutex);
7872 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8048 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7873 u64 dev_offset; 8049 u64 dev_offset;
@@ -7878,7 +8054,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7878 */ 8054 */
7879 if (device->total_bytes > device->bytes_used + min_free && 8055 if (device->total_bytes > device->bytes_used + min_free &&
7880 !device->is_tgtdev_for_dev_replace) { 8056 !device->is_tgtdev_for_dev_replace) {
7881 ret = find_free_dev_extent(device, min_free, 8057 ret = find_free_dev_extent(trans, device, min_free,
7882 &dev_offset, NULL); 8058 &dev_offset, NULL);
7883 if (!ret) 8059 if (!ret)
7884 dev_nr++; 8060 dev_nr++;
@@ -7890,6 +8066,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7890 } 8066 }
7891 } 8067 }
7892 mutex_unlock(&root->fs_info->chunk_mutex); 8068 mutex_unlock(&root->fs_info->chunk_mutex);
8069 btrfs_end_transaction(trans, root);
7893out: 8070out:
7894 btrfs_put_block_group(block_group); 8071 btrfs_put_block_group(block_group);
7895 return ret; 8072 return ret;
@@ -8032,6 +8209,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8032 dump_space_info(space_info, 0, 0); 8209 dump_space_info(space_info, 0, 0);
8033 } 8210 }
8034 } 8211 }
8212 percpu_counter_destroy(&space_info->total_bytes_pinned);
8035 list_del(&space_info->list); 8213 list_del(&space_info->list);
8036 kfree(space_info); 8214 kfree(space_info);
8037 } 8215 }
@@ -8254,6 +8432,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
8254 sizeof(item)); 8432 sizeof(item));
8255 if (ret) 8433 if (ret)
8256 btrfs_abort_transaction(trans, extent_root, ret); 8434 btrfs_abort_transaction(trans, extent_root, ret);
8435 ret = btrfs_finish_chunk_alloc(trans, extent_root,
8436 key.objectid, key.offset);
8437 if (ret)
8438 btrfs_abort_transaction(trans, extent_root, ret);
8257 } 8439 }
8258} 8440}
8259 8441
@@ -8591,8 +8773,15 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8591 if (end - start >= range->minlen) { 8773 if (end - start >= range->minlen) {
8592 if (!block_group_cache_done(cache)) { 8774 if (!block_group_cache_done(cache)) {
8593 ret = cache_block_group(cache, 0); 8775 ret = cache_block_group(cache, 0);
8594 if (!ret) 8776 if (ret) {
8595 wait_block_group_cache_done(cache); 8777 btrfs_put_block_group(cache);
8778 break;
8779 }
8780 ret = wait_block_group_cache_done(cache);
8781 if (ret) {
8782 btrfs_put_block_group(cache);
8783 break;
8784 }
8596 } 8785 }
8597 ret = btrfs_trim_block_group(cache, 8786 ret = btrfs_trim_block_group(cache,
8598 &group_trimmed, 8787 &group_trimmed,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6bca9472f313..583d98bd065e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -77,10 +77,29 @@ void btrfs_leak_debug_check(void)
77 kmem_cache_free(extent_buffer_cache, eb); 77 kmem_cache_free(extent_buffer_cache, eb);
78 } 78 }
79} 79}
80
81#define btrfs_debug_check_extent_io_range(inode, start, end) \
82 __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end))
83static inline void __btrfs_debug_check_extent_io_range(const char *caller,
84 struct inode *inode, u64 start, u64 end)
85{
86 u64 isize = i_size_read(inode);
87
88 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
89 printk_ratelimited(KERN_DEBUG
90 "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
91 caller,
92 (unsigned long long)btrfs_ino(inode),
93 (unsigned long long)isize,
94 (unsigned long long)start,
95 (unsigned long long)end);
96 }
97}
80#else 98#else
81#define btrfs_leak_debug_add(new, head) do {} while (0) 99#define btrfs_leak_debug_add(new, head) do {} while (0)
82#define btrfs_leak_debug_del(entry) do {} while (0) 100#define btrfs_leak_debug_del(entry) do {} while (0)
83#define btrfs_leak_debug_check() do {} while (0) 101#define btrfs_leak_debug_check() do {} while (0)
102#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
84#endif 103#endif
85 104
86#define BUFFER_LRU_MAX 64 105#define BUFFER_LRU_MAX 64
@@ -522,6 +541,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
522 int err; 541 int err;
523 int clear = 0; 542 int clear = 0;
524 543
544 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
545
546 if (bits & EXTENT_DELALLOC)
547 bits |= EXTENT_NORESERVE;
548
525 if (delete) 549 if (delete)
526 bits |= ~EXTENT_CTLBITS; 550 bits |= ~EXTENT_CTLBITS;
527 bits |= EXTENT_FIRST_DELALLOC; 551 bits |= EXTENT_FIRST_DELALLOC;
@@ -677,6 +701,8 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
677 struct extent_state *state; 701 struct extent_state *state;
678 struct rb_node *node; 702 struct rb_node *node;
679 703
704 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
705
680 spin_lock(&tree->lock); 706 spin_lock(&tree->lock);
681again: 707again:
682 while (1) { 708 while (1) {
@@ -769,6 +795,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
769 u64 last_start; 795 u64 last_start;
770 u64 last_end; 796 u64 last_end;
771 797
798 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
799
772 bits |= EXTENT_FIRST_DELALLOC; 800 bits |= EXTENT_FIRST_DELALLOC;
773again: 801again:
774 if (!prealloc && (mask & __GFP_WAIT)) { 802 if (!prealloc && (mask & __GFP_WAIT)) {
@@ -989,6 +1017,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
989 u64 last_start; 1017 u64 last_start;
990 u64 last_end; 1018 u64 last_end;
991 1019
1020 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
1021
992again: 1022again:
993 if (!prealloc && (mask & __GFP_WAIT)) { 1023 if (!prealloc && (mask & __GFP_WAIT)) {
994 prealloc = alloc_extent_state(mask); 1024 prealloc = alloc_extent_state(mask);
@@ -2450,11 +2480,12 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2450 struct extent_state *cached = NULL; 2480 struct extent_state *cached = NULL;
2451 struct extent_state *state; 2481 struct extent_state *state;
2452 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2482 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2483 struct inode *inode = page->mapping->host;
2453 2484
2454 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2485 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2455 "mirror=%lu\n", (u64)bio->bi_sector, err, 2486 "mirror=%lu\n", (u64)bio->bi_sector, err,
2456 io_bio->mirror_num); 2487 io_bio->mirror_num);
2457 tree = &BTRFS_I(page->mapping->host)->io_tree; 2488 tree = &BTRFS_I(inode)->io_tree;
2458 2489
2459 /* We always issue full-page reads, but if some block 2490 /* We always issue full-page reads, but if some block
2460 * in a page fails to read, blk_update_request() will 2491 * in a page fails to read, blk_update_request() will
@@ -2528,6 +2559,14 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2528 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2559 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2529 2560
2530 if (uptodate) { 2561 if (uptodate) {
2562 loff_t i_size = i_size_read(inode);
2563 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2564 unsigned offset;
2565
2566 /* Zero out the end if this page straddles i_size */
2567 offset = i_size & (PAGE_CACHE_SIZE-1);
2568 if (page->index == end_index && offset)
2569 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2531 SetPageUptodate(page); 2570 SetPageUptodate(page);
2532 } else { 2571 } else {
2533 ClearPageUptodate(page); 2572 ClearPageUptodate(page);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 41fb81e7ec53..3b8c4e26e1da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -19,6 +19,7 @@
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13) 20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14) 21#define EXTENT_DAMAGED (1 << 14)
22#define EXTENT_NORESERVE (1 << 15)
22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 23#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 24#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
24 25
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b193bf324a41..a7bfc9541803 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -34,8 +34,7 @@
34 34
35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ 35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
36 sizeof(struct btrfs_ordered_sum)) / \ 36 sizeof(struct btrfs_ordered_sum)) / \
37 sizeof(struct btrfs_sector_sum) * \ 37 sizeof(u32) * (r)->sectorsize)
38 (r)->sectorsize - (r)->sectorsize)
39 38
40int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 39int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, 40 struct btrfs_root *root,
@@ -297,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
297 struct btrfs_path *path; 296 struct btrfs_path *path;
298 struct extent_buffer *leaf; 297 struct extent_buffer *leaf;
299 struct btrfs_ordered_sum *sums; 298 struct btrfs_ordered_sum *sums;
300 struct btrfs_sector_sum *sector_sum;
301 struct btrfs_csum_item *item; 299 struct btrfs_csum_item *item;
302 LIST_HEAD(tmplist); 300 LIST_HEAD(tmplist);
303 unsigned long offset; 301 unsigned long offset;
@@ -368,34 +366,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
368 struct btrfs_csum_item); 366 struct btrfs_csum_item);
369 while (start < csum_end) { 367 while (start < csum_end) {
370 size = min_t(size_t, csum_end - start, 368 size = min_t(size_t, csum_end - start,
371 MAX_ORDERED_SUM_BYTES(root)); 369 MAX_ORDERED_SUM_BYTES(root));
372 sums = kzalloc(btrfs_ordered_sum_size(root, size), 370 sums = kzalloc(btrfs_ordered_sum_size(root, size),
373 GFP_NOFS); 371 GFP_NOFS);
374 if (!sums) { 372 if (!sums) {
375 ret = -ENOMEM; 373 ret = -ENOMEM;
376 goto fail; 374 goto fail;
377 } 375 }
378 376
379 sector_sum = sums->sums;
380 sums->bytenr = start; 377 sums->bytenr = start;
381 sums->len = size; 378 sums->len = (int)size;
382 379
383 offset = (start - key.offset) >> 380 offset = (start - key.offset) >>
384 root->fs_info->sb->s_blocksize_bits; 381 root->fs_info->sb->s_blocksize_bits;
385 offset *= csum_size; 382 offset *= csum_size;
383 size >>= root->fs_info->sb->s_blocksize_bits;
386 384
387 while (size > 0) { 385 read_extent_buffer(path->nodes[0],
388 read_extent_buffer(path->nodes[0], 386 sums->sums,
389 &sector_sum->sum, 387 ((unsigned long)item) + offset,
390 ((unsigned long)item) + 388 csum_size * size);
391 offset, csum_size); 389
392 sector_sum->bytenr = start; 390 start += root->sectorsize * size;
393
394 size -= root->sectorsize;
395 start += root->sectorsize;
396 offset += csum_size;
397 sector_sum++;
398 }
399 list_add_tail(&sums->list, &tmplist); 391 list_add_tail(&sums->list, &tmplist);
400 } 392 }
401 path->slots[0]++; 393 path->slots[0]++;
@@ -417,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
417 struct bio *bio, u64 file_start, int contig) 409 struct bio *bio, u64 file_start, int contig)
418{ 410{
419 struct btrfs_ordered_sum *sums; 411 struct btrfs_ordered_sum *sums;
420 struct btrfs_sector_sum *sector_sum;
421 struct btrfs_ordered_extent *ordered; 412 struct btrfs_ordered_extent *ordered;
422 char *data; 413 char *data;
423 struct bio_vec *bvec = bio->bi_io_vec; 414 struct bio_vec *bvec = bio->bi_io_vec;
424 int bio_index = 0; 415 int bio_index = 0;
416 int index;
425 unsigned long total_bytes = 0; 417 unsigned long total_bytes = 0;
426 unsigned long this_sum_bytes = 0; 418 unsigned long this_sum_bytes = 0;
427 u64 offset; 419 u64 offset;
428 u64 disk_bytenr;
429 420
430 WARN_ON(bio->bi_vcnt <= 0); 421 WARN_ON(bio->bi_vcnt <= 0);
431 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); 422 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
432 if (!sums) 423 if (!sums)
433 return -ENOMEM; 424 return -ENOMEM;
434 425
435 sector_sum = sums->sums;
436 disk_bytenr = (u64)bio->bi_sector << 9;
437 sums->len = bio->bi_size; 426 sums->len = bio->bi_size;
438 INIT_LIST_HEAD(&sums->list); 427 INIT_LIST_HEAD(&sums->list);
439 428
@@ -444,7 +433,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
444 433
445 ordered = btrfs_lookup_ordered_extent(inode, offset); 434 ordered = btrfs_lookup_ordered_extent(inode, offset);
446 BUG_ON(!ordered); /* Logic error */ 435 BUG_ON(!ordered); /* Logic error */
447 sums->bytenr = ordered->start; 436 sums->bytenr = (u64)bio->bi_sector << 9;
437 index = 0;
448 438
449 while (bio_index < bio->bi_vcnt) { 439 while (bio_index < bio->bi_vcnt) {
450 if (!contig) 440 if (!contig)
@@ -463,28 +453,27 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
463 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), 453 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
464 GFP_NOFS); 454 GFP_NOFS);
465 BUG_ON(!sums); /* -ENOMEM */ 455 BUG_ON(!sums); /* -ENOMEM */
466 sector_sum = sums->sums;
467 sums->len = bytes_left; 456 sums->len = bytes_left;
468 ordered = btrfs_lookup_ordered_extent(inode, offset); 457 ordered = btrfs_lookup_ordered_extent(inode, offset);
469 BUG_ON(!ordered); /* Logic error */ 458 BUG_ON(!ordered); /* Logic error */
470 sums->bytenr = ordered->start; 459 sums->bytenr = ((u64)bio->bi_sector << 9) +
460 total_bytes;
461 index = 0;
471 } 462 }
472 463
473 data = kmap_atomic(bvec->bv_page); 464 data = kmap_atomic(bvec->bv_page);
474 sector_sum->sum = ~(u32)0; 465 sums->sums[index] = ~(u32)0;
475 sector_sum->sum = btrfs_csum_data(data + bvec->bv_offset, 466 sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
476 sector_sum->sum, 467 sums->sums[index],
477 bvec->bv_len); 468 bvec->bv_len);
478 kunmap_atomic(data); 469 kunmap_atomic(data);
479 btrfs_csum_final(sector_sum->sum, 470 btrfs_csum_final(sums->sums[index],
480 (char *)&sector_sum->sum); 471 (char *)(sums->sums + index));
481 sector_sum->bytenr = disk_bytenr;
482 472
483 sector_sum++;
484 bio_index++; 473 bio_index++;
474 index++;
485 total_bytes += bvec->bv_len; 475 total_bytes += bvec->bv_len;
486 this_sum_bytes += bvec->bv_len; 476 this_sum_bytes += bvec->bv_len;
487 disk_bytenr += bvec->bv_len;
488 offset += bvec->bv_len; 477 offset += bvec->bv_len;
489 bvec++; 478 bvec++;
490 } 479 }
@@ -672,62 +661,46 @@ out:
672 return ret; 661 return ret;
673} 662}
674 663
675static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
676 struct btrfs_sector_sum *sector_sum,
677 u64 total_bytes, u64 sectorsize)
678{
679 u64 tmp = sectorsize;
680 u64 next_sector = sector_sum->bytenr;
681 struct btrfs_sector_sum *next = sector_sum + 1;
682
683 while ((tmp + total_bytes) < sums->len) {
684 if (next_sector + sectorsize != next->bytenr)
685 break;
686 tmp += sectorsize;
687 next_sector = next->bytenr;
688 next++;
689 }
690 return tmp;
691}
692
693int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 664int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
694 struct btrfs_root *root, 665 struct btrfs_root *root,
695 struct btrfs_ordered_sum *sums) 666 struct btrfs_ordered_sum *sums)
696{ 667{
697 u64 bytenr;
698 int ret;
699 struct btrfs_key file_key; 668 struct btrfs_key file_key;
700 struct btrfs_key found_key; 669 struct btrfs_key found_key;
701 u64 next_offset;
702 u64 total_bytes = 0;
703 int found_next;
704 struct btrfs_path *path; 670 struct btrfs_path *path;
705 struct btrfs_csum_item *item; 671 struct btrfs_csum_item *item;
706 struct btrfs_csum_item *item_end; 672 struct btrfs_csum_item *item_end;
707 struct extent_buffer *leaf = NULL; 673 struct extent_buffer *leaf = NULL;
674 u64 next_offset;
675 u64 total_bytes = 0;
708 u64 csum_offset; 676 u64 csum_offset;
709 struct btrfs_sector_sum *sector_sum; 677 u64 bytenr;
710 u32 nritems; 678 u32 nritems;
711 u32 ins_size; 679 u32 ins_size;
680 int index = 0;
681 int found_next;
682 int ret;
712 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); 683 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
713 684
714 path = btrfs_alloc_path(); 685 path = btrfs_alloc_path();
715 if (!path) 686 if (!path)
716 return -ENOMEM; 687 return -ENOMEM;
717
718 sector_sum = sums->sums;
719again: 688again:
720 next_offset = (u64)-1; 689 next_offset = (u64)-1;
721 found_next = 0; 690 found_next = 0;
691 bytenr = sums->bytenr + total_bytes;
722 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 692 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
723 file_key.offset = sector_sum->bytenr; 693 file_key.offset = bytenr;
724 bytenr = sector_sum->bytenr;
725 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); 694 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
726 695
727 item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); 696 item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
728 if (!IS_ERR(item)) { 697 if (!IS_ERR(item)) {
729 leaf = path->nodes[0];
730 ret = 0; 698 ret = 0;
699 leaf = path->nodes[0];
700 item_end = btrfs_item_ptr(leaf, path->slots[0],
701 struct btrfs_csum_item);
702 item_end = (struct btrfs_csum_item *)((char *)item_end +
703 btrfs_item_size_nr(leaf, path->slots[0]));
731 goto found; 704 goto found;
732 } 705 }
733 ret = PTR_ERR(item); 706 ret = PTR_ERR(item);
@@ -807,8 +780,7 @@ again:
807 780
808 free_space = btrfs_leaf_free_space(root, leaf) - 781 free_space = btrfs_leaf_free_space(root, leaf) -
809 sizeof(struct btrfs_item) - csum_size; 782 sizeof(struct btrfs_item) - csum_size;
810 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, 783 tmp = sums->len - total_bytes;
811 root->sectorsize);
812 tmp >>= root->fs_info->sb->s_blocksize_bits; 784 tmp >>= root->fs_info->sb->s_blocksize_bits;
813 WARN_ON(tmp < 1); 785 WARN_ON(tmp < 1);
814 786
@@ -822,6 +794,7 @@ again:
822 diff *= csum_size; 794 diff *= csum_size;
823 795
824 btrfs_extend_item(root, path, diff); 796 btrfs_extend_item(root, path, diff);
797 ret = 0;
825 goto csum; 798 goto csum;
826 } 799 }
827 800
@@ -831,8 +804,7 @@ insert:
831 if (found_next) { 804 if (found_next) {
832 u64 tmp; 805 u64 tmp;
833 806
834 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, 807 tmp = sums->len - total_bytes;
835 root->sectorsize);
836 tmp >>= root->fs_info->sb->s_blocksize_bits; 808 tmp >>= root->fs_info->sb->s_blocksize_bits;
837 tmp = min(tmp, (next_offset - file_key.offset) >> 809 tmp = min(tmp, (next_offset - file_key.offset) >>
838 root->fs_info->sb->s_blocksize_bits); 810 root->fs_info->sb->s_blocksize_bits);
@@ -853,31 +825,25 @@ insert:
853 WARN_ON(1); 825 WARN_ON(1);
854 goto fail_unlock; 826 goto fail_unlock;
855 } 827 }
856csum:
857 leaf = path->nodes[0]; 828 leaf = path->nodes[0];
829csum:
858 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); 830 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
859 ret = 0; 831 item_end = (struct btrfs_csum_item *)((unsigned char *)item +
832 btrfs_item_size_nr(leaf, path->slots[0]));
860 item = (struct btrfs_csum_item *)((unsigned char *)item + 833 item = (struct btrfs_csum_item *)((unsigned char *)item +
861 csum_offset * csum_size); 834 csum_offset * csum_size);
862found: 835found:
863 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); 836 ins_size = (u32)(sums->len - total_bytes) >>
864 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 837 root->fs_info->sb->s_blocksize_bits;
865 btrfs_item_size_nr(leaf, path->slots[0])); 838 ins_size *= csum_size;
866next_sector: 839 ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
867 840 ins_size);
868 write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size); 841 write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
869 842 ins_size);
870 total_bytes += root->sectorsize; 843
871 sector_sum++; 844 ins_size /= csum_size;
872 if (total_bytes < sums->len) { 845 total_bytes += ins_size * root->sectorsize;
873 item = (struct btrfs_csum_item *)((char *)item + 846 index += ins_size;
874 csum_size);
875 if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
876 sector_sum->bytenr) {
877 bytenr = sector_sum->bytenr;
878 goto next_sector;
879 }
880 }
881 847
882 btrfs_mark_buffer_dirty(path->nodes[0]); 848 btrfs_mark_buffer_dirty(path->nodes[0]);
883 if (total_bytes < sums->len) { 849 if (total_bytes < sums->len) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 89da56a58b63..a005fe2c072a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -309,10 +309,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
309 ret = PTR_ERR(inode_root); 309 ret = PTR_ERR(inode_root);
310 goto cleanup; 310 goto cleanup;
311 } 311 }
312 if (btrfs_root_refs(&inode_root->root_item) == 0) {
313 ret = -ENOENT;
314 goto cleanup;
315 }
316 312
317 key.objectid = defrag->ino; 313 key.objectid = defrag->ino;
318 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 314 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -1317,6 +1313,56 @@ fail:
1317 1313
1318} 1314}
1319 1315
1316static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1317 size_t *write_bytes)
1318{
1319 struct btrfs_trans_handle *trans;
1320 struct btrfs_root *root = BTRFS_I(inode)->root;
1321 struct btrfs_ordered_extent *ordered;
1322 u64 lockstart, lockend;
1323 u64 num_bytes;
1324 int ret;
1325
1326 lockstart = round_down(pos, root->sectorsize);
1327 lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
1328
1329 while (1) {
1330 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1331 ordered = btrfs_lookup_ordered_range(inode, lockstart,
1332 lockend - lockstart + 1);
1333 if (!ordered) {
1334 break;
1335 }
1336 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1337 btrfs_start_ordered_extent(inode, ordered, 1);
1338 btrfs_put_ordered_extent(ordered);
1339 }
1340
1341 trans = btrfs_join_transaction(root);
1342 if (IS_ERR(trans)) {
1343 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1344 return PTR_ERR(trans);
1345 }
1346
1347 num_bytes = lockend - lockstart + 1;
1348 ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL,
1349 NULL);
1350 btrfs_end_transaction(trans, root);
1351 if (ret <= 0) {
1352 ret = 0;
1353 } else {
1354 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1355 EXTENT_DIRTY | EXTENT_DELALLOC |
1356 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1357 NULL, GFP_NOFS);
1358 *write_bytes = min_t(size_t, *write_bytes, num_bytes);
1359 }
1360
1361 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1362
1363 return ret;
1364}
1365
1320static noinline ssize_t __btrfs_buffered_write(struct file *file, 1366static noinline ssize_t __btrfs_buffered_write(struct file *file,
1321 struct iov_iter *i, 1367 struct iov_iter *i,
1322 loff_t pos) 1368 loff_t pos)
@@ -1324,10 +1370,12 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1324 struct inode *inode = file_inode(file); 1370 struct inode *inode = file_inode(file);
1325 struct btrfs_root *root = BTRFS_I(inode)->root; 1371 struct btrfs_root *root = BTRFS_I(inode)->root;
1326 struct page **pages = NULL; 1372 struct page **pages = NULL;
1373 u64 release_bytes = 0;
1327 unsigned long first_index; 1374 unsigned long first_index;
1328 size_t num_written = 0; 1375 size_t num_written = 0;
1329 int nrptrs; 1376 int nrptrs;
1330 int ret = 0; 1377 int ret = 0;
1378 bool only_release_metadata = false;
1331 bool force_page_uptodate = false; 1379 bool force_page_uptodate = false;
1332 1380
1333 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1381 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
@@ -1348,6 +1396,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1348 offset); 1396 offset);
1349 size_t num_pages = (write_bytes + offset + 1397 size_t num_pages = (write_bytes + offset +
1350 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1398 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1399 size_t reserve_bytes;
1351 size_t dirty_pages; 1400 size_t dirty_pages;
1352 size_t copied; 1401 size_t copied;
1353 1402
@@ -1362,11 +1411,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1362 break; 1411 break;
1363 } 1412 }
1364 1413
1365 ret = btrfs_delalloc_reserve_space(inode, 1414 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1366 num_pages << PAGE_CACHE_SHIFT); 1415 ret = btrfs_check_data_free_space(inode, reserve_bytes);
1416 if (ret == -ENOSPC &&
1417 (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1418 BTRFS_INODE_PREALLOC))) {
1419 ret = check_can_nocow(inode, pos, &write_bytes);
1420 if (ret > 0) {
1421 only_release_metadata = true;
1422 /*
1423 * our prealloc extent may be smaller than
1424 * write_bytes, so scale down.
1425 */
1426 num_pages = (write_bytes + offset +
1427 PAGE_CACHE_SIZE - 1) >>
1428 PAGE_CACHE_SHIFT;
1429 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1430 ret = 0;
1431 } else {
1432 ret = -ENOSPC;
1433 }
1434 }
1435
1367 if (ret) 1436 if (ret)
1368 break; 1437 break;
1369 1438
1439 ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
1440 if (ret) {
1441 if (!only_release_metadata)
1442 btrfs_free_reserved_data_space(inode,
1443 reserve_bytes);
1444 break;
1445 }
1446
1447 release_bytes = reserve_bytes;
1448
1370 /* 1449 /*
1371 * This is going to setup the pages array with the number of 1450 * This is going to setup the pages array with the number of
1372 * pages we want, so we don't really need to worry about the 1451 * pages we want, so we don't really need to worry about the
@@ -1375,11 +1454,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1375 ret = prepare_pages(root, file, pages, num_pages, 1454 ret = prepare_pages(root, file, pages, num_pages,
1376 pos, first_index, write_bytes, 1455 pos, first_index, write_bytes,
1377 force_page_uptodate); 1456 force_page_uptodate);
1378 if (ret) { 1457 if (ret)
1379 btrfs_delalloc_release_space(inode,
1380 num_pages << PAGE_CACHE_SHIFT);
1381 break; 1458 break;
1382 }
1383 1459
1384 copied = btrfs_copy_from_user(pos, num_pages, 1460 copied = btrfs_copy_from_user(pos, num_pages,
1385 write_bytes, pages, i); 1461 write_bytes, pages, i);
@@ -1409,30 +1485,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1409 * managed to copy. 1485 * managed to copy.
1410 */ 1486 */
1411 if (num_pages > dirty_pages) { 1487 if (num_pages > dirty_pages) {
1488 release_bytes = (num_pages - dirty_pages) <<
1489 PAGE_CACHE_SHIFT;
1412 if (copied > 0) { 1490 if (copied > 0) {
1413 spin_lock(&BTRFS_I(inode)->lock); 1491 spin_lock(&BTRFS_I(inode)->lock);
1414 BTRFS_I(inode)->outstanding_extents++; 1492 BTRFS_I(inode)->outstanding_extents++;
1415 spin_unlock(&BTRFS_I(inode)->lock); 1493 spin_unlock(&BTRFS_I(inode)->lock);
1416 } 1494 }
1417 btrfs_delalloc_release_space(inode, 1495 if (only_release_metadata)
1418 (num_pages - dirty_pages) << 1496 btrfs_delalloc_release_metadata(inode,
1419 PAGE_CACHE_SHIFT); 1497 release_bytes);
1498 else
1499 btrfs_delalloc_release_space(inode,
1500 release_bytes);
1420 } 1501 }
1421 1502
1503 release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
1422 if (copied > 0) { 1504 if (copied > 0) {
1423 ret = btrfs_dirty_pages(root, inode, pages, 1505 ret = btrfs_dirty_pages(root, inode, pages,
1424 dirty_pages, pos, copied, 1506 dirty_pages, pos, copied,
1425 NULL); 1507 NULL);
1426 if (ret) { 1508 if (ret) {
1427 btrfs_delalloc_release_space(inode,
1428 dirty_pages << PAGE_CACHE_SHIFT);
1429 btrfs_drop_pages(pages, num_pages); 1509 btrfs_drop_pages(pages, num_pages);
1430 break; 1510 break;
1431 } 1511 }
1432 } 1512 }
1433 1513
1514 release_bytes = 0;
1434 btrfs_drop_pages(pages, num_pages); 1515 btrfs_drop_pages(pages, num_pages);
1435 1516
1517 if (only_release_metadata && copied > 0) {
1518 u64 lockstart = round_down(pos, root->sectorsize);
1519 u64 lockend = lockstart +
1520 (dirty_pages << PAGE_CACHE_SHIFT) - 1;
1521
1522 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1523 lockend, EXTENT_NORESERVE, NULL,
1524 NULL, GFP_NOFS);
1525 only_release_metadata = false;
1526 }
1527
1436 cond_resched(); 1528 cond_resched();
1437 1529
1438 balance_dirty_pages_ratelimited(inode->i_mapping); 1530 balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1445,6 +1537,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1445 1537
1446 kfree(pages); 1538 kfree(pages);
1447 1539
1540 if (release_bytes) {
1541 if (only_release_metadata)
1542 btrfs_delalloc_release_metadata(inode, release_bytes);
1543 else
1544 btrfs_delalloc_release_space(inode, release_bytes);
1545 }
1546
1448 return num_written ? num_written : ret; 1547 return num_written ? num_written : ret;
1449} 1548}
1450 1549
@@ -2175,12 +2274,6 @@ static long btrfs_fallocate(struct file *file, int mode,
2175 goto out_reserve_fail; 2274 goto out_reserve_fail;
2176 } 2275 }
2177 2276
2178 /*
2179 * wait for ordered IO before we have any locks. We'll loop again
2180 * below with the locks held.
2181 */
2182 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
2183
2184 mutex_lock(&inode->i_mutex); 2277 mutex_lock(&inode->i_mutex);
2185 ret = inode_newsize_ok(inode, alloc_end); 2278 ret = inode_newsize_ok(inode, alloc_end);
2186 if (ret) 2279 if (ret)
@@ -2191,8 +2284,23 @@ static long btrfs_fallocate(struct file *file, int mode,
2191 alloc_start); 2284 alloc_start);
2192 if (ret) 2285 if (ret)
2193 goto out; 2286 goto out;
2287 } else {
2288 /*
2289 * If we are fallocating from the end of the file onward we
2290 * need to zero out the end of the page if i_size lands in the
2291 * middle of a page.
2292 */
2293 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
2294 if (ret)
2295 goto out;
2194 } 2296 }
2195 2297
2298 /*
2299 * wait for ordered IO before we have any locks. We'll loop again
2300 * below with the locks held.
2301 */
2302 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
2303
2196 locked_end = alloc_end - 1; 2304 locked_end = alloc_end - 1;
2197 while (1) { 2305 while (1) {
2198 struct btrfs_ordered_extent *ordered; 2306 struct btrfs_ordered_extent *ordered;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2750b5023526..b21a3cd667d8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -213,7 +213,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
213 else 213 else
214 ret = 0; 214 ret = 0;
215 spin_unlock(&rsv->lock); 215 spin_unlock(&rsv->lock);
216 return 0; 216 return ret;
217} 217}
218 218
219int btrfs_truncate_free_space_cache(struct btrfs_root *root, 219int btrfs_truncate_free_space_cache(struct btrfs_root *root,
@@ -3150,6 +3150,8 @@ again:
3150 return 0; 3150 return 0;
3151} 3151}
3152 3152
3153#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__)
3154
3153/* 3155/*
3154 * This test just does basic sanity checking, making sure we can add an extent 3156 * This test just does basic sanity checking, making sure we can add an extent
3155 * entry and remove space from either end and the middle, and make sure we can 3157 * entry and remove space from either end and the middle, and make sure we can
@@ -3159,63 +3161,63 @@ static int test_extents(struct btrfs_block_group_cache *cache)
3159{ 3161{
3160 int ret = 0; 3162 int ret = 0;
3161 3163
3162 printk(KERN_ERR "Running extent only tests\n"); 3164 test_msg("Running extent only tests\n");
3163 3165
3164 /* First just make sure we can remove an entire entry */ 3166 /* First just make sure we can remove an entire entry */
3165 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); 3167 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
3166 if (ret) { 3168 if (ret) {
3167 printk(KERN_ERR "Error adding initial extents %d\n", ret); 3169 test_msg("Error adding initial extents %d\n", ret);
3168 return ret; 3170 return ret;
3169 } 3171 }
3170 3172
3171 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); 3173 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
3172 if (ret) { 3174 if (ret) {
3173 printk(KERN_ERR "Error removing extent %d\n", ret); 3175 test_msg("Error removing extent %d\n", ret);
3174 return ret; 3176 return ret;
3175 } 3177 }
3176 3178
3177 if (check_exists(cache, 0, 4 * 1024 * 1024)) { 3179 if (check_exists(cache, 0, 4 * 1024 * 1024)) {
3178 printk(KERN_ERR "Full remove left some lingering space\n"); 3180 test_msg("Full remove left some lingering space\n");
3179 return -1; 3181 return -1;
3180 } 3182 }
3181 3183
3182 /* Ok edge and middle cases now */ 3184 /* Ok edge and middle cases now */
3183 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); 3185 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
3184 if (ret) { 3186 if (ret) {
3185 printk(KERN_ERR "Error adding half extent %d\n", ret); 3187 test_msg("Error adding half extent %d\n", ret);
3186 return ret; 3188 return ret;
3187 } 3189 }
3188 3190
3189 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); 3191 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
3190 if (ret) { 3192 if (ret) {
3191 printk(KERN_ERR "Error removing tail end %d\n", ret); 3193 test_msg("Error removing tail end %d\n", ret);
3192 return ret; 3194 return ret;
3193 } 3195 }
3194 3196
3195 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); 3197 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
3196 if (ret) { 3198 if (ret) {
3197 printk(KERN_ERR "Error removing front end %d\n", ret); 3199 test_msg("Error removing front end %d\n", ret);
3198 return ret; 3200 return ret;
3199 } 3201 }
3200 3202
3201 ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); 3203 ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
3202 if (ret) { 3204 if (ret) {
3203 printk(KERN_ERR "Error removing middle piece %d\n", ret); 3205 test_msg("Error removing middle piece %d\n", ret);
3204 return ret; 3206 return ret;
3205 } 3207 }
3206 3208
3207 if (check_exists(cache, 0, 1 * 1024 * 1024)) { 3209 if (check_exists(cache, 0, 1 * 1024 * 1024)) {
3208 printk(KERN_ERR "Still have space at the front\n"); 3210 test_msg("Still have space at the front\n");
3209 return -1; 3211 return -1;
3210 } 3212 }
3211 3213
3212 if (check_exists(cache, 2 * 1024 * 1024, 4096)) { 3214 if (check_exists(cache, 2 * 1024 * 1024, 4096)) {
3213 printk(KERN_ERR "Still have space in the middle\n"); 3215 test_msg("Still have space in the middle\n");
3214 return -1; 3216 return -1;
3215 } 3217 }
3216 3218
3217 if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { 3219 if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
3218 printk(KERN_ERR "Still have space at the end\n"); 3220 test_msg("Still have space at the end\n");
3219 return -1; 3221 return -1;
3220 } 3222 }
3221 3223
@@ -3230,34 +3232,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
3230 u64 next_bitmap_offset; 3232 u64 next_bitmap_offset;
3231 int ret; 3233 int ret;
3232 3234
3233 printk(KERN_ERR "Running bitmap only tests\n"); 3235 test_msg("Running bitmap only tests\n");
3234 3236
3235 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); 3237 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
3236 if (ret) { 3238 if (ret) {
3237 printk(KERN_ERR "Couldn't create a bitmap entry %d\n", ret); 3239 test_msg("Couldn't create a bitmap entry %d\n", ret);
3238 return ret; 3240 return ret;
3239 } 3241 }
3240 3242
3241 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); 3243 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
3242 if (ret) { 3244 if (ret) {
3243 printk(KERN_ERR "Error removing bitmap full range %d\n", ret); 3245 test_msg("Error removing bitmap full range %d\n", ret);
3244 return ret; 3246 return ret;
3245 } 3247 }
3246 3248
3247 if (check_exists(cache, 0, 4 * 1024 * 1024)) { 3249 if (check_exists(cache, 0, 4 * 1024 * 1024)) {
3248 printk(KERN_ERR "Left some space in bitmap\n"); 3250 test_msg("Left some space in bitmap\n");
3249 return -1; 3251 return -1;
3250 } 3252 }
3251 3253
3252 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); 3254 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
3253 if (ret) { 3255 if (ret) {
3254 printk(KERN_ERR "Couldn't add to our bitmap entry %d\n", ret); 3256 test_msg("Couldn't add to our bitmap entry %d\n", ret);
3255 return ret; 3257 return ret;
3256 } 3258 }
3257 3259
3258 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); 3260 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
3259 if (ret) { 3261 if (ret) {
3260 printk(KERN_ERR "Couldn't remove middle chunk %d\n", ret); 3262 test_msg("Couldn't remove middle chunk %d\n", ret);
3261 return ret; 3263 return ret;
3262 } 3264 }
3263 3265
@@ -3271,21 +3273,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
3271 ret = add_free_space_entry(cache, next_bitmap_offset - 3273 ret = add_free_space_entry(cache, next_bitmap_offset -
3272 (2 * 1024 * 1024), 4 * 1024 * 1024, 1); 3274 (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
3273 if (ret) { 3275 if (ret) {
3274 printk(KERN_ERR "Couldn't add space that straddles two bitmaps" 3276 test_msg("Couldn't add space that straddles two bitmaps %d\n",
3275 " %d\n", ret); 3277 ret);
3276 return ret; 3278 return ret;
3277 } 3279 }
3278 3280
3279 ret = btrfs_remove_free_space(cache, next_bitmap_offset - 3281 ret = btrfs_remove_free_space(cache, next_bitmap_offset -
3280 (1 * 1024 * 1024), 2 * 1024 * 1024); 3282 (1 * 1024 * 1024), 2 * 1024 * 1024);
3281 if (ret) { 3283 if (ret) {
3282 printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret); 3284 test_msg("Couldn't remove overlapping space %d\n", ret);
3283 return ret; 3285 return ret;
3284 } 3286 }
3285 3287
3286 if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), 3288 if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
3287 2 * 1024 * 1024)) { 3289 2 * 1024 * 1024)) {
3288 printk(KERN_ERR "Left some space when removing overlapping\n"); 3290 test_msg("Left some space when removing overlapping\n");
3289 return -1; 3291 return -1;
3290 } 3292 }
3291 3293
@@ -3300,7 +3302,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3300 u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); 3302 u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
3301 int ret; 3303 int ret;
3302 3304
3303 printk(KERN_ERR "Running bitmap and extent tests\n"); 3305 test_msg("Running bitmap and extent tests\n");
3304 3306
3305 /* 3307 /*
3306 * First let's do something simple, an extent at the same offset as the 3308 * First let's do something simple, an extent at the same offset as the
@@ -3309,42 +3311,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3309 */ 3311 */
3310 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); 3312 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
3311 if (ret) { 3313 if (ret) {
3312 printk(KERN_ERR "Couldn't create bitmap entry %d\n", ret); 3314 test_msg("Couldn't create bitmap entry %d\n", ret);
3313 return ret; 3315 return ret;
3314 } 3316 }
3315 3317
3316 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); 3318 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
3317 if (ret) { 3319 if (ret) {
3318 printk(KERN_ERR "Couldn't add extent entry %d\n", ret); 3320 test_msg("Couldn't add extent entry %d\n", ret);
3319 return ret; 3321 return ret;
3320 } 3322 }
3321 3323
3322 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); 3324 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
3323 if (ret) { 3325 if (ret) {
3324 printk(KERN_ERR "Couldn't remove extent entry %d\n", ret); 3326 test_msg("Couldn't remove extent entry %d\n", ret);
3325 return ret; 3327 return ret;
3326 } 3328 }
3327 3329
3328 if (check_exists(cache, 0, 1 * 1024 * 1024)) { 3330 if (check_exists(cache, 0, 1 * 1024 * 1024)) {
3329 printk(KERN_ERR "Left remnants after our remove\n"); 3331 test_msg("Left remnants after our remove\n");
3330 return -1; 3332 return -1;
3331 } 3333 }
3332 3334
3333 /* Now to add back the extent entry and remove from the bitmap */ 3335 /* Now to add back the extent entry and remove from the bitmap */
3334 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); 3336 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
3335 if (ret) { 3337 if (ret) {
3336 printk(KERN_ERR "Couldn't re-add extent entry %d\n", ret); 3338 test_msg("Couldn't re-add extent entry %d\n", ret);
3337 return ret; 3339 return ret;
3338 } 3340 }
3339 3341
3340 ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); 3342 ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
3341 if (ret) { 3343 if (ret) {
3342 printk(KERN_ERR "Couldn't remove from bitmap %d\n", ret); 3344 test_msg("Couldn't remove from bitmap %d\n", ret);
3343 return ret; 3345 return ret;
3344 } 3346 }
3345 3347
3346 if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { 3348 if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
3347 printk(KERN_ERR "Left remnants in the bitmap\n"); 3349 test_msg("Left remnants in the bitmap\n");
3348 return -1; 3350 return -1;
3349 } 3351 }
3350 3352
@@ -3354,19 +3356,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3354 */ 3356 */
3355 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); 3357 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
3356 if (ret) { 3358 if (ret) {
3357 printk(KERN_ERR "Couldn't add to a bitmap %d\n", ret); 3359 test_msg("Couldn't add to a bitmap %d\n", ret);
3358 return ret; 3360 return ret;
3359 } 3361 }
3360 3362
3361 ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); 3363 ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
3362 if (ret) { 3364 if (ret) {
3363 printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret); 3365 test_msg("Couldn't remove overlapping space %d\n", ret);
3364 return ret; 3366 return ret;
3365 } 3367 }
3366 3368
3367 if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { 3369 if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
3368 printk(KERN_ERR "Left over peices after removing " 3370 test_msg("Left over pieces after removing overlapping\n");
3369 "overlapping\n");
3370 return -1; 3371 return -1;
3371 } 3372 }
3372 3373
@@ -3375,24 +3376,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3375 /* Now with the extent entry offset into the bitmap */ 3376 /* Now with the extent entry offset into the bitmap */
3376 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); 3377 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
3377 if (ret) { 3378 if (ret) {
3378 printk(KERN_ERR "Couldn't add space to the bitmap %d\n", ret); 3379 test_msg("Couldn't add space to the bitmap %d\n", ret);
3379 return ret; 3380 return ret;
3380 } 3381 }
3381 3382
3382 ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); 3383 ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
3383 if (ret) { 3384 if (ret) {
3384 printk(KERN_ERR "Couldn't add extent to the cache %d\n", ret); 3385 test_msg("Couldn't add extent to the cache %d\n", ret);
3385 return ret; 3386 return ret;
3386 } 3387 }
3387 3388
3388 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); 3389 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
3389 if (ret) { 3390 if (ret) {
3390 printk(KERN_ERR "Problem removing overlapping space %d\n", ret); 3391 test_msg("Problem removing overlapping space %d\n", ret);
3391 return ret; 3392 return ret;
3392 } 3393 }
3393 3394
3394 if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { 3395 if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
3395 printk(KERN_ERR "Left something behind when removing space"); 3396 test_msg("Left something behind when removing space\n");
3396 return -1; 3397 return -1;
3397 } 3398 }
3398 3399
@@ -3410,27 +3411,27 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3410 ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, 3411 ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
3411 4 * 1024 * 1024, 1); 3412 4 * 1024 * 1024, 1);
3412 if (ret) { 3413 if (ret) {
3413 printk(KERN_ERR "Couldn't add bitmap %d\n", ret); 3414 test_msg("Couldn't add bitmap %d\n", ret);
3414 return ret; 3415 return ret;
3415 } 3416 }
3416 3417
3417 ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, 3418 ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
3418 5 * 1024 * 1024, 0); 3419 5 * 1024 * 1024, 0);
3419 if (ret) { 3420 if (ret) {
3420 printk(KERN_ERR "Couldn't add extent entry %d\n", ret); 3421 test_msg("Couldn't add extent entry %d\n", ret);
3421 return ret; 3422 return ret;
3422 } 3423 }
3423 3424
3424 ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, 3425 ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
3425 5 * 1024 * 1024); 3426 5 * 1024 * 1024);
3426 if (ret) { 3427 if (ret) {
3427 printk(KERN_ERR "Failed to free our space %d\n", ret); 3428 test_msg("Failed to free our space %d\n", ret);
3428 return ret; 3429 return ret;
3429 } 3430 }
3430 3431
3431 if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024, 3432 if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
3432 5 * 1024 * 1024)) { 3433 5 * 1024 * 1024)) {
3433 printk(KERN_ERR "Left stuff over\n"); 3434 test_msg("Left stuff over\n");
3434 return -1; 3435 return -1;
3435 } 3436 }
3436 3437
@@ -3444,20 +3445,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3444 */ 3445 */
3445 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); 3446 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
3446 if (ret) { 3447 if (ret) {
3447 printk(KERN_ERR "Couldn't add bitmap entry %d\n", ret); 3448 test_msg("Couldn't add bitmap entry %d\n", ret);
3448 return ret; 3449 return ret;
3449 } 3450 }
3450 3451
3451 ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); 3452 ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
3452 if (ret) { 3453 if (ret) {
3453 printk(KERN_ERR "Couldn't add extent entry %d\n", ret); 3454 test_msg("Couldn't add extent entry %d\n", ret);
3454 return ret; 3455 return ret;
3455 } 3456 }
3456 3457
3457 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); 3458 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
3458 if (ret) { 3459 if (ret) {
3459 printk(KERN_ERR "Error removing bitmap and extent " 3460 test_msg("Error removing bitmap and extent overlapping %d\n", ret);
3460 "overlapping %d\n", ret);
3461 return ret; 3461 return ret;
3462 } 3462 }
3463 3463
@@ -3469,11 +3469,11 @@ void btrfs_test_free_space_cache(void)
3469{ 3469{
3470 struct btrfs_block_group_cache *cache; 3470 struct btrfs_block_group_cache *cache;
3471 3471
3472 printk(KERN_ERR "Running btrfs free space cache tests\n"); 3472 test_msg("Running btrfs free space cache tests\n");
3473 3473
3474 cache = init_test_block_group(); 3474 cache = init_test_block_group();
3475 if (!cache) { 3475 if (!cache) {
3476 printk(KERN_ERR "Couldn't run the tests\n"); 3476 test_msg("Couldn't run the tests\n");
3477 return; 3477 return;
3478 } 3478 }
3479 3479
@@ -3487,6 +3487,9 @@ out:
3487 __btrfs_remove_free_space_cache(cache->free_space_ctl); 3487 __btrfs_remove_free_space_cache(cache->free_space_ctl);
3488 kfree(cache->free_space_ctl); 3488 kfree(cache->free_space_ctl);
3489 kfree(cache); 3489 kfree(cache);
3490 printk(KERN_ERR "Free space cache tests finished\n"); 3490 test_msg("Free space cache tests finished\n");
3491} 3491}
3492#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ 3492#undef test_msg
3493#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
3494void btrfs_test_free_space_cache(void) {}
3495#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 8b7f19f44961..894116b71304 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -113,8 +113,6 @@ int btrfs_return_cluster_to_free_space(
113int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, 113int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
114 u64 *trimmed, u64 start, u64 end, u64 minlen); 114 u64 *trimmed, u64 start, u64 end, u64 minlen);
115 115
116#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
117void btrfs_test_free_space_cache(void); 116void btrfs_test_free_space_cache(void);
118#endif
119 117
120#endif 118#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4f9d16b70d3d..6d1b93c8aafb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -42,6 +42,7 @@
42#include <linux/mount.h> 42#include <linux/mount.h>
43#include <linux/btrfs.h> 43#include <linux/btrfs.h>
44#include <linux/blkdev.h> 44#include <linux/blkdev.h>
45#include <linux/posix_acl_xattr.h>
45#include "compat.h" 46#include "compat.h"
46#include "ctree.h" 47#include "ctree.h"
47#include "disk-io.h" 48#include "disk-io.h"
@@ -57,6 +58,7 @@
57#include "free-space-cache.h" 58#include "free-space-cache.h"
58#include "inode-map.h" 59#include "inode-map.h"
59#include "backref.h" 60#include "backref.h"
61#include "hash.h"
60 62
61struct btrfs_iget_args { 63struct btrfs_iget_args {
62 u64 ino; 64 u64 ino;
@@ -701,8 +703,12 @@ retry:
701 async_extent->nr_pages = 0; 703 async_extent->nr_pages = 0;
702 async_extent->pages = NULL; 704 async_extent->pages = NULL;
703 705
704 if (ret == -ENOSPC) 706 if (ret == -ENOSPC) {
707 unlock_extent(io_tree, async_extent->start,
708 async_extent->start +
709 async_extent->ram_size - 1);
705 goto retry; 710 goto retry;
711 }
706 goto out_free; 712 goto out_free;
707 } 713 }
708 714
@@ -1529,6 +1535,46 @@ static void btrfs_merge_extent_hook(struct inode *inode,
1529 spin_unlock(&BTRFS_I(inode)->lock); 1535 spin_unlock(&BTRFS_I(inode)->lock);
1530} 1536}
1531 1537
1538static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1539 struct inode *inode)
1540{
1541 spin_lock(&root->delalloc_lock);
1542 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1543 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1544 &root->delalloc_inodes);
1545 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1546 &BTRFS_I(inode)->runtime_flags);
1547 root->nr_delalloc_inodes++;
1548 if (root->nr_delalloc_inodes == 1) {
1549 spin_lock(&root->fs_info->delalloc_root_lock);
1550 BUG_ON(!list_empty(&root->delalloc_root));
1551 list_add_tail(&root->delalloc_root,
1552 &root->fs_info->delalloc_roots);
1553 spin_unlock(&root->fs_info->delalloc_root_lock);
1554 }
1555 }
1556 spin_unlock(&root->delalloc_lock);
1557}
1558
1559static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1560 struct inode *inode)
1561{
1562 spin_lock(&root->delalloc_lock);
1563 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1564 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1565 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1566 &BTRFS_I(inode)->runtime_flags);
1567 root->nr_delalloc_inodes--;
1568 if (!root->nr_delalloc_inodes) {
1569 spin_lock(&root->fs_info->delalloc_root_lock);
1570 BUG_ON(list_empty(&root->delalloc_root));
1571 list_del_init(&root->delalloc_root);
1572 spin_unlock(&root->fs_info->delalloc_root_lock);
1573 }
1574 }
1575 spin_unlock(&root->delalloc_lock);
1576}
1577
1532/* 1578/*
1533 * extent_io.c set_bit_hook, used to track delayed allocation 1579 * extent_io.c set_bit_hook, used to track delayed allocation
1534 * bytes in this file, and to maintain the list of inodes that 1580 * bytes in this file, and to maintain the list of inodes that
@@ -1561,16 +1607,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
1561 spin_lock(&BTRFS_I(inode)->lock); 1607 spin_lock(&BTRFS_I(inode)->lock);
1562 BTRFS_I(inode)->delalloc_bytes += len; 1608 BTRFS_I(inode)->delalloc_bytes += len;
1563 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1609 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1564 &BTRFS_I(inode)->runtime_flags)) { 1610 &BTRFS_I(inode)->runtime_flags))
1565 spin_lock(&root->fs_info->delalloc_lock); 1611 btrfs_add_delalloc_inodes(root, inode);
1566 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1567 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1568 &root->fs_info->delalloc_inodes);
1569 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1570 &BTRFS_I(inode)->runtime_flags);
1571 }
1572 spin_unlock(&root->fs_info->delalloc_lock);
1573 }
1574 spin_unlock(&BTRFS_I(inode)->lock); 1612 spin_unlock(&BTRFS_I(inode)->lock);
1575 } 1613 }
1576} 1614}
@@ -1604,7 +1642,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1604 btrfs_delalloc_release_metadata(inode, len); 1642 btrfs_delalloc_release_metadata(inode, len);
1605 1643
1606 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 1644 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1607 && do_list) 1645 && do_list && !(state->state & EXTENT_NORESERVE))
1608 btrfs_free_reserved_data_space(inode, len); 1646 btrfs_free_reserved_data_space(inode, len);
1609 1647
1610 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, 1648 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
@@ -1613,15 +1651,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1613 BTRFS_I(inode)->delalloc_bytes -= len; 1651 BTRFS_I(inode)->delalloc_bytes -= len;
1614 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && 1652 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1615 test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1653 test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1616 &BTRFS_I(inode)->runtime_flags)) { 1654 &BTRFS_I(inode)->runtime_flags))
1617 spin_lock(&root->fs_info->delalloc_lock); 1655 btrfs_del_delalloc_inode(root, inode);
1618 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1619 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1620 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1621 &BTRFS_I(inode)->runtime_flags);
1622 }
1623 spin_unlock(&root->fs_info->delalloc_lock);
1624 }
1625 spin_unlock(&BTRFS_I(inode)->lock); 1656 spin_unlock(&BTRFS_I(inode)->lock);
1626 } 1657 }
1627} 1658}
@@ -2263,11 +2294,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
2263 return 0; 2294 return 0;
2264 return PTR_ERR(root); 2295 return PTR_ERR(root);
2265 } 2296 }
2266 if (btrfs_root_refs(&root->root_item) == 0) {
2267 srcu_read_unlock(&fs_info->subvol_srcu, index);
2268 /* parse ENOENT to 0 */
2269 return 0;
2270 }
2271 2297
2272 /* step 2: get inode */ 2298 /* step 2: get inode */
2273 key.objectid = backref->inum; 2299 key.objectid = backref->inum;
@@ -3215,13 +3241,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3215 /* 1 for the orphan item deletion. */ 3241 /* 1 for the orphan item deletion. */
3216 trans = btrfs_start_transaction(root, 1); 3242 trans = btrfs_start_transaction(root, 1);
3217 if (IS_ERR(trans)) { 3243 if (IS_ERR(trans)) {
3244 iput(inode);
3218 ret = PTR_ERR(trans); 3245 ret = PTR_ERR(trans);
3219 goto out; 3246 goto out;
3220 } 3247 }
3221 ret = btrfs_orphan_add(trans, inode); 3248 ret = btrfs_orphan_add(trans, inode);
3222 btrfs_end_transaction(trans, root); 3249 btrfs_end_transaction(trans, root);
3223 if (ret) 3250 if (ret) {
3251 iput(inode);
3224 goto out; 3252 goto out;
3253 }
3225 3254
3226 ret = btrfs_truncate(inode); 3255 ret = btrfs_truncate(inode);
3227 if (ret) 3256 if (ret)
@@ -3274,8 +3303,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3274{ 3303{
3275 u32 nritems = btrfs_header_nritems(leaf); 3304 u32 nritems = btrfs_header_nritems(leaf);
3276 struct btrfs_key found_key; 3305 struct btrfs_key found_key;
3306 static u64 xattr_access = 0;
3307 static u64 xattr_default = 0;
3277 int scanned = 0; 3308 int scanned = 0;
3278 3309
3310 if (!xattr_access) {
3311 xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
3312 strlen(POSIX_ACL_XATTR_ACCESS));
3313 xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
3314 strlen(POSIX_ACL_XATTR_DEFAULT));
3315 }
3316
3279 slot++; 3317 slot++;
3280 while (slot < nritems) { 3318 while (slot < nritems) {
3281 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3319 btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -3285,8 +3323,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3285 return 0; 3323 return 0;
3286 3324
3287 /* we found an xattr, assume we've got an acl */ 3325 /* we found an xattr, assume we've got an acl */
3288 if (found_key.type == BTRFS_XATTR_ITEM_KEY) 3326 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3289 return 1; 3327 if (found_key.offset == xattr_access ||
3328 found_key.offset == xattr_default)
3329 return 1;
3330 }
3290 3331
3291 /* 3332 /*
3292 * we found a key greater than an xattr key, there can't 3333 * we found a key greater than an xattr key, there can't
@@ -3660,53 +3701,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3660 } 3701 }
3661 return ret; 3702 return ret;
3662} 3703}
3663
3664
3665/* helper to check if there is any shared block in the path */
3666static int check_path_shared(struct btrfs_root *root,
3667 struct btrfs_path *path)
3668{
3669 struct extent_buffer *eb;
3670 int level;
3671 u64 refs = 1;
3672
3673 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
3674 int ret;
3675
3676 if (!path->nodes[level])
3677 break;
3678 eb = path->nodes[level];
3679 if (!btrfs_block_can_be_shared(root, eb))
3680 continue;
3681 ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1,
3682 &refs, NULL);
3683 if (refs > 1)
3684 return 1;
3685 }
3686 return 0;
3687}
3688 3704
3689/* 3705/*
3690 * helper to start transaction for unlink and rmdir. 3706 * helper to start transaction for unlink and rmdir.
3691 * 3707 *
3692 * unlink and rmdir are special in btrfs, they do not always free space. 3708 * unlink and rmdir are special in btrfs, they do not always free space, so
3693 * so in enospc case, we should make sure they will free space before 3709 * if we cannot make our reservations the normal way try and see if there is
3694 * allowing them to use the global metadata reservation. 3710 * plenty of slack room in the global reserve to migrate, otherwise we cannot
3711 * allow the unlink to occur.
3695 */ 3712 */
3696static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, 3713static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
3697 struct dentry *dentry)
3698{ 3714{
3699 struct btrfs_trans_handle *trans; 3715 struct btrfs_trans_handle *trans;
3700 struct btrfs_root *root = BTRFS_I(dir)->root; 3716 struct btrfs_root *root = BTRFS_I(dir)->root;
3701 struct btrfs_path *path;
3702 struct btrfs_dir_item *di;
3703 struct inode *inode = dentry->d_inode;
3704 u64 index;
3705 int check_link = 1;
3706 int err = -ENOSPC;
3707 int ret; 3717 int ret;
3708 u64 ino = btrfs_ino(inode);
3709 u64 dir_ino = btrfs_ino(dir);
3710 3718
3711 /* 3719 /*
3712 * 1 for the possible orphan item 3720 * 1 for the possible orphan item
@@ -3719,158 +3727,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
3719 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 3727 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
3720 return trans; 3728 return trans;
3721 3729
3722 if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 3730 if (PTR_ERR(trans) == -ENOSPC) {
3723 return ERR_PTR(-ENOSPC); 3731 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
3724
3725 /* check if there is someone else holds reference */
3726 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
3727 return ERR_PTR(-ENOSPC);
3728
3729 if (atomic_read(&inode->i_count) > 2)
3730 return ERR_PTR(-ENOSPC);
3731
3732 if (xchg(&root->fs_info->enospc_unlink, 1))
3733 return ERR_PTR(-ENOSPC);
3734
3735 path = btrfs_alloc_path();
3736 if (!path) {
3737 root->fs_info->enospc_unlink = 0;
3738 return ERR_PTR(-ENOMEM);
3739 }
3740 3732
3741 /* 1 for the orphan item */ 3733 trans = btrfs_start_transaction(root, 0);
3742 trans = btrfs_start_transaction(root, 1); 3734 if (IS_ERR(trans))
3743 if (IS_ERR(trans)) { 3735 return trans;
3744 btrfs_free_path(path); 3736 ret = btrfs_cond_migrate_bytes(root->fs_info,
3745 root->fs_info->enospc_unlink = 0; 3737 &root->fs_info->trans_block_rsv,
3746 return trans; 3738 num_bytes, 5);
3747 } 3739 if (ret) {
3748 3740 btrfs_end_transaction(trans, root);
3749 path->skip_locking = 1; 3741 return ERR_PTR(ret);
3750 path->search_commit_root = 1;
3751
3752 ret = btrfs_lookup_inode(trans, root, path,
3753 &BTRFS_I(dir)->location, 0);
3754 if (ret < 0) {
3755 err = ret;
3756 goto out;
3757 }
3758 if (ret == 0) {
3759 if (check_path_shared(root, path))
3760 goto out;
3761 } else {
3762 check_link = 0;
3763 }
3764 btrfs_release_path(path);
3765
3766 ret = btrfs_lookup_inode(trans, root, path,
3767 &BTRFS_I(inode)->location, 0);
3768 if (ret < 0) {
3769 err = ret;
3770 goto out;
3771 }
3772 if (ret == 0) {
3773 if (check_path_shared(root, path))
3774 goto out;
3775 } else {
3776 check_link = 0;
3777 }
3778 btrfs_release_path(path);
3779
3780 if (ret == 0 && S_ISREG(inode->i_mode)) {
3781 ret = btrfs_lookup_file_extent(trans, root, path,
3782 ino, (u64)-1, 0);
3783 if (ret < 0) {
3784 err = ret;
3785 goto out;
3786 } 3742 }
3787 BUG_ON(ret == 0); /* Corruption */
3788 if (check_path_shared(root, path))
3789 goto out;
3790 btrfs_release_path(path);
3791 }
3792
3793 if (!check_link) {
3794 err = 0;
3795 goto out;
3796 }
3797
3798 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3799 dentry->d_name.name, dentry->d_name.len, 0);
3800 if (IS_ERR(di)) {
3801 err = PTR_ERR(di);
3802 goto out;
3803 }
3804 if (di) {
3805 if (check_path_shared(root, path))
3806 goto out;
3807 } else {
3808 err = 0;
3809 goto out;
3810 }
3811 btrfs_release_path(path);
3812
3813 ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
3814 dentry->d_name.len, ino, dir_ino, 0,
3815 &index);
3816 if (ret) {
3817 err = ret;
3818 goto out;
3819 }
3820
3821 if (check_path_shared(root, path))
3822 goto out;
3823
3824 btrfs_release_path(path);
3825
3826 /*
3827 * This is a commit root search, if we can lookup inode item and other
3828 * relative items in the commit root, it means the transaction of
3829 * dir/file creation has been committed, and the dir index item that we
3830 * delay to insert has also been inserted into the commit root. So
3831 * we needn't worry about the delayed insertion of the dir index item
3832 * here.
3833 */
3834 di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
3835 dentry->d_name.name, dentry->d_name.len, 0);
3836 if (IS_ERR(di)) {
3837 err = PTR_ERR(di);
3838 goto out;
3839 }
3840 BUG_ON(ret == -ENOENT);
3841 if (check_path_shared(root, path))
3842 goto out;
3843
3844 err = 0;
3845out:
3846 btrfs_free_path(path);
3847 /* Migrate the orphan reservation over */
3848 if (!err)
3849 err = btrfs_block_rsv_migrate(trans->block_rsv,
3850 &root->fs_info->global_block_rsv,
3851 trans->bytes_reserved);
3852
3853 if (err) {
3854 btrfs_end_transaction(trans, root);
3855 root->fs_info->enospc_unlink = 0;
3856 return ERR_PTR(err);
3857 }
3858
3859 trans->block_rsv = &root->fs_info->global_block_rsv;
3860 return trans;
3861}
3862
3863static void __unlink_end_trans(struct btrfs_trans_handle *trans,
3864 struct btrfs_root *root)
3865{
3866 if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
3867 btrfs_block_rsv_release(root, trans->block_rsv,
3868 trans->bytes_reserved);
3869 trans->block_rsv = &root->fs_info->trans_block_rsv; 3743 trans->block_rsv = &root->fs_info->trans_block_rsv;
3870 BUG_ON(!root->fs_info->enospc_unlink); 3744 trans->bytes_reserved = num_bytes;
3871 root->fs_info->enospc_unlink = 0;
3872 } 3745 }
3873 btrfs_end_transaction(trans, root); 3746 return trans;
3874} 3747}
3875 3748
3876static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 3749static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3880,7 +3753,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3880 struct inode *inode = dentry->d_inode; 3753 struct inode *inode = dentry->d_inode;
3881 int ret; 3754 int ret;
3882 3755
3883 trans = __unlink_start_trans(dir, dentry); 3756 trans = __unlink_start_trans(dir);
3884 if (IS_ERR(trans)) 3757 if (IS_ERR(trans))
3885 return PTR_ERR(trans); 3758 return PTR_ERR(trans);
3886 3759
@@ -3898,7 +3771,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3898 } 3771 }
3899 3772
3900out: 3773out:
3901 __unlink_end_trans(trans, root); 3774 btrfs_end_transaction(trans, root);
3902 btrfs_btree_balance_dirty(root); 3775 btrfs_btree_balance_dirty(root);
3903 return ret; 3776 return ret;
3904} 3777}
@@ -3995,7 +3868,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3995 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) 3868 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3996 return -EPERM; 3869 return -EPERM;
3997 3870
3998 trans = __unlink_start_trans(dir, dentry); 3871 trans = __unlink_start_trans(dir);
3999 if (IS_ERR(trans)) 3872 if (IS_ERR(trans))
4000 return PTR_ERR(trans); 3873 return PTR_ERR(trans);
4001 3874
@@ -4017,7 +3890,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4017 if (!err) 3890 if (!err)
4018 btrfs_i_size_write(inode, 0); 3891 btrfs_i_size_write(inode, 0);
4019out: 3892out:
4020 __unlink_end_trans(trans, root); 3893 btrfs_end_transaction(trans, root);
4021 btrfs_btree_balance_dirty(root); 3894 btrfs_btree_balance_dirty(root);
4022 3895
4023 return err; 3896 return err;
@@ -4395,6 +4268,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4395 u64 hole_size; 4268 u64 hole_size;
4396 int err = 0; 4269 int err = 0;
4397 4270
4271 /*
4272 * If our size started in the middle of a page we need to zero out the
4273 * rest of the page before we expand the i_size, otherwise we could
4274 * expose stale data.
4275 */
4276 err = btrfs_truncate_page(inode, oldsize, 0, 0);
4277 if (err)
4278 return err;
4279
4398 if (size <= hole_start) 4280 if (size <= hole_start)
4399 return 0; 4281 return 0;
4400 4282
@@ -4822,11 +4704,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
4822 goto out; 4704 goto out;
4823 } 4705 }
4824 4706
4825 if (btrfs_root_refs(&new_root->root_item) == 0) {
4826 err = -ENOENT;
4827 goto out;
4828 }
4829
4830 *sub_root = new_root; 4707 *sub_root = new_root;
4831 location->objectid = btrfs_root_dirid(&new_root->root_item); 4708 location->objectid = btrfs_root_dirid(&new_root->root_item);
4832 location->type = BTRFS_INODE_ITEM_KEY; 4709 location->type = BTRFS_INODE_ITEM_KEY;
@@ -5092,8 +4969,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5092 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4969 if (!(inode->i_sb->s_flags & MS_RDONLY))
5093 ret = btrfs_orphan_cleanup(sub_root); 4970 ret = btrfs_orphan_cleanup(sub_root);
5094 up_read(&root->fs_info->cleanup_work_sem); 4971 up_read(&root->fs_info->cleanup_work_sem);
5095 if (ret) 4972 if (ret) {
4973 iput(inode);
5096 inode = ERR_PTR(ret); 4974 inode = ERR_PTR(ret);
4975 }
5097 } 4976 }
5098 4977
5099 return inode; 4978 return inode;
@@ -6501,10 +6380,10 @@ out:
6501 * returns 1 when the nocow is safe, < 1 on error, 0 if the 6380 * returns 1 when the nocow is safe, < 1 on error, 0 if the
6502 * block must be cow'd 6381 * block must be cow'd
6503 */ 6382 */
6504static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, 6383noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
6505 struct inode *inode, u64 offset, u64 *len, 6384 struct inode *inode, u64 offset, u64 *len,
6506 u64 *orig_start, u64 *orig_block_len, 6385 u64 *orig_start, u64 *orig_block_len,
6507 u64 *ram_bytes) 6386 u64 *ram_bytes)
6508{ 6387{
6509 struct btrfs_path *path; 6388 struct btrfs_path *path;
6510 int ret; 6389 int ret;
@@ -6518,7 +6397,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
6518 u64 num_bytes; 6397 u64 num_bytes;
6519 int slot; 6398 int slot;
6520 int found_type; 6399 int found_type;
6521 6400 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
6522 path = btrfs_alloc_path(); 6401 path = btrfs_alloc_path();
6523 if (!path) 6402 if (!path)
6524 return -ENOMEM; 6403 return -ENOMEM;
@@ -6558,18 +6437,28 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
6558 /* not a regular extent, must cow */ 6437 /* not a regular extent, must cow */
6559 goto out; 6438 goto out;
6560 } 6439 }
6440
6441 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
6442 goto out;
6443
6561 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 6444 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6445 if (disk_bytenr == 0)
6446 goto out;
6447
6448 if (btrfs_file_extent_compression(leaf, fi) ||
6449 btrfs_file_extent_encryption(leaf, fi) ||
6450 btrfs_file_extent_other_encoding(leaf, fi))
6451 goto out;
6452
6562 backref_offset = btrfs_file_extent_offset(leaf, fi); 6453 backref_offset = btrfs_file_extent_offset(leaf, fi);
6563 6454
6564 *orig_start = key.offset - backref_offset; 6455 if (orig_start) {
6565 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); 6456 *orig_start = key.offset - backref_offset;
6566 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 6457 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
6458 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6459 }
6567 6460
6568 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 6461 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6569 if (extent_end < offset + *len) {
6570 /* extent doesn't include our full range, must cow */
6571 goto out;
6572 }
6573 6462
6574 if (btrfs_extent_readonly(root, disk_bytenr)) 6463 if (btrfs_extent_readonly(root, disk_bytenr))
6575 goto out; 6464 goto out;
@@ -6813,8 +6702,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6813 if (IS_ERR(trans)) 6702 if (IS_ERR(trans))
6814 goto must_cow; 6703 goto must_cow;
6815 6704
6816 if (can_nocow_odirect(trans, inode, start, &len, &orig_start, 6705 if (can_nocow_extent(trans, inode, start, &len, &orig_start,
6817 &orig_block_len, &ram_bytes) == 1) { 6706 &orig_block_len, &ram_bytes) == 1) {
6818 if (type == BTRFS_ORDERED_PREALLOC) { 6707 if (type == BTRFS_ORDERED_PREALLOC) {
6819 free_extent_map(em); 6708 free_extent_map(em);
6820 em = create_pinned_em(inode, start, len, 6709 em = create_pinned_em(inode, start, len,
@@ -7243,7 +7132,6 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7243{ 7132{
7244 struct btrfs_root *root = BTRFS_I(inode)->root; 7133 struct btrfs_root *root = BTRFS_I(inode)->root;
7245 struct btrfs_dio_private *dip; 7134 struct btrfs_dio_private *dip;
7246 struct bio_vec *bvec = dio_bio->bi_io_vec;
7247 struct bio *io_bio; 7135 struct bio *io_bio;
7248 int skip_sum; 7136 int skip_sum;
7249 int write = rw & REQ_WRITE; 7137 int write = rw & REQ_WRITE;
@@ -7265,16 +7153,9 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7265 } 7153 }
7266 7154
7267 dip->private = dio_bio->bi_private; 7155 dip->private = dio_bio->bi_private;
7268 io_bio->bi_private = dio_bio->bi_private;
7269 dip->inode = inode; 7156 dip->inode = inode;
7270 dip->logical_offset = file_offset; 7157 dip->logical_offset = file_offset;
7271 7158 dip->bytes = dio_bio->bi_size;
7272 dip->bytes = 0;
7273 do {
7274 dip->bytes += bvec->bv_len;
7275 bvec++;
7276 } while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1));
7277
7278 dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; 7159 dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
7279 io_bio->bi_private = dip; 7160 io_bio->bi_private = dip;
7280 dip->errors = 0; 7161 dip->errors = 0;
@@ -7373,8 +7254,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7373 atomic_inc(&inode->i_dio_count); 7254 atomic_inc(&inode->i_dio_count);
7374 smp_mb__after_atomic_inc(); 7255 smp_mb__after_atomic_inc();
7375 7256
7257 /*
7258 * The generic stuff only does filemap_write_and_wait_range, which isn't
7259 * enough if we've written compressed pages to this area, so we need to
7260 * call btrfs_wait_ordered_range to make absolutely sure that any
7261 * outstanding dirty pages are on disk.
7262 */
7263 count = iov_length(iov, nr_segs);
7264 btrfs_wait_ordered_range(inode, offset, count);
7265
7376 if (rw & WRITE) { 7266 if (rw & WRITE) {
7377 count = iov_length(iov, nr_segs);
7378 /* 7267 /*
7379 * If the write DIO is beyond the EOF, we need update 7268 * If the write DIO is beyond the EOF, we need update
7380 * the isize, but it is protected by i_mutex. So we can 7269 * the isize, but it is protected by i_mutex. So we can
@@ -7694,16 +7583,12 @@ static int btrfs_truncate(struct inode *inode)
7694{ 7583{
7695 struct btrfs_root *root = BTRFS_I(inode)->root; 7584 struct btrfs_root *root = BTRFS_I(inode)->root;
7696 struct btrfs_block_rsv *rsv; 7585 struct btrfs_block_rsv *rsv;
7697 int ret; 7586 int ret = 0;
7698 int err = 0; 7587 int err = 0;
7699 struct btrfs_trans_handle *trans; 7588 struct btrfs_trans_handle *trans;
7700 u64 mask = root->sectorsize - 1; 7589 u64 mask = root->sectorsize - 1;
7701 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 7590 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
7702 7591
7703 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
7704 if (ret)
7705 return ret;
7706
7707 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 7592 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
7708 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 7593 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
7709 7594
@@ -7961,9 +7846,9 @@ void btrfs_destroy_inode(struct inode *inode)
7961 */ 7846 */
7962 smp_mb(); 7847 smp_mb();
7963 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { 7848 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
7964 spin_lock(&root->fs_info->ordered_extent_lock); 7849 spin_lock(&root->fs_info->ordered_root_lock);
7965 list_del_init(&BTRFS_I(inode)->ordered_operations); 7850 list_del_init(&BTRFS_I(inode)->ordered_operations);
7966 spin_unlock(&root->fs_info->ordered_extent_lock); 7851 spin_unlock(&root->fs_info->ordered_root_lock);
7967 } 7852 }
7968 7853
7969 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 7854 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
@@ -8333,7 +8218,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8333 * some fairly slow code that needs optimization. This walks the list 8218 * some fairly slow code that needs optimization. This walks the list
8334 * of all the inodes with pending delalloc and forces them to disk. 8219 * of all the inodes with pending delalloc and forces them to disk.
8335 */ 8220 */
8336int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 8221static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8337{ 8222{
8338 struct btrfs_inode *binode; 8223 struct btrfs_inode *binode;
8339 struct inode *inode; 8224 struct inode *inode;
@@ -8342,30 +8227,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8342 struct list_head splice; 8227 struct list_head splice;
8343 int ret = 0; 8228 int ret = 0;
8344 8229
8345 if (root->fs_info->sb->s_flags & MS_RDONLY)
8346 return -EROFS;
8347
8348 INIT_LIST_HEAD(&works); 8230 INIT_LIST_HEAD(&works);
8349 INIT_LIST_HEAD(&splice); 8231 INIT_LIST_HEAD(&splice);
8350 8232
8351 spin_lock(&root->fs_info->delalloc_lock); 8233 spin_lock(&root->delalloc_lock);
8352 list_splice_init(&root->fs_info->delalloc_inodes, &splice); 8234 list_splice_init(&root->delalloc_inodes, &splice);
8353 while (!list_empty(&splice)) { 8235 while (!list_empty(&splice)) {
8354 binode = list_entry(splice.next, struct btrfs_inode, 8236 binode = list_entry(splice.next, struct btrfs_inode,
8355 delalloc_inodes); 8237 delalloc_inodes);
8356 8238
8357 list_del_init(&binode->delalloc_inodes); 8239 list_move_tail(&binode->delalloc_inodes,
8358 8240 &root->delalloc_inodes);
8359 inode = igrab(&binode->vfs_inode); 8241 inode = igrab(&binode->vfs_inode);
8360 if (!inode) { 8242 if (!inode) {
8361 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 8243 cond_resched_lock(&root->delalloc_lock);
8362 &binode->runtime_flags);
8363 continue; 8244 continue;
8364 } 8245 }
8365 8246 spin_unlock(&root->delalloc_lock);
8366 list_add_tail(&binode->delalloc_inodes,
8367 &root->fs_info->delalloc_inodes);
8368 spin_unlock(&root->fs_info->delalloc_lock);
8369 8247
8370 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); 8248 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
8371 if (unlikely(!work)) { 8249 if (unlikely(!work)) {
@@ -8377,16 +8255,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8377 &work->work); 8255 &work->work);
8378 8256
8379 cond_resched(); 8257 cond_resched();
8380 spin_lock(&root->fs_info->delalloc_lock); 8258 spin_lock(&root->delalloc_lock);
8381 } 8259 }
8382 spin_unlock(&root->fs_info->delalloc_lock); 8260 spin_unlock(&root->delalloc_lock);
8383 8261
8384 list_for_each_entry_safe(work, next, &works, list) { 8262 list_for_each_entry_safe(work, next, &works, list) {
8385 list_del_init(&work->list); 8263 list_del_init(&work->list);
8386 btrfs_wait_and_free_delalloc_work(work); 8264 btrfs_wait_and_free_delalloc_work(work);
8387 } 8265 }
8266 return 0;
8267out:
8268 list_for_each_entry_safe(work, next, &works, list) {
8269 list_del_init(&work->list);
8270 btrfs_wait_and_free_delalloc_work(work);
8271 }
8272
8273 if (!list_empty_careful(&splice)) {
8274 spin_lock(&root->delalloc_lock);
8275 list_splice_tail(&splice, &root->delalloc_inodes);
8276 spin_unlock(&root->delalloc_lock);
8277 }
8278 return ret;
8279}
8280
8281int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8282{
8283 int ret;
8388 8284
8389 /* the filemap_flush will queue IO into the worker threads, but 8285 if (root->fs_info->sb->s_flags & MS_RDONLY)
8286 return -EROFS;
8287
8288 ret = __start_delalloc_inodes(root, delay_iput);
8289 /*
8290 * the filemap_flush will queue IO into the worker threads, but
8390 * we have to make sure the IO is actually started and that 8291 * we have to make sure the IO is actually started and that
8391 * ordered extents get created before we return 8292 * ordered extents get created before we return
8392 */ 8293 */
@@ -8398,17 +8299,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8398 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 8299 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
8399 } 8300 }
8400 atomic_dec(&root->fs_info->async_submit_draining); 8301 atomic_dec(&root->fs_info->async_submit_draining);
8401 return 0; 8302 return ret;
8402out: 8303}
8403 list_for_each_entry_safe(work, next, &works, list) { 8304
8404 list_del_init(&work->list); 8305int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
8405 btrfs_wait_and_free_delalloc_work(work); 8306 int delay_iput)
8307{
8308 struct btrfs_root *root;
8309 struct list_head splice;
8310 int ret;
8311
8312 if (fs_info->sb->s_flags & MS_RDONLY)
8313 return -EROFS;
8314
8315 INIT_LIST_HEAD(&splice);
8316
8317 spin_lock(&fs_info->delalloc_root_lock);
8318 list_splice_init(&fs_info->delalloc_roots, &splice);
8319 while (!list_empty(&splice)) {
8320 root = list_first_entry(&splice, struct btrfs_root,
8321 delalloc_root);
8322 root = btrfs_grab_fs_root(root);
8323 BUG_ON(!root);
8324 list_move_tail(&root->delalloc_root,
8325 &fs_info->delalloc_roots);
8326 spin_unlock(&fs_info->delalloc_root_lock);
8327
8328 ret = __start_delalloc_inodes(root, delay_iput);
8329 btrfs_put_fs_root(root);
8330 if (ret)
8331 goto out;
8332
8333 spin_lock(&fs_info->delalloc_root_lock);
8406 } 8334 }
8335 spin_unlock(&fs_info->delalloc_root_lock);
8407 8336
8337 atomic_inc(&fs_info->async_submit_draining);
8338 while (atomic_read(&fs_info->nr_async_submits) ||
8339 atomic_read(&fs_info->async_delalloc_pages)) {
8340 wait_event(fs_info->async_submit_wait,
8341 (atomic_read(&fs_info->nr_async_submits) == 0 &&
8342 atomic_read(&fs_info->async_delalloc_pages) == 0));
8343 }
8344 atomic_dec(&fs_info->async_submit_draining);
8345 return 0;
8346out:
8408 if (!list_empty_careful(&splice)) { 8347 if (!list_empty_careful(&splice)) {
8409 spin_lock(&root->fs_info->delalloc_lock); 8348 spin_lock(&fs_info->delalloc_root_lock);
8410 list_splice_tail(&splice, &root->fs_info->delalloc_inodes); 8349 list_splice_tail(&splice, &fs_info->delalloc_roots);
8411 spin_unlock(&root->fs_info->delalloc_lock); 8350 spin_unlock(&fs_info->delalloc_root_lock);
8412 } 8351 }
8413 return ret; 8352 return ret;
8414} 8353}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cd7e96c73cb7..238a05545ee2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
555 if (!root->ref_cows) 555 if (!root->ref_cows)
556 return -EINVAL; 556 return -EINVAL;
557 557
558 ret = btrfs_start_delalloc_inodes(root, 0);
559 if (ret)
560 return ret;
561
562 btrfs_wait_ordered_extents(root, 0);
563
558 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 564 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
559 if (!pending_snapshot) 565 if (!pending_snapshot)
560 return -ENOMEM; 566 return -ENOMEM;
@@ -2354,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2354 if (ret) 2360 if (ret)
2355 return ret; 2361 return ret;
2356 2362
2357 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2358 1)) {
2359 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2360 mnt_drop_write_file(file);
2361 return -EINVAL;
2362 }
2363
2364 mutex_lock(&root->fs_info->volume_mutex);
2365 vol_args = memdup_user(arg, sizeof(*vol_args)); 2363 vol_args = memdup_user(arg, sizeof(*vol_args));
2366 if (IS_ERR(vol_args)) { 2364 if (IS_ERR(vol_args)) {
2367 ret = PTR_ERR(vol_args); 2365 ret = PTR_ERR(vol_args);
@@ -2369,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2369 } 2367 }
2370 2368
2371 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2369 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2372 ret = btrfs_rm_device(root, vol_args->name);
2373 2370
2374 kfree(vol_args); 2371 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2375out: 2372 1)) {
2373 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
2374 goto out;
2375 }
2376
2377 mutex_lock(&root->fs_info->volume_mutex);
2378 ret = btrfs_rm_device(root, vol_args->name);
2376 mutex_unlock(&root->fs_info->volume_mutex); 2379 mutex_unlock(&root->fs_info->volume_mutex);
2377 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2380 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2381
2382out:
2383 kfree(vol_args);
2378 mnt_drop_write_file(file); 2384 mnt_drop_write_file(file);
2379 return ret; 2385 return ret;
2380} 2386}
@@ -2480,6 +2486,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2480 int ret; 2486 int ret;
2481 u64 len = olen; 2487 u64 len = olen;
2482 u64 bs = root->fs_info->sb->s_blocksize; 2488 u64 bs = root->fs_info->sb->s_blocksize;
2489 int same_inode = 0;
2483 2490
2484 /* 2491 /*
2485 * TODO: 2492 * TODO:
@@ -2516,7 +2523,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2516 2523
2517 ret = -EINVAL; 2524 ret = -EINVAL;
2518 if (src == inode) 2525 if (src == inode)
2519 goto out_fput; 2526 same_inode = 1;
2520 2527
2521 /* the src must be open for reading */ 2528 /* the src must be open for reading */
2522 if (!(src_file.file->f_mode & FMODE_READ)) 2529 if (!(src_file.file->f_mode & FMODE_READ))
@@ -2547,12 +2554,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2547 } 2554 }
2548 path->reada = 2; 2555 path->reada = 2;
2549 2556
2550 if (inode < src) { 2557 if (!same_inode) {
2551 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 2558 if (inode < src) {
2552 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); 2559 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
2560 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
2561 } else {
2562 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
2563 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2564 }
2553 } else { 2565 } else {
2554 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); 2566 mutex_lock(&src->i_mutex);
2555 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2556 } 2567 }
2557 2568
2558 /* determine range to clone */ 2569 /* determine range to clone */
@@ -2570,6 +2581,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2570 !IS_ALIGNED(destoff, bs)) 2581 !IS_ALIGNED(destoff, bs))
2571 goto out_unlock; 2582 goto out_unlock;
2572 2583
2584 /* verify if ranges are overlapped within the same file */
2585 if (same_inode) {
2586 if (destoff + len > off && destoff < off + len)
2587 goto out_unlock;
2588 }
2589
2573 if (destoff > inode->i_size) { 2590 if (destoff > inode->i_size) {
2574 ret = btrfs_cont_expand(inode, inode->i_size, destoff); 2591 ret = btrfs_cont_expand(inode, inode->i_size, destoff);
2575 if (ret) 2592 if (ret)
@@ -2846,7 +2863,8 @@ out:
2846 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); 2863 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2847out_unlock: 2864out_unlock:
2848 mutex_unlock(&src->i_mutex); 2865 mutex_unlock(&src->i_mutex);
2849 mutex_unlock(&inode->i_mutex); 2866 if (!same_inode)
2867 mutex_unlock(&inode->i_mutex);
2850 vfree(buf); 2868 vfree(buf);
2851 btrfs_free_path(path); 2869 btrfs_free_path(path);
2852out_fput: 2870out_fput:
@@ -2951,11 +2969,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2951 goto out; 2969 goto out;
2952 } 2970 }
2953 2971
2954 if (btrfs_root_refs(&new_root->root_item) == 0) {
2955 ret = -ENOENT;
2956 goto out;
2957 }
2958
2959 path = btrfs_alloc_path(); 2972 path = btrfs_alloc_path();
2960 if (!path) { 2973 if (!path) {
2961 ret = -ENOMEM; 2974 ret = -ENOMEM;
@@ -3719,9 +3732,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3719 break; 3732 break;
3720 } 3733 }
3721 3734
3722 if (copy_to_user(arg, sa, sizeof(*sa)))
3723 ret = -EFAULT;
3724
3725 err = btrfs_commit_transaction(trans, root->fs_info->tree_root); 3735 err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
3726 if (err && !ret) 3736 if (err && !ret)
3727 ret = err; 3737 ret = err;
@@ -3937,6 +3947,16 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
3937 return ret; 3947 return ret;
3938} 3948}
3939 3949
3950static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
3951{
3952 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3953
3954 if (!capable(CAP_SYS_ADMIN))
3955 return -EPERM;
3956
3957 return btrfs_qgroup_wait_for_completion(root->fs_info);
3958}
3959
3940static long btrfs_ioctl_set_received_subvol(struct file *file, 3960static long btrfs_ioctl_set_received_subvol(struct file *file,
3941 void __user *arg) 3961 void __user *arg)
3942{ 3962{
@@ -4179,6 +4199,8 @@ long btrfs_ioctl(struct file *file, unsigned int
4179 return btrfs_ioctl_quota_rescan(file, argp); 4199 return btrfs_ioctl_quota_rescan(file, argp);
4180 case BTRFS_IOC_QUOTA_RESCAN_STATUS: 4200 case BTRFS_IOC_QUOTA_RESCAN_STATUS:
4181 return btrfs_ioctl_quota_rescan_status(file, argp); 4201 return btrfs_ioctl_quota_rescan_status(file, argp);
4202 case BTRFS_IOC_QUOTA_RESCAN_WAIT:
4203 return btrfs_ioctl_quota_rescan_wait(file, argp);
4182 case BTRFS_IOC_DEV_REPLACE: 4204 case BTRFS_IOC_DEV_REPLACE:
4183 return btrfs_ioctl_dev_replace(root, argp); 4205 return btrfs_ioctl_dev_replace(root, argp);
4184 case BTRFS_IOC_GET_FSLABEL: 4206 case BTRFS_IOC_GET_FSLABEL:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 743b86fa4fcb..f93151a98886 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -31,8 +31,8 @@
31 31
32struct workspace { 32struct workspace {
33 void *mem; 33 void *mem;
34 void *buf; /* where compressed data goes */ 34 void *buf; /* where decompressed data goes */
35 void *cbuf; /* where decompressed data goes */ 35 void *cbuf; /* where compressed data goes */
36 struct list_head list; 36 struct list_head list;
37}; 37};
38 38
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1ddd728541ee..81369827e514 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -24,6 +24,7 @@
24#include "transaction.h" 24#include "transaction.h"
25#include "btrfs_inode.h" 25#include "btrfs_inode.h"
26#include "extent_io.h" 26#include "extent_io.h"
27#include "disk-io.h"
27 28
28static struct kmem_cache *btrfs_ordered_extent_cache; 29static struct kmem_cache *btrfs_ordered_extent_cache;
29 30
@@ -184,6 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
184 u64 start, u64 len, u64 disk_len, 185 u64 start, u64 len, u64 disk_len,
185 int type, int dio, int compress_type) 186 int type, int dio, int compress_type)
186{ 187{
188 struct btrfs_root *root = BTRFS_I(inode)->root;
187 struct btrfs_ordered_inode_tree *tree; 189 struct btrfs_ordered_inode_tree *tree;
188 struct rb_node *node; 190 struct rb_node *node;
189 struct btrfs_ordered_extent *entry; 191 struct btrfs_ordered_extent *entry;
@@ -227,10 +229,18 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
227 ordered_data_tree_panic(inode, -EEXIST, file_offset); 229 ordered_data_tree_panic(inode, -EEXIST, file_offset);
228 spin_unlock_irq(&tree->lock); 230 spin_unlock_irq(&tree->lock);
229 231
230 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 232 spin_lock(&root->ordered_extent_lock);
231 list_add_tail(&entry->root_extent_list, 233 list_add_tail(&entry->root_extent_list,
232 &BTRFS_I(inode)->root->fs_info->ordered_extents); 234 &root->ordered_extents);
233 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 235 root->nr_ordered_extents++;
236 if (root->nr_ordered_extents == 1) {
237 spin_lock(&root->fs_info->ordered_root_lock);
238 BUG_ON(!list_empty(&root->ordered_root));
239 list_add_tail(&root->ordered_root,
240 &root->fs_info->ordered_roots);
241 spin_unlock(&root->fs_info->ordered_root_lock);
242 }
243 spin_unlock(&root->ordered_extent_lock);
234 244
235 return 0; 245 return 0;
236} 246}
@@ -516,8 +526,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
516 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 526 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
517 spin_unlock_irq(&tree->lock); 527 spin_unlock_irq(&tree->lock);
518 528
519 spin_lock(&root->fs_info->ordered_extent_lock); 529 spin_lock(&root->ordered_extent_lock);
520 list_del_init(&entry->root_extent_list); 530 list_del_init(&entry->root_extent_list);
531 root->nr_ordered_extents--;
521 532
522 trace_btrfs_ordered_extent_remove(inode, entry); 533 trace_btrfs_ordered_extent_remove(inode, entry);
523 534
@@ -530,7 +541,14 @@ void btrfs_remove_ordered_extent(struct inode *inode,
530 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 541 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
531 list_del_init(&BTRFS_I(inode)->ordered_operations); 542 list_del_init(&BTRFS_I(inode)->ordered_operations);
532 } 543 }
533 spin_unlock(&root->fs_info->ordered_extent_lock); 544
545 if (!root->nr_ordered_extents) {
546 spin_lock(&root->fs_info->ordered_root_lock);
547 BUG_ON(list_empty(&root->ordered_root));
548 list_del_init(&root->ordered_root);
549 spin_unlock(&root->fs_info->ordered_root_lock);
550 }
551 spin_unlock(&root->ordered_extent_lock);
534 wake_up(&entry->wait); 552 wake_up(&entry->wait);
535} 553}
536 554
@@ -550,7 +568,6 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
550void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 568void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
551{ 569{
552 struct list_head splice, works; 570 struct list_head splice, works;
553 struct list_head *cur;
554 struct btrfs_ordered_extent *ordered, *next; 571 struct btrfs_ordered_extent *ordered, *next;
555 struct inode *inode; 572 struct inode *inode;
556 573
@@ -558,35 +575,34 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
558 INIT_LIST_HEAD(&works); 575 INIT_LIST_HEAD(&works);
559 576
560 mutex_lock(&root->fs_info->ordered_operations_mutex); 577 mutex_lock(&root->fs_info->ordered_operations_mutex);
561 spin_lock(&root->fs_info->ordered_extent_lock); 578 spin_lock(&root->ordered_extent_lock);
562 list_splice_init(&root->fs_info->ordered_extents, &splice); 579 list_splice_init(&root->ordered_extents, &splice);
563 while (!list_empty(&splice)) { 580 while (!list_empty(&splice)) {
564 cur = splice.next; 581 ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
565 ordered = list_entry(cur, struct btrfs_ordered_extent, 582 root_extent_list);
566 root_extent_list); 583 list_move_tail(&ordered->root_extent_list,
567 list_del_init(&ordered->root_extent_list); 584 &root->ordered_extents);
568 atomic_inc(&ordered->refs);
569
570 /* 585 /*
571 * the inode may be getting freed (in sys_unlink path). 586 * the inode may be getting freed (in sys_unlink path).
572 */ 587 */
573 inode = igrab(ordered->inode); 588 inode = igrab(ordered->inode);
589 if (!inode) {
590 cond_resched_lock(&root->ordered_extent_lock);
591 continue;
592 }
574 593
575 spin_unlock(&root->fs_info->ordered_extent_lock); 594 atomic_inc(&ordered->refs);
595 spin_unlock(&root->ordered_extent_lock);
576 596
577 if (inode) { 597 ordered->flush_work.func = btrfs_run_ordered_extent_work;
578 ordered->flush_work.func = btrfs_run_ordered_extent_work; 598 list_add_tail(&ordered->work_list, &works);
579 list_add_tail(&ordered->work_list, &works); 599 btrfs_queue_worker(&root->fs_info->flush_workers,
580 btrfs_queue_worker(&root->fs_info->flush_workers, 600 &ordered->flush_work);
581 &ordered->flush_work);
582 } else {
583 btrfs_put_ordered_extent(ordered);
584 }
585 601
586 cond_resched(); 602 cond_resched();
587 spin_lock(&root->fs_info->ordered_extent_lock); 603 spin_lock(&root->ordered_extent_lock);
588 } 604 }
589 spin_unlock(&root->fs_info->ordered_extent_lock); 605 spin_unlock(&root->ordered_extent_lock);
590 606
591 list_for_each_entry_safe(ordered, next, &works, work_list) { 607 list_for_each_entry_safe(ordered, next, &works, work_list) {
592 list_del_init(&ordered->work_list); 608 list_del_init(&ordered->work_list);
@@ -604,6 +620,33 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
604 mutex_unlock(&root->fs_info->ordered_operations_mutex); 620 mutex_unlock(&root->fs_info->ordered_operations_mutex);
605} 621}
606 622
623void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
624 int delay_iput)
625{
626 struct btrfs_root *root;
627 struct list_head splice;
628
629 INIT_LIST_HEAD(&splice);
630
631 spin_lock(&fs_info->ordered_root_lock);
632 list_splice_init(&fs_info->ordered_roots, &splice);
633 while (!list_empty(&splice)) {
634 root = list_first_entry(&splice, struct btrfs_root,
635 ordered_root);
636 root = btrfs_grab_fs_root(root);
637 BUG_ON(!root);
638 list_move_tail(&root->ordered_root,
639 &fs_info->ordered_roots);
640 spin_unlock(&fs_info->ordered_root_lock);
641
642 btrfs_wait_ordered_extents(root, delay_iput);
643 btrfs_put_fs_root(root);
644
645 spin_lock(&fs_info->ordered_root_lock);
646 }
647 spin_unlock(&fs_info->ordered_root_lock);
648}
649
607/* 650/*
608 * this is used during transaction commit to write all the inodes 651 * this is used during transaction commit to write all the inodes
609 * added to the ordered operation list. These files must be fully on 652 * added to the ordered operation list. These files must be fully on
@@ -629,7 +672,7 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
629 INIT_LIST_HEAD(&works); 672 INIT_LIST_HEAD(&works);
630 673
631 mutex_lock(&root->fs_info->ordered_operations_mutex); 674 mutex_lock(&root->fs_info->ordered_operations_mutex);
632 spin_lock(&root->fs_info->ordered_extent_lock); 675 spin_lock(&root->fs_info->ordered_root_lock);
633 list_splice_init(&cur_trans->ordered_operations, &splice); 676 list_splice_init(&cur_trans->ordered_operations, &splice);
634 while (!list_empty(&splice)) { 677 while (!list_empty(&splice)) {
635 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 678 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
@@ -648,17 +691,17 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
648 if (!wait) 691 if (!wait)
649 list_add_tail(&BTRFS_I(inode)->ordered_operations, 692 list_add_tail(&BTRFS_I(inode)->ordered_operations,
650 &cur_trans->ordered_operations); 693 &cur_trans->ordered_operations);
651 spin_unlock(&root->fs_info->ordered_extent_lock); 694 spin_unlock(&root->fs_info->ordered_root_lock);
652 695
653 work = btrfs_alloc_delalloc_work(inode, wait, 1); 696 work = btrfs_alloc_delalloc_work(inode, wait, 1);
654 if (!work) { 697 if (!work) {
655 spin_lock(&root->fs_info->ordered_extent_lock); 698 spin_lock(&root->fs_info->ordered_root_lock);
656 if (list_empty(&BTRFS_I(inode)->ordered_operations)) 699 if (list_empty(&BTRFS_I(inode)->ordered_operations))
657 list_add_tail(&btrfs_inode->ordered_operations, 700 list_add_tail(&btrfs_inode->ordered_operations,
658 &splice); 701 &splice);
659 list_splice_tail(&splice, 702 list_splice_tail(&splice,
660 &cur_trans->ordered_operations); 703 &cur_trans->ordered_operations);
661 spin_unlock(&root->fs_info->ordered_extent_lock); 704 spin_unlock(&root->fs_info->ordered_root_lock);
662 ret = -ENOMEM; 705 ret = -ENOMEM;
663 goto out; 706 goto out;
664 } 707 }
@@ -667,9 +710,9 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
667 &work->work); 710 &work->work);
668 711
669 cond_resched(); 712 cond_resched();
670 spin_lock(&root->fs_info->ordered_extent_lock); 713 spin_lock(&root->fs_info->ordered_root_lock);
671 } 714 }
672 spin_unlock(&root->fs_info->ordered_extent_lock); 715 spin_unlock(&root->fs_info->ordered_root_lock);
673out: 716out:
674 list_for_each_entry_safe(work, next, &works, list) { 717 list_for_each_entry_safe(work, next, &works, list) {
675 list_del_init(&work->list); 718 list_del_init(&work->list);
@@ -989,7 +1032,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
989 u32 *sum, int len) 1032 u32 *sum, int len)
990{ 1033{
991 struct btrfs_ordered_sum *ordered_sum; 1034 struct btrfs_ordered_sum *ordered_sum;
992 struct btrfs_sector_sum *sector_sums;
993 struct btrfs_ordered_extent *ordered; 1035 struct btrfs_ordered_extent *ordered;
994 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 1036 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
995 unsigned long num_sectors; 1037 unsigned long num_sectors;
@@ -1007,18 +1049,16 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
1007 disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { 1049 disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
1008 i = (disk_bytenr - ordered_sum->bytenr) >> 1050 i = (disk_bytenr - ordered_sum->bytenr) >>
1009 inode->i_sb->s_blocksize_bits; 1051 inode->i_sb->s_blocksize_bits;
1010 sector_sums = ordered_sum->sums + i;
1011 num_sectors = ordered_sum->len >> 1052 num_sectors = ordered_sum->len >>
1012 inode->i_sb->s_blocksize_bits; 1053 inode->i_sb->s_blocksize_bits;
1013 for (; i < num_sectors; i++) { 1054 num_sectors = min_t(int, len - index, num_sectors - i);
1014 if (sector_sums[i].bytenr == disk_bytenr) { 1055 memcpy(sum + index, ordered_sum->sums + i,
1015 sum[index] = sector_sums[i].sum; 1056 num_sectors);
1016 index++; 1057
1017 if (index == len) 1058 index += (int)num_sectors;
1018 goto out; 1059 if (index == len)
1019 disk_bytenr += sectorsize; 1060 goto out;
1020 } 1061 disk_bytenr += num_sectors * sectorsize;
1021 }
1022 } 1062 }
1023 } 1063 }
1024out: 1064out:
@@ -1055,12 +1095,12 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
1055 if (last_mod < root->fs_info->last_trans_committed) 1095 if (last_mod < root->fs_info->last_trans_committed)
1056 return; 1096 return;
1057 1097
1058 spin_lock(&root->fs_info->ordered_extent_lock); 1098 spin_lock(&root->fs_info->ordered_root_lock);
1059 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 1099 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
1060 list_add_tail(&BTRFS_I(inode)->ordered_operations, 1100 list_add_tail(&BTRFS_I(inode)->ordered_operations,
1061 &cur_trans->ordered_operations); 1101 &cur_trans->ordered_operations);
1062 } 1102 }
1063 spin_unlock(&root->fs_info->ordered_extent_lock); 1103 spin_unlock(&root->fs_info->ordered_root_lock);
1064} 1104}
1065 1105
1066int __init ordered_data_init(void) 1106int __init ordered_data_init(void)
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 58b0e3b0ebad..68844d59ee6f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree {
26 struct rb_node *last; 26 struct rb_node *last;
27}; 27};
28 28
29/*
30 * these are used to collect checksums done just before bios submission.
31 * They are attached via a list into the ordered extent, and
32 * checksum items are inserted into the tree after all the blocks in
33 * the ordered extent are on disk
34 */
35struct btrfs_sector_sum {
36 /* bytenr on disk */
37 u64 bytenr;
38 u32 sum;
39};
40
41struct btrfs_ordered_sum { 29struct btrfs_ordered_sum {
42 /* bytenr is the start of this extent on disk */ 30 /* bytenr is the start of this extent on disk */
43 u64 bytenr; 31 u64 bytenr;
@@ -45,10 +33,10 @@ struct btrfs_ordered_sum {
45 /* 33 /*
46 * this is the length in bytes covered by the sums array below. 34 * this is the length in bytes covered by the sums array below.
47 */ 35 */
48 unsigned long len; 36 int len;
49 struct list_head list; 37 struct list_head list;
50 /* last field is a variable length array of btrfs_sector_sums */ 38 /* last field is a variable length array of csums */
51 struct btrfs_sector_sum sums[]; 39 u32 sums[];
52}; 40};
53 41
54/* 42/*
@@ -149,11 +137,8 @@ struct btrfs_ordered_extent {
149static inline int btrfs_ordered_sum_size(struct btrfs_root *root, 137static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
150 unsigned long bytes) 138 unsigned long bytes)
151{ 139{
152 unsigned long num_sectors = (bytes + root->sectorsize - 1) / 140 int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
153 root->sectorsize; 141 return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
154 num_sectors++;
155 return sizeof(struct btrfs_ordered_sum) +
156 num_sectors * sizeof(struct btrfs_sector_sum);
157} 142}
158 143
159static inline void 144static inline void
@@ -204,6 +189,8 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
204 struct btrfs_root *root, 189 struct btrfs_root *root,
205 struct inode *inode); 190 struct inode *inode);
206void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); 191void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
192void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
193 int delay_iput);
207void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); 194void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
208void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 195void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
209void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 196void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9d49c586995a..1280eff8af56 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -98,13 +98,10 @@ struct btrfs_qgroup_list {
98 struct btrfs_qgroup *member; 98 struct btrfs_qgroup *member;
99}; 99};
100 100
101struct qgroup_rescan { 101static int
102 struct btrfs_work work; 102qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
103 struct btrfs_fs_info *fs_info; 103 int init_flags);
104}; 104static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
105
106static void qgroup_rescan_start(struct btrfs_fs_info *fs_info,
107 struct qgroup_rescan *qscan);
108 105
109/* must be called with qgroup_ioctl_lock held */ 106/* must be called with qgroup_ioctl_lock held */
110static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, 107static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
@@ -255,10 +252,17 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
255 int slot; 252 int slot;
256 int ret = 0; 253 int ret = 0;
257 u64 flags = 0; 254 u64 flags = 0;
255 u64 rescan_progress = 0;
258 256
259 if (!fs_info->quota_enabled) 257 if (!fs_info->quota_enabled)
260 return 0; 258 return 0;
261 259
260 fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
261 if (!fs_info->qgroup_ulist) {
262 ret = -ENOMEM;
263 goto out;
264 }
265
262 path = btrfs_alloc_path(); 266 path = btrfs_alloc_path();
263 if (!path) { 267 if (!path) {
264 ret = -ENOMEM; 268 ret = -ENOMEM;
@@ -306,20 +310,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
306 } 310 }
307 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, 311 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
308 ptr); 312 ptr);
309 fs_info->qgroup_rescan_progress.objectid = 313 rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
310 btrfs_qgroup_status_rescan(l, ptr);
311 if (fs_info->qgroup_flags &
312 BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
313 struct qgroup_rescan *qscan =
314 kmalloc(sizeof(*qscan), GFP_NOFS);
315 if (!qscan) {
316 ret = -ENOMEM;
317 goto out;
318 }
319 fs_info->qgroup_rescan_progress.type = 0;
320 fs_info->qgroup_rescan_progress.offset = 0;
321 qgroup_rescan_start(fs_info, qscan);
322 }
323 goto next1; 314 goto next1;
324 } 315 }
325 316
@@ -421,9 +412,18 @@ out:
421 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { 412 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
422 fs_info->quota_enabled = 0; 413 fs_info->quota_enabled = 0;
423 fs_info->pending_quota_state = 0; 414 fs_info->pending_quota_state = 0;
415 } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
416 ret >= 0) {
417 ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
424 } 418 }
425 btrfs_free_path(path); 419 btrfs_free_path(path);
426 420
421 if (ret < 0) {
422 ulist_free(fs_info->qgroup_ulist);
423 fs_info->qgroup_ulist = NULL;
424 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
425 }
426
427 return ret < 0 ? ret : 0; 427 return ret < 0 ? ret : 0;
428} 428}
429 429
@@ -460,6 +460,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
460 } 460 }
461 kfree(qgroup); 461 kfree(qgroup);
462 } 462 }
463 ulist_free(fs_info->qgroup_ulist);
463} 464}
464 465
465static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, 466static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
@@ -819,6 +820,12 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
819 goto out; 820 goto out;
820 } 821 }
821 822
823 fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
824 if (!fs_info->qgroup_ulist) {
825 ret = -ENOMEM;
826 goto out;
827 }
828
822 /* 829 /*
823 * initially create the quota tree 830 * initially create the quota tree
824 */ 831 */
@@ -916,6 +923,10 @@ out_free_root:
916 kfree(quota_root); 923 kfree(quota_root);
917 } 924 }
918out: 925out:
926 if (ret) {
927 ulist_free(fs_info->qgroup_ulist);
928 fs_info->qgroup_ulist = NULL;
929 }
919 mutex_unlock(&fs_info->qgroup_ioctl_lock); 930 mutex_unlock(&fs_info->qgroup_ioctl_lock);
920 return ret; 931 return ret;
921} 932}
@@ -1355,7 +1366,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1355 u64 ref_root; 1366 u64 ref_root;
1356 struct btrfs_qgroup *qgroup; 1367 struct btrfs_qgroup *qgroup;
1357 struct ulist *roots = NULL; 1368 struct ulist *roots = NULL;
1358 struct ulist *tmp = NULL;
1359 u64 seq; 1369 u64 seq;
1360 int ret = 0; 1370 int ret = 0;
1361 int sgn; 1371 int sgn;
@@ -1428,14 +1438,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1428 if (ret < 0) 1438 if (ret < 0)
1429 return ret; 1439 return ret;
1430 1440
1431 mutex_lock(&fs_info->qgroup_rescan_lock);
1432 spin_lock(&fs_info->qgroup_lock); 1441 spin_lock(&fs_info->qgroup_lock);
1433 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
1434 if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
1435 ret = 0;
1436 goto unlock;
1437 }
1438 }
1439 1442
1440 quota_root = fs_info->quota_root; 1443 quota_root = fs_info->quota_root;
1441 if (!quota_root) 1444 if (!quota_root)
@@ -1448,39 +1451,34 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1448 /* 1451 /*
1449 * step 1: for each old ref, visit all nodes once and inc refcnt 1452 * step 1: for each old ref, visit all nodes once and inc refcnt
1450 */ 1453 */
1451 tmp = ulist_alloc(GFP_ATOMIC); 1454 ulist_reinit(fs_info->qgroup_ulist);
1452 if (!tmp) {
1453 ret = -ENOMEM;
1454 goto unlock;
1455 }
1456 seq = fs_info->qgroup_seq; 1455 seq = fs_info->qgroup_seq;
1457 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ 1456 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
1458 1457
1459 ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); 1458 ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
1459 seq);
1460 if (ret) 1460 if (ret)
1461 goto unlock; 1461 goto unlock;
1462 1462
1463 /* 1463 /*
1464 * step 2: walk from the new root 1464 * step 2: walk from the new root
1465 */ 1465 */
1466 ret = qgroup_account_ref_step2(fs_info, roots, tmp, seq, sgn, 1466 ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
1467 node->num_bytes, qgroup); 1467 seq, sgn, node->num_bytes, qgroup);
1468 if (ret) 1468 if (ret)
1469 goto unlock; 1469 goto unlock;
1470 1470
1471 /* 1471 /*
1472 * step 3: walk again from old refs 1472 * step 3: walk again from old refs
1473 */ 1473 */
1474 ret = qgroup_account_ref_step3(fs_info, roots, tmp, seq, sgn, 1474 ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
1475 node->num_bytes); 1475 seq, sgn, node->num_bytes);
1476 if (ret) 1476 if (ret)
1477 goto unlock; 1477 goto unlock;
1478 1478
1479unlock: 1479unlock:
1480 spin_unlock(&fs_info->qgroup_lock); 1480 spin_unlock(&fs_info->qgroup_lock);
1481 mutex_unlock(&fs_info->qgroup_rescan_lock);
1482 ulist_free(roots); 1481 ulist_free(roots);
1483 ulist_free(tmp);
1484 1482
1485 return ret; 1483 return ret;
1486} 1484}
@@ -1527,9 +1525,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
1527 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1525 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1528 1526
1529 if (!ret && start_rescan_worker) { 1527 if (!ret && start_rescan_worker) {
1530 ret = btrfs_qgroup_rescan(fs_info); 1528 ret = qgroup_rescan_init(fs_info, 0, 1);
1531 if (ret) 1529 if (!ret) {
1532 pr_err("btrfs: start rescan quota failed: %d\n", ret); 1530 qgroup_rescan_zero_tracking(fs_info);
1531 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
1532 &fs_info->qgroup_rescan_work);
1533 }
1533 ret = 0; 1534 ret = 0;
1534 } 1535 }
1535 1536
@@ -1720,7 +1721,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1720 struct btrfs_fs_info *fs_info = root->fs_info; 1721 struct btrfs_fs_info *fs_info = root->fs_info;
1721 u64 ref_root = root->root_key.objectid; 1722 u64 ref_root = root->root_key.objectid;
1722 int ret = 0; 1723 int ret = 0;
1723 struct ulist *ulist = NULL;
1724 struct ulist_node *unode; 1724 struct ulist_node *unode;
1725 struct ulist_iterator uiter; 1725 struct ulist_iterator uiter;
1726 1726
@@ -1743,17 +1743,13 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1743 * in a first step, we check all affected qgroups if any limits would 1743 * in a first step, we check all affected qgroups if any limits would
1744 * be exceeded 1744 * be exceeded
1745 */ 1745 */
1746 ulist = ulist_alloc(GFP_ATOMIC); 1746 ulist_reinit(fs_info->qgroup_ulist);
1747 if (!ulist) { 1747 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
1748 ret = -ENOMEM;
1749 goto out;
1750 }
1751 ret = ulist_add(ulist, qgroup->qgroupid,
1752 (uintptr_t)qgroup, GFP_ATOMIC); 1748 (uintptr_t)qgroup, GFP_ATOMIC);
1753 if (ret < 0) 1749 if (ret < 0)
1754 goto out; 1750 goto out;
1755 ULIST_ITER_INIT(&uiter); 1751 ULIST_ITER_INIT(&uiter);
1756 while ((unode = ulist_next(ulist, &uiter))) { 1752 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1757 struct btrfs_qgroup *qg; 1753 struct btrfs_qgroup *qg;
1758 struct btrfs_qgroup_list *glist; 1754 struct btrfs_qgroup_list *glist;
1759 1755
@@ -1774,7 +1770,8 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1774 } 1770 }
1775 1771
1776 list_for_each_entry(glist, &qg->groups, next_group) { 1772 list_for_each_entry(glist, &qg->groups, next_group) {
1777 ret = ulist_add(ulist, glist->group->qgroupid, 1773 ret = ulist_add(fs_info->qgroup_ulist,
1774 glist->group->qgroupid,
1778 (uintptr_t)glist->group, GFP_ATOMIC); 1775 (uintptr_t)glist->group, GFP_ATOMIC);
1779 if (ret < 0) 1776 if (ret < 0)
1780 goto out; 1777 goto out;
@@ -1785,7 +1782,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1785 * no limits exceeded, now record the reservation into all qgroups 1782 * no limits exceeded, now record the reservation into all qgroups
1786 */ 1783 */
1787 ULIST_ITER_INIT(&uiter); 1784 ULIST_ITER_INIT(&uiter);
1788 while ((unode = ulist_next(ulist, &uiter))) { 1785 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1789 struct btrfs_qgroup *qg; 1786 struct btrfs_qgroup *qg;
1790 1787
1791 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; 1788 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
@@ -1795,8 +1792,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1795 1792
1796out: 1793out:
1797 spin_unlock(&fs_info->qgroup_lock); 1794 spin_unlock(&fs_info->qgroup_lock);
1798 ulist_free(ulist);
1799
1800 return ret; 1795 return ret;
1801} 1796}
1802 1797
@@ -1805,7 +1800,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1805 struct btrfs_root *quota_root; 1800 struct btrfs_root *quota_root;
1806 struct btrfs_qgroup *qgroup; 1801 struct btrfs_qgroup *qgroup;
1807 struct btrfs_fs_info *fs_info = root->fs_info; 1802 struct btrfs_fs_info *fs_info = root->fs_info;
1808 struct ulist *ulist = NULL;
1809 struct ulist_node *unode; 1803 struct ulist_node *unode;
1810 struct ulist_iterator uiter; 1804 struct ulist_iterator uiter;
1811 u64 ref_root = root->root_key.objectid; 1805 u64 ref_root = root->root_key.objectid;
@@ -1827,17 +1821,13 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1827 if (!qgroup) 1821 if (!qgroup)
1828 goto out; 1822 goto out;
1829 1823
1830 ulist = ulist_alloc(GFP_ATOMIC); 1824 ulist_reinit(fs_info->qgroup_ulist);
1831 if (!ulist) { 1825 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
1832 btrfs_std_error(fs_info, -ENOMEM);
1833 goto out;
1834 }
1835 ret = ulist_add(ulist, qgroup->qgroupid,
1836 (uintptr_t)qgroup, GFP_ATOMIC); 1826 (uintptr_t)qgroup, GFP_ATOMIC);
1837 if (ret < 0) 1827 if (ret < 0)
1838 goto out; 1828 goto out;
1839 ULIST_ITER_INIT(&uiter); 1829 ULIST_ITER_INIT(&uiter);
1840 while ((unode = ulist_next(ulist, &uiter))) { 1830 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1841 struct btrfs_qgroup *qg; 1831 struct btrfs_qgroup *qg;
1842 struct btrfs_qgroup_list *glist; 1832 struct btrfs_qgroup_list *glist;
1843 1833
@@ -1846,7 +1836,8 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1846 qg->reserved -= num_bytes; 1836 qg->reserved -= num_bytes;
1847 1837
1848 list_for_each_entry(glist, &qg->groups, next_group) { 1838 list_for_each_entry(glist, &qg->groups, next_group) {
1849 ret = ulist_add(ulist, glist->group->qgroupid, 1839 ret = ulist_add(fs_info->qgroup_ulist,
1840 glist->group->qgroupid,
1850 (uintptr_t)glist->group, GFP_ATOMIC); 1841 (uintptr_t)glist->group, GFP_ATOMIC);
1851 if (ret < 0) 1842 if (ret < 0)
1852 goto out; 1843 goto out;
@@ -1855,7 +1846,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1855 1846
1856out: 1847out:
1857 spin_unlock(&fs_info->qgroup_lock); 1848 spin_unlock(&fs_info->qgroup_lock);
1858 ulist_free(ulist);
1859} 1849}
1860 1850
1861void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) 1851void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
@@ -1874,12 +1864,11 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
1874 * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. 1864 * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
1875 */ 1865 */
1876static int 1866static int
1877qgroup_rescan_leaf(struct qgroup_rescan *qscan, struct btrfs_path *path, 1867qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1878 struct btrfs_trans_handle *trans, struct ulist *tmp, 1868 struct btrfs_trans_handle *trans, struct ulist *tmp,
1879 struct extent_buffer *scratch_leaf) 1869 struct extent_buffer *scratch_leaf)
1880{ 1870{
1881 struct btrfs_key found; 1871 struct btrfs_key found;
1882 struct btrfs_fs_info *fs_info = qscan->fs_info;
1883 struct ulist *roots = NULL; 1872 struct ulist *roots = NULL;
1884 struct ulist_node *unode; 1873 struct ulist_node *unode;
1885 struct ulist_iterator uiter; 1874 struct ulist_iterator uiter;
@@ -2007,11 +1996,10 @@ out:
2007 1996
2008static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) 1997static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2009{ 1998{
2010 struct qgroup_rescan *qscan = container_of(work, struct qgroup_rescan, 1999 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
2011 work); 2000 qgroup_rescan_work);
2012 struct btrfs_path *path; 2001 struct btrfs_path *path;
2013 struct btrfs_trans_handle *trans = NULL; 2002 struct btrfs_trans_handle *trans = NULL;
2014 struct btrfs_fs_info *fs_info = qscan->fs_info;
2015 struct ulist *tmp = NULL; 2003 struct ulist *tmp = NULL;
2016 struct extent_buffer *scratch_leaf = NULL; 2004 struct extent_buffer *scratch_leaf = NULL;
2017 int err = -ENOMEM; 2005 int err = -ENOMEM;
@@ -2036,7 +2024,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2036 if (!fs_info->quota_enabled) { 2024 if (!fs_info->quota_enabled) {
2037 err = -EINTR; 2025 err = -EINTR;
2038 } else { 2026 } else {
2039 err = qgroup_rescan_leaf(qscan, path, trans, 2027 err = qgroup_rescan_leaf(fs_info, path, trans,
2040 tmp, scratch_leaf); 2028 tmp, scratch_leaf);
2041 } 2029 }
2042 if (err > 0) 2030 if (err > 0)
@@ -2049,7 +2037,6 @@ out:
2049 kfree(scratch_leaf); 2037 kfree(scratch_leaf);
2050 ulist_free(tmp); 2038 ulist_free(tmp);
2051 btrfs_free_path(path); 2039 btrfs_free_path(path);
2052 kfree(qscan);
2053 2040
2054 mutex_lock(&fs_info->qgroup_rescan_lock); 2041 mutex_lock(&fs_info->qgroup_rescan_lock);
2055 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 2042 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
@@ -2068,47 +2055,74 @@ out:
2068 } else { 2055 } else {
2069 pr_err("btrfs: qgroup scan failed with %d\n", err); 2056 pr_err("btrfs: qgroup scan failed with %d\n", err);
2070 } 2057 }
2071}
2072 2058
2073static void 2059 complete_all(&fs_info->qgroup_rescan_completion);
2074qgroup_rescan_start(struct btrfs_fs_info *fs_info, struct qgroup_rescan *qscan)
2075{
2076 memset(&qscan->work, 0, sizeof(qscan->work));
2077 qscan->work.func = btrfs_qgroup_rescan_worker;
2078 qscan->fs_info = fs_info;
2079
2080 pr_info("btrfs: qgroup scan started\n");
2081 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, &qscan->work);
2082} 2060}
2083 2061
2084int 2062/*
2085btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) 2063 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
2064 * memory required for the rescan context.
2065 */
2066static int
2067qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2068 int init_flags)
2086{ 2069{
2087 int ret = 0; 2070 int ret = 0;
2088 struct rb_node *n;
2089 struct btrfs_qgroup *qgroup;
2090 struct qgroup_rescan *qscan = kmalloc(sizeof(*qscan), GFP_NOFS);
2091 2071
2092 if (!qscan) 2072 if (!init_flags &&
2093 return -ENOMEM; 2073 (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
2074 !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
2075 ret = -EINVAL;
2076 goto err;
2077 }
2094 2078
2095 mutex_lock(&fs_info->qgroup_rescan_lock); 2079 mutex_lock(&fs_info->qgroup_rescan_lock);
2096 spin_lock(&fs_info->qgroup_lock); 2080 spin_lock(&fs_info->qgroup_lock);
2097 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) 2081
2098 ret = -EINPROGRESS; 2082 if (init_flags) {
2099 else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) 2083 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2100 ret = -EINVAL; 2084 ret = -EINPROGRESS;
2101 if (ret) { 2085 else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
2102 spin_unlock(&fs_info->qgroup_lock); 2086 ret = -EINVAL;
2103 mutex_unlock(&fs_info->qgroup_rescan_lock); 2087
2104 kfree(qscan); 2088 if (ret) {
2105 return ret; 2089 spin_unlock(&fs_info->qgroup_lock);
2090 mutex_unlock(&fs_info->qgroup_rescan_lock);
2091 goto err;
2092 }
2093
2094 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2106 } 2095 }
2107 2096
2108 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2109 memset(&fs_info->qgroup_rescan_progress, 0, 2097 memset(&fs_info->qgroup_rescan_progress, 0,
2110 sizeof(fs_info->qgroup_rescan_progress)); 2098 sizeof(fs_info->qgroup_rescan_progress));
2099 fs_info->qgroup_rescan_progress.objectid = progress_objectid;
2100
2101 spin_unlock(&fs_info->qgroup_lock);
2102 mutex_unlock(&fs_info->qgroup_rescan_lock);
2103
2104 init_completion(&fs_info->qgroup_rescan_completion);
2105
2106 memset(&fs_info->qgroup_rescan_work, 0,
2107 sizeof(fs_info->qgroup_rescan_work));
2108 fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
2109
2110 if (ret) {
2111err:
2112 pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret);
2113 return ret;
2114 }
2115
2116 return 0;
2117}
2118
2119static void
2120qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
2121{
2122 struct rb_node *n;
2123 struct btrfs_qgroup *qgroup;
2111 2124
2125 spin_lock(&fs_info->qgroup_lock);
2112 /* clear all current qgroup tracking information */ 2126 /* clear all current qgroup tracking information */
2113 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { 2127 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
2114 qgroup = rb_entry(n, struct btrfs_qgroup, node); 2128 qgroup = rb_entry(n, struct btrfs_qgroup, node);
@@ -2118,9 +2132,74 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2118 qgroup->excl_cmpr = 0; 2132 qgroup->excl_cmpr = 0;
2119 } 2133 }
2120 spin_unlock(&fs_info->qgroup_lock); 2134 spin_unlock(&fs_info->qgroup_lock);
2121 mutex_unlock(&fs_info->qgroup_rescan_lock); 2135}
2136
2137int
2138btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2139{
2140 int ret = 0;
2141 struct btrfs_trans_handle *trans;
2122 2142
2123 qgroup_rescan_start(fs_info, qscan); 2143 ret = qgroup_rescan_init(fs_info, 0, 1);
2144 if (ret)
2145 return ret;
2146
2147 /*
2148 * We have set the rescan_progress to 0, which means no more
2149 * delayed refs will be accounted by btrfs_qgroup_account_ref.
2150 * However, btrfs_qgroup_account_ref may be right after its call
2151 * to btrfs_find_all_roots, in which case it would still do the
2152 * accounting.
2153 * To solve this, we're committing the transaction, which will
2154 * ensure we run all delayed refs and only after that, we are
2155 * going to clear all tracking information for a clean start.
2156 */
2157
2158 trans = btrfs_join_transaction(fs_info->fs_root);
2159 if (IS_ERR(trans)) {
2160 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2161 return PTR_ERR(trans);
2162 }
2163 ret = btrfs_commit_transaction(trans, fs_info->fs_root);
2164 if (ret) {
2165 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2166 return ret;
2167 }
2168
2169 qgroup_rescan_zero_tracking(fs_info);
2170
2171 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
2172 &fs_info->qgroup_rescan_work);
2124 2173
2125 return 0; 2174 return 0;
2126} 2175}
2176
2177int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
2178{
2179 int running;
2180 int ret = 0;
2181
2182 mutex_lock(&fs_info->qgroup_rescan_lock);
2183 spin_lock(&fs_info->qgroup_lock);
2184 running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2185 spin_unlock(&fs_info->qgroup_lock);
2186 mutex_unlock(&fs_info->qgroup_rescan_lock);
2187
2188 if (running)
2189 ret = wait_for_completion_interruptible(
2190 &fs_info->qgroup_rescan_completion);
2191
2192 return ret;
2193}
2194
2195/*
2196 * this is only called from open_ctree where we're still single threaded, thus
2197 * locking is omitted here.
2198 */
2199void
2200btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2201{
2202 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2203 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
2204 &fs_info->qgroup_rescan_work);
2205}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4febca4fc2de..12096496cc99 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1305,6 +1305,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1305 struct extent_buffer *eb; 1305 struct extent_buffer *eb;
1306 struct btrfs_root_item *root_item; 1306 struct btrfs_root_item *root_item;
1307 struct btrfs_key root_key; 1307 struct btrfs_key root_key;
1308 u64 last_snap = 0;
1308 int ret; 1309 int ret;
1309 1310
1310 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1311 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
@@ -1320,6 +1321,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1320 BTRFS_TREE_RELOC_OBJECTID); 1321 BTRFS_TREE_RELOC_OBJECTID);
1321 BUG_ON(ret); 1322 BUG_ON(ret);
1322 1323
1324 last_snap = btrfs_root_last_snapshot(&root->root_item);
1323 btrfs_set_root_last_snapshot(&root->root_item, 1325 btrfs_set_root_last_snapshot(&root->root_item,
1324 trans->transid - 1); 1326 trans->transid - 1);
1325 } else { 1327 } else {
@@ -1345,6 +1347,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1345 memset(&root_item->drop_progress, 0, 1347 memset(&root_item->drop_progress, 0,
1346 sizeof(struct btrfs_disk_key)); 1348 sizeof(struct btrfs_disk_key));
1347 root_item->drop_level = 0; 1349 root_item->drop_level = 0;
1350 /*
1351 * abuse rtransid, it is safe because it is impossible to
1352 * receive data into a relocation tree.
1353 */
1354 btrfs_set_root_rtransid(root_item, last_snap);
1355 btrfs_set_root_otransid(root_item, trans->transid);
1348 } 1356 }
1349 1357
1350 btrfs_tree_unlock(eb); 1358 btrfs_tree_unlock(eb);
@@ -1355,8 +1363,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1355 BUG_ON(ret); 1363 BUG_ON(ret);
1356 kfree(root_item); 1364 kfree(root_item);
1357 1365
1358 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 1366 reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
1359 &root_key);
1360 BUG_ON(IS_ERR(reloc_root)); 1367 BUG_ON(IS_ERR(reloc_root));
1361 reloc_root->last_trans = trans->transid; 1368 reloc_root->last_trans = trans->transid;
1362 return reloc_root; 1369 return reloc_root;
@@ -2273,8 +2280,12 @@ void free_reloc_roots(struct list_head *list)
2273static noinline_for_stack 2280static noinline_for_stack
2274int merge_reloc_roots(struct reloc_control *rc) 2281int merge_reloc_roots(struct reloc_control *rc)
2275{ 2282{
2283 struct btrfs_trans_handle *trans;
2276 struct btrfs_root *root; 2284 struct btrfs_root *root;
2277 struct btrfs_root *reloc_root; 2285 struct btrfs_root *reloc_root;
2286 u64 last_snap;
2287 u64 otransid;
2288 u64 objectid;
2278 LIST_HEAD(reloc_roots); 2289 LIST_HEAD(reloc_roots);
2279 int found = 0; 2290 int found = 0;
2280 int ret = 0; 2291 int ret = 0;
@@ -2308,12 +2319,44 @@ again:
2308 } else { 2319 } else {
2309 list_del_init(&reloc_root->root_list); 2320 list_del_init(&reloc_root->root_list);
2310 } 2321 }
2322
2323 /*
2324 * we keep the old last snapshod transid in rtranid when we
2325 * created the relocation tree.
2326 */
2327 last_snap = btrfs_root_rtransid(&reloc_root->root_item);
2328 otransid = btrfs_root_otransid(&reloc_root->root_item);
2329 objectid = reloc_root->root_key.offset;
2330
2311 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); 2331 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2312 if (ret < 0) { 2332 if (ret < 0) {
2313 if (list_empty(&reloc_root->root_list)) 2333 if (list_empty(&reloc_root->root_list))
2314 list_add_tail(&reloc_root->root_list, 2334 list_add_tail(&reloc_root->root_list,
2315 &reloc_roots); 2335 &reloc_roots);
2316 goto out; 2336 goto out;
2337 } else if (!ret) {
2338 /*
2339 * recover the last snapshot tranid to avoid
2340 * the space balance break NOCOW.
2341 */
2342 root = read_fs_root(rc->extent_root->fs_info,
2343 objectid);
2344 if (IS_ERR(root))
2345 continue;
2346
2347 if (btrfs_root_refs(&root->root_item) == 0)
2348 continue;
2349
2350 trans = btrfs_join_transaction(root);
2351 BUG_ON(IS_ERR(trans));
2352
2353 /* Check if the fs/file tree was snapshoted or not. */
2354 if (btrfs_root_last_snapshot(&root->root_item) ==
2355 otransid - 1)
2356 btrfs_set_root_last_snapshot(&root->root_item,
2357 last_snap);
2358
2359 btrfs_end_transaction(trans, root);
2317 } 2360 }
2318 } 2361 }
2319 2362
@@ -3266,6 +3309,8 @@ static int __add_tree_block(struct reloc_control *rc,
3266 struct btrfs_path *path; 3309 struct btrfs_path *path;
3267 struct btrfs_key key; 3310 struct btrfs_key key;
3268 int ret; 3311 int ret;
3312 bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
3313 SKINNY_METADATA);
3269 3314
3270 if (tree_block_processed(bytenr, blocksize, rc)) 3315 if (tree_block_processed(bytenr, blocksize, rc))
3271 return 0; 3316 return 0;
@@ -3276,10 +3321,15 @@ static int __add_tree_block(struct reloc_control *rc,
3276 path = btrfs_alloc_path(); 3321 path = btrfs_alloc_path();
3277 if (!path) 3322 if (!path)
3278 return -ENOMEM; 3323 return -ENOMEM;
3279 3324again:
3280 key.objectid = bytenr; 3325 key.objectid = bytenr;
3281 key.type = BTRFS_EXTENT_ITEM_KEY; 3326 if (skinny) {
3282 key.offset = blocksize; 3327 key.type = BTRFS_METADATA_ITEM_KEY;
3328 key.offset = (u64)-1;
3329 } else {
3330 key.type = BTRFS_EXTENT_ITEM_KEY;
3331 key.offset = blocksize;
3332 }
3283 3333
3284 path->search_commit_root = 1; 3334 path->search_commit_root = 1;
3285 path->skip_locking = 1; 3335 path->skip_locking = 1;
@@ -3287,11 +3337,23 @@ static int __add_tree_block(struct reloc_control *rc,
3287 if (ret < 0) 3337 if (ret < 0)
3288 goto out; 3338 goto out;
3289 3339
3290 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3340 if (ret > 0 && skinny) {
3291 if (ret > 0) { 3341 if (path->slots[0]) {
3292 if (key.objectid == bytenr && 3342 path->slots[0]--;
3293 key.type == BTRFS_METADATA_ITEM_KEY) 3343 btrfs_item_key_to_cpu(path->nodes[0], &key,
3294 ret = 0; 3344 path->slots[0]);
3345 if (key.objectid == bytenr &&
3346 (key.type == BTRFS_METADATA_ITEM_KEY ||
3347 (key.type == BTRFS_EXTENT_ITEM_KEY &&
3348 key.offset == blocksize)))
3349 ret = 0;
3350 }
3351
3352 if (ret) {
3353 skinny = false;
3354 btrfs_release_path(path);
3355 goto again;
3356 }
3295 } 3357 }
3296 BUG_ON(ret); 3358 BUG_ON(ret);
3297 3359
@@ -4160,12 +4222,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4160 (unsigned long long)rc->block_group->key.objectid, 4222 (unsigned long long)rc->block_group->key.objectid,
4161 (unsigned long long)rc->block_group->flags); 4223 (unsigned long long)rc->block_group->flags);
4162 4224
4163 ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4225 ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
4164 if (ret < 0) { 4226 if (ret < 0) {
4165 err = ret; 4227 err = ret;
4166 goto out; 4228 goto out;
4167 } 4229 }
4168 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4230 btrfs_wait_all_ordered_extents(fs_info, 0);
4169 4231
4170 while (1) { 4232 while (1) {
4171 mutex_lock(&fs_info->cleaner_mutex); 4233 mutex_lock(&fs_info->cleaner_mutex);
@@ -4277,7 +4339,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4277 key.type != BTRFS_ROOT_ITEM_KEY) 4339 key.type != BTRFS_ROOT_ITEM_KEY)
4278 break; 4340 break;
4279 4341
4280 reloc_root = btrfs_read_fs_root_no_radix(root, &key); 4342 reloc_root = btrfs_read_fs_root(root, &key);
4281 if (IS_ERR(reloc_root)) { 4343 if (IS_ERR(reloc_root)) {
4282 err = PTR_ERR(reloc_root); 4344 err = PTR_ERR(reloc_root);
4283 goto out; 4345 goto out;
@@ -4396,10 +4458,8 @@ out:
4396int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) 4458int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4397{ 4459{
4398 struct btrfs_ordered_sum *sums; 4460 struct btrfs_ordered_sum *sums;
4399 struct btrfs_sector_sum *sector_sum;
4400 struct btrfs_ordered_extent *ordered; 4461 struct btrfs_ordered_extent *ordered;
4401 struct btrfs_root *root = BTRFS_I(inode)->root; 4462 struct btrfs_root *root = BTRFS_I(inode)->root;
4402 size_t offset;
4403 int ret; 4463 int ret;
4404 u64 disk_bytenr; 4464 u64 disk_bytenr;
4405 LIST_HEAD(list); 4465 LIST_HEAD(list);
@@ -4413,19 +4473,13 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4413 if (ret) 4473 if (ret)
4414 goto out; 4474 goto out;
4415 4475
4476 disk_bytenr = ordered->start;
4416 while (!list_empty(&list)) { 4477 while (!list_empty(&list)) {
4417 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 4478 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4418 list_del_init(&sums->list); 4479 list_del_init(&sums->list);
4419 4480
4420 sector_sum = sums->sums; 4481 sums->bytenr = disk_bytenr;
4421 sums->bytenr = ordered->start; 4482 disk_bytenr += sums->len;
4422
4423 offset = 0;
4424 while (offset < sums->len) {
4425 sector_sum->bytenr += ordered->start - disk_bytenr;
4426 sector_sum++;
4427 offset += root->sectorsize;
4428 }
4429 4483
4430 btrfs_add_ordered_sum(inode, ordered, sums); 4484 btrfs_add_ordered_sum(inode, ordered, sums);
4431 } 4485 }
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 5bf1ed57f178..ffb1036ef10d 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -64,52 +64,59 @@ void btrfs_read_root_item(struct extent_buffer *eb, int slot,
64} 64}
65 65
66/* 66/*
67 * lookup the root with the highest offset for a given objectid. The key we do 67 * btrfs_find_root - lookup the root by the key.
68 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 68 * root: the root of the root tree
69 * on error. 69 * search_key: the key to search
70 * path: the path we search
71 * root_item: the root item of the tree we look for
72 * root_key: the reak key of the tree we look for
73 *
74 * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset
75 * of the search key, just lookup the root with the highest offset for a
76 * given objectid.
77 *
78 * If we find something return 0, otherwise > 0, < 0 on error.
70 */ 79 */
71int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, 80int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
72 struct btrfs_root_item *item, struct btrfs_key *key) 81 struct btrfs_path *path, struct btrfs_root_item *root_item,
82 struct btrfs_key *root_key)
73{ 83{
74 struct btrfs_path *path;
75 struct btrfs_key search_key;
76 struct btrfs_key found_key; 84 struct btrfs_key found_key;
77 struct extent_buffer *l; 85 struct extent_buffer *l;
78 int ret; 86 int ret;
79 int slot; 87 int slot;
80 88
81 search_key.objectid = objectid; 89 ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
82 search_key.type = BTRFS_ROOT_ITEM_KEY;
83 search_key.offset = (u64)-1;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
89 if (ret < 0) 90 if (ret < 0)
90 goto out; 91 return ret;
91 92
92 BUG_ON(ret == 0); 93 if (search_key->offset != -1ULL) { /* the search key is exact */
93 if (path->slots[0] == 0) { 94 if (ret > 0)
94 ret = 1; 95 goto out;
95 goto out; 96 } else {
97 BUG_ON(ret == 0); /* Logical error */
98 if (path->slots[0] == 0)
99 goto out;
100 path->slots[0]--;
101 ret = 0;
96 } 102 }
103
97 l = path->nodes[0]; 104 l = path->nodes[0];
98 slot = path->slots[0] - 1; 105 slot = path->slots[0];
106
99 btrfs_item_key_to_cpu(l, &found_key, slot); 107 btrfs_item_key_to_cpu(l, &found_key, slot);
100 if (found_key.objectid != objectid || 108 if (found_key.objectid != search_key->objectid ||
101 found_key.type != BTRFS_ROOT_ITEM_KEY) { 109 found_key.type != BTRFS_ROOT_ITEM_KEY) {
102 ret = 1; 110 ret = 1;
103 goto out; 111 goto out;
104 } 112 }
105 if (item)
106 btrfs_read_root_item(l, slot, item);
107 if (key)
108 memcpy(key, &found_key, sizeof(found_key));
109 113
110 ret = 0; 114 if (root_item)
115 btrfs_read_root_item(l, slot, root_item);
116 if (root_key)
117 memcpy(root_key, &found_key, sizeof(found_key));
111out: 118out:
112 btrfs_free_path(path); 119 btrfs_release_path(path);
113 return ret; 120 return ret;
114} 121}
115 122
@@ -212,86 +219,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
212 return btrfs_insert_item(trans, root, key, item, sizeof(*item)); 219 return btrfs_insert_item(trans, root, key, item, sizeof(*item));
213} 220}
214 221
215/*
216 * at mount time we want to find all the old transaction snapshots that were in
217 * the process of being deleted if we crashed. This is any root item with an
218 * offset lower than the latest root. They need to be queued for deletion to
219 * finish what was happening when we crashed.
220 */
221int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
222{
223 struct btrfs_root *dead_root;
224 struct btrfs_root_item *ri;
225 struct btrfs_key key;
226 struct btrfs_key found_key;
227 struct btrfs_path *path;
228 int ret;
229 u32 nritems;
230 struct extent_buffer *leaf;
231 int slot;
232
233 key.objectid = objectid;
234 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
235 key.offset = 0;
236 path = btrfs_alloc_path();
237 if (!path)
238 return -ENOMEM;
239
240again:
241 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
242 if (ret < 0)
243 goto err;
244 while (1) {
245 leaf = path->nodes[0];
246 nritems = btrfs_header_nritems(leaf);
247 slot = path->slots[0];
248 if (slot >= nritems) {
249 ret = btrfs_next_leaf(root, path);
250 if (ret)
251 break;
252 leaf = path->nodes[0];
253 nritems = btrfs_header_nritems(leaf);
254 slot = path->slots[0];
255 }
256 btrfs_item_key_to_cpu(leaf, &key, slot);
257 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
258 goto next;
259
260 if (key.objectid < objectid)
261 goto next;
262
263 if (key.objectid > objectid)
264 break;
265
266 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
267 if (btrfs_disk_root_refs(leaf, ri) != 0)
268 goto next;
269
270 memcpy(&found_key, &key, sizeof(key));
271 key.offset++;
272 btrfs_release_path(path);
273 dead_root =
274 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
275 &found_key);
276 if (IS_ERR(dead_root)) {
277 ret = PTR_ERR(dead_root);
278 goto err;
279 }
280
281 ret = btrfs_add_dead_root(dead_root);
282 if (ret)
283 goto err;
284 goto again;
285next:
286 slot++;
287 path->slots[0]++;
288 }
289 ret = 0;
290err:
291 btrfs_free_path(path);
292 return ret;
293}
294
295int btrfs_find_orphan_roots(struct btrfs_root *tree_root) 222int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
296{ 223{
297 struct extent_buffer *leaf; 224 struct extent_buffer *leaf;
@@ -301,6 +228,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
301 struct btrfs_root *root; 228 struct btrfs_root *root;
302 int err = 0; 229 int err = 0;
303 int ret; 230 int ret;
231 bool can_recover = true;
232
233 if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
234 can_recover = false;
304 235
305 path = btrfs_alloc_path(); 236 path = btrfs_alloc_path();
306 if (!path) 237 if (!path)
@@ -340,20 +271,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
340 root_key.objectid = key.offset; 271 root_key.objectid = key.offset;
341 key.offset++; 272 key.offset++;
342 273
343 root = btrfs_read_fs_root_no_name(tree_root->fs_info, 274 root = btrfs_read_fs_root(tree_root, &root_key);
344 &root_key); 275 err = PTR_RET(root);
345 if (!IS_ERR(root)) 276 if (err && err != -ENOENT) {
277 break;
278 } else if (err == -ENOENT) {
279 struct btrfs_trans_handle *trans;
280
281 btrfs_release_path(path);
282
283 trans = btrfs_join_transaction(tree_root);
284 if (IS_ERR(trans)) {
285 err = PTR_ERR(trans);
286 btrfs_error(tree_root->fs_info, err,
287 "Failed to start trans to delete "
288 "orphan item");
289 break;
290 }
291 err = btrfs_del_orphan_item(trans, tree_root,
292 root_key.objectid);
293 btrfs_end_transaction(trans, tree_root);
294 if (err) {
295 btrfs_error(tree_root->fs_info, err,
296 "Failed to delete root orphan "
297 "item");
298 break;
299 }
346 continue; 300 continue;
301 }
347 302
348 ret = PTR_ERR(root); 303 if (btrfs_root_refs(&root->root_item) == 0) {
349 if (ret != -ENOENT) { 304 btrfs_add_dead_root(root);
350 err = ret; 305 continue;
306 }
307
308 err = btrfs_init_fs_root(root);
309 if (err) {
310 btrfs_free_fs_root(root);
351 break; 311 break;
352 } 312 }
353 313
354 ret = btrfs_find_dead_roots(tree_root, root_key.objectid); 314 root->orphan_item_inserted = 1;
355 if (ret) { 315
356 err = ret; 316 err = btrfs_insert_fs_root(root->fs_info, root);
317 if (err) {
318 BUG_ON(err == -EEXIST);
319 btrfs_free_fs_root(root);
357 break; 320 break;
358 } 321 }
359 } 322 }
@@ -368,8 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
368{ 331{
369 struct btrfs_path *path; 332 struct btrfs_path *path;
370 int ret; 333 int ret;
371 struct btrfs_root_item *ri;
372 struct extent_buffer *leaf;
373 334
374 path = btrfs_alloc_path(); 335 path = btrfs_alloc_path();
375 if (!path) 336 if (!path)
@@ -379,8 +340,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
379 goto out; 340 goto out;
380 341
381 BUG_ON(ret != 0); 342 BUG_ON(ret != 0);
382 leaf = path->nodes[0];
383 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
384 343
385 ret = btrfs_del_item(trans, root, path); 344 ret = btrfs_del_item(trans, root, path);
386out: 345out:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 79bd479317cb..4ba2a69a60ad 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2126,8 +2126,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2126 u8 *csum) 2126 u8 *csum)
2127{ 2127{
2128 struct btrfs_ordered_sum *sum = NULL; 2128 struct btrfs_ordered_sum *sum = NULL;
2129 int ret = 0; 2129 unsigned long index;
2130 unsigned long i;
2131 unsigned long num_sectors; 2130 unsigned long num_sectors;
2132 2131
2133 while (!list_empty(&sctx->csum_list)) { 2132 while (!list_empty(&sctx->csum_list)) {
@@ -2146,19 +2145,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2146 if (!sum) 2145 if (!sum)
2147 return 0; 2146 return 0;
2148 2147
2148 index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2149 num_sectors = sum->len / sctx->sectorsize; 2149 num_sectors = sum->len / sctx->sectorsize;
2150 for (i = 0; i < num_sectors; ++i) { 2150 memcpy(csum, sum->sums + index, sctx->csum_size);
2151 if (sum->sums[i].bytenr == logical) { 2151 if (index == num_sectors - 1) {
2152 memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
2153 ret = 1;
2154 break;
2155 }
2156 }
2157 if (ret && i == num_sectors - 1) {
2158 list_del(&sum->list); 2152 list_del(&sum->list);
2159 kfree(sum); 2153 kfree(sum);
2160 } 2154 }
2161 return ret; 2155 return 1;
2162} 2156}
2163 2157
2164/* scrub extent tries to collect up to 64 kB for each bio */ 2158/* scrub extent tries to collect up to 64 kB for each bio */
@@ -2505,6 +2499,7 @@ again:
2505 if (ret) 2499 if (ret)
2506 goto out; 2500 goto out;
2507 2501
2502 scrub_free_csums(sctx);
2508 if (extent_logical + extent_len < 2503 if (extent_logical + extent_len <
2509 key.objectid + bytes) { 2504 key.objectid + bytes) {
2510 logical += increment; 2505 logical += increment;
@@ -3204,16 +3199,18 @@ out:
3204 3199
3205static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) 3200static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3206{ 3201{
3207 unsigned long index;
3208 struct scrub_copy_nocow_ctx *nocow_ctx = ctx; 3202 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3209 int ret = 0; 3203 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3210 struct btrfs_key key; 3204 struct btrfs_key key;
3211 struct inode *inode = NULL; 3205 struct inode *inode;
3206 struct page *page;
3212 struct btrfs_root *local_root; 3207 struct btrfs_root *local_root;
3213 u64 physical_for_dev_replace; 3208 u64 physical_for_dev_replace;
3214 u64 len; 3209 u64 len;
3215 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; 3210 unsigned long index;
3216 int srcu_index; 3211 int srcu_index;
3212 int ret;
3213 int err;
3217 3214
3218 key.objectid = root; 3215 key.objectid = root;
3219 key.type = BTRFS_ROOT_ITEM_KEY; 3216 key.type = BTRFS_ROOT_ITEM_KEY;
@@ -3227,6 +3224,11 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3227 return PTR_ERR(local_root); 3224 return PTR_ERR(local_root);
3228 } 3225 }
3229 3226
3227 if (btrfs_root_refs(&local_root->root_item) == 0) {
3228 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3229 return -ENOENT;
3230 }
3231
3230 key.type = BTRFS_INODE_ITEM_KEY; 3232 key.type = BTRFS_INODE_ITEM_KEY;
3231 key.objectid = inum; 3233 key.objectid = inum;
3232 key.offset = 0; 3234 key.offset = 0;
@@ -3235,19 +3237,21 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3235 if (IS_ERR(inode)) 3237 if (IS_ERR(inode))
3236 return PTR_ERR(inode); 3238 return PTR_ERR(inode);
3237 3239
3240 /* Avoid truncate/dio/punch hole.. */
3241 mutex_lock(&inode->i_mutex);
3242 inode_dio_wait(inode);
3243
3244 ret = 0;
3238 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 3245 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3239 len = nocow_ctx->len; 3246 len = nocow_ctx->len;
3240 while (len >= PAGE_CACHE_SIZE) { 3247 while (len >= PAGE_CACHE_SIZE) {
3241 struct page *page = NULL;
3242 int ret_sub;
3243
3244 index = offset >> PAGE_CACHE_SHIFT; 3248 index = offset >> PAGE_CACHE_SHIFT;
3245 3249again:
3246 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 3250 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3247 if (!page) { 3251 if (!page) {
3248 pr_err("find_or_create_page() failed\n"); 3252 pr_err("find_or_create_page() failed\n");
3249 ret = -ENOMEM; 3253 ret = -ENOMEM;
3250 goto next_page; 3254 goto out;
3251 } 3255 }
3252 3256
3253 if (PageUptodate(page)) { 3257 if (PageUptodate(page)) {
@@ -3255,39 +3259,49 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3255 goto next_page; 3259 goto next_page;
3256 } else { 3260 } else {
3257 ClearPageError(page); 3261 ClearPageError(page);
3258 ret_sub = extent_read_full_page(&BTRFS_I(inode)-> 3262 err = extent_read_full_page(&BTRFS_I(inode)->
3259 io_tree, 3263 io_tree,
3260 page, btrfs_get_extent, 3264 page, btrfs_get_extent,
3261 nocow_ctx->mirror_num); 3265 nocow_ctx->mirror_num);
3262 if (ret_sub) { 3266 if (err) {
3263 ret = ret_sub; 3267 ret = err;
3264 goto next_page; 3268 goto next_page;
3265 } 3269 }
3266 wait_on_page_locked(page); 3270
3271 lock_page(page);
3272 /*
3273 * If the page has been remove from the page cache,
3274 * the data on it is meaningless, because it may be
3275 * old one, the new data may be written into the new
3276 * page in the page cache.
3277 */
3278 if (page->mapping != inode->i_mapping) {
3279 page_cache_release(page);
3280 goto again;
3281 }
3267 if (!PageUptodate(page)) { 3282 if (!PageUptodate(page)) {
3268 ret = -EIO; 3283 ret = -EIO;
3269 goto next_page; 3284 goto next_page;
3270 } 3285 }
3271 } 3286 }
3272 ret_sub = write_page_nocow(nocow_ctx->sctx, 3287 err = write_page_nocow(nocow_ctx->sctx,
3273 physical_for_dev_replace, page); 3288 physical_for_dev_replace, page);
3274 if (ret_sub) { 3289 if (err)
3275 ret = ret_sub; 3290 ret = err;
3276 goto next_page;
3277 }
3278
3279next_page: 3291next_page:
3280 if (page) { 3292 unlock_page(page);
3281 unlock_page(page); 3293 page_cache_release(page);
3282 put_page(page); 3294
3283 } 3295 if (ret)
3296 break;
3297
3284 offset += PAGE_CACHE_SIZE; 3298 offset += PAGE_CACHE_SIZE;
3285 physical_for_dev_replace += PAGE_CACHE_SIZE; 3299 physical_for_dev_replace += PAGE_CACHE_SIZE;
3286 len -= PAGE_CACHE_SIZE; 3300 len -= PAGE_CACHE_SIZE;
3287 } 3301 }
3288 3302out:
3289 if (inode) 3303 mutex_unlock(&inode->i_mutex);
3290 iput(inode); 3304 iput(inode);
3291 return ret; 3305 return ret;
3292} 3306}
3293 3307
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ff40f1c00ce3..d3f3b43cae0b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -158,7 +158,7 @@ static void fs_path_reset(struct fs_path *p)
158 } 158 }
159} 159}
160 160
161static struct fs_path *fs_path_alloc(struct send_ctx *sctx) 161static struct fs_path *fs_path_alloc(void)
162{ 162{
163 struct fs_path *p; 163 struct fs_path *p;
164 164
@@ -173,11 +173,11 @@ static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
173 return p; 173 return p;
174} 174}
175 175
176static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx) 176static struct fs_path *fs_path_alloc_reversed(void)
177{ 177{
178 struct fs_path *p; 178 struct fs_path *p;
179 179
180 p = fs_path_alloc(sctx); 180 p = fs_path_alloc();
181 if (!p) 181 if (!p)
182 return NULL; 182 return NULL;
183 p->reversed = 1; 183 p->reversed = 1;
@@ -185,7 +185,7 @@ static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
185 return p; 185 return p;
186} 186}
187 187
188static void fs_path_free(struct send_ctx *sctx, struct fs_path *p) 188static void fs_path_free(struct fs_path *p)
189{ 189{
190 if (!p) 190 if (!p)
191 return; 191 return;
@@ -753,8 +753,7 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
753 * 753 *
754 * path must point to the INODE_REF or INODE_EXTREF when called. 754 * path must point to the INODE_REF or INODE_EXTREF when called.
755 */ 755 */
756static int iterate_inode_ref(struct send_ctx *sctx, 756static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
757 struct btrfs_root *root, struct btrfs_path *path,
758 struct btrfs_key *found_key, int resolve, 757 struct btrfs_key *found_key, int resolve,
759 iterate_inode_ref_t iterate, void *ctx) 758 iterate_inode_ref_t iterate, void *ctx)
760{ 759{
@@ -777,13 +776,13 @@ static int iterate_inode_ref(struct send_ctx *sctx,
777 unsigned long elem_size; 776 unsigned long elem_size;
778 unsigned long ptr; 777 unsigned long ptr;
779 778
780 p = fs_path_alloc_reversed(sctx); 779 p = fs_path_alloc_reversed();
781 if (!p) 780 if (!p)
782 return -ENOMEM; 781 return -ENOMEM;
783 782
784 tmp_path = alloc_path_for_send(); 783 tmp_path = alloc_path_for_send();
785 if (!tmp_path) { 784 if (!tmp_path) {
786 fs_path_free(sctx, p); 785 fs_path_free(p);
787 return -ENOMEM; 786 return -ENOMEM;
788 } 787 }
789 788
@@ -858,7 +857,7 @@ static int iterate_inode_ref(struct send_ctx *sctx,
858 857
859out: 858out:
860 btrfs_free_path(tmp_path); 859 btrfs_free_path(tmp_path);
861 fs_path_free(sctx, p); 860 fs_path_free(p);
862 return ret; 861 return ret;
863} 862}
864 863
@@ -874,8 +873,7 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
874 * 873 *
875 * path must point to the dir item when called. 874 * path must point to the dir item when called.
876 */ 875 */
877static int iterate_dir_item(struct send_ctx *sctx, 876static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
878 struct btrfs_root *root, struct btrfs_path *path,
879 struct btrfs_key *found_key, 877 struct btrfs_key *found_key,
880 iterate_dir_item_t iterate, void *ctx) 878 iterate_dir_item_t iterate, void *ctx)
881{ 879{
@@ -990,7 +988,7 @@ static int __copy_first_ref(int num, u64 dir, int index,
990 * Retrieve the first path of an inode. If an inode has more then one 988 * Retrieve the first path of an inode. If an inode has more then one
991 * ref/hardlink, this is ignored. 989 * ref/hardlink, this is ignored.
992 */ 990 */
993static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, 991static int get_inode_path(struct btrfs_root *root,
994 u64 ino, struct fs_path *path) 992 u64 ino, struct fs_path *path)
995{ 993{
996 int ret; 994 int ret;
@@ -1022,8 +1020,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
1022 goto out; 1020 goto out;
1023 } 1021 }
1024 1022
1025 ret = iterate_inode_ref(sctx, root, p, &found_key, 1, 1023 ret = iterate_inode_ref(root, p, &found_key, 1,
1026 __copy_first_ref, path); 1024 __copy_first_ref, path);
1027 if (ret < 0) 1025 if (ret < 0)
1028 goto out; 1026 goto out;
1029 ret = 0; 1027 ret = 0;
@@ -1314,8 +1312,7 @@ out:
1314 return ret; 1312 return ret;
1315} 1313}
1316 1314
1317static int read_symlink(struct send_ctx *sctx, 1315static int read_symlink(struct btrfs_root *root,
1318 struct btrfs_root *root,
1319 u64 ino, 1316 u64 ino,
1320 struct fs_path *dest) 1317 struct fs_path *dest)
1321{ 1318{
@@ -1562,8 +1559,7 @@ out:
1562 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, 1559 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
1563 * generation of the parent dir and the name of the dir entry. 1560 * generation of the parent dir and the name of the dir entry.
1564 */ 1561 */
1565static int get_first_ref(struct send_ctx *sctx, 1562static int get_first_ref(struct btrfs_root *root, u64 ino,
1566 struct btrfs_root *root, u64 ino,
1567 u64 *dir, u64 *dir_gen, struct fs_path *name) 1563 u64 *dir, u64 *dir_gen, struct fs_path *name)
1568{ 1564{
1569 int ret; 1565 int ret;
@@ -1628,8 +1624,7 @@ out:
1628 return ret; 1624 return ret;
1629} 1625}
1630 1626
1631static int is_first_ref(struct send_ctx *sctx, 1627static int is_first_ref(struct btrfs_root *root,
1632 struct btrfs_root *root,
1633 u64 ino, u64 dir, 1628 u64 ino, u64 dir,
1634 const char *name, int name_len) 1629 const char *name, int name_len)
1635{ 1630{
@@ -1638,11 +1633,11 @@ static int is_first_ref(struct send_ctx *sctx,
1638 u64 tmp_dir; 1633 u64 tmp_dir;
1639 u64 tmp_dir_gen; 1634 u64 tmp_dir_gen;
1640 1635
1641 tmp_name = fs_path_alloc(sctx); 1636 tmp_name = fs_path_alloc();
1642 if (!tmp_name) 1637 if (!tmp_name)
1643 return -ENOMEM; 1638 return -ENOMEM;
1644 1639
1645 ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); 1640 ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
1646 if (ret < 0) 1641 if (ret < 0)
1647 goto out; 1642 goto out;
1648 1643
@@ -1654,7 +1649,7 @@ static int is_first_ref(struct send_ctx *sctx,
1654 ret = !memcmp(tmp_name->start, name, name_len); 1649 ret = !memcmp(tmp_name->start, name, name_len);
1655 1650
1656out: 1651out:
1657 fs_path_free(sctx, tmp_name); 1652 fs_path_free(tmp_name);
1658 return ret; 1653 return ret;
1659} 1654}
1660 1655
@@ -1783,11 +1778,11 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1783 if (!sctx->parent_root) 1778 if (!sctx->parent_root)
1784 goto out; 1779 goto out;
1785 1780
1786 name = fs_path_alloc(sctx); 1781 name = fs_path_alloc();
1787 if (!name) 1782 if (!name)
1788 return -ENOMEM; 1783 return -ENOMEM;
1789 1784
1790 ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name); 1785 ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
1791 if (ret < 0) 1786 if (ret < 0)
1792 goto out; 1787 goto out;
1793 1788
@@ -1795,7 +1790,7 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1795 name->start, fs_path_len(name)); 1790 name->start, fs_path_len(name));
1796 1791
1797out: 1792out:
1798 fs_path_free(sctx, name); 1793 fs_path_free(name);
1799 return ret; 1794 return ret;
1800} 1795}
1801 1796
@@ -1979,11 +1974,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1979 * send_root or parent_root for ref lookup. 1974 * send_root or parent_root for ref lookup.
1980 */ 1975 */
1981 if (ino < sctx->send_progress) 1976 if (ino < sctx->send_progress)
1982 ret = get_first_ref(sctx, sctx->send_root, ino, 1977 ret = get_first_ref(sctx->send_root, ino,
1983 parent_ino, parent_gen, dest); 1978 parent_ino, parent_gen, dest);
1984 else 1979 else
1985 ret = get_first_ref(sctx, sctx->parent_root, ino, 1980 ret = get_first_ref(sctx->parent_root, ino,
1986 parent_ino, parent_gen, dest); 1981 parent_ino, parent_gen, dest);
1987 if (ret < 0) 1982 if (ret < 0)
1988 goto out; 1983 goto out;
1989 1984
@@ -2070,7 +2065,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2070 u64 parent_gen = 0; 2065 u64 parent_gen = 0;
2071 int stop = 0; 2066 int stop = 0;
2072 2067
2073 name = fs_path_alloc(sctx); 2068 name = fs_path_alloc();
2074 if (!name) { 2069 if (!name) {
2075 ret = -ENOMEM; 2070 ret = -ENOMEM;
2076 goto out; 2071 goto out;
@@ -2098,7 +2093,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2098 } 2093 }
2099 2094
2100out: 2095out:
2101 fs_path_free(sctx, name); 2096 fs_path_free(name);
2102 if (!ret) 2097 if (!ret)
2103 fs_path_unreverse(dest); 2098 fs_path_unreverse(dest);
2104 return ret; 2099 return ret;
@@ -2263,7 +2258,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2263 2258
2264verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); 2259verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2265 2260
2266 p = fs_path_alloc(sctx); 2261 p = fs_path_alloc();
2267 if (!p) 2262 if (!p)
2268 return -ENOMEM; 2263 return -ENOMEM;
2269 2264
@@ -2281,7 +2276,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2281 2276
2282tlv_put_failure: 2277tlv_put_failure:
2283out: 2278out:
2284 fs_path_free(sctx, p); 2279 fs_path_free(p);
2285 return ret; 2280 return ret;
2286} 2281}
2287 2282
@@ -2292,7 +2287,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2292 2287
2293verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); 2288verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2294 2289
2295 p = fs_path_alloc(sctx); 2290 p = fs_path_alloc();
2296 if (!p) 2291 if (!p)
2297 return -ENOMEM; 2292 return -ENOMEM;
2298 2293
@@ -2310,7 +2305,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2310 2305
2311tlv_put_failure: 2306tlv_put_failure:
2312out: 2307out:
2313 fs_path_free(sctx, p); 2308 fs_path_free(p);
2314 return ret; 2309 return ret;
2315} 2310}
2316 2311
@@ -2321,7 +2316,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2321 2316
2322verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); 2317verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2323 2318
2324 p = fs_path_alloc(sctx); 2319 p = fs_path_alloc();
2325 if (!p) 2320 if (!p)
2326 return -ENOMEM; 2321 return -ENOMEM;
2327 2322
@@ -2340,7 +2335,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2340 2335
2341tlv_put_failure: 2336tlv_put_failure:
2342out: 2337out:
2343 fs_path_free(sctx, p); 2338 fs_path_free(p);
2344 return ret; 2339 return ret;
2345} 2340}
2346 2341
@@ -2356,7 +2351,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2356 2351
2357verbose_printk("btrfs: send_utimes %llu\n", ino); 2352verbose_printk("btrfs: send_utimes %llu\n", ino);
2358 2353
2359 p = fs_path_alloc(sctx); 2354 p = fs_path_alloc();
2360 if (!p) 2355 if (!p)
2361 return -ENOMEM; 2356 return -ENOMEM;
2362 2357
@@ -2397,7 +2392,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
2397 2392
2398tlv_put_failure: 2393tlv_put_failure:
2399out: 2394out:
2400 fs_path_free(sctx, p); 2395 fs_path_free(p);
2401 btrfs_free_path(path); 2396 btrfs_free_path(path);
2402 return ret; 2397 return ret;
2403} 2398}
@@ -2418,7 +2413,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
2418 2413
2419verbose_printk("btrfs: send_create_inode %llu\n", ino); 2414verbose_printk("btrfs: send_create_inode %llu\n", ino);
2420 2415
2421 p = fs_path_alloc(sctx); 2416 p = fs_path_alloc();
2422 if (!p) 2417 if (!p)
2423 return -ENOMEM; 2418 return -ENOMEM;
2424 2419
@@ -2459,7 +2454,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2459 2454
2460 if (S_ISLNK(mode)) { 2455 if (S_ISLNK(mode)) {
2461 fs_path_reset(p); 2456 fs_path_reset(p);
2462 ret = read_symlink(sctx, sctx->send_root, ino, p); 2457 ret = read_symlink(sctx->send_root, ino, p);
2463 if (ret < 0) 2458 if (ret < 0)
2464 goto out; 2459 goto out;
2465 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); 2460 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
@@ -2476,7 +2471,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2476 2471
2477tlv_put_failure: 2472tlv_put_failure:
2478out: 2473out:
2479 fs_path_free(sctx, p); 2474 fs_path_free(p);
2480 return ret; 2475 return ret;
2481} 2476}
2482 2477
@@ -2615,13 +2610,13 @@ static int record_ref(struct list_head *head, u64 dir,
2615 return 0; 2610 return 0;
2616} 2611}
2617 2612
2618static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) 2613static void __free_recorded_refs(struct list_head *head)
2619{ 2614{
2620 struct recorded_ref *cur; 2615 struct recorded_ref *cur;
2621 2616
2622 while (!list_empty(head)) { 2617 while (!list_empty(head)) {
2623 cur = list_entry(head->next, struct recorded_ref, list); 2618 cur = list_entry(head->next, struct recorded_ref, list);
2624 fs_path_free(sctx, cur->full_path); 2619 fs_path_free(cur->full_path);
2625 list_del(&cur->list); 2620 list_del(&cur->list);
2626 kfree(cur); 2621 kfree(cur);
2627 } 2622 }
@@ -2629,8 +2624,8 @@ static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
2629 2624
2630static void free_recorded_refs(struct send_ctx *sctx) 2625static void free_recorded_refs(struct send_ctx *sctx)
2631{ 2626{
2632 __free_recorded_refs(sctx, &sctx->new_refs); 2627 __free_recorded_refs(&sctx->new_refs);
2633 __free_recorded_refs(sctx, &sctx->deleted_refs); 2628 __free_recorded_refs(&sctx->deleted_refs);
2634} 2629}
2635 2630
2636/* 2631/*
@@ -2644,7 +2639,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2644 int ret; 2639 int ret;
2645 struct fs_path *orphan; 2640 struct fs_path *orphan;
2646 2641
2647 orphan = fs_path_alloc(sctx); 2642 orphan = fs_path_alloc();
2648 if (!orphan) 2643 if (!orphan)
2649 return -ENOMEM; 2644 return -ENOMEM;
2650 2645
@@ -2655,7 +2650,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2655 ret = send_rename(sctx, path, orphan); 2650 ret = send_rename(sctx, path, orphan);
2656 2651
2657out: 2652out:
2658 fs_path_free(sctx, orphan); 2653 fs_path_free(orphan);
2659 return ret; 2654 return ret;
2660} 2655}
2661 2656
@@ -2746,7 +2741,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2746 */ 2741 */
2747 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); 2742 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
2748 2743
2749 valid_path = fs_path_alloc(sctx); 2744 valid_path = fs_path_alloc();
2750 if (!valid_path) { 2745 if (!valid_path) {
2751 ret = -ENOMEM; 2746 ret = -ENOMEM;
2752 goto out; 2747 goto out;
@@ -2843,9 +2838,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2843 if (ret < 0) 2838 if (ret < 0)
2844 goto out; 2839 goto out;
2845 if (ret) { 2840 if (ret) {
2846 ret = is_first_ref(sctx, sctx->parent_root, 2841 ret = is_first_ref(sctx->parent_root,
2847 ow_inode, cur->dir, cur->name, 2842 ow_inode, cur->dir, cur->name,
2848 cur->name_len); 2843 cur->name_len);
2849 if (ret < 0) 2844 if (ret < 0)
2850 goto out; 2845 goto out;
2851 if (ret) { 2846 if (ret) {
@@ -3024,7 +3019,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3024out: 3019out:
3025 free_recorded_refs(sctx); 3020 free_recorded_refs(sctx);
3026 ulist_free(check_dirs); 3021 ulist_free(check_dirs);
3027 fs_path_free(sctx, valid_path); 3022 fs_path_free(valid_path);
3028 return ret; 3023 return ret;
3029} 3024}
3030 3025
@@ -3037,7 +3032,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3037 struct fs_path *p; 3032 struct fs_path *p;
3038 u64 gen; 3033 u64 gen;
3039 3034
3040 p = fs_path_alloc(sctx); 3035 p = fs_path_alloc();
3041 if (!p) 3036 if (!p)
3042 return -ENOMEM; 3037 return -ENOMEM;
3043 3038
@@ -3057,7 +3052,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3057 3052
3058out: 3053out:
3059 if (ret) 3054 if (ret)
3060 fs_path_free(sctx, p); 3055 fs_path_free(p);
3061 return ret; 3056 return ret;
3062} 3057}
3063 3058
@@ -3070,7 +3065,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3070 struct fs_path *p; 3065 struct fs_path *p;
3071 u64 gen; 3066 u64 gen;
3072 3067
3073 p = fs_path_alloc(sctx); 3068 p = fs_path_alloc();
3074 if (!p) 3069 if (!p)
3075 return -ENOMEM; 3070 return -ENOMEM;
3076 3071
@@ -3090,7 +3085,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3090 3085
3091out: 3086out:
3092 if (ret) 3087 if (ret)
3093 fs_path_free(sctx, p); 3088 fs_path_free(p);
3094 return ret; 3089 return ret;
3095} 3090}
3096 3091
@@ -3098,8 +3093,8 @@ static int record_new_ref(struct send_ctx *sctx)
3098{ 3093{
3099 int ret; 3094 int ret;
3100 3095
3101 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, 3096 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
3102 sctx->cmp_key, 0, __record_new_ref, sctx); 3097 sctx->cmp_key, 0, __record_new_ref, sctx);
3103 if (ret < 0) 3098 if (ret < 0)
3104 goto out; 3099 goto out;
3105 ret = 0; 3100 ret = 0;
@@ -3112,8 +3107,8 @@ static int record_deleted_ref(struct send_ctx *sctx)
3112{ 3107{
3113 int ret; 3108 int ret;
3114 3109
3115 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, 3110 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
3116 sctx->cmp_key, 0, __record_deleted_ref, sctx); 3111 sctx->cmp_key, 0, __record_deleted_ref, sctx);
3117 if (ret < 0) 3112 if (ret < 0)
3118 goto out; 3113 goto out;
3119 ret = 0; 3114 ret = 0;
@@ -3142,8 +3137,7 @@ static int __find_iref(int num, u64 dir, int index,
3142 return 0; 3137 return 0;
3143} 3138}
3144 3139
3145static int find_iref(struct send_ctx *sctx, 3140static int find_iref(struct btrfs_root *root,
3146 struct btrfs_root *root,
3147 struct btrfs_path *path, 3141 struct btrfs_path *path,
3148 struct btrfs_key *key, 3142 struct btrfs_key *key,
3149 u64 dir, struct fs_path *name) 3143 u64 dir, struct fs_path *name)
@@ -3155,7 +3149,7 @@ static int find_iref(struct send_ctx *sctx,
3155 ctx.name = name; 3149 ctx.name = name;
3156 ctx.found_idx = -1; 3150 ctx.found_idx = -1;
3157 3151
3158 ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx); 3152 ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
3159 if (ret < 0) 3153 if (ret < 0)
3160 return ret; 3154 return ret;
3161 3155
@@ -3172,7 +3166,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index,
3172 int ret; 3166 int ret;
3173 struct send_ctx *sctx = ctx; 3167 struct send_ctx *sctx = ctx;
3174 3168
3175 ret = find_iref(sctx, sctx->parent_root, sctx->right_path, 3169 ret = find_iref(sctx->parent_root, sctx->right_path,
3176 sctx->cmp_key, dir, name); 3170 sctx->cmp_key, dir, name);
3177 if (ret == -ENOENT) 3171 if (ret == -ENOENT)
3178 ret = __record_new_ref(num, dir, index, name, sctx); 3172 ret = __record_new_ref(num, dir, index, name, sctx);
@@ -3189,7 +3183,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index,
3189 int ret; 3183 int ret;
3190 struct send_ctx *sctx = ctx; 3184 struct send_ctx *sctx = ctx;
3191 3185
3192 ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, 3186 ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
3193 dir, name); 3187 dir, name);
3194 if (ret == -ENOENT) 3188 if (ret == -ENOENT)
3195 ret = __record_deleted_ref(num, dir, index, name, sctx); 3189 ret = __record_deleted_ref(num, dir, index, name, sctx);
@@ -3203,11 +3197,11 @@ static int record_changed_ref(struct send_ctx *sctx)
3203{ 3197{
3204 int ret = 0; 3198 int ret = 0;
3205 3199
3206 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, 3200 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
3207 sctx->cmp_key, 0, __record_changed_new_ref, sctx); 3201 sctx->cmp_key, 0, __record_changed_new_ref, sctx);
3208 if (ret < 0) 3202 if (ret < 0)
3209 goto out; 3203 goto out;
3210 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, 3204 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
3211 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); 3205 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
3212 if (ret < 0) 3206 if (ret < 0)
3213 goto out; 3207 goto out;
@@ -3266,8 +3260,7 @@ static int process_all_refs(struct send_ctx *sctx,
3266 found_key.type != BTRFS_INODE_EXTREF_KEY)) 3260 found_key.type != BTRFS_INODE_EXTREF_KEY))
3267 break; 3261 break;
3268 3262
3269 ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, 3263 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
3270 sctx);
3271 btrfs_release_path(path); 3264 btrfs_release_path(path);
3272 if (ret < 0) 3265 if (ret < 0)
3273 goto out; 3266 goto out;
@@ -3335,7 +3328,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
3335 struct fs_path *p; 3328 struct fs_path *p;
3336 posix_acl_xattr_header dummy_acl; 3329 posix_acl_xattr_header dummy_acl;
3337 3330
3338 p = fs_path_alloc(sctx); 3331 p = fs_path_alloc();
3339 if (!p) 3332 if (!p)
3340 return -ENOMEM; 3333 return -ENOMEM;
3341 3334
@@ -3362,7 +3355,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
3362 ret = send_set_xattr(sctx, p, name, name_len, data, data_len); 3355 ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
3363 3356
3364out: 3357out:
3365 fs_path_free(sctx, p); 3358 fs_path_free(p);
3366 return ret; 3359 return ret;
3367} 3360}
3368 3361
@@ -3375,7 +3368,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3375 struct send_ctx *sctx = ctx; 3368 struct send_ctx *sctx = ctx;
3376 struct fs_path *p; 3369 struct fs_path *p;
3377 3370
3378 p = fs_path_alloc(sctx); 3371 p = fs_path_alloc();
3379 if (!p) 3372 if (!p)
3380 return -ENOMEM; 3373 return -ENOMEM;
3381 3374
@@ -3386,7 +3379,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3386 ret = send_remove_xattr(sctx, p, name, name_len); 3379 ret = send_remove_xattr(sctx, p, name, name_len);
3387 3380
3388out: 3381out:
3389 fs_path_free(sctx, p); 3382 fs_path_free(p);
3390 return ret; 3383 return ret;
3391} 3384}
3392 3385
@@ -3394,8 +3387,8 @@ static int process_new_xattr(struct send_ctx *sctx)
3394{ 3387{
3395 int ret = 0; 3388 int ret = 0;
3396 3389
3397 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, 3390 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
3398 sctx->cmp_key, __process_new_xattr, sctx); 3391 sctx->cmp_key, __process_new_xattr, sctx);
3399 3392
3400 return ret; 3393 return ret;
3401} 3394}
@@ -3404,8 +3397,8 @@ static int process_deleted_xattr(struct send_ctx *sctx)
3404{ 3397{
3405 int ret; 3398 int ret;
3406 3399
3407 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, 3400 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
3408 sctx->cmp_key, __process_deleted_xattr, sctx); 3401 sctx->cmp_key, __process_deleted_xattr, sctx);
3409 3402
3410 return ret; 3403 return ret;
3411} 3404}
@@ -3429,17 +3422,15 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
3429 strncmp(name, ctx->name, name_len) == 0) { 3422 strncmp(name, ctx->name, name_len) == 0) {
3430 ctx->found_idx = num; 3423 ctx->found_idx = num;
3431 ctx->found_data_len = data_len; 3424 ctx->found_data_len = data_len;
3432 ctx->found_data = kmalloc(data_len, GFP_NOFS); 3425 ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
3433 if (!ctx->found_data) 3426 if (!ctx->found_data)
3434 return -ENOMEM; 3427 return -ENOMEM;
3435 memcpy(ctx->found_data, data, data_len);
3436 return 1; 3428 return 1;
3437 } 3429 }
3438 return 0; 3430 return 0;
3439} 3431}
3440 3432
3441static int find_xattr(struct send_ctx *sctx, 3433static int find_xattr(struct btrfs_root *root,
3442 struct btrfs_root *root,
3443 struct btrfs_path *path, 3434 struct btrfs_path *path,
3444 struct btrfs_key *key, 3435 struct btrfs_key *key,
3445 const char *name, int name_len, 3436 const char *name, int name_len,
@@ -3454,7 +3445,7 @@ static int find_xattr(struct send_ctx *sctx,
3454 ctx.found_data = NULL; 3445 ctx.found_data = NULL;
3455 ctx.found_data_len = 0; 3446 ctx.found_data_len = 0;
3456 3447
3457 ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx); 3448 ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
3458 if (ret < 0) 3449 if (ret < 0)
3459 return ret; 3450 return ret;
3460 3451
@@ -3480,9 +3471,9 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
3480 char *found_data = NULL; 3471 char *found_data = NULL;
3481 int found_data_len = 0; 3472 int found_data_len = 0;
3482 3473
3483 ret = find_xattr(sctx, sctx->parent_root, sctx->right_path, 3474 ret = find_xattr(sctx->parent_root, sctx->right_path,
3484 sctx->cmp_key, name, name_len, &found_data, 3475 sctx->cmp_key, name, name_len, &found_data,
3485 &found_data_len); 3476 &found_data_len);
3486 if (ret == -ENOENT) { 3477 if (ret == -ENOENT) {
3487 ret = __process_new_xattr(num, di_key, name, name_len, data, 3478 ret = __process_new_xattr(num, di_key, name, name_len, data,
3488 data_len, type, ctx); 3479 data_len, type, ctx);
@@ -3508,8 +3499,8 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
3508 int ret; 3499 int ret;
3509 struct send_ctx *sctx = ctx; 3500 struct send_ctx *sctx = ctx;
3510 3501
3511 ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, 3502 ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
3512 name, name_len, NULL, NULL); 3503 name, name_len, NULL, NULL);
3513 if (ret == -ENOENT) 3504 if (ret == -ENOENT)
3514 ret = __process_deleted_xattr(num, di_key, name, name_len, data, 3505 ret = __process_deleted_xattr(num, di_key, name, name_len, data,
3515 data_len, type, ctx); 3506 data_len, type, ctx);
@@ -3523,11 +3514,11 @@ static int process_changed_xattr(struct send_ctx *sctx)
3523{ 3514{
3524 int ret = 0; 3515 int ret = 0;
3525 3516
3526 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, 3517 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
3527 sctx->cmp_key, __process_changed_new_xattr, sctx); 3518 sctx->cmp_key, __process_changed_new_xattr, sctx);
3528 if (ret < 0) 3519 if (ret < 0)
3529 goto out; 3520 goto out;
3530 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, 3521 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
3531 sctx->cmp_key, __process_changed_deleted_xattr, sctx); 3522 sctx->cmp_key, __process_changed_deleted_xattr, sctx);
3532 3523
3533out: 3524out:
@@ -3572,8 +3563,8 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3572 goto out; 3563 goto out;
3573 } 3564 }
3574 3565
3575 ret = iterate_dir_item(sctx, root, path, &found_key, 3566 ret = iterate_dir_item(root, path, &found_key,
3576 __process_new_xattr, sctx); 3567 __process_new_xattr, sctx);
3577 if (ret < 0) 3568 if (ret < 0)
3578 goto out; 3569 goto out;
3579 3570
@@ -3598,7 +3589,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
3598 int num_read = 0; 3589 int num_read = 0;
3599 mm_segment_t old_fs; 3590 mm_segment_t old_fs;
3600 3591
3601 p = fs_path_alloc(sctx); 3592 p = fs_path_alloc();
3602 if (!p) 3593 if (!p)
3603 return -ENOMEM; 3594 return -ENOMEM;
3604 3595
@@ -3640,7 +3631,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
3640 3631
3641tlv_put_failure: 3632tlv_put_failure:
3642out: 3633out:
3643 fs_path_free(sctx, p); 3634 fs_path_free(p);
3644 set_fs(old_fs); 3635 set_fs(old_fs);
3645 if (ret < 0) 3636 if (ret < 0)
3646 return ret; 3637 return ret;
@@ -3663,7 +3654,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3663 clone_root->root->objectid, clone_root->ino, 3654 clone_root->root->objectid, clone_root->ino,
3664 clone_root->offset); 3655 clone_root->offset);
3665 3656
3666 p = fs_path_alloc(sctx); 3657 p = fs_path_alloc();
3667 if (!p) 3658 if (!p)
3668 return -ENOMEM; 3659 return -ENOMEM;
3669 3660
@@ -3686,8 +3677,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3686 goto out; 3677 goto out;
3687 ret = get_cur_path(sctx, clone_root->ino, gen, p); 3678 ret = get_cur_path(sctx, clone_root->ino, gen, p);
3688 } else { 3679 } else {
3689 ret = get_inode_path(sctx, clone_root->root, 3680 ret = get_inode_path(clone_root->root, clone_root->ino, p);
3690 clone_root->ino, p);
3691 } 3681 }
3692 if (ret < 0) 3682 if (ret < 0)
3693 goto out; 3683 goto out;
@@ -3704,7 +3694,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3704 3694
3705tlv_put_failure: 3695tlv_put_failure:
3706out: 3696out:
3707 fs_path_free(sctx, p); 3697 fs_path_free(p);
3708 return ret; 3698 return ret;
3709} 3699}
3710 3700
@@ -3717,7 +3707,7 @@ static int send_update_extent(struct send_ctx *sctx,
3717 int ret = 0; 3707 int ret = 0;
3718 struct fs_path *p; 3708 struct fs_path *p;
3719 3709
3720 p = fs_path_alloc(sctx); 3710 p = fs_path_alloc();
3721 if (!p) 3711 if (!p)
3722 return -ENOMEM; 3712 return -ENOMEM;
3723 3713
@@ -3737,7 +3727,7 @@ static int send_update_extent(struct send_ctx *sctx,
3737 3727
3738tlv_put_failure: 3728tlv_put_failure:
3739out: 3729out:
3740 fs_path_free(sctx, p); 3730 fs_path_free(p);
3741 return ret; 3731 return ret;
3742} 3732}
3743 3733
@@ -4579,6 +4569,41 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4579 send_root = BTRFS_I(file_inode(mnt_file))->root; 4569 send_root = BTRFS_I(file_inode(mnt_file))->root;
4580 fs_info = send_root->fs_info; 4570 fs_info = send_root->fs_info;
4581 4571
4572 /*
4573 * This is done when we lookup the root, it should already be complete
4574 * by the time we get here.
4575 */
4576 WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
4577
4578 /*
4579 * If we just created this root we need to make sure that the orphan
4580 * cleanup has been done and committed since we search the commit root,
4581 * so check its commit root transid with our otransid and if they match
4582 * commit the transaction to make sure everything is updated.
4583 */
4584 down_read(&send_root->fs_info->extent_commit_sem);
4585 if (btrfs_header_generation(send_root->commit_root) ==
4586 btrfs_root_otransid(&send_root->root_item)) {
4587 struct btrfs_trans_handle *trans;
4588
4589 up_read(&send_root->fs_info->extent_commit_sem);
4590
4591 trans = btrfs_attach_transaction_barrier(send_root);
4592 if (IS_ERR(trans)) {
4593 if (PTR_ERR(trans) != -ENOENT) {
4594 ret = PTR_ERR(trans);
4595 goto out;
4596 }
4597 /* ENOENT means theres no transaction */
4598 } else {
4599 ret = btrfs_commit_transaction(trans, send_root);
4600 if (ret)
4601 goto out;
4602 }
4603 } else {
4604 up_read(&send_root->fs_info->extent_commit_sem);
4605 }
4606
4582 arg = memdup_user(arg_, sizeof(*arg)); 4607 arg = memdup_user(arg_, sizeof(*arg));
4583 if (IS_ERR(arg)) { 4608 if (IS_ERR(arg)) {
4584 ret = PTR_ERR(arg); 4609 ret = PTR_ERR(arg);
@@ -4663,10 +4688,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4663 key.type = BTRFS_ROOT_ITEM_KEY; 4688 key.type = BTRFS_ROOT_ITEM_KEY;
4664 key.offset = (u64)-1; 4689 key.offset = (u64)-1;
4665 clone_root = btrfs_read_fs_root_no_name(fs_info, &key); 4690 clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
4666 if (!clone_root) {
4667 ret = -EINVAL;
4668 goto out;
4669 }
4670 if (IS_ERR(clone_root)) { 4691 if (IS_ERR(clone_root)) {
4671 ret = PTR_ERR(clone_root); 4692 ret = PTR_ERR(clone_root);
4672 goto out; 4693 goto out;
@@ -4682,8 +4703,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4682 key.type = BTRFS_ROOT_ITEM_KEY; 4703 key.type = BTRFS_ROOT_ITEM_KEY;
4683 key.offset = (u64)-1; 4704 key.offset = (u64)-1;
4684 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); 4705 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
4685 if (!sctx->parent_root) { 4706 if (IS_ERR(sctx->parent_root)) {
4686 ret = -EINVAL; 4707 ret = PTR_ERR(sctx->parent_root);
4687 goto out; 4708 goto out;
4688 } 4709 }
4689 } 4710 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f0857e092a3c..8eb6191d86da 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,7 +51,6 @@
51#include "print-tree.h" 51#include "print-tree.h"
52#include "xattr.h" 52#include "xattr.h"
53#include "volumes.h" 53#include "volumes.h"
54#include "version.h"
55#include "export.h" 54#include "export.h"
56#include "compression.h" 55#include "compression.h"
57#include "rcu-string.h" 56#include "rcu-string.h"
@@ -266,6 +265,9 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
266 return; 265 return;
267 } 266 }
268 ACCESS_ONCE(trans->transaction->aborted) = errno; 267 ACCESS_ONCE(trans->transaction->aborted) = errno;
268 /* Wake up anybody who may be waiting on this transaction */
269 wake_up(&root->fs_info->transaction_wait);
270 wake_up(&root->fs_info->transaction_blocked_wait);
269 __btrfs_std_error(root->fs_info, function, line, errno, NULL); 271 __btrfs_std_error(root->fs_info, function, line, errno, NULL);
270} 272}
271/* 273/*
@@ -776,9 +778,6 @@ find_root:
776 if (IS_ERR(new_root)) 778 if (IS_ERR(new_root))
777 return ERR_CAST(new_root); 779 return ERR_CAST(new_root);
778 780
779 if (btrfs_root_refs(&new_root->root_item) == 0)
780 return ERR_PTR(-ENOENT);
781
782 dir_id = btrfs_root_dirid(&new_root->root_item); 781 dir_id = btrfs_root_dirid(&new_root->root_item);
783setup_root: 782setup_root:
784 location.objectid = dir_id; 783 location.objectid = dir_id;
@@ -866,7 +865,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
866 return 0; 865 return 0;
867 } 866 }
868 867
869 btrfs_wait_ordered_extents(root, 1); 868 btrfs_wait_all_ordered_extents(fs_info, 1);
870 869
871 trans = btrfs_attach_transaction_barrier(root); 870 trans = btrfs_attach_transaction_barrier(root);
872 if (IS_ERR(trans)) { 871 if (IS_ERR(trans)) {
@@ -1685,6 +1684,18 @@ static void btrfs_interface_exit(void)
1685 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); 1684 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
1686} 1685}
1687 1686
1687static void btrfs_print_info(void)
1688{
1689 printk(KERN_INFO "Btrfs loaded"
1690#ifdef CONFIG_BTRFS_DEBUG
1691 ", debug=on"
1692#endif
1693#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1694 ", integrity-checker=on"
1695#endif
1696 "\n");
1697}
1698
1688static int __init init_btrfs_fs(void) 1699static int __init init_btrfs_fs(void)
1689{ 1700{
1690 int err; 1701 int err;
@@ -1733,11 +1744,9 @@ static int __init init_btrfs_fs(void)
1733 1744
1734 btrfs_init_lockdep(); 1745 btrfs_init_lockdep();
1735 1746
1736#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1747 btrfs_print_info();
1737 btrfs_test_free_space_cache(); 1748 btrfs_test_free_space_cache();
1738#endif
1739 1749
1740 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
1741 return 0; 1750 return 0;
1742 1751
1743unregister_ioctl: 1752unregister_ioctl:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0544587d74f4..d58cce77fc6c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -34,12 +34,43 @@
34 34
35#define BTRFS_ROOT_TRANS_TAG 0 35#define BTRFS_ROOT_TRANS_TAG 0
36 36
37static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
38 [TRANS_STATE_RUNNING] = 0U,
39 [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
40 __TRANS_START),
41 [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE |
42 __TRANS_START |
43 __TRANS_ATTACH),
44 [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE |
45 __TRANS_START |
46 __TRANS_ATTACH |
47 __TRANS_JOIN),
48 [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE |
49 __TRANS_START |
50 __TRANS_ATTACH |
51 __TRANS_JOIN |
52 __TRANS_JOIN_NOLOCK),
53 [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE |
54 __TRANS_START |
55 __TRANS_ATTACH |
56 __TRANS_JOIN |
57 __TRANS_JOIN_NOLOCK),
58};
59
37static void put_transaction(struct btrfs_transaction *transaction) 60static void put_transaction(struct btrfs_transaction *transaction)
38{ 61{
39 WARN_ON(atomic_read(&transaction->use_count) == 0); 62 WARN_ON(atomic_read(&transaction->use_count) == 0);
40 if (atomic_dec_and_test(&transaction->use_count)) { 63 if (atomic_dec_and_test(&transaction->use_count)) {
41 BUG_ON(!list_empty(&transaction->list)); 64 BUG_ON(!list_empty(&transaction->list));
42 WARN_ON(transaction->delayed_refs.root.rb_node); 65 WARN_ON(transaction->delayed_refs.root.rb_node);
66 while (!list_empty(&transaction->pending_chunks)) {
67 struct extent_map *em;
68
69 em = list_first_entry(&transaction->pending_chunks,
70 struct extent_map, list);
71 list_del_init(&em->list);
72 free_extent_map(em);
73 }
43 kmem_cache_free(btrfs_transaction_cachep, transaction); 74 kmem_cache_free(btrfs_transaction_cachep, transaction);
44 } 75 }
45} 76}
@@ -50,18 +81,35 @@ static noinline void switch_commit_root(struct btrfs_root *root)
50 root->commit_root = btrfs_root_node(root); 81 root->commit_root = btrfs_root_node(root);
51} 82}
52 83
53static inline int can_join_transaction(struct btrfs_transaction *trans, 84static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
54 int type) 85 unsigned int type)
86{
87 if (type & TRANS_EXTWRITERS)
88 atomic_inc(&trans->num_extwriters);
89}
90
91static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
92 unsigned int type)
93{
94 if (type & TRANS_EXTWRITERS)
95 atomic_dec(&trans->num_extwriters);
96}
97
98static inline void extwriter_counter_init(struct btrfs_transaction *trans,
99 unsigned int type)
100{
101 atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
102}
103
104static inline int extwriter_counter_read(struct btrfs_transaction *trans)
55{ 105{
56 return !(trans->in_commit && 106 return atomic_read(&trans->num_extwriters);
57 type != TRANS_JOIN &&
58 type != TRANS_JOIN_NOLOCK);
59} 107}
60 108
61/* 109/*
62 * either allocate a new transaction or hop into the existing one 110 * either allocate a new transaction or hop into the existing one
63 */ 111 */
64static noinline int join_transaction(struct btrfs_root *root, int type) 112static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
65{ 113{
66 struct btrfs_transaction *cur_trans; 114 struct btrfs_transaction *cur_trans;
67 struct btrfs_fs_info *fs_info = root->fs_info; 115 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -74,32 +122,19 @@ loop:
74 return -EROFS; 122 return -EROFS;
75 } 123 }
76 124
77 if (fs_info->trans_no_join) {
78 /*
79 * If we are JOIN_NOLOCK we're already committing a current
80 * transaction, we just need a handle to deal with something
81 * when committing the transaction, such as inode cache and
82 * space cache. It is a special case.
83 */
84 if (type != TRANS_JOIN_NOLOCK) {
85 spin_unlock(&fs_info->trans_lock);
86 return -EBUSY;
87 }
88 }
89
90 cur_trans = fs_info->running_transaction; 125 cur_trans = fs_info->running_transaction;
91 if (cur_trans) { 126 if (cur_trans) {
92 if (cur_trans->aborted) { 127 if (cur_trans->aborted) {
93 spin_unlock(&fs_info->trans_lock); 128 spin_unlock(&fs_info->trans_lock);
94 return cur_trans->aborted; 129 return cur_trans->aborted;
95 } 130 }
96 if (!can_join_transaction(cur_trans, type)) { 131 if (btrfs_blocked_trans_types[cur_trans->state] & type) {
97 spin_unlock(&fs_info->trans_lock); 132 spin_unlock(&fs_info->trans_lock);
98 return -EBUSY; 133 return -EBUSY;
99 } 134 }
100 atomic_inc(&cur_trans->use_count); 135 atomic_inc(&cur_trans->use_count);
101 atomic_inc(&cur_trans->num_writers); 136 atomic_inc(&cur_trans->num_writers);
102 cur_trans->num_joined++; 137 extwriter_counter_inc(cur_trans, type);
103 spin_unlock(&fs_info->trans_lock); 138 spin_unlock(&fs_info->trans_lock);
104 return 0; 139 return 0;
105 } 140 }
@@ -112,6 +147,12 @@ loop:
112 if (type == TRANS_ATTACH) 147 if (type == TRANS_ATTACH)
113 return -ENOENT; 148 return -ENOENT;
114 149
150 /*
151 * JOIN_NOLOCK only happens during the transaction commit, so
152 * it is impossible that ->running_transaction is NULL
153 */
154 BUG_ON(type == TRANS_JOIN_NOLOCK);
155
115 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 156 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
116 if (!cur_trans) 157 if (!cur_trans)
117 return -ENOMEM; 158 return -ENOMEM;
@@ -120,7 +161,7 @@ loop:
120 if (fs_info->running_transaction) { 161 if (fs_info->running_transaction) {
121 /* 162 /*
122 * someone started a transaction after we unlocked. Make sure 163 * someone started a transaction after we unlocked. Make sure
123 * to redo the trans_no_join checks above 164 * to redo the checks above
124 */ 165 */
125 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 166 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
126 goto loop; 167 goto loop;
@@ -131,17 +172,15 @@ loop:
131 } 172 }
132 173
133 atomic_set(&cur_trans->num_writers, 1); 174 atomic_set(&cur_trans->num_writers, 1);
134 cur_trans->num_joined = 0; 175 extwriter_counter_init(cur_trans, type);
135 init_waitqueue_head(&cur_trans->writer_wait); 176 init_waitqueue_head(&cur_trans->writer_wait);
136 init_waitqueue_head(&cur_trans->commit_wait); 177 init_waitqueue_head(&cur_trans->commit_wait);
137 cur_trans->in_commit = 0; 178 cur_trans->state = TRANS_STATE_RUNNING;
138 cur_trans->blocked = 0;
139 /* 179 /*
140 * One for this trans handle, one so it will live on until we 180 * One for this trans handle, one so it will live on until we
141 * commit the transaction. 181 * commit the transaction.
142 */ 182 */
143 atomic_set(&cur_trans->use_count, 2); 183 atomic_set(&cur_trans->use_count, 2);
144 cur_trans->commit_done = 0;
145 cur_trans->start_time = get_seconds(); 184 cur_trans->start_time = get_seconds();
146 185
147 cur_trans->delayed_refs.root = RB_ROOT; 186 cur_trans->delayed_refs.root = RB_ROOT;
@@ -164,7 +203,6 @@ loop:
164 "creating a fresh transaction\n"); 203 "creating a fresh transaction\n");
165 atomic64_set(&fs_info->tree_mod_seq, 0); 204 atomic64_set(&fs_info->tree_mod_seq, 0);
166 205
167 spin_lock_init(&cur_trans->commit_lock);
168 spin_lock_init(&cur_trans->delayed_refs.lock); 206 spin_lock_init(&cur_trans->delayed_refs.lock);
169 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); 207 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
170 atomic_set(&cur_trans->delayed_refs.ref_seq, 0); 208 atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
@@ -172,6 +210,7 @@ loop:
172 210
173 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 211 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
174 INIT_LIST_HEAD(&cur_trans->ordered_operations); 212 INIT_LIST_HEAD(&cur_trans->ordered_operations);
213 INIT_LIST_HEAD(&cur_trans->pending_chunks);
175 list_add_tail(&cur_trans->list, &fs_info->trans_list); 214 list_add_tail(&cur_trans->list, &fs_info->trans_list);
176 extent_io_tree_init(&cur_trans->dirty_pages, 215 extent_io_tree_init(&cur_trans->dirty_pages,
177 fs_info->btree_inode->i_mapping); 216 fs_info->btree_inode->i_mapping);
@@ -269,6 +308,13 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
269 return 0; 308 return 0;
270} 309}
271 310
311static inline int is_transaction_blocked(struct btrfs_transaction *trans)
312{
313 return (trans->state >= TRANS_STATE_BLOCKED &&
314 trans->state < TRANS_STATE_UNBLOCKED &&
315 !trans->aborted);
316}
317
272/* wait for commit against the current transaction to become unblocked 318/* wait for commit against the current transaction to become unblocked
273 * when this is done, it is safe to start a new transaction, but the current 319 * when this is done, it is safe to start a new transaction, but the current
274 * transaction might not be fully on disk. 320 * transaction might not be fully on disk.
@@ -279,12 +325,13 @@ static void wait_current_trans(struct btrfs_root *root)
279 325
280 spin_lock(&root->fs_info->trans_lock); 326 spin_lock(&root->fs_info->trans_lock);
281 cur_trans = root->fs_info->running_transaction; 327 cur_trans = root->fs_info->running_transaction;
282 if (cur_trans && cur_trans->blocked) { 328 if (cur_trans && is_transaction_blocked(cur_trans)) {
283 atomic_inc(&cur_trans->use_count); 329 atomic_inc(&cur_trans->use_count);
284 spin_unlock(&root->fs_info->trans_lock); 330 spin_unlock(&root->fs_info->trans_lock);
285 331
286 wait_event(root->fs_info->transaction_wait, 332 wait_event(root->fs_info->transaction_wait,
287 !cur_trans->blocked); 333 cur_trans->state >= TRANS_STATE_UNBLOCKED ||
334 cur_trans->aborted);
288 put_transaction(cur_trans); 335 put_transaction(cur_trans);
289 } else { 336 } else {
290 spin_unlock(&root->fs_info->trans_lock); 337 spin_unlock(&root->fs_info->trans_lock);
@@ -307,7 +354,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
307} 354}
308 355
309static struct btrfs_trans_handle * 356static struct btrfs_trans_handle *
310start_transaction(struct btrfs_root *root, u64 num_items, int type, 357start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
311 enum btrfs_reserve_flush_enum flush) 358 enum btrfs_reserve_flush_enum flush)
312{ 359{
313 struct btrfs_trans_handle *h; 360 struct btrfs_trans_handle *h;
@@ -320,7 +367,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
320 return ERR_PTR(-EROFS); 367 return ERR_PTR(-EROFS);
321 368
322 if (current->journal_info) { 369 if (current->journal_info) {
323 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 370 WARN_ON(type & TRANS_EXTWRITERS);
324 h = current->journal_info; 371 h = current->journal_info;
325 h->use_count++; 372 h->use_count++;
326 WARN_ON(h->use_count > 2); 373 WARN_ON(h->use_count > 2);
@@ -366,7 +413,7 @@ again:
366 * If we are ATTACH, it means we just want to catch the current 413 * If we are ATTACH, it means we just want to catch the current
367 * transaction and commit it, so we needn't do sb_start_intwrite(). 414 * transaction and commit it, so we needn't do sb_start_intwrite().
368 */ 415 */
369 if (type < TRANS_JOIN_NOLOCK) 416 if (type & __TRANS_FREEZABLE)
370 sb_start_intwrite(root->fs_info->sb); 417 sb_start_intwrite(root->fs_info->sb);
371 418
372 if (may_wait_transaction(root, type)) 419 if (may_wait_transaction(root, type))
@@ -408,7 +455,8 @@ again:
408 INIT_LIST_HEAD(&h->new_bgs); 455 INIT_LIST_HEAD(&h->new_bgs);
409 456
410 smp_mb(); 457 smp_mb();
411 if (cur_trans->blocked && may_wait_transaction(root, type)) { 458 if (cur_trans->state >= TRANS_STATE_BLOCKED &&
459 may_wait_transaction(root, type)) {
412 btrfs_commit_transaction(h, root); 460 btrfs_commit_transaction(h, root);
413 goto again; 461 goto again;
414 } 462 }
@@ -429,7 +477,7 @@ got_it:
429 return h; 477 return h;
430 478
431join_fail: 479join_fail:
432 if (type < TRANS_JOIN_NOLOCK) 480 if (type & __TRANS_FREEZABLE)
433 sb_end_intwrite(root->fs_info->sb); 481 sb_end_intwrite(root->fs_info->sb);
434 kmem_cache_free(btrfs_trans_handle_cachep, h); 482 kmem_cache_free(btrfs_trans_handle_cachep, h);
435alloc_fail: 483alloc_fail:
@@ -490,7 +538,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
490} 538}
491 539
492/* 540/*
493 * btrfs_attach_transaction() - catch the running transaction 541 * btrfs_attach_transaction_barrier() - catch the running transaction
494 * 542 *
495 * It is similar to the above function, the differentia is this one 543 * It is similar to the above function, the differentia is this one
496 * will wait for all the inactive transactions until they fully 544 * will wait for all the inactive transactions until they fully
@@ -512,7 +560,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
512static noinline void wait_for_commit(struct btrfs_root *root, 560static noinline void wait_for_commit(struct btrfs_root *root,
513 struct btrfs_transaction *commit) 561 struct btrfs_transaction *commit)
514{ 562{
515 wait_event(commit->commit_wait, commit->commit_done); 563 wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
516} 564}
517 565
518int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 566int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -548,8 +596,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
548 spin_lock(&root->fs_info->trans_lock); 596 spin_lock(&root->fs_info->trans_lock);
549 list_for_each_entry_reverse(t, &root->fs_info->trans_list, 597 list_for_each_entry_reverse(t, &root->fs_info->trans_list,
550 list) { 598 list) {
551 if (t->in_commit) { 599 if (t->state >= TRANS_STATE_COMMIT_START) {
552 if (t->commit_done) 600 if (t->state == TRANS_STATE_COMPLETED)
553 break; 601 break;
554 cur_trans = t; 602 cur_trans = t;
555 atomic_inc(&cur_trans->use_count); 603 atomic_inc(&cur_trans->use_count);
@@ -576,10 +624,11 @@ void btrfs_throttle(struct btrfs_root *root)
576static int should_end_transaction(struct btrfs_trans_handle *trans, 624static int should_end_transaction(struct btrfs_trans_handle *trans,
577 struct btrfs_root *root) 625 struct btrfs_root *root)
578{ 626{
579 int ret; 627 if (root->fs_info->global_block_rsv.space_info->full &&
628 btrfs_should_throttle_delayed_refs(trans, root))
629 return 1;
580 630
581 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); 631 return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
582 return ret ? 1 : 0;
583} 632}
584 633
585int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, 634int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
@@ -590,7 +639,8 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
590 int err; 639 int err;
591 640
592 smp_mb(); 641 smp_mb();
593 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 642 if (cur_trans->state >= TRANS_STATE_BLOCKED ||
643 cur_trans->delayed_refs.flushing)
594 return 1; 644 return 1;
595 645
596 updates = trans->delayed_ref_updates; 646 updates = trans->delayed_ref_updates;
@@ -609,7 +659,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
609{ 659{
610 struct btrfs_transaction *cur_trans = trans->transaction; 660 struct btrfs_transaction *cur_trans = trans->transaction;
611 struct btrfs_fs_info *info = root->fs_info; 661 struct btrfs_fs_info *info = root->fs_info;
612 int count = 0; 662 unsigned long cur = trans->delayed_ref_updates;
613 int lock = (trans->type != TRANS_JOIN_NOLOCK); 663 int lock = (trans->type != TRANS_JOIN_NOLOCK);
614 int err = 0; 664 int err = 0;
615 665
@@ -638,17 +688,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
638 if (!list_empty(&trans->new_bgs)) 688 if (!list_empty(&trans->new_bgs))
639 btrfs_create_pending_block_groups(trans, root); 689 btrfs_create_pending_block_groups(trans, root);
640 690
641 while (count < 1) { 691 trans->delayed_ref_updates = 0;
642 unsigned long cur = trans->delayed_ref_updates; 692 if (btrfs_should_throttle_delayed_refs(trans, root)) {
693 cur = max_t(unsigned long, cur, 1);
643 trans->delayed_ref_updates = 0; 694 trans->delayed_ref_updates = 0;
644 if (cur && 695 btrfs_run_delayed_refs(trans, root, cur);
645 trans->transaction->delayed_refs.num_heads_ready > 64) {
646 trans->delayed_ref_updates = 0;
647 btrfs_run_delayed_refs(trans, root, cur);
648 } else {
649 break;
650 }
651 count++;
652 } 696 }
653 697
654 btrfs_trans_release_metadata(trans, root); 698 btrfs_trans_release_metadata(trans, root);
@@ -658,12 +702,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
658 btrfs_create_pending_block_groups(trans, root); 702 btrfs_create_pending_block_groups(trans, root);
659 703
660 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 704 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
661 should_end_transaction(trans, root)) { 705 should_end_transaction(trans, root) &&
662 trans->transaction->blocked = 1; 706 ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
663 smp_wmb(); 707 spin_lock(&info->trans_lock);
708 if (cur_trans->state == TRANS_STATE_RUNNING)
709 cur_trans->state = TRANS_STATE_BLOCKED;
710 spin_unlock(&info->trans_lock);
664 } 711 }
665 712
666 if (lock && cur_trans->blocked && !cur_trans->in_commit) { 713 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
667 if (throttle) { 714 if (throttle) {
668 /* 715 /*
669 * We may race with somebody else here so end up having 716 * We may race with somebody else here so end up having
@@ -677,12 +724,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
677 } 724 }
678 } 725 }
679 726
680 if (trans->type < TRANS_JOIN_NOLOCK) 727 if (trans->type & __TRANS_FREEZABLE)
681 sb_end_intwrite(root->fs_info->sb); 728 sb_end_intwrite(root->fs_info->sb);
682 729
683 WARN_ON(cur_trans != info->running_transaction); 730 WARN_ON(cur_trans != info->running_transaction);
684 WARN_ON(atomic_read(&cur_trans->num_writers) < 1); 731 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
685 atomic_dec(&cur_trans->num_writers); 732 atomic_dec(&cur_trans->num_writers);
733 extwriter_counter_dec(cur_trans, trans->type);
686 734
687 smp_mb(); 735 smp_mb();
688 if (waitqueue_active(&cur_trans->writer_wait)) 736 if (waitqueue_active(&cur_trans->writer_wait))
@@ -736,9 +784,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
736 struct extent_state *cached_state = NULL; 784 struct extent_state *cached_state = NULL;
737 u64 start = 0; 785 u64 start = 0;
738 u64 end; 786 u64 end;
739 struct blk_plug plug;
740 787
741 blk_start_plug(&plug);
742 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 788 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
743 mark, &cached_state)) { 789 mark, &cached_state)) {
744 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 790 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -752,7 +798,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
752 } 798 }
753 if (err) 799 if (err)
754 werr = err; 800 werr = err;
755 blk_finish_plug(&plug);
756 return werr; 801 return werr;
757} 802}
758 803
@@ -797,8 +842,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
797{ 842{
798 int ret; 843 int ret;
799 int ret2; 844 int ret2;
845 struct blk_plug plug;
800 846
847 blk_start_plug(&plug);
801 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 848 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
849 blk_finish_plug(&plug);
802 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 850 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
803 851
804 if (ret) 852 if (ret)
@@ -1318,20 +1366,26 @@ static void update_super_roots(struct btrfs_root *root)
1318 1366
1319int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 1367int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
1320{ 1368{
1369 struct btrfs_transaction *trans;
1321 int ret = 0; 1370 int ret = 0;
1371
1322 spin_lock(&info->trans_lock); 1372 spin_lock(&info->trans_lock);
1323 if (info->running_transaction) 1373 trans = info->running_transaction;
1324 ret = info->running_transaction->in_commit; 1374 if (trans)
1375 ret = (trans->state >= TRANS_STATE_COMMIT_START);
1325 spin_unlock(&info->trans_lock); 1376 spin_unlock(&info->trans_lock);
1326 return ret; 1377 return ret;
1327} 1378}
1328 1379
1329int btrfs_transaction_blocked(struct btrfs_fs_info *info) 1380int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1330{ 1381{
1382 struct btrfs_transaction *trans;
1331 int ret = 0; 1383 int ret = 0;
1384
1332 spin_lock(&info->trans_lock); 1385 spin_lock(&info->trans_lock);
1333 if (info->running_transaction) 1386 trans = info->running_transaction;
1334 ret = info->running_transaction->blocked; 1387 if (trans)
1388 ret = is_transaction_blocked(trans);
1335 spin_unlock(&info->trans_lock); 1389 spin_unlock(&info->trans_lock);
1336 return ret; 1390 return ret;
1337} 1391}
@@ -1343,7 +1397,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1343static void wait_current_trans_commit_start(struct btrfs_root *root, 1397static void wait_current_trans_commit_start(struct btrfs_root *root,
1344 struct btrfs_transaction *trans) 1398 struct btrfs_transaction *trans)
1345{ 1399{
1346 wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit); 1400 wait_event(root->fs_info->transaction_blocked_wait,
1401 trans->state >= TRANS_STATE_COMMIT_START ||
1402 trans->aborted);
1347} 1403}
1348 1404
1349/* 1405/*
@@ -1354,7 +1410,8 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1354 struct btrfs_transaction *trans) 1410 struct btrfs_transaction *trans)
1355{ 1411{
1356 wait_event(root->fs_info->transaction_wait, 1412 wait_event(root->fs_info->transaction_wait,
1357 trans->commit_done || (trans->in_commit && !trans->blocked)); 1413 trans->state >= TRANS_STATE_UNBLOCKED ||
1414 trans->aborted);
1358} 1415}
1359 1416
1360/* 1417/*
@@ -1450,26 +1507,31 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1450 1507
1451 spin_lock(&root->fs_info->trans_lock); 1508 spin_lock(&root->fs_info->trans_lock);
1452 1509
1453 if (list_empty(&cur_trans->list)) { 1510 /*
1454 spin_unlock(&root->fs_info->trans_lock); 1511 * If the transaction is removed from the list, it means this
1455 btrfs_end_transaction(trans, root); 1512 * transaction has been committed successfully, so it is impossible
1456 return; 1513 * to call the cleanup function.
1457 } 1514 */
1515 BUG_ON(list_empty(&cur_trans->list));
1458 1516
1459 list_del_init(&cur_trans->list); 1517 list_del_init(&cur_trans->list);
1460 if (cur_trans == root->fs_info->running_transaction) { 1518 if (cur_trans == root->fs_info->running_transaction) {
1461 root->fs_info->trans_no_join = 1; 1519 cur_trans->state = TRANS_STATE_COMMIT_DOING;
1462 spin_unlock(&root->fs_info->trans_lock); 1520 spin_unlock(&root->fs_info->trans_lock);
1463 wait_event(cur_trans->writer_wait, 1521 wait_event(cur_trans->writer_wait,
1464 atomic_read(&cur_trans->num_writers) == 1); 1522 atomic_read(&cur_trans->num_writers) == 1);
1465 1523
1466 spin_lock(&root->fs_info->trans_lock); 1524 spin_lock(&root->fs_info->trans_lock);
1467 root->fs_info->running_transaction = NULL;
1468 } 1525 }
1469 spin_unlock(&root->fs_info->trans_lock); 1526 spin_unlock(&root->fs_info->trans_lock);
1470 1527
1471 btrfs_cleanup_one_transaction(trans->transaction, root); 1528 btrfs_cleanup_one_transaction(trans->transaction, root);
1472 1529
1530 spin_lock(&root->fs_info->trans_lock);
1531 if (cur_trans == root->fs_info->running_transaction)
1532 root->fs_info->running_transaction = NULL;
1533 spin_unlock(&root->fs_info->trans_lock);
1534
1473 put_transaction(cur_trans); 1535 put_transaction(cur_trans);
1474 put_transaction(cur_trans); 1536 put_transaction(cur_trans);
1475 1537
@@ -1481,33 +1543,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1481 current->journal_info = NULL; 1543 current->journal_info = NULL;
1482 1544
1483 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1545 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1484
1485 spin_lock(&root->fs_info->trans_lock);
1486 root->fs_info->trans_no_join = 0;
1487 spin_unlock(&root->fs_info->trans_lock);
1488} 1546}
1489 1547
1490static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, 1548static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1491 struct btrfs_root *root) 1549 struct btrfs_root *root)
1492{ 1550{
1493 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1494 int snap_pending = 0;
1495 int ret; 1551 int ret;
1496 1552
1497 if (!flush_on_commit) {
1498 spin_lock(&root->fs_info->trans_lock);
1499 if (!list_empty(&trans->transaction->pending_snapshots))
1500 snap_pending = 1;
1501 spin_unlock(&root->fs_info->trans_lock);
1502 }
1503
1504 if (flush_on_commit || snap_pending) {
1505 ret = btrfs_start_delalloc_inodes(root, 1);
1506 if (ret)
1507 return ret;
1508 btrfs_wait_ordered_extents(root, 1);
1509 }
1510
1511 ret = btrfs_run_delayed_items(trans, root); 1553 ret = btrfs_run_delayed_items(trans, root);
1512 if (ret) 1554 if (ret)
1513 return ret; 1555 return ret;
@@ -1531,23 +1573,25 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1531 return ret; 1573 return ret;
1532} 1574}
1533 1575
1534/* 1576static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1535 * btrfs_transaction state sequence: 1577{
1536 * in_commit = 0, blocked = 0 (initial) 1578 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1537 * in_commit = 1, blocked = 1 1579 return btrfs_start_all_delalloc_inodes(fs_info, 1);
1538 * blocked = 0 1580 return 0;
1539 * commit_done = 1 1581}
1540 */ 1582
1583static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
1584{
1585 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1586 btrfs_wait_all_ordered_extents(fs_info, 1);
1587}
1588
1541int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1589int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1542 struct btrfs_root *root) 1590 struct btrfs_root *root)
1543{ 1591{
1544 unsigned long joined = 0;
1545 struct btrfs_transaction *cur_trans = trans->transaction; 1592 struct btrfs_transaction *cur_trans = trans->transaction;
1546 struct btrfs_transaction *prev_trans = NULL; 1593 struct btrfs_transaction *prev_trans = NULL;
1547 DEFINE_WAIT(wait);
1548 int ret; 1594 int ret;
1549 int should_grow = 0;
1550 unsigned long now = get_seconds();
1551 1595
1552 ret = btrfs_run_ordered_operations(trans, root, 0); 1596 ret = btrfs_run_ordered_operations(trans, root, 0);
1553 if (ret) { 1597 if (ret) {
@@ -1586,6 +1630,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1586 * start sending their work down. 1630 * start sending their work down.
1587 */ 1631 */
1588 cur_trans->delayed_refs.flushing = 1; 1632 cur_trans->delayed_refs.flushing = 1;
1633 smp_wmb();
1589 1634
1590 if (!list_empty(&trans->new_bgs)) 1635 if (!list_empty(&trans->new_bgs))
1591 btrfs_create_pending_block_groups(trans, root); 1636 btrfs_create_pending_block_groups(trans, root);
@@ -1596,9 +1641,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1596 return ret; 1641 return ret;
1597 } 1642 }
1598 1643
1599 spin_lock(&cur_trans->commit_lock); 1644 spin_lock(&root->fs_info->trans_lock);
1600 if (cur_trans->in_commit) { 1645 if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
1601 spin_unlock(&cur_trans->commit_lock); 1646 spin_unlock(&root->fs_info->trans_lock);
1602 atomic_inc(&cur_trans->use_count); 1647 atomic_inc(&cur_trans->use_count);
1603 ret = btrfs_end_transaction(trans, root); 1648 ret = btrfs_end_transaction(trans, root);
1604 1649
@@ -1609,16 +1654,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1609 return ret; 1654 return ret;
1610 } 1655 }
1611 1656
1612 trans->transaction->in_commit = 1; 1657 cur_trans->state = TRANS_STATE_COMMIT_START;
1613 trans->transaction->blocked = 1;
1614 spin_unlock(&cur_trans->commit_lock);
1615 wake_up(&root->fs_info->transaction_blocked_wait); 1658 wake_up(&root->fs_info->transaction_blocked_wait);
1616 1659
1617 spin_lock(&root->fs_info->trans_lock);
1618 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1660 if (cur_trans->list.prev != &root->fs_info->trans_list) {
1619 prev_trans = list_entry(cur_trans->list.prev, 1661 prev_trans = list_entry(cur_trans->list.prev,
1620 struct btrfs_transaction, list); 1662 struct btrfs_transaction, list);
1621 if (!prev_trans->commit_done) { 1663 if (prev_trans->state != TRANS_STATE_COMPLETED) {
1622 atomic_inc(&prev_trans->use_count); 1664 atomic_inc(&prev_trans->use_count);
1623 spin_unlock(&root->fs_info->trans_lock); 1665 spin_unlock(&root->fs_info->trans_lock);
1624 1666
@@ -1632,42 +1674,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1632 spin_unlock(&root->fs_info->trans_lock); 1674 spin_unlock(&root->fs_info->trans_lock);
1633 } 1675 }
1634 1676
1635 if (!btrfs_test_opt(root, SSD) && 1677 extwriter_counter_dec(cur_trans, trans->type);
1636 (now < cur_trans->start_time || now - cur_trans->start_time < 1))
1637 should_grow = 1;
1638
1639 do {
1640 joined = cur_trans->num_joined;
1641
1642 WARN_ON(cur_trans != trans->transaction);
1643
1644 ret = btrfs_flush_all_pending_stuffs(trans, root);
1645 if (ret)
1646 goto cleanup_transaction;
1647 1678
1648 prepare_to_wait(&cur_trans->writer_wait, &wait, 1679 ret = btrfs_start_delalloc_flush(root->fs_info);
1649 TASK_UNINTERRUPTIBLE); 1680 if (ret)
1681 goto cleanup_transaction;
1650 1682
1651 if (atomic_read(&cur_trans->num_writers) > 1) 1683 ret = btrfs_flush_all_pending_stuffs(trans, root);
1652 schedule_timeout(MAX_SCHEDULE_TIMEOUT); 1684 if (ret)
1653 else if (should_grow) 1685 goto cleanup_transaction;
1654 schedule_timeout(1);
1655 1686
1656 finish_wait(&cur_trans->writer_wait, &wait); 1687 wait_event(cur_trans->writer_wait,
1657 } while (atomic_read(&cur_trans->num_writers) > 1 || 1688 extwriter_counter_read(cur_trans) == 0);
1658 (should_grow && cur_trans->num_joined != joined));
1659 1689
1690 /* some pending stuffs might be added after the previous flush. */
1660 ret = btrfs_flush_all_pending_stuffs(trans, root); 1691 ret = btrfs_flush_all_pending_stuffs(trans, root);
1661 if (ret) 1692 if (ret)
1662 goto cleanup_transaction; 1693 goto cleanup_transaction;
1663 1694
1695 btrfs_wait_delalloc_flush(root->fs_info);
1664 /* 1696 /*
1665 * Ok now we need to make sure to block out any other joins while we 1697 * Ok now we need to make sure to block out any other joins while we
1666 * commit the transaction. We could have started a join before setting 1698 * commit the transaction. We could have started a join before setting
1667 * no_join so make sure to wait for num_writers to == 1 again. 1699 * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
1668 */ 1700 */
1669 spin_lock(&root->fs_info->trans_lock); 1701 spin_lock(&root->fs_info->trans_lock);
1670 root->fs_info->trans_no_join = 1; 1702 cur_trans->state = TRANS_STATE_COMMIT_DOING;
1671 spin_unlock(&root->fs_info->trans_lock); 1703 spin_unlock(&root->fs_info->trans_lock);
1672 wait_event(cur_trans->writer_wait, 1704 wait_event(cur_trans->writer_wait,
1673 atomic_read(&cur_trans->num_writers) == 1); 1705 atomic_read(&cur_trans->num_writers) == 1);
@@ -1794,10 +1826,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1794 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, 1826 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1795 sizeof(*root->fs_info->super_copy)); 1827 sizeof(*root->fs_info->super_copy));
1796 1828
1797 trans->transaction->blocked = 0;
1798 spin_lock(&root->fs_info->trans_lock); 1829 spin_lock(&root->fs_info->trans_lock);
1830 cur_trans->state = TRANS_STATE_UNBLOCKED;
1799 root->fs_info->running_transaction = NULL; 1831 root->fs_info->running_transaction = NULL;
1800 root->fs_info->trans_no_join = 0;
1801 spin_unlock(&root->fs_info->trans_lock); 1832 spin_unlock(&root->fs_info->trans_lock);
1802 mutex_unlock(&root->fs_info->reloc_mutex); 1833 mutex_unlock(&root->fs_info->reloc_mutex);
1803 1834
@@ -1825,10 +1856,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1825 1856
1826 btrfs_finish_extent_commit(trans, root); 1857 btrfs_finish_extent_commit(trans, root);
1827 1858
1828 cur_trans->commit_done = 1;
1829
1830 root->fs_info->last_trans_committed = cur_trans->transid; 1859 root->fs_info->last_trans_committed = cur_trans->transid;
1831 1860 /*
1861 * We needn't acquire the lock here because there is no other task
1862 * which can change it.
1863 */
1864 cur_trans->state = TRANS_STATE_COMPLETED;
1832 wake_up(&cur_trans->commit_wait); 1865 wake_up(&cur_trans->commit_wait);
1833 1866
1834 spin_lock(&root->fs_info->trans_lock); 1867 spin_lock(&root->fs_info->trans_lock);
@@ -1838,7 +1871,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1838 put_transaction(cur_trans); 1871 put_transaction(cur_trans);
1839 put_transaction(cur_trans); 1872 put_transaction(cur_trans);
1840 1873
1841 if (trans->type < TRANS_JOIN_NOLOCK) 1874 if (trans->type & __TRANS_FREEZABLE)
1842 sb_end_intwrite(root->fs_info->sb); 1875 sb_end_intwrite(root->fs_info->sb);
1843 1876
1844 trace_btrfs_transaction_commit(root); 1877 trace_btrfs_transaction_commit(root);
@@ -1885,11 +1918,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
1885 int ret; 1918 int ret;
1886 struct btrfs_fs_info *fs_info = root->fs_info; 1919 struct btrfs_fs_info *fs_info = root->fs_info;
1887 1920
1888 if (fs_info->sb->s_flags & MS_RDONLY) {
1889 pr_debug("btrfs: cleaner called for RO fs!\n");
1890 return 0;
1891 }
1892
1893 spin_lock(&fs_info->trans_lock); 1921 spin_lock(&fs_info->trans_lock);
1894 if (list_empty(&fs_info->dead_roots)) { 1922 if (list_empty(&fs_info->dead_roots)) {
1895 spin_unlock(&fs_info->trans_lock); 1923 spin_unlock(&fs_info->trans_lock);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 24c97335a59f..005b0375d18c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -22,21 +22,33 @@
22#include "delayed-ref.h" 22#include "delayed-ref.h"
23#include "ctree.h" 23#include "ctree.h"
24 24
25enum btrfs_trans_state {
26 TRANS_STATE_RUNNING = 0,
27 TRANS_STATE_BLOCKED = 1,
28 TRANS_STATE_COMMIT_START = 2,
29 TRANS_STATE_COMMIT_DOING = 3,
30 TRANS_STATE_UNBLOCKED = 4,
31 TRANS_STATE_COMPLETED = 5,
32 TRANS_STATE_MAX = 6,
33};
34
25struct btrfs_transaction { 35struct btrfs_transaction {
26 u64 transid; 36 u64 transid;
27 /* 37 /*
38 * total external writers(USERSPACE/START/ATTACH) in this
39 * transaction, it must be zero before the transaction is
40 * being committed
41 */
42 atomic_t num_extwriters;
43 /*
28 * total writers in this transaction, it must be zero before the 44 * total writers in this transaction, it must be zero before the
29 * transaction can end 45 * transaction can end
30 */ 46 */
31 atomic_t num_writers; 47 atomic_t num_writers;
32 atomic_t use_count; 48 atomic_t use_count;
33 49
34 unsigned long num_joined; 50 /* Be protected by fs_info->trans_lock when we want to change it. */
35 51 enum btrfs_trans_state state;
36 spinlock_t commit_lock;
37 int in_commit;
38 int commit_done;
39 int blocked;
40 struct list_head list; 52 struct list_head list;
41 struct extent_io_tree dirty_pages; 53 struct extent_io_tree dirty_pages;
42 unsigned long start_time; 54 unsigned long start_time;
@@ -44,17 +56,27 @@ struct btrfs_transaction {
44 wait_queue_head_t commit_wait; 56 wait_queue_head_t commit_wait;
45 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
46 struct list_head ordered_operations; 58 struct list_head ordered_operations;
59 struct list_head pending_chunks;
47 struct btrfs_delayed_ref_root delayed_refs; 60 struct btrfs_delayed_ref_root delayed_refs;
48 int aborted; 61 int aborted;
49}; 62};
50 63
51enum btrfs_trans_type { 64#define __TRANS_FREEZABLE (1U << 0)
52 TRANS_START, 65
53 TRANS_JOIN, 66#define __TRANS_USERSPACE (1U << 8)
54 TRANS_USERSPACE, 67#define __TRANS_START (1U << 9)
55 TRANS_JOIN_NOLOCK, 68#define __TRANS_ATTACH (1U << 10)
56 TRANS_ATTACH, 69#define __TRANS_JOIN (1U << 11)
57}; 70#define __TRANS_JOIN_NOLOCK (1U << 12)
71
72#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
73#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
74#define TRANS_ATTACH (__TRANS_ATTACH)
75#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
76#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
77
78#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
79 __TRANS_ATTACH)
58 80
59struct btrfs_trans_handle { 81struct btrfs_trans_handle {
60 u64 transid; 82 u64 transid;
@@ -70,7 +92,7 @@ struct btrfs_trans_handle {
70 short aborted; 92 short aborted;
71 short adding_csums; 93 short adding_csums;
72 bool allocating_chunk; 94 bool allocating_chunk;
73 enum btrfs_trans_type type; 95 unsigned int type;
74 /* 96 /*
75 * this root is only needed to validate that the root passed to 97 * this root is only needed to validate that the root passed to
76 * start_transaction is the same as the one passed to end_transaction. 98 * start_transaction is the same as the one passed to end_transaction.
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c276ac9a0ec3..2c6791493637 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/blkdev.h>
21#include <linux/list_sort.h> 22#include <linux/list_sort.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "transaction.h" 24#include "transaction.h"
@@ -279,11 +280,23 @@ static int process_one_buffer(struct btrfs_root *log,
279{ 280{
280 int ret = 0; 281 int ret = 0;
281 282
283 /*
284 * If this fs is mixed then we need to be able to process the leaves to
285 * pin down any logged extents, so we have to read the block.
286 */
287 if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
288 ret = btrfs_read_buffer(eb, gen);
289 if (ret)
290 return ret;
291 }
292
282 if (wc->pin) 293 if (wc->pin)
283 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, 294 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
284 eb->start, eb->len); 295 eb->start, eb->len);
285 296
286 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 297 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
298 if (wc->pin && btrfs_header_level(eb) == 0)
299 ret = btrfs_exclude_logged_extents(log, eb);
287 if (wc->write) 300 if (wc->write)
288 btrfs_write_tree_block(eb); 301 btrfs_write_tree_block(eb);
289 if (wc->wait) 302 if (wc->wait)
@@ -2016,13 +2029,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2016 eb, i, &key); 2029 eb, i, &key);
2017 if (ret) 2030 if (ret)
2018 break; 2031 break;
2019 } else if (key.type == BTRFS_INODE_REF_KEY) { 2032 } else if (key.type == BTRFS_INODE_REF_KEY ||
2020 ret = add_inode_ref(wc->trans, root, log, path, 2033 key.type == BTRFS_INODE_EXTREF_KEY) {
2021 eb, i, &key);
2022 if (ret && ret != -ENOENT)
2023 break;
2024 ret = 0;
2025 } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
2026 ret = add_inode_ref(wc->trans, root, log, path, 2034 ret = add_inode_ref(wc->trans, root, log, path,
2027 eb, i, &key); 2035 eb, i, &key);
2028 if (ret && ret != -ENOENT) 2036 if (ret && ret != -ENOENT)
@@ -2358,6 +2366,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2358 struct btrfs_root *log = root->log_root; 2366 struct btrfs_root *log = root->log_root;
2359 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2367 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2360 unsigned long log_transid = 0; 2368 unsigned long log_transid = 0;
2369 struct blk_plug plug;
2361 2370
2362 mutex_lock(&root->log_mutex); 2371 mutex_lock(&root->log_mutex);
2363 log_transid = root->log_transid; 2372 log_transid = root->log_transid;
@@ -2401,8 +2410,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2401 /* we start IO on all the marked extents here, but we don't actually 2410 /* we start IO on all the marked extents here, but we don't actually
2402 * wait for them until later. 2411 * wait for them until later.
2403 */ 2412 */
2413 blk_start_plug(&plug);
2404 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2414 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2405 if (ret) { 2415 if (ret) {
2416 blk_finish_plug(&plug);
2406 btrfs_abort_transaction(trans, root, ret); 2417 btrfs_abort_transaction(trans, root, ret);
2407 btrfs_free_logged_extents(log, log_transid); 2418 btrfs_free_logged_extents(log, log_transid);
2408 mutex_unlock(&root->log_mutex); 2419 mutex_unlock(&root->log_mutex);
@@ -2437,6 +2448,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2437 } 2448 }
2438 2449
2439 if (ret) { 2450 if (ret) {
2451 blk_finish_plug(&plug);
2440 if (ret != -ENOSPC) { 2452 if (ret != -ENOSPC) {
2441 btrfs_abort_transaction(trans, root, ret); 2453 btrfs_abort_transaction(trans, root, ret);
2442 mutex_unlock(&log_root_tree->log_mutex); 2454 mutex_unlock(&log_root_tree->log_mutex);
@@ -2452,6 +2464,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2452 2464
2453 index2 = log_root_tree->log_transid % 2; 2465 index2 = log_root_tree->log_transid % 2;
2454 if (atomic_read(&log_root_tree->log_commit[index2])) { 2466 if (atomic_read(&log_root_tree->log_commit[index2])) {
2467 blk_finish_plug(&plug);
2455 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2468 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2456 wait_log_commit(trans, log_root_tree, 2469 wait_log_commit(trans, log_root_tree,
2457 log_root_tree->log_transid); 2470 log_root_tree->log_transid);
@@ -2474,6 +2487,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2474 * check the full commit flag again 2487 * check the full commit flag again
2475 */ 2488 */
2476 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2489 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2490 blk_finish_plug(&plug);
2477 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2491 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2478 btrfs_free_logged_extents(log, log_transid); 2492 btrfs_free_logged_extents(log, log_transid);
2479 mutex_unlock(&log_root_tree->log_mutex); 2493 mutex_unlock(&log_root_tree->log_mutex);
@@ -2481,9 +2495,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2481 goto out_wake_log_root; 2495 goto out_wake_log_root;
2482 } 2496 }
2483 2497
2484 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2498 ret = btrfs_write_marked_extents(log_root_tree,
2485 &log_root_tree->dirty_log_pages, 2499 &log_root_tree->dirty_log_pages,
2486 EXTENT_DIRTY | EXTENT_NEW); 2500 EXTENT_DIRTY | EXTENT_NEW);
2501 blk_finish_plug(&plug);
2487 if (ret) { 2502 if (ret) {
2488 btrfs_abort_transaction(trans, root, ret); 2503 btrfs_abort_transaction(trans, root, ret);
2489 btrfs_free_logged_extents(log, log_transid); 2504 btrfs_free_logged_extents(log, log_transid);
@@ -2491,6 +2506,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2491 goto out_wake_log_root; 2506 goto out_wake_log_root;
2492 } 2507 }
2493 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2508 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2509 btrfs_wait_marked_extents(log_root_tree,
2510 &log_root_tree->dirty_log_pages,
2511 EXTENT_NEW | EXTENT_DIRTY);
2494 btrfs_wait_logged_extents(log, log_transid); 2512 btrfs_wait_logged_extents(log, log_transid);
2495 2513
2496 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2514 btrfs_set_super_log_root(root->fs_info->super_for_commit,
@@ -4016,8 +4034,7 @@ again:
4016 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 4034 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4017 break; 4035 break;
4018 4036
4019 log = btrfs_read_fs_root_no_radix(log_root_tree, 4037 log = btrfs_read_fs_root(log_root_tree, &found_key);
4020 &found_key);
4021 if (IS_ERR(log)) { 4038 if (IS_ERR(log)) {
4022 ret = PTR_ERR(log); 4039 ret = PTR_ERR(log);
4023 btrfs_error(fs_info, ret, 4040 btrfs_error(fs_info, ret,
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 7b417e20efe2..b0a523b2c60e 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -205,6 +205,10 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
205 u64 new_alloced = ulist->nodes_alloced + 128; 205 u64 new_alloced = ulist->nodes_alloced + 128;
206 struct ulist_node *new_nodes; 206 struct ulist_node *new_nodes;
207 void *old = NULL; 207 void *old = NULL;
208 int i;
209
210 for (i = 0; i < ulist->nnodes; i++)
211 rb_erase(&ulist->nodes[i].rb_node, &ulist->root);
208 212
209 /* 213 /*
210 * if nodes_alloced == ULIST_SIZE no memory has been allocated 214 * if nodes_alloced == ULIST_SIZE no memory has been allocated
@@ -224,6 +228,17 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
224 228
225 ulist->nodes = new_nodes; 229 ulist->nodes = new_nodes;
226 ulist->nodes_alloced = new_alloced; 230 ulist->nodes_alloced = new_alloced;
231
232 /*
233 * krealloc actually uses memcpy, which does not copy rb_node
234 * pointers, so we have to do it ourselves. Otherwise we may
235 * be bitten by crashes.
236 */
237 for (i = 0; i < ulist->nnodes; i++) {
238 ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
239 if (ret < 0)
240 return ret;
241 }
227 } 242 }
228 ulist->nodes[ulist->nnodes].val = val; 243 ulist->nodes[ulist->nnodes].val = val;
229 ulist->nodes[ulist->nnodes].aux = aux; 244 ulist->nodes[ulist->nnodes].aux = aux;
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
deleted file mode 100644
index 9bf3946d5ef2..000000000000
--- a/fs/btrfs/version.h
+++ /dev/null
@@ -1,4 +0,0 @@
1#ifndef __BTRFS_VERSION_H
2#define __BTRFS_VERSION_H
3#define BTRFS_BUILD_VERSION "Btrfs"
4#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8bffb9174afb..78b871753cb6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -982,6 +982,35 @@ out:
982 return ret; 982 return ret;
983} 983}
984 984
985static int contains_pending_extent(struct btrfs_trans_handle *trans,
986 struct btrfs_device *device,
987 u64 *start, u64 len)
988{
989 struct extent_map *em;
990 int ret = 0;
991
992 list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
993 struct map_lookup *map;
994 int i;
995
996 map = (struct map_lookup *)em->bdev;
997 for (i = 0; i < map->num_stripes; i++) {
998 if (map->stripes[i].dev != device)
999 continue;
1000 if (map->stripes[i].physical >= *start + len ||
1001 map->stripes[i].physical + em->orig_block_len <=
1002 *start)
1003 continue;
1004 *start = map->stripes[i].physical +
1005 em->orig_block_len;
1006 ret = 1;
1007 }
1008 }
1009
1010 return ret;
1011}
1012
1013
985/* 1014/*
986 * find_free_dev_extent - find free space in the specified device 1015 * find_free_dev_extent - find free space in the specified device
987 * @device: the device which we search the free space in 1016 * @device: the device which we search the free space in
@@ -1002,7 +1031,8 @@ out:
1002 * But if we don't find suitable free space, it is used to store the size of 1031 * But if we don't find suitable free space, it is used to store the size of
1003 * the max free space. 1032 * the max free space.
1004 */ 1033 */
1005int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 1034int find_free_dev_extent(struct btrfs_trans_handle *trans,
1035 struct btrfs_device *device, u64 num_bytes,
1006 u64 *start, u64 *len) 1036 u64 *start, u64 *len)
1007{ 1037{
1008 struct btrfs_key key; 1038 struct btrfs_key key;
@@ -1026,21 +1056,22 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1026 */ 1056 */
1027 search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 1057 search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
1028 1058
1059 path = btrfs_alloc_path();
1060 if (!path)
1061 return -ENOMEM;
1062again:
1029 max_hole_start = search_start; 1063 max_hole_start = search_start;
1030 max_hole_size = 0; 1064 max_hole_size = 0;
1031 hole_size = 0; 1065 hole_size = 0;
1032 1066
1033 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { 1067 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
1034 ret = -ENOSPC; 1068 ret = -ENOSPC;
1035 goto error; 1069 goto out;
1036 } 1070 }
1037 1071
1038 path = btrfs_alloc_path();
1039 if (!path) {
1040 ret = -ENOMEM;
1041 goto error;
1042 }
1043 path->reada = 2; 1072 path->reada = 2;
1073 path->search_commit_root = 1;
1074 path->skip_locking = 1;
1044 1075
1045 key.objectid = device->devid; 1076 key.objectid = device->devid;
1046 key.offset = search_start; 1077 key.offset = search_start;
@@ -1081,6 +1112,15 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1081 if (key.offset > search_start) { 1112 if (key.offset > search_start) {
1082 hole_size = key.offset - search_start; 1113 hole_size = key.offset - search_start;
1083 1114
1115 /*
1116 * Have to check before we set max_hole_start, otherwise
1117 * we could end up sending back this offset anyway.
1118 */
1119 if (contains_pending_extent(trans, device,
1120 &search_start,
1121 hole_size))
1122 hole_size = 0;
1123
1084 if (hole_size > max_hole_size) { 1124 if (hole_size > max_hole_size) {
1085 max_hole_start = search_start; 1125 max_hole_start = search_start;
1086 max_hole_size = hole_size; 1126 max_hole_size = hole_size;
@@ -1124,6 +1164,11 @@ next:
1124 max_hole_size = hole_size; 1164 max_hole_size = hole_size;
1125 } 1165 }
1126 1166
1167 if (contains_pending_extent(trans, device, &search_start, hole_size)) {
1168 btrfs_release_path(path);
1169 goto again;
1170 }
1171
1127 /* See above. */ 1172 /* See above. */
1128 if (hole_size < num_bytes) 1173 if (hole_size < num_bytes)
1129 ret = -ENOSPC; 1174 ret = -ENOSPC;
@@ -1132,7 +1177,6 @@ next:
1132 1177
1133out: 1178out:
1134 btrfs_free_path(path); 1179 btrfs_free_path(path);
1135error:
1136 *start = max_hole_start; 1180 *start = max_hole_start;
1137 if (len) 1181 if (len)
1138 *len = max_hole_size; 1182 *len = max_hole_size;
@@ -1244,47 +1288,22 @@ out:
1244 return ret; 1288 return ret;
1245} 1289}
1246 1290
1247static noinline int find_next_chunk(struct btrfs_root *root, 1291static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1248 u64 objectid, u64 *offset)
1249{ 1292{
1250 struct btrfs_path *path; 1293 struct extent_map_tree *em_tree;
1251 int ret; 1294 struct extent_map *em;
1252 struct btrfs_key key; 1295 struct rb_node *n;
1253 struct btrfs_chunk *chunk; 1296 u64 ret = 0;
1254 struct btrfs_key found_key;
1255
1256 path = btrfs_alloc_path();
1257 if (!path)
1258 return -ENOMEM;
1259
1260 key.objectid = objectid;
1261 key.offset = (u64)-1;
1262 key.type = BTRFS_CHUNK_ITEM_KEY;
1263
1264 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1265 if (ret < 0)
1266 goto error;
1267
1268 BUG_ON(ret == 0); /* Corruption */
1269 1297
1270 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 1298 em_tree = &fs_info->mapping_tree.map_tree;
1271 if (ret) { 1299 read_lock(&em_tree->lock);
1272 *offset = 0; 1300 n = rb_last(&em_tree->map);
1273 } else { 1301 if (n) {
1274 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1302 em = rb_entry(n, struct extent_map, rb_node);
1275 path->slots[0]); 1303 ret = em->start + em->len;
1276 if (found_key.objectid != objectid)
1277 *offset = 0;
1278 else {
1279 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
1280 struct btrfs_chunk);
1281 *offset = found_key.offset +
1282 btrfs_chunk_length(path->nodes[0], chunk);
1283 }
1284 } 1304 }
1285 ret = 0; 1305 read_unlock(&em_tree->lock);
1286error: 1306
1287 btrfs_free_path(path);
1288 return ret; 1307 return ret;
1289} 1308}
1290 1309
@@ -1462,31 +1481,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1462 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1481 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1463 1482
1464 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1483 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1465 printk(KERN_ERR "btrfs: unable to go below four devices " 1484 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
1466 "on raid10\n");
1467 ret = -EINVAL;
1468 goto out; 1485 goto out;
1469 } 1486 }
1470 1487
1471 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1488 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1472 printk(KERN_ERR "btrfs: unable to go below two " 1489 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
1473 "devices on raid1\n");
1474 ret = -EINVAL;
1475 goto out; 1490 goto out;
1476 } 1491 }
1477 1492
1478 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && 1493 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1479 root->fs_info->fs_devices->rw_devices <= 2) { 1494 root->fs_info->fs_devices->rw_devices <= 2) {
1480 printk(KERN_ERR "btrfs: unable to go below two " 1495 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
1481 "devices on raid5\n");
1482 ret = -EINVAL;
1483 goto out; 1496 goto out;
1484 } 1497 }
1485 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && 1498 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1486 root->fs_info->fs_devices->rw_devices <= 3) { 1499 root->fs_info->fs_devices->rw_devices <= 3) {
1487 printk(KERN_ERR "btrfs: unable to go below three " 1500 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
1488 "devices on raid6\n");
1489 ret = -EINVAL;
1490 goto out; 1501 goto out;
1491 } 1502 }
1492 1503
@@ -1512,8 +1523,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1512 bh = NULL; 1523 bh = NULL;
1513 disk_super = NULL; 1524 disk_super = NULL;
1514 if (!device) { 1525 if (!device) {
1515 printk(KERN_ERR "btrfs: no missing devices found to " 1526 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1516 "remove\n");
1517 goto out; 1527 goto out;
1518 } 1528 }
1519 } else { 1529 } else {
@@ -1535,15 +1545,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1535 } 1545 }
1536 1546
1537 if (device->is_tgtdev_for_dev_replace) { 1547 if (device->is_tgtdev_for_dev_replace) {
1538 pr_err("btrfs: unable to remove the dev_replace target dev\n"); 1548 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1539 ret = -EINVAL;
1540 goto error_brelse; 1549 goto error_brelse;
1541 } 1550 }
1542 1551
1543 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1552 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1544 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1553 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1545 "device\n");
1546 ret = -EINVAL;
1547 goto error_brelse; 1554 goto error_brelse;
1548 } 1555 }
1549 1556
@@ -3295,10 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3295 } 3302 }
3296 3303
3297 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3304 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3298 if (IS_ERR(tsk)) 3305 return PTR_RET(tsk);
3299 return PTR_ERR(tsk);
3300
3301 return 0;
3302} 3306}
3303 3307
3304int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3308int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
@@ -3681,10 +3685,8 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3681} 3685}
3682 3686
3683static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3687static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3684 struct btrfs_root *extent_root, 3688 struct btrfs_root *extent_root, u64 start,
3685 struct map_lookup **map_ret, 3689 u64 type)
3686 u64 *num_bytes_out, u64 *stripe_size_out,
3687 u64 start, u64 type)
3688{ 3690{
3689 struct btrfs_fs_info *info = extent_root->fs_info; 3691 struct btrfs_fs_info *info = extent_root->fs_info;
3690 struct btrfs_fs_devices *fs_devices = info->fs_devices; 3692 struct btrfs_fs_devices *fs_devices = info->fs_devices;
@@ -3791,7 +3793,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3791 if (total_avail == 0) 3793 if (total_avail == 0)
3792 continue; 3794 continue;
3793 3795
3794 ret = find_free_dev_extent(device, 3796 ret = find_free_dev_extent(trans, device,
3795 max_stripe_size * dev_stripes, 3797 max_stripe_size * dev_stripes,
3796 &dev_offset, &max_avail); 3798 &dev_offset, &max_avail);
3797 if (ret && ret != -ENOSPC) 3799 if (ret && ret != -ENOSPC)
@@ -3903,12 +3905,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3903 map->type = type; 3905 map->type = type;
3904 map->sub_stripes = sub_stripes; 3906 map->sub_stripes = sub_stripes;
3905 3907
3906 *map_ret = map;
3907 num_bytes = stripe_size * data_stripes; 3908 num_bytes = stripe_size * data_stripes;
3908 3909
3909 *stripe_size_out = stripe_size;
3910 *num_bytes_out = num_bytes;
3911
3912 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 3910 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3913 3911
3914 em = alloc_extent_map(); 3912 em = alloc_extent_map();
@@ -3921,38 +3919,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3921 em->len = num_bytes; 3919 em->len = num_bytes;
3922 em->block_start = 0; 3920 em->block_start = 0;
3923 em->block_len = em->len; 3921 em->block_len = em->len;
3922 em->orig_block_len = stripe_size;
3924 3923
3925 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 3924 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3926 write_lock(&em_tree->lock); 3925 write_lock(&em_tree->lock);
3927 ret = add_extent_mapping(em_tree, em, 0); 3926 ret = add_extent_mapping(em_tree, em, 0);
3927 if (!ret) {
3928 list_add_tail(&em->list, &trans->transaction->pending_chunks);
3929 atomic_inc(&em->refs);
3930 }
3928 write_unlock(&em_tree->lock); 3931 write_unlock(&em_tree->lock);
3929 if (ret) { 3932 if (ret) {
3930 free_extent_map(em); 3933 free_extent_map(em);
3931 goto error; 3934 goto error;
3932 } 3935 }
3933 3936
3934 for (i = 0; i < map->num_stripes; ++i) {
3935 struct btrfs_device *device;
3936 u64 dev_offset;
3937
3938 device = map->stripes[i].dev;
3939 dev_offset = map->stripes[i].physical;
3940
3941 ret = btrfs_alloc_dev_extent(trans, device,
3942 info->chunk_root->root_key.objectid,
3943 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3944 start, dev_offset, stripe_size);
3945 if (ret)
3946 goto error_dev_extent;
3947 }
3948
3949 ret = btrfs_make_block_group(trans, extent_root, 0, type, 3937 ret = btrfs_make_block_group(trans, extent_root, 0, type,
3950 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3938 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3951 start, num_bytes); 3939 start, num_bytes);
3952 if (ret) { 3940 if (ret)
3953 i = map->num_stripes - 1; 3941 goto error_del_extent;
3954 goto error_dev_extent;
3955 }
3956 3942
3957 free_extent_map(em); 3943 free_extent_map(em);
3958 check_raid56_incompat_flag(extent_root->fs_info, type); 3944 check_raid56_incompat_flag(extent_root->fs_info, type);
@@ -3960,18 +3946,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3960 kfree(devices_info); 3946 kfree(devices_info);
3961 return 0; 3947 return 0;
3962 3948
3963error_dev_extent: 3949error_del_extent:
3964 for (; i >= 0; i--) {
3965 struct btrfs_device *device;
3966 int err;
3967
3968 device = map->stripes[i].dev;
3969 err = btrfs_free_dev_extent(trans, device, start);
3970 if (err) {
3971 btrfs_abort_transaction(trans, extent_root, err);
3972 break;
3973 }
3974 }
3975 write_lock(&em_tree->lock); 3950 write_lock(&em_tree->lock);
3976 remove_extent_mapping(em_tree, em); 3951 remove_extent_mapping(em_tree, em);
3977 write_unlock(&em_tree->lock); 3952 write_unlock(&em_tree->lock);
@@ -3986,33 +3961,68 @@ error:
3986 return ret; 3961 return ret;
3987} 3962}
3988 3963
3989static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 3964int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
3990 struct btrfs_root *extent_root, 3965 struct btrfs_root *extent_root,
3991 struct map_lookup *map, u64 chunk_offset, 3966 u64 chunk_offset, u64 chunk_size)
3992 u64 chunk_size, u64 stripe_size)
3993{ 3967{
3994 u64 dev_offset;
3995 struct btrfs_key key; 3968 struct btrfs_key key;
3996 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 3969 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3997 struct btrfs_device *device; 3970 struct btrfs_device *device;
3998 struct btrfs_chunk *chunk; 3971 struct btrfs_chunk *chunk;
3999 struct btrfs_stripe *stripe; 3972 struct btrfs_stripe *stripe;
4000 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 3973 struct extent_map_tree *em_tree;
4001 int index = 0; 3974 struct extent_map *em;
3975 struct map_lookup *map;
3976 size_t item_size;
3977 u64 dev_offset;
3978 u64 stripe_size;
3979 int i = 0;
4002 int ret; 3980 int ret;
4003 3981
3982 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3983 read_lock(&em_tree->lock);
3984 em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
3985 read_unlock(&em_tree->lock);
3986
3987 if (!em) {
3988 btrfs_crit(extent_root->fs_info, "unable to find logical "
3989 "%Lu len %Lu", chunk_offset, chunk_size);
3990 return -EINVAL;
3991 }
3992
3993 if (em->start != chunk_offset || em->len != chunk_size) {
3994 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
3995 " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
3996 chunk_size, em->start, em->len);
3997 free_extent_map(em);
3998 return -EINVAL;
3999 }
4000
4001 map = (struct map_lookup *)em->bdev;
4002 item_size = btrfs_chunk_item_size(map->num_stripes);
4003 stripe_size = em->orig_block_len;
4004
4004 chunk = kzalloc(item_size, GFP_NOFS); 4005 chunk = kzalloc(item_size, GFP_NOFS);
4005 if (!chunk) 4006 if (!chunk) {
4006 return -ENOMEM; 4007 ret = -ENOMEM;
4008 goto out;
4009 }
4010
4011 for (i = 0; i < map->num_stripes; i++) {
4012 device = map->stripes[i].dev;
4013 dev_offset = map->stripes[i].physical;
4007 4014
4008 index = 0;
4009 while (index < map->num_stripes) {
4010 device = map->stripes[index].dev;
4011 device->bytes_used += stripe_size; 4015 device->bytes_used += stripe_size;
4012 ret = btrfs_update_device(trans, device); 4016 ret = btrfs_update_device(trans, device);
4013 if (ret) 4017 if (ret)
4014 goto out_free; 4018 goto out;
4015 index++; 4019 ret = btrfs_alloc_dev_extent(trans, device,
4020 chunk_root->root_key.objectid,
4021 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4022 chunk_offset, dev_offset,
4023 stripe_size);
4024 if (ret)
4025 goto out;
4016 } 4026 }
4017 4027
4018 spin_lock(&extent_root->fs_info->free_chunk_lock); 4028 spin_lock(&extent_root->fs_info->free_chunk_lock);
@@ -4020,17 +4030,15 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4020 map->num_stripes); 4030 map->num_stripes);
4021 spin_unlock(&extent_root->fs_info->free_chunk_lock); 4031 spin_unlock(&extent_root->fs_info->free_chunk_lock);
4022 4032
4023 index = 0;
4024 stripe = &chunk->stripe; 4033 stripe = &chunk->stripe;
4025 while (index < map->num_stripes) { 4034 for (i = 0; i < map->num_stripes; i++) {
4026 device = map->stripes[index].dev; 4035 device = map->stripes[i].dev;
4027 dev_offset = map->stripes[index].physical; 4036 dev_offset = map->stripes[i].physical;
4028 4037
4029 btrfs_set_stack_stripe_devid(stripe, device->devid); 4038 btrfs_set_stack_stripe_devid(stripe, device->devid);
4030 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4039 btrfs_set_stack_stripe_offset(stripe, dev_offset);
4031 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4040 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
4032 stripe++; 4041 stripe++;
4033 index++;
4034 } 4042 }
4035 4043
4036 btrfs_set_stack_chunk_length(chunk, chunk_size); 4044 btrfs_set_stack_chunk_length(chunk, chunk_size);
@@ -4048,7 +4056,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4048 key.offset = chunk_offset; 4056 key.offset = chunk_offset;
4049 4057
4050 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 4058 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4051
4052 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4059 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4053 /* 4060 /*
4054 * TODO: Cleanup of inserted chunk root in case of 4061 * TODO: Cleanup of inserted chunk root in case of
@@ -4058,8 +4065,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4058 item_size); 4065 item_size);
4059 } 4066 }
4060 4067
4061out_free: 4068out:
4062 kfree(chunk); 4069 kfree(chunk);
4070 free_extent_map(em);
4063 return ret; 4071 return ret;
4064} 4072}
4065 4073
@@ -4074,27 +4082,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4074 struct btrfs_root *extent_root, u64 type) 4082 struct btrfs_root *extent_root, u64 type)
4075{ 4083{
4076 u64 chunk_offset; 4084 u64 chunk_offset;
4077 u64 chunk_size;
4078 u64 stripe_size;
4079 struct map_lookup *map;
4080 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
4081 int ret;
4082
4083 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4084 &chunk_offset);
4085 if (ret)
4086 return ret;
4087 4085
4088 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4086 chunk_offset = find_next_chunk(extent_root->fs_info);
4089 &stripe_size, chunk_offset, type); 4087 return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
4090 if (ret)
4091 return ret;
4092
4093 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
4094 chunk_size, stripe_size);
4095 if (ret)
4096 return ret;
4097 return 0;
4098} 4088}
4099 4089
4100static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 4090static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
@@ -4103,66 +4093,31 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4103{ 4093{
4104 u64 chunk_offset; 4094 u64 chunk_offset;
4105 u64 sys_chunk_offset; 4095 u64 sys_chunk_offset;
4106 u64 chunk_size;
4107 u64 sys_chunk_size;
4108 u64 stripe_size;
4109 u64 sys_stripe_size;
4110 u64 alloc_profile; 4096 u64 alloc_profile;
4111 struct map_lookup *map;
4112 struct map_lookup *sys_map;
4113 struct btrfs_fs_info *fs_info = root->fs_info; 4097 struct btrfs_fs_info *fs_info = root->fs_info;
4114 struct btrfs_root *extent_root = fs_info->extent_root; 4098 struct btrfs_root *extent_root = fs_info->extent_root;
4115 int ret; 4099 int ret;
4116 4100
4117 ret = find_next_chunk(fs_info->chunk_root, 4101 chunk_offset = find_next_chunk(fs_info);
4118 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
4119 if (ret)
4120 return ret;
4121
4122 alloc_profile = btrfs_get_alloc_profile(extent_root, 0); 4102 alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
4123 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4103 ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
4124 &stripe_size, chunk_offset, alloc_profile); 4104 alloc_profile);
4125 if (ret) 4105 if (ret)
4126 return ret; 4106 return ret;
4127 4107
4128 sys_chunk_offset = chunk_offset + chunk_size; 4108 sys_chunk_offset = find_next_chunk(root->fs_info);
4129
4130 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4109 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4131 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 4110 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4132 &sys_chunk_size, &sys_stripe_size, 4111 alloc_profile);
4133 sys_chunk_offset, alloc_profile);
4134 if (ret) { 4112 if (ret) {
4135 btrfs_abort_transaction(trans, root, ret); 4113 btrfs_abort_transaction(trans, root, ret);
4136 goto out; 4114 goto out;
4137 } 4115 }
4138 4116
4139 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 4117 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
4140 if (ret) {
4141 btrfs_abort_transaction(trans, root, ret);
4142 goto out;
4143 }
4144
4145 /*
4146 * Modifying chunk tree needs allocating new blocks from both
4147 * system block group and metadata block group. So we only can
4148 * do operations require modifying the chunk tree after both
4149 * block groups were created.
4150 */
4151 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
4152 chunk_size, stripe_size);
4153 if (ret) {
4154 btrfs_abort_transaction(trans, root, ret);
4155 goto out;
4156 }
4157
4158 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
4159 sys_chunk_offset, sys_chunk_size,
4160 sys_stripe_size);
4161 if (ret) 4118 if (ret)
4162 btrfs_abort_transaction(trans, root, ret); 4119 btrfs_abort_transaction(trans, root, ret);
4163
4164out: 4120out:
4165
4166 return ret; 4121 return ret;
4167} 4122}
4168 4123
@@ -4435,9 +4390,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4435 map = (struct map_lookup *)em->bdev; 4390 map = (struct map_lookup *)em->bdev;
4436 offset = logical - em->start; 4391 offset = logical - em->start;
4437 4392
4438 if (mirror_num > map->num_stripes)
4439 mirror_num = 0;
4440
4441 stripe_len = map->stripe_len; 4393 stripe_len = map->stripe_len;
4442 stripe_nr = offset; 4394 stripe_nr = offset;
4443 /* 4395 /*
@@ -5367,7 +5319,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5367 return NULL; 5319 return NULL;
5368 list_add(&device->dev_list, 5320 list_add(&device->dev_list,
5369 &fs_devices->devices); 5321 &fs_devices->devices);
5370 device->dev_root = root->fs_info->dev_root;
5371 device->devid = devid; 5322 device->devid = devid;
5372 device->work.func = pending_bios_fn; 5323 device->work.func = pending_bios_fn;
5373 device->fs_devices = fs_devices; 5324 device->fs_devices = fs_devices;
@@ -5593,7 +5544,6 @@ static int read_one_dev(struct btrfs_root *root,
5593 } 5544 }
5594 5545
5595 fill_device_from_item(leaf, dev_item, device); 5546 fill_device_from_item(leaf, dev_item, device);
5596 device->dev_root = root->fs_info->dev_root;
5597 device->in_fs_metadata = 1; 5547 device->in_fs_metadata = 1;
5598 if (device->writeable && !device->is_tgtdev_for_dev_replace) { 5548 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
5599 device->fs_devices->total_rw_bytes += device->total_bytes; 5549 device->fs_devices->total_rw_bytes += device->total_bytes;
@@ -5751,6 +5701,17 @@ error:
5751 return ret; 5701 return ret;
5752} 5702}
5753 5703
5704void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
5705{
5706 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
5707 struct btrfs_device *device;
5708
5709 mutex_lock(&fs_devices->device_list_mutex);
5710 list_for_each_entry(device, &fs_devices->devices, dev_list)
5711 device->dev_root = fs_info->dev_root;
5712 mutex_unlock(&fs_devices->device_list_mutex);
5713}
5714
5754static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 5715static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
5755{ 5716{
5756 int i; 5717 int i;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f6247e2a47f7..86705583480d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -316,11 +316,13 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
316int btrfs_pause_balance(struct btrfs_fs_info *fs_info); 316int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
317int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); 317int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
318int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 318int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
319int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 319int find_free_dev_extent(struct btrfs_trans_handle *trans,
320 struct btrfs_device *device, u64 num_bytes,
320 u64 *start, u64 *max_avail); 321 u64 *start, u64 *max_avail);
321void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); 322void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
322int btrfs_get_dev_stats(struct btrfs_root *root, 323int btrfs_get_dev_stats(struct btrfs_root *root,
323 struct btrfs_ioctl_get_dev_stats *stats); 324 struct btrfs_ioctl_get_dev_stats *stats);
325void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
324int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 326int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
325int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 327int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
326 struct btrfs_fs_info *fs_info); 328 struct btrfs_fs_info *fs_info);
@@ -336,6 +338,9 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
336unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 338unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
337 struct btrfs_mapping_tree *map_tree, 339 struct btrfs_mapping_tree *map_tree,
338 u64 logical); 340 u64 logical);
341int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
342 struct btrfs_root *extent_root,
343 u64 chunk_offset, u64 chunk_size);
339static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 344static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
340 int index) 345 int index)
341{ 346{